qt-creator/shared/cpaster/cgi.cpp

/***************************************************************************
**
** This file is part of Qt Creator
**
** Copyright (c) 2008 Nokia Corporation and/or its subsidiary(-ies).
**
** Contact:  Qt Software Information (qt-info@nokia.com)
**
**
** Non-Open Source Usage
**
** Licensees may use this file in accordance with the Qt Beta Version
** License Agreement, Agreement version 2.2 provided with the Software or,
** alternatively, in accordance with the terms contained in a written
** agreement between you and Nokia.
**
** GNU General Public License Usage
**
** Alternatively, this file may be used under the terms of the GNU General
** Public License versions 2.0 or 3.0 as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL included in the packaging
** of this file.  Please review the following information to ensure GNU
** General Public Licensing requirements will be met:
**
** http://www.fsf.org/licensing/licenses/info/GPLv2.html and
** http://www.gnu.org/copyleft/gpl.html.
**
** In addition, as a special exception, Nokia gives you certain additional
** rights. These rights are described in the Nokia Qt GPL Exception
** version 1.2, included in the file GPL_EXCEPTION.txt in this package.
**
***************************************************************************/

#include "cgi.h"

#include <QByteArray>


// Hexadecimal digit table used by CGI::encodeURL() to build "%xx" escapes;
// it is indexed with a nibble value in the range 0-15.
// NOTE(review): the remark below credits RFC 1738 for preferring lower-case
// digits; RFC 3986 section 2.1 recommends upper-case digits for producers --
// confirm before relying on it. CGI::decodeURL() accepts either case.
const char *cgi_chars = "0123456789abcdef"; // RFC 1738 suggests lower-case to be optimal

// Percent-encodes rawText for use in a URL / form submission.
//
// The text is converted to UTF-8 and processed byte by byte:
//   - ASCII letters, digits and the marks - _ ( ) . ! ~ * ' are copied as is,
//   - a space becomes '+',
//   - every other byte becomes "%xx", with the two hex digits taken from
//     cgi_chars (high nibble first).
QString CGI::encodeURL(const QString &rawText)
{
    const QByteArray bytes = rawText.toUtf8();
    const int byteCount = bytes.length();

    QString result;
    result.reserve(byteCount); // Enough for plain US-ASCII input; grows when escapes are needed

    for (int pos = 0; pos < byteCount; ++pos) {
        const char c = bytes.at(pos);

        const bool isAlphaNumeric = ('A' <= c && c <= 'Z')
                                    || ('a' <= c && c <= 'z')
                                    || ('0' <= c && c <= '9');
        const bool isSafeMark = c == '-' || c == '_'
                                || c == '(' || c == ')'
                                || c == '.' || c == '!'
                                || c == '~' || c == '*'
                                || c == '\'';

        if (isAlphaNumeric || isSafeMark) {
            result.append(c);
            continue;
        }
        if (c == ' ') {
            result.append('+');
            continue;
        }

        // Masking happens after integer promotion, so bytes >= 0x80 (negative
        // when char is signed) still yield nibble values in the range 0-15.
        const ushort highNibble = (c & 0xF0) >> 4;
        const ushort lowNibble = (c & 0x0F);
        result.append('%');
        result.append(QChar(cgi_chars[highNibble]));
        result.append(QChar(cgi_chars[lowNibble]));
    }
    return result;
}

QString CGI::decodeURL(const QString &urlText)
{
    QByteArray dec;
    QString::const_iterator it = urlText.constBegin();
    while (it != urlText.constEnd()) {
        ushort ch = (*it).unicode();
        switch (ch) {
        case '%':
            {
                char c1 = char(0x00ff & (*(++it)).unicode());
                char c2 = char(0x00ff & (*(++it)).unicode());
                ushort v = 0;
                if ('A' <= c1 && c1 <= 'Z')
                    v = c1 - 'A' + 10;
                else if ('a' <= c1 && c1 <= 'z')
                    v = c1 - 'a' + 10;
                else if ('0' <= c1 && c1 <= '9')
                    v = c1 - '0';
                else
                    continue; // Malformed URL!
                v <<= 4; // c1 was MSB half
                if ('A' <= c2 && c2 <= 'Z')
                    v |= c2 - 'A' + 10;
                else if ('a' <= c2 && c2 <= 'z')
                    v |= c2 - 'a' + 10;
                else if ('0' <= c2 && c2 <= '9')
                    v |= c2 - '0';
                else
                    continue; // Malformed URL!
                dec.append((char)v);
            }
            break;
        case '+':
            dec.append(' ');
            break;
        default:
            dec.append(*it);
            break;
        }
        ++it;
    }
    return QString::fromUtf8(dec.constData(), dec.length());
}

// -------------------------------------------------------------------------------------------------
// Maps a UTF-16 code unit to the name of its HTML character entity (without
// the surrounding '&' and ';'), e.g. 0x0026 -> "amp".
// Returns 0 when no named entity is listed for the code unit.
//
// The lookup is a binary search, so the table below MUST stay sorted by
// ascending code value.
inline const char *unicodeToHTML(ushort unicode_char)
{
    struct HtmlEntity {
        ushort code;
        const char *name;
    };

    static const HtmlEntity entityTable[] = {
        // Latin -------------------------------
        { 0x0022, "quot" },     // (34  ) quotation mark = APL quote
        { 0x0026, "amp" },      // (38  ) ampersand
        { 0x003C, "lt" },       // (60  ) less-than sign
        { 0x003E, "gt" },       // (62  ) greater-than sign
        { 0x00A0, "nbsp" },     // (160 ) no-break space = non-breaking space
        { 0x00A1, "iexcl" },    // (161 ) inverted exclamation mark
        { 0x00A2, "cent" },     // (162 ) cent sign
        { 0x00A3, "pound" },    // (163 ) pound sign
        { 0x00A4, "curren" },   // (164 ) currency sign
        { 0x00A5, "yen" },      // (165 ) yen sign = yuan sign
        { 0x00A6, "brvbar" },   // (166 ) broken bar = broken vertical bar
        { 0x00A7, "sect" },     // (167 ) section sign
        { 0x00A8, "uml" },      // (168 ) diaeresis = spacing diaeresis
        { 0x00A9, "copy" },     // (169 ) copyright sign
        { 0x00AA, "ordf" },     // (170 ) feminine ordinal indicator
        { 0x00AB, "laquo" },    // (171 ) left-pointing double angle quotation mark = left pointing guillemet
        { 0x00AC, "not" },      // (172 ) not sign
        { 0x00AD, "shy" },      // (173 ) soft hyphen = discretionary hyphen
        { 0x00AE, "reg" },      // (174 ) registered sign = registered trade mark sign
        { 0x00AF, "macr" },     // (175 ) macron = spacing macron = overline = APL overbar
        { 0x00B0, "deg" },      // (176 ) degree sign
        { 0x00B1, "plusmn" },   // (177 ) plus-minus sign = plus-or-minus sign
        { 0x00B2, "sup2" },     // (178 ) superscript two = superscript digit two = squared
        { 0x00B3, "sup3" },     // (179 ) superscript three = superscript digit three = cubed
        { 0x00B4, "acute" },    // (180 ) acute accent = spacing acute
        { 0x00B5, "micro" },    // (181 ) micro sign
        { 0x00B6, "para" },     // (182 ) pilcrow sign = paragraph sign
        { 0x00B7, "middot" },   // (183 ) middle dot = Georgian comma = Greek middle dot
        { 0x00B8, "cedil" },    // (184 ) cedilla = spacing cedilla
        { 0x00B9, "sup1" },     // (185 ) superscript one = superscript digit one
        { 0x00BA, "ordm" },     // (186 ) masculine ordinal indicator
        { 0x00BB, "raquo" },    // (187 ) right-pointing double angle quotation mark = right pointing guillemet
        { 0x00BC, "frac14" },   // (188 ) vulgar fraction one quarter = fraction one quarter
        { 0x00BD, "frac12" },   // (189 ) vulgar fraction one half = fraction one half
        { 0x00BE, "frac34" },   // (190 ) vulgar fraction three quarters = fraction three quarters
        { 0x00BF, "iquest" },   // (191 ) inverted question mark = turned question mark
        { 0x00C0, "Agrave" },   // (192 ) capital letter A with grave = capital letter A grave
        { 0x00C1, "Aacute" },   // (193 ) capital letter A with acute
        { 0x00C2, "Acirc" },    // (194 ) capital letter A with circumflex
        { 0x00C3, "Atilde" },   // (195 ) capital letter A with tilde
        { 0x00C4, "Auml" },     // (196 ) capital letter A with diaeresis
        { 0x00C5, "Aring" },    // (197 ) capital letter A with ring above = capital letter A ring
        { 0x00C6, "AElig" },    // (198 ) capital letter AE = capital ligature AE
        { 0x00C7, "Ccedil" },   // (199 ) capital letter C with cedilla
        { 0x00C8, "Egrave" },   // (200 ) capital letter E with grave
        { 0x00C9, "Eacute" },   // (201 ) capital letter E with acute
        { 0x00CA, "Ecirc" },    // (202 ) capital letter E with circumflex
        { 0x00CB, "Euml" },     // (203 ) capital letter E with diaeresis
        { 0x00CC, "Igrave" },   // (204 ) capital letter I with grave
        { 0x00CD, "Iacute" },   // (205 ) capital letter I with acute
        { 0x00CE, "Icirc" },    // (206 ) capital letter I with circumflex
        { 0x00CF, "Iuml" },     // (207 ) capital letter I with diaeresis
        { 0x00D0, "ETH" },      // (208 ) capital letter ETH
        { 0x00D1, "Ntilde" },   // (209 ) capital letter N with tilde
        { 0x00D2, "Ograve" },   // (210 ) capital letter O with grave
        { 0x00D3, "Oacute" },   // (211 ) capital letter O with acute
        { 0x00D4, "Ocirc" },    // (212 ) capital letter O with circumflex
        { 0x00D5, "Otilde" },   // (213 ) capital letter O with tilde
        { 0x00D6, "Ouml" },     // (214 ) capital letter O with diaeresis
        { 0x00D7, "times" },    // (215 ) multiplication sign
        { 0x00D8, "Oslash" },   // (216 ) capital letter O with stroke = capital letter O slash
        { 0x00D9, "Ugrave" },   // (217 ) capital letter U with grave
        { 0x00DA, "Uacute" },   // (218 ) capital letter U with acute
        { 0x00DB, "Ucirc" },    // (219 ) capital letter U with circumflex
        { 0x00DC, "Uuml" },     // (220 ) capital letter U with diaeresis
        { 0x00DD, "Yacute" },   // (221 ) capital letter Y with acute
        { 0x00DE, "THORN" },    // (222 ) capital letter THORN
        { 0x00DF, "szlig" },    // (223 ) small letter sharp s = ess-zed
        { 0x00E0, "agrave" },   // (224 ) small letter a with grave = small letter a grave
        { 0x00E1, "aacute" },   // (225 ) small letter a with acute
        { 0x00E2, "acirc" },    // (226 ) small letter a with circumflex
        { 0x00E3, "atilde" },   // (227 ) small letter a with tilde
        { 0x00E4, "auml" },     // (228 ) small letter a with diaeresis
        { 0x00E5, "aring" },    // (229 ) small letter a with ring above = small letter a ring
        { 0x00E6, "aelig" },    // (230 ) small letter ae = small ligature ae
        { 0x00E7, "ccedil" },   // (231 ) small letter c with cedilla
        { 0x00E8, "egrave" },   // (232 ) small letter e with grave
        { 0x00E9, "eacute" },   // (233 ) small letter e with acute
        { 0x00EA, "ecirc" },    // (234 ) small letter e with circumflex
        { 0x00EB, "euml" },     // (235 ) small letter e with diaeresis
        { 0x00EC, "igrave" },   // (236 ) small letter i with grave
        { 0x00ED, "iacute" },   // (237 ) small letter i with acute
        { 0x00EE, "icirc" },    // (238 ) small letter i with circumflex
        { 0x00EF, "iuml" },     // (239 ) small letter i with diaeresis
        { 0x00F0, "eth" },      // (240 ) small letter eth
        { 0x00F1, "ntilde" },   // (241 ) small letter n with tilde
        { 0x00F2, "ograve" },   // (242 ) small letter o with grave
        { 0x00F3, "oacute" },   // (243 ) small letter o with acute
        { 0x00F4, "ocirc" },    // (244 ) small letter o with circumflex
        { 0x00F5, "otilde" },   // (245 ) small letter o with tilde
        { 0x00F6, "ouml" },     // (246 ) small letter o with diaeresis
        { 0x00F7, "divide" },   // (247 ) division sign
        { 0x00F8, "oslash" },   // (248 ) small letter o with stroke = small letter o slash
        { 0x00F9, "ugrave" },   // (249 ) small letter u with grave
        { 0x00FA, "uacute" },   // (250 ) small letter u with acute
        { 0x00FB, "ucirc" },    // (251 ) small letter u with circumflex
        { 0x00FC, "uuml" },     // (252 ) small letter u with diaeresis
        { 0x00FD, "yacute" },   // (253 ) small letter y with acute
        { 0x00FE, "thorn" },    // (254 ) small letter thorn
        { 0x00FF, "yuml" },     // (255 ) small letter y with diaeresis
        { 0x0152, "OElig" },    // (338 ) capital ligature OE
        { 0x0153, "oelig" },    // (339 ) small ligature oe
        { 0x0160, "Scaron" },   // (352 ) capital letter S with caron
        { 0x0161, "scaron" },   // (353 ) small letter s with caron
        { 0x0178, "Yuml" },     // (376 ) capital letter Y with diaeresis
        { 0x0192, "fnof" },     // (402 ) small f with hook = function = florin
        { 0x02C6, "circ" },     // (710 ) modifier letter circumflex accent
        { 0x02DC, "tilde" },    // (732 ) small tilde
        // Greek -------------------------------
        { 0x0391, "Alpha" },    // (913 ) capital letter alpha
        { 0x0392, "Beta" },     // (914 ) capital letter beta
        { 0x0393, "Gamma" },    // (915 ) capital letter gamma
        { 0x0394, "Delta" },    // (916 ) capital letter delta
        { 0x0395, "Epsilon" },  // (917 ) capital letter epsilon
        { 0x0396, "Zeta" },     // (918 ) capital letter zeta
        { 0x0397, "Eta" },      // (919 ) capital letter eta
        { 0x0398, "Theta" },    // (920 ) capital letter theta
        { 0x0399, "Iota" },     // (921 ) capital letter iota
        { 0x039A, "Kappa" },    // (922 ) capital letter kappa
        { 0x039B, "Lambda" },   // (923 ) capital letter lambda
        { 0x039C, "Mu" },       // (924 ) capital letter mu
        { 0x039D, "Nu" },       // (925 ) capital letter nu
        { 0x039E, "Xi" },       // (926 ) capital letter xi
        { 0x039F, "Omicron" },  // (927 ) capital letter omicron
        { 0x03A0, "Pi" },       // (928 ) capital letter pi
        { 0x03A1, "Rho" },      // (929 ) capital letter rho
        { 0x03A3, "Sigma" },    // (931 ) capital letter sigma
        { 0x03A4, "Tau" },      // (932 ) capital letter tau
        { 0x03A5, "Upsilon" },  // (933 ) capital letter upsilon
        { 0x03A6, "Phi" },      // (934 ) capital letter phi
        { 0x03A7, "Chi" },      // (935 ) capital letter chi
        { 0x03A8, "Psi" },      // (936 ) capital letter psi
        { 0x03A9, "Omega" },    // (937 ) capital letter omega
        { 0x03B1, "alpha" },    // (945 ) small letter alpha
        { 0x03B2, "beta" },     // (946 ) small letter beta
        { 0x03B3, "gamma" },    // (947 ) small letter gamma
        { 0x03B4, "delta" },    // (948 ) small letter delta
        { 0x03B5, "epsilon" },  // (949 ) small letter epsilon
        { 0x03B6, "zeta" },     // (950 ) small letter zeta
        { 0x03B7, "eta" },      // (951 ) small letter eta
        { 0x03B8, "theta" },    // (952 ) small letter theta
        { 0x03B9, "iota" },     // (953 ) small letter iota
        { 0x03BA, "kappa" },    // (954 ) small letter kappa
        { 0x03BB, "lambda" },   // (955 ) small letter lambda
        { 0x03BC, "mu" },       // (956 ) small letter mu
        { 0x03BD, "nu" },       // (957 ) small letter nu
        { 0x03BE, "xi" },       // (958 ) small letter xi
        { 0x03BF, "omicron" },  // (959 ) small letter omicron
        { 0x03C0, "pi" },       // (960 ) small letter pi
        { 0x03C1, "rho" },      // (961 ) small letter rho
        { 0x03C2, "sigmaf" },   // (962 ) small letter final sigma
        { 0x03C3, "sigma" },    // (963 ) small letter sigma
        { 0x03C4, "tau" },      // (964 ) small letter tau
        { 0x03C5, "upsilon" },  // (965 ) small letter upsilon
        { 0x03C6, "phi" },      // (966 ) small letter phi
        { 0x03C7, "chi" },      // (967 ) small letter chi
        { 0x03C8, "psi" },      // (968 ) small letter psi
        { 0x03C9, "omega" },    // (969 ) small letter omega
        { 0x03D1, "thetasym" }, // (977 ) small letter theta symbol
        { 0x03D2, "upsih" },    // (978 ) upsilon with hook symbol
        { 0x03D6, "piv" },      // (982 ) pi symbol
        // General Punctuation -----------------
        { 0x2002, "ensp" },     // (8194) en space
        { 0x2003, "emsp" },     // (8195) em space
        { 0x2009, "thinsp" },   // (8201) thin space
        { 0x200C, "zwnj" },     // (8204) zero width non-joiner
        { 0x200D, "zwj" },      // (8205) zero width joiner
        { 0x200E, "lrm" },      // (8206) left-to-right mark
        { 0x200F, "rlm" },      // (8207) right-to-left mark
        { 0x2013, "ndash" },    // (8211) en dash
        { 0x2014, "mdash" },    // (8212) em dash
        { 0x2018, "lsquo" },    // (8216) left single quotation mark
        { 0x2019, "rsquo" },    // (8217) right single quotation mark
        { 0x201A, "sbquo" },    // (8218) single low-9 quotation mark
        { 0x201C, "ldquo" },    // (8220) left double quotation mark
        { 0x201D, "rdquo" },    // (8221) right double quotation mark
        { 0x201E, "bdquo" },    // (8222) double low-9 quotation mark
        { 0x2020, "dagger" },   // (8224) dagger
        { 0x2021, "Dagger" },   // (8225) double dagger
        { 0x2022, "bull" },     // (8226) bullet = black small circle
        { 0x2026, "hellip" },   // (8230) horizontal ellipsis = three dot leader
        { 0x2030, "permil" },   // (8240) per mille sign
        { 0x2032, "prime" },    // (8242) prime = minutes = feet
        { 0x2033, "Prime" },    // (8243) double prime = seconds = inches
        { 0x2039, "lsaquo" },   // (8249) single left-pointing angle quotation mark
        { 0x203A, "rsaquo" },   // (8250) single right-pointing angle quotation mark
        { 0x203E, "oline" },    // (8254) overline = spacing overscore
        { 0x2044, "frasl" },    // (8260) fraction slash
        // Currency Symbols --------------------
        { 0x20AC, "euro" },     // (8364) euro sign
        // Letterlike Symbols ------------------
        { 0x2111, "image" },    // (8465) blackletter capital I = imaginary part
        { 0x2118, "weierp" },   // (8472) script capital P = power set = Weierstrass p
        { 0x211C, "real" },     // (8476) blackletter capital R = real part symbol
        { 0x2122, "trade" },    // (8482) trade mark sign
        { 0x2135, "alefsym" },  // (8501) alef symbol = first transfinite cardinal
        // Arrows ------------------------------
        { 0x2190, "larr" },     // (8592) leftwards arrow
        { 0x2191, "uarr" },     // (8593) upwards arrow
        { 0x2192, "rarr" },     // (8594) rightwards arrow
        { 0x2193, "darr" },     // (8595) downwards arrow
        { 0x2194, "harr" },     // (8596) left right arrow
        { 0x21B5, "crarr" },    // (8629) downwards arrow with corner leftwards = carriage return
        { 0x21D0, "lArr" },     // (8656) leftwards double arrow
        { 0x21D1, "uArr" },     // (8657) upwards double arrow
        { 0x21D2, "rArr" },     // (8658) rightwards double arrow
        { 0x21D3, "dArr" },     // (8659) downwards double arrow
        { 0x21D4, "hArr" },     // (8660) left right double arrow
        // Mathematical Operators --------------
        { 0x2200, "forall" },   // (8704) for all
        { 0x2202, "part" },     // (8706) partial differential
        { 0x2203, "exist" },    // (8707) there exists
        { 0x2205, "empty" },    // (8709) empty set = null set = diameter
        { 0x2207, "nabla" },    // (8711) nabla = backward difference
        { 0x2208, "isin" },     // (8712) element of
        { 0x2209, "notin" },    // (8713) not an element of
        { 0x220B, "ni" },       // (8715) contains as member
        { 0x220F, "prod" },     // (8719) n-ary product = product sign
        { 0x2211, "sum" },      // (8721) n-ary summation
        { 0x2212, "minus" },    // (8722) minus sign
        { 0x2217, "lowast" },   // (8727) asterisk operator
        { 0x221A, "radic" },    // (8730) square root = radical sign
        { 0x221D, "prop" },     // (8733) proportional to
        { 0x221E, "infin" },    // (8734) infinity
        { 0x2220, "ang" },      // (8736) angle
        { 0x2227, "and" },      // (8743) logical and = wedge
        { 0x2228, "or" },       // (8744) logical or = vee
        { 0x2229, "cap" },      // (8745) intersection = cap
        { 0x222A, "cup" },      // (8746) union = cup
        { 0x222B, "int" },      // (8747) integral
        { 0x2234, "there4" },   // (8756) therefore
        { 0x223C, "sim" },      // (8764) tilde operator = varies with = similar to
        { 0x2245, "cong" },     // (8773) approximately equal to
        { 0x2248, "asymp" },    // (8776) almost equal to = asymptotic to
        { 0x2260, "ne" },       // (8800) not equal to
        { 0x2261, "equiv" },    // (8801) identical to
        { 0x2264, "le" },       // (8804) less-than or equal to
        { 0x2265, "ge" },       // (8805) greater-than or equal to
        { 0x2282, "sub" },      // (8834) subset of
        { 0x2283, "sup" },      // (8835) superset of
        { 0x2284, "nsub" },     // (8836) not a subset of
        { 0x2286, "sube" },     // (8838) subset of or equal to
        { 0x2287, "supe" },     // (8839) superset of or equal to
        { 0x2295, "oplus" },    // (8853) circled plus = direct sum
        { 0x2297, "otimes" },   // (8855) circled times = vector product
        { 0x22A5, "perp" },     // (8869) up tack = orthogonal to = perpendicular
        { 0x22C5, "sdot" },     // (8901) dot operator
        // Miscellaneous Technical -------------
        { 0x2308, "lceil" },    // (8968) left ceiling = apl upstile
        { 0x2309, "rceil" },    // (8969) right ceiling
        { 0x230A, "lfloor" },   // (8970) left floor = apl downstile
        { 0x230B, "rfloor" },   // (8971) right floor
        { 0x2329, "lang" },     // (9001) left-pointing angle bracket = bra
        { 0x232A, "rang" },     // (9002) right-pointing angle bracket = ket
        // Geometric Shapes --------------------
        { 0x25CA, "loz" },      // (9674) lozenge
        // Miscellaneous Symbols ---------------
        { 0x2660, "spades" },   // (9824) black spade suit
        { 0x2663, "clubs" },    // (9827) black club suit = shamrock
        { 0x2665, "hearts" },   // (9829) black heart suit = valentine
        { 0x2666, "diams" }     // (9830) black diamond suit
    };

    // Classic binary search over the closed index range [first, last].
    int first = 0;
    int last = int(sizeof(entityTable) / sizeof(entityTable[0])) - 1;
    while (first <= last) {
        const int middle = first + (last - first) / 2;
        const ushort candidate = entityTable[middle].code;
        if (candidate == unicode_char)
            return entityTable[middle].name;
        if (candidate < unicode_char)
            first = middle + 1;
        else
            last = middle - 1;
    }
    return 0; // no named entity for this code unit
}

// Escapes rawText for inclusion in an HTML document.
//
// For every character, the first matching rule wins:
//   1. characters with a named entity (see unicodeToHTML()) become "&name;",
//   2. with CGI::LineBreaks set, '\n' becomes "<BR>\n",
//   3. with CGI::Spaces set, ' ' becomes "&nbsp;",
//   4. with CGI::Tabs set, '\t' becomes eight "&nbsp;",
//   5. a UTF-16 surrogate pair is combined into its code point and written as
//      a single numeric reference "&#N;"; an unpaired surrogate is written as
//      "&#65533;" (U+FFFD), since a numeric reference to a surrogate code
//      point is not valid HTML,
//   6. any other character above U+00FF becomes a numeric reference "&#N;",
//   7. everything else is copied unchanged.
//
// conversionFlags is a bitwise OR of the CGI::LineBreaks / CGI::Spaces /
// CGI::Tabs flags.
QString CGI::encodeHTML(const QString &rawText, int conversionFlags)
{
    const int len = rawText.length();

    QString enc;
    enc.reserve(len); // at least

    for (int i = 0; i < len; ++i) {
        const QChar current = rawText.at(i);
        const ushort code = current.unicode();

        const char *html = unicodeToHTML(code);
        if (html) {
            enc.append('&');
            enc.append(html);
            enc.append(';');
        } else if ((conversionFlags & CGI::LineBreaks) && code == '\n') {
            enc.append("<BR>\n");
        } else if ((conversionFlags & CGI::Spaces) && code == ' ') {
            enc.append("&nbsp;");
        } else if ((conversionFlags & CGI::Tabs) && code == '\t') {
            enc.append("&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;");
        } else if (code >= 0xD800 && code <= 0xDFFF) {
            // Surrogate range: only a high surrogate directly followed by a
            // low surrogate forms a valid character.
            const bool isHigh = code <= 0xDBFF;
            const ushort next = (i + 1 < len) ? rawText.at(i + 1).unicode() : ushort(0);
            if (isHigh && next >= 0xDC00 && next <= 0xDFFF) {
                const unsigned int codePoint = 0x10000u
                    + ((static_cast<unsigned int>(code) - 0xD800u) << 10)
                    + (static_cast<unsigned int>(next) - 0xDC00u);
                enc.append("&#");
                enc.append(QString::number(codePoint));
                enc.append(';');
                ++i; // the low surrogate has been consumed as well
            } else {
                enc.append("&#65533;"); // unpaired surrogate -> REPLACEMENT CHARACTER
            }
        } else if (code > 0x00FF) {
            enc.append("&#");
            enc.append(QString::number(code));
            enc.append(';');
        } else {
            enc.append(current);
        }
    }

    return enc;
}