// tentakelfabrik/tiny-consent


								var TokenStream = require('../common/TokenStream');

								var adoptBuffer = require('../common/adopt-buffer');


								var constants = require('./const');

								var TYPE = constants.TYPE;


								var charCodeDefinitions = require('./char-code-definitions');

								var isNewline = charCodeDefinitions.isNewline;

								var isName = charCodeDefinitions.isName;

								var isValidEscape = charCodeDefinitions.isValidEscape;

								var isNumberStart = charCodeDefinitions.isNumberStart;

								var isIdentifierStart = charCodeDefinitions.isIdentifierStart;

								var charCodeCategory = charCodeDefinitions.charCodeCategory;

								var isBOM = charCodeDefinitions.isBOM;


								var utils = require('./utils');

								var cmpStr = utils.cmpStr;

								var getNewlineLength = utils.getNewlineLength;

								var findWhiteSpaceEnd = utils.findWhiteSpaceEnd;

								var consumeEscaped = utils.consumeEscaped;

								var consumeName = utils.consumeName;

								var consumeNumber = utils.consumeNumber;

								var consumeBadUrlRemnants = utils.consumeBadUrlRemnants;


								// Bit layout of the packed integers built in tokenize(): a token type is shifted
								// left by TYPE_SHIFT bits and OR-ed with an offset (or a token index), which
								// occupies the low 24 bits selected by OFFSET_MASK.
								var OFFSET_MASK = 0x00FFFFFF;

								var TYPE_SHIFT = 24;


								/**
								 * Split `source` into tokens, following the "consume a token" steps of the
								 * CSS Syntax Module Level 3 specification, and record them on a token stream.
								 *
								 * @param {*} source Text to tokenize; coerced with String(), falsy values become ''.
								 * @param {TokenStream} [stream] Stream to fill; a new TokenStream is created when omitted.
								 * @returns {TokenStream} The filled stream, after reset() and next() were called on it.
								 */
								function tokenize(source, stream) {

								    // Code unit at `offset`, or 0 once `offset` is at or past the end of the source.
								    // Note: the parameter shadows the tokenizer's own `offset` cursor.
								    function getCharCode(offset) {

								        return offset < sourceLength ? source.charCodeAt(offset) : 0;

								    }


								    // § 4.3.3. Consume a numeric token
								    // Advances `offset` past a number and whatever directly qualifies it, and sets
								    // `type` to Dimension (number + unit), Percentage (number + '%') or Number.
								    function consumeNumericToken() {
								        // The numeric part always comes first.
								        offset = consumeNumber(source, offset);

								        // Look at the three code points that follow the number.
								        const first = getCharCode(offset);
								        const second = getCharCode(offset + 1);
								        const third = getCharCode(offset + 2);

								        if (isIdentifierStart(first, second, third)) {
								            // An identifier follows: it is the unit of a <dimension-token>.
								            type = TYPE.Dimension;
								            offset = consumeName(source, offset);
								        } else if (first === 0x0025) {
								            // U+0025 PERCENTAGE SIGN (%) follows: <percentage-token>.
								            type = TYPE.Percentage;
								            offset += 1;
								        } else {
								            // Nothing attached: plain <number-token>.
								            type = TYPE.Number;
								        }
								    }


								    // § 4.3.4. Consume an ident-like token
								    // Consumes a name and classifies it: Ident, Function (name followed by "("),
								    // or - for an unquoted url( - delegates to consumeUrlToken().
								    function consumeIdentLikeToken() {
								        const identStart = offset;

								        // Consume the name itself.
								        offset = consumeName(source, offset);

								        const namedUrl = cmpStr(source, identStart, offset, 'url');
								        const opensParen = getCharCode(offset) === 0x0028;

								        // No U+0028 LEFT PARENTHESIS after the name: plain <ident-token>.
								        if (!opensParen) {
								            type = TYPE.Ident;
								            return;
								        }

								        // Any name other than "url" followed by "(": <function-token>, "(" included.
								        if (!namedUrl) {
								            type = TYPE.Function;
								            offset += 1;
								            return;
								        }

								        // "url(": skip the whitespace after the parenthesis and peek at the argument.
								        offset = findWhiteSpaceEnd(source, offset + 1);

								        const argumentStart = getCharCode(offset);

								        if (argumentStart === 0x0022 || argumentStart === 0x0027) {
								            // Quoted argument: emit only "url(" as a <function-token>;
								            // the skipped whitespace and the string are tokenized separately.
								            type = TYPE.Function;
								            offset = identStart + 4;
								            return;
								        }

								        // Unquoted argument: the rest is a url token.
								        consumeUrlToken();
								    }


								    // § 4.3.5. Consume a string token
								    // Scans a string up to its closing code point. When no ending code point is
								    // given, the code point at the current offset is consumed and used as the
								    // terminator. Sets `type` to String, or BadString on an unescaped newline.
								    function consumeStringToken(endingCodePoint) {
								        const terminator = endingCodePoint || getCharCode(offset++);

								        // Assume a well-formed <string-token> until proven otherwise.
								        type = TYPE.String;

								        while (offset < source.length) {
								            const current = source.charCodeAt(offset);
								            const category = charCodeCategory(current);

								            // Closing code point: consume it and finish the <string-token>.
								            if (category === terminator) {
								                offset++;
								                return;
								            }

								            // EOF: parse error, the <string-token> ends here.
								            if (category === charCodeCategory.Eof) {
								                return;
								            }

								            if (category === charCodeCategory.WhiteSpace) {
								                // An unescaped newline is a parse error: the newline is consumed
								                // and the token becomes a <bad-string-token>.
								                if (isNewline(current)) {
								                    offset += getNewlineLength(source, offset, current);
								                    type = TYPE.BadString;
								                    return;
								                }
								            } else if (category === 0x005C && offset !== source.length - 1) {
								                // U+005C REVERSE SOLIDUS (\) that is not the very last code point.
								                const following = getCharCode(offset + 1);

								                if (isNewline(following)) {
								                    // Escaped newline: skip it.
								                    offset += getNewlineLength(source, offset + 1, following);
								                } else if (isValidEscape(current, following)) {
								                    // Valid escape: land on its last code point, the increment
								                    // below then steps past it.
								                    offset = consumeEscaped(source, offset) - 1;
								                }
								            }

								            // Anything else simply belongs to the string's value.
								            offset++;
								        }
								    }


								    // § 4.3.6. Consume a url token
								    // Expects the initial "url(" to be consumed already and the value to be
								    // unquoted, like url(foo); consumeIdentLikeToken() routes quoted values,
								    // like url("foo"), to a <function-token> instead.
								    // Sets `type` to Url, or BadUrl when the value is malformed.
								    function consumeUrlToken() {
								        // Shared exit for every malformed-url case: swallow the remnants
								        // and downgrade the token to a <bad-url-token>.
								        const finishAsBadUrl = function() {
								            offset = consumeBadUrlRemnants(source, offset);
								            type = TYPE.BadUrl;
								        };

								        // Start out as a <url-token> and drop leading whitespace.
								        type = TYPE.Url;
								        offset = findWhiteSpaceEnd(source, offset);

								        while (offset < source.length) {
								            const current = source.charCodeAt(offset);
								            const category = charCodeCategory(current);

								            // U+0029 RIGHT PARENTHESIS ()): consume it, the <url-token> is complete.
								            if (category === 0x0029) {
								                offset++;
								                return;
								            }

								            // EOF: parse error, the <url-token> ends here.
								            if (category === charCodeCategory.Eof) {
								                return;
								            }

								            // Whitespace may only be followed by ")" or the end of input.
								            if (category === charCodeCategory.WhiteSpace) {
								                offset = findWhiteSpaceEnd(source, offset);

								                if (offset >= source.length) {
								                    // End of input after the whitespace (a parse error).
								                    return;
								                }

								                if (getCharCode(offset) === 0x0029) {
								                    offset++;
								                    return;
								                }

								                finishAsBadUrl();
								                return;
								            }

								            // U+0022 QUOTATION MARK ("), U+0027 APOSTROPHE ('),
								            // U+0028 LEFT PARENTHESIS (() and non-printable code points
								            // are parse errors inside an unquoted url.
								            if (category === 0x0022 ||
								                category === 0x0027 ||
								                category === 0x0028 ||
								                category === charCodeCategory.NonPrintable) {
								                finishAsBadUrl();
								                return;
								            }

								            // U+005C REVERSE SOLIDUS (\): must start a valid escape.
								            if (category === 0x005C) {
								                if (!isValidEscape(current, getCharCode(offset + 1))) {
								                    finishAsBadUrl();
								                    return;
								                }

								                // Land on the last code point of the escape; the increment
								                // below then steps past it.
								                offset = consumeEscaped(source, offset) - 1;
								            }

								            // Anything else simply belongs to the url's value.
								            offset++;
								        }
								    }


								    // Create a stream on demand so the caller may omit the second argument.
								    if (!stream) {

								        stream = new TokenStream();

								    }


								    // ensure source is a string

								    source = String(source || '');


								    // The variables below are shared with the nested consume*() helpers through
								    // the closure: `offset` is the scan cursor, `sourceLength` bounds getCharCode().

								    var sourceLength = source.length;

								    var offsetAndType = adoptBuffer(stream.offsetAndType, sourceLength + 1); // +1 because of eof-token

								    var balance = adoptBuffer(stream.balance, sourceLength + 1);

								    var tokenCount = 0;

								    // The result of isBOM() is used as the initial offset, so it is presumably the
								    // number of leading code units to skip - confirm in char-code-definitions.
								    var start = isBOM(getCharCode(0));

								    var offset = start;

								    // Bracket balancing state:
								    //   balanceCloseType - token type that closes the innermost open bracket (0 when none)
								    //   balanceStart     - that close type packed with the index of the opening token
								    //   balancePrev      - scratch index used while linking or unlinking a pair
								    var balanceCloseType = 0;

								    var balanceStart = 0;

								    var balancePrev = 0;


								    // https://drafts.csswg.org/css-syntax-3/#consume-token

								    // § 4.3.1. Consume a token

								    // Each iteration consumes exactly one token: the first switch decides the token
								    // type and moves `offset` past it, the second switch maintains the bracket
								    // balance table, and finally the token is appended to offsetAndType.

								    while (offset < sourceLength) {

								        var code = source.charCodeAt(offset);

								        var type = 0;


								        // sourceLength marks "no balancing token known (yet)" for this token.

								        balance[tokenCount] = sourceLength;


								        switch (charCodeCategory(code)) {

								            // whitespace

								            case charCodeCategory.WhiteSpace:

								                // Consume as much whitespace as possible. Return a <whitespace-token>.

								                type = TYPE.WhiteSpace;

								                offset = findWhiteSpaceEnd(source, offset + 1);

								                break;


								            // U+0022 QUOTATION MARK (")

								            case 0x0022:

								                // Consume a string token and return it.

								                consumeStringToken();

								                break;


								            // U+0023 NUMBER SIGN (#)

								            case 0x0023:

								                // If the next input code point is a name code point or the next two input code points are a valid escape, then:

								                if (isName(getCharCode(offset + 1)) || isValidEscape(getCharCode(offset + 1), getCharCode(offset + 2))) {

								                    // Create a <hash-token>.

								                    type = TYPE.Hash;


								                    // If the next 3 input code points would start an identifier, set the <hash-token>’s type flag to "id".

								                    // if (isIdentifierStart(getCharCode(offset + 1), getCharCode(offset + 2), getCharCode(offset + 3))) {

								                    //     // TODO: set id flag

								                    // }


								                    // Consume a name, and set the <hash-token>’s value to the returned string.

								                    offset = consumeName(source, offset + 1);


								                    // Return the <hash-token>.

								                } else {

								                    // Otherwise, return a <delim-token> with its value set to the current input code point.

								                    type = TYPE.Delim;

								                    offset++;

								                }


								                break;


								            // U+0027 APOSTROPHE (')

								            case 0x0027:

								                // Consume a string token and return it.

								                consumeStringToken();

								                break;


								            // U+0028 LEFT PARENTHESIS (()

								            case 0x0028:

								                // Return a <(-token>.

								                type = TYPE.LeftParenthesis;

								                offset++;

								                break;


								            // U+0029 RIGHT PARENTHESIS ())

								            case 0x0029:

								                // Return a <)-token>.

								                type = TYPE.RightParenthesis;

								                offset++;

								                break;


								            // U+002B PLUS SIGN (+)

								            case 0x002B:

								                // If the input stream starts with a number, ...

								                if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {

								                    // ... reconsume the current input code point, consume a numeric token, and return it.

								                    consumeNumericToken();

								                } else {

								                    // Otherwise, return a <delim-token> with its value set to the current input code point.

								                    type = TYPE.Delim;

								                    offset++;

								                }

								                break;


								            // U+002C COMMA (,)

								            case 0x002C:

								                // Return a <comma-token>.

								                type = TYPE.Comma;

								                offset++;

								                break;


								            // U+002D HYPHEN-MINUS (-)

								            case 0x002D:

								                // If the input stream starts with a number, reconsume the current input code point, consume a numeric token, and return it.

								                if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {

								                    consumeNumericToken();

								                } else {

								                    // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS U+003E GREATER-THAN SIGN (->), consume them and return a <CDC-token>.

								                    if (getCharCode(offset + 1) === 0x002D &&

								                        getCharCode(offset + 2) === 0x003E) {

								                        type = TYPE.CDC;

								                        offset = offset + 3;

								                    } else {

								                        // Otherwise, if the input stream starts with an identifier, ...

								                        if (isIdentifierStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {

								                            // ... reconsume the current input code point, consume an ident-like token, and return it.

								                            consumeIdentLikeToken();

								                        } else {

								                            // Otherwise, return a <delim-token> with its value set to the current input code point.

								                            type = TYPE.Delim;

								                            offset++;

								                        }

								                    }

								                }

								                break;


								            // U+002E FULL STOP (.)

								            case 0x002E:

								                // If the input stream starts with a number, ...

								                if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {

								                    // ... reconsume the current input code point, consume a numeric token, and return it.

								                    consumeNumericToken();

								                } else {

								                    // Otherwise, return a <delim-token> with its value set to the current input code point.

								                    type = TYPE.Delim;

								                    offset++;

								                }


								                break;


								            // U+002F SOLIDUS (/)

								            case 0x002F:

								                // If the current code point, U+002F SOLIDUS (/), is followed by a U+002A ASTERISK (*), ...

								                if (getCharCode(offset + 1) === 0x002A) {

								                    // ... consume them and all following code points up to and including the first U+002A ASTERISK (*)

								                    // followed by a U+002F SOLIDUS (/), or up to an EOF code point.

								                    type = TYPE.Comment;

								                    offset = source.indexOf('*/', offset + 2) + 2;

								                    // indexOf() returned -1 (no closing "*/"), so -1 + 2 === 1:
								                    // the comment runs to the end of the source.

								                    if (offset === 1) {

								                        offset = source.length;

								                    }

								                } else {

								                    type = TYPE.Delim;

								                    offset++;

								                }

								                break;


								            // U+003A COLON (:)

								            case 0x003A:

								                // Return a <colon-token>.

								                type = TYPE.Colon;

								                offset++;

								                break;


								            // U+003B SEMICOLON (;)

								            case 0x003B:

								                // Return a <semicolon-token>.

								                type = TYPE.Semicolon;

								                offset++;

								                break;


								            // U+003C LESS-THAN SIGN (<)

								            case 0x003C:

								                // If the next 3 input code points are U+0021 EXCLAMATION MARK U+002D HYPHEN-MINUS U+002D HYPHEN-MINUS (!--), ...

								                if (getCharCode(offset + 1) === 0x0021 &&

								                    getCharCode(offset + 2) === 0x002D &&

								                    getCharCode(offset + 3) === 0x002D) {

								                    // ... consume them and return a <CDO-token>.

								                    type = TYPE.CDO;

								                    offset = offset + 4;

								                } else {

								                    // Otherwise, return a <delim-token> with its value set to the current input code point.

								                    type = TYPE.Delim;

								                    offset++;

								                }


								                break;


								            // U+0040 COMMERCIAL AT (@)

								            case 0x0040:

								                // If the next 3 input code points would start an identifier, ...

								                if (isIdentifierStart(getCharCode(offset + 1), getCharCode(offset + 2), getCharCode(offset + 3))) {

								                    // ... consume a name, create an <at-keyword-token> with its value set to the returned value, and return it.

								                    type = TYPE.AtKeyword;

								                    offset = consumeName(source, offset + 1);

								                } else {

								                    // Otherwise, return a <delim-token> with its value set to the current input code point.

								                    type = TYPE.Delim;

								                    offset++;

								                }


								                break;


								            // U+005B LEFT SQUARE BRACKET ([)

								            case 0x005B:

								                // Return a <[-token>.

								                type = TYPE.LeftSquareBracket;

								                offset++;

								                break;


								            // U+005C REVERSE SOLIDUS (\)

								            case 0x005C:

								                // If the input stream starts with a valid escape, ...

								                if (isValidEscape(code, getCharCode(offset + 1))) {

								                    // ... reconsume the current input code point, consume an ident-like token, and return it.

								                    consumeIdentLikeToken();

								                } else {

								                    // Otherwise, this is a parse error. Return a <delim-token> with its value set to the current input code point.

								                    type = TYPE.Delim;

								                    offset++;

								                }

								                break;


								            // U+005D RIGHT SQUARE BRACKET (])

								            case 0x005D:

								                // Return a <]-token>.

								                type = TYPE.RightSquareBracket;

								                offset++;

								                break;


								            // U+007B LEFT CURLY BRACKET ({)

								            case 0x007B:

								                // Return a <{-token>.

								                type = TYPE.LeftCurlyBracket;

								                offset++;

								                break;


								            // U+007D RIGHT CURLY BRACKET (})

								            case 0x007D:

								                // Return a <}-token>.

								                type = TYPE.RightCurlyBracket;

								                offset++;

								                break;


								            // digit

								            case charCodeCategory.Digit:

								                // Reconsume the current input code point, consume a numeric token, and return it.

								                consumeNumericToken();

								                break;


								            // name-start code point

								            case charCodeCategory.NameStart:

								                // Reconsume the current input code point, consume an ident-like token, and return it.

								                consumeIdentLikeToken();

								                break;


								            // EOF

								            case charCodeCategory.Eof:

								                // Return an <EOF-token>.

								                break;


								            // anything else

								            default:

								                // Return a <delim-token> with its value set to the current input code point.

								                type = TYPE.Delim;

								                offset++;

								        }


								        // Bracket balancing. Open brackets form a chain: balanceStart packs the
								        // innermost opener's index with the type that closes it, and while a
								        // bracket is open its balance[] slot holds the previous balanceStart.

								        switch (type) {

								            // The current token closes the innermost open bracket.

								            case balanceCloseType:

								                // Index of the opening token.

								                balancePrev = balanceStart & OFFSET_MASK;

								                // Make the enclosing bracket (if any) the innermost one again.

								                balanceStart = balance[balancePrev];

								                balanceCloseType = balanceStart >> TYPE_SHIFT;

								                // Link the pair: closer -> opener and opener -> closer.

								                balance[tokenCount] = balancePrev;

								                balance[balancePrev++] = tokenCount;

								                // Tokens inside the pair that still carry the sourceLength
								                // marker get the index of this closing token.

								                for (; balancePrev < tokenCount; balancePrev++) {

								                    if (balance[balancePrev] === sourceLength) {

								                        balance[balancePrev] = tokenCount;

								                    }

								                }

								                break;


								            // Openers: remember the previous chain head in this token's slot,
								            // then make this token the new head together with its closing type.

								            case TYPE.LeftParenthesis:

								            case TYPE.Function:

								                balance[tokenCount] = balanceStart;

								                balanceCloseType = TYPE.RightParenthesis;

								                balanceStart = (balanceCloseType << TYPE_SHIFT) | tokenCount;

								                break;


								            case TYPE.LeftSquareBracket:

								                balance[tokenCount] = balanceStart;

								                balanceCloseType = TYPE.RightSquareBracket;

								                balanceStart = (balanceCloseType << TYPE_SHIFT) | tokenCount;

								                break;


								            case TYPE.LeftCurlyBracket:

								                balance[tokenCount] = balanceStart;

								                balanceCloseType = TYPE.RightCurlyBracket;

								                balanceStart = (balanceCloseType << TYPE_SHIFT) | tokenCount;

								                break;

								        }


								        // Record the token: its type packed with the offset just past its last code unit.

								        offsetAndType[tokenCount++] = (type << TYPE_SHIFT) | offset;

								    }


								    // finalize buffers

								    offsetAndType[tokenCount] = (TYPE.EOF << TYPE_SHIFT) | offset; // <EOF-token>

								    balance[tokenCount] = sourceLength;

								    balance[sourceLength] = sourceLength; // prevents false positive balance match with any token

								    // Brackets that were never closed are still chained through balanceStart:
								    // walk the chain and mark each opener with sourceLength (no matching closer).

								    while (balanceStart !== 0) {

								        balancePrev = balanceStart & OFFSET_MASK;

								        balanceStart = balance[balancePrev];

								        balance[balancePrev] = sourceLength;

								    }


								    // update stream

								    stream.source = source;

								    // Offset of the first tokenized code unit (the value obtained from isBOM()).

								    stream.firstCharOffset = start;

								    stream.offsetAndType = offsetAndType;

								    stream.tokenCount = tokenCount;

								    stream.balance = balance;

								    // Hand the new buffers to the stream: reset() followed by next() is called
								    // before the stream is returned (their effect is defined by TokenStream).

								    stream.reset();

								    stream.next();


								    return stream;

								}


								// Expose the constants, the char-code definitions and the utils helpers as
								// static members of tokenize. Sources are applied in this order, so a key
								// defined by a later source overwrites the same key from an earlier one.
								[constants, charCodeDefinitions, utils].forEach(function(extension) {
								    Object.keys(extension).forEach(function(name) {
								        tokenize[name] = extension[name];
								    });
								});

								module.exports = tokenize;