You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

591 lines
23 KiB

4 years ago
  1. var TokenStream = require('../common/TokenStream');
  2. var adoptBuffer = require('../common/adopt-buffer');
  3. var constants = require('./const');
  4. var TYPE = constants.TYPE;
  5. var charCodeDefinitions = require('./char-code-definitions');
  6. var isNewline = charCodeDefinitions.isNewline;
  7. var isName = charCodeDefinitions.isName;
  8. var isValidEscape = charCodeDefinitions.isValidEscape;
  9. var isNumberStart = charCodeDefinitions.isNumberStart;
  10. var isIdentifierStart = charCodeDefinitions.isIdentifierStart;
  11. var charCodeCategory = charCodeDefinitions.charCodeCategory;
  12. var isBOM = charCodeDefinitions.isBOM;
  13. var utils = require('./utils');
  14. var cmpStr = utils.cmpStr;
  15. var getNewlineLength = utils.getNewlineLength;
  16. var findWhiteSpaceEnd = utils.findWhiteSpaceEnd;
  17. var consumeEscaped = utils.consumeEscaped;
  18. var consumeName = utils.consumeName;
  19. var consumeNumber = utils.consumeNumber;
  20. var consumeBadUrlRemnants = utils.consumeBadUrlRemnants;
  // Packed token entries have the form `(type << TYPE_SHIFT) | offset`:
  // the token type sits above bit 24, the offset occupies the low 24 bits.
  var TYPE_SHIFT = 24;
  var OFFSET_MASK = (1 << TYPE_SHIFT) - 1; // 0x00FFFFFF
  /**
   * Tokenize CSS text following the "Consume a token" algorithm of CSS Syntax
   * Level 3 (https://drafts.csswg.org/css-syntax-3/#consume-token).
   *
   * Results are written into two buffers obtained through `adoptBuffer`:
   *  - `offsetAndType[i]` is `(type << TYPE_SHIFT) | offset`, where `offset` is
   *    the source position right after token `i`; one extra entry of type
   *    `TYPE.EOF` is appended after the last token.
   *  - `balance[i]` links brackets: a matched opener holds the index of its
   *    closer and the closer holds the index of its opener. Tokens inside a
   *    closed pair that have no partner of their own hold the index of that
   *    pair's closer. Everything else (including openers that were never
   *    closed) holds `source.length`.
   *
   * @param {*} source Text to tokenize; converted with `String(source || '')`.
   * @param {Object} [stream] Stream to fill. A new `TokenStream` is created
   *   when the argument is falsy.
   * @returns {Object} The same stream with `source`, `firstCharOffset`,
   *   `offsetAndType`, `tokenCount` and `balance` assigned, after `reset()`
   *   and `next()` have been called on it.
   */
  function tokenize(source, stream) {
      // Code unit at `index`, or 0 when `index` is not before the end of source.
      function getCharCode(index) {
          if (index < sourceLength) {
              return source.charCodeAt(index);
          }

          return 0;
      }

      // Produce a token of `tokenType` that spans exactly one code unit.
      function consumeSingle(tokenType) {
          type = tokenType;
          offset++;
      }

      // Skip the rest of a malformed url and mark the token as <bad-url-token>.
      function consumeBadUrl() {
          offset = consumeBadUrlRemnants(source, offset);
          type = TYPE.BadUrl;
      }

      // Remember the current token as an open bracket waiting for `closeType`.
      // The previous "open" state is saved in the token's own balance slot so
      // that it can be restored once the bracket is closed.
      function openBalance(closeType) {
          balance[tokenCount] = balanceStart;
          balanceCloseType = closeType;
          balanceStart = (balanceCloseType << TYPE_SHIFT) | tokenCount;
      }

      // Pair the current token with the most recent open bracket and restore
      // the bracket state that was active before that opener.
      function closeBalance() {
          var opener = balanceStart & OFFSET_MASK;

          balanceStart = balance[opener];
          balanceCloseType = balanceStart >> TYPE_SHIFT;
          balance[tokenCount] = opener;
          balance[opener] = tokenCount;

          // tokens in between that are still unlinked now point at the closer
          for (var i = opener + 1; i < tokenCount; i++) {
              if (balance[i] === sourceLength) {
                  balance[i] = tokenCount;
              }
          }
      }

      // § 4.3.3. Consume a numeric token
      function consumeNumericToken() {
          offset = consumeNumber(source, offset);

          var next = getCharCode(offset);

          if (isIdentifierStart(next, getCharCode(offset + 1), getCharCode(offset + 2))) {
              // number followed by a name is a <dimension-token>
              type = TYPE.Dimension;
              offset = consumeName(source, offset);
          } else if (next === 0x0025) {
              // number followed by "%" is a <percentage-token>
              type = TYPE.Percentage;
              offset++;
          } else {
              type = TYPE.Number;
          }
      }

      // § 4.3.4. Consume an ident-like token
      function consumeIdentLikeToken() {
          var nameStart = offset;

          offset = consumeName(source, offset);

          var isUrlName = cmpStr(source, nameStart, offset, 'url');
          var hasOpenParen = getCharCode(offset) === 0x0028;

          // a name that is not followed by "(" is an <ident-token>
          if (!hasOpenParen) {
              type = TYPE.Ident;
              return;
          }

          // any name but "url" followed by "(" is a <function-token>
          if (!isUrlName) {
              type = TYPE.Function;
              offset++;
              return;
          }

          // "url(": look past the whitespace to see whether the value is quoted
          offset = findWhiteSpaceEnd(source, offset + 1);

          var afterSpace = getCharCode(offset);

          if (afterSpace === 0x0022 || afterSpace === 0x0027) {
              // quoted value: the token is just "url(" (4 code units), the
              // whitespace and the string are tokenized separately afterwards
              type = TYPE.Function;
              offset = nameStart + 4;
              return;
          }

          consumeUrlToken();
      }

      // § 4.3.5. Consume a string token
      // When no ending code point is passed, the current code unit is taken
      // as the quote and skipped.
      function consumeStringToken(endingCodePoint) {
          if (!endingCodePoint) {
              endingCodePoint = getCharCode(offset++);
          }

          type = TYPE.String;

          for (; offset < sourceLength; offset++) {
              var code = source.charCodeAt(offset);
              var category = charCodeCategory(code);

              // closing quote: include it in the token and finish
              if (category === endingCodePoint) {
                  offset++;
                  return;
              }

              // EOF: parse error, the token ends here
              if (category === charCodeCategory.Eof) {
                  return;
              }

              if (category === charCodeCategory.WhiteSpace) {
                  if (isNewline(code)) {
                      // unescaped newline: parse error, the newline is taken
                      // into the token which becomes a <bad-string-token>
                      offset += getNewlineLength(source, offset, code);
                      type = TYPE.BadString;
                      return;
                  }

                  continue;
              }

              // everything except a backslash is plain string content
              if (category !== 0x005C) {
                  continue;
              }

              // backslash as the very last code unit: nothing to escape
              if (offset === sourceLength - 1) {
                  continue;
              }

              var nextCode = getCharCode(offset + 1);

              if (isNewline(nextCode)) {
                  // escaped newline: skip it (the loop step moves past its last unit)
                  offset += getNewlineLength(source, offset + 1, nextCode);
              } else if (isValidEscape(code, nextCode)) {
                  // "- 1" compensates for the increment done by the loop step
                  offset = consumeEscaped(source, offset) - 1;
              }
          }
      }

      // § 4.3.6. Consume a url token
      // Expects "url(" to be consumed already and the value to be unquoted;
      // quoted values are handled by consumeIdentLikeToken().
      function consumeUrlToken() {
          type = TYPE.Url;
          offset = findWhiteSpaceEnd(source, offset);

          for (; offset < sourceLength; offset++) {
              var code = source.charCodeAt(offset);
              var category = charCodeCategory(code);

              // ")" ends the url
              if (category === 0x0029) {
                  offset++;
                  return;
              }

              // EOF: parse error, the token ends here
              if (category === charCodeCategory.Eof) {
                  return;
              }

              if (category === charCodeCategory.WhiteSpace) {
                  offset = findWhiteSpaceEnd(source, offset);

                  // whitespace may only be followed by EOF (a parse error) ...
                  if (offset >= sourceLength) {
                      return;
                  }

                  // ... or by the closing ")"
                  if (getCharCode(offset) === 0x0029) {
                      offset++;
                      return;
                  }

                  consumeBadUrl();
                  return;
              }

              // quotes, "(" and non-printable code points are not allowed
              // in an unquoted url
              if (category === 0x0022 ||
                  category === 0x0027 ||
                  category === 0x0028 ||
                  category === charCodeCategory.NonPrintable) {
                  consumeBadUrl();
                  return;
              }

              if (category === 0x005C) {
                  if (isValidEscape(code, getCharCode(offset + 1))) {
                      // "- 1" compensates for the increment done by the loop step
                      offset = consumeEscaped(source, offset) - 1;
                      continue;
                  }

                  // a backslash that does not start a valid escape
                  consumeBadUrl();
                  return;
              }

              // anything else is a part of the url value
          }
      }

      if (!stream) {
          stream = new TokenStream();
      }

      // ensure source is a string
      source = String(source || '');

      var sourceLength = source.length;
      var offsetAndType = adoptBuffer(stream.offsetAndType, sourceLength + 1); // +1 for the eof-token
      var balance = adoptBuffer(stream.balance, sourceLength + 1);
      var tokenCount = 0;
      var start = isBOM(getCharCode(0));
      var offset = start;
      var balanceCloseType = 0;
      var balanceStart = 0;
      var code;
      var type;

      // § 4.3.1. Consume a token
      while (offset < sourceLength) {
          code = source.charCodeAt(offset);
          type = 0;
          balance[tokenCount] = sourceLength;

          switch (charCodeCategory(code)) {
              // whitespace: the whole run becomes one <whitespace-token>
              case charCodeCategory.WhiteSpace:
                  type = TYPE.WhiteSpace;
                  offset = findWhiteSpaceEnd(source, offset + 1);
                  break;

              // U+0022 QUOTATION MARK (") and U+0027 APOSTROPHE (')
              case 0x0022:
              case 0x0027:
                  consumeStringToken();
                  break;

              // U+0023 NUMBER SIGN (#)
              case 0x0023:
                  if (isName(getCharCode(offset + 1)) ||
                      isValidEscape(getCharCode(offset + 1), getCharCode(offset + 2))) {
                      // <hash-token>; its "id" type flag is not tracked
                      type = TYPE.Hash;
                      offset = consumeName(source, offset + 1);
                  } else {
                      consumeSingle(TYPE.Delim);
                  }
                  break;

              // U+0028 LEFT PARENTHESIS (()
              case 0x0028:
                  consumeSingle(TYPE.LeftParenthesis);
                  break;

              // U+0029 RIGHT PARENTHESIS ())
              case 0x0029:
                  consumeSingle(TYPE.RightParenthesis);
                  break;

              // U+002B PLUS SIGN (+) and U+002E FULL STOP (.):
              // either the beginning of a number or a <delim-token>
              case 0x002B:
              case 0x002E:
                  if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
                      consumeNumericToken();
                  } else {
                      consumeSingle(TYPE.Delim);
                  }
                  break;

              // U+002C COMMA (,)
              case 0x002C:
                  consumeSingle(TYPE.Comma);
                  break;

              // U+002D HYPHEN-MINUS (-): a number, "-->", an identifier or a delim
              case 0x002D:
                  if (isNumberStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
                      consumeNumericToken();
                  } else if (getCharCode(offset + 1) === 0x002D &&
                             getCharCode(offset + 2) === 0x003E) {
                      type = TYPE.CDC;
                      offset += 3;
                  } else if (isIdentifierStart(code, getCharCode(offset + 1), getCharCode(offset + 2))) {
                      consumeIdentLikeToken();
                  } else {
                      consumeSingle(TYPE.Delim);
                  }
                  break;

              // U+002F SOLIDUS (/)
              case 0x002F:
                  if (getCharCode(offset + 1) === 0x002A) {
                      // a comment lasts up to and including the first "*/",
                      // or up to the end of source when it is never closed
                      var commentEnd = source.indexOf('*/', offset + 2);

                      type = TYPE.Comment;
                      offset = commentEnd === -1 ? sourceLength : commentEnd + 2;
                  } else {
                      consumeSingle(TYPE.Delim);
                  }
                  break;

              // U+003A COLON (:)
              case 0x003A:
                  consumeSingle(TYPE.Colon);
                  break;

              // U+003B SEMICOLON (;)
              case 0x003B:
                  consumeSingle(TYPE.Semicolon);
                  break;

              // U+003C LESS-THAN SIGN (<): "<!--" is a <CDO-token>
              case 0x003C:
                  if (getCharCode(offset + 1) === 0x0021 &&
                      getCharCode(offset + 2) === 0x002D &&
                      getCharCode(offset + 3) === 0x002D) {
                      type = TYPE.CDO;
                      offset += 4;
                  } else {
                      consumeSingle(TYPE.Delim);
                  }
                  break;

              // U+0040 COMMERCIAL AT (@)
              case 0x0040:
                  if (isIdentifierStart(getCharCode(offset + 1), getCharCode(offset + 2), getCharCode(offset + 3))) {
                      type = TYPE.AtKeyword;
                      offset = consumeName(source, offset + 1);
                  } else {
                      consumeSingle(TYPE.Delim);
                  }
                  break;

              // U+005B LEFT SQUARE BRACKET ([)
              case 0x005B:
                  consumeSingle(TYPE.LeftSquareBracket);
                  break;

              // U+005C REVERSE SOLIDUS (\): a valid escape starts an ident-like
              // token, otherwise it is a parse error and a <delim-token>
              case 0x005C:
                  if (isValidEscape(code, getCharCode(offset + 1))) {
                      consumeIdentLikeToken();
                  } else {
                      consumeSingle(TYPE.Delim);
                  }
                  break;

              // U+005D RIGHT SQUARE BRACKET (])
              case 0x005D:
                  consumeSingle(TYPE.RightSquareBracket);
                  break;

              // U+007B LEFT CURLY BRACKET ({)
              case 0x007B:
                  consumeSingle(TYPE.LeftCurlyBracket);
                  break;

              // U+007D RIGHT CURLY BRACKET (})
              case 0x007D:
                  consumeSingle(TYPE.RightCurlyBracket);
                  break;

              // digit
              case charCodeCategory.Digit:
                  consumeNumericToken();
                  break;

              // name-start code point
              case charCodeCategory.NameStart:
                  consumeIdentLikeToken();
                  break;

              // EOF
              // NOTE(review): nothing is consumed here, so the loop relies on
              // charCodeCategory() not reporting Eof for a code unit inside
              // the source - confirm against char-code-definitions
              case charCodeCategory.Eof:
                  break;

              // anything else
              default:
                  consumeSingle(TYPE.Delim);
          }

          // bracket balance bookkeeping; the check for an expected closer
          // goes first, as it has priority over the opener checks
          if (type === balanceCloseType) {
              closeBalance();
          } else if (type === TYPE.LeftParenthesis || type === TYPE.Function) {
              openBalance(TYPE.RightParenthesis);
          } else if (type === TYPE.LeftSquareBracket) {
              openBalance(TYPE.RightSquareBracket);
          } else if (type === TYPE.LeftCurlyBracket) {
              openBalance(TYPE.RightCurlyBracket);
          }

          offsetAndType[tokenCount++] = (type << TYPE_SHIFT) | offset;
      }

      // finalize buffers
      offsetAndType[tokenCount] = (TYPE.EOF << TYPE_SHIFT) | offset; // <EOF-token>
      balance[tokenCount] = sourceLength;
      balance[sourceLength] = sourceLength; // prevents false positive balance match with any token

      // openers which were never closed get the "no pair" value
      while (balanceStart !== 0) {
          var unclosed = balanceStart & OFFSET_MASK;

          balanceStart = balance[unclosed];
          balance[unclosed] = sourceLength;
      }

      // update stream
      stream.source = source;
      stream.firstCharOffset = start;
      stream.offsetAndType = offsetAndType;
      stream.tokenCount = tokenCount;
      stream.balance = balance;
      stream.reset();
      stream.next();

      return stream;
  }
  // Expose everything exported by the constants, char code definitions and
  // utils modules as static members of the tokenizer function. The objects
  // are copied in this order, so on a name clash the later one wins.
  [constants, charCodeDefinitions, utils].forEach(function(statics) {
      Object.keys(statics).forEach(function(name) {
          tokenize[name] = statics[name];
      });
  });

  module.exports = tokenize;