// Parser.js (17 KB)
  1. var Tokenizer_js_1 = require('./Tokenizer.js');
  2. var decode_js_1 = require('./entities/decode.js');
  /*
   * Lookup tables used by the parser's HTML-mode tag handling.
   * None of them are consulted for implied closes / void elements when
   * `options.xmlMode` is set (see `emitOpenTag` and `isVoidElement`).
   */

  /** Open elements that are implicitly closed when a form control start tag is seen. */
  var formTags = new Set(['input', 'option', 'optgroup', 'select', 'button', 'datalist', 'textarea']);
  /** Shared single-entry set: an open `<p>` is closed by every tag mapped to it below. */
  var pTag = new Set(['p']);
  /** Table sections closed by a following `<tbody>` / `<tfoot>`. */
  var tableSectionTags = new Set(['thead', 'tbody']);
  /** `<dd>` / `<dt>` close each other. */
  var ddtTags = new Set(['dd', 'dt']);
  /** `<rt>` / `<rp>` close each other. */
  var rtpTags = new Set(['rt', 'rp']);

  /**
   * Maps the name of a tag being opened to the set of tag names that are
   * popped from the top of the open-element stack (and reported as
   * implicitly closed) before the new tag is pushed.
   */
  var openImpliesClose = new Map([
    ['tr', new Set(['tr', 'th', 'td'])],
    ['th', new Set(['th'])],
    ['td', new Set(['thead', 'th', 'td'])],
    ['body', new Set(['head', 'link', 'script'])],
    ['li', new Set(['li'])],
    ['p', pTag],
    ['h1', pTag],
    ['h2', pTag],
    ['h3', pTag],
    ['h4', pTag],
    ['h5', pTag],
    ['h6', pTag],
    ['select', formTags],
    ['input', formTags],
    ['output', formTags],
    ['button', formTags],
    ['datalist', formTags],
    ['textarea', formTags],
    ['option', new Set(['option'])],
    ['optgroup', new Set(['optgroup', 'option'])],
    ['dd', ddtTags],
    ['dt', ddtTags],
    ['address', pTag],
    ['article', pTag],
    ['aside', pTag],
    ['blockquote', pTag],
    ['details', pTag],
    ['div', pTag],
    ['dl', pTag],
    ['fieldset', pTag],
    ['figcaption', pTag],
    ['figure', pTag],
    ['footer', pTag],
    ['form', pTag],
    ['header', pTag],
    ['hr', pTag],
    ['main', pTag],
    ['nav', pTag],
    ['ol', pTag],
    ['pre', pTag],
    ['section', pTag],
    ['table', pTag],
    ['ul', pTag],
    ['rt', rtpTags],
    ['rp', rtpTags],
    ['tbody', tableSectionTags],
    ['tfoot', tableSectionTags]
  ]);

  /** Elements that never get pushed on the open-element stack in HTML mode. */
  var voidElements = new Set([
    'area',
    'base',
    'basefont',
    'br',
    'col',
    'command',
    'embed',
    'frame',
    'hr',
    'img',
    'input',
    'isindex',
    'keygen',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr'
  ]);

  /** Opening one of these pushes `true` onto the parser's `foreignContext` stack. */
  var foreignContextElements = new Set(['math', 'svg']);
  /** Opening one of these pushes `false` onto the parser's `foreignContext` stack. */
  var htmlIntegrationElements = new Set(['mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml', 'foreignobject', 'desc', 'title']);

  /** Matches the first character after the name of a declaration / processing instruction. */
  var reNameEnd = /\s|\//;
  /**
   * Event-driven parser front end.
   *
   * A tokenizer (by default `Tokenizer_js_1.default`, overridable through
   * `options.Tokenizer`) is constructed with this parser as its callback
   * object and drives the `on*` methods below with index ranges into the
   * written chunks. The parser turns those ranges into strings, maintains a
   * stack of open elements, and forwards higher-level events to the
   * user-supplied handler object `cbs`.
   */
  var Parser = /** @class */ (function () {
    /**
     * @param cbs Handler object; every callback on it is optional. `null`/`undefined` means "no callbacks".
     * @param options Parser options. Options read by this class: `xmlMode`, `lowerCaseTags`,
     *     `lowerCaseAttributeNames`, `recognizeSelfClosing`, `recognizeCDATA`, `Tokenizer`.
     */
    function Parser(cbs, options) {
      if (options === void 0) {
        options = {};
      }
      this.options = options;
      /** The start index of the last event. */
      this.startIndex = 0;
      /** The end index of the last event. */
      this.endIndex = 0;
      /**
       * Store the start index of the current open tag,
       * so we can update the start index for attributes.
       */
      this.openTagStart = 0;
      this.tagname = '';
      this.attribname = '';
      this.attribvalue = '';
      /** Attribute map of the tag being opened; only allocated when `cbs.onopentag` exists. */
      this.attribs = null;
      /** Names of the currently open elements, innermost last. */
      this.stack = [];
      /** `true` entries come from `math`/`svg`, `false` entries from HTML integration elements. */
      this.foreignContext = [];
      /** Chunks that may still be sliced by `getSlice`. */
      this.buffers = [];
      /** Absolute index of the first character of `buffers[0]`. */
      this.bufferOffset = 0;
      /** The index of the last written buffer. Used when resuming after a `pause()`. */
      this.writeIndex = 0;
      /** Indicates whether the parser has finished running / `.end` has been called. */
      this.ended = false;
      this.cbs = cbs !== null && cbs !== void 0 ? cbs : {};

      // Both lower-casing switches default to "on unless xmlMode".
      var lowerTags = options.lowerCaseTags;
      this.lowerCaseTagNames = lowerTags !== null && lowerTags !== void 0 ? lowerTags : !options.xmlMode;
      var lowerAttribs = options.lowerCaseAttributeNames;
      this.lowerCaseAttributeNames = lowerAttribs !== null && lowerAttribs !== void 0 ? lowerAttribs : !options.xmlMode;

      var customTokenizer = options.Tokenizer;
      var TokenizerImpl = customTokenizer !== null && customTokenizer !== void 0 ? customTokenizer : Tokenizer_js_1.default;
      this.tokenizer = new TokenizerImpl(this.options, this);

      if (this.cbs.onparserinit != null) {
        this.cbs.onparserinit(this);
      }
    }

    // Tokenizer event handlers

    /** @internal Emits the text in `[start, endIndex)`. */
    Parser.prototype.ontext = function (start, endIndex) {
      var data = this.getSlice(start, endIndex);
      this.endIndex = endIndex - 1;
      if (this.cbs.ontext != null) {
        this.cbs.ontext(data);
      }
      this.startIndex = endIndex;
    };

    /** @internal Emits a decoded entity (given as a code point) as text. */
    Parser.prototype.ontextentity = function (cp) {
      /*
       * Entities can be emitted on the character, or directly after.
       * We use the section start here to get accurate indices.
       */
      var sectionStart = this.tokenizer.getSectionStart();
      this.endIndex = sectionStart - 1;
      if (this.cbs.ontext != null) {
        var fromCodePoint = decode_js_1.fromCodePoint;
        this.cbs.ontext(fromCodePoint(cp));
      }
      this.startIndex = sectionStart;
    };

    /** Whether `name` is a void element; always `false` in `xmlMode`. */
    Parser.prototype.isVoidElement = function (name) {
      return !this.options.xmlMode && voidElements.has(name);
    };

    /** @internal Start-tag name seen at `[start, endIndex)`. */
    Parser.prototype.onopentagname = function (start, endIndex) {
      this.endIndex = endIndex;
      var name = this.getSlice(start, endIndex);
      if (this.lowerCaseTagNames) {
        name = name.toLowerCase();
      }
      this.emitOpenTag(name);
    };

    /**
     * Begins an open tag: reports implied closes, pushes `name` on the stack
     * (unless void), and fires `onopentagname`.
     */
    Parser.prototype.emitOpenTag = function (name) {
      this.openTagStart = this.startIndex;
      this.tagname = name;

      var impliesClose = !this.options.xmlMode && openImpliesClose.get(name);
      if (impliesClose) {
        // Pop every element at the top of the stack that this tag implicitly closes.
        while (this.stack.length > 0 && impliesClose.has(this.stack[this.stack.length - 1])) {
          var closed = this.stack.pop();
          if (this.cbs.onclosetag != null) {
            this.cbs.onclosetag(closed, true);
          }
        }
      }

      if (!this.isVoidElement(name)) {
        this.stack.push(name);
        if (foreignContextElements.has(name)) {
          this.foreignContext.push(true);
        } else if (htmlIntegrationElements.has(name)) {
          this.foreignContext.push(false);
        }
      }

      if (this.cbs.onopentagname != null) {
        this.cbs.onopentagname(name);
      }
      // Attributes are only collected when somebody listens for `onopentag`.
      if (this.cbs.onopentag) {
        this.attribs = {};
      }
    };

    /**
     * Finishes the current open tag: fires `onopentag` with the collected
     * attributes and immediately closes void elements.
     *
     * @param isImplied Passed through as the third `onopentag` argument.
     */
    Parser.prototype.endOpenTag = function (isImplied) {
      this.startIndex = this.openTagStart;
      if (this.attribs) {
        if (this.cbs.onopentag != null) {
          this.cbs.onopentag(this.tagname, this.attribs, isImplied);
        }
        this.attribs = null;
      }
      if (this.cbs.onclosetag && this.isVoidElement(this.tagname)) {
        this.cbs.onclosetag(this.tagname, true);
      }
      this.tagname = '';
    };

    /** @internal End of a start tag (`>`). */
    Parser.prototype.onopentagend = function (endIndex) {
      this.endIndex = endIndex;
      this.endOpenTag(false);
      // Set `startIndex` for next node
      this.startIndex = endIndex + 1;
    };

    /** @internal End tag whose name is at `[start, endIndex)`. */
    Parser.prototype.onclosetag = function (start, endIndex) {
      this.endIndex = endIndex;
      var name = this.getSlice(start, endIndex);
      if (this.lowerCaseTagNames) {
        name = name.toLowerCase();
      }
      if (foreignContextElements.has(name) || htmlIntegrationElements.has(name)) {
        this.foreignContext.pop();
      }

      if (!this.isVoidElement(name)) {
        var pos = this.stack.lastIndexOf(name);
        if (pos !== -1) {
          if (this.cbs.onclosetag) {
            // Close everything above the matching element; only the match itself is explicit.
            var remaining = this.stack.length - pos;
            while (remaining--) {
              // We know the stack has sufficient elements.
              this.cbs.onclosetag(this.stack.pop(), remaining !== 0);
            }
          } else {
            this.stack.length = pos;
          }
        } else if (!this.options.xmlMode && name === 'p') {
          // Implicit open before close
          this.emitOpenTag('p');
          this.closeCurrentTag(true);
        }
      } else if (!this.options.xmlMode && name === 'br') {
        // We can't use `emitOpenTag` for implicit open, as `br` would be implicitly closed.
        if (this.cbs.onopentagname != null) {
          this.cbs.onopentagname('br');
        }
        if (this.cbs.onopentag != null) {
          this.cbs.onopentag('br', {}, true);
        }
        if (this.cbs.onclosetag != null) {
          this.cbs.onclosetag('br', false);
        }
      }

      // Set `startIndex` for next node
      this.startIndex = endIndex + 1;
    };

    /** @internal `/>` seen at the end of a start tag. */
    Parser.prototype.onselfclosingtag = function (endIndex) {
      this.endIndex = endIndex;
      var honourSelfClosing =
        this.options.xmlMode || this.options.recognizeSelfClosing || this.foreignContext[this.foreignContext.length - 1];
      if (honourSelfClosing) {
        this.closeCurrentTag(false);
        // Set `startIndex` for next node
        this.startIndex = endIndex + 1;
      } else {
        // Ignore the fact that the tag is self-closing.
        this.onopentagend(endIndex);
      }
    };

    /**
     * Ends the current open tag and, if it is on top of the stack, closes it right away.
     *
     * @param isOpenImplied Whether the opening tag was implied.
     */
    Parser.prototype.closeCurrentTag = function (isOpenImplied) {
      var name = this.tagname;
      this.endOpenTag(isOpenImplied);
      // Self-closing tags will be on the top of the stack
      if (this.stack[this.stack.length - 1] === name) {
        // If the opening tag isn't implied, the closing tag has to be implied.
        if (this.cbs.onclosetag != null) {
          this.cbs.onclosetag(name, !isOpenImplied);
        }
        this.stack.pop();
      }
    };

    /** @internal Attribute name at `[start, endIndex)`. */
    Parser.prototype.onattribname = function (start, endIndex) {
      this.startIndex = start;
      var name = this.getSlice(start, endIndex);
      this.attribname = this.lowerCaseAttributeNames ? name.toLowerCase() : name;
    };

    /** @internal Appends raw attribute value data. */
    Parser.prototype.onattribdata = function (start, endIndex) {
      this.attribvalue += this.getSlice(start, endIndex);
    };

    /** @internal Appends a decoded entity to the attribute value. */
    Parser.prototype.onattribentity = function (cp) {
      var fromCodePoint = decode_js_1.fromCodePoint;
      this.attribvalue += fromCodePoint(cp);
    };

    /**
     * @internal Attribute finished: fires `onattribute` and records the value.
     * The first occurrence of a duplicated attribute name wins in `attribs`.
     */
    Parser.prototype.onattribend = function (quote, endIndex) {
      this.endIndex = endIndex;
      if (this.cbs.onattribute != null) {
        var QuoteType = Tokenizer_js_1.QuoteType;
        // `"` / `'` for quoted values, `undefined` for no value, `null` otherwise.
        var quoteChar;
        if (quote === QuoteType.Double) {
          quoteChar = '"';
        } else if (quote === QuoteType.Single) {
          quoteChar = "'";
        } else if (quote === QuoteType.NoValue) {
          quoteChar = undefined;
        } else {
          quoteChar = null;
        }
        this.cbs.onattribute(this.attribname, this.attribvalue, quoteChar);
      }
      if (this.attribs && !Object.prototype.hasOwnProperty.call(this.attribs, this.attribname)) {
        this.attribs[this.attribname] = this.attribvalue;
      }
      this.attribvalue = '';
    };

    /** Returns the leading name of a declaration / instruction, lower-cased when tag names are. */
    Parser.prototype.getInstructionName = function (value) {
      var nameEnd = value.search(reNameEnd);
      var name = nameEnd < 0 ? value : value.slice(0, nameEnd);
      if (this.lowerCaseTagNames) {
        name = name.toLowerCase();
      }
      return name;
    };

    /** @internal Declaration (`<!...>`); reported through `onprocessinginstruction` with a `!` prefix. */
    Parser.prototype.ondeclaration = function (start, endIndex) {
      this.endIndex = endIndex;
      var value = this.getSlice(start, endIndex);
      if (this.cbs.onprocessinginstruction) {
        var name = this.getInstructionName(value);
        this.cbs.onprocessinginstruction('!' + name, '!' + value);
      }
      // Set `startIndex` for next node
      this.startIndex = endIndex + 1;
    };

    /** @internal Processing instruction (`<?...>`); reported with a `?` prefix. */
    Parser.prototype.onprocessinginstruction = function (start, endIndex) {
      this.endIndex = endIndex;
      var value = this.getSlice(start, endIndex);
      if (this.cbs.onprocessinginstruction) {
        var name = this.getInstructionName(value);
        this.cbs.onprocessinginstruction('?' + name, '?' + value);
      }
      // Set `startIndex` for next node
      this.startIndex = endIndex + 1;
    };

    /** @internal Comment; `offset` trailing characters are excluded from the reported text. */
    Parser.prototype.oncomment = function (start, endIndex, offset) {
      this.endIndex = endIndex;
      if (this.cbs.oncomment != null) {
        this.cbs.oncomment(this.getSlice(start, endIndex - offset));
      }
      if (this.cbs.oncommentend != null) {
        this.cbs.oncommentend();
      }
      // Set `startIndex` for next node
      this.startIndex = endIndex + 1;
    };

    /**
     * @internal CDATA section. Reported as CDATA events in `xmlMode` or with
     * `recognizeCDATA`; otherwise reported as a comment.
     */
    Parser.prototype.oncdata = function (start, endIndex, offset) {
      this.endIndex = endIndex;
      var value = this.getSlice(start, endIndex - offset);
      if (this.options.xmlMode || this.options.recognizeCDATA) {
        if (this.cbs.oncdatastart != null) {
          this.cbs.oncdatastart();
        }
        if (this.cbs.ontext != null) {
          this.cbs.ontext(value);
        }
        if (this.cbs.oncdataend != null) {
          this.cbs.oncdataend();
        }
      } else {
        if (this.cbs.oncomment != null) {
          this.cbs.oncomment('[CDATA[' + value + ']]');
        }
        if (this.cbs.oncommentend != null) {
          this.cbs.oncommentend();
        }
      }
      // Set `startIndex` for next node
      this.startIndex = endIndex + 1;
    };

    /** @internal End of input: reports every still-open element as implicitly closed, then `onend`. */
    Parser.prototype.onend = function () {
      if (this.cbs.onclosetag) {
        // Set the end index for all remaining tags
        this.endIndex = this.startIndex;
        // Innermost element first; the stack itself is left untouched.
        for (var i = this.stack.length - 1; i >= 0; i--) {
          this.cbs.onclosetag(this.stack[i], true);
        }
      }
      if (this.cbs.onend != null) {
        this.cbs.onend();
      }
    };

    /**
     * Resets the parser to a blank state, ready to parse a new HTML document
     */
    Parser.prototype.reset = function () {
      if (this.cbs.onreset != null) {
        this.cbs.onreset();
      }
      this.tokenizer.reset();
      this.tagname = '';
      this.attribname = '';
      // A half-read attribute value must not leak into the next document.
      this.attribvalue = '';
      this.attribs = null;
      this.stack.length = 0;
      // Stale entries here would make `onselfclosingtag` honour `/>` in the next document.
      this.foreignContext.length = 0;
      this.openTagStart = 0;
      this.startIndex = 0;
      this.endIndex = 0;
      if (this.cbs.onparserinit != null) {
        this.cbs.onparserinit(this);
      }
      this.buffers.length = 0;
      this.bufferOffset = 0;
      this.writeIndex = 0;
      this.ended = false;
    };

    /**
     * Resets the parser, then parses a complete document and
     * pushes it to the handler.
     *
     * @param data Document to parse.
     */
    Parser.prototype.parseComplete = function (data) {
      this.reset();
      this.end(data);
    };

    /**
     * Returns the text in the absolute index range `[start, end)`, which may
     * span several written chunks. Chunks that lie entirely before `start`,
     * or that are fully consumed while reading, are dropped from `buffers`.
     */
    Parser.prototype.getSlice = function (start, end) {
      // Discard chunks that end before the requested range begins.
      while (start - this.bufferOffset >= this.buffers[0].length) {
        this.shiftBuffer();
      }
      var text = this.buffers[0].slice(start - this.bufferOffset, end - this.bufferOffset);
      // Keep appending from following chunks while the range runs past the current one.
      while (end - this.bufferOffset > this.buffers[0].length) {
        this.shiftBuffer();
        text += this.buffers[0].slice(0, end - this.bufferOffset);
      }
      return text;
    };

    /** Drops the oldest chunk, keeping `bufferOffset` and `writeIndex` consistent. */
    Parser.prototype.shiftBuffer = function () {
      this.bufferOffset += this.buffers[0].length;
      this.writeIndex--;
      this.buffers.shift();
    };

    /**
     * Parses a chunk of data and calls the corresponding callbacks.
     * While the tokenizer is paused the chunk is only queued; `resume` feeds it later.
     *
     * @param chunk Chunk to parse.
     */
    Parser.prototype.write = function (chunk) {
      if (this.ended) {
        if (this.cbs.onerror != null) {
          this.cbs.onerror(new Error('.write() after done!'));
        }
        return;
      }
      this.buffers.push(chunk);
      if (this.tokenizer.running) {
        this.tokenizer.write(chunk);
        this.writeIndex++;
      }
    };

    /**
     * Parses the end of the buffer and clears the stack, calls onend.
     *
     * @param chunk Optional final chunk to parse.
     */
    Parser.prototype.end = function (chunk) {
      if (this.ended) {
        if (this.cbs.onerror != null) {
          this.cbs.onerror(new Error('.end() after done!'));
        }
        return;
      }
      if (chunk) {
        this.write(chunk);
      }
      this.ended = true;
      this.tokenizer.end();
    };

    /**
     * Pauses parsing. The parser won't emit events until `resume` is called.
     */
    Parser.prototype.pause = function () {
      this.tokenizer.pause();
    };

    /**
     * Resumes parsing after `pause` was called.
     */
    Parser.prototype.resume = function () {
      this.tokenizer.resume();
      // Feed chunks that were queued while paused; stop if a callback pauses again.
      while (this.tokenizer.running && this.writeIndex < this.buffers.length) {
        this.tokenizer.write(this.buffers[this.writeIndex++]);
      }
      if (this.ended) {
        this.tokenizer.end();
      }
    };

    /**
     * Alias of `write`, for backwards compatibility.
     *
     * @param chunk Chunk to parse.
     * @deprecated
     */
    Parser.prototype.parseChunk = function (chunk) {
      this.write(chunk);
    };

    /**
     * Alias of `end`, for backwards compatibility.
     *
     * @param chunk Optional final chunk to parse.
     * @deprecated
     */
    Parser.prototype.done = function (chunk) {
      this.end(chunk);
    };

    return Parser;
  })();
  // The constructor itself is the module's export (CommonJS).
  module.exports = Parser;