// decode.js (7.7 KB)
  'use strict';
  /**
   * Interop helper for default imports.
   *
   * Reuses `this.__importDefault` when `this` is truthy and already carries one;
   * otherwise installs the fallback below.
   *
   * @param mod The value returned by `require`.
   * @returns `mod` itself when it is truthy and has a truthy `__esModule`
   *   property; otherwise a wrapper object `{ default: mod }`.
   */
  var __importDefault =
    (this && this.__importDefault) ||
    function (mod) {
      return mod && mod.__esModule
        ? mod
        : {
            default: mod
          };
    };
  // Flag the exports object with `__esModule` (the property `__importDefault` checks for).
  Object.defineProperty(exports, '__esModule', {
    value: true
  });
  // Pre-declare every export as `undefined`; the real values are assigned
  // further down in this file.
  exports.decodeXML =
    exports.decodeHTMLStrict =
    exports.decodeHTML =
    exports.determineBranch =
    exports.BinTrieFlags =
    exports.fromCodePoint =
    exports.replaceCodePoint =
    exports.decodeCodePoint =
    exports.xmlDecodeTree =
    exports.htmlDecodeTree =
      void 0;
  // Decode trees for HTML and XML entities, re-exported as-is.
  var decode_data_html_js_1 = __importDefault(require('./generated/decode-data-html.js'));
  exports.htmlDecodeTree = decode_data_html_js_1.default;
  var decode_data_xml_js_1 = __importDefault(require('./generated/decode-data-xml.js'));
  exports.xmlDecodeTree = decode_data_xml_js_1.default;
  // Code point helper: the default export is used for numeric entities below.
  var decode_codepoint_js_1 = __importDefault(require('./decode_codepoint.js'));
  exports.decodeCodePoint = decode_codepoint_js_1.default;
  // The named helpers are re-exported through getters, so each read goes
  // through to the current value on the required module.
  var decode_codepoint_js_2 = require('./decode_codepoint.js');
  Object.defineProperty(exports, 'replaceCodePoint', {
    enumerable: true,
    get: function () {
      return decode_codepoint_js_2.replaceCodePoint;
    }
  });
  Object.defineProperty(exports, 'fromCodePoint', {
    enumerable: true,
    get: function () {
      return decode_codepoint_js_2.fromCodePoint;
    }
  });
  /**
   * Character codes used while scanning entities.
   *
   * The object maps in both directions: each name maps to its numeric code and
   * each numeric code maps back to its name.
   */
  var CharCodes;
  (function (CharCodes) {
    CharCodes[(CharCodes['NUM'] = 35)] = 'NUM'; // "#"
    CharCodes[(CharCodes['SEMI'] = 59)] = 'SEMI'; // ";"
    CharCodes[(CharCodes['ZERO'] = 48)] = 'ZERO'; // "0"
    CharCodes[(CharCodes['NINE'] = 57)] = 'NINE'; // "9"
    CharCodes[(CharCodes['LOWER_A'] = 97)] = 'LOWER_A'; // "a"
    CharCodes[(CharCodes['LOWER_F'] = 102)] = 'LOWER_F'; // "f"
    CharCodes[(CharCodes['LOWER_X'] = 120)] = 'LOWER_X'; // "x"
    /** Bit that needs to be set to convert an upper case ASCII character to lower case */
    CharCodes[(CharCodes['To_LOWER_BIT'] = 32)] = 'To_LOWER_BIT';
  })(CharCodes || (CharCodes = {}));
  /**
   * Bit masks applied to decode-tree entries. Like `CharCodes`, the object maps
   * names to values and values back to names. It is also published as
   * `exports.BinTrieFlags`.
   */
  var BinTrieFlags;
  (function (BinTrieFlags) {
    // 0xC000: the two highest bits of a 16-bit entry; read as `(entry & mask) >> 14`.
    BinTrieFlags[(BinTrieFlags['VALUE_LENGTH'] = 49152)] = 'VALUE_LENGTH';
    // 0x3F80: bits 7 through 13; read as `(entry & mask) >> 7` to get the branch count.
    BinTrieFlags[(BinTrieFlags['BRANCH_LENGTH'] = 16256)] = 'BRANCH_LENGTH';
    // 0x007F: the low seven bits; read as the jump offset.
    BinTrieFlags[(BinTrieFlags['JUMP_TABLE'] = 127)] = 'JUMP_TABLE';
  })((BinTrieFlags = exports.BinTrieFlags || (exports.BinTrieFlags = {})));
  /**
   * Builds an entity decoder bound to one decode tree.
   *
   * @param decodeTree Indexable sequence of numbers describing the entity trie.
   *   It is only read by index here and passed on to `determineBranch`.
   * @returns A function `decodeHTMLBinary(str, strict)` that returns `str` with
   *   the entities it recognises replaced. When `strict` is truthy, an entity is
   *   only replaced if it is terminated by ";"; otherwise it is left untouched.
   */
  function getDecoder(decodeTree) {
    return function decodeHTMLBinary(str, strict) {
      var ret = '';
      // Index of the first character of `str` that has not been copied to `ret` yet.
      var lastIdx = 0;
      var strIdx = 0;
      while ((strIdx = str.indexOf('&', strIdx)) >= 0) {
        ret += str.slice(lastIdx, strIdx);
        // Keep pointing at the "&" so the raw text is preserved if nothing matches.
        lastIdx = strIdx;
        // Skip the "&"
        strIdx += 1;
        // If we have a numeric entity, handle this separately.
        if (str.charCodeAt(strIdx) === CharCodes.NUM) {
          // Skip the leading "&#". For hex entities, also skip the leading "x".
          var start = strIdx + 1;
          var base = 10;
          var cp = str.charCodeAt(start);
          if ((cp | CharCodes.To_LOWER_BIT) === CharCodes.LOWER_X) {
            base = 16;
            strIdx += 1;
            start += 1;
          }
          // Advance to the first character that is not a digit of the chosen base.
          do {
            cp = str.charCodeAt(++strIdx);
          } while (
            (cp >= CharCodes.ZERO && cp <= CharCodes.NINE) ||
            (base === 16 &&
              (cp | CharCodes.To_LOWER_BIT) >= CharCodes.LOWER_A &&
              (cp | CharCodes.To_LOWER_BIT) <= CharCodes.LOWER_F)
          );
          // Only decode when at least one digit was consumed.
          if (start !== strIdx) {
            var entity = str.substring(start, strIdx);
            var parsed = parseInt(entity, base);
            if (str.charCodeAt(strIdx) === CharCodes.SEMI) {
              strIdx += 1;
            } else if (strict) {
              // No terminating ";" in strict mode: leave the text as it is.
              continue;
            }
            ret += (0, decode_codepoint_js_1.default)(parsed);
            lastIdx = strIdx;
          }
          continue;
        }
        // Named entity: walk the trie one character at a time.
        // `resultIdx` is the tree index of the last accepted match (0 = none).
        var resultIdx = 0;
        // `excess` counts how far `strIdx` has moved past that match.
        var excess = 1;
        var treeIdx = 0;
        var current = decodeTree[treeIdx];
        for (; strIdx < str.length; strIdx++, excess++) {
          treeIdx = determineBranch(decodeTree, current, treeIdx + 1, str.charCodeAt(strIdx));
          if (treeIdx < 0) {
            break;
          }
          current = decodeTree[treeIdx];
          var masked = current & BinTrieFlags.VALUE_LENGTH;
          // If the branch is a value, store it and continue
          if (masked) {
            // If we have a legacy entity while parsing strictly, just skip the number of bytes
            if (!strict || str.charCodeAt(strIdx) === CharCodes.SEMI) {
              resultIdx = treeIdx;
              excess = 0;
            }
            // The mask is the number of bytes of the value, including the current byte.
            var valueLength = (masked >> 14) - 1;
            if (valueLength === 0) {
              break;
            }
            treeIdx += valueLength;
          }
        }
        if (resultIdx !== 0) {
          // 1: the value is stored in the entry itself (flag bits masked off);
          // 2: the value is the next entry; otherwise: the next two entries.
          var resultLength = (decodeTree[resultIdx] & BinTrieFlags.VALUE_LENGTH) >> 14;
          ret +=
            resultLength === 1
              ? String.fromCharCode(decodeTree[resultIdx] & ~BinTrieFlags.VALUE_LENGTH)
              : resultLength === 2
                ? String.fromCharCode(decodeTree[resultIdx + 1])
                : String.fromCharCode(decodeTree[resultIdx + 1], decodeTree[resultIdx + 2]);
          // Resume copying right after the last character of the accepted match.
          lastIdx = strIdx - excess + 1;
        }
      }
      return ret + str.slice(lastIdx);
    };
  }
  /**
   * Finds the tree index reached from a node by following one character.
   *
   * @param decodeTree Indexable sequence of numbers describing the trie.
   * @param current The entry of the node being left; its `BRANCH_LENGTH` bits
   *   give the branch count and its `JUMP_TABLE` bits give the jump offset.
   * @param nodeIdx Index of the first entry that follows the node's own data.
   * @param char Character code to follow.
   * @returns The index of the next node, or -1 when `char` has no branch.
   */
  function determineBranch(decodeTree, current, nodeIdx, char) {
    var branchCount = (current & BinTrieFlags.BRANCH_LENGTH) >> 7;
    var jumpOffset = current & BinTrieFlags.JUMP_TABLE;
    // Case 1: Single branch encoded in jump offset
    if (branchCount === 0) {
      // A zero offset means the node has no branch at all.
      return jumpOffset !== 0 && char === jumpOffset ? nodeIdx : -1;
    }
    // Case 2: Multiple branches encoded in jump table
    if (jumpOffset) {
      // The table covers `branchCount` consecutive codes starting at `jumpOffset`.
      var value = char - jumpOffset;
      // Table entries are returned minus one, so a stored 0 yields -1.
      return value < 0 || value >= branchCount ? -1 : decodeTree[nodeIdx + value] - 1;
    }
    // Case 3: Multiple branches encoded in dictionary
    // Binary search for the character.
    var lo = nodeIdx;
    var hi = lo + branchCount - 1;
    while (lo <= hi) {
      var mid = (lo + hi) >>> 1;
      var midVal = decodeTree[mid];
      if (midVal < char) {
        lo = mid + 1;
      } else if (midVal > char) {
        hi = mid - 1;
      } else {
        // The target for key `mid` is stored `branchCount` entries after it.
        return decodeTree[mid + branchCount];
      }
    }
    return -1;
  }
  exports.determineBranch = determineBranch;
  // One decoder per decode tree; the public decode functions below delegate to these.
  var htmlDecoder = getDecoder(decode_data_html_js_1.default);
  var xmlDecoder = getDecoder(decode_data_xml_js_1.default);
  /**
   * Decodes an HTML string, allowing for entities not terminated by a semi-colon.
   *
   * Delegates to the HTML decoder with `strict` set to `false`.
   *
   * @param str The string to decode.
   * @returns The decoded string.
   */
  function decodeHTML(str) {
    return htmlDecoder(str, false);
  }
  exports.decodeHTML = decodeHTML;
  /**
   * Decodes an HTML string, requiring all entities to be terminated by a semi-colon.
   *
   * Delegates to the HTML decoder with `strict` set to `true`.
   *
   * @param str The string to decode.
   * @returns The decoded string.
   */
  function decodeHTMLStrict(str) {
    return htmlDecoder(str, true);
  }
  exports.decodeHTMLStrict = decodeHTMLStrict;
  /**
   * Decodes an XML string, requiring all entities to be terminated by a semi-colon.
   *
   * Delegates to the XML decoder with `strict` set to `true`.
   *
   * @param str The string to decode.
   * @returns The decoded string.
   */
  function decodeXML(str) {
    return xmlDecoder(str, true);
  }
  exports.decodeXML = decodeXML;
//# sourceMappingURL=decode.js.map