// Lexer.ts

import { _Tokenizer } from './Tokenizer.ts';
import { _defaults } from './defaults.ts';
import { block, inline } from './rules.ts';
import type { Token, TokensList, Tokens } from './Tokens.ts';
import type { MarkedOptions, TokenizerExtension } from './MarkedOptions.ts';
import type { Rules } from './rules.ts';
/**
 * smartypants text replacement
 */
function smartypants(text: string) {
  return text
    // em-dashes
    .replace(/---/g, '\u2014')
    // en-dashes
    .replace(/--/g, '\u2013')
    // opening singles
    .replace(/(^|[-\u2014/(\[{"\s])'/g, '$1\u2018')
    // closing singles & apostrophes
    .replace(/'/g, '\u2019')
    // opening doubles
    .replace(/(^|[-\u2014/(\[{\u2018\s])"/g, '$1\u201c')
    // closing doubles
    .replace(/"/g, '\u201d')
    // ellipses
    .replace(/\.{3}/g, '\u2026');
}
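
// Illustrative behavior: smartypants('"Hi" -- ok...') returns
// '\u201cHi\u201d \u2013 ok\u2026', i.e. curly quotes, an en-dash and an ellipsis.
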
/**
 * mangle email addresses
 */
function mangle(text: string) {
  let out = '';
  for (let i = 0; i < text.length; i++) {
    const ch = Math.random() > 0.5
      ? 'x' + text.charCodeAt(i).toString(16)
      : text.charCodeAt(i).toString();
    out += '&#' + ch + ';';
  }
  return out;
}
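
// Illustrative behavior: mangle('a') returns either '&#97;' (decimal) or
// '&#x61;' (hex), chosen per character by Math.random.
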
/**
 * Block Lexer
 */
export class _Lexer {
  tokens: TokensList;
  options: MarkedOptions;
  state: {
    inLink: boolean;
    inRawBlock: boolean;
    top: boolean;
  };

  private tokenizer: _Tokenizer;
  private inlineQueue: { src: string, tokens: Token[] }[];

  constructor(options?: MarkedOptions) {
    // TokenList cannot be created in one go
    // @ts-expect-error
    this.tokens = [];
    this.tokens.links = Object.create(null);
    this.options = options || _defaults;
    this.options.tokenizer = this.options.tokenizer || new _Tokenizer();
    this.tokenizer = this.options.tokenizer;
    this.tokenizer.options = this.options;
    this.tokenizer.lexer = this;
    this.inlineQueue = [];
    this.state = {
      inLink: false,
      inRawBlock: false,
      top: true
    };

    const rules = {
      block: block.normal,
      inline: inline.normal
    };
    if (this.options.pedantic) {
      rules.block = block.pedantic;
      rules.inline = inline.pedantic;
    } else if (this.options.gfm) {
      rules.block = block.gfm;
      if (this.options.breaks) {
        rules.inline = inline.breaks;
      } else {
        rules.inline = inline.gfm;
      }
    }
    this.tokenizer.rules = rules;
  }
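
  // Rule selection above: pedantic takes precedence over gfm, and under gfm
  // the `breaks` option swaps inline.gfm for inline.breaks; e.g.
  // new _Lexer({ gfm: true, breaks: true }) lexes with block.gfm + inline.breaks.
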
  /**
   * Expose Rules
   */
  static get rules(): Rules {
    return {
      block,
      inline
    };
  }

  /**
   * Static Lex Method
   */
  static lex(src: string, options?: MarkedOptions) {
    const lexer = new _Lexer(options);
    return lexer.lex(src);
  }
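
  // Usage sketch:
  //   const tokens = _Lexer.lex('# Hello **world**\n', { gfm: true });
  //   // tokens[0].type === 'heading'; tokens.links maps reflink labels
  //   // to their { href, title } definitions collected in the block pass.
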
  /**
   * Static Lex Inline Method
   */
  static lexInline(src: string, options?: MarkedOptions) {
    const lexer = new _Lexer(options);
    return lexer.inlineTokens(src);
  }
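
  // Note: lexInline runs on a fresh lexer, so tokens.links is empty and
  // reference-style links in src cannot resolve to a definition.
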
  /**
   * Preprocessing
   */
  lex(src: string) {
    src = src
      .replace(/\r\n|\r/g, '\n');

    this.blockTokens(src, this.tokens);

    let next;
    while (next = this.inlineQueue.shift()) {
      this.inlineTokens(next.src, next.tokens);
    }

    return this.tokens;
  }
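
  // lex() is two-phased: blockTokens() builds the block-level tree and
  // queues raw inline text, then the queue is drained through inlineTokens()
  // so that reflink definitions collected anywhere in the document are
  // available before any inline link is resolved.
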
  /**
   * Lexing
   */
  blockTokens(src: string, tokens?: Token[]): Token[];
  blockTokens(src: string, tokens?: TokensList): TokensList;
  blockTokens(src: string, tokens: Token[] = []) {
    if (this.options.pedantic) {
      src = src.replace(/\t/g, '    ').replace(/^ +$/gm, '');
    } else {
      src = src.replace(/^( *)(\t+)/gm, (_, leading, tabs) => {
        return leading + '    '.repeat(tabs.length);
      });
    }

    let token: Tokens.Generic | undefined;
    let lastToken;
    let cutSrc;
    let lastParagraphClipped;

    while (src) {
      if (this.options.extensions
        && this.options.extensions.block
        && this.options.extensions.block.some((extTokenizer: TokenizerExtension['tokenizer']) => {
          if (token = extTokenizer.call({ lexer: this }, src, tokens)) {
            src = src.substring(token.raw.length);
            tokens.push(token);
            return true;
          }
          return false;
        })) {
        continue;
      }

      // newline
      if (token = this.tokenizer.space(src)) {
        src = src.substring(token.raw.length);
        if (token.raw.length === 1 && tokens.length > 0) {
          // if there's a single \n as a spacer, it's terminating the last line,
          // so move it there so that we don't get unnecessary paragraph tags
          tokens[tokens.length - 1].raw += '\n';
        } else {
          tokens.push(token);
        }
        continue;
      }

      // code
      if (token = this.tokenizer.code(src)) {
        src = src.substring(token.raw.length);
        lastToken = tokens[tokens.length - 1];
        // An indented code block cannot interrupt a paragraph.
        if (lastToken && (lastToken.type === 'paragraph' || lastToken.type === 'text')) {
          lastToken.raw += '\n' + token.raw;
          lastToken.text += '\n' + token.text;
          this.inlineQueue[this.inlineQueue.length - 1].src = lastToken.text;
        } else {
          tokens.push(token);
        }
        continue;
      }

      // fences
      if (token = this.tokenizer.fences(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // heading
      if (token = this.tokenizer.heading(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // hr
      if (token = this.tokenizer.hr(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // blockquote
      if (token = this.tokenizer.blockquote(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // list
      if (token = this.tokenizer.list(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // html
      if (token = this.tokenizer.html(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // def
      if (token = this.tokenizer.def(src)) {
        src = src.substring(token.raw.length);
        lastToken = tokens[tokens.length - 1];
        if (lastToken && (lastToken.type === 'paragraph' || lastToken.type === 'text')) {
          lastToken.raw += '\n' + token.raw;
          lastToken.text += '\n' + token.raw;
          this.inlineQueue[this.inlineQueue.length - 1].src = lastToken.text;
        } else if (!this.tokens.links[token.tag]) {
          this.tokens.links[token.tag] = {
            href: token.href,
            title: token.title
          };
        }
        continue;
      }

      // table (gfm)
      if (token = this.tokenizer.table(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // lheading
      if (token = this.tokenizer.lheading(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // top-level paragraph
      // prevent paragraph consuming extensions by clipping 'src' to extension start
      cutSrc = src;
      if (this.options.extensions && this.options.extensions.startBlock) {
        let startIndex = Infinity;
        const tempSrc = src.slice(1);
        let tempStart;
        this.options.extensions.startBlock.forEach((getStartIndex) => {
          tempStart = getStartIndex.call({ lexer: this }, tempSrc);
          if (typeof tempStart === 'number' && tempStart >= 0) { startIndex = Math.min(startIndex, tempStart); }
        });
        if (startIndex < Infinity && startIndex >= 0) {
          cutSrc = src.substring(0, startIndex + 1);
        }
      }
      if (this.state.top && (token = this.tokenizer.paragraph(cutSrc))) {
        lastToken = tokens[tokens.length - 1];
        if (lastParagraphClipped && lastToken.type === 'paragraph') {
          lastToken.raw += '\n' + token.raw;
          lastToken.text += '\n' + token.text;
          this.inlineQueue.pop();
          this.inlineQueue[this.inlineQueue.length - 1].src = lastToken.text;
        } else {
          tokens.push(token);
        }
        lastParagraphClipped = (cutSrc.length !== src.length);
        src = src.substring(token.raw.length);
        continue;
      }

      // text
      if (token = this.tokenizer.text(src)) {
        src = src.substring(token.raw.length);
        lastToken = tokens[tokens.length - 1];
        if (lastToken && lastToken.type === 'text') {
          lastToken.raw += '\n' + token.raw;
          lastToken.text += '\n' + token.text;
          this.inlineQueue.pop();
          this.inlineQueue[this.inlineQueue.length - 1].src = lastToken.text;
        } else {
          tokens.push(token);
        }
        continue;
      }

      if (src) {
        const errMsg = 'Infinite loop on byte: ' + src.charCodeAt(0);
        if (this.options.silent) {
          console.error(errMsg);
          break;
        } else {
          throw new Error(errMsg);
        }
      }
    }

    this.state.top = true;
    return tokens;
  }
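
  // Block-extension contract sketch (hypothetical 'underline' extension,
  // for illustration only): an extension tokenizer receives the remaining
  // src and must return a token whose `raw` covers exactly the text it
  // consumed, or a falsy value to let the next tokenizer try:
  //
  //   const underline: TokenizerExtension = {
  //     name: 'underline',
  //     level: 'block',
  //     tokenizer(src) {
  //       const match = /^:([^\n]+):(?:\n+|$)/.exec(src);
  //       if (match) {
  //         return { type: 'underline', raw: match[0], text: match[1] };
  //       }
  //     }
  //   };
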
  inline(src: string, tokens: Token[] = []) {
    this.inlineQueue.push({ src, tokens });
    return tokens;
  }

  /**
   * Lexing/Compiling
   */
  inlineTokens(src: string, tokens: Token[] = []): Token[] {
    let token, lastToken, cutSrc;

    // String with links masked to avoid interference with em and strong
    let maskedSrc = src;
    let match;
    let keepPrevChar, prevChar;

    // Mask out reflinks
    if (this.tokens.links) {
      const links = Object.keys(this.tokens.links);
      if (links.length > 0) {
        while ((match = this.tokenizer.rules.inline.reflinkSearch.exec(maskedSrc)) != null) {
          if (links.includes(match[0].slice(match[0].lastIndexOf('[') + 1, -1))) {
            maskedSrc = maskedSrc.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.reflinkSearch.lastIndex);
          }
        }
      }
    }
    // Mask out other blocks
    while ((match = this.tokenizer.rules.inline.blockSkip.exec(maskedSrc)) != null) {
      maskedSrc = maskedSrc.slice(0, match.index) + '[' + 'a'.repeat(match[0].length - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.blockSkip.lastIndex);
    }
    // Mask out escaped characters
    while ((match = this.tokenizer.rules.inline.anyPunctuation.exec(maskedSrc)) != null) {
      maskedSrc = maskedSrc.slice(0, match.index) + '++' + maskedSrc.slice(this.tokenizer.rules.inline.anyPunctuation.lastIndex);
    }
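
    // Each mask is length-preserving ('[' + 'a'.repeat(n - 2) + ']' and
    // '++' replace spans of identical length), so indices computed against
    // maskedSrc by emStrong line up with positions in src.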

    while (src) {
      if (!keepPrevChar) {
        prevChar = '';
      }
      keepPrevChar = false;

      // extensions
      if (this.options.extensions
        && this.options.extensions.inline
        && this.options.extensions.inline.some((extTokenizer) => {
          if (token = extTokenizer.call({ lexer: this }, src, tokens)) {
            src = src.substring(token.raw.length);
            tokens.push(token);
            return true;
          }
          return false;
        })) {
        continue;
      }

      // escape
      if (token = this.tokenizer.escape(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // tag
      if (token = this.tokenizer.tag(src)) {
        src = src.substring(token.raw.length);
        lastToken = tokens[tokens.length - 1];
        if (lastToken && token.type === 'text' && lastToken.type === 'text') {
          lastToken.raw += token.raw;
          lastToken.text += token.text;
        } else {
          tokens.push(token);
        }
        continue;
      }

      // link
      if (token = this.tokenizer.link(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // reflink, nolink
      if (token = this.tokenizer.reflink(src, this.tokens.links)) {
        src = src.substring(token.raw.length);
        lastToken = tokens[tokens.length - 1];
        if (lastToken && token.type === 'text' && lastToken.type === 'text') {
          lastToken.raw += token.raw;
          lastToken.text += token.text;
        } else {
          tokens.push(token);
        }
        continue;
      }

      // em & strong
      if (token = this.tokenizer.emStrong(src, maskedSrc, prevChar)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // code
      if (token = this.tokenizer.codespan(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // br
      if (token = this.tokenizer.br(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // del (gfm)
      if (token = this.tokenizer.del(src)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // autolink
      if (token = this.tokenizer.autolink(src, mangle)) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // url (gfm)
      if (!this.state.inLink && (token = this.tokenizer.url(src, mangle))) {
        src = src.substring(token.raw.length);
        tokens.push(token);
        continue;
      }

      // text
      // prevent inlineText consuming extensions by clipping 'src' to extension start
      cutSrc = src;
      if (this.options.extensions && this.options.extensions.startInline) {
        let startIndex = Infinity;
        const tempSrc = src.slice(1);
        let tempStart;
        this.options.extensions.startInline.forEach((getStartIndex) => {
          tempStart = getStartIndex.call({ lexer: this }, tempSrc);
          if (typeof tempStart === 'number' && tempStart >= 0) { startIndex = Math.min(startIndex, tempStart); }
        });
        if (startIndex < Infinity && startIndex >= 0) {
          cutSrc = src.substring(0, startIndex + 1);
        }
      }
      if (token = this.tokenizer.inlineText(cutSrc, smartypants)) {
        src = src.substring(token.raw.length);
        if (token.raw.slice(-1) !== '_') { // Track prevChar before string of ____ started
          prevChar = token.raw.slice(-1);
        }
        keepPrevChar = true;
        lastToken = tokens[tokens.length - 1];
        if (lastToken && lastToken.type === 'text') {
          lastToken.raw += token.raw;
          lastToken.text += token.text;
        } else {
          tokens.push(token);
        }
        continue;
      }

      if (src) {
        const errMsg = 'Infinite loop on byte: ' + src.charCodeAt(0);
        if (this.options.silent) {
          console.error(errMsg);
          break;
        } else {
          throw new Error(errMsg);
        }
      }
    }

    return tokens;
  }
}
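
// Usage sketch (illustrative, not part of the module):
//
//   import { _Lexer } from './Lexer.ts';
//
//   const tokens = _Lexer.lex('# Title\n\nsome *emphasis*\n');
//   // -> a heading token followed by a paragraph token whose `tokens`
//   //    array holds the inline (text/em) tokens from the inline pass.
//
//   const inline = _Lexer.lexInline('plain *em* `code`');
//   // -> [ text, em, text, codespan ] tokens.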