latex.lark 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403
  1. %ignore /[ \t\n\r]+/ // every pattern listed under %ignore is skipped by the lexer instead of being handed to the rules; this first one is plain whitespace
  2. %ignore "\\," | "\\thinspace" | "\\:" | "\\medspace" | "\\;" | "\\thickspace" // positive spacing commands
  3. %ignore "\\quad" | "\\qquad" // wide spacing commands
  4. %ignore "\\!" | "\\negthinspace" | "\\negmedspace" | "\\negthickspace" // negative spacing commands
  5. %ignore "\\vrule" | "\\vcenter" | "\\vbox" | "\\vskip" | "\\vspace" | "\\hfill" // layout commands (rules, boxes, vertical skips, fill)
  6. %ignore "\\*" | "\\-" | "\\." | "\\/" | "\\(" | "\\=" // commands made of a backslash followed by one punctuation character
  7. %ignore "\\left" | "\\right" // delimiter-sizing prefixes
  8. %ignore "\\limits" | "\\nolimits" // limit-placement modifiers
  9. %ignore "\\displaystyle" // style switch
  10. ///////////////////// tokens ///////////////////////
  11. // basic binary operators
  12. ADD: "+"
  13. SUB: "-"
  14. MUL: "*"
  15. DIV: "/"
  16. // tokens with distinct left and right symbols
  17. L_BRACE: "{"
  18. R_BRACE: "}"
  19. L_BRACE_LITERAL: "\\{"
  20. R_BRACE_LITERAL: "\\}"
  21. L_BRACKET: "["
  22. R_BRACKET: "]"
  23. L_CEIL: "\\lceil"
  24. R_CEIL: "\\rceil"
  25. L_FLOOR: "\\lfloor"
  26. R_FLOOR: "\\rfloor"
  27. L_PAREN: "("
  28. R_PAREN: ")"
  29. // limit, integral, sum, and product symbols
  30. FUNC_LIM: "\\lim"
  31. LIM_APPROACH_SYM: "\\to" | "\\rightarrow" | "\\Rightarrow" | "\\longrightarrow" | "\\Longrightarrow"
  32. FUNC_INT: "\\int" | "\\intop"
  33. FUNC_SUM: "\\sum"
  34. FUNC_PROD: "\\prod"
  35. // common functions
  36. FUNC_EXP: "\\exp"
  37. FUNC_LOG: "\\log"
  38. FUNC_LN: "\\ln"
  39. FUNC_LG: "\\lg"
  40. FUNC_MIN: "\\min"
  41. FUNC_MAX: "\\max"
  42. // trigonometric functions
  43. FUNC_SIN: "\\sin"
  44. FUNC_COS: "\\cos"
  45. FUNC_TAN: "\\tan"
  46. FUNC_CSC: "\\csc"
  47. FUNC_SEC: "\\sec"
  48. FUNC_COT: "\\cot"
  49. // inverse trigonometric functions
  50. FUNC_ARCSIN: "\\arcsin"
  51. FUNC_ARCCOS: "\\arccos"
  52. FUNC_ARCTAN: "\\arctan"
  53. FUNC_ARCCSC: "\\arccsc"
  54. FUNC_ARCSEC: "\\arcsec"
  55. FUNC_ARCCOT: "\\arccot"
  56. // hyperbolic trigonometric functions
  57. FUNC_SINH: "\\sinh"
  58. FUNC_COSH: "\\cosh"
  59. FUNC_TANH: "\\tanh"
  60. FUNC_ARSINH: "\\arsinh"
  61. FUNC_ARCOSH: "\\arcosh"
  62. FUNC_ARTANH: "\\artanh"
  63. FUNC_SQRT: "\\sqrt"
  64. // miscellaneous symbols
  65. CMD_TIMES: "\\times"
  66. CMD_CDOT: "\\cdot"
  67. CMD_DIV: "\\div"
  68. CMD_FRAC: "\\frac" | "\\dfrac" | "\\tfrac" | "\\nicefrac"
  69. CMD_BINOM: "\\binom" | "\\dbinom" | "\\tbinom"
  70. CMD_OVERLINE: "\\overline"
  71. CMD_LANGLE: "\\langle"
  72. CMD_RANGLE: "\\rangle"
  73. CMD_MATHIT: "\\mathit"
  74. CMD_INFTY: "\\infty"
  75. BANG: "!"
  76. BAR: "|"
  77. CARET: "^"
  78. COLON: ":"
  79. UNDERSCORE: "_"
  80. // relational symbols
  81. EQUAL: "="
  82. NOT_EQUAL: "\\neq" | "\\ne"
  83. LT: "<"
  84. LTE: "\\leq" | "\\le" | "\\leqslant"
  85. GT: ">"
  86. GTE: "\\geq" | "\\ge" | "\\geqslant"
  87. DIV_SYMBOL: CMD_DIV | DIV
  88. MUL_SYMBOL: MUL | CMD_TIMES | CMD_CDOT
  89. %import .greek_symbols.GREEK_SYMBOL
  90. UPRIGHT_DIFFERENTIAL_SYMBOL: "\\text{d}" | "\\mathrm{d}"
  91. DIFFERENTIAL_SYMBOL: "d" | UPRIGHT_DIFFERENTIAL_SYMBOL
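  92. // We want "d" to be parseable as a differential symbol. NOTE: SYMBOL below does not actually exclude "d" ([a-zA-Z] includes it), so "d" matches both DIFFERENTIAL_SYMBOL and SYMBOL; the overlap has to be resolved outside these terminal definitions.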
  93. SYMBOL: /[a-zA-Z]'*/
  94. GREEK_SYMBOL_WITH_PRIMES: GREEK_SYMBOL "'"*
  95. LATIN_SYMBOL_WITH_LATIN_SUBSCRIPT: /([a-zA-Z]'*)_(([A-Za-z0-9]|[a-zA-Z]+)|\{([A-Za-z0-9]|[a-zA-Z]+'*)\})/
  96. LATIN_SYMBOL_WITH_GREEK_SUBSCRIPT: /([a-zA-Z]'*)_/ GREEK_SYMBOL | /([a-zA-Z]'*)_/ L_BRACE GREEK_SYMBOL_WITH_PRIMES R_BRACE
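  97. // The braced variant above is deliberately written as a separate alternative rather than folded into one pattern such as
  98. // /([a-zA-Z])_/ L_BRACE? GREEK_SYMBOL R_BRACE?, because independently optional braces would also accept unbalanced
  99. // input like r"h_{\theta"; with separate alternatives such input simply fails to match, so we can easily error out on it.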
  100. GREEK_SYMBOL_WITH_LATIN_SUBSCRIPT: GREEK_SYMBOL_WITH_PRIMES /_(([A-Za-z0-9]|[a-zA-Z]+)|\{([A-Za-z0-9]|[a-zA-Z]+'*)\})/
  101. GREEK_SYMBOL_WITH_GREEK_SUBSCRIPT: GREEK_SYMBOL_WITH_PRIMES /_/ (GREEK_SYMBOL | L_BRACE GREEK_SYMBOL_WITH_PRIMES R_BRACE)
  102. MULTI_LETTER_SYMBOL: /[a-zA-Z]+(\s+[a-zA-Z]+)*'*/
  103. %import common.DIGIT -> DIGIT
  104. CMD_PRIME: "\\prime"
  105. CMD_ASTERISK: "\\ast"
  106. PRIMES: "'"+
  107. STARS: "*"+
  108. PRIMES_VIA_CMD: CMD_PRIME+
  109. STARS_VIA_CMD: CMD_ASTERISK+
  110. CMD_IMAGINARY_UNIT: "\\imaginaryunit"
  111. CMD_BEGIN: "\\begin"
  112. CMD_END: "\\end"
  113. // matrices
  114. IGNORE_L: /[ \t\n\r]*/ L_BRACE* /[ \t\n\r]*/
  115. IGNORE_R: /[ \t\n\r]*/ R_BRACE* /[ \t\n\r]*/
  116. ARRAY_MATRIX_BEGIN: L_BRACE "array" R_BRACE L_BRACE /[^}]*/ R_BRACE
  117. ARRAY_MATRIX_END: L_BRACE "array" R_BRACE
  118. AMSMATH_MATRIX: L_BRACE "matrix" R_BRACE
  119. AMSMATH_PMATRIX: L_BRACE "pmatrix" R_BRACE
  120. AMSMATH_BMATRIX: L_BRACE "bmatrix" R_BRACE
  121. // Without the (L|R)_PARENs and (L|R)_BRACKETs, a matrix defined using
  122. // \begin{array}...\end{array} or \begin{matrix}...\end{matrix} must
  123. // not qualify as a complete matrix expression; this is done so that
  124. // if we have \begin{array}...\end{array} or \begin{matrix}...\end{matrix}
  125. // between BAR pairs, then they should be interpreted as determinants as
  126. // opposed to sympy.Abs (absolute value) applied to a matrix.
  127. CMD_BEGIN_AMSPMATRIX_AMSBMATRIX: CMD_BEGIN (AMSMATH_PMATRIX | AMSMATH_BMATRIX)
  128. CMD_BEGIN_ARRAY_AMSMATRIX: (L_PAREN | L_BRACKET) IGNORE_L CMD_BEGIN (ARRAY_MATRIX_BEGIN | AMSMATH_MATRIX)
  129. CMD_MATRIX_BEGIN: CMD_BEGIN_AMSPMATRIX_AMSBMATRIX | CMD_BEGIN_ARRAY_AMSMATRIX
  130. CMD_END_AMSPMATRIX_AMSBMATRIX: CMD_END (AMSMATH_PMATRIX | AMSMATH_BMATRIX)
  131. CMD_END_ARRAY_AMSMATRIX: CMD_END (ARRAY_MATRIX_END | AMSMATH_MATRIX) IGNORE_R "\\right"? (R_PAREN | R_BRACKET)
  132. CMD_MATRIX_END: CMD_END_AMSPMATRIX_AMSBMATRIX | CMD_END_ARRAY_AMSMATRIX
  133. MATRIX_COL_DELIM: "&"
  134. MATRIX_ROW_DELIM: "\\\\"
  135. FUNC_MATRIX_TRACE: "\\trace"
  136. FUNC_MATRIX_ADJUGATE: "\\adjugate"
  137. // determinants
  138. AMSMATH_VMATRIX: L_BRACE "vmatrix" R_BRACE
  139. CMD_DETERMINANT_BEGIN_SIMPLE: CMD_BEGIN AMSMATH_VMATRIX
  140. CMD_DETERMINANT_BEGIN_VARIANT: BAR IGNORE_L CMD_BEGIN (ARRAY_MATRIX_BEGIN | AMSMATH_MATRIX)
  141. CMD_DETERMINANT_BEGIN: CMD_DETERMINANT_BEGIN_SIMPLE | CMD_DETERMINANT_BEGIN_VARIANT
  142. CMD_DETERMINANT_END_SIMPLE: CMD_END AMSMATH_VMATRIX
  143. CMD_DETERMINANT_END_VARIANT: CMD_END (ARRAY_MATRIX_END | AMSMATH_MATRIX) IGNORE_R "\\right"? BAR
  144. CMD_DETERMINANT_END: CMD_DETERMINANT_END_SIMPLE | CMD_DETERMINANT_END_VARIANT
  145. FUNC_DETERMINANT: "\\det"
  146. //////////////////// grammar //////////////////////
  147. latex_string: _relation | _expression
  148. _one_letter_symbol: SYMBOL
  149. | LATIN_SYMBOL_WITH_LATIN_SUBSCRIPT
  150. | LATIN_SYMBOL_WITH_GREEK_SUBSCRIPT
  151. | GREEK_SYMBOL_WITH_LATIN_SUBSCRIPT
  152. | GREEK_SYMBOL_WITH_GREEK_SUBSCRIPT
  153. | GREEK_SYMBOL_WITH_PRIMES
  154. // LuaTeX-generated outputs of \mathit{foo'} and \mathit{foo}'
  155. // seem to be the same on the surface. We allow both styles.
  156. multi_letter_symbol: CMD_MATHIT L_BRACE MULTI_LETTER_SYMBOL R_BRACE
  157. | CMD_MATHIT L_BRACE MULTI_LETTER_SYMBOL R_BRACE /'+/
  158. number: /\d+(\.\d*)?/ | CMD_IMAGINARY_UNIT
  159. _atomic_expr: _one_letter_symbol
  160. | multi_letter_symbol
  161. | number
  162. | CMD_INFTY
  163. group_round_parentheses: L_PAREN _expression R_PAREN
  164. group_square_brackets: L_BRACKET _expression R_BRACKET
  165. group_curly_parentheses: L_BRACE _expression R_BRACE
  166. _relation: eq | ne | lt | lte | gt | gte
  167. eq: _expression EQUAL _expression
  168. ne: _expression NOT_EQUAL _expression
  169. lt: _expression LT _expression
  170. lte: _expression LTE _expression
  171. gt: _expression GT _expression
  172. gte: _expression GTE _expression
  173. _expression_core: _atomic_expr | group_curly_parentheses
  174. add: _expression ADD _expression_mul // binary "+": the left operand is a whole _expression (left-recursive), the right operand is one multiplicative-tier term
  175. | ADD _expression_mul // unary "+": a leading plus sign in front of a multiplicative-tier term
  176. sub: _expression SUB _expression_mul // binary "-": the left operand is a whole _expression (left-recursive), the right operand is one multiplicative-tier term
  177. | SUB _expression_mul // unary "-": a leading minus sign in front of a multiplicative-tier term
  178. mul: _expression_mul MUL_SYMBOL _expression_power
  179. div: _expression_mul DIV_SYMBOL _expression_power
  180. adjacent_expressions: (_one_letter_symbol | number) _expression_mul
  181. | group_round_parentheses (group_round_parentheses | _one_letter_symbol)
  182. | _function _function
  183. | fraction _expression_mul
  184. _expression_func: _expression_core
  185. | group_round_parentheses
  186. | fraction
  187. | binomial
  188. | _function
  189. | _integral// | derivative
  190. | limit
  191. | matrix
  192. _expression_power: _expression_func | superscript | matrix_prime | symbol_prime // tier just above _expression_func: adds superscripts and primed matrices/symbols
  193. _expression_mul: _expression_power // multiplicative tier: anything from _expression_power, plus the alternatives on the next two lines
  194. | mul | div | adjacent_expressions // explicit products/quotients, and operands written side by side with no operator token
  195. | summation | product // \sum and \prod constructs also enter at this tier
  196. _expression: _expression_mul | add | sub // top tier: add/sub are built on top of multiplicative-tier terms, so additive operators bind loosest
  197. _limit_dir: "+" | "-" | L_BRACE ("+" | "-") R_BRACE
  198. limit_dir_expr: _expression CARET _limit_dir
  199. group_curly_parentheses_lim: L_BRACE _expression LIM_APPROACH_SYM (limit_dir_expr | _expression) R_BRACE
  200. limit: FUNC_LIM UNDERSCORE group_curly_parentheses_lim _expression
  201. differential: DIFFERENTIAL_SYMBOL _one_letter_symbol
  202. //_derivative_operator: CMD_FRAC L_BRACE DIFFERENTIAL_SYMBOL R_BRACE L_BRACE differential R_BRACE
  203. //derivative: _derivative_operator _expression
  204. _integral: normal_integral | integral_with_special_fraction
  205. normal_integral: FUNC_INT _expression DIFFERENTIAL_SYMBOL _one_letter_symbol // no bounds: integrand, then differential symbol and integration variable. NOTE(review): this form is also derivable from either alternative below with the optional bounds group omitted
  206. | FUNC_INT (CARET _expression_core UNDERSCORE _expression_core)? _expression? DIFFERENTIAL_SYMBOL _one_letter_symbol // optional bounds written superscript-first; the integrand itself is optional
  207. | FUNC_INT (UNDERSCORE _expression_core CARET _expression_core)? _expression? DIFFERENTIAL_SYMBOL _one_letter_symbol // optional bounds written subscript-first; the integrand itself is optional
  208. group_curly_parentheses_int: L_BRACE _expression? differential R_BRACE
  209. special_fraction: CMD_FRAC group_curly_parentheses_int group_curly_parentheses
  210. integral_with_special_fraction: FUNC_INT special_fraction // integral whose differential sits inside the first braced group of a fraction (see special_fraction). NOTE(review): this form is also derivable from either alternative below with the optional bounds group omitted
  211. | FUNC_INT (CARET _expression_core UNDERSCORE _expression_core)? special_fraction // optional bounds written superscript-first
  212. | FUNC_INT (UNDERSCORE _expression_core CARET _expression_core)? special_fraction // optional bounds written subscript-first
  213. group_curly_parentheses_special: UNDERSCORE L_BRACE _atomic_expr EQUAL _atomic_expr R_BRACE CARET _expression_core // bounds used by summation and product, subscript first: _{<atom> = <atom>} followed by ^<expr>
  214. | CARET _expression_core UNDERSCORE L_BRACE _atomic_expr EQUAL _atomic_expr R_BRACE // the same bounds with the superscript written first
  215. summation: FUNC_SUM group_curly_parentheses_special _expression // \sum, its bounds, then the summand. Both bound orders (subscript-first and superscript-first) are already alternatives of group_curly_parentheses_special, so one alternative suffices here; the former second alternative was an exact duplicate of this one
  217. product: FUNC_PROD group_curly_parentheses_special _expression // \prod, its bounds, then the factor expression. Both bound orders (subscript-first and superscript-first) are already alternatives of group_curly_parentheses_special, so one alternative suffices here; the former second alternative was an exact duplicate of this one
  219. superscript: _expression_func CARET (_expression_power | CMD_PRIME | CMD_ASTERISK)
  220. | _expression_func CARET L_BRACE (PRIMES | STARS | PRIMES_VIA_CMD | STARS_VIA_CMD) R_BRACE
  221. matrix_prime: (matrix | group_round_parentheses) PRIMES
  222. symbol_prime: (LATIN_SYMBOL_WITH_LATIN_SUBSCRIPT
  223. | LATIN_SYMBOL_WITH_GREEK_SUBSCRIPT
  224. | GREEK_SYMBOL_WITH_LATIN_SUBSCRIPT
  225. | GREEK_SYMBOL_WITH_GREEK_SUBSCRIPT) PRIMES
  226. fraction: _basic_fraction // a CMD_FRAC command with two arguments; the three helper rules differ only in which of the arguments are braced
  227. | _simple_fraction
  228. | _general_fraction
  229. _basic_fraction: CMD_FRAC DIGIT (DIGIT | SYMBOL | GREEK_SYMBOL_WITH_PRIMES) // neither argument braced: the first is a single DIGIT, the second a single DIGIT, SYMBOL or Greek symbol token
  230. _simple_fraction: CMD_FRAC DIGIT group_curly_parentheses // only the second argument braced
  231. | CMD_FRAC group_curly_parentheses (DIGIT | SYMBOL | GREEK_SYMBOL_WITH_PRIMES) // only the first argument braced
  232. _general_fraction: CMD_FRAC group_curly_parentheses group_curly_parentheses // both arguments braced
  233. binomial: _basic_binomial // a CMD_BINOM command with two arguments; the three helper rules differ only in which of the arguments are braced
  234. | _simple_binomial
  235. | _general_binomial
  236. _basic_binomial: CMD_BINOM DIGIT (DIGIT | SYMBOL | GREEK_SYMBOL_WITH_PRIMES) // neither argument braced: the first is a single DIGIT, the second a single DIGIT, SYMBOL or Greek symbol token
  237. _simple_binomial: CMD_BINOM DIGIT group_curly_parentheses // only the second argument braced
  238. | CMD_BINOM group_curly_parentheses (DIGIT | SYMBOL | GREEK_SYMBOL_WITH_PRIMES) // only the first argument braced
  239. _general_binomial: CMD_BINOM group_curly_parentheses group_curly_parentheses // both arguments braced
  240. list_of_expressions: _expression ("," _expression)*
  241. function_applied: _one_letter_symbol L_PAREN list_of_expressions R_PAREN
  242. min: FUNC_MIN L_PAREN list_of_expressions R_PAREN
  243. max: FUNC_MAX L_PAREN list_of_expressions R_PAREN
  244. bra: CMD_LANGLE _expression BAR
  245. ket: BAR _expression CMD_RANGLE
  246. inner_product: CMD_LANGLE _expression BAR _expression CMD_RANGLE
  247. _function: function_applied
  248. | abs | floor | ceil
  249. | _trigonometric_function | _inverse_trigonometric_function
  250. | _trigonometric_function_power
  251. | _hyperbolic_trigonometric_function | _inverse_hyperbolic_trigonometric_function
  252. | exponential
  253. | log
  254. | square_root
  255. | factorial
  256. | conjugate
  257. | max | min
  258. | bra | ket | inner_product
  259. | determinant
  260. | trace
  261. | adjugate
  262. exponential: FUNC_EXP _expression
  263. log: FUNC_LOG _expression // \log with no subscript, applied to the expression that follows
  264. | FUNC_LN _expression // \ln applied to the expression that follows
  265. | FUNC_LG _expression // \lg applied to the expression that follows
  266. | FUNC_LOG UNDERSCORE (DIGIT | _one_letter_symbol) _expression // \log with an unbraced subscript: a single DIGIT or one-letter symbol
  267. | FUNC_LOG UNDERSCORE group_curly_parentheses _expression // \log with a braced subscript, which may hold any _expression
  268. square_root: FUNC_SQRT group_curly_parentheses // \sqrt{...}: a single braced argument
  269. | FUNC_SQRT group_square_brackets group_curly_parentheses // \sqrt[...]{...}: a square-bracketed argument placed before the braced one
  270. factorial: _expression_func BANG
  271. conjugate: CMD_OVERLINE group_curly_parentheses
  272. | CMD_OVERLINE DIGIT
  273. _trigonometric_function: sin | cos | tan | csc | sec | cot
  274. sin: FUNC_SIN _expression
  275. cos: FUNC_COS _expression
  276. tan: FUNC_TAN _expression
  277. csc: FUNC_CSC _expression
  278. sec: FUNC_SEC _expression
  279. cot: FUNC_COT _expression
  280. _trigonometric_function_power: sin_power | cos_power | tan_power | csc_power | sec_power | cot_power
  281. sin_power: FUNC_SIN CARET _expression_core _expression
  282. cos_power: FUNC_COS CARET _expression_core _expression
  283. tan_power: FUNC_TAN CARET _expression_core _expression
  284. csc_power: FUNC_CSC CARET _expression_core _expression
  285. sec_power: FUNC_SEC CARET _expression_core _expression
  286. cot_power: FUNC_COT CARET _expression_core _expression
  287. _hyperbolic_trigonometric_function: sinh | cosh | tanh
  288. sinh: FUNC_SINH _expression
  289. cosh: FUNC_COSH _expression
  290. tanh: FUNC_TANH _expression
  291. _inverse_trigonometric_function: arcsin | arccos | arctan | arccsc | arcsec | arccot
  292. arcsin: FUNC_ARCSIN _expression
  293. arccos: FUNC_ARCCOS _expression
  294. arctan: FUNC_ARCTAN _expression
  295. arccsc: FUNC_ARCCSC _expression
  296. arcsec: FUNC_ARCSEC _expression
  297. arccot: FUNC_ARCCOT _expression
  298. _inverse_hyperbolic_trigonometric_function: asinh | acosh | atanh
  299. asinh: FUNC_ARSINH _expression
  300. acosh: FUNC_ARCOSH _expression
  301. atanh: FUNC_ARTANH _expression
  302. abs: BAR _expression BAR
  303. floor: L_FLOOR _expression R_FLOOR
  304. ceil: L_CEIL _expression R_CEIL
  305. matrix: CMD_MATRIX_BEGIN matrix_body CMD_MATRIX_END // a matrix environment: begin token, body, end token. NOTE(review): nothing in this rule forces the begin and end tokens to name the same environment or the same kind of delimiter
  306. matrix_body: matrix_row (MATRIX_ROW_DELIM matrix_row)* (MATRIX_ROW_DELIM)? // one or more rows separated by MATRIX_ROW_DELIM; a trailing delimiter after the last row is allowed
  307. matrix_row: _expression (MATRIX_COL_DELIM _expression)* // one or more entries separated by MATRIX_COL_DELIM
  308. determinant: (CMD_DETERMINANT_BEGIN matrix_body CMD_DETERMINANT_END) // determinant written as an environment: vmatrix, or array/matrix between BARs, as defined by the CMD_DETERMINANT_* terminals
  309. | FUNC_DETERMINANT _expression // \det applied to the expression that follows
  310. trace: FUNC_MATRIX_TRACE _expression
  311. adjugate: FUNC_MATRIX_ADJUGATE _expression