nearley.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202
  1. "Converts Nearley grammars to Lark"
  2. import os.path
  3. import sys
  4. import codecs
  5. import argparse
  6. from lark import Lark, Transformer, v_args
# Lark grammar for the subset of the Nearley grammar language handled by this
# module. A Nearley file is a sequence of rule definitions and "@" directives:
#   * ``directive`` - "@name value", or "@{% ... %}" (aliased to ``js_code``).
#   * ``ruledef``   - "name -> expansions"; the form with a REGEXP between the
#                     name and the arrow is aliased to ``macro``.
#   * ``expansion`` - one or more items followed by an optional JS block
#                     (``js`` is always present as a node, possibly empty).
#   * ``expr``      - an item optionally followed by ":" and one of + * ?.
# Whitespace and "#" comments are ignored.
nearley_grammar = r"""
start: (ruledef|directive)+
directive: "@" NAME (STRING|NAME)
| "@" JS -> js_code
ruledef: NAME "->" expansions
| NAME REGEXP "->" expansions -> macro
expansions: expansion ("|" expansion)*
expansion: expr+ js
?expr: item (":" /[+*?]/)?
?item: rule|string|regexp|null
| "(" expansions ")"
rule: NAME
string: STRING
regexp: REGEXP
null: "null"
JS: /{%.*?%}/s
js: JS?
NAME: /[a-zA-Z_$]\w*/
COMMENT: /#[^\n]*/
REGEXP: /\[.*?\]/
STRING: _STRING "i"?
%import common.ESCAPED_STRING -> _STRING
%import common.WS
%ignore WS
%ignore COMMENT
"""

# Module-level parser for Nearley source text, built once at import time and
# reused for the main grammar and for every included/builtin file.
nearley_grammar_parser = Lark(nearley_grammar, parser='earley', lexer='basic')
  34. def _get_rulename(name):
  35. name = {'_': '_ws_maybe', '__': '_ws'}.get(name, name)
  36. return 'n_' + name.replace('$', '__DOLLAR__').lower()
  37. @v_args(inline=True)
  38. class NearleyToLark(Transformer):
  39. def __init__(self):
  40. self._count = 0
  41. self.extra_rules = {}
  42. self.extra_rules_rev = {}
  43. self.alias_js_code = {}
  44. def _new_function(self, code):
  45. name = 'alias_%d' % self._count
  46. self._count += 1
  47. self.alias_js_code[name] = code
  48. return name
  49. def _extra_rule(self, rule):
  50. if rule in self.extra_rules_rev:
  51. return self.extra_rules_rev[rule]
  52. name = 'xrule_%d' % len(self.extra_rules)
  53. assert name not in self.extra_rules
  54. self.extra_rules[name] = rule
  55. self.extra_rules_rev[rule] = name
  56. return name
  57. def rule(self, name):
  58. return _get_rulename(name)
  59. def ruledef(self, name, exps):
  60. return '!%s: %s' % (_get_rulename(name), exps)
  61. def expr(self, item, op):
  62. rule = '(%s)%s' % (item, op)
  63. return self._extra_rule(rule)
  64. def regexp(self, r):
  65. return '/%s/' % r
  66. def null(self):
  67. return ''
  68. def string(self, s):
  69. return self._extra_rule(s)
  70. def expansion(self, *x):
  71. x, js = x[:-1], x[-1]
  72. if js.children:
  73. js_code ,= js.children
  74. js_code = js_code[2:-2]
  75. alias = '-> ' + self._new_function(js_code)
  76. else:
  77. alias = ''
  78. return ' '.join(x) + alias
  79. def expansions(self, *x):
  80. return '%s' % ('\n |'.join(x))
  81. def start(self, *rules):
  82. return '\n'.join(filter(None, rules))
  83. def _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, includes):
  84. rule_defs = []
  85. tree = nearley_grammar_parser.parse(g)
  86. for statement in tree.children:
  87. if statement.data == 'directive':
  88. directive, arg = statement.children
  89. if directive in ('builtin', 'include'):
  90. folder = builtin_path if directive == 'builtin' else folder_path
  91. path = os.path.join(folder, arg[1:-1])
  92. if path not in includes:
  93. includes.add(path)
  94. with codecs.open(path, encoding='utf8') as f:
  95. text = f.read()
  96. rule_defs += _nearley_to_lark(text, builtin_path, n2l, js_code, os.path.abspath(os.path.dirname(path)), includes)
  97. else:
  98. assert False, directive
  99. elif statement.data == 'js_code':
  100. code ,= statement.children
  101. code = code[2:-2]
  102. js_code.append(code)
  103. elif statement.data == 'macro':
  104. pass # TODO Add support for macros!
  105. elif statement.data == 'ruledef':
  106. rule_defs.append(n2l.transform(statement))
  107. else:
  108. raise Exception("Unknown statement: %s" % statement)
  109. return rule_defs
  110. def create_code_for_nearley_grammar(g, start, builtin_path, folder_path, es6=False):
  111. import js2py
  112. emit_code = []
  113. def emit(x=None):
  114. if x:
  115. emit_code.append(x)
  116. emit_code.append('\n')
  117. js_code = ['function id(x) {return x[0];}']
  118. n2l = NearleyToLark()
  119. rule_defs = _nearley_to_lark(g, builtin_path, n2l, js_code, folder_path, set())
  120. lark_g = '\n'.join(rule_defs)
  121. lark_g += '\n'+'\n'.join('!%s: %s' % item for item in n2l.extra_rules.items())
  122. emit('from lark import Lark, Transformer')
  123. emit()
  124. emit('grammar = ' + repr(lark_g))
  125. emit()
  126. for alias, code in n2l.alias_js_code.items():
  127. js_code.append('%s = (%s);' % (alias, code))
  128. if es6:
  129. emit(js2py.translate_js6('\n'.join(js_code)))
  130. else:
  131. emit(js2py.translate_js('\n'.join(js_code)))
  132. emit('class TransformNearley(Transformer):')
  133. for alias in n2l.alias_js_code:
  134. emit(" %s = var.get('%s').to_python()" % (alias, alias))
  135. emit(" __default__ = lambda self, n, c, m: c if c else None")
  136. emit()
  137. emit('parser = Lark(grammar, start="n_%s", maybe_placeholders=False)' % start)
  138. emit('def parse(text):')
  139. emit(' return TransformNearley().transform(parser.parse(text))')
  140. return ''.join(emit_code)
  141. def main(fn, start, nearley_lib, es6=False):
  142. with codecs.open(fn, encoding='utf8') as f:
  143. grammar = f.read()
  144. return create_code_for_nearley_grammar(grammar, start, os.path.join(nearley_lib, 'builtin'), os.path.abspath(os.path.dirname(fn)), es6=es6)
  145. def get_arg_parser():
  146. parser = argparse.ArgumentParser(description='Reads a Nearley grammar (with js functions), and outputs an equivalent lark parser.')
  147. parser.add_argument('nearley_grammar', help='Path to the file containing the nearley grammar')
  148. parser.add_argument('start_rule', help='Rule within the nearley grammar to make the base rule')
  149. parser.add_argument('nearley_lib', help='Path to root directory of nearley codebase (used for including builtins)')
  150. parser.add_argument('--es6', help='Enable experimental ES6 support', action='store_true')
  151. return parser
  152. if __name__ == '__main__':
  153. parser = get_arg_parser()
  154. if len(sys.argv) == 1:
  155. parser.print_help(sys.stderr)
  156. sys.exit(1)
  157. args = parser.parse_args()
  158. print(main(fn=args.nearley_grammar, start=args.start_rule, nearley_lib=args.nearley_lib, es6=args.es6))