XPath.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352
  1. #
  2. # Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
  3. # Use of this file is governed by the BSD 3-clause license that
  4. # can be found in the LICENSE.txt file in the project root.
  5. #
  6. #
  7. # Represent a subset of XPath XML path syntax for use in identifying nodes in
  8. # parse trees.
  9. #
  10. # <p>
  11. # Split path into words and separators {@code /} and {@code //} via ANTLR
  12. # itself then walk path elements from left to right. At each separator-word
  13. # pair, find set of nodes. Next stage uses those as work list.</p>
  14. #
  15. # <p>
  16. # The basic interface is
  17. # {@link XPath#findAll ParseTree.findAll}{@code (tree, pathString, parser)}.
  18. # But that is just shorthand for:</p>
  19. #
  20. # <pre>
  21. # {@link XPath} p = new {@link XPath#XPath XPath}(parser, pathString);
  22. # return p.{@link #evaluate evaluate}(tree);
  23. # </pre>
  24. #
  25. # <p>
  26. # See {@code org.antlr.v4.test.TestXPath} for descriptions. In short, this
  27. # allows operators:</p>
  28. #
  29. # <dl>
  30. # <dt>/</dt> <dd>root</dd>
  31. # <dt>//</dt> <dd>anywhere</dd>
  32. # <dt>!</dt> <dd>invert; this must appear directly after root or anywhere
  33. # operator</dd>
  34. # </dl>
  35. #
  36. # <p>
  37. # and path elements:</p>
  38. #
  39. # <dl>
  40. # <dt>ID</dt> <dd>token name</dd>
  41. # <dt>'string'</dt> <dd>any string literal token from the grammar</dd>
  42. # <dt>expr</dt> <dd>rule name</dd>
  43. # <dt>*</dt> <dd>wildcard matching any node</dd>
  44. # </dl>
  45. #
  46. # <p>
  47. # Whitespace is not allowed.</p>
  48. #
  49. from antlr4 import CommonTokenStream, DFA, PredictionContextCache, Lexer, LexerATNSimulator, ParserRuleContext, TerminalNode
  50. from antlr4.InputStream import InputStream
  51. from antlr4.Parser import Parser
  52. from antlr4.RuleContext import RuleContext
  53. from antlr4.Token import Token
  54. from antlr4.atn.ATNDeserializer import ATNDeserializer
  55. from antlr4.error.ErrorListener import ErrorListener
  56. from antlr4.error.Errors import LexerNoViableAltException
  57. from antlr4.tree.Tree import ParseTree
  58. from antlr4.tree.Trees import Trees
  59. from io import StringIO
  60. def serializedATN():
  61. with StringIO() as buf:
  62. buf.write("\3\u0430\ud6d1\u8206\uad2d\u4417\uaef1\u8d80\uaadd\2\n")
  63. buf.write("\64\b\1\4\2\t\2\4\3\t\3\4\4\t\4\4\5\t\5\4\6\t\6\4\7\t")
  64. buf.write("\7\4\b\t\b\4\t\t\t\3\2\3\2\3\2\3\3\3\3\3\4\3\4\3\5\3\5")
  65. buf.write("\3\6\3\6\7\6\37\n\6\f\6\16\6\"\13\6\3\6\3\6\3\7\3\7\5")
  66. buf.write("\7(\n\7\3\b\3\b\3\t\3\t\7\t.\n\t\f\t\16\t\61\13\t\3\t")
  67. buf.write("\3\t\3/\2\n\3\5\5\6\7\7\t\b\13\t\r\2\17\2\21\n\3\2\4\7")
  68. buf.write("\2\62;aa\u00b9\u00b9\u0302\u0371\u2041\u2042\17\2C\\c")
  69. buf.write("|\u00c2\u00d8\u00da\u00f8\u00fa\u0301\u0372\u037f\u0381")
  70. buf.write("\u2001\u200e\u200f\u2072\u2191\u2c02\u2ff1\u3003\ud801")
  71. buf.write("\uf902\ufdd1\ufdf2\uffff\64\2\3\3\2\2\2\2\5\3\2\2\2\2")
  72. buf.write("\7\3\2\2\2\2\t\3\2\2\2\2\13\3\2\2\2\2\21\3\2\2\2\3\23")
  73. buf.write("\3\2\2\2\5\26\3\2\2\2\7\30\3\2\2\2\t\32\3\2\2\2\13\34")
  74. buf.write("\3\2\2\2\r\'\3\2\2\2\17)\3\2\2\2\21+\3\2\2\2\23\24\7\61")
  75. buf.write("\2\2\24\25\7\61\2\2\25\4\3\2\2\2\26\27\7\61\2\2\27\6\3")
  76. buf.write("\2\2\2\30\31\7,\2\2\31\b\3\2\2\2\32\33\7#\2\2\33\n\3\2")
  77. buf.write("\2\2\34 \5\17\b\2\35\37\5\r\7\2\36\35\3\2\2\2\37\"\3\2")
  78. buf.write("\2\2 \36\3\2\2\2 !\3\2\2\2!#\3\2\2\2\" \3\2\2\2#$\b\6")
  79. buf.write("\2\2$\f\3\2\2\2%(\5\17\b\2&(\t\2\2\2\'%\3\2\2\2\'&\3\2")
  80. buf.write("\2\2(\16\3\2\2\2)*\t\3\2\2*\20\3\2\2\2+/\7)\2\2,.\13\2")
  81. buf.write("\2\2-,\3\2\2\2.\61\3\2\2\2/\60\3\2\2\2/-\3\2\2\2\60\62")
  82. buf.write("\3\2\2\2\61/\3\2\2\2\62\63\7)\2\2\63\22\3\2\2\2\6\2 \'")
  83. buf.write("/\3\3\6\2")
  84. return buf.getvalue()
  85. class XPathLexer(Lexer):
  86. atn = ATNDeserializer().deserialize(serializedATN())
  87. decisionsToDFA = [ DFA(ds, i) for i, ds in enumerate(atn.decisionToState) ]
  88. TOKEN_REF = 1
  89. RULE_REF = 2
  90. ANYWHERE = 3
  91. ROOT = 4
  92. WILDCARD = 5
  93. BANG = 6
  94. ID = 7
  95. STRING = 8
  96. modeNames = [ "DEFAULT_MODE" ]
  97. literalNames = [ "<INVALID>",
  98. "'//'", "'/'", "'*'", "'!'" ]
  99. symbolicNames = [ "<INVALID>",
  100. "TOKEN_REF", "RULE_REF", "ANYWHERE", "ROOT", "WILDCARD", "BANG",
  101. "ID", "STRING" ]
  102. ruleNames = [ "ANYWHERE", "ROOT", "WILDCARD", "BANG", "ID", "NameChar",
  103. "NameStartChar", "STRING" ]
  104. grammarFileName = "XPathLexer.g4"
  105. def __init__(self, input=None):
  106. super().__init__(input)
  107. self.checkVersion("4.9.1")
  108. self._interp = LexerATNSimulator(self, self.atn, self.decisionsToDFA, PredictionContextCache())
  109. self._actions = None
  110. self._predicates = None
  111. def action(self, localctx:RuleContext, ruleIndex:int, actionIndex:int):
  112. if self._actions is None:
  113. actions = dict()
  114. actions[4] = self.ID_action
  115. self._actions = actions
  116. _action = self._actions.get(ruleIndex, None)
  117. if _action is not None:
  118. _action(localctx, actionIndex)
  119. else:
  120. raise Exception("No registered action for: %d" % ruleIndex)
  121. def ID_action(self, localctx:RuleContext , actionIndex:int):
  122. if actionIndex == 0:
  123. char = self.text[0]
  124. if char.isupper():
  125. self.type = XPathLexer.TOKEN_REF
  126. else:
  127. self.type = XPathLexer.RULE_REF
  128. class XPath(object):
  129. WILDCARD = "*" # word not operator/separator
  130. NOT = "!" # word for invert operator
  131. def __init__(self, parser:Parser, path:str):
  132. self.parser = parser
  133. self.path = path
  134. self.elements = self.split(path)
  135. def split(self, path:str):
  136. input = InputStream(path)
  137. lexer = XPathLexer(input)
  138. def recover(self, e):
  139. raise e
  140. lexer.recover = recover
  141. lexer.removeErrorListeners()
  142. lexer.addErrorListener(ErrorListener()) # XPathErrorListener does no more
  143. tokenStream = CommonTokenStream(lexer)
  144. try:
  145. tokenStream.fill()
  146. except LexerNoViableAltException as e:
  147. pos = lexer.column
  148. msg = "Invalid tokens or characters at index %d in path '%s'" % (pos, path)
  149. raise Exception(msg, e)
  150. tokens = iter(tokenStream.tokens)
  151. elements = list()
  152. for el in tokens:
  153. invert = False
  154. anywhere = False
  155. # Check for path separators, if none assume root
  156. if el.type in [XPathLexer.ROOT, XPathLexer.ANYWHERE]:
  157. anywhere = el.type == XPathLexer.ANYWHERE
  158. next_el = next(tokens, None)
  159. if not next_el:
  160. raise Exception('Missing element after %s' % el.getText())
  161. else:
  162. el = next_el
  163. # Check for bangs
  164. if el.type == XPathLexer.BANG:
  165. invert = True
  166. next_el = next(tokens, None)
  167. if not next_el:
  168. raise Exception('Missing element after %s' % el.getText())
  169. else:
  170. el = next_el
  171. # Add searched element
  172. if el.type in [XPathLexer.TOKEN_REF, XPathLexer.RULE_REF, XPathLexer.WILDCARD, XPathLexer.STRING]:
  173. element = self.getXPathElement(el, anywhere)
  174. element.invert = invert
  175. elements.append(element)
  176. elif el.type==Token.EOF:
  177. break
  178. else:
  179. raise Exception("Unknown path element %s" % lexer.symbolicNames[el.type])
  180. return elements
  181. #
  182. # Convert word like {@code#} or {@code ID} or {@code expr} to a path
  183. # element. {@code anywhere} is {@code true} if {@code //} precedes the
  184. # word.
  185. #
  186. def getXPathElement(self, wordToken:Token, anywhere:bool):
  187. if wordToken.type==Token.EOF:
  188. raise Exception("Missing path element at end of path")
  189. word = wordToken.text
  190. if wordToken.type==XPathLexer.WILDCARD :
  191. return XPathWildcardAnywhereElement() if anywhere else XPathWildcardElement()
  192. elif wordToken.type in [XPathLexer.TOKEN_REF, XPathLexer.STRING]:
  193. tsource = self.parser.getTokenStream().tokenSource
  194. ttype = Token.INVALID_TYPE
  195. if wordToken.type == XPathLexer.TOKEN_REF:
  196. if word in tsource.ruleNames:
  197. ttype = tsource.ruleNames.index(word) + 1
  198. else:
  199. if word in tsource.literalNames:
  200. ttype = tsource.literalNames.index(word)
  201. if ttype == Token.INVALID_TYPE:
  202. raise Exception("%s at index %d isn't a valid token name" % (word, wordToken.tokenIndex))
  203. return XPathTokenAnywhereElement(word, ttype) if anywhere else XPathTokenElement(word, ttype)
  204. else:
  205. ruleIndex = self.parser.ruleNames.index(word) if word in self.parser.ruleNames else -1
  206. if ruleIndex == -1:
  207. raise Exception("%s at index %d isn't a valid rule name" % (word, wordToken.tokenIndex))
  208. return XPathRuleAnywhereElement(word, ruleIndex) if anywhere else XPathRuleElement(word, ruleIndex)
  209. @staticmethod
  210. def findAll(tree:ParseTree, xpath:str, parser:Parser):
  211. p = XPath(parser, xpath)
  212. return p.evaluate(tree)
  213. #
  214. # Return a list of all nodes starting at {@code t} as root that satisfy the
  215. # path. The root {@code /} is relative to the node passed to
  216. # {@link #evaluate}.
  217. #
  218. def evaluate(self, t:ParseTree):
  219. dummyRoot = ParserRuleContext()
  220. dummyRoot.children = [t] # don't set t's parent.
  221. work = [dummyRoot]
  222. for element in self.elements:
  223. work_next = list()
  224. for node in work:
  225. if not isinstance(node, TerminalNode) and node.children:
  226. # only try to match next element if it has children
  227. # e.g., //func/*/stat might have a token node for which
  228. # we can't go looking for stat nodes.
  229. matching = element.evaluate(node)
  230. # See issue antlr#370 - Prevents XPath from returning the
  231. # same node multiple times
  232. matching = filter(lambda m: m not in work_next, matching)
  233. work_next.extend(matching)
  234. work = work_next
  235. return work
  236. class XPathElement(object):
  237. def __init__(self, nodeName:str):
  238. self.nodeName = nodeName
  239. self.invert = False
  240. def __str__(self):
  241. return type(self).__name__ + "[" + ("!" if self.invert else "") + self.nodeName + "]"
  242. #
  243. # Either {@code ID} at start of path or {@code ...//ID} in middle of path.
  244. #
  245. class XPathRuleAnywhereElement(XPathElement):
  246. def __init__(self, ruleName:str, ruleIndex:int):
  247. super().__init__(ruleName)
  248. self.ruleIndex = ruleIndex
  249. def evaluate(self, t:ParseTree):
  250. # return all ParserRuleContext descendants of t that match ruleIndex (or do not match if inverted)
  251. return filter(lambda c: isinstance(c, ParserRuleContext) and (self.invert ^ (c.getRuleIndex() == self.ruleIndex)), Trees.descendants(t))
  252. class XPathRuleElement(XPathElement):
  253. def __init__(self, ruleName:str, ruleIndex:int):
  254. super().__init__(ruleName)
  255. self.ruleIndex = ruleIndex
  256. def evaluate(self, t:ParseTree):
  257. # return all ParserRuleContext children of t that match ruleIndex (or do not match if inverted)
  258. return filter(lambda c: isinstance(c, ParserRuleContext) and (self.invert ^ (c.getRuleIndex() == self.ruleIndex)), Trees.getChildren(t))
  259. class XPathTokenAnywhereElement(XPathElement):
  260. def __init__(self, ruleName:str, tokenType:int):
  261. super().__init__(ruleName)
  262. self.tokenType = tokenType
  263. def evaluate(self, t:ParseTree):
  264. # return all TerminalNode descendants of t that match tokenType (or do not match if inverted)
  265. return filter(lambda c: isinstance(c, TerminalNode) and (self.invert ^ (c.symbol.type == self.tokenType)), Trees.descendants(t))
  266. class XPathTokenElement(XPathElement):
  267. def __init__(self, ruleName:str, tokenType:int):
  268. super().__init__(ruleName)
  269. self.tokenType = tokenType
  270. def evaluate(self, t:ParseTree):
  271. # return all TerminalNode children of t that match tokenType (or do not match if inverted)
  272. return filter(lambda c: isinstance(c, TerminalNode) and (self.invert ^ (c.symbol.type == self.tokenType)), Trees.getChildren(t))
  273. class XPathWildcardAnywhereElement(XPathElement):
  274. def __init__(self):
  275. super().__init__(XPath.WILDCARD)
  276. def evaluate(self, t:ParseTree):
  277. if self.invert:
  278. return list() # !* is weird but valid (empty)
  279. else:
  280. return Trees.descendants(t)
  281. class XPathWildcardElement(XPathElement):
  282. def __init__(self):
  283. super().__init__(XPath.WILDCARD)
  284. def evaluate(self, t:ParseTree):
  285. if self.invert:
  286. return list() # !* is weird but valid (empty)
  287. else:
  288. return Trees.getChildren(t)