# This file is part of Patsy
# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.

# This file implements a simple "shunting yard algorithm" parser for infix
# languages with parentheses. It is used as the core of our parser for
# formulas, but is generic enough to be used for other purposes as well
# (e.g. parsing linear constraints). It just builds a parse tree; semantics
# are somebody else's problem.
#
# Plus it spends energy on tracking where each item in the parse tree comes
# from, to allow high-quality error reporting.
#
# You are expected to provide a collection of Operators, a collection of
# atomic types, and an iterator that provides Tokens. Each Operator should
# have a unique token_type (which is an arbitrary Python object), and each
# Token should have a matching token_type, or one of the special types
# Token.LPAREN, Token.RPAREN. Each Token is required to have a valid Origin
# attached, for error reporting.

# XX: still seriously consider putting the magic intercept handling into the
# tokenizer. we'd still need separate term-sets that get pasted together by ~
# to create the modeldesc, though... heck maybe we should just have a
# modeldesc be 1-or-more termsets, with the convention that if it's 1, then
# it's a rhs, and if it's 2, it's (lhs, rhs), and otherwise you're on your
# own. Test: would this be useful for multiple-group log-linear models,
# maybe? Answer: Perhaps. outcome ~ x1 + x2 ~ group. But lots of other
# plausible, maybe better ways to write this -- (outcome | group) ~ x1 + x2?
# "outcome ~ x1 + x2", group="group"? etc.

from __future__ import print_function

__all__ = ["Token", "ParseNode", "Operator", "infix_parse"]

from patsy import PatsyError
from patsy.origin import Origin
from patsy.util import (repr_pretty_delegate, repr_pretty_impl,
                        no_pickling, assert_no_pickling)

class _UniqueValue(object):
    def __init__(self, print_as):
        self._print_as = print_as

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self._print_as)

    __getstate__ = no_pickling

class Token(object):
    """A token with possible payload.

    .. attribute:: type

       An arbitrary object indicating the type of this token. Should be
       :term:`hashable`, but otherwise it can be whatever you like.
    """
    LPAREN = _UniqueValue("LPAREN")
    RPAREN = _UniqueValue("RPAREN")

    def __init__(self, type, origin, extra=None):
        self.type = type
        self.origin = origin
        self.extra = extra

    __repr__ = repr_pretty_delegate
    def _repr_pretty_(self, p, cycle):
        assert not cycle
        kwargs = []
        if self.extra is not None:
            kwargs = [("extra", self.extra)]
        return repr_pretty_impl(p, self, [self.type, self.origin], kwargs)

    __getstate__ = no_pickling

class ParseNode(object):
    def __init__(self, type, token, args, origin):
        self.type = type
        self.token = token
        self.args = args
        self.origin = origin

    __repr__ = repr_pretty_delegate
    def _repr_pretty_(self, p, cycle):
        return repr_pretty_impl(p, self, [self.type, self.token, self.args])

    __getstate__ = no_pickling

class Operator(object):
    def __init__(self, token_type, arity, precedence):
        self.token_type = token_type
        self.arity = arity
        self.precedence = precedence

    def __repr__(self):
        return "%s(%r, %r, %r)" % (self.__class__.__name__,
                                   self.token_type, self.arity,
                                   self.precedence)

    __getstate__ = no_pickling

class _StackOperator(object):
    def __init__(self, op, token):
        self.op = op
        self.token = token

    __getstate__ = no_pickling

_open_paren = Operator(Token.LPAREN, -1, -9999999)
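
# Note: the large negative precedence ensures an open-paren is never popped
# off the stack by the precedence comparison in _read_op_context below; only
# a matching close-paren removes it.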

class _ParseContext(object):
    def __init__(self, unary_ops, binary_ops, atomic_types, trace):
        self.op_stack = []
        self.noun_stack = []
        self.unary_ops = unary_ops
        self.binary_ops = binary_ops
        self.atomic_types = atomic_types
        self.trace = trace

    __getstate__ = no_pickling

def _read_noun_context(token, c):
    # Returns True if the parser should still expect a noun next, False if
    # it should expect an operator.
    if token.type == Token.LPAREN:
        if c.trace:
            print("Pushing open-paren")
        c.op_stack.append(_StackOperator(_open_paren, token))
        return True
    elif token.type in c.unary_ops:
        if c.trace:
            print("Pushing unary op %r" % (token.type,))
        c.op_stack.append(_StackOperator(c.unary_ops[token.type], token))
        return True
    elif token.type in c.atomic_types:
        if c.trace:
            print("Pushing noun %r (%r)" % (token.type, token.extra))
        c.noun_stack.append(ParseNode(token.type, token, [],
                                      token.origin))
        return False
    else:
        raise PatsyError("expected a noun, not '%s'"
                         % (token.origin.relevant_code(),),
                         token)

def _run_op(c):
    # Pop the top operator and reduce: pop its arguments off the noun stack
    # and push the resulting ParseNode back onto the noun stack.
    assert c.op_stack
    stackop = c.op_stack.pop()
    args = []
    for i in range(stackop.op.arity):
        args.append(c.noun_stack.pop())
    args.reverse()
    if c.trace:
        print("Reducing %r (%r)" % (stackop.op.token_type, args))
    node = ParseNode(stackop.op.token_type, stackop.token, args,
                     Origin.combine([stackop.token] + args))
    c.noun_stack.append(node)
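
# For example: with nouns [a, b] on the noun stack and a "+" (arity 2)
# _StackOperator on top of the op stack, _run_op pushes ParseNode("+",
# token, [a, b]), whose Origin spans the operator token and both arguments.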

def _read_op_context(token, c):
    if token.type == Token.RPAREN:
        if c.trace:
            print("Found close-paren")
        while c.op_stack and c.op_stack[-1].op.token_type != Token.LPAREN:
            _run_op(c)
        if not c.op_stack:
            raise PatsyError("missing '(' or extra ')'", token)
        assert c.op_stack[-1].op.token_type == Token.LPAREN
        # Expand the origin of the item on top of the noun stack to include
        # the open and close parens:
        combined = Origin.combine([c.op_stack[-1].token,
                                   c.noun_stack[-1].token,
                                   token])
        c.noun_stack[-1].origin = combined
        # Pop the open-paren
        c.op_stack.pop()
        return False
    elif token.type in c.binary_ops:
        if c.trace:
            print("Found binary operator %r" % (token.type,))
        stackop = _StackOperator(c.binary_ops[token.type], token)
        while (c.op_stack
               and stackop.op.precedence <= c.op_stack[-1].op.precedence):
            _run_op(c)
        if c.trace:
            print("Pushing binary operator %r" % (token.type,))
        c.op_stack.append(stackop)
        return True
    else:
        raise PatsyError("expected an operator, not '%s'"
                         % (token.origin.relevant_code(),),
                         token)

def infix_parse(tokens, operators, atomic_types, trace=False):
    token_source = iter(tokens)

    unary_ops = {}
    binary_ops = {}
    for op in operators:
        assert op.precedence > _open_paren.precedence
        if op.arity == 1:
            unary_ops[op.token_type] = op
        elif op.arity == 2:
            binary_ops[op.token_type] = op
        else:
            raise ValueError("operators must be unary or binary")

    c = _ParseContext(unary_ops, binary_ops, atomic_types, trace)

    # This is an implementation of Dijkstra's shunting yard algorithm:
    #   http://en.wikipedia.org/wiki/Shunting_yard_algorithm
    #   http://www.engr.mun.ca/~theo/Misc/exp_parsing.htm
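
    # Sketch of the loop below: want_noun alternates between expecting a
    # noun (atom, unary op, or open-paren) and expecting an operator (binary
    # op or close-paren). A binary operator first pops any stacked operators
    # of equal or higher precedence, which makes operators left-associative.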

    want_noun = True
    for token in token_source:
        if c.trace:
            print("Reading next token (want_noun=%r)" % (want_noun,))
        if want_noun:
            want_noun = _read_noun_context(token, c)
        else:
            want_noun = _read_op_context(token, c)
    if c.trace:
        print("End of token stream")

    if want_noun:
        raise PatsyError("expected a noun, but instead the expression ended",
                         c.op_stack[-1].token.origin)

    while c.op_stack:
        if c.op_stack[-1].op.token_type == Token.LPAREN:
            raise PatsyError("Unmatched '('", c.op_stack[-1].token)
        _run_op(c)

    assert len(c.noun_stack) == 1
    return c.noun_stack.pop()

# Much more thorough tests in parse_formula.py, this is just a smoke test:
def test_infix_parse():
    ops = [Operator("+", 2, 10),
           Operator("*", 2, 20),
           Operator("-", 1, 30)]
    atomic = ["ATOM1", "ATOM2"]
    # a + -b * (c + d)
    mock_origin = Origin("asdf", 2, 3)
    tokens = [Token("ATOM1", mock_origin, "a"),
              Token("+", mock_origin, "+"),
              Token("-", mock_origin, "-"),
              Token("ATOM2", mock_origin, "b"),
              Token("*", mock_origin, "*"),
              Token(Token.LPAREN, mock_origin, "("),
              Token("ATOM1", mock_origin, "c"),
              Token("+", mock_origin, "+"),
              Token("ATOM2", mock_origin, "d"),
              Token(Token.RPAREN, mock_origin, ")")]
    tree = infix_parse(tokens, ops, atomic)
    def te(tree, type, extra):
        assert tree.type == type
        assert tree.token.extra == extra
    te(tree, "+", "+")
    te(tree.args[0], "ATOM1", "a")
    assert tree.args[0].args == []
    te(tree.args[1], "*", "*")
    te(tree.args[1].args[0], "-", "-")
    assert len(tree.args[1].args[0].args) == 1
    te(tree.args[1].args[0].args[0], "ATOM2", "b")
    te(tree.args[1].args[1], "+", "+")
    te(tree.args[1].args[1].args[0], "ATOM1", "c")
    te(tree.args[1].args[1].args[1], "ATOM2", "d")

    from nose.tools import assert_raises
    # No ternary ops
    assert_raises(ValueError,
                  infix_parse, [], [Operator("+", 3, 10)], ["ATOMIC"])

    # smoke test just to make sure there are no egregious bugs in 'trace'
    infix_parse(tokens, ops, atomic, trace=True)