2 # Copyright (C) 2006, 2007, 2008 Michael Bayer mike_mp@zzzcomputing.com
4 # This module is part of Mako and is released under
5 # the MIT License: http://www.opensource.org/licenses/mit-license.php
7 """provides the Lexer class for parsing template strings into parse trees."""
10 from mako import parsetree, exceptions
11 from mako.pygen import adjust_whitespace
# Constructor: records the caller-supplied options and resets the lexer's
# position-tracking state.
# NOTE(review): this listing carries embedded source line numbers and skips
# some of them (17, 22, 24 and 32 inside this method), so further statements --
# presumably including the assignments of `self.text`, `self.lineno` and
# `self.tag`, which other methods in this file read -- are not visible here.
# Confirm against the complete file.
16 def __init__(self, text, filename=None, disable_unicode=False, input_encoding=None, preprocessor=None):
18 self.filename = filename
# Template-level node; append_node() appends to its `.nodes` list.
19 self.template = parsetree.TemplateNode(self.filename)
# Line and column of the most recent match; reported through exception_kwargs.
20 self.matched_lineno = 1
21 self.matched_charpos = 0
# Offset into self.text at which the next match() attempt begins.
23 self.match_position = 0
# Stack of control-line nodes; append_node() pushes to and pops from it.
25 self.control_line = []
26 self.disable_unicode = disable_unicode
27 self.encoding = input_encoding
# Normalise `preprocessor` into something iterable: None becomes an empty
# list and a value without __iter__ is wrapped in a one-element list.
28 if preprocessor is None:
29 self.preprocessor = []
30 elif not hasattr(preprocessor, '__iter__'):
31 self.preprocessor = [preprocessor]
# Presumably the `else:` branch (embedded line 32 is not shown): a value that
# is already iterable is stored unchanged.
33 self.preprocessor = preprocessor
# Read-only property bundling the source text, the line and column of the most
# recent match and the filename into a dict; methods in this file splat it
# into exception constructors as **self.exception_kwargs.
35 exception_kwargs = property(lambda self:{'source':self.text, 'lineno':self.matched_lineno, 'pos':self.matched_charpos, 'filename':self.filename})
# Tries `regexp` at the current position in self.text and, when it matches,
# advances the position and the line/column bookkeeping.
# NOTE(review): several embedded line numbers are skipped inside this method
# (the try/except, if/else and return lines are not visible), so the branch
# structure described in the comments below is partly assumed -- confirm
# against the complete file.
37 def match(self, regexp, flags=None):
38 """match the given regular expression string and flags to the current text position.
40 if a match occurs, update the current text and line position."""
# Remember where this attempt started; used for the line/column calculation.
41 mp = self.match_position
# Compiled patterns are looked up in, and stored into, `_regexp_cache` under
# the key (regexp, flags). `_regexp_cache` is defined outside this listing.
43 reg = _regexp_cache[(regexp, flags)]
# Two compile paths, with and without flags; the selecting condition is on an
# omitted line.
46 reg = re.compile(regexp, flags)
48 reg = re.compile(regexp)
49 _regexp_cache[(regexp, flags)] = reg
51 match = reg.match(self.text, self.match_position)
53 (start, end) = match.span()
# The position moves to `end + 1` or to `end`; the condition choosing between
# them is not visible. Presumably the +1 keeps a zero-length match from
# leaving the position unchanged -- TODO confirm.
55 self.match_position = end + 1
57 self.match_position = end
58 self.matched_lineno = self.lineno
# Newlines inside the consumed span; their count is added to self.lineno.
59 lines = re.findall(r"\n", self.text[mp:self.match_position])
# Scan until a newline (or the text bounds) is hit so the column can be
# derived as mp - cp; `cp` is initialised and stepped on omitted lines.
61 while (cp >= 0 and cp<self.textlength and self.text[cp] != '\n'):
63 self.matched_charpos = mp - cp
64 self.lineno += len(lines)
65 #print "MATCHED:", match.group(0), "LINE START:", self.matched_lineno, "LINE END:", self.lineno
66 #print "MATCH:", regexp, "\n", self.text[mp : mp + 15], (match and "TRUE" or "FALSE")
# Consumes text from the current position until one of the terminator patterns
# in `text` matches, consuming `#` comments and quoted strings along the way.
# The items of `text` are joined with `|` into a regular expression, so callers
# pass regex fragments (this file passes r'%>', r'\|' and r'}').
# Returns a tuple: (text consumed before the terminator, terminator as matched).
# Raises exceptions.SyntaxException for a quote with no closing partner
# ("Unmatched ...") and when no terminator is reached ("Expected: ...").
# NOTE(review): the loop header and the if/continue/else lines are not in this
# listing (embedded line numbers are skipped), so the exact control flow
# between the visible statements is assumed -- confirm against the full file.
69 def parse_until_text(self, *text):
70 startpos = self.match_position
# Consume a `#` comment through the end of its line.
72 match = self.match(r'#.*\n')
# Consume an opening quote (triple or single, either quote character) ...
75 match = self.match(r'(\"\"\"|\'\'\'|\"|\')')
# ... then everything up to and including the same quote sequence again.
77 m = self.match(r'.*?%s' % match.group(1), re.S)
79 raise exceptions.SyntaxException("Unmatched '%s'" % match.group(1), **self.exception_kwargs)
# Try the terminators themselves.
81 match = self.match(r'(%s)' % r'|'.join(text))
# Slice from the starting offset up to, but excluding, the matched terminator.
83 return (self.text[startpos:self.match_position-len(match.group(1))], match.group(1))
# Otherwise consume lazily up to the next quote, `#` or terminator (lookahead
# only, so that character is left for the next attempt).
85 match = self.match(r".*?(?=\"|\'|#|%s)" % r'|'.join(text), re.S)
87 raise exceptions.SyntaxException("Expected: %s" % ','.join(text), **self.exception_kwargs)
# Instantiates `nodecls(*args, **kwargs)` and attaches the new node to the
# parse tree, maintaining the open-tag stack (self.tag) and the control-line
# stack (self.control_line).
# Raises exceptions.SyntaxException when a control keyword is not accepted by
# is_ternary() of the control line on top of the stack.
# NOTE(review): the if/else lines at embedded numbers 95, 97, 100 and 104 are
# not in this listing; the conditions guarding the statements below are
# assumed where noted.
89 def append_node(self, nodecls, *args, **kwargs):
# Position information defaults to the most recent match unless the caller
# supplied it; the filename is always overwritten.
90 kwargs.setdefault('source', self.text)
91 kwargs.setdefault('lineno', self.matched_lineno)
92 kwargs.setdefault('pos', self.matched_charpos)
93 kwargs['filename'] = self.filename
94 node = nodecls(*args, **kwargs)
# The node goes either under the tag on top of self.tag or directly under the
# template; presumably chosen by whether self.tag is non-empty -- TODO confirm.
96 self.tag[-1].nodes.append(node)
98 self.template.nodes.append(node)
# A Tag becomes the new top of the tag stack; its parent is the previous top
# (presumably only assigned when the stack is non-empty -- line 100 omitted).
99 if isinstance(node, parsetree.Tag):
101 node.parent = self.tag[-1]
102 self.tag.append(node)
# Control lines: one branch pops the stack (its condition, on omitted line
# 104, presumably tests whether this is an end keyword), a primary keyword is
# pushed, and any other keyword must be a legal ternary of the current top.
103 elif isinstance(node, parsetree.ControlLine):
105 self.control_line.pop()
106 elif node.is_primary:
107 self.control_line.append(node)
108 elif len(self.control_line) and not self.control_line[-1].is_ternary(node.keyword):
109 raise exceptions.SyntaxException("Keyword '%s' not a legal ternary for keyword '%s'" % (node.keyword, self.control_line[-1].keyword), **self.exception_kwargs)
# When unicode handling is enabled and an encoding is set, returns `text`
# encoded to ASCII with non-ASCII characters written as backslash escapes.
# NOTE(review): the other branch (embedded lines 114-115) is not in this
# listing; presumably it returns `text` unchanged -- TODO confirm.
111 def escape_code(self, text):
112 if not self.disable_unicode and self.encoding:
113 return text.encode('ascii', 'backslashreplace')
# NOTE(review): the `def` line for this body is not in the listing (embedded
# line 117 is skipped, as are the try/else/while/continue/break lines). It is
# presumably the lexer's top-level parse routine -- confirm against the full
# file.
# Visible steps: run the preprocessors, determine the encoding and decode the
# text, try each match_* method against the text, then report unclosed tags
# and unterminated control lines.
# Each preprocessor is called with the current text and its return value
# replaces it.
118 for preproc in self.preprocessor:
119 self.text = preproc(self.text)
# Byte-string input starting with a UTF-8 BOM: drop the BOM, take utf-8 as the
# encoding and reject a magic encoding comment that names anything else.
120 if not isinstance(self.text, unicode) and self.text.startswith(codecs.BOM_UTF8):
121 self.text = self.text[len(codecs.BOM_UTF8):]
122 parsed_encoding = 'utf-8'
123 me = self.match_encoding()
124 if me is not None and me != 'utf-8':
125 raise exceptions.CompileException("Found utf-8 BOM in file, with conflicting magic encoding comment of '%s'" % me, self.text.decode('utf-8', 'ignore'), 0, 0, self.filename)
# Presumably the `else:` case (line 126 omitted): the encoding comes from the
# magic comment alone.
127 parsed_encoding = self.match_encoding()
# Presumably guarded by a check on omitted line 128 so that a missing magic
# comment does not overwrite the constructor's input_encoding -- TODO confirm.
129 self.encoding = parsed_encoding
# Byte-string input is decoded unless unicode handling is disabled.
130 if not self.disable_unicode and not isinstance(self.text, unicode):
# Two decode paths: with self.encoding, and with no explicit codec (whose
# failure message names 'ascii'). Which one runs is decided on omitted lines,
# presumably by whether self.encoding is set.
133 self.text = self.text.decode(self.encoding)
134 except UnicodeDecodeError, e:
135 raise exceptions.CompileException("Unicode decode operation of encoding '%s' failed" % self.encoding, self.text.decode('utf-8', 'ignore'), 0, 0, self.filename)
138 self.text = self.text.decode()
139 except UnicodeDecodeError, e:
140 raise exceptions.CompileException("Could not read template using encoding of 'ascii'. Did you forget a magic encoding comment?", self.text.decode('utf-8', 'ignore'), 0, 0, self.filename)
142 self.textlength = len(self.text)
# Main scan. The loop header and the statements following each `if` are
# omitted; the matchers appear in this order: expression, control line,
# comment, tag start, tag end, python block, plain text.
145 if self.match_position > self.textlength:
150 if self.match_expression():
152 if self.match_control_line():
154 if self.match_comment():
156 if self.match_tag_start():
158 if self.match_tag_end():
160 if self.match_python_block():
162 if self.match_text():
165 if self.match_position > self.textlength:
# Presumably reached only when no matcher succeeded and the position is still
# inside the text -- TODO confirm.
167 raise exceptions.CompileException("assertion failed")
# End-of-input checks: an open tag (condition on omitted line 169) or an open
# control line is a syntax error. The control-line error reports the position
# of the unterminated line itself rather than the last match.
170 raise exceptions.SyntaxException("Unclosed tag: <%%%s>" % self.tag[-1].keyword, **self.exception_kwargs)
171 if len(self.control_line):
172 raise exceptions.SyntaxException("Unterminated control keyword: '%s'" % self.control_line[-1].keyword, self.text, self.control_line[-1].lineno, self.control_line[-1].pos, self.filename)
# Matches a `#` comment line containing `coding:` or `coding=` at the current
# position and returns the encoding name captured after it.
# NOTE(review): the if/else lines (embedded 177, 179-180) are not in this
# listing; the caller compares the result with None, so the no-match branch
# presumably returns None -- TODO confirm.
175 def match_encoding(self):
176 match = self.match(r'#.*coding[:=]\s*([-\w.]+).*\r?\n')
178 return match.group(1)
# Tries to lex an opening tag at the current position and, on success, appends
# a parsetree.Tag node built from its keyword and attributes.
# The pattern's three groups are unpacked as the keyword (lower-cased), the
# raw attribute text and `isend`.
# Raises exceptions.SyntaxException ("Unclosed tag") when a `text` tag has no
# closing </%text>.
# NOTE(review): parts of the multi-line pattern and the if/else/return lines
# are not in this listing (embedded numbers are skipped), including the
# initialisation of `attributes`, the assignment of `text` inside the loop and
# whatever is done with `isend` -- confirm against the complete file.
182 def match_tag_start(self):
183 match = self.match(r'''
186 ([\w\.\:]+) # keyword
188 ((?:\s+\w+|=|".*?"|'.*?')*) # attrname, = sign, string expression
190 \s* # more whitespace
199 (keyword, attr, isend) = (match.group(1).lower(), match.group(2), match.group(3))
200 self.keyword = keyword
# Each attribute is name='value' or name="value"; val1/val2 hold the single-
# and double-quoted alternatives (one of them is empty for any given match).
203 for att in re.findall(r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr):
204 (key, val1, val2) = att
# CRLF line endings inside attribute values are normalised to LF before the
# value is passed through escape_code().
206 text = text.replace('\r\n', '\n')
207 attributes[key] = self.escape_code(text)
208 self.append_node(parsetree.Tag, keyword, attributes)
# A `text` tag takes everything up to the closing </%text> as a single Text
# node, then consumes the end tag.
212 if keyword == 'text':
213 match = self.match(r'(.*?)(?=\</%text>)', re.S)
215 raise exceptions.SyntaxException("Unclosed tag: <%%%s>" % self.tag[-1].keyword, **self.exception_kwargs)
216 self.append_node(parsetree.Text, match.group(1))
217 return self.match_tag_end()
# Tries to lex a closing tag `</%name>` (optional spaces/tabs around the name)
# at the current position and checks it against the open-tag stack.
# Raises exceptions.SyntaxException when no tag is open or when the name
# differs from the keyword of the tag on top of self.tag.
# NOTE(review): the `if match:` line and everything after the checks (embedded
# lines 224, 229-233) are not in this listing; presumably the tag is popped
# and a boolean returned -- TODO confirm.
222 def match_tag_end(self):
223 match = self.match(r'\</%[\t ]*(.+?)[\t ]*>')
225 if not len(self.tag):
226 raise exceptions.SyntaxException("Closing tag without opening tag: </%%%s>" % match.group(1), **self.exception_kwargs)
227 elif self.tag[-1].keyword != match.group(1):
228 raise exceptions.SyntaxException("Closing tag </%%%s> does not match tag: <%%%s>" % (match.group(1), self.tag[-1].keyword), **self.exception_kwargs)
# NOTE(review): fragment of a method whose `def` line (embedded line 234) and
# return statements are not in this listing; presumably an end-of-input
# matcher -- confirm against the complete file.
# `\Z` matches only at the very end of the text.
235 match = self.match(r'\Z', re.S)
237 string = match.group()
# Lexes a run of plain template text at the current position and appends it as
# a parsetree.Text node. Per the pattern's own comments, the lazily matched
# text (group 1) stops before a `%` or `##` line, a `${` expression, a `#*`
# multiline comment, or a `<%`, `</%`, `<&`, `</&` tag; an escaped newline is
# matched and thrown away.
# NOTE(review): parts of the multi-line pattern and the if/return lines are
# not in this listing (embedded numbers are skipped) -- confirm against the
# complete file.
245 def match_text(self):
246 match = self.match(r"""
247 (.*?) # anything, followed by:
249 (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based comment preceded by a consumed \n and whitespace
251 (?=\${) # an expression
253 (?=\#\*) # multiline comment
255 (?=</?[%&]) # a substitution or block or call start or end
258 (\\\r?\n) # an escaped newline - throw away
264 text = match.group(1)
265 self.append_node(parsetree.Text, text)
# Lexes a `<% ... %>` or `<%! ... %>` block at the current position and
# appends a parsetree.Code node for it.
# NOTE(review): the `if match:` and return lines (embedded 272, 277-279) are
# not in this listing -- confirm against the complete file.
270 def match_python_block(self):
271 match = self.match(r"<%(!)?")
# Capture the position of the opening delimiter now: parse_until_text() below
# performs further matches that overwrite matched_lineno / matched_charpos.
273 (line, pos) = (self.matched_lineno, self.matched_charpos)
274 (text, end) = self.parse_until_text(r'%>')
275 text = adjust_whitespace(text) + "\n" # the trailing newline helps compiler.parse() not complain about indentation
# The third positional argument is True when the block was opened with `<%!`.
276 self.append_node(parsetree.Code, self.escape_code(text), match.group(1)=='!', lineno=line, pos=pos)
# Lexes a `${ ... }` expression, optionally followed by `| escapes`, at the
# current position and appends a parsetree.Expression node for it.
# NOTE(review): the if/else and return lines (embedded 283, 286, 288-289,
# 292-294) are not in this listing; presumably the second parse_until_text()
# call runs only when the first stopped at `|`, and `escapes` is otherwise set
# on an omitted line -- TODO confirm.
281 def match_expression(self):
282 match = self.match(r"\${")
# Capture the position of `${` before further matches overwrite it.
284 (line, pos) = (self.matched_lineno, self.matched_charpos)
# The expression body ends at either a `|` or the closing `}`.
285 (text, end) = self.parse_until_text(r'\|', r'}')
# The escape list runs up to the closing `}`.
287 (escapes, end) = self.parse_until_text(r'}')
# CRLF line endings in the expression source are normalised to LF.
290 text = text.replace('\r\n', '\n')
291 self.append_node(parsetree.Expression, self.escape_code(text), escapes.strip(), lineno=line, pos=pos)
# Lexes a line that starts (after optional spaces/tabs) with `%` or `##`.
# Group 1 is the operator and group 2 the rest of the line, which may continue
# across backslash-escaped newlines. One path appends a parsetree.ControlLine
# node and the other a parsetree.Comment node.
# Raises exceptions.SyntaxException for a line that does not parse as
# `[end]keyword ...`, for an end keyword with no open control line, and for an
# end keyword that differs from the open control line's keyword.
# NOTE(review): the if/else and return lines (embedded 298, 301, 303,
# 307-308, 314, 316-318) are not in this listing; presumably `%` selects the
# ControlLine path and `##` the Comment path -- TODO confirm.
296 def match_control_line(self):
297 match = self.match(r"(?<=^)[\t ]*(%|##)[\t ]*((?:(?:\\r?\n)|[^\r\n])*)(?:\r?\n|\Z)", re.M)
299 operator = match.group(1)
300 text = match.group(2)
# Split into an optional `end` prefix, the keyword and the remainder.
302 m2 = re.match(r'(end)?(\w+)\s*(.*)', text)
304 raise exceptions.SyntaxException("Invalid control line: '%s'" % text, **self.exception_kwargs)
305 (isend, keyword) = m2.group(1, 2)
# Convert the optional capture into a boolean.
306 isend = (isend is not None)
# Presumably only checked for end keywords (the guarding `if` is omitted): it
# must close the control line on top of the stack.
309 if not len(self.control_line):
310 raise exceptions.SyntaxException("No starting keyword '%s' for '%s'" % (keyword, text), **self.exception_kwargs)
311 elif self.control_line[-1].keyword != keyword:
312 raise exceptions.SyntaxException("Keyword '%s' doesn't match keyword '%s'" % (text, self.control_line[-1].keyword), **self.exception_kwargs)
313 self.append_node(parsetree.ControlLine, keyword, isend, self.escape_code(text))
315 self.append_node(parsetree.Comment, text)
# Lexes a `<%doc> ... </%doc>` block at the current position (re.S lets the
# body span several lines) and appends its body as a parsetree.Comment node.
# NOTE(review): the `if match:` and return lines (embedded 323, 325-327) are
# not in this listing -- confirm against the complete file.
320 def match_comment(self):
321 """matches the multiline version of a comment"""
322 match = self.match(r"<%doc>(.*?)</%doc>", re.S)
324 self.append_node(parsetree.Comment, match.group(1))