bin/mako/lexer.py

   1 # lexer.py
   2 # Copyright (C) 2006, 2007, 2008 Michael Bayer mike_mp@zzzcomputing.com
   3 #
   4 # This module is part of Mako and is released under
   5 # the MIT License: http://www.opensource.org/licenses/mit-license.php
   6
   7 """provides the Lexer class for parsing template strings into parse trees."""
   8
   9 import re, codecs
  10 from mako import parsetree, exceptions
  11 from mako.pygen import adjust_whitespace
  12
  13 _regexp_cache = {}
  14
  15 class Lexer(object):
  16     def __init__(self, text, filename=None, disable_unicode=False, input_encoding=None, preprocessor=None):
  17         self.text = text
  18         self.filename = filename
  19         self.template = parsetree.TemplateNode(self.filename)
  20         self.matched_lineno = 1
  21         self.matched_charpos = 0
  22         self.lineno = 1
  23         self.match_position = 0
  24         self.tag = []
  25         self.control_line = []
  26         self.disable_unicode = disable_unicode
  27         self.encoding = input_encoding
  28         if preprocessor is None:
  29             self.preprocessor = []
  30         elif not hasattr(preprocessor, '__iter__'):
  31             self.preprocessor = [preprocessor]
  32         else:
  33             self.preprocessor = preprocessor
  34
  35     exception_kwargs = property(lambda self:{'source':self.text, 'lineno':self.matched_lineno, 'pos':self.matched_charpos, 'filename':self.filename})
  36
  37     def match(self, regexp, flags=None):
  38         """match the given regular expression string and flags to the current text position.
  39
  40         if a match occurs, update the current text and line position."""
  41         mp = self.match_position
  42         try:
  43             reg = _regexp_cache[(regexp, flags)]
  44         except KeyError:
  45             if flags:
  46                 reg = re.compile(regexp, flags)
  47             else:
  48                 reg = re.compile(regexp)
  49             _regexp_cache[(regexp, flags)] = reg
  50
  51         match = reg.match(self.text, self.match_position)
  52         if match:
  53             (start, end) = match.span()
  54             if end == start:
  55                 self.match_position = end + 1
  56             else:
  57                 self.match_position = end
  58             self.matched_lineno = self.lineno
  59             lines = re.findall(r"\n", self.text[mp:self.match_position])
  60             cp = mp - 1
  61             while (cp >= 0 and cp<self.textlength and self.text[cp] != '\n'):
  62                 cp -=1
  63             self.matched_charpos = mp - cp
  64             self.lineno += len(lines)
  65             #print "MATCHED:", match.group(0), "LINE START:", self.matched_lineno, "LINE END:", self.lineno
  66         #print "MATCH:", regexp, "\n", self.text[mp : mp + 15], (match and "TRUE" or "FALSE")
  67         return match
  68
  69     def parse_until_text(self, *text):
  70         startpos = self.match_position
  71         while True:
  72             match = self.match(r'#.*\n')
  73             if match:
  74                 continue
  75             match = self.match(r'(\"\"\"|\'\'\'|\"|\')')
  76             if match:
  77                 m = self.match(r'.*?%s' % match.group(1), re.S)
  78                 if not m:
  79                     raise exceptions.SyntaxException("Unmatched '%s'" % match.group(1), **self.exception_kwargs)
  80             else:
  81                 match = self.match(r'(%s)' % r'|'.join(text))
  82                 if match:
  83                     return (self.text[startpos:self.match_position-len(match.group(1))], match.group(1))
  84                 else:
  85                     match = self.match(r".*?(?=\"|\'|#|%s)" % r'|'.join(text), re.S)
  86                     if not match:
  87                         raise exceptions.SyntaxException("Expected: %s" % ','.join(text), **self.exception_kwargs)
  88
  89     def append_node(self, nodecls, *args, **kwargs):
  90         kwargs.setdefault('source', self.text)
  91         kwargs.setdefault('lineno', self.matched_lineno)
  92         kwargs.setdefault('pos', self.matched_charpos)
  93         kwargs['filename'] = self.filename
  94         node = nodecls(*args, **kwargs)
  95         if len(self.tag):
  96             self.tag[-1].nodes.append(node)
  97         else:
  98             self.template.nodes.append(node)
  99         if isinstance(node, parsetree.Tag):
 100             if len(self.tag):
 101                 node.parent = self.tag[-1]
 102             self.tag.append(node)
 103         elif isinstance(node, parsetree.ControlLine):
 104             if node.isend:
 105                 self.control_line.pop()
 106             elif node.is_primary:
 107                 self.control_line.append(node)
 108             elif len(self.control_line) and not self.control_line[-1].is_ternary(node.keyword):
 109                 raise exceptions.SyntaxException("Keyword '%s' not a legal ternary for keyword '%s'" % (node.keyword, self.control_line[-1].keyword), **self.exception_kwargs)
 110
 111     def escape_code(self, text):
 112         if not self.disable_unicode and self.encoding:
 113             return text.encode('ascii', 'backslashreplace')
 114         else:
 115             return text
 116
 117     def parse(self):
 118         for preproc in self.preprocessor:
 119             self.text = preproc(self.text)
 120         if not isinstance(self.text, unicode) and self.text.startswith(codecs.BOM_UTF8):
 121             self.text = self.text[len(codecs.BOM_UTF8):]
 122             parsed_encoding = 'utf-8'
 123             me = self.match_encoding()
 124             if me is not None and me != 'utf-8':
 125                 raise exceptions.CompileException("Found utf-8 BOM in file, with conflicting magic encoding comment of '%s'" % me, self.text.decode('utf-8', 'ignore'), 0, 0, self.filename)
 126         else:
 127             parsed_encoding = self.match_encoding()
 128         if parsed_encoding:
 129             self.encoding = parsed_encoding
 130         if not self.disable_unicode and not isinstance(self.text, unicode):
 131             if self.encoding:
 132                 try:
 133                     self.text = self.text.decode(self.encoding)
 134                 except UnicodeDecodeError, e:
 135                     raise exceptions.CompileException("Unicode decode operation of encoding '%s' failed" % self.encoding, self.text.decode('utf-8', 'ignore'), 0, 0, self.filename)
 136             else:
 137                 try:
 138                     self.text = self.text.decode()
 139                 except UnicodeDecodeError, e:
 140                     raise exceptions.CompileException("Could not read template using encoding of 'ascii'.  Did you forget a magic encoding comment?", self.text.decode('utf-8', 'ignore'), 0, 0, self.filename)
 141
 142         self.textlength = len(self.text)
 143
 144         while (True):
 145             if self.match_position > self.textlength:
 146                 break
 147
 148             if self.match_end():
 149                 break
 150             if self.match_expression():
 151                 continue
 152             if self.match_control_line():
 153                 continue
 154             if self.match_comment():
 155                 continue
 156             if self.match_tag_start():
 157                 continue
 158             if self.match_tag_end():
 159                 continue
 160             if self.match_python_block():
 161                 continue
 162             if self.match_text():
 163                 continue
 164
 165             if self.match_position > self.textlength:
 166                 break
 167             raise exceptions.CompileException("assertion failed")
 168
 169         if len(self.tag):
 170             raise exceptions.SyntaxException("Unclosed tag: <%%%s>" % self.tag[-1].keyword, **self.exception_kwargs)
 171         if len(self.control_line):
 172             raise exceptions.SyntaxException("Unterminated control keyword: '%s'" % self.control_line[-1].keyword, self.text, self.control_line[-1].lineno, self.control_line[-1].pos, self.filename)
 173         return self.template
 174
 175     def match_encoding(self):
 176         match = self.match(r'#.*coding[:=]\s*([-\w.]+).*\r?\n')
 177         if match:
 178             return match.group(1)
 179         else:
 180             return None
 181
 182     def match_tag_start(self):
 183         match = self.match(r'''
 184             \<%     # opening tag
 185
 186             ([\w\.\:]+)   # keyword
 187
 188             ((?:\s+\w+|=|".*?"|'.*?')*)  # attrname, = sign, string expression
 189
 190             \s*     # more whitespace
 191
 192             (/)?>   # closing
 193
 194             ''',
 195
 196             re.I | re.S | re.X)
 197
 198         if match:
 199             (keyword, attr, isend) = (match.group(1).lower(), match.group(2), match.group(3))
 200             self.keyword = keyword
 201             attributes = {}
 202             if attr:
 203                 for att in re.findall(r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr):
 204                     (key, val1, val2) = att
 205                     text = val1 or val2
 206                     text = text.replace('\r\n', '\n')
 207                     attributes[key] = self.escape_code(text)
 208             self.append_node(parsetree.Tag, keyword, attributes)
 209             if isend:
 210                 self.tag.pop()
 211             else:
 212                 if keyword == 'text':
 213                     match = self.match(r'(.*?)(?=\</%text>)',  re.S)
 214                     if not match:
 215                         raise exceptions.SyntaxException("Unclosed tag: <%%%s>" % self.tag[-1].keyword, **self.exception_kwargs)
 216                     self.append_node(parsetree.Text, match.group(1))
 217                     return self.match_tag_end()
 218             return True
 219         else:
 220             return False
 221
 222     def match_tag_end(self):
 223         match = self.match(r'\</%[\t ]*(.+?)[\t ]*>')
 224         if match:
 225             if not len(self.tag):
 226                 raise exceptions.SyntaxException("Closing tag without opening tag: </%%%s>" % match.group(1), **self.exception_kwargs)
 227             elif self.tag[-1].keyword != match.group(1):
 228                 raise exceptions.SyntaxException("Closing tag </%%%s> does not match tag: <%%%s>" % (match.group(1), self.tag[-1].keyword), **self.exception_kwargs)
 229             self.tag.pop()
 230             return True
 231         else:
 232             return False
 233
 234     def match_end(self):
 235         match = self.match(r'\Z', re.S)
 236         if match:
 237             string = match.group()
 238             if string:
 239                 return string
 240             else:
 241                 return True
 242         else:
 243             return False
 244
 245     def match_text(self):
 246         match = self.match(r"""
 247                 (.*?)         # anything, followed by:
 248                 (
 249                  (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based comment preceded by a consumed \n and whitespace
 250                  |
 251                  (?=\${)   # an expression
 252                  |
 253                  (?=\#\*) # multiline comment
 254                  |
 255                  (?=</?[%&])  # a substitution or block or call start or end
 256                                               # - don't consume
 257                  |
 258                  (\\\r?\n)         # an escaped newline  - throw away
 259                  |
 260                  \Z           # end of string
 261                 )""", re.X | re.S)
 262
 263         if match:
 264             text = match.group(1)
 265             self.append_node(parsetree.Text, text)
 266             return True
 267         else:
 268             return False
 269
 270     def match_python_block(self):
 271         match = self.match(r"<%(!)?")
 272         if match:
 273             (line, pos) = (self.matched_lineno, self.matched_charpos)
 274             (text, end) = self.parse_until_text(r'%>')
 275             text = adjust_whitespace(text) + "\n"   # the trailing newline helps compiler.parse() not complain about indentation
 276             self.append_node(parsetree.Code, self.escape_code(text), match.group(1)=='!', lineno=line, pos=pos)
 277             return True
 278         else:
 279             return False
 280
 281     def match_expression(self):
 282         match = self.match(r"\${")
 283         if match:
 284             (line, pos) = (self.matched_lineno, self.matched_charpos)
 285             (text, end) = self.parse_until_text(r'\|', r'}')
 286             if end == '|':
 287                 (escapes, end) = self.parse_until_text(r'}')
 288             else:
 289                 escapes = ""
 290             text = text.replace('\r\n', '\n')
 291             self.append_node(parsetree.Expression, self.escape_code(text), escapes.strip(), lineno=line, pos=pos)
 292             return True
 293         else:
 294             return False
 295
 296     def match_control_line(self):
 297         match = self.match(r"(?<=^)[\t ]*(%|##)[\t ]*((?:(?:\\r?\n)|[^\r\n])*)(?:\r?\n|\Z)", re.M)
 298         if match:
 299             operator = match.group(1)
 300             text = match.group(2)
 301             if operator == '%':
 302                 m2 = re.match(r'(end)?(\w+)\s*(.*)', text)
 303                 if not m2:
 304                     raise exceptions.SyntaxException("Invalid control line: '%s'" % text, **self.exception_kwargs)
 305                 (isend, keyword) = m2.group(1, 2)
 306                 isend = (isend is not None)
 307
 308                 if isend:
 309                     if not len(self.control_line):
 310                         raise exceptions.SyntaxException("No starting keyword '%s' for '%s'" % (keyword, text), **self.exception_kwargs)
 311                     elif self.control_line[-1].keyword != keyword:
 312                         raise exceptions.SyntaxException("Keyword '%s' doesn't match keyword '%s'" % (text, self.control_line[-1].keyword), **self.exception_kwargs)
 313                 self.append_node(parsetree.ControlLine, keyword, isend, self.escape_code(text))
 314             else:
 315                 self.append_node(parsetree.Comment, text)
 316             return True
 317         else:
 318             return False
 319
 320     def match_comment(self):
 321         """matches the multiline version of a comment"""
 322         match = self.match(r"<%doc>(.*?)</%doc>", re.S)
 323         if match:
 324             self.append_node(parsetree.Comment, match.group(1))
 325             return True
 326         else:
 327             return False
 328