bin/mako/filters.py

   1 # filters.py
   2 # Copyright (C) 2006, 2007, 2008 Geoffrey T. Dairiki <dairiki@dairiki.org> and Michael Bayer <mike_mp@zzzcomputing.com>
   3 #
   4 # This module is part of Mako and is released under
   5 # the MIT License: http://www.opensource.org/licenses/mit-license.php
   6
   7
   8 import re, cgi, urllib, htmlentitydefs, codecs
   9 from StringIO import StringIO
  10
  11 xml_escapes = {
  12     '&' : '&amp;',
  13     '>' : '&gt;',
  14     '<' : '&lt;',
  15     '"' : '&#34;',   # also &quot; in html-only
  16     "'" : '&#39;'    # also &apos; in html-only
  17 }
  18 # XXX: &quot; is valid in HTML and XML
  19 #      &apos; is not valid HTML, but is valid XML
  20
  21 def html_escape(string):
  22     return cgi.escape(string, True)
  23
  24 def xml_escape(string):
  25     return re.sub(r'([&<"\'>])', lambda m: xml_escapes[m.group()], string)
  26
  27 def url_escape(string):
  28     # convert into a list of octets
  29     string = string.encode("utf8")
  30     return urllib.quote_plus(string)
  31
  32 def url_unescape(string):
  33     text = urllib.unquote_plus(string)
  34     if not is_ascii_str(text):
  35         text = text.decode("utf8")
  36     return text
  37
  38 def trim(string):
  39     return string.strip()
  40
  41
  42 class Decode(object):
  43     def __getattr__(self, key):
  44         def decode(x):
  45             if isinstance(x, unicode):
  46                 return x
  47             elif not isinstance(x, str):
  48                 return unicode(str(x), encoding=key)
  49             else:
  50                 return unicode(x, encoding=key)
  51         return decode
  52 decode = Decode()
  53
  54
  55 _ASCII_re = re.compile(r'\A[\x00-\x7f]*\Z')
  56
  57 def is_ascii_str(text):
  58     return isinstance(text, str) and _ASCII_re.match(text)
  59
  60 ################################################################
  61
  62 class XMLEntityEscaper(object):
  63     def __init__(self, codepoint2name, name2codepoint):
  64         self.codepoint2entity = dict([(c, u'&%s;' % n)
  65                                       for c,n in codepoint2name.iteritems()])
  66         self.name2codepoint = name2codepoint
  67
  68     def escape_entities(self, text):
  69         """Replace characters with their character entity references.
  70
  71         Only characters corresponding to a named entity are replaced.
  72         """
  73         return unicode(text).translate(self.codepoint2entity)
  74
  75     def __escape(self, m):
  76         codepoint = ord(m.group())
  77         try:
  78             return self.codepoint2entity[codepoint]
  79         except (KeyError, IndexError):
  80             return '&#x%X;' % codepoint
  81
  82
  83     __escapable = re.compile(r'["&<>]|[^\x00-\x7f]')
  84
  85     def escape(self, text):
  86         """Replace characters with their character references.
  87
  88         Replace characters by their named entity references.
  89         Non-ASCII characters, if they do not have a named entity reference,
  90         are replaced by numerical character references.
  91
  92         The return value is guaranteed to be ASCII.
  93         """
  94         return self.__escapable.sub(self.__escape, unicode(text)
  95                                     ).encode('ascii')
  96
  97     # XXX: This regexp will not match all valid XML entity names__.
  98     # (It punts on details involving involving CombiningChars and Extenders.)
  99     #
 100     # .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef
 101     __characterrefs = re.compile(r'''& (?:
 102                                           \#(\d+)
 103                                           | \#x([\da-f]+)
 104                                           | ( (?!\d) [:\w] [-.:\w]+ )
 105                                           ) ;''',
 106                                  re.X | re.UNICODE)
 107
 108     def __unescape(self, m):
 109         dval, hval, name = m.groups()
 110         if dval:
 111             codepoint = int(dval)
 112         elif hval:
 113             codepoint = int(hval, 16)
 114         else:
 115             codepoint = self.name2codepoint.get(name, 0xfffd)
 116             # U+FFFD = "REPLACEMENT CHARACTER"
 117         if codepoint < 128:
 118             return chr(codepoint)
 119         return unichr(codepoint)
 120
 121     def unescape(self, text):
 122         """Unescape character references.
 123
 124         All character references (both entity references and numerical
 125         character references) are unescaped.
 126         """
 127         return self.__characterrefs.sub(self.__unescape, text)
 128
 129
 130 _html_entities_escaper = XMLEntityEscaper(htmlentitydefs.codepoint2name,
 131                                           htmlentitydefs.name2codepoint)
 132
 133 html_entities_escape = _html_entities_escaper.escape_entities
 134 html_entities_unescape = _html_entities_escaper.unescape
 135
 136
 137 def htmlentityreplace_errors(ex):
 138     """An encoding error handler.
 139
 140     This python `codecs`_ error handler replaces unencodable
 141     characters with HTML entities, or, if no HTML entity exists for
 142     the character, XML character references.
 143
 144     >>> u'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace')
 145     'The cost was &euro;12.'
 146     """
 147     if isinstance(ex, UnicodeEncodeError):
 148         # Handle encoding errors
 149         bad_text = ex.object[ex.start:ex.end]
 150         text = _html_entities_escaper.escape(bad_text)
 151         return (unicode(text), ex.end)
 152     raise ex
 153
 154 codecs.register_error('htmlentityreplace', htmlentityreplace_errors)
 155
 156
 157 # TODO: options to make this dynamic per-compilation will be added in a later release
 158 DEFAULT_ESCAPES = {
 159     'x':'filters.xml_escape',
 160     'h':'filters.html_escape',
 161     'u':'filters.url_escape',
 162     'trim':'filters.trim',
 163     'entity':'filters.html_entities_escape',
 164     'unicode':'unicode',
 165     'decode':'decode',
 166     'str':'str',
 167     'n':'n'
 168 }
 169
 170