2 # Copyright (C) 2006, 2007, 2008 Geoffrey T. Dairiki <dairiki@dairiki.org> and Michael Bayer <mike_mp@zzzcomputing.com>
4 # This module is part of Mako and is released under
5 # the MIT License: http://www.opensource.org/licenses/mit-license.php
8 import re, cgi, urllib, htmlentitydefs, codecs
9 from StringIO import StringIO
# Replacement text for each character rewritten by xml_escape().
# Numeric references are used for the two quote characters so the same
# output is usable in both HTML and XML.
xml_escapes = {
    '&': '&amp;',
    '>': '&gt;',
    '<': '&lt;',
    '"': '&#34;',   # also &quot; in html-only
    "'": '&#39;',   # also &apos; in html-only
}
# XXX: &quot; is valid in HTML and XML
#      &apos; is not valid HTML, but is valid XML
21 def html_escape(string):
22 return cgi.escape(string, True)
def xml_escape(string):
    """Replace ``&``, ``<``, ``>``, ``"`` and ``'`` in *string*.

    Each matched character is swapped for its entry in the module-level
    ``xml_escapes`` table.
    """
    def _entity_for(match):
        # Look up the replacement for the single matched character.
        return xml_escapes[match.group()]

    return re.sub(r'([&<"\'>])', _entity_for, string)
27 def url_escape(string):
28 # convert into a list of octets
29 string = string.encode("utf8")
30 return urllib.quote_plus(string)
def url_unescape(string):
    """Reverse URL quoting and return the decoded text.

    ``+`` and ``%XX`` sequences are expanded with ``unquote_plus``.  If
    the result is not a pure-ASCII ``str`` it is decoded as UTF-8;
    otherwise it is returned unchanged.
    """
    text = urllib.unquote_plus(string)
    if not is_ascii_str(text):
        # Non-ASCII octets are treated as UTF-8, matching url_escape().
        text = text.decode("utf8")
    return text
43 def __getattr__(self, key):
45 if isinstance(x, unicode):
47 elif not isinstance(x, str):
48 return unicode(str(x), encoding=key)
50 return unicode(x, encoding=key)
# Matches a string consisting solely of characters in \x00-\x7f
# (the empty string included).
_ASCII_re = re.compile(r'\A[\x00-\x7f]*\Z')


def is_ascii_str(text):
    """Tell whether *text* is a ``str`` containing only ASCII characters.

    Returns ``False`` for non-``str`` input; otherwise returns the regex
    match result, which is truthy on success and ``None`` on failure.
    """
    if not isinstance(text, str):
        return False
    return _ASCII_re.match(text)
60 ################################################################
62 class XMLEntityEscaper(object):
63 def __init__(self, codepoint2name, name2codepoint):
64 self.codepoint2entity = dict([(c, u'&%s;' % n)
65 for c,n in codepoint2name.iteritems()])
66 self.name2codepoint = name2codepoint
68 def escape_entities(self, text):
69 """Replace characters with their character entity references.
71 Only characters corresponding to a named entity are replaced.
73 return unicode(text).translate(self.codepoint2entity)
75 def __escape(self, m):
76 codepoint = ord(m.group())
78 return self.codepoint2entity[codepoint]
79 except (KeyError, IndexError):
80 return '&#x%X;' % codepoint
83 __escapable = re.compile(r'["&<>]|[^\x00-\x7f]')
85 def escape(self, text):
86 """Replace characters with their character references.
88 Replace characters by their named entity references.
89 Non-ASCII characters, if they do not have a named entity reference,
90 are replaced by numerical character references.
92 The return value is guaranteed to be ASCII.
94 return self.__escapable.sub(self.__escape, unicode(text)
97 # XXX: This regexp will not match all valid XML entity names__.
98 # (It punts on details involving involving CombiningChars and Extenders.)
100 # .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef
101 __characterrefs = re.compile(r'''& (?:
104 | ( (?!\d) [:\w] [-.:\w]+ )
108 def __unescape(self, m):
109 dval, hval, name = m.groups()
111 codepoint = int(dval)
113 codepoint = int(hval, 16)
115 codepoint = self.name2codepoint.get(name, 0xfffd)
116 # U+FFFD = "REPLACEMENT CHARACTER"
118 return chr(codepoint)
119 return unichr(codepoint)
121 def unescape(self, text):
122 """Unescape character references.
124 All character references (both entity references and numerical
125 character references) are unescaped.
127 return self.__characterrefs.sub(self.__unescape, text)
# Module-wide escaper built from the standard HTML entity tables.
_html_entities_escaper = XMLEntityEscaper(
    htmlentitydefs.codepoint2name,
    htmlentitydefs.name2codepoint,
)

# Public function-style aliases bound to the shared escaper instance.
html_entities_escape = _html_entities_escaper.escape_entities
html_entities_unescape = _html_entities_escaper.unescape
def htmlentityreplace_errors(ex):
    """An encoding error handler.

    This python `codecs`_ error handler replaces unencodable
    characters with HTML entities, or, if no HTML entity exists for
    the character, XML character references.

    >>> u'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace')
    'The cost was &euro;12.'

    Returns a ``(replacement, resume_position)`` tuple for
    ``UnicodeEncodeError``; any other exception is re-raised unchanged.
    """
    if isinstance(ex, UnicodeEncodeError):
        # Handle encoding errors
        bad_text = ex.object[ex.start:ex.end]
        text = _html_entities_escaper.escape(bad_text)
        return (unicode(text), ex.end)
    # Only encoding can be repaired here.  Propagate everything else rather
    # than falling through and handing None back to the codec machinery.
    raise ex


codecs.register_error('htmlentityreplace', htmlentityreplace_errors)
157 # TODO: options to make this dynamic per-compilation will be added in a later release
159 'x':'filters.xml_escape',
160 'h':'filters.html_escape',
161 'u':'filters.url_escape',
162 'trim':'filters.trim',
163 'entity':'filters.html_entities_escape',