1 # -*- coding: utf-8 -*-
2 ##############################################################################
4 # OpenERP, Open Source Business Applications
5 # Copyright (C) 2012 OpenERP S.A. (<http://openerp.com>).
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as
9 # published by the Free Software Foundation, either version 3 of the
10 # License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ##############################################################################
import re

import lxml.html

from openerp.loglevels import ustr
def html_sanitize(src):
    """Return ``src`` with the blacklisted HTML elements stripped.

    The input is converted by ``ustr`` (with ``errors='replace'``), wrapped
    in a ``<div>`` so that a fragment with several top-level nodes parses as
    a single tree, rebuilt through ``handle_element`` and serialized again
    without the wrapper.

    :param src: HTML markup to clean; falsy values (``None``, ``""``) are
        returned unchanged.
    :return: the sanitized markup joined into a single string.
    """
    if not src:
        # Nothing to sanitize; hand the empty value back untouched.
        return src
    src = ustr(src, errors='replace')
    root = lxml.html.fromstring(u"<div>%s</div>" % src)
    result = handle_element(root)
    res = []
    # result[0] is the rebuilt wrapper; only its content is returned.
    for element in children(result[0]):
        if isinstance(element, basestring):
            res.append(element)
        else:
            # children() already lists the tail as a separate item, so
            # clear it here or tostring() would emit it a second time.
            element.tail = None
            res.append(lxml.html.tostring(element))
    return ''.join(res)


# FIXME: shouldn't this be a whitelist rather than a blacklist?!
# Tags that handle_element() tests first; elements carrying one of them are
# meant to be removed from the output.
to_remove = set(["script", "head", "meta", "title", "link", "img"])
# Tags that handle_element() replaces by their (sanitized) content.
to_unwrap = set(["html", "body"])

# Matches a URL using the ``javascript:`` scheme. Only the scheme prefix is
# anchored, deliberately without a trailing ``.*$``: ``.`` does not cross
# line breaks, so such a tail would let a value containing a newline after
# the colon slip through unnoticed.
javascript_regex = re.compile(r"^\s*javascript\s*:", re.IGNORECASE)

def handle_a(el, new):
    """Copy the ``href`` of anchor ``el`` onto ``new``, neutralizing scripts.

    A missing ``href`` defaults to ``"#"``, and a value matching
    ``javascript_regex`` is replaced by ``"#"`` as well, so the rebuilt link
    never carries a ``javascript:`` URL.

    :param el: source ``<a>`` element being sanitized.
    :param new: rebuilt element that receives the resulting ``href``.
    """
    href = el.get("href", "#")
    if javascript_regex.search(href):
        href = "#"
    new.set("href", href)


# Per-tag hooks looked up by handle_element(); each one is called as
# ``hook(original_element, rebuilt_element)`` once the element is rebuilt.
special = {
    "a": handle_a,
}


def handle_element(element):
    """Return the sanitized replacement for one node of the parsed tree.

    Text is kept as is. Elements listed in ``to_remove`` are discarded,
    those in ``to_unwrap`` are replaced by their sanitized content, and any
    other element is rebuilt from its tag name alone, so attributes are lost
    unless a hook registered in ``special`` copies them over.

    :param element: either a text chunk (string) or an lxml element.
    :return: list of nodes (strings and/or new elements) to use in place of
        ``element``; empty when the node is discarded.
    """
    if isinstance(element, basestring):
        return [element]
    tag = element.tag
    if not isinstance(tag, basestring):
        # Comments and processing instructions expose a factory function
        # instead of a tag name and cannot be rebuilt below; drop them.
        return []
    if tag in to_remove:
        return []
    if tag in to_unwrap:
        # Flatten with extend() rather than reduce(): an element without
        # content then yields [] instead of raising on an empty sequence.
        flattened = []
        for child in children(element):
            flattened.extend(handle_element(child))
        return flattened
    result = lxml.html.fromstring("<%s />" % tag)
    for c in children(element):
        append_to(handle_element(c), result)
    if tag in special:
        special[tag](element, result)
    return [result]


def children(node):
    """List the direct content of ``node`` in document order.

    :param node: element whose content is enumerated.
    :return: a new list mixing strings and child elements: the leading text
        of ``node`` (if any), then each child followed by its tail text
        (if any). The list is built eagerly, so callers may modify the
        children while walking it.
    """
    res = []
    if node.text is not None:
        res.append(node.text)
    # Iterating an element yields its direct children.
    for child_node in node:
        res.append(child_node)
        if child_node.tail is not None:
            res.append(child_node.tail)
    return res


def append_to(elements, dest_node):
    """Append text chunks and elements to the end of ``dest_node``.

    Strings go to the text of ``dest_node`` while it has no child, and to
    the tail of its last child afterwards. They are concatenated to what is
    already stored there, so successive calls never overwrite earlier text
    (for instance the text on both sides of a removed element is kept).

    :param elements: iterable of strings and/or elements, in document order.
    :param dest_node: element receiving them as trailing content.
    """
    for element in elements:
        if isinstance(element, basestring):
            # Explicit length test: the truth value of an element is not a
            # reliable "has children" check.
            if len(dest_node) == 0:
                dest_node.text = (dest_node.text or "") + element
            else:
                last_child = dest_node[-1]
                last_child.tail = (last_child.tail or "") + element
        else:
            dest_node.append(element)