[IMP] removed dependency on pyquery
[odoo/odoo.git] / openerp / tools / html_sanitize.py
1
2 import lxml.html
3 import re
4
def html_sanitize(x):
    """Return a sanitized copy of the HTML fragment ``x``.

    The fragment is wrapped in a ``<div>``, parsed with lxml and rebuilt
    through ``handle_element``; the content of the rebuilt wrapper is then
    serialized back to markup.

    :param x: HTML fragment, either a byte string (decoded as UTF-8, with
              undecodable bytes replaced) or a unicode string.
    :return: the sanitized markup. Falsy input (``None``, ``""``) is
             returned unchanged.
    """
    # Imported locally so the module-level import block stays untouched.
    from xml.sax.saxutils import escape

    if not x:
        return x
    if isinstance(x, bytes):
        x = x.decode("utf8", "replace")
    root = lxml.html.fromstring("<div>%s</div>" % x)
    wrapper = handle_element(root)[0]
    parts = []
    for node in children(wrapper):
        if isinstance(node, (type(""), type(u""))):
            # The parser has already decoded entities, so a text node may
            # hold literal "<", ">" or "&" (e.g. from "&lt;script&gt;").
            # It must be re-escaped, otherwise it would come out as markup.
            parts.append(escape(node))
        else:
            # The tail was collected by children() as a separate text node;
            # blank it so that it is not serialized twice.
            node.tail = ""
            parts.append(lxml.html.tostring(node))
    return "".join(parts)
20
# Tags that handle_element drops together with everything inside them.
to_remove = set((
    "script",
    "head",
    "meta",
    "title",
    "link",
    "img",
))
# Tags that handle_element discards while keeping their sanitized content.
to_unwrap = set((
    "html",
    "body",
))
23
# Matches an href that uses the "javascript" scheme, in any letter case.
# Only the start of the value is anchored: whatever follows the colon
# (including newlines) is irrelevant to the decision.
javascript_regex = re.compile(r"^\s*javascript\s*:", re.IGNORECASE)

# Control characters and spaces.  Browsers drop tabs and line breaks found
# anywhere in a URL, so "java\tscript:" would still run as script; these
# characters are removed before the scheme is checked.
_url_ignored_chars = re.compile(r"[\x00-\x20]+")


def handle_a(el, new):
    """Copy a safe ``href`` from the original anchor ``el`` onto ``new``.

    :param el: the original element; only its ``href`` attribute is read.
    :param new: the rebuilt element that receives the ``href`` attribute.

    A missing ``href`` and any ``javascript:`` URL are both replaced by
    ``"#"``.  Other values are copied verbatim.
    """
    href = el.get("href", "#")
    # The stripped copy is used for the check only; a harmless href is
    # written back exactly as it was given.
    if javascript_regex.search(_url_ignored_chars.sub("", href)):
        href = "#"
    new.set("href", href)


# Per-tag hooks called by handle_element as ``hook(original, rebuilt)``.
special = {
    "a": handle_a,
}
33
def handle_element(el):
    """Rebuild one parsed node as a list of sanitized nodes.

    :param el: a text string or an lxml element, as produced by
               ``children``.
    :return: a list holding
             - the string itself when ``el`` is text;
             - nothing when ``el`` is a comment / processing instruction
               or its tag is listed in ``to_remove``;
             - the sanitized content of ``el`` when its tag is listed in
               ``to_unwrap``;
             - otherwise a single new element with the same tag.
    """
    string_types = (type(""), type(u""))
    if isinstance(el, string_types):
        return [el]
    # lxml comments and processing instructions expose a factory function
    # as ``tag`` rather than a name, so they cannot be rebuilt from the tag
    # below; drop them instead of emitting garbage markup.
    if not isinstance(el.tag, string_types):
        return []
    if el.tag in to_remove:
        return []
    if el.tag in to_unwrap:
        # Plain accumulation also covers an element without any content,
        # where reduce() without an initial value would raise TypeError.
        flattened = []
        for child in children(el):
            flattened.extend(handle_element(child))
        return flattened
    # Only the tag name is carried over, so no attribute of the original
    # element survives unless a hook in ``special`` copies it back.
    new = lxml.html.fromstring("<%s />" % el.tag)
    for child in children(el):
        append_to(handle_element(child), new)
    if el.tag in special:
        special[el.tag](el, new)
    return [new]
47     
def children(el):
    """Return the content of ``el`` as a flat list, in document order.

    :param el: an element exposing ``text``, iteration over its child
               elements and, on each child, ``tail``.
    :return: a list interleaving text strings and child elements: the
             leading text of ``el`` (if any), then each child followed by
             its tail text (if any).  Children keep their ``tail``
             attribute; the tail is additionally listed on its own.
    """
    res = []
    if el.text is not None:
        res.append(el.text)
    # Iterate the element directly: getchildren() is deprecated in lxml
    # and no longer exists in the standard library's ElementTree.
    for child in el:
        res.append(child)
        if child.tail is not None:
            res.append(child.tail)
    return res
57
def append_to(new_ones, el):
    """Append sanitized nodes to the element ``el``, in order.

    :param new_ones: list of text strings and/or elements.
    :param el: the element being filled; it is modified in place.

    Elements are appended as children.  Text is added after whatever ``el``
    already ends with: to ``el.text`` while ``el`` has no child yet, to the
    ``tail`` of the last child otherwise.
    """
    string_types = (type(""), type(u""))
    for item in new_ones:
        if not isinstance(item, string_types):
            el.append(item)
        elif len(el) == 0:
            # Concatenate instead of assigning: two text runs become
            # adjacent whenever an element between them was removed, and
            # the first one must not be lost.
            el.text = (el.text or "") + item
        else:
            last = el[-1]
            last.tail = (last.tail or "") + item