6763b464ab1b925ed2bed8deaf45811e7a6f85e9
[odoo/odoo.git] / openerp / tools / html_sanitize.py
1
2 import re
3
def html_sanitize(x):
    """Return ``x`` unchanged: sanitization is currently switched off.

    The unconditional ``return`` right below makes this function a no-op,
    so callers get their input back exactly as they passed it.  Everything
    after that statement is unreachable and is only kept as a record of the
    intended implementation.

    NOTE(review): the dead code relies on ``pq`` (presumably pyquery), which
    is not imported in the visible part of this file -- confirm before
    re-enabling it.
    """
    return x  # Short-circuit: "It seems that our test suite doesn't care."

    # ---- unreachable from here on -------------------------------------
    if not x:
        return x
    if type(x) == str:
        # Byte strings are decoded as UTF-8, replacing undecodable bytes.
        x = unicode(x, "utf8", "replace")
    container = pq("<div />")
    container.html(x)
    cleaned = handle_element(container[0])
    return pq(cleaned).html()
15
# Tag names that handle_element() discards together with their content.
to_remove = set(("script", "head", "meta", "title", "link", "img"))
# Tag names that handle_element() replaces by their (processed) content.
to_unwrap = set(("html", "body"))
18
# Recognises an href that uses the ``javascript:`` pseudo-scheme.
#  * raw string, so ``\s`` reaches the regex engine instead of being an
#    invalid string escape;
#  * case-insensitive, because URL schemes are ("JavaScript:" is as dangerous);
#  * no trailing ``.*$``: ``.`` does not match a newline, so the old pattern
#    let multi-line values such as "javascript:foo()\n//" slip through;
#  * leading control characters / spaces are skipped, as browsers do.
javascript_regex = re.compile(r"^[\s\x00-\x20]*javascript\s*:", re.IGNORECASE)

# Browsers remove tab, CR and LF from anywhere in a URL before parsing it,
# so "java\tscript:..." is executed as "javascript:...".
_url_ignored_chars = re.compile(r"[\t\n\r]+")


def handle_a(el, new):
    """Copy the ``href`` of ``el`` onto ``new``, neutralising script URLs.

    :param el: the original ``<a>`` element; only ``el.get("href", "#")``
        is read from it.
    :param new: the replacement element; its ``href`` attribute is set.

    A missing ``href`` becomes ``"#"``, and so does any value that resolves
    to the ``javascript:`` scheme.  Every other value is copied verbatim.
    """
    href = el.get("href", "#")
    # Test the value the way a browser would see it (tab/CR/LF removed), but
    # keep the original text when it turns out to be harmless.
    if javascript_regex.search(_url_ignored_chars.sub("", href)):
        href = "#"
    new.set("href", href)
# Per-tag hooks: tag name -> callable invoked as ``hook(el, new)`` with the
# original element and its freshly built replacement.
special = dict(a=handle_a)
28
def handle_element(el):
    """Turn one node of the parsed input into its sanitized replacement.

    :param el: either a text fragment (``str`` / ``unicode``) or an element
        exposing ``.tag``.
    :returns: a list of nodes (text fragments and/or elements) that should
        take the place of ``el`` in its parent:

        * text is returned as is;
        * a tag listed in ``to_remove`` yields nothing;
        * a tag listed in ``to_unwrap`` yields its processed content;
        * any other tag yields one new, attribute-less element of the same
          name holding the processed content, after the matching hook in
          ``special`` (if any) has been applied.
    """
    if isinstance(el, str) or isinstance(el, unicode):
        return [el]
    tag = el.tag
    if not (isinstance(tag, str) or isinstance(tag, unicode)):
        # NOTE(review): assumes lxml elements, where comments and processing
        # instructions expose a callable as ``.tag``.  Formatting that into
        # "<%s />" below would produce bogus markup, so such nodes are
        # dropped instead -- confirm against the parser actually used.
        return []
    if tag in to_remove:
        return []
    if tag in to_unwrap:
        # Concatenate the per-child results.  A plain loop (rather than
        # reduce() without an initial value) also works when the element has
        # no content at all, which used to raise TypeError.
        flattened = []
        for child in children(el):
            flattened.extend(handle_element(child))
        return flattened
    new = pq("<%s />" % tag)[0]
    for child in children(el):
        append_to(handle_element(child), new)
    if tag in special:
        special[tag](el, new)
    return [new]
42     
def children(el):
    """Return the content of ``el`` as a flat list, in document order.

    :param el: an ElementTree-style element (``.text``, iterable over its
        child elements, each child having ``.tail``).
    :returns: a list mixing text fragments and child elements: the leading
        text of ``el`` (when not ``None``), then every child followed by its
        tail text (when not ``None``).
    """
    res = []
    if el.text is not None:
        res.append(el.text)
    # Iterate the element directly: getchildren() is deprecated in lxml and
    # no longer exists in the standard library's ElementTree.
    for child in el:
        res.append(child)
        if child.tail is not None:
            res.append(child.tail)
    return res
52
def append_to(new_ones, el):
    """Append every node of ``new_ones`` to the end of ``el``'s content.

    :param new_ones: iterable of text fragments (``str`` / ``unicode``)
        and/or elements.
    :param el: the element being filled; it is modified in place.

    Elements are appended as children.  Text is added to ``el.text`` while
    ``el`` has no child yet, otherwise to the ``tail`` of its last child.
    Text is concatenated to what is already there: two fragments can follow
    each other (for instance around a removed or unwrapped element), and a
    plain assignment would silently drop the first one.
    """
    for item in new_ones:
        if isinstance(item, str) or isinstance(item, unicode):
            if len(el) == 0:
                el.text = (el.text or "") + item
            else:
                last = el[-1]
                last.tail = (last.tail or "") + item
        else:
            el.append(item)