9 x = unicode(x, "utf8", "replace")
10 root = lxml.html.fromstring("<div>%s</div>" % x)
11 result = handle_element(root)
13 for el in children(result[0]):
14 if type(el) == str or type(el) == unicode:
18 res += lxml.html.tostring(el)
# Tags that are dropped from the output entirely.
to_remove = set(["script", "head", "meta", "title", "link", "img"])
# Tags whose wrapper is discarded while their children are kept.
to_unwrap = set(["html", "body"])

# Matches an href whose scheme is ``javascript:``.
# - Raw string: ``\s`` must reach the regex engine rather than being treated
#   as an (invalid) string escape.
# - IGNORECASE: URL schemes are case-insensitive, so ``JavaScript:`` must be
#   caught as well.
# - DOTALL: lets ``.*$`` span embedded newlines, so a payload such as
#   ``javascript:a\nb`` cannot slip past the trailing anchor.
javascript_regex = re.compile(
    r"^\s*javascript\s*:.*$",
    re.IGNORECASE | re.DOTALL,
)
25 def handle_a(el, new):
26 href = el.get("href", "#")
27 if javascript_regex.search(href):
34 def handle_element(el):
35 if type(el) == str or type(el) == unicode:
37 if el.tag in to_remove:
39 if el.tag in to_unwrap:
40 return reduce(lambda x,y: x+y, [handle_element(x) for x in children(el)])
41 new = lxml.html.fromstring("<%s />" % el.tag)
42 for i in children(el):
43 append_to(handle_element(i), new)
45 special[el.tag](el, new)
50 if el.text is not None:
52 for i in el.getchildren():
54 if i.tail is not None:
58 def append_to(new_ones, el):
60 if type(i) == str or type(i) == unicode:
61 children = el.getchildren()
62 if len(children) == 0: