def test_simple(self):
    """Plain text containing no markup must come back unchanged."""
    plain = "yop"
    sanitized = html_sanitize(plain)
    self.assertEqual(plain, sanitized)
+
def test_trailing_text(self):
    """Text before and after an element must come back unchanged."""
    fragment = 'lala<p>yop</p>xxx'
    sanitized = html_sanitize(fragment)
    self.assertEqual(fragment, sanitized)
- def test_test_case(self):
def test_no_exception(self):
    """Smoke test: sanitizing ``test_case`` only has to finish without raising."""
    # No assertion on purpose; the return value is discarded.
    html_sanitize(test_case)
- def test_crm(self):
def test_unicode(self):
    """Smoke test: text with accented (non-ASCII) characters must not raise."""
    accented = "Merci à l'intérêt pour notre produit.nous vous contacterons bientôt. Merci"
    # No assertion on purpose; the return value is discarded.
    html_sanitize(accented)
if __name__ == '__main__':
-from pyquery import PyQuery as pq
+import lxml.html
import re
def html_sanitize(x):
    """Return the HTML fragment *x* after passing it through ``handle_element``.

    Falsy input (``None``, ``""``) is returned as-is.  A byte ``str`` is
    decoded as UTF-8 with undecodable bytes replaced.  The fragment is parsed
    inside a wrapping ``<div>``, the parsed tree is handed to
    ``handle_element``, and the content of the first element it returns is
    serialized back into a single string.
    """
    if not x:
        return x
    # isinstance() rather than an exact type() comparison, so that subclasses
    # of the byte-string type are decoded as well.
    if isinstance(x, str):
        x = unicode(x, "utf8", "replace")
    root = lxml.html.fromstring("<div>%s</div>" % x)
    result = handle_element(root)
    # Collect the pieces and join once instead of growing a string with +=.
    parts = []
    for el in children(result[0]):
        if isinstance(el, (str, unicode)):
            # Text node: emitted verbatim.
            parts.append(el)
        else:
            # tostring() serializes an element's tail text by default; clear
            # it so only the element itself is emitted here.
            # NOTE(review): presumably children() yields the tail text as a
            # separate string item -- confirm against its definition.
            el.tail = ""
            parts.append(lxml.html.tostring(el))
    return "".join(parts)
# NOTE(review): judging by the name, elements with these tags are dropped
# during sanitizing; the check that uses this set is not visible in this
# excerpt -- confirm against the full file.
to_remove = set([
    "script",
    "head",
    "meta",
    "title",
    "link",
    "img",
])
# An element whose tag is listed here is replaced by the concatenated results
# of handling each of its children.
to_unwrap = set([
    "html",
    "body",
])
return []
if el.tag in to_unwrap:
return reduce(lambda x,y: x+y, [handle_element(x) for x in children(el)])
- new = pq("<%s />" % el.tag)[0]
+ new = lxml.html.fromstring("<%s />" % el.tag)
for i in children(el):
append_to(handle_element(i), new)
if el.tag in special:
else:
children[-1].tail = i
else:
- el.append(i)
\ No newline at end of file
+ el.append(i)