##############################################################################
#
# OpenERP, Open Source Business Applications
-# Copyright (C) 2012 OpenERP S.A. (<http://openerp.com>).
+# Copyright (C) 2012-2013 OpenERP S.A. (<http://openerp.com>).
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
import cgi
import logging
import lxml.html
+import lxml.html.clean as clean
import openerp.pooler as pooler
-import operator
import random
import re
import socket
import threading
import time
+from email.utils import getaddresses
+from lxml import etree
from openerp.loglevels import ustr
# HTML Sanitizer
#----------------------------------------------------------
+tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
+tags_to_remove = ['html', 'body', 'font']
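
Note on the split between these two lists: lxml's Cleaner handles kill_tags and remove_tags differently, dropping killed tags together with their content but keeping the children of removed tags. A minimal sketch of the distinction (invented sample markup; requires an lxml recent enough to accept kill_tags, as the except TypeError fallback below acknowledges):

    >>> import lxml.html.clean as clean
    >>> cleaner = clean.Cleaner(page_structure=False, style=False,
    ...                         kill_tags=['script'], remove_tags=['font'])
    >>> cleaner.clean_html('<div><script>evil()</script><font>hi</font></div>')
    '<div>hi</div>'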
+
+
def html_sanitize(src):
if not src:
return src
src = ustr(src, errors='replace')
- root = lxml.html.fromstring(u"<div>%s</div>" % src)
- result = handle_element(root)
- res = []
- for element in children(result[0]):
- if isinstance(element, basestring):
- res.append(element)
- else:
- element.tail = ""
- res.append(lxml.html.tostring(element))
- return ''.join(res)
-
-# FIXME: shouldn't this be a whitelist rather than a blacklist?!
-to_remove = set(["script", "head", "meta", "title", "link", "img"])
-to_unwrap = set(["html", "body"])
-
-javascript_regex = re.compile(r"^\s*javascript\s*:.*$", re.IGNORECASE)
-
-def handle_a(el, new):
- href = el.get("href", "#")
- if javascript_regex.search(href):
- href = "#"
- new.set("href", href)
-
-special = {
- "a": handle_a,
-}
-
-def handle_element(element):
- if isinstance(element, basestring):
- return [element]
- if element.tag in to_remove:
- return []
- if element.tag in to_unwrap:
- return reduce(operator.add, [handle_element(x) for x in children(element)])
- result = lxml.html.fromstring("<%s />" % element.tag)
- for c in children(element):
- append_to(handle_element(c), result)
- if element.tag in special:
- special[element.tag](element, result)
- return [result]
-
-def children(node):
- res = []
- if node.text is not None:
- res.append(node.text)
- for child_node in node.getchildren():
- res.append(child_node)
- if child_node.tail is not None:
- res.append(child_node.tail)
- return res
-def append_to(elements, dest_node):
- for element in elements:
- if isinstance(element, basestring):
- children = dest_node.getchildren()
- if len(children) == 0:
- dest_node.text = element
- else:
- children[-1].tail = element
- else:
- dest_node.append(element)
+    # html-encode email addresses wrapped in angle brackets (<foo@bar>), which
+    # the parser would otherwise mistake for tags and strip
+ part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
+ src = part.sub(lambda m: cgi.escape(m.group(1)), src)
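
As an illustration of this escaping step (the address below is made up), the regex grabs anything shaped like <something@somewhere> so a bare address survives the cleaner instead of being parsed as a tag:

    >>> part.sub(lambda m: cgi.escape(m.group(1)),
    ...          'Contact <john.doe@example.com> for details')
    'Contact &lt;john.doe@example.com&gt; for details'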
+
+    # some corner cases make the parser crash (such as <SCRIPT/XSS SRC="http://ha.ckers.org/xss.js"></SCRIPT> in test_mail)
+ try:
+ cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, kill_tags=tags_to_kill, remove_tags=tags_to_remove)
+ cleaned = cleaner.clean_html(src)
+    except TypeError:
+        # lxml.html.clean < 2.3.1 does not support the kill_tags argument;
+        # fall back to remove_tags for everything (fallback to drop in 2014)
+        cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, remove_tags=tags_to_kill + tags_to_remove)
+        cleaned = cleaner.clean_html(src)
+    except etree.ParserError, e:
+        # lxml raises a ParserError on completely empty documents; treat
+        # those as an empty sanitized result rather than a failure
+        if 'empty' in str(e):
+            return ""
+        _logger.warning('html_sanitize failed to parse %s', src)
+        cleaned = '<p>Impossible to parse</p>'
+    except Exception:
+        _logger.warning('html_sanitize failed to parse %s', src)
+        cleaned = '<p>Impossible to parse</p>'
+
+    # Mako compatibility: the sanitized output can come back with $, {, },
+    # space, [ and ] percent-encoded inside attribute values, which would
+    # break Mako expressions such as ${object.name}; decode them back
+ cleaned = cleaned.replace('%24', '$')
+ cleaned = cleaned.replace('%7B', '{')
+ cleaned = cleaned.replace('%7D', '}')
+ cleaned = cleaned.replace('%20', ' ')
+ cleaned = cleaned.replace('%5B', '[')
+ cleaned = cleaned.replace('%5D', ']')
+
+ return cleaned
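
A rough usage sketch of the new sanitizer, assuming the module path openerp.tools.mail as in OpenERP 7.0 (exact whitespace in the output may vary with the installed lxml):

    >>> from openerp.tools.mail import html_sanitize
    >>> html_sanitize('<div>hello <script>alert("xss")</script>world</div>')
    '<div>hello world</div>'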
#----------------------------------------------------------
dest += source[idx:]
return dest
- if not html:
+ if not html or not isinstance(html, basestring):
return html
html = ustr(html)
+    # 0. strip tags carrying an encoding attribute (typically an <?xml ?>
+    #    prolog): an encoding declaration on unicode input confuses etree
+    doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
+    html = doctype.sub(r"", html)
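
For instance, this step removes an XML prolog whose encoding declaration would otherwise make lxml refuse the already-decoded unicode input (sample document invented):

    >>> doctype.sub(r"", '<?xml version="1.0" encoding="utf-8"?><html><body>x</body></html>')
    '<html><body>x</body></html>'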
+
# 1. <br[ /]> -> \n, because otherwise the tree is obfuscated
br_tags = re.compile(r'([<]\s*[bB][rR]\s*\/?[>])')
html = _replace_matching_regex(br_tags, html, '__BR_TAG__')
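
The marker dance exists because the tree transformation below would otherwise swallow or reflow the <br> tags. Using plain re.sub for illustration instead of the module's _replace_matching_regex helper, this first step behaves like:

    >>> br_tags.sub('__BR_TAG__', 'line one<br/>line two<BR >line three')
    'line one__BR_TAG__line two__BR_TAG__line three'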
url_index.append(url)
html = ustr(etree.tostring(tree, encoding=encoding))
+    # \r chars are serialized as &#13; entities, which must be removed
+    html = html.replace('&#13;', '')
html = html.replace('<strong>', '*').replace('</strong>', '*')
html = html.replace('<b>', '*').replace('</b>', '*')
""" Return a list of the email addresses found in ``text`` """
if not text:
return []
- return re.findall(r'([^ ,<@]+@[^> ,]+)', text)
+ return [addr[1] for addr in getaddresses([text])
+ # getaddresses() returns '' when email parsing fails, and
+ # sometimes returns emails without at least '@'. The '@'
+ # is strictly required in RFC2822's `addr-spec`.
+ if addr[1]
+ if '@' in addr[1]]
\ No newline at end of file
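
A quick check of the new getaddresses-based extraction (addresses invented), showing both filters at work: the empty pairs produced by unparsable chunks and the at-less tokens are dropped:

    >>> from email.utils import getaddresses
    >>> text = '"Doe, John" <john@example.com>, not-an-address, jane@example.com'
    >>> [addr[1] for addr in getaddresses([text]) if addr[1] if '@' in addr[1]]
    ['john@example.com', 'jane@example.com']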