##############################################################################
#
# OpenERP, Open Source Business Applications
-# Copyright (C) 2012 OpenERP S.A. (<http://openerp.com>).
+# Copyright (C) 2012-2013 OpenERP S.A. (<http://openerp.com>).
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
import socket
import threading
import time
+from email.utils import getaddresses
from openerp.loglevels import ustr
# HTML Sanitizer
#----------------------------------------------------------
-# FIXME: shouldn't this be a whitelist rather than a blacklist?!
tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
tags_to_remove = ['html', 'body', 'font']
return src
src = ustr(src, errors='replace')
- # some cases make the parser crash (such as SCRIPT/XSS in test_mail)
+ # html encode email tags
+ part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
+ src = part.sub(lambda m: cgi.escape(m.group(1)), src)
+
+ # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
try:
cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, kill_tags=tags_to_kill, remove_tags=tags_to_remove)
cleaned = cleaner.clean_html(src)
- except:
- cleaned = 'Impossible to parse'
+ except TypeError:
+ # lxml.clean version < 2.3.1 does not have a kill_tags attribute
+ # to remove in 2014
+ cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, remove_tags=tags_to_kill + tags_to_remove)
+ cleaned = cleaner.clean_html(src)
+ except Exception, e:
+ if isinstance(e, etree.ParserError) and 'empty' in str(e):
+ return ""
+ _logger.warning('html_sanitize failed to parse %s' % (src))
+ cleaned = '<p>Impossible to parse</p>'
+
+ # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
+ cleaned = cleaned.replace('%24', '$')
+ cleaned = cleaned.replace('%7B', '{')
+ cleaned = cleaned.replace('%7D', '}')
+ cleaned = cleaned.replace('%20', ' ')
+ cleaned = cleaned.replace('%5B', '[')
+ cleaned = cleaned.replace('%5D', ']')
+
return cleaned
url_index.append(url)
html = ustr(etree.tostring(tree, encoding=encoding))
+ # \r char is converted into &#13;, must remove it
+ html = html.replace('&#13;', '')
html = html.replace('<strong>', '*').replace('</strong>', '*')
html = html.replace('<b>', '*').replace('</b>', '*')
""" Return a list of the email addresses found in ``text`` """
if not text:
return []
- return re.findall(r'([^ ,<@]+@[^> ,]+)', text)
+ return [addr[1] for addr in getaddresses([text])
+ # getaddresses() returns '' when email parsing fails, and
+ # sometimes returns emails without at least '@'. The '@'
+ # is strictly required in RFC2822's `addr-spec`.
+ if addr[1]
+ if '@' in addr[1]]
\ No newline at end of file