openerp/tools/mail.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Business Applications
   5 #    Copyright (C) 2012 OpenERP S.A. (<http://openerp.com>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21
  22 from lxml import etree
  23 import cgi
  24 import logging
  25 import lxml.html
  26 import lxml.html.clean as clean
  27 import openerp.pooler as pooler
  28 import random
  29 import re
  30 import socket
  31 import threading
  32 import time
  33
  34 from openerp.loglevels import ustr
  35
  36 _logger = logging.getLogger(__name__)
  37
  38
  39 #----------------------------------------------------------
  40 # HTML Sanitizer
  41 #----------------------------------------------------------
  42
  43 tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
  44 tags_to_remove = ['html', 'body', 'font']
  45
  46
  47 def html_sanitize(src):
  48     if not src:
  49         return src
  50     src = ustr(src, errors='replace')
  51
  52     # html encode email tags
  53     part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
  54     src = part.sub(lambda m: cgi.escape(m.group(1)), src)
  55
  56     # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
  57     try:
  58         cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, kill_tags=tags_to_kill, remove_tags=tags_to_remove)
  59         cleaned = cleaner.clean_html(src)
  60     except TypeError, e:
  61         # lxml.clean version < 2.3.1 does not have a kill_tags attribute
  62         # to remove in 2014
  63         cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, remove_tags=tags_to_kill+tags_to_remove)
  64         cleaned = cleaner.clean_html(src)
  65     except:
  66         _logger.warning('html_sanitize failed to parse %s' % (src))
  67         cleaned = '<p>Impossible to parse</p>'
  68     return cleaned
  69
  70
  71 #----------------------------------------------------------
  72 # HTML Cleaner
  73 #----------------------------------------------------------
  74
  75 def html_email_clean(html):
  76     """ html_email_clean: clean the html to display in the web client.
  77         - strip email quotes (remove blockquote nodes)
  78         - strip signatures (remove --\n{\n)Blahblah), by replacing <br> by
  79             \n to avoid ignoring signatures converted into html
  80
  81         :param string html: sanitized html; tags like html or head should not
  82             be present in the html string. This method therefore takes as input
  83             html code coming from a sanitized source, like fields.html.
  84     """
  85     def _replace_matching_regex(regex, source, replace=''):
  86         dest = ''
  87         idx = 0
  88         for item in re.finditer(regex, source):
  89             dest += source[idx:item.start()] + replace
  90             idx = item.end()
  91         dest += source[idx:]
  92         return dest
  93
  94     if not html or not isinstance(html, basestring):
  95         return html
  96
  97     html = ustr(html)
  98
  99     # 0. remove encoding attribute inside tags
 100     doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
 101     html = doctype.sub(r"", html)
 102
 103     # 1. <br[ /]> -> \n, because otherwise the tree is obfuscated
 104     br_tags = re.compile(r'([<]\s*[bB][rR]\s*\/?[>])')
 105     html = _replace_matching_regex(br_tags, html, '__BR_TAG__')
 106
 107     # 2. form a tree, handle (currently ?) pure-text by enclosing them in a pre
 108     root = lxml.html.fromstring(html)
 109     if not len(root) and root.text is None and root.tail is None:
 110         html = '<div>%s</div>' % html
 111         root = lxml.html.fromstring(html)
 112
 113     # 2.5 remove quoted text in nodes
 114     quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
 115     for node in root.getiterator():
 116         if not node.text:
 117             continue
 118         node.text = _replace_matching_regex(quote_tags, node.text)
 119
 120     # 3. remove blockquotes
 121     quotes = [el for el in root.getiterator(tag='blockquote')]
 122     for node in quotes:
 123         # copy the node tail into parent text
 124         if node.tail:
 125             parent = node.getparent()
 126             parent.text = parent.text or '' + node.tail
 127         # remove the node
 128         node.getparent().remove(node)
 129
 130     # 4. strip signatures
 131     signature = re.compile(r'([-]{2}[\s]?[\r\n]{1,2}[^\z]+)')
 132     for elem in root.getiterator():
 133         if elem.text:
 134             match = re.search(signature, elem.text)
 135             if match:
 136                 elem.text = elem.text[:match.start()] + elem.text[match.end():]
 137         if elem.tail:
 138             match = re.search(signature, elem.tail)
 139             if match:
 140                 elem.tail = elem.tail[:match.start()] + elem.tail[match.end():]
 141
 142     # 5. \n back to <br/>
 143     html = etree.tostring(root, pretty_print=True)
 144     html = html.replace('__BR_TAG__', '<br />')
 145
 146     # 6. Misc cleaning :
 147     # - ClEditor seems to love using <div><br /><div> -> replace with <br />
 148     br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)')
 149     html = _replace_matching_regex(br_div_tags, html, '<br />')
 150
 151     return html
 152
 153
 154 #----------------------------------------------------------
 155 # HTML/Text management
 156 #----------------------------------------------------------
 157
 158 def html2plaintext(html, body_id=None, encoding='utf-8'):
 159     """ From an HTML text, convert the HTML to plain text.
 160     If @param body_id is provided then this is the tag where the
 161     body (not necessarily <body>) starts.
 162     """
 163     ## (c) Fry-IT, www.fry-it.com, 2007
 164     ## <peter@fry-it.com>
 165     ## download here: http://www.peterbe.com/plog/html2plaintext
 166
 167     html = ustr(html)
 168     tree = etree.fromstring(html, parser=etree.HTMLParser())
 169
 170     if body_id is not None:
 171         source = tree.xpath('//*[@id=%s]' % (body_id,))
 172     else:
 173         source = tree.xpath('//body')
 174     if len(source):
 175         tree = source[0]
 176
 177     url_index = []
 178     i = 0
 179     for link in tree.findall('.//a'):
 180         url = link.get('href')
 181         if url:
 182             i += 1
 183             link.tag = 'span'
 184             link.text = '%s [%s]' % (link.text, i)
 185             url_index.append(url)
 186
 187     html = ustr(etree.tostring(tree, encoding=encoding))
 188     # \r char is converted into &#13;, must remove it
 189     html = html.replace('&#13;', '')
 190
 191     html = html.replace('<strong>', '*').replace('</strong>', '*')
 192     html = html.replace('<b>', '*').replace('</b>', '*')
 193     html = html.replace('<h3>', '*').replace('</h3>', '*')
 194     html = html.replace('<h2>', '**').replace('</h2>', '**')
 195     html = html.replace('<h1>', '**').replace('</h1>', '**')
 196     html = html.replace('<em>', '/').replace('</em>', '/')
 197     html = html.replace('<tr>', '\n')
 198     html = html.replace('</p>', '\n')
 199     html = re.sub('<br\s*/?>', '\n', html)
 200     html = re.sub('<.*?>', ' ', html)
 201     html = html.replace(' ' * 2, ' ')
 202
 203     # strip all lines
 204     html = '\n'.join([x.strip() for x in html.splitlines()])
 205     html = html.replace('\n' * 2, '\n')
 206
 207     for i, url in enumerate(url_index):
 208         if i == 0:
 209             html += '\n\n'
 210         html += ustr('[%s] %s\n') % (i + 1, url)
 211
 212     return html
 213
 214 def plaintext2html(text, container_tag=False):
 215     """ Convert plaintext into html. Content of the text is escaped to manage
 216         html entities, using cgi.escape().
 217         - all \n,\r are replaced by <br />
 218         - enclose content into <p>
 219         - 2 or more consecutive <br /> are considered as paragraph breaks
 220
 221         :param string container_tag: container of the html; by default the
 222             content is embedded into a <div>
 223     """
 224     text = cgi.escape(ustr(text))
 225
 226     # 1. replace \n and \r
 227     text = text.replace('\n', '<br/>')
 228     text = text.replace('\r', '<br/>')
 229
 230     # 2-3: form paragraphs
 231     idx = 0
 232     final = '<p>'
 233     br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
 234     for item in re.finditer(br_tags, text):
 235         final += text[idx:item.start()] + '</p><p>'
 236         idx = item.end()
 237     final += text[idx:] + '</p>'
 238
 239     # 4. container
 240     if container_tag:
 241         final = '<%s>%s</%s>' % (container_tag, final, container_tag)
 242     return ustr(final)
 243
 244 def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
 245     """ Append extra content at the end of an HTML snippet, trying
 246         to locate the end of the HTML document (</body>, </html>, or
 247         EOF), and converting the provided content in html unless ``plaintext``
 248         is False.
 249         Content conversion can be done in two ways:
 250         - wrapping it into a pre (preserve=True)
 251         - use plaintext2html (preserve=False, using container_tag to wrap the
 252             whole content)
 253         A side-effect of this method is to coerce all HTML tags to
 254         lowercase in ``html``, and strip enclosing <html> or <body> tags in
 255         content if ``plaintext`` is False.
 256
 257         :param str html: html tagsoup (doesn't have to be XHTML)
 258         :param str content: extra content to append
 259         :param bool plaintext: whether content is plaintext and should
 260             be wrapped in a <pre/> tag.
 261         :param bool preserve: if content is plaintext, wrap it into a <pre>
 262             instead of converting it into html
 263     """
 264     html = ustr(html)
 265     if plaintext and preserve:
 266         content = u'\n<pre>%s</pre>\n' % ustr(content)
 267     elif plaintext:
 268         content = '\n%s\n' % plaintext2html(content, container_tag)
 269     else:
 270         content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
 271         content = u'\n%s\n' % ustr(content)
 272     # Force all tags to lowercase
 273     html = re.sub(r'(</?)\W*(\w+)([ >])',
 274         lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
 275     insert_location = html.find('</body>')
 276     if insert_location == -1:
 277         insert_location = html.find('</html>')
 278     if insert_location == -1:
 279         return '%s%s' % (html, content)
 280     return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
 281
 282 #----------------------------------------------------------
 283 # Emails
 284 #----------------------------------------------------------
 285
# Email address: a username starting with a letter, an @ sign and a domain
# ending with a TLD of 2 or 3 lowercase letters. The address is group(1).
email_re = re.compile(r"""
    ([a-zA-Z][\w\.-]*[a-zA-Z0-9]     # username part
    @                                # mandatory @ sign
    [a-zA-Z0-9][\w\.-]*              # domain must start with a letter ... Ged> why do we include a 0-9 then?
     \.
     [a-z]{2,3}                      # TLD
    )
    """, re.VERBOSE)
# Number between square brackets, like [42]; group(1) = the digits
res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
# Whole string of the form "Set-<name>: <value>" (case insensitive);
# group(1) = the letters following "Set-" ; group(2) = the value
command_re = re.compile("^Set-([a-z]+) *: *(.+)$", re.I + re.UNICODE)

# Updated in 7.0 to match the model name as well
# Typical form of references is <timestamp-openerp-record_id-model_name@domain>
# group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
reference_re = re.compile("<.*-open(?:object|erp)-(\\d+)(?:-([\w.]+))?.*@(.*)>", re.UNICODE)
 301
 302 def generate_tracking_message_id(res_id):
 303     """Returns a string that can be used in the Message-ID RFC822 header field
 304
 305        Used to track the replies related to a given object thanks to the "In-Reply-To"
 306        or "References" fields that Mail User Agents will set.
 307     """
 308     try:
 309         rnd = random.SystemRandom().random()
 310     except NotImplementedError:
 311         rnd = random.random()
 312     rndstr = ("%.15f" % rnd)[2:]
 313     return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
 314
 315 def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
 316                attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
 317                smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
 318     """Low-level function for sending an email (deprecated).
 319
 320     :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
 321     :param email_from: A string used to fill the `From` header, if falsy,
 322                        config['email_from'] is used instead.  Also used for
 323                        the `Reply-To` header if `reply_to` is not provided
 324     :param email_to: a sequence of addresses to send the mail to.
 325     """
 326
 327     # If not cr, get cr from current thread database
 328     local_cr = None
 329     if not cr:
 330         db_name = getattr(threading.currentThread(), 'dbname', None)
 331         if db_name:
 332             local_cr = cr = pooler.get_db(db_name).cursor()
 333         else:
 334             raise Exception("No database cursor found, please pass one explicitly")
 335
 336     # Send Email
 337     try:
 338         mail_server_pool = pooler.get_pool(cr.dbname).get('ir.mail_server')
 339         res = False
 340         # Pack Message into MIME Object
 341         email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
 342                    attachments, message_id, references, openobject_id, subtype, headers=headers)
 343
 344         res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
 345                        smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
 346                        smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
 347     except Exception:
 348         _logger.exception("tools.email_send failed to deliver email")
 349         return False
 350     finally:
 351         if local_cr:
 352             cr.close()
 353     return res
 354
 355 def email_split(text):
 356     """ Return a list of the email addresses found in ``text`` """
 357     if not text:
 358         return []
 359     return re.findall(r'([^ ,<@]+@[^> ,]+)', text)