openerp/tools/mail.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Business Applications
   5 #    Copyright (C) 2012-2013 OpenERP S.A. (<http://openerp.com>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21
  22 from lxml import etree
  23 import cgi
  24 import logging
  25 import lxml.html
  26 import lxml.html.clean as clean
  27 import openerp.pooler as pooler
  28 import random
  29 import re
  30 import socket
  31 import threading
  32 import time
  33 from email.utils import getaddresses
  34
  35 from openerp.loglevels import ustr
  36
  37 _logger = logging.getLogger(__name__)
  38
  39
  40 #----------------------------------------------------------
  41 # HTML Sanitizer
  42 #----------------------------------------------------------
  43
  44 tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
  45 tags_to_remove = ['html', 'body', 'font']
  46
  47
  48 def html_sanitize(src):
  49     if not src:
  50         return src
  51     src = ustr(src, errors='replace')
  52
  53     # html encode email tags
  54     part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
  55     src = part.sub(lambda m: cgi.escape(m.group(1)), src)
  56
  57     # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
  58     try:
  59         cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, kill_tags=tags_to_kill, remove_tags=tags_to_remove)
  60         cleaned = cleaner.clean_html(src)
  61     except TypeError:
  62         # lxml.clean version < 2.3.1 does not have a kill_tags attribute
  63         # to remove in 2014
  64         cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, remove_tags=tags_to_kill + tags_to_remove)
  65         cleaned = cleaner.clean_html(src)
  66     except Exception, e:
  67         if isinstance(e, etree.ParserError) and 'empty' in str(e):
  68             return ""
  69         _logger.warning('html_sanitize failed to parse %s' % (src))
  70         cleaned = '<p>Impossible to parse</p>'
  71     return cleaned
  72
  73
  74 #----------------------------------------------------------
  75 # HTML Cleaner
  76 #----------------------------------------------------------
  77
  78 def html_email_clean(html):
  79     """ html_email_clean: clean the html to display in the web client.
  80         - strip email quotes (remove blockquote nodes)
  81         - strip signatures (remove --\n{\n)Blahblah), by replacing <br> by
  82             \n to avoid ignoring signatures converted into html
  83
  84         :param string html: sanitized html; tags like html or head should not
  85             be present in the html string. This method therefore takes as input
  86             html code coming from a sanitized source, like fields.html.
  87     """
  88     def _replace_matching_regex(regex, source, replace=''):
  89         dest = ''
  90         idx = 0
  91         for item in re.finditer(regex, source):
  92             dest += source[idx:item.start()] + replace
  93             idx = item.end()
  94         dest += source[idx:]
  95         return dest
  96
  97     if not html or not isinstance(html, basestring):
  98         return html
  99
 100     html = ustr(html)
 101
 102     # 0. remove encoding attribute inside tags
 103     doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
 104     html = doctype.sub(r"", html)
 105
 106     # 1. <br[ /]> -> \n, because otherwise the tree is obfuscated
 107     br_tags = re.compile(r'([<]\s*[bB][rR]\s*\/?[>])')
 108     html = _replace_matching_regex(br_tags, html, '__BR_TAG__')
 109
 110     # 2. form a tree, handle (currently ?) pure-text by enclosing them in a pre
 111     root = lxml.html.fromstring(html)
 112     if not len(root) and root.text is None and root.tail is None:
 113         html = '<div>%s</div>' % html
 114         root = lxml.html.fromstring(html)
 115
 116     # 2.5 remove quoted text in nodes
 117     quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
 118     for node in root.getiterator():
 119         if not node.text:
 120             continue
 121         node.text = _replace_matching_regex(quote_tags, node.text)
 122
 123     # 3. remove blockquotes
 124     quotes = [el for el in root.getiterator(tag='blockquote')]
 125     for node in quotes:
 126         # copy the node tail into parent text
 127         if node.tail:
 128             parent = node.getparent()
 129             parent.text = parent.text or '' + node.tail
 130         # remove the node
 131         node.getparent().remove(node)
 132
 133     # 4. strip signatures
 134     signature = re.compile(r'([-]{2}[\s]?[\r\n]{1,2}[^\z]+)')
 135     for elem in root.getiterator():
 136         if elem.text:
 137             match = re.search(signature, elem.text)
 138             if match:
 139                 elem.text = elem.text[:match.start()] + elem.text[match.end():]
 140         if elem.tail:
 141             match = re.search(signature, elem.tail)
 142             if match:
 143                 elem.tail = elem.tail[:match.start()] + elem.tail[match.end():]
 144
 145     # 5. \n back to <br/>
 146     html = etree.tostring(root, pretty_print=True)
 147     html = html.replace('__BR_TAG__', '<br />')
 148
 149     # 6. Misc cleaning :
 150     # - ClEditor seems to love using <div><br /><div> -> replace with <br />
 151     br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)')
 152     html = _replace_matching_regex(br_div_tags, html, '<br />')
 153
 154     return html
 155
 156
 157 #----------------------------------------------------------
 158 # HTML/Text management
 159 #----------------------------------------------------------
 160
 161 def html2plaintext(html, body_id=None, encoding='utf-8'):
 162     """ From an HTML text, convert the HTML to plain text.
 163     If @param body_id is provided then this is the tag where the
 164     body (not necessarily <body>) starts.
 165     """
 166     ## (c) Fry-IT, www.fry-it.com, 2007
 167     ## <peter@fry-it.com>
 168     ## download here: http://www.peterbe.com/plog/html2plaintext
 169
 170     html = ustr(html)
 171     tree = etree.fromstring(html, parser=etree.HTMLParser())
 172
 173     if body_id is not None:
 174         source = tree.xpath('//*[@id=%s]' % (body_id,))
 175     else:
 176         source = tree.xpath('//body')
 177     if len(source):
 178         tree = source[0]
 179
 180     url_index = []
 181     i = 0
 182     for link in tree.findall('.//a'):
 183         url = link.get('href')
 184         if url:
 185             i += 1
 186             link.tag = 'span'
 187             link.text = '%s [%s]' % (link.text, i)
 188             url_index.append(url)
 189
 190     html = ustr(etree.tostring(tree, encoding=encoding))
 191     # \r char is converted into &#13;, must remove it
 192     html = html.replace('&#13;', '')
 193
 194     html = html.replace('<strong>', '*').replace('</strong>', '*')
 195     html = html.replace('<b>', '*').replace('</b>', '*')
 196     html = html.replace('<h3>', '*').replace('</h3>', '*')
 197     html = html.replace('<h2>', '**').replace('</h2>', '**')
 198     html = html.replace('<h1>', '**').replace('</h1>', '**')
 199     html = html.replace('<em>', '/').replace('</em>', '/')
 200     html = html.replace('<tr>', '\n')
 201     html = html.replace('</p>', '\n')
 202     html = re.sub('<br\s*/?>', '\n', html)
 203     html = re.sub('<.*?>', ' ', html)
 204     html = html.replace(' ' * 2, ' ')
 205
 206     # strip all lines
 207     html = '\n'.join([x.strip() for x in html.splitlines()])
 208     html = html.replace('\n' * 2, '\n')
 209
 210     for i, url in enumerate(url_index):
 211         if i == 0:
 212             html += '\n\n'
 213         html += ustr('[%s] %s\n') % (i + 1, url)
 214
 215     return html
 216
 217 def plaintext2html(text, container_tag=False):
 218     """ Convert plaintext into html. Content of the text is escaped to manage
 219         html entities, using cgi.escape().
 220         - all \n,\r are replaced by <br />
 221         - enclose content into <p>
 222         - 2 or more consecutive <br /> are considered as paragraph breaks
 223
 224         :param string container_tag: container of the html; by default the
 225             content is embedded into a <div>
 226     """
 227     text = cgi.escape(ustr(text))
 228
 229     # 1. replace \n and \r
 230     text = text.replace('\n', '<br/>')
 231     text = text.replace('\r', '<br/>')
 232
 233     # 2-3: form paragraphs
 234     idx = 0
 235     final = '<p>'
 236     br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
 237     for item in re.finditer(br_tags, text):
 238         final += text[idx:item.start()] + '</p><p>'
 239         idx = item.end()
 240     final += text[idx:] + '</p>'
 241
 242     # 4. container
 243     if container_tag:
 244         final = '<%s>%s</%s>' % (container_tag, final, container_tag)
 245     return ustr(final)
 246
 247 def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
 248     """ Append extra content at the end of an HTML snippet, trying
 249         to locate the end of the HTML document (</body>, </html>, or
 250         EOF), and converting the provided content in html unless ``plaintext``
 251         is False.
 252         Content conversion can be done in two ways:
 253         - wrapping it into a pre (preserve=True)
 254         - use plaintext2html (preserve=False, using container_tag to wrap the
 255             whole content)
 256         A side-effect of this method is to coerce all HTML tags to
 257         lowercase in ``html``, and strip enclosing <html> or <body> tags in
 258         content if ``plaintext`` is False.
 259
 260         :param str html: html tagsoup (doesn't have to be XHTML)
 261         :param str content: extra content to append
 262         :param bool plaintext: whether content is plaintext and should
 263             be wrapped in a <pre/> tag.
 264         :param bool preserve: if content is plaintext, wrap it into a <pre>
 265             instead of converting it into html
 266     """
 267     html = ustr(html)
 268     if plaintext and preserve:
 269         content = u'\n<pre>%s</pre>\n' % ustr(content)
 270     elif plaintext:
 271         content = '\n%s\n' % plaintext2html(content, container_tag)
 272     else:
 273         content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
 274         content = u'\n%s\n' % ustr(content)
 275     # Force all tags to lowercase
 276     html = re.sub(r'(</?)\W*(\w+)([ >])',
 277         lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
 278     insert_location = html.find('</body>')
 279     if insert_location == -1:
 280         insert_location = html.find('</html>')
 281     if insert_location == -1:
 282         return '%s%s' % (html, content)
 283     return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
 284
 285 #----------------------------------------------------------
 286 # Emails
 287 #----------------------------------------------------------
 288
 289 email_re = re.compile(r"""
 290     ([a-zA-Z][\w\.-]*[a-zA-Z0-9]     # username part
 291     @                                # mandatory @ sign
 292     [a-zA-Z0-9][\w\.-]*              # domain must start with a letter ... Ged> why do we include a 0-9 then?
 293      \.
 294      [a-z]{2,3}                      # TLD
 295     )
 296     """, re.VERBOSE)
 297 res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
 298 command_re = re.compile("^Set-([a-z]+) *: *(.+)$", re.I + re.UNICODE)
 299
 300 # Updated in 7.0 to match the model name as well
 301 # Typical form of references is <timestamp-openerp-record_id-model_name@domain>
 302 # group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
 303 reference_re = re.compile("<.*-open(?:object|erp)-(\\d+)(?:-([\w.]+))?.*@(.*)>", re.UNICODE)
 304
 305 def generate_tracking_message_id(res_id):
 306     """Returns a string that can be used in the Message-ID RFC822 header field
 307
 308        Used to track the replies related to a given object thanks to the "In-Reply-To"
 309        or "References" fields that Mail User Agents will set.
 310     """
 311     try:
 312         rnd = random.SystemRandom().random()
 313     except NotImplementedError:
 314         rnd = random.random()
 315     rndstr = ("%.15f" % rnd)[2:]
 316     return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
 317
 318 def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
 319                attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
 320                smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
 321     """Low-level function for sending an email (deprecated).
 322
 323     :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
 324     :param email_from: A string used to fill the `From` header, if falsy,
 325                        config['email_from'] is used instead.  Also used for
 326                        the `Reply-To` header if `reply_to` is not provided
 327     :param email_to: a sequence of addresses to send the mail to.
 328     """
 329
 330     # If not cr, get cr from current thread database
 331     local_cr = None
 332     if not cr:
 333         db_name = getattr(threading.currentThread(), 'dbname', None)
 334         if db_name:
 335             local_cr = cr = pooler.get_db(db_name).cursor()
 336         else:
 337             raise Exception("No database cursor found, please pass one explicitly")
 338
 339     # Send Email
 340     try:
 341         mail_server_pool = pooler.get_pool(cr.dbname).get('ir.mail_server')
 342         res = False
 343         # Pack Message into MIME Object
 344         email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
 345                    attachments, message_id, references, openobject_id, subtype, headers=headers)
 346
 347         res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
 348                        smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
 349                        smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
 350     except Exception:
 351         _logger.exception("tools.email_send failed to deliver email")
 352         return False
 353     finally:
 354         if local_cr:
 355             cr.close()
 356     return res
 357
 358 def email_split(text):
 359     """ Return a list of the email addresses found in ``text`` """
 360     if not text:
 361         return []
 362     return [addr[1] for addr in getaddresses([text])
 363                 # getaddresses() returns '' when email parsing fails, and
 364                 # sometimes returns emails without at least '@'. The '@'
 365                 # is strictly required in RFC2822's `addr-spec`.
 366                 if addr[1]
 367                 if '@' in addr[1]]