openerp/tools/mail.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Business Applications
   5 #    Copyright (C) 2012-2013 OpenERP S.A. (<http://openerp.com>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21
  22 from lxml import etree
  23 import cgi
  24 import logging
  25 import lxml.html
  26 import lxml.html.clean as clean
  27 import random
  28 import re
  29 import socket
  30 import threading
  31 import time
  32
  33 import openerp
  34 from openerp.loglevels import ustr
  35
  36 _logger = logging.getLogger(__name__)
  37
  38
  39 #----------------------------------------------------------
  40 # HTML Sanitizer
  41 #----------------------------------------------------------
  42
  43 tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
  44 tags_to_remove = ['html', 'body', 'font']
  45
  46 # allow new semantic HTML5 tags
  47 allowed_tags = clean.defs.tags | frozenset('article section header footer hgroup nav aside figure'.split())
  48 safe_attrs = clean.defs.safe_attrs | frozenset(['style'])
  49
  50 def html_sanitize(src, silent=True):
  51     if not src:
  52         return src
  53     src = ustr(src, errors='replace')
  54
  55     logger = logging.getLogger(__name__ + '.html_sanitize')
  56
  57     # html encode email tags
  58     part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
  59     src = part.sub(lambda m: cgi.escape(m.group(1)), src)
  60
  61     kwargs = {
  62         'page_structure': True,
  63         'style': False,             # do not remove style attributes
  64         'forms': True,              # remove form tags
  65         'remove_unknown_tags': False,
  66         'allow_tags': allowed_tags,
  67     }
  68     if etree.LXML_VERSION >= (2, 3, 1):
  69         # kill_tags attribute has been added in version 2.3.1
  70         kwargs.update({
  71             'kill_tags': tags_to_kill,
  72             'remove_tags': tags_to_remove,
  73         })
  74     else:
  75         kwargs['remove_tags'] = tags_to_kill + tags_to_remove
  76
  77     if etree.LXML_VERSION >= (3, 1, 0):
  78         kwargs.update({
  79             'safe_attrs_only': True,
  80             'safe_attrs': safe_attrs,
  81         })
  82     else:
  83         # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
  84         kwargs['safe_attrs_only'] = False
  85
  86     try:
  87         # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
  88         cleaner = clean.Cleaner(**kwargs)
  89         cleaned = cleaner.clean_html(src)
  90     except etree.ParserError:
  91         if not silent:
  92             raise
  93         logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
  94         cleaned = '<p>ParserError when sanitizing</p>'
  95     except Exception:
  96         if not silent:
  97             raise
  98         logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
  99         cleaned = '<p>Unknown error when sanitizing</p>'
 100     return cleaned
 101
 102
 103 #----------------------------------------------------------
 104 # HTML Cleaner
 105 #----------------------------------------------------------
 106
 107 def html_email_clean(html, remove=False, shorten=False, max_length=300):
 108     """ html_email_clean: clean the html by doing the following steps:
 109
 110      - try to strip email quotes, by removing blockquotes or having some client-
 111        specific heuristics
 112      - try to strip signatures
 113      - shorten the html to a maximum number of characters if requested
 114
 115     Some specific use case:
 116
 117      - MsOffice: ``div.style = border-top:solid;`` delimitates the beginning of
 118        a quote; detecting by finding WordSection1 of MsoNormal
 119      - Hotmail: ``hr.stopSpelling`` delimitates the beginning of a quote; detect
 120        Hotmail by funding ``SkyDrivePlaceholder``
 121
 122     :param string html: sanitized html; tags like html or head should not
 123                         be present in the html string. This method therefore
 124                         takes as input html code coming from a sanitized source,
 125                         like fields.html.
 126     :param boolean remove: remove the html code that is unwanted; otherwise it
 127                            is only flagged and tagged
 128     :param boolean shorten: shorten the html; every excessing content will
 129                             be flagged as to remove
 130     :param int max_length: if shortening, maximum number of characters before
 131                            shortening
 132     """
 133     def _replace_matching_regex(regex, source, replace=''):
 134         """ Replace all matching expressions in source by replace """
 135         if not source:
 136             return source
 137         dest = ''
 138         idx = 0
 139         for item in re.finditer(regex, source):
 140             dest += source[idx:item.start()] + replace
 141             idx = item.end()
 142         dest += source[idx:]
 143         return dest
 144
 145     def _create_node(tag, text, tail=None, attrs={}):
 146         new_node = etree.Element(tag)
 147         new_node.text = text
 148         new_node.tail = tail
 149         for key, val in attrs.iteritems():
 150             new_node.set(key, val)
 151         return new_node
 152
 153     def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs={}):
 154         new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs)
 155         node.insert(index, new_node)
 156         return new_node
 157
 158     def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs={}):
 159         text = node.text or ''
 160         if not re.search(regex, text):
 161             return
 162
 163         cur_node = node
 164         node.text = ''
 165         idx, iteration = 0, 0
 166         for item in re.finditer(regex, text):
 167             if iteration == 0:
 168                 cur_node.text = text[idx:item.start()]
 169             else:
 170                 _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
 171             new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)
 172
 173             cur_node = new_node
 174             idx = item.end()
 175             iteration += 1
 176         new_node = _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})
 177
 178     if not html or not isinstance(html, basestring):
 179         return html
 180     html = ustr(html)
 181
 182     # Pre processing
 183     # ------------------------------------------------------------
 184     # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'
 185
 186     # html: remove encoding attribute inside tags
 187     doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
 188     html = doctype.sub(r"", html)
 189
 190     # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
 191     br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
 192     html = _replace_matching_regex(br_div_tags, html, '<br />')
 193
 194     # form a tree
 195     root = lxml.html.fromstring(html)
 196     if not len(root) and root.text is None and root.tail is None:
 197         html = '<div>%s</div>' % html
 198         root = lxml.html.fromstring(html)
 199
 200     # remove all tails and replace them by a span element, because managing text and tails can be a pain in the ass
 201     for node in root.getiterator():
 202         if node.tail:
 203             tail_node = _create_node('span', node.tail)
 204             node.tail = None
 205             node.addnext(tail_node)
 206
 207     # form node and tag text-based quotes and signature
 208     quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
 209     signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[^.]+)')
 210     for node in root.getiterator():
 211         _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
 212         _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})
 213
 214     # Processing
 215     # ------------------------------------------------------------
 216
 217     # tree: tag nodes
 218     # signature_begin = False  # try dynamic signature recognition
 219     quote_begin = False
 220     overlength = False
 221     cur_char_nbr = 0
 222     for node in root.getiterator():
 223         # root: try to tag the client used to write the html
 224         if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
 225             root.set('msoffice', '1')
 226         if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
 227             root.set('hotmail', '1')
 228
 229         # state of the parsing
 230         if quote_begin:
 231             node.set('in_quote', '1')
 232             node.set('tail_remove', '1')
 233         if overlength:
 234             node.set('in_overlength', '1')
 235             node.set('tail_remove', '1')
 236
 237         if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
 238             quote_begin = True
 239             node.set('in_quote', '1')
 240             node.set('tail_remove', '1')
 241         if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
 242             quote_begin = True
 243             node.set('in_quote', '1')
 244             node.set('tail_remove', '1')
 245
 246         # shorten:
 247         # 1/ truncate the text at the next available space
 248         # 2/ create a 'read more' node, next to current node
 249         # 3/ add the truncated text in a new node, next to 'read more' node
 250         if shorten and not overlength and cur_char_nbr + len(node.text or '') > max_length:
 251             overlength = True
 252             # truncate text
 253             innertext = node.text[0:(max_length - cur_char_nbr)]
 254             outertext = node.text[(max_length - cur_char_nbr):]
 255             stop_idx = outertext.find(' ')
 256             if stop_idx == -1:
 257                 stop_idx = len(outertext)
 258             node.text = innertext + outertext[0:stop_idx]
 259             # create <span> ... <a href="#">read more</a></span> node
 260             read_more_node = _create_node('span', ' ... ', None, {'class': 'oe_mail_expand'})
 261             read_more_link_node = _create_node('a', 'read more', None, {'href': '#', 'class': 'oe_mail_expand'})
 262             read_more_node.append(read_more_link_node)
 263             # create outertext node
 264             new_node = _create_node('span', outertext[stop_idx:])
 265             # add newly created nodes in dom
 266             node.addnext(new_node)
 267             node.addnext(read_more_node)
 268             # tag node
 269             new_node.set('in_overlength', '1')
 270
 271             cur_char_nbr += len(node.text or '')
 272
 273         if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
 274             node.set('in_quote', '1')
 275
 276     # Post processing
 277     # ------------------------------------------------------------
 278
 279     to_remove = []
 280     for node in root.getiterator():
 281         if node.get('in_quote') or node.get('in_overlength'):
 282             # copy the node tail into parent text
 283             if node.tail and not node.get('tail_remove'):
 284                 parent = node.getparent()
 285                 parent.tail = node.tail + (parent.tail or '')
 286             to_remove.append(node)
 287         if node.get('tail_remove'):
 288             node.tail = ''
 289     for node in to_remove:
 290         if remove:
 291             node.getparent().remove(node)
 292         else:
 293             if not 'oe_mail_expand' in node.get('class', ''):  # trick: read more link should be displayed even if it's in overlength
 294                 node_class = node.get('class', '') + ' ' + 'oe_mail_cleaned'
 295                 node.set('class', node_class)
 296
 297     # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
 298     html = etree.tostring(root, pretty_print=False)
 299     linebreaks = re.compile(r'<span>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
 300     html = _replace_matching_regex(linebreaks, html, '\n')
 301
 302     return html
 303
 304
 305 #----------------------------------------------------------
 306 # HTML/Text management
 307 #----------------------------------------------------------
 308
 309 def html2plaintext(html, body_id=None, encoding='utf-8'):
 310     """ From an HTML text, convert the HTML to plain text.
 311     If @param body_id is provided then this is the tag where the
 312     body (not necessarily <body>) starts.
 313     """
 314     ## (c) Fry-IT, www.fry-it.com, 2007
 315     ## <peter@fry-it.com>
 316     ## download here: http://www.peterbe.com/plog/html2plaintext
 317
 318     html = ustr(html)
 319     tree = etree.fromstring(html, parser=etree.HTMLParser())
 320
 321     if body_id is not None:
 322         source = tree.xpath('//*[@id=%s]' % (body_id,))
 323     else:
 324         source = tree.xpath('//body')
 325     if len(source):
 326         tree = source[0]
 327
 328     url_index = []
 329     i = 0
 330     for link in tree.findall('.//a'):
 331         url = link.get('href')
 332         if url:
 333             i += 1
 334             link.tag = 'span'
 335             link.text = '%s [%s]' % (link.text, i)
 336             url_index.append(url)
 337
 338     html = ustr(etree.tostring(tree, encoding=encoding))
 339     # \r char is converted into &#13;, must remove it
 340     html = html.replace('&#13;', '')
 341
 342     html = html.replace('<strong>', '*').replace('</strong>', '*')
 343     html = html.replace('<b>', '*').replace('</b>', '*')
 344     html = html.replace('<h3>', '*').replace('</h3>', '*')
 345     html = html.replace('<h2>', '**').replace('</h2>', '**')
 346     html = html.replace('<h1>', '**').replace('</h1>', '**')
 347     html = html.replace('<em>', '/').replace('</em>', '/')
 348     html = html.replace('<tr>', '\n')
 349     html = html.replace('</p>', '\n')
 350     html = re.sub('<br\s*/?>', '\n', html)
 351     html = re.sub('<.*?>', ' ', html)
 352     html = html.replace(' ' * 2, ' ')
 353
 354     # strip all lines
 355     html = '\n'.join([x.strip() for x in html.splitlines()])
 356     html = html.replace('\n' * 2, '\n')
 357
 358     for i, url in enumerate(url_index):
 359         if i == 0:
 360             html += '\n\n'
 361         html += ustr('[%s] %s\n') % (i + 1, url)
 362
 363     return html
 364
 365 def plaintext2html(text, container_tag=False):
 366     """ Convert plaintext into html. Content of the text is escaped to manage
 367         html entities, using cgi.escape().
 368         - all \n,\r are replaced by <br />
 369         - enclose content into <p>
 370         - 2 or more consecutive <br /> are considered as paragraph breaks
 371
 372         :param string container_tag: container of the html; by default the
 373             content is embedded into a <div>
 374     """
 375     text = cgi.escape(ustr(text))
 376
 377     # 1. replace \n and \r
 378     text = text.replace('\n', '<br/>')
 379     text = text.replace('\r', '<br/>')
 380
 381     # 2-3: form paragraphs
 382     idx = 0
 383     final = '<p>'
 384     br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
 385     for item in re.finditer(br_tags, text):
 386         final += text[idx:item.start()] + '</p><p>'
 387         idx = item.end()
 388     final += text[idx:] + '</p>'
 389
 390     # 4. container
 391     if container_tag:
 392         final = '<%s>%s</%s>' % (container_tag, final, container_tag)
 393     return ustr(final)
 394
 395 def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
 396     """ Append extra content at the end of an HTML snippet, trying
 397         to locate the end of the HTML document (</body>, </html>, or
 398         EOF), and converting the provided content in html unless ``plaintext``
 399         is False.
 400         Content conversion can be done in two ways:
 401         - wrapping it into a pre (preserve=True)
 402         - use plaintext2html (preserve=False, using container_tag to wrap the
 403             whole content)
 404         A side-effect of this method is to coerce all HTML tags to
 405         lowercase in ``html``, and strip enclosing <html> or <body> tags in
 406         content if ``plaintext`` is False.
 407
 408         :param str html: html tagsoup (doesn't have to be XHTML)
 409         :param str content: extra content to append
 410         :param bool plaintext: whether content is plaintext and should
 411             be wrapped in a <pre/> tag.
 412         :param bool preserve: if content is plaintext, wrap it into a <pre>
 413             instead of converting it into html
 414     """
 415     html = ustr(html)
 416     if plaintext and preserve:
 417         content = u'\n<pre>%s</pre>\n' % ustr(content)
 418     elif plaintext:
 419         content = '\n%s\n' % plaintext2html(content, container_tag)
 420     else:
 421         content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
 422         content = u'\n%s\n' % ustr(content)
 423     # Force all tags to lowercase
 424     html = re.sub(r'(</?)\W*(\w+)([ >])',
 425         lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
 426     insert_location = html.find('</body>')
 427     if insert_location == -1:
 428         insert_location = html.find('</html>')
 429     if insert_location == -1:
 430         return '%s%s' % (html, content)
 431     return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
 432
 433 #----------------------------------------------------------
 434 # Emails
 435 #----------------------------------------------------------
 436
 437 # matches any email in a body of text
 438 email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", re.VERBOSE)
 439
 440 # matches a string containing only one email
 441 single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$""", re.VERBOSE)
 442
 443 res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
 444 command_re = re.compile("^Set-([a-z]+) *: *(.+)$", re.I + re.UNICODE)
 445
 446 # Updated in 7.0 to match the model name as well
 447 # Typical form of references is <timestamp-openerp-record_id-model_name@domain>
 448 # group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
 449 reference_re = re.compile("<.*-open(?:object|erp)-(\\d+)(?:-([\w.]+))?.*@(.*)>", re.UNICODE)
 450
 451 # Bounce regex
 452 # Typical form of bounce is bounce-128-crm.lead-34@domain
 453 # group(1) = the mail ID; group(2) = the model (if any); group(3) = the record ID
 454 bounce_re = re.compile("[\w]+-(\d+)-?([\w.]+)?-?(\d+)?", re.UNICODE)
 455
 456 def generate_tracking_message_id(res_id):
 457     """Returns a string that can be used in the Message-ID RFC822 header field
 458
 459        Used to track the replies related to a given object thanks to the "In-Reply-To"
 460        or "References" fields that Mail User Agents will set.
 461     """
 462     try:
 463         rnd = random.SystemRandom().random()
 464     except NotImplementedError:
 465         rnd = random.random()
 466     rndstr = ("%.15f" % rnd)[2:]
 467     return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
 468
 469 def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
 470                attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
 471                smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
 472     """Low-level function for sending an email (deprecated).
 473
 474     :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
 475     :param email_from: A string used to fill the `From` header, if falsy,
 476                        config['email_from'] is used instead.  Also used for
 477                        the `Reply-To` header if `reply_to` is not provided
 478     :param email_to: a sequence of addresses to send the mail to.
 479     """
 480
 481     # If not cr, get cr from current thread database
 482     local_cr = None
 483     if not cr:
 484         db_name = getattr(threading.currentThread(), 'dbname', None)
 485         if db_name:
 486             local_cr = cr = openerp.registry(db_name).db.cursor()
 487         else:
 488             raise Exception("No database cursor found, please pass one explicitly")
 489
 490     # Send Email
 491     try:
 492         mail_server_pool = openerp.registry(cr.dbname)['ir.mail_server']
 493         res = False
 494         # Pack Message into MIME Object
 495         email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
 496                    attachments, message_id, references, openobject_id, subtype, headers=headers)
 497
 498         res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
 499                        smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
 500                        smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
 501     except Exception:
 502         _logger.exception("tools.email_send failed to deliver email")
 503         return False
 504     finally:
 505         if local_cr:
 506             cr.close()
 507     return res
 508
 509 def email_split(text):
 510     """ Return a list of the email addresses found in ``text`` """
 511     if not text:
 512         return []
 513     return re.findall(r'([^ ,<@]+@[^> ,]+)', text)