openerp/tools/mail.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Business Applications
   5 #    Copyright (C) 2012-2013 OpenERP S.A. (<http://openerp.com>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21
  22 from lxml import etree
  23 import cgi
  24 import logging
  25 import lxml.html
  26 import lxml.html.clean as clean
  27 import random
  28 import re
  29 import socket
  30 import threading
  31 import time
  32
  33 import openerp
  34 from openerp.loglevels import ustr
  35
  36 _logger = logging.getLogger(__name__)
  37
  38
  39 #----------------------------------------------------------
  40 # HTML Sanitizer
  41 #----------------------------------------------------------
  42
  43 tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
  44 tags_to_remove = ['html', 'body', 'font']
  45
  46 # allow new semantic HTML5 tags
  47 allowed_tags = clean.defs.tags | frozenset('article section header footer hgroup nav aside figure'.split())
  48 safe_attrs = clean.defs.safe_attrs | frozenset(['style'])
  49
  50 def html_sanitize(src, silent=True):
  51     if not src:
  52         return src
  53     src = ustr(src, errors='replace')
  54
  55     logger = logging.getLogger(__name__ + '.html_sanitize')
  56
  57     # html encode email tags
  58     part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
  59     src = part.sub(lambda m: cgi.escape(m.group(1)), src)
  60
  61     kwargs = {
  62         'page_structure': True,
  63         'style': False,             # do not remove style attributes
  64         'forms': True,              # remove form tags
  65         'remove_unknown_tags': False,
  66         'allow_tags': allowed_tags,
  67     }
  68     if etree.LXML_VERSION >= (2, 3, 1):
  69         # kill_tags attribute has been added in version 2.3.1
  70         kwargs.update({
  71             'kill_tags': tags_to_kill,
  72             'remove_tags': tags_to_remove,
  73         })
  74     else:
  75         kwargs['remove_tags'] = tags_to_kill + tags_to_remove
  76
  77     if etree.LXML_VERSION >= (3, 1, 0):
  78         kwargs.update({
  79             'safe_attrs_only': True,
  80             'safe_attrs': safe_attrs,
  81         })
  82     else:
  83         # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
  84         kwargs['safe_attrs_only'] = False
  85
  86     try:
  87         # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
  88         cleaner = clean.Cleaner(**kwargs)
  89         cleaned = cleaner.clean_html(src)
  90     except etree.ParserError:
  91         if not silent:
  92             raise
  93         logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
  94         cleaned = '<p>ParserError when sanitizing</p>'
  95     except Exception:
  96         if not silent:
  97             raise
  98         logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
  99         cleaned = '<p>Unknown error when sanitizing</p>'
 100     return cleaned
 101
 102
 103 #----------------------------------------------------------
 104 # HTML Cleaner
 105 #----------------------------------------------------------
 106
 107 def html_email_clean(html, remove=False, shorten=False, max_length=300):
 108     """ html_email_clean: clean the html by doing the following steps:
 109
 110      - try to strip email quotes, by removing blockquotes or having some client-
 111        specific heuristics
 112      - try to strip signatures
 113      - shorten the html to a maximum number of characters if requested
 114
 115     Some specific use case:
 116
 117      - MsOffice: ``div.style = border-top:solid;`` delimitates the beginning of
 118        a quote; detecting by finding WordSection1 of MsoNormal
 119      - Hotmail: ``hr.stopSpelling`` delimitates the beginning of a quote; detect
 120        Hotmail by funding ``SkyDrivePlaceholder``
 121
 122     :param string html: sanitized html; tags like html or head should not
 123                         be present in the html string. This method therefore
 124                         takes as input html code coming from a sanitized source,
 125                         like fields.html.
 126     :param boolean remove: remove the html code that is unwanted; otherwise it
 127                            is only flagged and tagged
 128     :param boolean shorten: shorten the html; every excessing content will
 129                             be flagged as to remove
 130     :param int max_length: if shortening, maximum number of characters before
 131                            shortening
 132     """
 133     def _replace_matching_regex(regex, source, replace=''):
 134         """ Replace all matching expressions in source by replace """
 135         if not source:
 136             return source
 137         dest = ''
 138         idx = 0
 139         for item in re.finditer(regex, source):
 140             dest += source[idx:item.start()] + replace
 141             idx = item.end()
 142         dest += source[idx:]
 143         return dest
 144
 145     def _create_node(tag, text, tail=None, attrs={}):
 146         new_node = etree.Element(tag)
 147         new_node.text = text
 148         new_node.tail = tail
 149         for key, val in attrs.iteritems():
 150             new_node.set(key, val)
 151         return new_node
 152
 153     def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs={}):
 154         new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs)
 155         node.insert(index, new_node)
 156         return new_node
 157
 158     def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs={}):
 159         text = node.text or ''
 160         if not re.search(regex, text):
 161             return
 162
 163         cur_node = node
 164         node.text = ''
 165         idx, iteration = 0, 0
 166         for item in re.finditer(regex, text):
 167             if iteration == 0:
 168                 cur_node.text = text[idx:item.start()]
 169             else:
 170                 _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
 171             new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)
 172
 173             cur_node = new_node
 174             idx = item.end()
 175             iteration += 1
 176         new_node = _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})
 177
 178     def _truncate_node(node, position, simplify_whitespaces=True):
 179         """ Truncate a node text at a given position. This algorithm will shorten
 180         at the end of the word whose ending character exceeds position.
 181
 182             :param bool simplify_whitespaces: whether to try to count all successive
 183                                               whitespaces as one character. This
 184                                               option should not be True when trying
 185                                               to keep 'pre' consistency.
 186         """
 187         if node.text is None:
 188             node.text = ''
 189
 190         truncate_idx = -1
 191         if simplify_whitespaces:
 192             cur_char_nbr = 0
 193             word = None
 194             node_words = node.text.strip(' \t\r\n').split()
 195             for word in node_words:
 196                 cur_char_nbr += len(word)
 197                 if cur_char_nbr >= position:
 198                     break
 199             if word:
 200                 truncate_idx = node.text.find(word) + len(word)
 201         else:
 202             truncate_idx = position
 203         if truncate_idx == -1 or truncate_idx > len(node.text):
 204             truncate_idx = len(node.text)
 205
 206         # compose new text bits
 207         innertext = node.text[0:truncate_idx]
 208         outertext = node.text[truncate_idx:]
 209         node.text = innertext
 210
 211         # create <span> ... <a href="#">read more</a></span> node
 212         read_more_node = _create_node('span', ' ... ', None, {'class': 'oe_mail_expand'})
 213         read_more_link_node = _create_node('a', 'read more', None, {'href': '#', 'class': 'oe_mail_expand'})
 214         read_more_node.append(read_more_link_node)
 215         # create outertext node
 216         overtext_node = _create_node('span', outertext)
 217         # tag node
 218         overtext_node.set('in_overlength', '1')
 219         # add newly created nodes in dom
 220         node.append(read_more_node)
 221         node.append(overtext_node)
 222
 223     if not html or not isinstance(html, basestring):
 224         return html
 225     html = ustr(html)
 226
 227     # Pre processing
 228     # ------------------------------------------------------------
 229     # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'
 230
 231     # html: remove encoding attribute inside tags
 232     doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
 233     html = doctype.sub(r"", html)
 234
 235     # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
 236     br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
 237     html = _replace_matching_regex(br_div_tags, html, '<br />')
 238
 239     # form a tree
 240     root = lxml.html.fromstring(html)
 241     if not len(root) and root.text is None and root.tail is None:
 242         html = '<div>%s</div>' % html
 243         root = lxml.html.fromstring(html)
 244
 245     quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
 246     signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[\s\S]+)')
 247     for node in root.iter():
 248         # remove all tails and replace them by a span element, because managing text and tails can be a pain in the ass
 249         if node.tail:
 250             tail_node = _create_node('span', node.tail)
 251             node.tail = None
 252             node.addnext(tail_node)
 253
 254         # form node and tag text-based quotes and signature
 255         _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
 256         _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})
 257
 258     # Processing
 259     # ------------------------------------------------------------
 260
 261     # tree: tag nodes
 262     # signature_begin = False  # try dynamic signature recognition
 263     quote_begin = False
 264     overlength = False
 265     cur_char_nbr = 0
 266     for node in root.iter():
 267         # node_text = re.sub('\s{2,}', ' ', node.text and node.text.strip(' \t\r\n') or '')  # do not take into account multiple spaces that are displayed as max 1 space in html
 268         node_text = ' '.join((node.text and node.text.strip(' \t\r\n') or '').split())
 269
 270         # root: try to tag the client used to write the html
 271         if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
 272             root.set('msoffice', '1')
 273         if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
 274             root.set('hotmail', '1')
 275
 276         # state of the parsing: flag quotes and tails to remove
 277         if quote_begin:
 278             node.set('in_quote', '1')
 279             node.set('tail_remove', '1')
 280         # state of the parsing: flag when being in over-length content
 281         if overlength:
 282             node.set('in_overlength', '1')
 283             node.set('tail_remove', '1')
 284
 285         # find quote in msoffice / hotmail / blockquote / text quote and signatures
 286         if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
 287             quote_begin = True
 288             node.set('in_quote', '1')
 289             node.set('tail_remove', '1')
 290         if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
 291             quote_begin = True
 292             node.set('in_quote', '1')
 293             node.set('tail_remove', '1')
 294         if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
 295             node.set('in_quote', '1')
 296
 297         # shorten:
 298         # 1/ truncate the text at the next available space
 299         # 2/ create a 'read more' node, next to current node
 300         # 3/ add the truncated text in a new node, next to 'read more' node
 301         if shorten and not overlength and cur_char_nbr + len(node_text) > max_length:
 302             node_to_truncate = node
 303             while node_to_truncate.get('in_quote') and node_to_truncate.getparent() is not None:
 304                 node_to_truncate = node_to_truncate.getparent()
 305             overlength = True
 306             node_to_truncate.set('truncate', '1')
 307             if node_to_truncate == node:
 308                 node_to_truncate.set('truncate_position', str(max_length - cur_char_nbr))
 309             else:
 310                 node_to_truncate.set('truncate_position', str(len(node.text or '')))
 311         cur_char_nbr += len(node_text)
 312
 313     # Tree modification
 314     # ------------------------------------------------------------
 315
 316     for node in root.iter():
 317         if node.get('truncate'):
 318             _truncate_node(node, int(node.get('truncate_position', '0')), node.tag != 'pre')
 319
 320     # Post processing
 321     # ------------------------------------------------------------
 322
 323     to_remove = []
 324     for node in root.iter():
 325         if node.get('in_quote') or node.get('in_overlength'):
 326             # copy the node tail into parent text
 327             if node.tail and not node.get('tail_remove'):
 328                 parent = node.getparent()
 329                 parent.tail = node.tail + (parent.tail or '')
 330             to_remove.append(node)
 331         if node.get('tail_remove'):
 332             node.tail = ''
 333         # clean node
 334         for attribute_name in ['in_quote', 'tail_remove', 'in_overlength', 'msoffice', 'hotmail', 'truncate', 'truncate_position']:
 335             node.attrib.pop(attribute_name, None)
 336     for node in to_remove:
 337         if remove:
 338             node.getparent().remove(node)
 339         else:
 340             if not 'oe_mail_expand' in node.get('class', ''):  # trick: read more link should be displayed even if it's in overlength
 341                 node_class = node.get('class', '') + ' oe_mail_cleaned'
 342                 node.set('class', node_class)
 343
 344     # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
 345     html = etree.tostring(root, pretty_print=False)
 346     linebreaks = re.compile(r'<span[^>]*>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
 347     html = _replace_matching_regex(linebreaks, html, '\n')
 348
 349     return html
 350
 351
 352 #----------------------------------------------------------
 353 # HTML/Text management
 354 #----------------------------------------------------------
 355
 356 def html2plaintext(html, body_id=None, encoding='utf-8'):
 357     """ From an HTML text, convert the HTML to plain text.
 358     If @param body_id is provided then this is the tag where the
 359     body (not necessarily <body>) starts.
 360     """
 361     ## (c) Fry-IT, www.fry-it.com, 2007
 362     ## <peter@fry-it.com>
 363     ## download here: http://www.peterbe.com/plog/html2plaintext
 364
 365     html = ustr(html)
 366     tree = etree.fromstring(html, parser=etree.HTMLParser())
 367
 368     if body_id is not None:
 369         source = tree.xpath('//*[@id=%s]' % (body_id,))
 370     else:
 371         source = tree.xpath('//body')
 372     if len(source):
 373         tree = source[0]
 374
 375     url_index = []
 376     i = 0
 377     for link in tree.findall('.//a'):
 378         url = link.get('href')
 379         if url:
 380             i += 1
 381             link.tag = 'span'
 382             link.text = '%s [%s]' % (link.text, i)
 383             url_index.append(url)
 384
 385     html = ustr(etree.tostring(tree, encoding=encoding))
 386     # \r char is converted into &#13;, must remove it
 387     html = html.replace('&#13;', '')
 388
 389     html = html.replace('<strong>', '*').replace('</strong>', '*')
 390     html = html.replace('<b>', '*').replace('</b>', '*')
 391     html = html.replace('<h3>', '*').replace('</h3>', '*')
 392     html = html.replace('<h2>', '**').replace('</h2>', '**')
 393     html = html.replace('<h1>', '**').replace('</h1>', '**')
 394     html = html.replace('<em>', '/').replace('</em>', '/')
 395     html = html.replace('<tr>', '\n')
 396     html = html.replace('</p>', '\n')
 397     html = re.sub('<br\s*/?>', '\n', html)
 398     html = re.sub('<.*?>', ' ', html)
 399     html = html.replace(' ' * 2, ' ')
 400
 401     # strip all lines
 402     html = '\n'.join([x.strip() for x in html.splitlines()])
 403     html = html.replace('\n' * 2, '\n')
 404
 405     for i, url in enumerate(url_index):
 406         if i == 0:
 407             html += '\n\n'
 408         html += ustr('[%s] %s\n') % (i + 1, url)
 409
 410     return html
 411
 412 def plaintext2html(text, container_tag=False):
 413     """ Convert plaintext into html. Content of the text is escaped to manage
 414         html entities, using cgi.escape().
 415         - all \n,\r are replaced by <br />
 416         - enclose content into <p>
 417         - 2 or more consecutive <br /> are considered as paragraph breaks
 418
 419         :param string container_tag: container of the html; by default the
 420             content is embedded into a <div>
 421     """
 422     text = cgi.escape(ustr(text))
 423
 424     # 1. replace \n and \r
 425     text = text.replace('\n', '<br/>')
 426     text = text.replace('\r', '<br/>')
 427
 428     # 2-3: form paragraphs
 429     idx = 0
 430     final = '<p>'
 431     br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
 432     for item in re.finditer(br_tags, text):
 433         final += text[idx:item.start()] + '</p><p>'
 434         idx = item.end()
 435     final += text[idx:] + '</p>'
 436
 437     # 4. container
 438     if container_tag:
 439         final = '<%s>%s</%s>' % (container_tag, final, container_tag)
 440     return ustr(final)
 441
 442 def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
 443     """ Append extra content at the end of an HTML snippet, trying
 444         to locate the end of the HTML document (</body>, </html>, or
 445         EOF), and converting the provided content in html unless ``plaintext``
 446         is False.
 447         Content conversion can be done in two ways:
 448         - wrapping it into a pre (preserve=True)
 449         - use plaintext2html (preserve=False, using container_tag to wrap the
 450             whole content)
 451         A side-effect of this method is to coerce all HTML tags to
 452         lowercase in ``html``, and strip enclosing <html> or <body> tags in
 453         content if ``plaintext`` is False.
 454
 455         :param str html: html tagsoup (doesn't have to be XHTML)
 456         :param str content: extra content to append
 457         :param bool plaintext: whether content is plaintext and should
 458             be wrapped in a <pre/> tag.
 459         :param bool preserve: if content is plaintext, wrap it into a <pre>
 460             instead of converting it into html
 461     """
 462     html = ustr(html)
 463     if plaintext and preserve:
 464         content = u'\n<pre>%s</pre>\n' % ustr(content)
 465     elif plaintext:
 466         content = '\n%s\n' % plaintext2html(content, container_tag)
 467     else:
 468         content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
 469         content = u'\n%s\n' % ustr(content)
 470     # Force all tags to lowercase
 471     html = re.sub(r'(</?)\W*(\w+)([ >])',
 472         lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
 473     insert_location = html.find('</body>')
 474     if insert_location == -1:
 475         insert_location = html.find('</html>')
 476     if insert_location == -1:
 477         return '%s%s' % (html, content)
 478     return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
 479
 480 #----------------------------------------------------------
 481 # Emails
 482 #----------------------------------------------------------
 483
 484 # matches any email in a body of text
 485 email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", re.VERBOSE)
 486
 487 # matches a string containing only one email
 488 single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$""", re.VERBOSE)
 489
 490 res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
 491 command_re = re.compile("^Set-([a-z]+) *: *(.+)$", re.I + re.UNICODE)
 492
 493 # Updated in 7.0 to match the model name as well
 494 # Typical form of references is <timestamp-openerp-record_id-model_name@domain>
 495 # group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
 496 reference_re = re.compile("<.*-open(?:object|erp)-(\\d+)(?:-([\w.]+))?.*@(.*)>", re.UNICODE)
 497
 498 # Bounce regex
 499 # Typical form of bounce is bounce-128-crm.lead-34@domain
 500 # group(1) = the mail ID; group(2) = the model (if any); group(3) = the record ID
 501 bounce_re = re.compile("[\w]+-(\d+)-?([\w.]+)?-?(\d+)?", re.UNICODE)
 502
 503 def generate_tracking_message_id(res_id):
 504     """Returns a string that can be used in the Message-ID RFC822 header field
 505
 506        Used to track the replies related to a given object thanks to the "In-Reply-To"
 507        or "References" fields that Mail User Agents will set.
 508     """
 509     try:
 510         rnd = random.SystemRandom().random()
 511     except NotImplementedError:
 512         rnd = random.random()
 513     rndstr = ("%.15f" % rnd)[2:]
 514     return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
 515
 516 def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
 517                attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
 518                smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
 519     """Low-level function for sending an email (deprecated).
 520
 521     :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
 522     :param email_from: A string used to fill the `From` header, if falsy,
 523                        config['email_from'] is used instead.  Also used for
 524                        the `Reply-To` header if `reply_to` is not provided
 525     :param email_to: a sequence of addresses to send the mail to.
 526     """
 527
 528     # If not cr, get cr from current thread database
 529     local_cr = None
 530     if not cr:
 531         db_name = getattr(threading.currentThread(), 'dbname', None)
 532         if db_name:
 533             local_cr = cr = openerp.registry(db_name).db.cursor()
 534         else:
 535             raise Exception("No database cursor found, please pass one explicitly")
 536
 537     # Send Email
 538     try:
 539         mail_server_pool = openerp.registry(cr.dbname)['ir.mail_server']
 540         res = False
 541         # Pack Message into MIME Object
 542         email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
 543                    attachments, message_id, references, openobject_id, subtype, headers=headers)
 544
 545         res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
 546                        smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
 547                        smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
 548     except Exception:
 549         _logger.exception("tools.email_send failed to deliver email")
 550         return False
 551     finally:
 552         if local_cr:
 553             cr.close()
 554     return res
 555
 556 def email_split(text):
 557     """ Return a list of the email addresses found in ``text`` """
 558     if not text:
 559         return []
 560     return re.findall(r'([^ ,<@]+@[^> ,]+)', text)