openerp/tools/mail.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Business Applications
   5 #    Copyright (C) 2012-2013 OpenERP S.A. (<http://openerp.com>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21
  22 from lxml import etree
  23 import cgi
  24 import logging
  25 import lxml.html
  26 import lxml.html.clean as clean
  27 import random
  28 import re
  29 import socket
  30 import threading
  31 import time
  32
  33 import openerp
  34 from openerp.loglevels import ustr
  35
  36 _logger = logging.getLogger(__name__)
  37
  38
  39 #----------------------------------------------------------
  40 # HTML Sanitizer
  41 #----------------------------------------------------------
  42
  43 tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
  44 tags_to_remove = ['html', 'body', 'font']
  45
  46
  47 def html_sanitize(src, silent=True):
  48     if not src:
  49         return src
  50     src = ustr(src, errors='replace')
  51
  52     logger = logging.getLogger(__name__ + '.html_sanitize')
  53
  54     # html encode email tags
  55     part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
  56     src = part.sub(lambda m: cgi.escape(m.group(1)), src)
  57
  58     kwargs = {
  59         'page_structure': True,
  60         'style': False,             # do not remove style attributes
  61         'forms': True,              # remove form tags
  62     }
  63     if etree.LXML_VERSION >= (2, 3, 1):
  64         # kill_tags attribute has been added in version 2.3.1
  65         kwargs.update({
  66             'kill_tags': tags_to_kill,
  67             'remove_tags': tags_to_remove,
  68         })
  69     else:
  70         kwargs['remove_tags'] = tags_to_kill + tags_to_remove
  71
  72     if etree.LXML_VERSION >= (3, 1, 0):
  73         kwargs.update({
  74             'safe_attrs_only': True,
  75             'safe_attrs': clean.defs.safe_attrs | set(['style']),
  76         })
  77     else:
  78         # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
  79         kwargs['safe_attrs_only'] = False
  80
  81     try:
  82         # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
  83         cleaner = clean.Cleaner(**kwargs)
  84         cleaned = cleaner.clean_html(src)
  85     except etree.ParserError:
  86         if not silent:
  87             raise
  88         logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
  89         cleaned = '<p>ParserError when sanitizing</p>'
  90     except Exception:
  91         if not silent:
  92             raise
  93         logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
  94         cleaned = '<p>Unknown error when sanitizing</p>'
  95     return cleaned
  96
  97
  98 #----------------------------------------------------------
  99 # HTML Cleaner
 100 #----------------------------------------------------------
 101
 102 def html_email_clean(html, remove=False, shorten=False, max_length=300):
 103     """ html_email_clean: clean the html by doing the following steps:
 104
 105      - try to strip email quotes, by removing blockquotes or having some client-
 106        specific heuristics
 107      - try to strip signatures
 108      - shorten the html to a maximum number of characters if requested
 109
 110     Some specific use case:
 111
 112      - MsOffice: ``div.style = border-top:solid;`` delimitates the beginning of
 113        a quote; detecting by finding WordSection1 of MsoNormal
 114      - Hotmail: ``hr.stopSpelling`` delimitates the beginning of a quote; detect
 115        Hotmail by funding ``SkyDrivePlaceholder``
 116
 117     :param string html: sanitized html; tags like html or head should not
 118                         be present in the html string. This method therefore
 119                         takes as input html code coming from a sanitized source,
 120                         like fields.html.
 121     :param boolean remove: remove the html code that is unwanted; otherwise it
 122                            is only flagged and tagged
 123     :param boolean shorten: shorten the html; every excessing content will
 124                             be flagged as to remove
 125     :param int max_length: if shortening, maximum number of characters before
 126                            shortening
 127     """
 128     def _replace_matching_regex(regex, source, replace=''):
 129         """ Replace all matching expressions in source by replace """
 130         if not source:
 131             return source
 132         dest = ''
 133         idx = 0
 134         for item in re.finditer(regex, source):
 135             dest += source[idx:item.start()] + replace
 136             idx = item.end()
 137         dest += source[idx:]
 138         return dest
 139
 140     def _create_node(tag, text, tail=None, attrs={}):
 141         new_node = etree.Element(tag)
 142         new_node.text = text
 143         new_node.tail = tail
 144         for key, val in attrs.iteritems():
 145             new_node.set(key, val)
 146         return new_node
 147
 148     def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs={}):
 149         new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs)
 150         node.insert(index, new_node)
 151         return new_node
 152
 153     def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs={}):
 154         text = node.text or ''
 155         if not re.search(regex, text):
 156             return
 157
 158         cur_node = node
 159         node.text = ''
 160         idx, iteration = 0, 0
 161         for item in re.finditer(regex, text):
 162             if iteration == 0:
 163                 cur_node.text = text[idx:item.start()]
 164             else:
 165                 _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
 166             new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)
 167
 168             cur_node = new_node
 169             idx = item.end()
 170             iteration += 1
 171         new_node = _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})
 172
 173     if not html or not isinstance(html, basestring):
 174         return html
 175     html = ustr(html)
 176
 177     # Pre processing
 178     # ------------------------------------------------------------
 179     # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'
 180
 181     # html: remove encoding attribute inside tags
 182     doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
 183     html = doctype.sub(r"", html)
 184
 185     # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
 186     br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
 187     html = _replace_matching_regex(br_div_tags, html, '<br />')
 188
 189     # form a tree
 190     root = lxml.html.fromstring(html)
 191     if not len(root) and root.text is None and root.tail is None:
 192         html = '<div>%s</div>' % html
 193         root = lxml.html.fromstring(html)
 194
 195     # remove all tails and replace them by a span element, because managing text and tails can be a pain in the ass
 196     for node in root.getiterator():
 197         if node.tail:
 198             tail_node = _create_node('span', node.tail)
 199             node.tail = None
 200             node.addnext(tail_node)
 201
 202     # form node and tag text-based quotes and signature
 203     quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
 204     signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[^.]+)')
 205     for node in root.getiterator():
 206         _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
 207         _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})
 208
 209     # Processing
 210     # ------------------------------------------------------------
 211
 212     # tree: tag nodes
 213     # signature_begin = False  # try dynamic signature recognition
 214     quote_begin = False
 215     overlength = False
 216     cur_char_nbr = 0
 217     for node in root.getiterator():
 218         # root: try to tag the client used to write the html
 219         if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
 220             root.set('msoffice', '1')
 221         if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
 222             root.set('hotmail', '1')
 223
 224         # state of the parsing
 225         if quote_begin:
 226             node.set('in_quote', '1')
 227             node.set('tail_remove', '1')
 228         if overlength:
 229             node.set('in_overlength', '1')
 230             node.set('tail_remove', '1')
 231
 232         if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
 233             quote_begin = True
 234             node.set('in_quote', '1')
 235             node.set('tail_remove', '1')
 236         if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
 237             quote_begin = True
 238             node.set('in_quote', '1')
 239             node.set('tail_remove', '1')
 240
 241         # shorten:
 242         # 1/ truncate the text at the next available space
 243         # 2/ create a 'read more' node, next to current node
 244         # 3/ add the truncated text in a new node, next to 'read more' node
 245         if shorten and not overlength and cur_char_nbr + len(node.text or '') > max_length:
 246             overlength = True
 247             # truncate text
 248             innertext = node.text[0:(max_length - cur_char_nbr)]
 249             outertext = node.text[(max_length - cur_char_nbr):]
 250             stop_idx = outertext.find(' ')
 251             if stop_idx == -1:
 252                 stop_idx = len(outertext)
 253             node.text = innertext + outertext[0:stop_idx]
 254             # create <span> ... <a href="#">read more</a></span> node
 255             read_more_node = _create_node('span', ' ... ', None, {'class': 'oe_mail_expand'})
 256             read_more_link_node = _create_node('a', 'read more', None, {'href': '#', 'class': 'oe_mail_expand'})
 257             read_more_node.append(read_more_link_node)
 258             # create outertext node
 259             new_node = _create_node('span', outertext[stop_idx:])
 260             # add newly created nodes in dom
 261             node.addnext(new_node)
 262             node.addnext(read_more_node)
 263             # tag node
 264             new_node.set('in_overlength', '1')
 265
 266             cur_char_nbr += len(node.text or '')
 267
 268         if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
 269             node.set('in_quote', '1')
 270
 271     # Post processing
 272     # ------------------------------------------------------------
 273
 274     to_remove = []
 275     for node in root.getiterator():
 276         if node.get('in_quote') or node.get('in_overlength'):
 277             # copy the node tail into parent text
 278             if node.tail and not node.get('tail_remove'):
 279                 parent = node.getparent()
 280                 parent.tail = node.tail + (parent.tail or '')
 281             to_remove.append(node)
 282         if node.get('tail_remove'):
 283             node.tail = ''
 284     for node in to_remove:
 285         if remove:
 286             node.getparent().remove(node)
 287         else:
 288             if not 'oe_mail_expand' in node.get('class', ''):  # trick: read more link should be displayed even if it's in overlength
 289                 node_class = node.get('class', '') + ' ' + 'oe_mail_cleaned'
 290                 node.set('class', node_class)
 291
 292     # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
 293     html = etree.tostring(root, pretty_print=False)
 294     linebreaks = re.compile(r'<span>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
 295     html = _replace_matching_regex(linebreaks, html, '\n')
 296
 297     return html
 298
 299
 300 #----------------------------------------------------------
 301 # HTML/Text management
 302 #----------------------------------------------------------
 303
 304 def html2plaintext(html, body_id=None, encoding='utf-8'):
 305     """ From an HTML text, convert the HTML to plain text.
 306     If @param body_id is provided then this is the tag where the
 307     body (not necessarily <body>) starts.
 308     """
 309     ## (c) Fry-IT, www.fry-it.com, 2007
 310     ## <peter@fry-it.com>
 311     ## download here: http://www.peterbe.com/plog/html2plaintext
 312
 313     html = ustr(html)
 314     tree = etree.fromstring(html, parser=etree.HTMLParser())
 315
 316     if body_id is not None:
 317         source = tree.xpath('//*[@id=%s]' % (body_id,))
 318     else:
 319         source = tree.xpath('//body')
 320     if len(source):
 321         tree = source[0]
 322
 323     url_index = []
 324     i = 0
 325     for link in tree.findall('.//a'):
 326         url = link.get('href')
 327         if url:
 328             i += 1
 329             link.tag = 'span'
 330             link.text = '%s [%s]' % (link.text, i)
 331             url_index.append(url)
 332
 333     html = ustr(etree.tostring(tree, encoding=encoding))
 334     # \r char is converted into &#13;, must remove it
 335     html = html.replace('&#13;', '')
 336
 337     html = html.replace('<strong>', '*').replace('</strong>', '*')
 338     html = html.replace('<b>', '*').replace('</b>', '*')
 339     html = html.replace('<h3>', '*').replace('</h3>', '*')
 340     html = html.replace('<h2>', '**').replace('</h2>', '**')
 341     html = html.replace('<h1>', '**').replace('</h1>', '**')
 342     html = html.replace('<em>', '/').replace('</em>', '/')
 343     html = html.replace('<tr>', '\n')
 344     html = html.replace('</p>', '\n')
 345     html = re.sub('<br\s*/?>', '\n', html)
 346     html = re.sub('<.*?>', ' ', html)
 347     html = html.replace(' ' * 2, ' ')
 348
 349     # strip all lines
 350     html = '\n'.join([x.strip() for x in html.splitlines()])
 351     html = html.replace('\n' * 2, '\n')
 352
 353     for i, url in enumerate(url_index):
 354         if i == 0:
 355             html += '\n\n'
 356         html += ustr('[%s] %s\n') % (i + 1, url)
 357
 358     return html
 359
 360 def plaintext2html(text, container_tag=False):
 361     """ Convert plaintext into html. Content of the text is escaped to manage
 362         html entities, using cgi.escape().
 363         - all \n,\r are replaced by <br />
 364         - enclose content into <p>
 365         - 2 or more consecutive <br /> are considered as paragraph breaks
 366
 367         :param string container_tag: container of the html; by default the
 368             content is embedded into a <div>
 369     """
 370     text = cgi.escape(ustr(text))
 371
 372     # 1. replace \n and \r
 373     text = text.replace('\n', '<br/>')
 374     text = text.replace('\r', '<br/>')
 375
 376     # 2-3: form paragraphs
 377     idx = 0
 378     final = '<p>'
 379     br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
 380     for item in re.finditer(br_tags, text):
 381         final += text[idx:item.start()] + '</p><p>'
 382         idx = item.end()
 383     final += text[idx:] + '</p>'
 384
 385     # 4. container
 386     if container_tag:
 387         final = '<%s>%s</%s>' % (container_tag, final, container_tag)
 388     return ustr(final)
 389
 390 def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
 391     """ Append extra content at the end of an HTML snippet, trying
 392         to locate the end of the HTML document (</body>, </html>, or
 393         EOF), and converting the provided content in html unless ``plaintext``
 394         is False.
 395         Content conversion can be done in two ways:
 396         - wrapping it into a pre (preserve=True)
 397         - use plaintext2html (preserve=False, using container_tag to wrap the
 398             whole content)
 399         A side-effect of this method is to coerce all HTML tags to
 400         lowercase in ``html``, and strip enclosing <html> or <body> tags in
 401         content if ``plaintext`` is False.
 402
 403         :param str html: html tagsoup (doesn't have to be XHTML)
 404         :param str content: extra content to append
 405         :param bool plaintext: whether content is plaintext and should
 406             be wrapped in a <pre/> tag.
 407         :param bool preserve: if content is plaintext, wrap it into a <pre>
 408             instead of converting it into html
 409     """
 410     html = ustr(html)
 411     if plaintext and preserve:
 412         content = u'\n<pre>%s</pre>\n' % ustr(content)
 413     elif plaintext:
 414         content = '\n%s\n' % plaintext2html(content, container_tag)
 415     else:
 416         content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
 417         content = u'\n%s\n' % ustr(content)
 418     # Force all tags to lowercase
 419     html = re.sub(r'(</?)\W*(\w+)([ >])',
 420         lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
 421     insert_location = html.find('</body>')
 422     if insert_location == -1:
 423         insert_location = html.find('</html>')
 424     if insert_location == -1:
 425         return '%s%s' % (html, content)
 426     return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
 427
 428 #----------------------------------------------------------
 429 # Emails
 430 #----------------------------------------------------------
 431
 432 # matches any email in a body of text
 433 email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", re.VERBOSE)
 434
 435 # matches a string containing only one email
 436 single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$""", re.VERBOSE)
 437
 438 res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
 439 command_re = re.compile("^Set-([a-z]+) *: *(.+)$", re.I + re.UNICODE)
 440
 441 # Updated in 7.0 to match the model name as well
 442 # Typical form of references is <timestamp-openerp-record_id-model_name@domain>
 443 # group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
 444 reference_re = re.compile("<.*-open(?:object|erp)-(\\d+)(?:-([\w.]+))?.*@(.*)>", re.UNICODE)
 445
 446 # Bounce regex
 447 # Typical form of bounce is bounce-128-crm.lead-34@domain
 448 # group(1) = the mail ID; group(2) = the model (if any); group(3) = the record ID
 449 bounce_re = re.compile("[\w]+-(\d+)-?([\w.]+)?-?(\d+)?", re.UNICODE)
 450
 451 def generate_tracking_message_id(res_id):
 452     """Returns a string that can be used in the Message-ID RFC822 header field
 453
 454        Used to track the replies related to a given object thanks to the "In-Reply-To"
 455        or "References" fields that Mail User Agents will set.
 456     """
 457     try:
 458         rnd = random.SystemRandom().random()
 459     except NotImplementedError:
 460         rnd = random.random()
 461     rndstr = ("%.15f" % rnd)[2:]
 462     return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
 463
 464 def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
 465                attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
 466                smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
 467     """Low-level function for sending an email (deprecated).
 468
 469     :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
 470     :param email_from: A string used to fill the `From` header, if falsy,
 471                        config['email_from'] is used instead.  Also used for
 472                        the `Reply-To` header if `reply_to` is not provided
 473     :param email_to: a sequence of addresses to send the mail to.
 474     """
 475
 476     # If not cr, get cr from current thread database
 477     local_cr = None
 478     if not cr:
 479         db_name = getattr(threading.currentThread(), 'dbname', None)
 480         if db_name:
 481             local_cr = cr = openerp.registry(db_name).db.cursor()
 482         else:
 483             raise Exception("No database cursor found, please pass one explicitly")
 484
 485     # Send Email
 486     try:
 487         mail_server_pool = openerp.registry(cr.dbname)['ir.mail_server']
 488         res = False
 489         # Pack Message into MIME Object
 490         email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
 491                    attachments, message_id, references, openobject_id, subtype, headers=headers)
 492
 493         res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
 494                        smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
 495                        smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
 496     except Exception:
 497         _logger.exception("tools.email_send failed to deliver email")
 498         return False
 499     finally:
 500         if local_cr:
 501             cr.close()
 502     return res
 503
 504 def email_split(text):
 505     """ Return a list of the email addresses found in ``text`` """
 506     if not text:
 507         return []
 508     return re.findall(r'([^ ,<@]+@[^> ,]+)', text)