openerp/tools/mail.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Business Applications
   5 #    Copyright (C) 2012 OpenERP S.A. (<http://openerp.com>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21
  22 from lxml import etree
  23 import cgi
  24 import logging
  25 import lxml.html
  26 import lxml.html.clean as clean
  27 import random
  28 import re
  29 import socket
  30 import threading
  31 import time
  32
  33 import openerp
  34 from openerp.loglevels import ustr
  35
  36 _logger = logging.getLogger(__name__)
  37
  38
  39 #----------------------------------------------------------
  40 # HTML Sanitizer
  41 #----------------------------------------------------------
  42
  43 tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
  44 tags_to_remove = ['html', 'body', 'font']
  45
  46
  47 def html_sanitize(src):
  48     if not src:
  49         return src
  50     src = ustr(src, errors='replace')
  51
  52     # html encode email tags
  53     part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
  54     src = part.sub(lambda m: cgi.escape(m.group(1)), src)
  55
  56     # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
  57     try:
  58         cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, kill_tags=tags_to_kill, remove_tags=tags_to_remove)
  59         cleaned = cleaner.clean_html(src)
  60     except TypeError, e:
  61         # lxml.clean version < 2.3.1 does not have a kill_tags attribute
  62         # to remove in 2014
  63         cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, remove_tags=tags_to_kill + tags_to_remove)
  64         cleaned = cleaner.clean_html(src)
  65     except etree.ParserError, e:
  66         _logger.warning('html_sanitize: ParserError "%s" obtained when sanitizing "%s"' % (e, src))
  67         cleaned = '<p>ParserError when sanitizing</p>'
  68     except Exception, e:
  69         _logger.warning('html_sanitize: unknown error "%s" obtained when sanitizing "%s"' % (e, src))
  70         cleaned = '<p>Unknown error when sanitizing</p>'
  71     return cleaned
  72
  73
  74 #----------------------------------------------------------
  75 # HTML Cleaner
  76 #----------------------------------------------------------
  77
def html_email_clean(html, remove=False, shorten=False, max_length=300):
    """ html_email_clean: clean the html by doing the following steps:

     - try to strip email quotes, by removing blockquotes or by using some
       client-specific heuristics
     - try to strip signatures
     - shorten the html to a maximum number of characters if requested

    Some specific use cases:

     - MsOffice: a ``div`` whose style contains ``border-top:solid`` delimits
       the beginning of a quote; MsOffice is detected by finding
       ``WordSection1`` or ``MsoNormal`` in a class attribute
     - Hotmail: an ``hr`` whose class or id contains ``stopSpelling`` delimits
       the beginning of a quote; Hotmail is detected by finding
       ``SkyDrivePlaceholder`` in a class or id attribute

    :param string html: sanitized html; tags like html or head should not
                        be present in the html string. This method therefore
                        takes as input html code coming from a sanitized source,
                        like fields.html.
    :param boolean remove: remove the html code that is unwanted; otherwise it
                           is only flagged (``in_quote`` / ``in_overlength``
                           attributes) and tagged with the ``oe_mail_cleaned``
                           class
    :param boolean shorten: shorten the html; all exceeding content will
                            be flagged as to remove
    :param int max_length: if shortening, maximum number of characters before
                           shortening
    :return: the cleaned html as serialized by ``etree.tostring``; a falsy or
             non-string ``html`` is returned unchanged
    """
    def _replace_matching_regex(regex, source, replace=''):
        """ Replace all matching expressions in source by replace """
        if not source:
            return source
        dest = ''
        idx = 0
        for item in re.finditer(regex, source):
            dest += source[idx:item.start()] + replace
            idx = item.end()
        dest += source[idx:]
        return dest

    # Build a detached element with the given tag, text, tail and attributes.
    # The default ``attrs`` dict is only read, never modified.
    def _create_node(tag, text, tail=None, attrs={}):
        new_node = etree.Element(tag)
        new_node.text = text
        new_node.tail = tail
        for key, val in attrs.iteritems():
            new_node.set(key, val)
        return new_node

    # Create a node (see _create_node) and insert it among the children of
    # ``node`` at position ``index``; the new node is returned.
    def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs={}):
        new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs)
        node.insert(index, new_node)
        return new_node

    # Split the text of ``node`` around every match of ``regex``: the text
    # before the first match stays in node.text, every match becomes a child
    # element carrying ``new_node_attrs``, and the text between two matches
    # becomes a child element without attributes. Nothing is changed when the
    # regex does not match the node text.
    def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs={}):
        text = node.text or ''
        if not re.search(regex, text):
            return

        cur_node = node
        node.text = ''
        idx, iteration = 0, 0
        for item in re.finditer(regex, text):
            if iteration == 0:
                # text located before the first match remains the node text
                cur_node.text = text[idx:item.start()]
            else:
                # text located between the previous match and this one
                _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
            # the matching text itself, tagged with the requested attributes
            new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)

            cur_node = new_node
            idx = item.end()
            iteration += 1
        # remaining text located after the last match
        # NOTE(review): index -1 presumably follows list.insert semantics
        # (insertion before the last child, not an append) -- confirm that the
        # remaining text ends up at the intended position.
        new_node = _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})

    # nothing to clean: give back the input untouched
    if not html or not isinstance(html, basestring):
        return html
    html = ustr(html)

    # Pre processing
    # ------------------------------------------------------------
    # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'

    # html: remove encoding attribute inside tags
    # (the whole match, from the opening '<' of the tag to its closing '>',
    # is replaced by an empty string)
    doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
    html = doctype.sub(r"", html)

    # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
    br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
    html = _replace_matching_regex(br_div_tags, html, '<br />')

    # form a tree
    root = lxml.html.fromstring(html)
    # a root without children, text and tail: wrap the html into a div and parse it again
    if not len(root) and root.text is None and root.tail is None:
        html = '<div>%s</div>' % html
        root = lxml.html.fromstring(html)

    # remove all tails and replace them by a span element, because handling
    # both text and tails makes the processing below much harder
    for node in root.getiterator():
        if node.tail:
            tail_node = _create_node('span', node.tail)
            node.tail = None
            node.addnext(tail_node)

    # form node and tag text-based quotes and signature
    # quote: a line break followed by one or more '>' and the rest of the line
    quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
    # signature: two or more '-', an optional whitespace, one or two line
    # break characters, then everything up to the next '.'
    signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[^.]+)')
    for node in root.getiterator():
        _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
        _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})

    # Processing
    # ------------------------------------------------------------

    # tree: tag nodes
    # signature_begin = False  # try dynamic signature recognition
    quote_begin = False  # becomes True once a client-specific quote delimiter has been met
    overlength = False  # becomes True once the text has been truncated
    cur_char_nbr = 0
    for node in root.getiterator():
        # root: try to tag the client used to write the html
        if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
            root.set('msoffice', '1')
        if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
            root.set('hotmail', '1')

        # state of the parsing: every node met after the beginning of a quote
        # or after the truncation is flagged
        if quote_begin:
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if overlength:
            node.set('in_overlength', '1')
            node.set('tail_remove', '1')

        # client-specific quote delimiters: the delimiter itself is flagged too
        if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')

        # shorten:
        # 1/ truncate the text at the next available space
        # 2/ create a 'read more' node, next to current node
        # 3/ add the truncated text in a new node, next to 'read more' node
        if shorten and not overlength and cur_char_nbr + len(node.text or '') > max_length:
            overlength = True
            # truncate text
            innertext = node.text[0:(max_length - cur_char_nbr)]
            outertext = node.text[(max_length - cur_char_nbr):]
            stop_idx = outertext.find(' ')
            if stop_idx == -1:
                stop_idx = len(outertext)
            node.text = innertext + outertext[0:stop_idx]
            # create <span> ... <a href="#">read more</a></span> node
            read_more_node = _create_node('span', ' ... ', None, {'class': 'oe_mail_expand'})
            read_more_link_node = _create_node('a', 'read more', None, {'href': '#', 'class': 'oe_mail_expand'})
            read_more_node.append(read_more_link_node)
            # create outertext node
            new_node = _create_node('span', outertext[stop_idx:])
            # add newly created nodes in dom; addnext is called twice on the
            # same node, so read_more_node ends up before new_node
            node.addnext(new_node)
            node.addnext(read_more_node)
            # tag node
            new_node.set('in_overlength', '1')

            # NOTE(review): this increment belongs to the ``if`` block above,
            # so it only runs on the iteration that triggers the shortening and
            # cur_char_nbr is still 0 whenever the condition is evaluated --
            # confirm whether a count accumulated over all nodes was intended.
            cur_char_nbr += len(node.text or '')

        # blockquotes and text-based quotes / signatures are flagged as quotes
        if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
            node.set('in_quote', '1')

    # Post processing
    # ------------------------------------------------------------

    to_remove = []
    for node in root.getiterator():
        if node.get('in_quote') or node.get('in_overlength'):
            # prepend the node tail to the tail of its parent, unless the tail
            # has been flagged as to remove
            if node.tail and not node.get('tail_remove'):
                parent = node.getparent()
                parent.tail = node.tail + (parent.tail or '')
            to_remove.append(node)
        if node.get('tail_remove'):
            node.tail = ''
    for node in to_remove:
        if remove:
            # unwanted content is dropped from the tree
            node.getparent().remove(node)
        else:
            # unwanted content is kept, but tagged with a css class
            if not 'oe_mail_expand' in node.get('class', ''):  # trick: read more link should be displayed even if it's in overlength
                node_class = node.get('class', '') + ' ' + 'oe_mail_cleaned'
                node.set('class', node_class)

    # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
    html = etree.tostring(root, pretty_print=False)
    linebreaks = re.compile(r'<span>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
    html = _replace_matching_regex(linebreaks, html, '\n')

    return html
 274
 275
 276 #----------------------------------------------------------
 277 # HTML/Text management
 278 #----------------------------------------------------------
 279
 280 def html2plaintext(html, body_id=None, encoding='utf-8'):
 281     """ From an HTML text, convert the HTML to plain text.
 282     If @param body_id is provided then this is the tag where the
 283     body (not necessarily <body>) starts.
 284     """
 285     ## (c) Fry-IT, www.fry-it.com, 2007
 286     ## <peter@fry-it.com>
 287     ## download here: http://www.peterbe.com/plog/html2plaintext
 288
 289     html = ustr(html)
 290     tree = etree.fromstring(html, parser=etree.HTMLParser())
 291
 292     if body_id is not None:
 293         source = tree.xpath('//*[@id=%s]' % (body_id,))
 294     else:
 295         source = tree.xpath('//body')
 296     if len(source):
 297         tree = source[0]
 298
 299     url_index = []
 300     i = 0
 301     for link in tree.findall('.//a'):
 302         url = link.get('href')
 303         if url:
 304             i += 1
 305             link.tag = 'span'
 306             link.text = '%s [%s]' % (link.text, i)
 307             url_index.append(url)
 308
 309     html = ustr(etree.tostring(tree, encoding=encoding))
 310     # \r char is converted into &#13;, must remove it
 311     html = html.replace('&#13;', '')
 312
 313     html = html.replace('<strong>', '*').replace('</strong>', '*')
 314     html = html.replace('<b>', '*').replace('</b>', '*')
 315     html = html.replace('<h3>', '*').replace('</h3>', '*')
 316     html = html.replace('<h2>', '**').replace('</h2>', '**')
 317     html = html.replace('<h1>', '**').replace('</h1>', '**')
 318     html = html.replace('<em>', '/').replace('</em>', '/')
 319     html = html.replace('<tr>', '\n')
 320     html = html.replace('</p>', '\n')
 321     html = re.sub('<br\s*/?>', '\n', html)
 322     html = re.sub('<.*?>', ' ', html)
 323     html = html.replace(' ' * 2, ' ')
 324
 325     # strip all lines
 326     html = '\n'.join([x.strip() for x in html.splitlines()])
 327     html = html.replace('\n' * 2, '\n')
 328
 329     for i, url in enumerate(url_index):
 330         if i == 0:
 331             html += '\n\n'
 332         html += ustr('[%s] %s\n') % (i + 1, url)
 333
 334     return html
 335
 336 def plaintext2html(text, container_tag=False):
 337     """ Convert plaintext into html. Content of the text is escaped to manage
 338         html entities, using cgi.escape().
 339         - all \n,\r are replaced by <br />
 340         - enclose content into <p>
 341         - 2 or more consecutive <br /> are considered as paragraph breaks
 342
 343         :param string container_tag: container of the html; by default the
 344             content is embedded into a <div>
 345     """
 346     text = cgi.escape(ustr(text))
 347
 348     # 1. replace \n and \r
 349     text = text.replace('\n', '<br/>')
 350     text = text.replace('\r', '<br/>')
 351
 352     # 2-3: form paragraphs
 353     idx = 0
 354     final = '<p>'
 355     br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
 356     for item in re.finditer(br_tags, text):
 357         final += text[idx:item.start()] + '</p><p>'
 358         idx = item.end()
 359     final += text[idx:] + '</p>'
 360
 361     # 4. container
 362     if container_tag:
 363         final = '<%s>%s</%s>' % (container_tag, final, container_tag)
 364     return ustr(final)
 365
 366 def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
 367     """ Append extra content at the end of an HTML snippet, trying
 368         to locate the end of the HTML document (</body>, </html>, or
 369         EOF), and converting the provided content in html unless ``plaintext``
 370         is False.
 371         Content conversion can be done in two ways:
 372         - wrapping it into a pre (preserve=True)
 373         - use plaintext2html (preserve=False, using container_tag to wrap the
 374             whole content)
 375         A side-effect of this method is to coerce all HTML tags to
 376         lowercase in ``html``, and strip enclosing <html> or <body> tags in
 377         content if ``plaintext`` is False.
 378
 379         :param str html: html tagsoup (doesn't have to be XHTML)
 380         :param str content: extra content to append
 381         :param bool plaintext: whether content is plaintext and should
 382             be wrapped in a <pre/> tag.
 383         :param bool preserve: if content is plaintext, wrap it into a <pre>
 384             instead of converting it into html
 385     """
 386     html = ustr(html)
 387     if plaintext and preserve:
 388         content = u'\n<pre>%s</pre>\n' % ustr(content)
 389     elif plaintext:
 390         content = '\n%s\n' % plaintext2html(content, container_tag)
 391     else:
 392         content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
 393         content = u'\n%s\n' % ustr(content)
 394     # Force all tags to lowercase
 395     html = re.sub(r'(</?)\W*(\w+)([ >])',
 396         lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
 397     insert_location = html.find('</body>')
 398     if insert_location == -1:
 399         insert_location = html.find('</html>')
 400     if insert_location == -1:
 401         return '%s%s' % (html, content)
 402     return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
 403
 404 #----------------------------------------------------------
 405 # Emails
 406 #----------------------------------------------------------
 407
 408 # matches any email in a body of text
 409 email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", re.VERBOSE)
 410
 411 # matches a string containing only one email
 412 single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$""", re.VERBOSE)
 413
 414 res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
 415 command_re = re.compile("^Set-([a-z]+) *: *(.+)$", re.I + re.UNICODE)
 416
 417 # Updated in 7.0 to match the model name as well
 418 # Typical form of references is <timestamp-openerp-record_id-model_name@domain>
 419 # group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
 420 reference_re = re.compile("<.*-open(?:object|erp)-(\\d+)(?:-([\w.]+))?.*@(.*)>", re.UNICODE)
 421
 422 def generate_tracking_message_id(res_id):
 423     """Returns a string that can be used in the Message-ID RFC822 header field
 424
 425        Used to track the replies related to a given object thanks to the "In-Reply-To"
 426        or "References" fields that Mail User Agents will set.
 427     """
 428     try:
 429         rnd = random.SystemRandom().random()
 430     except NotImplementedError:
 431         rnd = random.random()
 432     rndstr = ("%.15f" % rnd)[2:]
 433     return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
 434
 435 def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
 436                attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
 437                smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
 438     """Low-level function for sending an email (deprecated).
 439
 440     :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
 441     :param email_from: A string used to fill the `From` header, if falsy,
 442                        config['email_from'] is used instead.  Also used for
 443                        the `Reply-To` header if `reply_to` is not provided
 444     :param email_to: a sequence of addresses to send the mail to.
 445     """
 446
 447     # If not cr, get cr from current thread database
 448     local_cr = None
 449     if not cr:
 450         db_name = getattr(threading.currentThread(), 'dbname', None)
 451         if db_name:
 452             local_cr = cr = openerp.registry(db_name).db.cursor()
 453         else:
 454             raise Exception("No database cursor found, please pass one explicitly")
 455
 456     # Send Email
 457     try:
 458         mail_server_pool = openerp.registry(cr.dbname)['ir.mail_server']
 459         res = False
 460         # Pack Message into MIME Object
 461         email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
 462                    attachments, message_id, references, openobject_id, subtype, headers=headers)
 463
 464         res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
 465                        smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
 466                        smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
 467     except Exception:
 468         _logger.exception("tools.email_send failed to deliver email")
 469         return False
 470     finally:
 471         if local_cr:
 472             cr.close()
 473     return res
 474
 475 def email_split(text):
 476     """ Return a list of the email addresses found in ``text`` """
 477     if not text:
 478         return []
 479     return re.findall(r'([^ ,<@]+@[^> ,]+)', text)