openerp/tools/mail.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Business Applications
   5 #    Copyright (C) 2012 OpenERP S.A. (<http://openerp.com>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21
  22 from lxml import etree
  23 import cgi
  24 import logging
  25 import lxml.html
  26 import lxml.html.clean as clean
  27 import random
  28 import re
  29 import socket
  30 import threading
  31 import time
  32
  33 import openerp
  34 from openerp.loglevels import ustr
  35
  36 _logger = logging.getLogger(__name__)
  37
  38
  39 #----------------------------------------------------------
  40 # HTML Sanitizer
  41 #----------------------------------------------------------
  42
  43 tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
  44 tags_to_remove = ['html', 'body', 'font']
  45
  46
  47 def html_sanitize(src):
  48     if not src:
  49         return src
  50     src = ustr(src, errors='replace')
  51
  52     # html encode email tags
  53     part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
  54     src = part.sub(lambda m: cgi.escape(m.group(1)), src)
  55
  56     # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
  57     try:
  58         cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, kill_tags=tags_to_kill, remove_tags=tags_to_remove)
  59         cleaned = cleaner.clean_html(src)
  60     except TypeError, e:
  61         # lxml.clean version < 2.3.1 does not have a kill_tags attribute
  62         # to remove in 2014
  63         cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, remove_tags=tags_to_kill + tags_to_remove)
  64         cleaned = cleaner.clean_html(src)
  65     except etree.ParserError, e:
  66         _logger.warning('html_sanitize: ParserError "%s" obtained when sanitizing "%s"' % (e, src))
  67         cleaned = '<p>ParserError when sanitizing</p>'
  68     except Exception, e:
  69         _logger.warning('html_sanitize: unknown error "%s" obtained when sanitizing "%s"' % (e, src))
  70         cleaned = '<p>Unknown error when sanitizing</p>'
  71     return cleaned
  72
  73
  74 #----------------------------------------------------------
  75 # HTML Cleaner
  76 #----------------------------------------------------------
  77
  78 def html_email_clean(html, remove_unwanted=False, use_max_length=False, max_length=300):
  79     """ html_email_clean: clean the html to display in the web client.
  80         - strip email quotes (remove blockquote nodes)
  81         - strip signatures (remove --\n{\n)Blahblah), by replacing <br> by
  82             \n to avoid ignoring signatures converted into html
  83
  84         :param string html: sanitized html; tags like html or head should not
  85             be present in the html string. This method therefore takes as input
  86             html code coming from a sanitized source, like fields.html.
  87     """
  88     def _replace_matching_regex(regex, source, replace=''):
  89         if not source:
  90             return source
  91         dest = ''
  92         idx = 0
  93         for item in re.finditer(regex, source):
  94             dest += source[idx:item.start()] + replace
  95             idx = item.end()
  96         dest += source[idx:]
  97         return dest
  98
  99     def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs=None):
 100         # print '\t_tag_matching_regex_in_text'
 101         text = node.text or ''
 102         node.text = ''
 103         cur_node = node
 104         idx = 0
 105         caca = 0
 106         for item in re.finditer(regex, text):
 107             # print '\t\tfound', item.start(), item.end(), '-', text[item.start():item.end()], '-'
 108             if caca == 0:
 109                 cur_node.text = text[idx:item.start()]
 110             else:
 111                 cur_node.tail = text[idx:item.start()]
 112
 113             # create element
 114             new_node = etree.Element(new_node_tag)
 115             new_node.text = text[item.start():item.end()]
 116             for key, val in new_node_attrs.iteritems():
 117                 new_node.set(key, val)
 118
 119             # insert element in DOM
 120             node.insert(caca, new_node)
 121             cur_node = new_node
 122             idx = item.end()
 123             caca += 1
 124         if caca == 0:
 125             cur_node.text = (cur_node.text or '') + text[idx:]
 126         else:
 127             cur_node.tail = text[idx:] + (cur_node.tail or '')
 128
 129     if not html or not isinstance(html, basestring):
 130         return html
 131     html = ustr(html)
 132
 133     # Pre processing
 134     # ------------------------------------------------------------
 135
 136     # --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'
 137
 138     # html: remove encoding attribute inside tags
 139     doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
 140     html = doctype.sub(r"", html)
 141
 142     # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
 143     br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)')
 144     html = _replace_matching_regex(br_div_tags, html, '<br />')
 145
 146     # html: <br[ /]> -> \n, to de-obfuscate the tree
 147     br_tags = re.compile(r'([<]\s*[bB][rR]\s*\/?[>])')
 148     html = _replace_matching_regex(br_tags, html, '__BR_TAG__')
 149
 150     # form a tree
 151     root = lxml.html.fromstring(html)
 152     if not len(root) and root.text is None and root.tail is None:
 153         html = '<div>%s</div>' % html
 154         root = lxml.html.fromstring(html)
 155
 156     # form node and tag text-based quotes and signature
 157     quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
 158     signature = re.compile(r'([-]{2}[\s]?[\r\n]{1,2}[^\z]+)')
 159     for node in root.getiterator():
 160         _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
 161         _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})
 162
 163     # Processing
 164     # ------------------------------------------------------------
 165
 166     # tree: tag nodes
 167     quote_begin = False
 168     for node in root.getiterator():
 169         if node.get('class') in ['WordSection1', 'MsoNormal']:
 170             root.set('msoffice', '1')
 171         if node.get('class') in ['SkyDrivePlaceholder'] or node.get('id') in ['SkyDrivePlaceholder']:
 172             root.set('hotmail', '1')
 173
 174         if quote_begin:
 175             node.set('quote', '1')
 176
 177         if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
 178             quote_begin = True
 179         if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
 180             quote_begin = True
 181
 182         if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
 183             node.set('remove', '1')
 184         if quote_begin:
 185             node.set('remove', '1')
 186             node.set('tail_remove', '1')
 187
 188     # Post processing
 189     # ------------------------------------------------------------
 190
 191     if remove_unwanted:
 192         to_delete = []
 193         for node in root.getiterator():
 194             if node.get('remove'):
 195                 # copy the node tail into parent text
 196                 if node.tail and not node.get('tail_remove'):
 197                     parent = node.getparent()
 198                     parent.tail = node.tail + (parent.tail or '')
 199                 to_delete.append(node)
 200         for node in to_delete:
 201             node.getparent().remove(node)
 202
 203     # html: \n back to <br/>
 204     html = etree.tostring(root, pretty_print=True)
 205     html = html.replace('__BR_TAG__', '<br />')
 206
 207     return html
 208
 209
 210 #----------------------------------------------------------
 211 # HTML/Text management
 212 #----------------------------------------------------------
 213
 214 def html2plaintext(html, body_id=None, encoding='utf-8'):
 215     """ From an HTML text, convert the HTML to plain text.
 216     If @param body_id is provided then this is the tag where the
 217     body (not necessarily <body>) starts.
 218     """
 219     ## (c) Fry-IT, www.fry-it.com, 2007
 220     ## <peter@fry-it.com>
 221     ## download here: http://www.peterbe.com/plog/html2plaintext
 222
 223     html = ustr(html)
 224     tree = etree.fromstring(html, parser=etree.HTMLParser())
 225
 226     if body_id is not None:
 227         source = tree.xpath('//*[@id=%s]' % (body_id,))
 228     else:
 229         source = tree.xpath('//body')
 230     if len(source):
 231         tree = source[0]
 232
 233     url_index = []
 234     i = 0
 235     for link in tree.findall('.//a'):
 236         url = link.get('href')
 237         if url:
 238             i += 1
 239             link.tag = 'span'
 240             link.text = '%s [%s]' % (link.text, i)
 241             url_index.append(url)
 242
 243     html = ustr(etree.tostring(tree, encoding=encoding))
 244     # \r char is converted into &#13;, must remove it
 245     html = html.replace('&#13;', '')
 246
 247     html = html.replace('<strong>', '*').replace('</strong>', '*')
 248     html = html.replace('<b>', '*').replace('</b>', '*')
 249     html = html.replace('<h3>', '*').replace('</h3>', '*')
 250     html = html.replace('<h2>', '**').replace('</h2>', '**')
 251     html = html.replace('<h1>', '**').replace('</h1>', '**')
 252     html = html.replace('<em>', '/').replace('</em>', '/')
 253     html = html.replace('<tr>', '\n')
 254     html = html.replace('</p>', '\n')
 255     html = re.sub('<br\s*/?>', '\n', html)
 256     html = re.sub('<.*?>', ' ', html)
 257     html = html.replace(' ' * 2, ' ')
 258
 259     # strip all lines
 260     html = '\n'.join([x.strip() for x in html.splitlines()])
 261     html = html.replace('\n' * 2, '\n')
 262
 263     for i, url in enumerate(url_index):
 264         if i == 0:
 265             html += '\n\n'
 266         html += ustr('[%s] %s\n') % (i + 1, url)
 267
 268     return html
 269
 270 def plaintext2html(text, container_tag=False):
 271     """ Convert plaintext into html. Content of the text is escaped to manage
 272         html entities, using cgi.escape().
 273         - all \n,\r are replaced by <br />
 274         - enclose content into <p>
 275         - 2 or more consecutive <br /> are considered as paragraph breaks
 276
 277         :param string container_tag: container of the html; by default the
 278             content is embedded into a <div>
 279     """
 280     text = cgi.escape(ustr(text))
 281
 282     # 1. replace \n and \r
 283     text = text.replace('\n', '<br/>')
 284     text = text.replace('\r', '<br/>')
 285
 286     # 2-3: form paragraphs
 287     idx = 0
 288     final = '<p>'
 289     br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
 290     for item in re.finditer(br_tags, text):
 291         final += text[idx:item.start()] + '</p><p>'
 292         idx = item.end()
 293     final += text[idx:] + '</p>'
 294
 295     # 4. container
 296     if container_tag:
 297         final = '<%s>%s</%s>' % (container_tag, final, container_tag)
 298     return ustr(final)
 299
 300 def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
 301     """ Append extra content at the end of an HTML snippet, trying
 302         to locate the end of the HTML document (</body>, </html>, or
 303         EOF), and converting the provided content in html unless ``plaintext``
 304         is False.
 305         Content conversion can be done in two ways:
 306         - wrapping it into a pre (preserve=True)
 307         - use plaintext2html (preserve=False, using container_tag to wrap the
 308             whole content)
 309         A side-effect of this method is to coerce all HTML tags to
 310         lowercase in ``html``, and strip enclosing <html> or <body> tags in
 311         content if ``plaintext`` is False.
 312
 313         :param str html: html tagsoup (doesn't have to be XHTML)
 314         :param str content: extra content to append
 315         :param bool plaintext: whether content is plaintext and should
 316             be wrapped in a <pre/> tag.
 317         :param bool preserve: if content is plaintext, wrap it into a <pre>
 318             instead of converting it into html
 319     """
 320     html = ustr(html)
 321     if plaintext and preserve:
 322         content = u'\n<pre>%s</pre>\n' % ustr(content)
 323     elif plaintext:
 324         content = '\n%s\n' % plaintext2html(content, container_tag)
 325     else:
 326         content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
 327         content = u'\n%s\n' % ustr(content)
 328     # Force all tags to lowercase
 329     html = re.sub(r'(</?)\W*(\w+)([ >])',
 330         lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
 331     insert_location = html.find('</body>')
 332     if insert_location == -1:
 333         insert_location = html.find('</html>')
 334     if insert_location == -1:
 335         return '%s%s' % (html, content)
 336     return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
 337
 338 #----------------------------------------------------------
 339 # Emails
 340 #----------------------------------------------------------
 341
 342 # matches any email in a body of text
 343 email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", re.VERBOSE)
 344
 345 # matches a string containing only one email
 346 single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$""", re.VERBOSE)
 347
 348 res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
 349 command_re = re.compile("^Set-([a-z]+) *: *(.+)$", re.I + re.UNICODE)
 350
 351 # Updated in 7.0 to match the model name as well
 352 # Typical form of references is <timestamp-openerp-record_id-model_name@domain>
 353 # group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
 354 reference_re = re.compile("<.*-open(?:object|erp)-(\\d+)(?:-([\w.]+))?.*@(.*)>", re.UNICODE)
 355
 356 def generate_tracking_message_id(res_id):
 357     """Returns a string that can be used in the Message-ID RFC822 header field
 358
 359        Used to track the replies related to a given object thanks to the "In-Reply-To"
 360        or "References" fields that Mail User Agents will set.
 361     """
 362     try:
 363         rnd = random.SystemRandom().random()
 364     except NotImplementedError:
 365         rnd = random.random()
 366     rndstr = ("%.15f" % rnd)[2:]
 367     return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
 368
 369 def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
 370                attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
 371                smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
 372     """Low-level function for sending an email (deprecated).
 373
 374     :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
 375     :param email_from: A string used to fill the `From` header, if falsy,
 376                        config['email_from'] is used instead.  Also used for
 377                        the `Reply-To` header if `reply_to` is not provided
 378     :param email_to: a sequence of addresses to send the mail to.
 379     """
 380
 381     # If not cr, get cr from current thread database
 382     local_cr = None
 383     if not cr:
 384         db_name = getattr(threading.currentThread(), 'dbname', None)
 385         if db_name:
 386             local_cr = cr = openerp.registry(db_name).db.cursor()
 387         else:
 388             raise Exception("No database cursor found, please pass one explicitly")
 389
 390     # Send Email
 391     try:
 392         mail_server_pool = openerp.registry(cr.dbname)['ir.mail_server']
 393         res = False
 394         # Pack Message into MIME Object
 395         email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
 396                    attachments, message_id, references, openobject_id, subtype, headers=headers)
 397
 398         res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
 399                        smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
 400                        smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
 401     except Exception:
 402         _logger.exception("tools.email_send failed to deliver email")
 403         return False
 404     finally:
 405         if local_cr:
 406             cr.close()
 407     return res
 408
 409 def email_split(text):
 410     """ Return a list of the email addresses found in ``text`` """
 411     if not text:
 412         return []
 413     return re.findall(r'([^ ,<@]+@[^> ,]+)', text)