openerp/tools/mail.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Business Applications
   5 #    Copyright (C) 2012-2013 OpenERP S.A. (<http://openerp.com>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21
  22 from lxml import etree
  23 import cgi
  24 import logging
  25 import lxml.html
  26 import lxml.html.clean as clean
  27 import openerp.pooler as pooler
  28 import random
  29 import re
  30 import socket
  31 import threading
  32 import time
  33 from email.utils import getaddresses
  34
  35 from openerp.loglevels import ustr
  36
# Module-level logger, named after this module (``__name__``).
_logger = logging.getLogger(__name__)
  38
  39
  40 #----------------------------------------------------------
  41 # HTML Sanitizer
  42 #----------------------------------------------------------
  43
  44 tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
  45 tags_to_remove = ['html', 'body', 'font']
  46
  47
  48 def html_sanitize(src):
  49     if not src:
  50         return src
  51     src = ustr(src, errors='replace')
  52
  53     # html encode email tags
  54     part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
  55     src = part.sub(lambda m: cgi.escape(m.group(1)), src)
  56
  57     # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
  58     try:
  59         cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, kill_tags=tags_to_kill, remove_tags=tags_to_remove)
  60         cleaned = cleaner.clean_html(src)
  61     except TypeError:
  62         # lxml.clean version < 2.3.1 does not have a kill_tags attribute
  63         # to remove in 2014
  64         cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False, forms=False, remove_tags=tags_to_kill + tags_to_remove)
  65         cleaned = cleaner.clean_html(src)
  66     except Exception, e:
  67         if isinstance(e, etree.ParserError) and 'empty' in str(e):
  68             return ""
  69         _logger.warning('html_sanitize failed to parse %s' % (src))
  70         cleaned = '<p>Impossible to parse</p>'
  71
  72     # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
  73     cleaned = cleaned.replace('%24', '$')
  74     cleaned = cleaned.replace('%7B', '{')
  75     cleaned = cleaned.replace('%7D', '}')
  76     cleaned = cleaned.replace('%20', ' ')
  77     cleaned = cleaned.replace('%5B', '[')
  78     cleaned = cleaned.replace('%5D', ']')
  79
  80     return cleaned
  81
  82
  83 #----------------------------------------------------------
  84 # HTML Cleaner
  85 #----------------------------------------------------------
  86
  87 def html_email_clean(html):
  88     """ html_email_clean: clean the html to display in the web client.
  89         - strip email quotes (remove blockquote nodes)
  90         - strip signatures (remove --\n{\n)Blahblah), by replacing <br> by
  91             \n to avoid ignoring signatures converted into html
  92
  93         :param string html: sanitized html; tags like html or head should not
  94             be present in the html string. This method therefore takes as input
  95             html code coming from a sanitized source, like fields.html.
  96     """
  97     def _replace_matching_regex(regex, source, replace=''):
  98         dest = ''
  99         idx = 0
 100         for item in re.finditer(regex, source):
 101             dest += source[idx:item.start()] + replace
 102             idx = item.end()
 103         dest += source[idx:]
 104         return dest
 105
 106     if not html or not isinstance(html, basestring):
 107         return html
 108
 109     html = ustr(html)
 110
 111     # 0. remove encoding attribute inside tags
 112     doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
 113     html = doctype.sub(r"", html)
 114
 115     # 1. <br[ /]> -> \n, because otherwise the tree is obfuscated
 116     br_tags = re.compile(r'([<]\s*[bB][rR]\s*\/?[>])')
 117     html = _replace_matching_regex(br_tags, html, '__BR_TAG__')
 118
 119     # 2. form a tree, handle (currently ?) pure-text by enclosing them in a pre
 120     root = lxml.html.fromstring(html)
 121     if not len(root) and root.text is None and root.tail is None:
 122         html = '<div>%s</div>' % html
 123         root = lxml.html.fromstring(html)
 124
 125     # 2.5 remove quoted text in nodes
 126     quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
 127     for node in root.getiterator():
 128         if not node.text:
 129             continue
 130         node.text = _replace_matching_regex(quote_tags, node.text)
 131
 132     # 3. remove blockquotes
 133     quotes = [el for el in root.getiterator(tag='blockquote')]
 134     for node in quotes:
 135         # copy the node tail into parent text
 136         if node.tail:
 137             parent = node.getparent()
 138             parent.text = parent.text or '' + node.tail
 139         # remove the node
 140         node.getparent().remove(node)
 141
 142     # 4. strip signatures
 143     signature = re.compile(r'([-]{2}[\s]?[\r\n]{1,2}[^\z]+)')
 144     for elem in root.getiterator():
 145         if elem.text:
 146             match = re.search(signature, elem.text)
 147             if match:
 148                 elem.text = elem.text[:match.start()] + elem.text[match.end():]
 149         if elem.tail:
 150             match = re.search(signature, elem.tail)
 151             if match:
 152                 elem.tail = elem.tail[:match.start()] + elem.tail[match.end():]
 153
 154     # 5. \n back to <br/>
 155     html = etree.tostring(root, pretty_print=True)
 156     html = html.replace('__BR_TAG__', '<br />')
 157
 158     # 6. Misc cleaning :
 159     # - ClEditor seems to love using <div><br /><div> -> replace with <br />
 160     br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)')
 161     html = _replace_matching_regex(br_div_tags, html, '<br />')
 162
 163     return html
 164
 165
 166 #----------------------------------------------------------
 167 # HTML/Text management
 168 #----------------------------------------------------------
 169
 170 def html2plaintext(html, body_id=None, encoding='utf-8'):
 171     """ From an HTML text, convert the HTML to plain text.
 172     If @param body_id is provided then this is the tag where the
 173     body (not necessarily <body>) starts.
 174     """
 175     ## (c) Fry-IT, www.fry-it.com, 2007
 176     ## <peter@fry-it.com>
 177     ## download here: http://www.peterbe.com/plog/html2plaintext
 178
 179     html = ustr(html)
 180     tree = etree.fromstring(html, parser=etree.HTMLParser())
 181
 182     if body_id is not None:
 183         source = tree.xpath('//*[@id=%s]' % (body_id,))
 184     else:
 185         source = tree.xpath('//body')
 186     if len(source):
 187         tree = source[0]
 188
 189     url_index = []
 190     i = 0
 191     for link in tree.findall('.//a'):
 192         url = link.get('href')
 193         if url:
 194             i += 1
 195             link.tag = 'span'
 196             link.text = '%s [%s]' % (link.text, i)
 197             url_index.append(url)
 198
 199     html = ustr(etree.tostring(tree, encoding=encoding))
 200     # \r char is converted into &#13;, must remove it
 201     html = html.replace('&#13;', '')
 202
 203     html = html.replace('<strong>', '*').replace('</strong>', '*')
 204     html = html.replace('<b>', '*').replace('</b>', '*')
 205     html = html.replace('<h3>', '*').replace('</h3>', '*')
 206     html = html.replace('<h2>', '**').replace('</h2>', '**')
 207     html = html.replace('<h1>', '**').replace('</h1>', '**')
 208     html = html.replace('<em>', '/').replace('</em>', '/')
 209     html = html.replace('<tr>', '\n')
 210     html = html.replace('</p>', '\n')
 211     html = re.sub('<br\s*/?>', '\n', html)
 212     html = re.sub('<.*?>', ' ', html)
 213     html = html.replace(' ' * 2, ' ')
 214     html = html.replace('&gt;', '>')
 215     html = html.replace('&lt;', '<')
 216     html = html.replace('&amp;', '&')
 217
 218     # strip all lines
 219     html = '\n'.join([x.strip() for x in html.splitlines()])
 220     html = html.replace('\n' * 2, '\n')
 221
 222     for i, url in enumerate(url_index):
 223         if i == 0:
 224             html += '\n\n'
 225         html += ustr('[%s] %s\n') % (i + 1, url)
 226
 227     return html
 228
 229 def plaintext2html(text, container_tag=False):
 230     """ Convert plaintext into html. Content of the text is escaped to manage
 231         html entities, using cgi.escape().
 232         - all \n,\r are replaced by <br />
 233         - enclose content into <p>
 234         - 2 or more consecutive <br /> are considered as paragraph breaks
 235
 236         :param string container_tag: container of the html; by default the
 237             content is embedded into a <div>
 238     """
 239     text = cgi.escape(ustr(text))
 240
 241     # 1. replace \n and \r
 242     text = text.replace('\n', '<br/>')
 243     text = text.replace('\r', '<br/>')
 244
 245     # 2-3: form paragraphs
 246     idx = 0
 247     final = '<p>'
 248     br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
 249     for item in re.finditer(br_tags, text):
 250         final += text[idx:item.start()] + '</p><p>'
 251         idx = item.end()
 252     final += text[idx:] + '</p>'
 253
 254     # 4. container
 255     if container_tag:
 256         final = '<%s>%s</%s>' % (container_tag, final, container_tag)
 257     return ustr(final)
 258
 259 def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
 260     """ Append extra content at the end of an HTML snippet, trying
 261         to locate the end of the HTML document (</body>, </html>, or
 262         EOF), and converting the provided content in html unless ``plaintext``
 263         is False.
 264         Content conversion can be done in two ways:
 265         - wrapping it into a pre (preserve=True)
 266         - use plaintext2html (preserve=False, using container_tag to wrap the
 267             whole content)
 268         A side-effect of this method is to coerce all HTML tags to
 269         lowercase in ``html``, and strip enclosing <html> or <body> tags in
 270         content if ``plaintext`` is False.
 271
 272         :param str html: html tagsoup (doesn't have to be XHTML)
 273         :param str content: extra content to append
 274         :param bool plaintext: whether content is plaintext and should
 275             be wrapped in a <pre/> tag.
 276         :param bool preserve: if content is plaintext, wrap it into a <pre>
 277             instead of converting it into html
 278     """
 279     html = ustr(html)
 280     if plaintext and preserve:
 281         content = u'\n<pre>%s</pre>\n' % ustr(content)
 282     elif plaintext:
 283         content = '\n%s\n' % plaintext2html(content, container_tag)
 284     else:
 285         content = re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', content)
 286         content = u'\n%s\n' % ustr(content)
 287     # Force all tags to lowercase
 288     html = re.sub(r'(</?)\W*(\w+)([ >])',
 289         lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
 290     insert_location = html.find('</body>')
 291     if insert_location == -1:
 292         insert_location = html.find('</html>')
 293     if insert_location == -1:
 294         return '%s%s' % (html, content)
 295     return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
 296
 297 #----------------------------------------------------------
 298 # Emails
 299 #----------------------------------------------------------
 300
 301 email_re = re.compile(r"""
 302     ([a-zA-Z][\w\.-]*[a-zA-Z0-9]     # username part
 303     @                                # mandatory @ sign
 304     [a-zA-Z0-9][\w\.-]*              # domain must start with a letter ... Ged> why do we include a 0-9 then?
 305      \.
 306      [a-z]{2,3}                      # TLD
 307     )
 308     """, re.VERBOSE)
 309 res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
 310 command_re = re.compile("^Set-([a-z]+) *: *(.+)$", re.I + re.UNICODE)
 311
 312 # Updated in 7.0 to match the model name as well
 313 # Typical form of references is <timestamp-openerp-record_id-model_name@domain>
 314 # group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
 315 reference_re = re.compile("<.*-open(?:object|erp)-(\\d+)(?:-([\w.]+))?[^>]*@([^>]*)>", re.UNICODE)
 316
 317 def generate_tracking_message_id(res_id):
 318     """Returns a string that can be used in the Message-ID RFC822 header field
 319
 320        Used to track the replies related to a given object thanks to the "In-Reply-To"
 321        or "References" fields that Mail User Agents will set.
 322     """
 323     try:
 324         rnd = random.SystemRandom().random()
 325     except NotImplementedError:
 326         rnd = random.random()
 327     rndstr = ("%.15f" % rnd)[2:]
 328     return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
 329
 330 def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
 331                attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
 332                smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
 333     """Low-level function for sending an email (deprecated).
 334
 335     :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
 336     :param email_from: A string used to fill the `From` header, if falsy,
 337                        config['email_from'] is used instead.  Also used for
 338                        the `Reply-To` header if `reply_to` is not provided
 339     :param email_to: a sequence of addresses to send the mail to.
 340     """
 341
 342     # If not cr, get cr from current thread database
 343     local_cr = None
 344     if not cr:
 345         db_name = getattr(threading.currentThread(), 'dbname', None)
 346         if db_name:
 347             local_cr = cr = pooler.get_db(db_name).cursor()
 348         else:
 349             raise Exception("No database cursor found, please pass one explicitly")
 350
 351     # Send Email
 352     try:
 353         mail_server_pool = pooler.get_pool(cr.dbname).get('ir.mail_server')
 354         res = False
 355         # Pack Message into MIME Object
 356         email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
 357                    attachments, message_id, references, openobject_id, subtype, headers=headers)
 358
 359         res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
 360                        smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
 361                        smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
 362     except Exception:
 363         _logger.exception("tools.email_send failed to deliver email")
 364         return False
 365     finally:
 366         if local_cr:
 367             cr.close()
 368     return res
 369
 370 def email_split(text):
 371     """ Return a list of the email addresses found in ``text`` """
 372     if not text:
 373         return []
 374     return [addr[1] for addr in getaddresses([text])
 375                 # getaddresses() returns '' when email parsing fails, and
 376                 # sometimes returns emails without at least '@'. The '@'
 377                 # is strictly required in RFC2822's `addr-spec`.
 378                 if addr[1]
 379                 if '@' in addr[1]]