openerp/tools/mail.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Business Applications
   5 #    Copyright (C) 2012 OpenERP S.A. (<http://openerp.com>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21
  22 from lxml import etree
  23 import cgi
  24 import logging
  25 import lxml.html
  26 import openerp.pooler as pooler
  27 import operator
  28 import random
  29 import re
  30 import socket
  31 import threading
  32 import time
  33
  34 from openerp.loglevels import ustr
  35
  36 _logger = logging.getLogger(__name__)
  37
  38
  39 #----------------------------------------------------------
  40 # HTML Sanitizer
  41 #----------------------------------------------------------
  42
  43 def html_sanitize(src):
  44     if not src:
  45         return src
  46     src = ustr(src, errors='replace')
  47     root = lxml.html.fromstring(u"<div>%s</div>" % src)
  48     result = handle_element(root)
  49     res = []
  50     for element in children(result[0]):
  51         if isinstance(element, basestring):
  52             res.append(element)
  53         else:
  54             element.tail = ""
  55             res.append(lxml.html.tostring(element))
  56     return ''.join(res)
  57
  58 # FIXME: shouldn't this be a whitelist rather than a blacklist?!
  59 to_remove = set(["script", "head", "meta", "title", "link", "img"])
  60 to_unwrap = set(["html", "body"])
  61
  62 javascript_regex = re.compile(r"^\s*javascript\s*:.*$", re.IGNORECASE)
  63
  64 def handle_a(el, new):
  65     href = el.get("href", "#")
  66     if javascript_regex.search(href):
  67         href = "#"
  68     new.set("href", href)
  69
  70 special = {
  71     "a": handle_a,
  72 }
  73
  74 def handle_element(element):
  75     if isinstance(element, basestring):
  76         return [element]
  77     if element.tag in to_remove:
  78         return []
  79     if element.tag in to_unwrap:
  80         return reduce(operator.add, [handle_element(x) for x in children(element)])
  81     result = lxml.html.fromstring("<%s />" % element.tag)
  82     for c in children(element):
  83         append_to(handle_element(c), result)
  84     if element.tag in special:
  85         special[element.tag](element, result)
  86     return [result]
  87
  88 def children(node):
  89     res = []
  90     if node.text is not None:
  91         res.append(node.text)
  92     for child_node in node.getchildren():
  93         res.append(child_node)
  94         if child_node.tail is not None:
  95             res.append(child_node.tail)
  96     return res
  97
  98 def append_to(elements, dest_node):
  99     for element in elements:
 100         if isinstance(element, basestring):
 101             children = dest_node.getchildren()
 102             if len(children) == 0:
 103                 dest_node.text = element
 104             else:
 105                 children[-1].tail = element
 106         else:
 107             dest_node.append(element)
 108
 109
 110 #----------------------------------------------------------
 111 # HTML Cleaner
 112 #----------------------------------------------------------
 113
 114 def html_email_clean(html):
 115     """ html_email_clean: clean the html to display in the web client.
 116         - strip email quotes (remove blockquote nodes)
 117         - strip signatures (remove --\n{\n)Blahblah), by replacing <br> by
 118             \n to avoid ignoring signatures converted into html
 119
 120         :param string html: sanitized html; tags like html or head should not
 121             be present in the html string. This method therefore takes as input
 122             html code coming from a sanitized source, like fields.html.
 123     """
 124     def _replace_matching_regex(regex, source, replace=''):
 125         dest = ''
 126         idx = 0
 127         for item in re.finditer(regex, source):
 128             dest += source[idx:item.start()] + replace
 129             idx = item.end()
 130         dest += source[idx:]
 131         return dest
 132
 133     html = ustr(html)
 134
 135     # 1. <br[ /]> -> \n, because otherwise the tree is obfuscated
 136     br_tags = re.compile(r'([<]\s*[bB][rR]\s*\/?[>])')
 137     html = _replace_matching_regex(br_tags, html, '__BR_TAG__')
 138
 139     # 2. form a tree, handle (currently ?) pure-text by enclosing them in a pre
 140     root = lxml.html.fromstring(html)
 141     if not len(root) and root.text is None and root.tail is None:
 142         html = '<div>%s</div>' % html
 143         root = lxml.html.fromstring(html)
 144
 145     # 2.5 remove quoted text in nodes
 146     quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
 147     for node in root.getiterator():
 148         if not node.text:
 149             continue
 150         node.text = _replace_matching_regex(quote_tags, node.text)
 151
 152     # 3. remove blockquotes
 153     quotes = [el for el in root.getiterator(tag='blockquote')]
 154     for node in quotes:
 155         # copy the node tail into parent text
 156         if node.tail:
 157             parent = node.getparent()
 158             parent.text = parent.text or '' + node.tail
 159         # remove the node
 160         node.getparent().remove(node)
 161
 162     # 4. strip signatures
 163     signature = re.compile(r'([-]{2}[\s]?[\r\n]{1,2}[^\z]+)')
 164     for elem in root.getiterator():
 165         if elem.text:
 166             match = re.search(signature, elem.text)
 167             if match:
 168                 elem.text = elem.text[:match.start()] + elem.text[match.end():]
 169         if elem.tail:
 170             match = re.search(signature, elem.tail)
 171             if match:
 172                 elem.tail = elem.tail[:match.start()] + elem.tail[match.end():]
 173
 174     # 5. \n back to <br/>
 175     html = etree.tostring(root, pretty_print=True)
 176     html = html.replace('__BR_TAG__', '<br />')
 177
 178     # 6. Misc cleaning :
 179     # - ClEditor seems to love using <div><br /><div> -> replace with <br />
 180     br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)')
 181     html = _replace_matching_regex(br_div_tags, html, '<br />')
 182
 183     return html
 184
 185
 186 #----------------------------------------------------------
 187 # HTML/Text management
 188 #----------------------------------------------------------
 189
 190 def html2plaintext(html, body_id=None, encoding='utf-8'):
 191     """ From an HTML text, convert the HTML to plain text.
 192     If @param body_id is provided then this is the tag where the
 193     body (not necessarily <body>) starts.
 194     """
 195     ## (c) Fry-IT, www.fry-it.com, 2007
 196     ## <peter@fry-it.com>
 197     ## download here: http://www.peterbe.com/plog/html2plaintext
 198
 199     html = ustr(html)
 200     tree = etree.fromstring(html, parser=etree.HTMLParser())
 201
 202     if body_id is not None:
 203         source = tree.xpath('//*[@id=%s]' % (body_id,))
 204     else:
 205         source = tree.xpath('//body')
 206     if len(source):
 207         tree = source[0]
 208
 209     url_index = []
 210     i = 0
 211     for link in tree.findall('.//a'):
 212         url = link.get('href')
 213         if url:
 214             i += 1
 215             link.tag = 'span'
 216             link.text = '%s [%s]' % (link.text, i)
 217             url_index.append(url)
 218
 219     html = ustr(etree.tostring(tree, encoding=encoding))
 220
 221     html = html.replace('<strong>', '*').replace('</strong>', '*')
 222     html = html.replace('<b>', '*').replace('</b>', '*')
 223     html = html.replace('<h3>', '*').replace('</h3>', '*')
 224     html = html.replace('<h2>', '**').replace('</h2>', '**')
 225     html = html.replace('<h1>', '**').replace('</h1>', '**')
 226     html = html.replace('<em>', '/').replace('</em>', '/')
 227     html = html.replace('<tr>', '\n')
 228     html = html.replace('</p>', '\n')
 229     html = re.sub('<br\s*/?>', '\n', html)
 230     html = re.sub('<.*?>', ' ', html)
 231     html = html.replace(' ' * 2, ' ')
 232
 233     # strip all lines
 234     html = '\n'.join([x.strip() for x in html.splitlines()])
 235     html = html.replace('\n' * 2, '\n')
 236
 237     for i, url in enumerate(url_index):
 238         if i == 0:
 239             html += '\n\n'
 240         html += ustr('[%s] %s\n') % (i + 1, url)
 241
 242     return html
 243
 244 def plaintext2html(text, container_tag=False):
 245     """ Convert plaintext into html. Content of the text is escaped to manage
 246         html entities, using cgi.escape().
 247         - all \n,\r are replaced by <br />
 248         - enclose content into <p>
 249         - 2 or more consecutive <br /> are considered as paragraph breaks
 250
 251         :param string container_tag: container of the html; by default the
 252             content is embedded into a <div>
 253     """
 254     text = cgi.escape(ustr(text))
 255
 256     # 1. replace \n and \r
 257     text = text.replace('\n', '<br/>')
 258     text = text.replace('\r', '<br/>')
 259
 260     # 2-3: form paragraphs
 261     idx = 0
 262     final = '<p>'
 263     br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
 264     for item in re.finditer(br_tags, text):
 265         final += text[idx:item.start()] + '</p><p>'
 266         idx = item.end()
 267     final += text[idx:] + '</p>'
 268
 269     # 4. container
 270     if container_tag:
 271         final = '<%s>%s</%s>' % (container_tag, final, container_tag)
 272     return ustr(final)
 273
 274 def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
 275     """ Append extra content at the end of an HTML snippet, trying
 276         to locate the end of the HTML document (</body>, </html>, or
 277         EOF), and converting the provided content in html unless ``plaintext``
 278         is False.
 279         Content conversion can be done in two ways:
 280         - wrapping it into a pre (preserve=True)
 281         - use plaintext2html (preserve=False, using container_tag to wrap the
 282             whole content)
 283         A side-effect of this method is to coerce all HTML tags to
 284         lowercase in ``html``, and strip enclosing <html> or <body> tags in
 285         content if ``plaintext`` is False.
 286
 287         :param str html: html tagsoup (doesn't have to be XHTML)
 288         :param str content: extra content to append
 289         :param bool plaintext: whether content is plaintext and should
 290             be wrapped in a <pre/> tag.
 291         :param bool preserve: if content is plaintext, wrap it into a <pre>
 292             instead of converting it into html
 293     """
 294     html = ustr(html)
 295     if plaintext and preserve:
 296         content = u'\n<pre>%s</pre>\n' % ustr(content)
 297     elif plaintext:
 298         content = '\n%s\n' % plaintext2html(content, container_tag)
 299     else:
 300         content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
 301         content = u'\n%s\n' % ustr(content)
 302     # Force all tags to lowercase
 303     html = re.sub(r'(</?)\W*(\w+)([ >])',
 304         lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
 305     insert_location = html.find('</body>')
 306     if insert_location == -1:
 307         insert_location = html.find('</html>')
 308     if insert_location == -1:
 309         return '%s%s' % (html, content)
 310     return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
 311
 312 #----------------------------------------------------------
 313 # Emails
 314 #----------------------------------------------------------
 315
 316 email_re = re.compile(r"""
 317     ([a-zA-Z][\w\.-]*[a-zA-Z0-9]     # username part
 318     @                                # mandatory @ sign
 319     [a-zA-Z0-9][\w\.-]*              # domain must start with a letter ... Ged> why do we include a 0-9 then?
 320      \.
 321      [a-z]{2,3}                      # TLD
 322     )
 323     """, re.VERBOSE)
 324 res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
 325 command_re = re.compile("^Set-([a-z]+) *: *(.+)$", re.I + re.UNICODE)
 326
 327 # Updated in 7.0 to match the model name as well
 328 # Typical form of references is <timestamp-openerp-record_id-model_name@domain>
 329 # group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
 330 reference_re = re.compile("<.*-open(?:object|erp)-(\\d+)(?:-([\w.]+))?.*@(.*)>", re.UNICODE)
 331
 332 def generate_tracking_message_id(res_id):
 333     """Returns a string that can be used in the Message-ID RFC822 header field
 334
 335        Used to track the replies related to a given object thanks to the "In-Reply-To"
 336        or "References" fields that Mail User Agents will set.
 337     """
 338     try:
 339         rnd = random.SystemRandom().random()
 340     except NotImplementedError:
 341         rnd = random.random()
 342     rndstr = ("%.15f" % rnd)[2:]
 343     return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
 344
 345 def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
 346                attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
 347                smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
 348     """Low-level function for sending an email (deprecated).
 349
 350     :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
 351     :param email_from: A string used to fill the `From` header, if falsy,
 352                        config['email_from'] is used instead.  Also used for
 353                        the `Reply-To` header if `reply_to` is not provided
 354     :param email_to: a sequence of addresses to send the mail to.
 355     """
 356
 357     # If not cr, get cr from current thread database
 358     if not cr:
 359         db_name = getattr(threading.currentThread(), 'dbname', None)
 360         if db_name:
 361             cr = pooler.get_db_only(db_name).cursor()
 362         else:
 363             raise Exception("No database cursor found, please pass one explicitly")
 364
 365     # Send Email
 366     try:
 367         mail_server_pool = pooler.get_pool(cr.dbname).get('ir.mail_server')
 368         res = False
 369         # Pack Message into MIME Object
 370         email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
 371                    attachments, message_id, references, openobject_id, subtype, headers=headers)
 372
 373         res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
 374                        smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
 375                        smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
 376     except Exception:
 377         _logger.exception("tools.email_send failed to deliver email")
 378         return False
 379     finally:
 380         cr.close()
 381     return res
 382
 383 def email_split(text):
 384     """ Return a list of the email addresses found in ``text`` """
 385     if not text:
 386         return []
 387     return re.findall(r'([^ ,<@]+@[^> ,]+)', text)