openerp/tools/mail.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Business Applications
   5 #    Copyright (C) 2012 OpenERP S.A. (<http://openerp.com>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21
  22 from lxml import etree
  23 import cgi
  24 import logging
  25 import lxml.html
  26 import openerp.pooler as pooler
  27 import operator
  28 import random
  29 import re
  30 import socket
  31 import threading
  32 import time
  33
  34 from openerp.loglevels import ustr
  35
  36 _logger = logging.getLogger(__name__)
  37
  38
  39 #----------------------------------------------------------
  40 # HTML Sanitizer
  41 #----------------------------------------------------------
  42
  43 def html_sanitize(src):
  44     if not src:
  45         return src
  46     src = ustr(src, errors='replace')
  47     root = lxml.html.fromstring(u"<div>%s</div>" % src)
  48     result = handle_element(root)
  49     res = []
  50     for element in children(result[0]):
  51         if isinstance(element, basestring):
  52             res.append(element)
  53         else:
  54             element.tail = ""
  55             res.append(lxml.html.tostring(element))
  56     return ''.join(res)
  57
  58 # FIXME: shouldn't this be a whitelist rather than a blacklist?!
  59 to_remove = set(["script", "head", "meta", "title", "link", "img"])
  60 to_unwrap = set(["html", "body"])
  61
  62 javascript_regex = re.compile(r"^\s*javascript\s*:.*$", re.IGNORECASE)
  63
  64 def handle_a(el, new):
  65     href = el.get("href", "#")
  66     if javascript_regex.search(href):
  67         href = "#"
  68     new.set("href", href)
  69
  70 special = {
  71     "a": handle_a,
  72 }
  73
  74 def handle_element(element):
  75     if isinstance(element, basestring):
  76         return [element]
  77     if element.tag in to_remove:
  78         return []
  79     if element.tag in to_unwrap:
  80         return reduce(operator.add, [handle_element(x) for x in children(element)])
  81     result = lxml.html.fromstring("<%s />" % element.tag)
  82     for c in children(element):
  83         append_to(handle_element(c), result)
  84     if element.tag in special:
  85         special[element.tag](element, result)
  86     return [result]
  87
  88 def children(node):
  89     res = []
  90     if node.text is not None:
  91         res.append(node.text)
  92     for child_node in node.getchildren():
  93         res.append(child_node)
  94         if child_node.tail is not None:
  95             res.append(child_node.tail)
  96     return res
  97
  98 def append_to(elements, dest_node):
  99     for element in elements:
 100         if isinstance(element, basestring):
 101             children = dest_node.getchildren()
 102             if len(children) == 0:
 103                 dest_node.text = element
 104             else:
 105                 children[-1].tail = element
 106         else:
 107             dest_node.append(element)
 108
 109
 110 #----------------------------------------------------------
 111 # HTML Cleaner
 112 #----------------------------------------------------------
 113
 114 def html_email_clean(html):
 115     """ html_email_clean: clean the html to display in the web client.
 116         - strip email quotes (remove blockquote nodes)
 117         - strip signatures (remove --\n{\n)Blahblah), by replacing <br> by
 118             \n to avoid ignoring signatures converted into html
 119
 120         :param string html: sanitized html; tags like html or head should not
 121             be present in the html string. This method therefore takes as input
 122             html code coming from a sanitized source, like fields.html.
 123     """
 124     def _replace_matching_regex(regex, source, replace=''):
 125         dest = ''
 126         idx = 0
 127         for item in re.finditer(regex, source):
 128             dest += source[idx:item.start()] + replace
 129             idx = item.end()
 130         dest += source[idx:]
 131         return dest
 132
 133     if not html:
 134         return html
 135
 136     html = ustr(html)
 137
 138     # 1. <br[ /]> -> \n, because otherwise the tree is obfuscated
 139     br_tags = re.compile(r'([<]\s*[bB][rR]\s*\/?[>])')
 140     html = _replace_matching_regex(br_tags, html, '__BR_TAG__')
 141
 142     # 2. form a tree, handle (currently ?) pure-text by enclosing them in a pre
 143     root = lxml.html.fromstring(html)
 144     if not len(root) and root.text is None and root.tail is None:
 145         html = '<div>%s</div>' % html
 146         root = lxml.html.fromstring(html)
 147
 148     # 2.5 remove quoted text in nodes
 149     quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
 150     for node in root.getiterator():
 151         if not node.text:
 152             continue
 153         node.text = _replace_matching_regex(quote_tags, node.text)
 154
 155     # 3. remove blockquotes
 156     quotes = [el for el in root.getiterator(tag='blockquote')]
 157     for node in quotes:
 158         # copy the node tail into parent text
 159         if node.tail:
 160             parent = node.getparent()
 161             parent.text = parent.text or '' + node.tail
 162         # remove the node
 163         node.getparent().remove(node)
 164
 165     # 4. strip signatures
 166     signature = re.compile(r'([-]{2}[\s]?[\r\n]{1,2}[^\z]+)')
 167     for elem in root.getiterator():
 168         if elem.text:
 169             match = re.search(signature, elem.text)
 170             if match:
 171                 elem.text = elem.text[:match.start()] + elem.text[match.end():]
 172         if elem.tail:
 173             match = re.search(signature, elem.tail)
 174             if match:
 175                 elem.tail = elem.tail[:match.start()] + elem.tail[match.end():]
 176
 177     # 5. \n back to <br/>
 178     html = etree.tostring(root, pretty_print=True)
 179     html = html.replace('__BR_TAG__', '<br />')
 180
 181     # 6. Misc cleaning :
 182     # - ClEditor seems to love using <div><br /><div> -> replace with <br />
 183     br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)')
 184     html = _replace_matching_regex(br_div_tags, html, '<br />')
 185
 186     return html
 187
 188
 189 #----------------------------------------------------------
 190 # HTML/Text management
 191 #----------------------------------------------------------
 192
 193 def html2plaintext(html, body_id=None, encoding='utf-8'):
 194     """ From an HTML text, convert the HTML to plain text.
 195     If @param body_id is provided then this is the tag where the
 196     body (not necessarily <body>) starts.
 197     """
 198     ## (c) Fry-IT, www.fry-it.com, 2007
 199     ## <peter@fry-it.com>
 200     ## download here: http://www.peterbe.com/plog/html2plaintext
 201
 202     html = ustr(html)
 203     tree = etree.fromstring(html, parser=etree.HTMLParser())
 204
 205     if body_id is not None:
 206         source = tree.xpath('//*[@id=%s]' % (body_id,))
 207     else:
 208         source = tree.xpath('//body')
 209     if len(source):
 210         tree = source[0]
 211
 212     url_index = []
 213     i = 0
 214     for link in tree.findall('.//a'):
 215         url = link.get('href')
 216         if url:
 217             i += 1
 218             link.tag = 'span'
 219             link.text = '%s [%s]' % (link.text, i)
 220             url_index.append(url)
 221
 222     html = ustr(etree.tostring(tree, encoding=encoding))
 223
 224     html = html.replace('<strong>', '*').replace('</strong>', '*')
 225     html = html.replace('<b>', '*').replace('</b>', '*')
 226     html = html.replace('<h3>', '*').replace('</h3>', '*')
 227     html = html.replace('<h2>', '**').replace('</h2>', '**')
 228     html = html.replace('<h1>', '**').replace('</h1>', '**')
 229     html = html.replace('<em>', '/').replace('</em>', '/')
 230     html = html.replace('<tr>', '\n')
 231     html = html.replace('</p>', '\n')
 232     html = re.sub('<br\s*/?>', '\n', html)
 233     html = re.sub('<.*?>', ' ', html)
 234     html = html.replace(' ' * 2, ' ')
 235
 236     # strip all lines
 237     html = '\n'.join([x.strip() for x in html.splitlines()])
 238     html = html.replace('\n' * 2, '\n')
 239
 240     for i, url in enumerate(url_index):
 241         if i == 0:
 242             html += '\n\n'
 243         html += ustr('[%s] %s\n') % (i + 1, url)
 244
 245     return html
 246
 247 def plaintext2html(text, container_tag=False):
 248     """ Convert plaintext into html. Content of the text is escaped to manage
 249         html entities, using cgi.escape().
 250         - all \n,\r are replaced by <br />
 251         - enclose content into <p>
 252         - 2 or more consecutive <br /> are considered as paragraph breaks
 253
 254         :param string container_tag: container of the html; by default the
 255             content is embedded into a <div>
 256     """
 257     text = cgi.escape(ustr(text))
 258
 259     # 1. replace \n and \r
 260     text = text.replace('\n', '<br/>')
 261     text = text.replace('\r', '<br/>')
 262
 263     # 2-3: form paragraphs
 264     idx = 0
 265     final = '<p>'
 266     br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
 267     for item in re.finditer(br_tags, text):
 268         final += text[idx:item.start()] + '</p><p>'
 269         idx = item.end()
 270     final += text[idx:] + '</p>'
 271
 272     # 4. container
 273     if container_tag:
 274         final = '<%s>%s</%s>' % (container_tag, final, container_tag)
 275     return ustr(final)
 276
 277 def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
 278     """ Append extra content at the end of an HTML snippet, trying
 279         to locate the end of the HTML document (</body>, </html>, or
 280         EOF), and converting the provided content in html unless ``plaintext``
 281         is False.
 282         Content conversion can be done in two ways:
 283         - wrapping it into a pre (preserve=True)
 284         - use plaintext2html (preserve=False, using container_tag to wrap the
 285             whole content)
 286         A side-effect of this method is to coerce all HTML tags to
 287         lowercase in ``html``, and strip enclosing <html> or <body> tags in
 288         content if ``plaintext`` is False.
 289
 290         :param str html: html tagsoup (doesn't have to be XHTML)
 291         :param str content: extra content to append
 292         :param bool plaintext: whether content is plaintext and should
 293             be wrapped in a <pre/> tag.
 294         :param bool preserve: if content is plaintext, wrap it into a <pre>
 295             instead of converting it into html
 296     """
 297     html = ustr(html)
 298     if plaintext and preserve:
 299         content = u'\n<pre>%s</pre>\n' % ustr(content)
 300     elif plaintext:
 301         content = '\n%s\n' % plaintext2html(content, container_tag)
 302     else:
 303         content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
 304         content = u'\n%s\n' % ustr(content)
 305     # Force all tags to lowercase
 306     html = re.sub(r'(</?)\W*(\w+)([ >])',
 307         lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
 308     insert_location = html.find('</body>')
 309     if insert_location == -1:
 310         insert_location = html.find('</html>')
 311     if insert_location == -1:
 312         return '%s%s' % (html, content)
 313     return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
 314
 315 #----------------------------------------------------------
 316 # Emails
 317 #----------------------------------------------------------
 318
 319 email_re = re.compile(r"""
 320     ([a-zA-Z][\w\.-]*[a-zA-Z0-9]     # username part
 321     @                                # mandatory @ sign
 322     [a-zA-Z0-9][\w\.-]*              # domain must start with a letter ... Ged> why do we include a 0-9 then?
 323      \.
 324      [a-z]{2,3}                      # TLD
 325     )
 326     """, re.VERBOSE)
 327 res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
 328 command_re = re.compile("^Set-([a-z]+) *: *(.+)$", re.I + re.UNICODE)
 329
 330 # Updated in 7.0 to match the model name as well
 331 # Typical form of references is <timestamp-openerp-record_id-model_name@domain>
 332 # group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
 333 reference_re = re.compile("<.*-open(?:object|erp)-(\\d+)(?:-([\w.]+))?.*@(.*)>", re.UNICODE)
 334
 335 def generate_tracking_message_id(res_id):
 336     """Returns a string that can be used in the Message-ID RFC822 header field
 337
 338        Used to track the replies related to a given object thanks to the "In-Reply-To"
 339        or "References" fields that Mail User Agents will set.
 340     """
 341     try:
 342         rnd = random.SystemRandom().random()
 343     except NotImplementedError:
 344         rnd = random.random()
 345     rndstr = ("%.15f" % rnd)[2:]
 346     return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
 347
 348 def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
 349                attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
 350                smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
 351     """Low-level function for sending an email (deprecated).
 352
 353     :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
 354     :param email_from: A string used to fill the `From` header, if falsy,
 355                        config['email_from'] is used instead.  Also used for
 356                        the `Reply-To` header if `reply_to` is not provided
 357     :param email_to: a sequence of addresses to send the mail to.
 358     """
 359
 360     # If not cr, get cr from current thread database
 361     if not cr:
 362         db_name = getattr(threading.currentThread(), 'dbname', None)
 363         if db_name:
 364             cr = pooler.get_db_only(db_name).cursor()
 365         else:
 366             raise Exception("No database cursor found, please pass one explicitly")
 367
 368     # Send Email
 369     try:
 370         mail_server_pool = pooler.get_pool(cr.dbname).get('ir.mail_server')
 371         res = False
 372         # Pack Message into MIME Object
 373         email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
 374                    attachments, message_id, references, openobject_id, subtype, headers=headers)
 375
 376         res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
 377                        smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
 378                        smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
 379     except Exception:
 380         _logger.exception("tools.email_send failed to deliver email")
 381         return False
 382     finally:
 383         cr.close()
 384     return res
 385
 386 def email_split(text):
 387     """ Return a list of the email addresses found in ``text`` """
 388     if not text:
 389         return []
 390     return re.findall(r'([^ ,<@]+@[^> ,]+)', text)