openerp/tools/mail.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Business Applications
   5 #    Copyright (C) 2012 OpenERP S.A. (<http://openerp.com>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21
  22 from lxml import etree
  23 import cgi
  24 import logging
  25 import lxml.html
  26 import openerp.pooler as pooler
  27 import operator
  28 import random
  29 import re
  30 import socket
  31 import threading
  32 import time
  33
  34 from openerp.loglevels import ustr
  35
  36 _logger = logging.getLogger(__name__)
  37
  38
  39 #----------------------------------------------------------
  40 # HTML Sanitizer
  41 #----------------------------------------------------------
  42
  43 def html_sanitize(src):
  44     if not src:
  45         return src
  46     src = ustr(src, errors='replace')
  47     root = lxml.html.fromstring(u"<div>%s</div>" % src)
  48     result = handle_element(root)
  49     res = []
  50     for element in children(result[0]):
  51         if isinstance(element, basestring):
  52             res.append(element)
  53         else:
  54             element.tail = ""
  55             res.append(lxml.html.tostring(element))
  56     return ''.join(res)
  57
  58 # FIXME: shouldn't this be a whitelist rather than a blacklist?!
  59 to_remove = set(["script", "head", "meta", "title", "link", "img"])
  60 to_unwrap = set(["html", "body"])
  61
  62 javascript_regex = re.compile(r"^\s*javascript\s*:.*$", re.IGNORECASE)
  63
  64 def handle_a(el, new):
  65     href = el.get("href", "#")
  66     if javascript_regex.search(href):
  67         href = "#"
  68     new.set("href", href)
  69
  70 special = {
  71     "a": handle_a,
  72 }
  73
  74 def handle_element(element):
  75     if isinstance(element, basestring):
  76         return [element]
  77     if element.tag in to_remove:
  78         return []
  79     if element.tag in to_unwrap:
  80         return reduce(operator.add, [handle_element(x) for x in children(element)])
  81     result = lxml.html.fromstring("<%s />" % element.tag)
  82     for c in children(element):
  83         append_to(handle_element(c), result)
  84     if element.tag in special:
  85         special[element.tag](element, result)
  86     return [result]
  87
  88 def children(node):
  89     res = []
  90     if node.text is not None:
  91         res.append(node.text)
  92     for child_node in node.getchildren():
  93         res.append(child_node)
  94         if child_node.tail is not None:
  95             res.append(child_node.tail)
  96     return res
  97
  98 def append_to(elements, dest_node):
  99     for element in elements:
 100         if isinstance(element, basestring):
 101             children = dest_node.getchildren()
 102             if len(children) == 0:
 103                 dest_node.text = element
 104             else:
 105                 children[-1].tail = element
 106         else:
 107             dest_node.append(element)
 108
 109
 110 #----------------------------------------------------------
 111 # HTML Cleaner
 112 #----------------------------------------------------------
 113
 114 def html_email_clean(html):
 115     """ html_email_clean: clean the html to display in the web client.
 116         - strip email quotes (remove blockquote nodes)
 117         - strip signatures (remove --\n{\n)Blahblah), by replacing <br> by
 118             \n to avoid ignoring signatures converted into html
 119
 120         :param string html: sanitized html; tags like html or head should not
 121             be present in the html string. This method therefore takes as input
 122             html code coming from a sanitized source, like fields.html.
 123     """
 124     def _replace_matching_regex(regex, source, replace=''):
 125         dest = ''
 126         idx = 0
 127         for item in re.finditer(regex, source):
 128             dest += source[idx:item.start()] + replace
 129             idx = item.end()
 130         dest += source[idx:]
 131         return dest
 132
 133     if not html or not isinstance(html, basestring):
 134         return html
 135
 136     html = ustr(html)
 137
 138     # 0. remove encoding attribute inside tags
 139     doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
 140     html = doctype.sub(r"", html)
 141
 142     # 1. <br[ /]> -> \n, because otherwise the tree is obfuscated
 143     br_tags = re.compile(r'([<]\s*[bB][rR]\s*\/?[>])')
 144     html = _replace_matching_regex(br_tags, html, '__BR_TAG__')
 145
 146     # 2. form a tree, handle (currently ?) pure-text by enclosing them in a pre
 147     root = lxml.html.fromstring(html)
 148     if not len(root) and root.text is None and root.tail is None:
 149         html = '<div>%s</div>' % html
 150         root = lxml.html.fromstring(html)
 151
 152     # 2.5 remove quoted text in nodes
 153     quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
 154     for node in root.getiterator():
 155         if not node.text:
 156             continue
 157         node.text = _replace_matching_regex(quote_tags, node.text)
 158
 159     # 3. remove blockquotes
 160     quotes = [el for el in root.getiterator(tag='blockquote')]
 161     for node in quotes:
 162         # copy the node tail into parent text
 163         if node.tail:
 164             parent = node.getparent()
 165             parent.text = parent.text or '' + node.tail
 166         # remove the node
 167         node.getparent().remove(node)
 168
 169     # 4. strip signatures
 170     signature = re.compile(r'([-]{2}[\s]?[\r\n]{1,2}[^\z]+)')
 171     for elem in root.getiterator():
 172         if elem.text:
 173             match = re.search(signature, elem.text)
 174             if match:
 175                 elem.text = elem.text[:match.start()] + elem.text[match.end():]
 176         if elem.tail:
 177             match = re.search(signature, elem.tail)
 178             if match:
 179                 elem.tail = elem.tail[:match.start()] + elem.tail[match.end():]
 180
 181     # 5. \n back to <br/>
 182     html = etree.tostring(root, pretty_print=True)
 183     html = html.replace('__BR_TAG__', '<br />')
 184
 185     # 6. Misc cleaning :
 186     # - ClEditor seems to love using <div><br /><div> -> replace with <br />
 187     br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)')
 188     html = _replace_matching_regex(br_div_tags, html, '<br />')
 189
 190     return html
 191
 192
 193 #----------------------------------------------------------
 194 # HTML/Text management
 195 #----------------------------------------------------------
 196
 197 def html2plaintext(html, body_id=None, encoding='utf-8'):
 198     """ From an HTML text, convert the HTML to plain text.
 199     If @param body_id is provided then this is the tag where the
 200     body (not necessarily <body>) starts.
 201     """
 202     ## (c) Fry-IT, www.fry-it.com, 2007
 203     ## <peter@fry-it.com>
 204     ## download here: http://www.peterbe.com/plog/html2plaintext
 205
 206     html = ustr(html)
 207     tree = etree.fromstring(html, parser=etree.HTMLParser())
 208
 209     if body_id is not None:
 210         source = tree.xpath('//*[@id=%s]' % (body_id,))
 211     else:
 212         source = tree.xpath('//body')
 213     if len(source):
 214         tree = source[0]
 215
 216     url_index = []
 217     i = 0
 218     for link in tree.findall('.//a'):
 219         url = link.get('href')
 220         if url:
 221             i += 1
 222             link.tag = 'span'
 223             link.text = '%s [%s]' % (link.text, i)
 224             url_index.append(url)
 225
 226     html = ustr(etree.tostring(tree, encoding=encoding))
 227
 228     html = html.replace('<strong>', '*').replace('</strong>', '*')
 229     html = html.replace('<b>', '*').replace('</b>', '*')
 230     html = html.replace('<h3>', '*').replace('</h3>', '*')
 231     html = html.replace('<h2>', '**').replace('</h2>', '**')
 232     html = html.replace('<h1>', '**').replace('</h1>', '**')
 233     html = html.replace('<em>', '/').replace('</em>', '/')
 234     html = html.replace('<tr>', '\n')
 235     html = html.replace('</p>', '\n')
 236     html = re.sub('<br\s*/?>', '\n', html)
 237     html = re.sub('<.*?>', ' ', html)
 238     html = html.replace(' ' * 2, ' ')
 239
 240     # strip all lines
 241     html = '\n'.join([x.strip() for x in html.splitlines()])
 242     html = html.replace('\n' * 2, '\n')
 243
 244     for i, url in enumerate(url_index):
 245         if i == 0:
 246             html += '\n\n'
 247         html += ustr('[%s] %s\n') % (i + 1, url)
 248
 249     return html
 250
 251 def plaintext2html(text, container_tag=False):
 252     """ Convert plaintext into html. Content of the text is escaped to manage
 253         html entities, using cgi.escape().
 254         - all \n,\r are replaced by <br />
 255         - enclose content into <p>
 256         - 2 or more consecutive <br /> are considered as paragraph breaks
 257
 258         :param string container_tag: container of the html; by default the
 259             content is embedded into a <div>
 260     """
 261     text = cgi.escape(ustr(text))
 262
 263     # 1. replace \n and \r
 264     text = text.replace('\n', '<br/>')
 265     text = text.replace('\r', '<br/>')
 266
 267     # 2-3: form paragraphs
 268     idx = 0
 269     final = '<p>'
 270     br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
 271     for item in re.finditer(br_tags, text):
 272         final += text[idx:item.start()] + '</p><p>'
 273         idx = item.end()
 274     final += text[idx:] + '</p>'
 275
 276     # 4. container
 277     if container_tag:
 278         final = '<%s>%s</%s>' % (container_tag, final, container_tag)
 279     return ustr(final)
 280
 281 def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
 282     """ Append extra content at the end of an HTML snippet, trying
 283         to locate the end of the HTML document (</body>, </html>, or
 284         EOF), and converting the provided content in html unless ``plaintext``
 285         is False.
 286         Content conversion can be done in two ways:
 287         - wrapping it into a pre (preserve=True)
 288         - use plaintext2html (preserve=False, using container_tag to wrap the
 289             whole content)
 290         A side-effect of this method is to coerce all HTML tags to
 291         lowercase in ``html``, and strip enclosing <html> or <body> tags in
 292         content if ``plaintext`` is False.
 293
 294         :param str html: html tagsoup (doesn't have to be XHTML)
 295         :param str content: extra content to append
 296         :param bool plaintext: whether content is plaintext and should
 297             be wrapped in a <pre/> tag.
 298         :param bool preserve: if content is plaintext, wrap it into a <pre>
 299             instead of converting it into html
 300     """
 301     html = ustr(html)
 302     if plaintext and preserve:
 303         content = u'\n<pre>%s</pre>\n' % ustr(content)
 304     elif plaintext:
 305         content = '\n%s\n' % plaintext2html(content, container_tag)
 306     else:
 307         content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
 308         content = u'\n%s\n' % ustr(content)
 309     # Force all tags to lowercase
 310     html = re.sub(r'(</?)\W*(\w+)([ >])',
 311         lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
 312     insert_location = html.find('</body>')
 313     if insert_location == -1:
 314         insert_location = html.find('</html>')
 315     if insert_location == -1:
 316         return '%s%s' % (html, content)
 317     return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
 318
 319 #----------------------------------------------------------
 320 # Emails
 321 #----------------------------------------------------------
 322
 323 # matches any email in a body of text
 324 email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", re.VERBOSE)
 325
 326 # matches a string containing only one email
 327 single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$""", re.VERBOSE)
 328
 329 res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
 330 command_re = re.compile("^Set-([a-z]+) *: *(.+)$", re.I + re.UNICODE)
 331
 332 # Updated in 7.0 to match the model name as well
 333 # Typical form of references is <timestamp-openerp-record_id-model_name@domain>
 334 # group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
 335 reference_re = re.compile("<.*-open(?:object|erp)-(\\d+)(?:-([\w.]+))?.*@(.*)>", re.UNICODE)
 336
 337 def generate_tracking_message_id(res_id):
 338     """Returns a string that can be used in the Message-ID RFC822 header field
 339
 340        Used to track the replies related to a given object thanks to the "In-Reply-To"
 341        or "References" fields that Mail User Agents will set.
 342     """
 343     try:
 344         rnd = random.SystemRandom().random()
 345     except NotImplementedError:
 346         rnd = random.random()
 347     rndstr = ("%.15f" % rnd)[2:]
 348     return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
 349
 350 def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
 351                attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
 352                smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
 353     """Low-level function for sending an email (deprecated).
 354
 355     :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
 356     :param email_from: A string used to fill the `From` header, if falsy,
 357                        config['email_from'] is used instead.  Also used for
 358                        the `Reply-To` header if `reply_to` is not provided
 359     :param email_to: a sequence of addresses to send the mail to.
 360     """
 361
 362     # If not cr, get cr from current thread database
 363     local_cr = None
 364     if not cr:
 365         db_name = getattr(threading.currentThread(), 'dbname', None)
 366         if db_name:
 367             local_cr = cr = pooler.get_db(db_name).cursor()
 368         else:
 369             raise Exception("No database cursor found, please pass one explicitly")
 370
 371     # Send Email
 372     try:
 373         mail_server_pool = pooler.get_pool(cr.dbname).get('ir.mail_server')
 374         res = False
 375         # Pack Message into MIME Object
 376         email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
 377                    attachments, message_id, references, openobject_id, subtype, headers=headers)
 378
 379         res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
 380                        smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
 381                        smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
 382     except Exception:
 383         _logger.exception("tools.email_send failed to deliver email")
 384         return False
 385     finally:
 386         if local_cr:
 387             cr.close()
 388     return res
 389
 390 def email_split(text):
 391     """ Return a list of the email addresses found in ``text`` """
 392     if not text:
 393         return []
 394     return re.findall(r'([^ ,<@]+@[^> ,]+)', text)