openerp/tools/mail.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Business Applications
   5 #    Copyright (C) 2012 OpenERP S.A. (<http://openerp.com>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21
  22 from lxml import etree
  23 import cgi
  24 import logging
  25 import lxml.html
  26 import openerp.pooler as pooler
  27 import operator
  28 import random
  29 import re
  30 import socket
  31 import threading
  32 import time
  33
  34 from openerp.loglevels import ustr
  35
  36 _logger = logging.getLogger(__name__)
  37
  38
  39 #----------------------------------------------------------
  40 # HTML Sanitizer
  41 #----------------------------------------------------------
  42
  43 def html_sanitize(src):
  44     if not src:
  45         return src
  46     src = ustr(src, errors='replace')
  47     root = lxml.html.fromstring(u"<div>%s</div>" % src)
  48     result = handle_element(root)
  49     res = []
  50     for element in children(result[0]):
  51         if isinstance(element, basestring):
  52             res.append(element)
  53         else:
  54             element.tail = ""
  55             res.append(lxml.html.tostring(element))
  56     return ''.join(res)
  57
# FIXME: shouldn't this be a whitelist rather than a blacklist?!
# Tags for which handle_element() returns nothing: the element and all of
# its content are dropped.
to_remove = set(["script", "head", "meta", "title", "link", "img"])
# Tags for which handle_element() discards the element itself but keeps its
# (sanitized) children.
to_unwrap = set(["html", "body"])

# Matches a value that starts (after optional whitespace) with the
# "javascript:" scheme, case-insensitively; used by handle_a() on hrefs.
javascript_regex = re.compile(r"^\s*javascript\s*:.*$", re.IGNORECASE)
  63
  64 def handle_a(el, new):
  65     href = el.get("href", "#")
  66     if javascript_regex.search(href):
  67         href = "#"
  68     new.set("href", href)
  69
# Per-tag hooks: handle_element() calls special[tag](original_element,
# rebuilt_element) after rebuilding an element whose tag is a key here.
special = {
    "a": handle_a,
}
  73
  74 def handle_element(element):
  75     if isinstance(element, basestring):
  76         return [element]
  77     if element.tag in to_remove:
  78         return []
  79     if element.tag in to_unwrap:
  80         return reduce(operator.add, [handle_element(x) for x in children(element)])
  81     result = lxml.html.fromstring("<%s />" % element.tag)
  82     for c in children(element):
  83         append_to(handle_element(c), result)
  84     if element.tag in special:
  85         special[element.tag](element, result)
  86     return [result]
  87
  88 def children(node):
  89     res = []
  90     if node.text is not None:
  91         res.append(node.text)
  92     for child_node in node.getchildren():
  93         res.append(child_node)
  94         if child_node.tail is not None:
  95             res.append(child_node.tail)
  96     return res
  97
  98 def append_to(elements, dest_node):
  99     for element in elements:
 100         if isinstance(element, basestring):
 101             children = dest_node.getchildren()
 102             if len(children) == 0:
 103                 dest_node.text = element
 104             else:
 105                 children[-1].tail = element
 106         else:
 107             dest_node.append(element)
 108
 109
 110 #----------------------------------------------------------
 111 # HTML Cleaner
 112 #----------------------------------------------------------
 113
 114 def html_email_clean(html):
 115     """ html_email_clean: clean the html to display in the web client.
 116         - strip email quotes (remove blockquote nodes)
 117         - strip signatures (remove --\n{\n)Blahblah), by replacing <br> by
 118             \n to avoid ignoring signatures converted into html
 119
 120         :param string html: sanitized html; tags like html or head should not
 121             be present in the html string. This method therefore takes as input
 122             html code coming from a sanitized source, like fields.html.
 123     """
 124     html = ustr(html)
 125     modified_html = ''
 126
 127     # 1. <br[ /]> -> \n, because otherwise the tree is obfuscated
 128     br_tags = re.compile(r'([<]\s*br\s*\/?[>])')
 129     idx = 0
 130     for item in re.finditer(br_tags, html):
 131         modified_html += html[idx:item.start()] + '__BR_TAG__'
 132         idx = item.end()
 133     modified_html += html[idx:]
 134     html = modified_html
 135     # TDE note: seems to have lots of <div><br></div> in emails... needs to be checks, could be cleaned
 136
 137     # 2. form a tree, handle (currently ?) pure-text by enclosing them in a pre
 138     root = lxml.html.fromstring(html)
 139     if not len(root) and root.text is None and root.tail is None:
 140         html = '<div>%s</div>' % html
 141         root = lxml.html.fromstring(html)
 142
 143     # 2.5 remove quoted text in nodes
 144     quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
 145     for node in root.getiterator():
 146         if not node.text:
 147             continue
 148         idx = 0
 149         text = ''
 150         for item in re.finditer(quote_tags, node.text):
 151             print item
 152             text += node.text[idx:item.start()]
 153             idx = item.end()
 154         text += node.text[idx:]
 155         node.text = text
 156
 157     # 3. remove blockquotes
 158     quotes = [el for el in root.getiterator(tag='blockquote')]
 159     for node in quotes:
 160         # copy the node tail into parent text
 161         if node.tail:
 162             parent = node.getparent()
 163             parent.text = parent.text or '' + node.tail
 164         # remove the node
 165         node.getparent().remove(node)
 166
 167     # 4. strip signatures
 168     signature = re.compile(r'([-]{2}[\s]?[\r\n]{1,2}[^\z]+)')
 169     for elem in root.getiterator():
 170         if elem.text:
 171             match = re.search(signature, elem.text)
 172             if match:
 173                 elem.text = elem.text[:match.start()] + elem.text[match.end():]
 174         if elem.tail:
 175             match = re.search(signature, elem.tail)
 176             if match:
 177                 elem.tail = elem.tail[:match.start()] + elem.tail[match.end():]
 178
 179     # 5. \n back to <br/>
 180     html = etree.tostring(root, pretty_print=True)
 181     html = html.replace('__BR_TAG__', '<br />')
 182
 183     # 6. Misc cleaning :
 184     # - ClEditor seems to love using <div><br /><div> -> replace with <br />
 185     modified_html = ''
 186     br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)')
 187     idx = 0
 188     for item in re.finditer(br_div_tags, html):
 189         modified_html += html[idx:item.start()] + '<br />'
 190         idx = item.end()
 191     modified_html += html[idx:]
 192     html = modified_html
 193
 194     return html
 195
 196
 197 #----------------------------------------------------------
 198 # HTML/Text management
 199 #----------------------------------------------------------
 200
 201 def html2plaintext(html, body_id=None, encoding='utf-8'):
 202     """ From an HTML text, convert the HTML to plain text.
 203     If @param body_id is provided then this is the tag where the
 204     body (not necessarily <body>) starts.
 205     """
 206     ## (c) Fry-IT, www.fry-it.com, 2007
 207     ## <peter@fry-it.com>
 208     ## download here: http://www.peterbe.com/plog/html2plaintext
 209
 210     html = ustr(html)
 211     tree = etree.fromstring(html, parser=etree.HTMLParser())
 212
 213     if body_id is not None:
 214         source = tree.xpath('//*[@id=%s]' % (body_id,))
 215     else:
 216         source = tree.xpath('//body')
 217     if len(source):
 218         tree = source[0]
 219
 220     url_index = []
 221     i = 0
 222     for link in tree.findall('.//a'):
 223         url = link.get('href')
 224         if url:
 225             i += 1
 226             link.tag = 'span'
 227             link.text = '%s [%s]' % (link.text, i)
 228             url_index.append(url)
 229
 230     html = ustr(etree.tostring(tree, encoding=encoding))
 231
 232     html = html.replace('<strong>', '*').replace('</strong>', '*')
 233     html = html.replace('<b>', '*').replace('</b>', '*')
 234     html = html.replace('<h3>', '*').replace('</h3>', '*')
 235     html = html.replace('<h2>', '**').replace('</h2>', '**')
 236     html = html.replace('<h1>', '**').replace('</h1>', '**')
 237     html = html.replace('<em>', '/').replace('</em>', '/')
 238     html = html.replace('<tr>', '\n')
 239     html = html.replace('</p>', '\n')
 240     html = re.sub('<br\s*/?>', '\n', html)
 241     html = re.sub('<.*?>', ' ', html)
 242     html = html.replace(' ' * 2, ' ')
 243
 244     # strip all lines
 245     html = '\n'.join([x.strip() for x in html.splitlines()])
 246     html = html.replace('\n' * 2, '\n')
 247
 248     for i, url in enumerate(url_index):
 249         if i == 0:
 250             html += '\n\n'
 251         html += ustr('[%s] %s\n') % (i + 1, url)
 252
 253     return html
 254
 255 def plaintext2html(text, container_tag=False):
 256     """ Convert plaintext into html. Content of the text is escaped to manage
 257         html entities, using cgi.escape().
 258         - all \n,\r are replaced by <br />
 259         - enclose content into <p>
 260         - 2 or more consecutive <br /> are considered as paragraph breaks
 261
 262         :param string container_tag: container of the html; by default the
 263             content is embedded into a <div>
 264     """
 265     text = cgi.escape(ustr(text))
 266
 267     # 1. replace \n and \r
 268     text = text.replace('\n', '<br/>')
 269     text = text.replace('\r', '<br/>')
 270
 271     # 2-3: form paragraphs
 272     idx = 0
 273     final = '<p>'
 274     br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
 275     for item in re.finditer(br_tags, text):
 276         final += text[idx:item.start()] + '</p><p>'
 277         idx = item.end()
 278     final += text[idx:] + '</p>'
 279
 280     # 4. container
 281     if container_tag:
 282         final = '<%s>%s</%s>' % (container_tag, final, container_tag)
 283     return ustr(final)
 284
 285 def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
 286     """ Append extra content at the end of an HTML snippet, trying
 287         to locate the end of the HTML document (</body>, </html>, or
 288         EOF), and converting the provided content in html unless ``plaintext``
 289         is False.
 290         Content conversion can be done in two ways:
 291         - wrapping it into a pre (preserve=True)
 292         - use plaintext2html (preserve=False, using container_tag to wrap the
 293             whole content)
 294         A side-effect of this method is to coerce all HTML tags to
 295         lowercase in ``html``, and strip enclosing <html> or <body> tags in
 296         content if ``plaintext`` is False.
 297
 298         :param str html: html tagsoup (doesn't have to be XHTML)
 299         :param str content: extra content to append
 300         :param bool plaintext: whether content is plaintext and should
 301             be wrapped in a <pre/> tag.
 302         :param bool preserve: if content is plaintext, wrap it into a <pre>
 303             instead of converting it into html
 304     """
 305     html = ustr(html)
 306     if plaintext and preserve:
 307         content = u'\n<pre>%s</pre>\n' % ustr(content)
 308     elif plaintext:
 309         content = '\n%s\n' % plaintext2html(content, container_tag)
 310     else:
 311         content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
 312         content = u'\n%s\n' % ustr(content)
 313     # Force all tags to lowercase
 314     html = re.sub(r'(</?)\W*(\w+)([ >])',
 315         lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
 316     insert_location = html.find('</body>')
 317     if insert_location == -1:
 318         insert_location = html.find('</html>')
 319     if insert_location == -1:
 320         return '%s%s' % (html, content)
 321     return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
 322
 323 #----------------------------------------------------------
 324 # Emails
 325 #----------------------------------------------------------
 326
# Finds an email address inside a text; the whole address is the only
# capturing group. Note that the TLD part only accepts 2 or 3 lowercase letters.
email_re = re.compile(r"""
    ([a-zA-Z][\w\.-]*[a-zA-Z0-9]     # username part
    @                                # mandatory @ sign
    [a-zA-Z0-9][\w\.-]*              # domain must start with a letter ... Ged> why do we include a 0-9 then?
     \.
     [a-z]{2,3}                      # TLD
    )
    """, re.VERBOSE)
# Matches a number between square brackets, e.g. "[42]"; group(1) = the digits
res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
# Matches a whole "Set-<name>: <value>" string, case-insensitively;
# group(1) = the name ; group(2) = the value
command_re = re.compile("^Set-([a-z]+) *: *(.+)$", re.I + re.UNICODE)

# Updated in 7.0 to match the model name as well
# Typical form of references is <timestamp-openerp-record_id-model_name@domain>
# group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
reference_re = re.compile("<.*-open(?:object|erp)-(\\d+)(?:-([\w.]+))?.*@(.*)>", re.UNICODE)
 342
 343 def generate_tracking_message_id(res_id):
 344     """Returns a string that can be used in the Message-ID RFC822 header field
 345
 346        Used to track the replies related to a given object thanks to the "In-Reply-To"
 347        or "References" fields that Mail User Agents will set.
 348     """
 349     try:
 350         rnd = random.SystemRandom().random()
 351     except NotImplementedError:
 352         rnd = random.random()
 353     rndstr = ("%.15f" % rnd)[2:]
 354     return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
 355
 356 def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
 357                attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
 358                smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
 359     """Low-level function for sending an email (deprecated).
 360
 361     :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
 362     :param email_from: A string used to fill the `From` header, if falsy,
 363                        config['email_from'] is used instead.  Also used for
 364                        the `Reply-To` header if `reply_to` is not provided
 365     :param email_to: a sequence of addresses to send the mail to.
 366     """
 367
 368     # If not cr, get cr from current thread database
 369     if not cr:
 370         db_name = getattr(threading.currentThread(), 'dbname', None)
 371         if db_name:
 372             cr = pooler.get_db_only(db_name).cursor()
 373         else:
 374             raise Exception("No database cursor found, please pass one explicitly")
 375
 376     # Send Email
 377     try:
 378         mail_server_pool = pooler.get_pool(cr.dbname).get('ir.mail_server')
 379         res = False
 380         # Pack Message into MIME Object
 381         email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
 382                    attachments, message_id, references, openobject_id, subtype, headers=headers)
 383
 384         res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
 385                        smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
 386                        smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
 387     except Exception:
 388         _logger.exception("tools.email_send failed to deliver email")
 389         return False
 390     finally:
 391         cr.close()
 392     return res
 393
 394 def email_split(text):
 395     """ Return a list of the email addresses found in ``text`` """
 396     if not text:
 397         return []
 398     return re.findall(r'([^ ,<@]+@[^> ,]+)', text)