openerp/tools/mail.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Business Applications
   5 #    Copyright (C) 2012 OpenERP S.A. (<http://openerp.com>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21
  22 from lxml import etree
  23 import cgi
  24 import logging
  25 import lxml.html
  26 import openerp.pooler as pooler
  27 import operator
  28 import random
  29 import re
  30 import socket
  31 import threading
  32 import time
  33
  34 from openerp.loglevels import ustr
  35
  36 _logger = logging.getLogger(__name__)
  37
  38
  39 #----------------------------------------------------------
  40 # HTML Sanitizer
  41 #----------------------------------------------------------
  42
  43 def html_sanitize(src):
  44     if not src:
  45         return src
  46     src = ustr(src, errors='replace')
  47     root = lxml.html.fromstring(u"<div>%s</div>" % src)
  48     result = handle_element(root)
  49     res = []
  50     for element in children(result[0]):
  51         if isinstance(element, basestring):
  52             res.append(element)
  53         else:
  54             element.tail = ""
  55             res.append(lxml.html.tostring(element))
  56     return ''.join(res)
  57
  58 # FIXME: shouldn't this be a whitelist rather than a blacklist?!
  59 to_remove = set(["script", "head", "meta", "title", "link", "img"])
  60 to_unwrap = set(["html", "body"])
  61
  62 javascript_regex = re.compile(r"^\s*javascript\s*:.*$", re.IGNORECASE)
  63
  64 def handle_a(el, new):
  65     href = el.get("href", "#")
  66     if javascript_regex.search(href):
  67         href = "#"
  68     new.set("href", href)
  69
  70 special = {
  71     "a": handle_a,
  72 }
  73
  74 def handle_element(element):
  75     if isinstance(element, basestring):
  76         return [element]
  77     if element.tag in to_remove:
  78         return []
  79     if element.tag in to_unwrap:
  80         return reduce(operator.add, [handle_element(x) for x in children(element)])
  81     result = lxml.html.fromstring("<%s />" % element.tag)
  82     for c in children(element):
  83         append_to(handle_element(c), result)
  84     if element.tag in special:
  85         special[element.tag](element, result)
  86     return [result]
  87
  88 def children(node):
  89     res = []
  90     if node.text is not None:
  91         res.append(node.text)
  92     for child_node in node.getchildren():
  93         res.append(child_node)
  94         if child_node.tail is not None:
  95             res.append(child_node.tail)
  96     return res
  97
  98 def append_to(elements, dest_node):
  99     for element in elements:
 100         if isinstance(element, basestring):
 101             children = dest_node.getchildren()
 102             if len(children) == 0:
 103                 dest_node.text = element
 104             else:
 105                 children[-1].tail = element
 106         else:
 107             dest_node.append(element)
 108
 109
 110 #----------------------------------------------------------
 111 # HTML Cleaner
 112 #----------------------------------------------------------
 113
 114 def html_email_clean(html):
 115     """ html_email_clean: clean the html to display in the web client.
 116         - strip email quotes (remove blockquote nodes)
 117         - strip signatures (remove --\n{\n)Blahblah), by replacing <br> by
 118             \n to avoid ignoring signatures converted into html
 119
 120         :param string html: sanitized html; tags like html or head should not
 121             be present in the html string. This method therefore takes as input
 122             html code coming from a sanitized source, like fields.html.
 123     """
 124     modified_html = ''
 125
 126     # 1. <br[ /]> -> \n, because otherwise the tree is obfuscated
 127     br_tags = re.compile(r'([<]\s*br\s*\/?[>])')
 128     idx = 0
 129     for item in re.finditer(br_tags, html):
 130         modified_html += html[idx:item.start()] + '__BR_TAG__'
 131         idx = item.end()
 132     modified_html += html[idx:]
 133     html = modified_html
 134
 135     # 2. form a tree, handle (currently ?) pure-text by enclosing them in a pre
 136     root = lxml.html.fromstring(html)
 137     if not len(root) and root.text is None and root.tail is None:
 138         html = '<div>%s</div>' % html
 139         root = lxml.html.fromstring(html)
 140
 141     # 3. remove blockquotes
 142     quotes = [el for el in root.getiterator(tag='blockquote')]
 143     for node in quotes:
 144         node.getparent().remove(node)
 145
 146     # 4. strip signatures
 147     signature = re.compile(r'([-]{2}[\s]?[\r\n]{1,2}[^\z]+)')
 148     for elem in root.getiterator():
 149         if elem.text:
 150             match = re.search(signature, elem.text)
 151             if match:
 152                 elem.text = elem.text[:match.start()] + elem.text[match.end():]
 153         if elem.tail:
 154             match = re.search(signature, elem.tail)
 155             if match:
 156                 elem.tail = elem.tail[:match.start()] + elem.tail[match.end():]
 157
 158     # 5. \n back to <br/>
 159     html = etree.tostring(root, pretty_print=True)
 160     html = html.replace('__BR_TAG__', '<br />')
 161
 162     # 6. Misc cleaning :
 163     # - ClEditor seems to love using <div><br /><div> -> replace with <br />
 164     modified_html = ''
 165     br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)')
 166     idx = 0
 167     for item in re.finditer(br_div_tags, html):
 168         modified_html += html[idx:item.start()] + '<br />'
 169         idx = item.end()
 170     modified_html += html[idx:]
 171     html = modified_html
 172
 173     return html
 174
 175
 176 #----------------------------------------------------------
 177 # HTML/Text management
 178 #----------------------------------------------------------
 179
 180 def html2plaintext(html, body_id=None, encoding='utf-8'):
 181     """ From an HTML text, convert the HTML to plain text.
 182     If @param body_id is provided then this is the tag where the
 183     body (not necessarily <body>) starts.
 184     """
 185     ## (c) Fry-IT, www.fry-it.com, 2007
 186     ## <peter@fry-it.com>
 187     ## download here: http://www.peterbe.com/plog/html2plaintext
 188
 189     html = ustr(html)
 190
 191     from lxml.etree import tostring, fromstring, HTMLParser
 192     tree = fromstring(html, parser=HTMLParser())
 193
 194     if body_id is not None:
 195         source = tree.xpath('//*[@id=%s]' % (body_id,))
 196     else:
 197         source = tree.xpath('//body')
 198     if len(source):
 199         tree = source[0]
 200
 201     url_index = []
 202     i = 0
 203     for link in tree.findall('.//a'):
 204         url = link.get('href')
 205         if url:
 206             i += 1
 207             link.tag = 'span'
 208             link.text = '%s [%s]' % (link.text, i)
 209             url_index.append(url)
 210
 211     html = ustr(tostring(tree, encoding=encoding))
 212
 213     html = html.replace('<strong>', '*').replace('</strong>', '*')
 214     html = html.replace('<b>', '*').replace('</b>', '*')
 215     html = html.replace('<h3>', '*').replace('</h3>', '*')
 216     html = html.replace('<h2>', '**').replace('</h2>', '**')
 217     html = html.replace('<h1>', '**').replace('</h1>', '**')
 218     html = html.replace('<em>', '/').replace('</em>', '/')
 219     html = html.replace('<tr>', '\n')
 220     html = html.replace('</p>', '\n')
 221     html = re.sub('<br\s*/?>', '\n', html)
 222     html = re.sub('<.*?>', ' ', html)
 223     html = html.replace(' ' * 2, ' ')
 224
 225     # strip all lines
 226     html = '\n'.join([x.strip() for x in html.splitlines()])
 227     html = html.replace('\n' * 2, '\n')
 228
 229     for i, url in enumerate(url_index):
 230         if i == 0:
 231             html += '\n\n'
 232         html += ustr('[%s] %s\n') % (i + 1, url)
 233
 234     return html
 235
 236 def text2html(text, container_tag='div'):
 237     """ Convert plaintext into html. Content of the text is escaped to manage
 238         html entities, using cgi.escape().
 239         - all \n,\r are replaced by <br />
 240         - enclose content into <p>
 241         - 2 or more consecutive <br /> are considered as paragraph breaks
 242
 243         :param string container_tag: container of the html; by default the
 244             content is embedded into a <div>
 245     """
 246     text = cgi.escape(text)
 247
 248     # 1. replace \n and \r
 249     text = text.replace('\n', '<br/>')
 250     text = text.replace('\r', '<br/>')
 251
 252     # 2-3: form paragraphs
 253     idx = 0
 254     final = '<p>'
 255     br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
 256     for item in re.finditer(br_tags, text):
 257         final += text[idx:item.start()] + '</p><p>'
 258         idx = item.end()
 259     final += text[idx:] + '</p>'
 260
 261     # 4. container
 262     if container_tag:
 263         final = '<%s>%s</%s>' % (container_tag, final, container_tag)
 264     return final
 265
 266 #----------------------------------------------------------
 267 # Emails
 268 #----------------------------------------------------------
 269
 270 email_re = re.compile(r"""
 271     ([a-zA-Z][\w\.-]*[a-zA-Z0-9]     # username part
 272     @                                # mandatory @ sign
 273     [a-zA-Z0-9][\w\.-]*              # domain must start with a letter ... Ged> why do we include a 0-9 then?
 274      \.
 275      [a-z]{2,3}                      # TLD
 276     )
 277     """, re.VERBOSE)
 278 res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
 279 command_re = re.compile("^Set-([a-z]+) *: *(.+)$", re.I + re.UNICODE)
 280
 281 # Updated in 7.0 to match the model name as well
 282 # Typical form of references is <timestamp-openerp-record_id-model_name@domain>
 283 # group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
 284 reference_re = re.compile("<.*-open(?:object|erp)-(\\d+)(?:-([\w.]+))?.*@(.*)>", re.UNICODE)
 285
 286 def generate_tracking_message_id(res_id):
 287     """Returns a string that can be used in the Message-ID RFC822 header field
 288
 289        Used to track the replies related to a given object thanks to the "In-Reply-To"
 290        or "References" fields that Mail User Agents will set.
 291     """
 292     try:
 293         rnd = random.SystemRandom().random()
 294     except NotImplementedError:
 295         rnd = random.random()
 296     rndstr = ("%.15f" % rnd)[2:]
 297     return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
 298
 299 def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
 300                attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
 301                smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
 302     """Low-level function for sending an email (deprecated).
 303
 304     :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
 305     :param email_from: A string used to fill the `From` header, if falsy,
 306                        config['email_from'] is used instead.  Also used for
 307                        the `Reply-To` header if `reply_to` is not provided
 308     :param email_to: a sequence of addresses to send the mail to.
 309     """
 310
 311     # If not cr, get cr from current thread database
 312     if not cr:
 313         db_name = getattr(threading.currentThread(), 'dbname', None)
 314         if db_name:
 315             cr = pooler.get_db_only(db_name).cursor()
 316         else:
 317             raise Exception("No database cursor found, please pass one explicitly")
 318
 319     # Send Email
 320     try:
 321         mail_server_pool = pooler.get_pool(cr.dbname).get('ir.mail_server')
 322         res = False
 323         # Pack Message into MIME Object
 324         email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
 325                    attachments, message_id, references, openobject_id, subtype, headers=headers)
 326
 327         res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
 328                        smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
 329                        smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
 330     except Exception:
 331         _logger.exception("tools.email_send failed to deliver email")
 332         return False
 333     finally:
 334         cr.close()
 335     return res
 336
 337 def email_split(text):
 338     """ Return a list of the email addresses found in ``text`` """
 339     if not text:
 340         return []
 341     return re.findall(r'([^ ,<@]+@[^> ,]+)', text)
 342
 343 def append_content_to_html(html, content, plaintext=True):
 344     """Append extra content at the end of an HTML snippet, trying
 345        to locate the end of the HTML document (</body>, </html>, or
 346        EOF), and wrapping the provided content in a <pre/> block
 347        unless ``plaintext`` is False. A side-effect of this
 348        method is to coerce all HTML tags to lowercase in ``html``,
 349        and strip enclosing <html> or <body> tags in content if
 350        ``plaintext`` is False.
 351
 352        :param str html: html tagsoup (doesn't have to be XHTML)
 353        :param str content: extra content to append
 354        :param bool plaintext: whether content is plaintext and should
 355            be wrapped in a <pre/> tag.
 356     """
 357     html = ustr(html)
 358     if plaintext:
 359         content = u'\n<pre>%s</pre>\n' % ustr(content)
 360     else:
 361         content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
 362         content = u'\n%s\n' % ustr(content)
 363     # Force all tags to lowercase
 364     html = re.sub(r'(</?)\W*(\w+)([ >])',
 365         lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
 366     insert_location = html.find('</body>')
 367     if insert_location == -1:
 368         insert_location = html.find('</html>')
 369     if insert_location == -1:
 370         return '%s%s' % (html, content)
 371     return '%s%s%s' % (html[:insert_location], content, html[insert_location:])