1 # -*- coding: utf-8 -*-
2 ##############################################################################
4 # OpenERP, Open Source Business Applications
5 # Copyright (C) 2012-2013 OpenERP S.A. (<http://openerp.com>).
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as
9 # published by the Free Software Foundation, either version 3 of the
10 # License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ##############################################################################
22 from lxml import etree
26 import lxml.html.clean as clean
34 from openerp.loglevels import ustr
# Module-level logger, named after this module's import path.
_logger = logging.getLogger(__name__)
#----------------------------------------------------------
# HTML Sanitizer
#----------------------------------------------------------
# Tags handed to the cleaner as ``kill_tags`` when the installed lxml supports
# that option (see html_sanitize); otherwise they are merged into ``remove_tags``.
tags_to_kill = [
    "script", "head", "meta", "title", "link", "style",
    "frame", "iframe", "base", "object", "embed",
]
# Tags handed to the cleaner as ``remove_tags``.
tags_to_remove = ['html', 'body', 'font']

# allow new semantic HTML5 tags
allowed_tags = clean.defs.tags | frozenset([
    'article', 'section', 'header', 'footer',
    'hgroup', 'nav', 'aside', 'figure',
])
# keep the "style" attribute on top of lxml's default safe attributes
safe_attrs = clean.defs.safe_attrs | frozenset(['style'])
def html_sanitize(src, silent=True):
    """ Sanitize an html fragment using ``lxml.html.clean.Cleaner``.

    :param src: html to sanitize; a falsy value is returned untouched
    :param bool silent: when True, errors raised while cleaning are logged
                        and a short placeholder paragraph is returned
                        instead; when False they are re-raised
    :return: the cleaned html
    """
    if not src:
        return src
    src = ustr(src, errors='replace')

    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags: things like <john@example.com> are not real
    # tags, escape them so the cleaner does not drop them
    email_tag = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = email_tag.sub(lambda match: cgi.escape(match.group(1)), src)

    kwargs = dict(
        page_structure=True,
        style=False,                # do not remove style attributes
        forms=True,                 # remove form tags
        remove_unknown_tags=False,
        allow_tags=allowed_tags,
    )

    # kill_tags attribute has been added in version 2.3.1
    if etree.LXML_VERSION < (2, 3, 1):
        kwargs['remove_tags'] = tags_to_kill + tags_to_remove
    else:
        kwargs['kill_tags'] = tags_to_kill
        kwargs['remove_tags'] = tags_to_remove

    # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
    if etree.LXML_VERSION < (3, 1, 0):
        kwargs['safe_attrs_only'] = False
    else:
        kwargs['safe_attrs_only'] = True
        kwargs['safe_attrs'] = safe_attrs

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        return clean.Cleaner(**kwargs).clean_html(src)
    except etree.ParserError:
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        return '<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
        return '<p>Unknown error when sanitizing</p>'
#----------------------------------------------------------
# HTML Cleaner
#----------------------------------------------------------
def html_email_clean(html, remove=False, shorten=False, max_length=300):
    """ html_email_clean: clean the html by doing the following steps:

     - try to strip email quotes, by removing blockquotes or having some client-
       specific heuristics
     - try to strip signatures
     - shorten the html to a maximum number of characters if requested

    Some specific use case:

     - MsOffice: ``div.style = border-top:solid;`` delimits the beginning of
       a quote; detected by finding WordSection1 or MsoNormal
     - Hotmail: ``hr.stopSpelling`` delimits the beginning of a quote; detect
       Hotmail by finding ``SkyDrivePlaceholder``

    :param string html: sanitized html; tags like html or head should not
                        be present in the html string. This method therefore
                        takes as input html code coming from a sanitized source,
                        like fields.html.
    :param boolean remove: remove the html code that is unwanted; otherwise it
                           is only flagged and tagged (class ``oe_mail_cleaned``)
    :param boolean shorten: shorten the html; every exceeding content will
                            be flagged as to remove
    :param int max_length: if shortening, maximum number of characters before
                           shortening
    :return: the cleaned html; a falsy or non-string ``html`` is returned
             unchanged
    """
    def _replace_matching_regex(regex, source, replace=''):
        """ Replace all matching expressions in source by replace """
        if not source:
            return source
        # a callable replacement keeps ``replace`` literal (no escape processing)
        return regex.sub(lambda _match: replace, source)

    def _create_node(tag, text, tail=None, attrs=None):
        """ Build a detached element with the given text, tail and attributes """
        new_node = etree.Element(tag)
        new_node.text = text
        new_node.tail = tail
        for key, val in (attrs or {}).items():
            new_node.set(key, val)
        return new_node

    def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs=None):
        """ Create a node and insert it into ``node`` children at ``index`` """
        new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs)
        node.insert(index, new_node)
        return new_node

    def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs=None):
        """ Move every part of ``node.text`` matching ``regex`` into a new child
        carrying ``new_node_attrs``; text found between and after the matches
        is moved into plain children of the same tag. """
        text = node.text or ''
        if not re.search(regex, text):
            return

        cur_node = node
        node.text = ''
        idx, iteration = 0, 0
        for item in re.finditer(regex, text):
            if iteration == 0:
                cur_node.text = text[idx:item.start()]
            else:
                _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
            new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)

            cur_node = new_node
            idx = item.end()
            iteration += 1
        # NOTE(review): insert(-1, ...) follows list semantics, i.e. the node is
        # placed before the current last child, not appended -- confirm this is
        # the intended position for the remaining text.
        _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})

    def _truncate_node(node, position, simplify_whitespaces=True):
        """ Truncate a node text at a given position. This algorithm will shorten
        at the end of the word whose ending character exceeds position.

            :param bool simplify_whitespaces: whether to try to count all successive
                                              whitespaces as one character. This
                                              option should not be True when trying
                                              to keep 'pre' consistency.
        """
        if node.text is None:
            node.text = ''

        truncate_idx = -1
        if simplify_whitespaces:
            # Walk the words with their real offsets: looking the word up with
            # str.find() would return its *first* occurrence and cut too early
            # when the same word appears several times in the text.
            cur_char_nbr = 0
            for word_match in re.finditer(r'\S+', node.text, re.UNICODE):
                cur_char_nbr += len(word_match.group())
                truncate_idx = word_match.end()
                if cur_char_nbr >= position:
                    break
        else:
            truncate_idx = position
        if truncate_idx == -1 or truncate_idx > len(node.text):
            truncate_idx = len(node.text)

        # compose new text bits
        innertext = node.text[0:truncate_idx]
        outertext = node.text[truncate_idx:]
        node.text = innertext

        # create <span> ... <a href="#">read more</a></span> node
        read_more_node = _create_node('span', ' ... ', None, {'class': 'oe_mail_expand'})
        read_more_link_node = _create_node('a', 'read more', None, {'href': '#', 'class': 'oe_mail_expand'})
        read_more_node.append(read_more_link_node)
        # create outertext node
        overtext_node = _create_node('span', outertext)
        # tag node
        overtext_node.set('in_overlength', '1')
        # add newly created nodes in dom
        node.append(read_more_node)
        node.append(overtext_node)

    if not html or not isinstance(html, basestring):
        return html
    html = ustr(html)

    # Pre processing
    # ------------------------------------------------------------
    # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'

    # html: remove encoding attribute inside tags
    doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
    html = doctype.sub(r"", html)

    # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
    br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
    html = _replace_matching_regex(br_div_tags, html, '<br />')

    # form a tree
    root = lxml.html.fromstring(html)
    if not len(root) and root.text is None and root.tail is None:
        html = '<div>%s</div>' % html
        root = lxml.html.fromstring(html)

    quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
    signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[\s\S]+)')
    for node in root.iter():
        # remove all tails and replace them by a span element, because
        # managing both text and tails is painful
        if node.tail:
            tail_node = _create_node('span', node.tail)
            node.tail = None
            node.addnext(tail_node)

        # form node and tag text-based quotes and signature
        _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
        _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})

    # Processing
    # ------------------------------------------------------------

    # tree: tag nodes
    quote_begin = False
    overlength = False
    cur_char_nbr = 0
    for node in root.iter():
        # do not take into account multiple spaces, displayed as at most one
        # space in html
        node_text = ' '.join((node.text and node.text.strip(' \t\r\n') or '').split())

        # root: try to tag the client used to write the html
        if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
            root.set('msoffice', '1')
        if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
            root.set('hotmail', '1')

        # state of the parsing: flag quotes and tails to remove
        if quote_begin:
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        # state of the parsing: flag when being in over-length content
        if overlength:
            node.set('in_overlength', '1')
            node.set('tail_remove', '1')

        # find quote in msoffice / hotmail / blockquote / text quote and signatures
        if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
            node.set('in_quote', '1')

        # shorten:
        # 1/ truncate the text at the next available space
        # 2/ create a 'read more' node, next to current node
        # 3/ add the truncated text in a new node, next to 'read more' node
        if shorten and not overlength and cur_char_nbr + len(node_text) > max_length:
            node_to_truncate = node
            while node_to_truncate.get('in_quote') and node_to_truncate.getparent() is not None:
                node_to_truncate = node_to_truncate.getparent()
            overlength = True
            node_to_truncate.set('truncate', '1')
            if node_to_truncate == node:
                node_to_truncate.set('truncate_position', str(max_length - cur_char_nbr))
            else:
                node_to_truncate.set('truncate_position', str(len(node.text or '')))
        cur_char_nbr += len(node_text)

    # Tree modification
    # ------------------------------------------------------------

    for node in root.iter():
        if node.get('truncate'):
            _truncate_node(node, int(node.get('truncate_position', '0')), node.tag != 'pre')

    # Post processing
    # ------------------------------------------------------------

    to_remove = []
    for node in root.iter():
        if node.get('in_quote') or node.get('in_overlength'):
            # copy the node tail in front of the parent tail
            if node.tail and not node.get('tail_remove'):
                parent = node.getparent()
                parent.tail = node.tail + (parent.tail or '')
            to_remove.append(node)
        if node.get('tail_remove'):
            node.tail = ''
        # clean node: drop the temporary parsing markers
        for attribute_name in ['in_quote', 'tail_remove', 'in_overlength', 'msoffice', 'hotmail', 'truncate', 'truncate_position']:
            node.attrib.pop(attribute_name, None)
    for node in to_remove:
        if remove:
            node.getparent().remove(node)
        else:
            if 'oe_mail_expand' not in node.get('class', ''):  # trick: read more link should be displayed even if it's in overlength
                node_class = node.get('class', '') + ' oe_mail_cleaned'
                node.set('class', node_class)

    # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
    html = etree.tostring(root, pretty_print=False)
    linebreaks = re.compile(r'<span[^>]*>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
    html = _replace_matching_regex(linebreaks, html, '\n')

    return html
352 #----------------------------------------------------------
353 # HTML/Text management
354 #----------------------------------------------------------
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.

    Links are replaced by their text followed by a ``[n]`` marker; the
    matching urls are listed at the end of the returned text.

    :param html: html source; converted with ``ustr`` before parsing
    :param body_id: value used in the ``@id`` test selecting the root node
    :param str encoding: encoding used to serialize the parsed tree
    :return: the plain text version; an empty string for blank input
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <peter@fry-it.com>
    ## download here: http://www.peterbe.com/plog/html2plaintext

    html = ustr(html)
    if not html.strip():
        # the HTML parser cannot parse an empty document
        return u''
    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        # NOTE(review): body_id is interpolated unquoted, so a bare name is
        # evaluated as an XPath node test rather than a string literal;
        # callers presumably pass an already quoted value -- confirm before
        # switching to an XPath variable.
        source = tree.xpath('//*[@id=%s]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            # a link may have no direct text (e.g. it only wraps an image)
            link.text = '%s [%s]' % (link.text or '', i)
            url_index.append(url)

    html = ustr(etree.tostring(tree, encoding=encoding))
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub(r'<br\s*/?>', '\n', html)
    html = re.sub(r'<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (i + 1, url)

    return html
def plaintext2html(text, container_tag=False):
    """ Convert plaintext into html. Content of the text is escaped to manage
    html entities, using cgi.escape().

     - every line feed and carriage return is replaced by <br/>
     - the content is enclosed into <p>
     - 2 or more consecutive <br/> are considered as paragraph breaks

    :param string container_tag: tag wrapping the whole result; when falsy
                                 (the default) no container is added
    :return: the html, converted with ``ustr``
    """
    escaped = cgi.escape(ustr(text))

    # 1. replace \n and \r
    with_breaks = escaped.replace('\n', '<br/>').replace('\r', '<br/>')

    # 2-3: form paragraphs, splitting on runs of at least two <br/>
    paragraph_break = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
    result = '<p>' + paragraph_break.sub('</p><p>', with_breaks) + '</p>'

    # 4. container
    if container_tag:
        result = '<%s>%s</%s>' % (container_tag, result, container_tag)
    return ustr(result)
def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
    """ Append extra content at the end of an HTML snippet, trying
    to locate the end of the HTML document (</body>, </html>, or
    EOF), and converting the provided content in html unless ``plaintext``
    is False.

    Content conversion can be done in two ways:

     - wrapping it into a pre (preserve=True); the content is html-escaped
       so that it is displayed literally
     - use plaintext2html (preserve=False, using container_tag to wrap the
       whole content)

    A side-effect of this method is to coerce all HTML tags to
    lowercase in ``html``, and strip enclosing <html> or <body> tags in
    content if ``plaintext`` is False.

    :param str html: html tagsoup (doesn't have to be XHTML)
    :param str content: extra content to append
    :param bool plaintext: whether content is plaintext and must be
                           converted to html before being appended
    :param bool preserve: if content is plaintext, wrap it into a <pre>
                          instead of converting it into html
    :param container_tag: forwarded to plaintext2html when the content is
                          converted with it
    :return: ``html`` with the converted content inserted before the first
             </body>, else before the first </html>, else at the end
    """
    html = ustr(html)
    if plaintext and preserve:
        # escape the text: unescaped '<' or '&' would be parsed as markup
        # instead of being shown as typed
        content = u'\n<pre>%s</pre>\n' % cgi.escape(ustr(content))
    elif plaintext:
        content = '\n%s\n' % plaintext2html(content, container_tag)
    else:
        content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
        content = u'\n%s\n' % ustr(content)
    # Force all tags to lowercase
    html = re.sub(r'(</?)\W*(\w+)([ >])',
                  lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
    insert_location = html.find('</body>')
    if insert_location == -1:
        insert_location = html.find('</html>')
    if insert_location == -1:
        return '%s%s' % (html, content)
    return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
#----------------------------------------------------------
# Emails
#----------------------------------------------------------
# matches any email in a body of text
email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", re.VERBOSE)

# matches a string containing only one email
single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$""", re.VERBOSE)

# matches a number between square brackets; group(1) = the digits
res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
# matches a "Set-<name>: <value>" line; group(1) = the name ; group(2) = the value
command_re = re.compile(r"^Set-([a-z]+) *: *(.+)$", re.I | re.UNICODE)

# Updated in 7.0 to match the model name as well
# Typical form of references is <timestamp-openerp-record_id-model_name@domain>
# group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
reference_re = re.compile(r"<.*-open(?:object|erp)-(\d+)(?:-([\w.]+))?.*@(.*)>", re.UNICODE)

# Bounce regex
# Typical form of bounce is bounce-128-crm.lead-34@domain
# group(1) = the mail ID; group(2) = the model (if any); group(3) = the record ID
bounce_re = re.compile(r"[\w]+-(\d+)-?([\w.]+)?-?(\d+)?", re.UNICODE)
def generate_tracking_message_id(res_id):
    """Build a value suitable for the Message-ID RFC822 header field.

    The id embeds ``res_id`` so that replies, which carry it back in their
    "In-Reply-To" or "References" headers, can be related to the object.

    :param res_id: identifier inserted after the ``-openerp-`` marker
    :return: ``<timestamp.random-openerp-res_id@hostname>``
    """
    try:
        random_value = random.SystemRandom().random()
    except NotImplementedError:
        # no OS-level randomness source available on this platform
        random_value = random.random()
    # keep only the 15 decimals, dropping the leading "0."
    decimals = ("%.15f" % random_value)[2:]
    id_parts = (time.time(), decimals, res_id, socket.gethostname())
    return "<%.15f.%s-openerp-%s@%s>" % id_parts
def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
               attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
               smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
    """Low-level function for sending an email (deprecated).

    :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
    :param email_from: A string used to fill the `From` header, if falsy,
                       config['email_from'] is used instead. Also used for
                       the `Reply-To` header if `reply_to` is not provided
    :param email_to: a sequence of addresses to send the mail to.
    :return: the result of ``ir.mail_server.send_email()``, or False when
             building or sending the message raised an exception
    :raise Exception: when no cursor is given and the current thread has no
                      ``dbname`` attribute to open one from
    """
    # Without an explicit cursor, open one on the database of the current
    # thread and remember that it has to be closed here.
    own_cursor = None
    if not cr:
        db_name = getattr(threading.currentThread(), 'dbname', None)
        if not db_name:
            raise Exception("No database cursor found, please pass one explicitly")
        own_cursor = cr = openerp.registry(db_name).db.cursor()

    # Send Email
    try:
        mail_server = openerp.registry(cr.dbname)['ir.mail_server']
        # Pack Message into MIME Object
        message = mail_server.build_email(
            email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
            attachments, message_id, references, openobject_id, subtype, headers=headers)
        encryption = 'ssl' if ssl else None
        return mail_server.send_email(
            cr, uid or 1, message, mail_server_id=None,
            smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
            smtp_encryption=encryption, smtp_debug=debug)
    except Exception:
        _logger.exception("tools.email_send failed to deliver email")
        return False
    finally:
        if own_cursor:
            cr.close()
def email_split(text):
    """ Extract the email addresses contained in ``text``.

    :param text: string to scan; a falsy value yields an empty list
    :return: list of the addresses found, in order of appearance
    """
    return re.findall(r'([^ ,<@]+@[^> ,]+)', text) if text else []