1 # -*- coding: utf-8 -*-
2 ##############################################################################
4 # OpenERP, Open Source Business Applications
5 # Copyright (C) 2012-2013 OpenERP S.A. (<http://openerp.com>).
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as
9 # published by the Free Software Foundation, either version 3 of the
10 # License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ##############################################################################
22 from lxml import etree
26 import lxml.html.clean as clean
34 from openerp.loglevels import ustr
36 _logger = logging.getLogger(__name__)
39 #----------------------------------------------------------
41 #----------------------------------------------------------
# Tag names handed to the lxml Cleaner as ``kill_tags`` by html_sanitize.
tags_to_kill = [
    "script", "head", "meta", "title", "link", "style",
    "frame", "iframe", "base", "object", "embed",
]
# Tag names handed to the lxml Cleaner as ``remove_tags`` by html_sanitize.
tags_to_remove = ['html', 'body', 'font']

# allow new semantic HTML5 tags
allowed_tags = clean.defs.tags | frozenset([
    'article', 'section', 'header', 'footer',
    'hgroup', 'nav', 'aside', 'figure',
])
# lxml's default safe attributes, plus ``style``
safe_attrs = clean.defs.safe_attrs | frozenset(['style'])
def html_sanitize(src, silent=True):
    """Sanitize an HTML snippet using ``lxml.html.clean.Cleaner``.

    :param src: html to sanitize; a falsy value is returned unchanged
    :param bool silent: when True (default), errors raised while cleaning are
                        logged as warnings and a placeholder paragraph is
                        returned; when False they are re-raised
    :return: the cleaned html; an empty string when lxml reports an empty
             document; a ``<p>...</p>`` placeholder when cleaning failed and
             ``silent`` is set
    """
    if not src:
        return src
    src = ustr(src, errors='replace')

    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)

    kwargs = {
        'page_structure': True,
        'style': False,             # do not remove style attributes
        'forms': True,              # remove form tags
        'remove_unknown_tags': False,
        'allow_tags': allowed_tags,
    }
    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        kwargs.update({
            'kill_tags': tags_to_kill,
            'remove_tags': tags_to_remove,
        })
    else:
        kwargs['remove_tags'] = tags_to_kill + tags_to_remove

    # NOTE(review): applied unconditionally here; confirm it was not meant to
    # live in the ``else`` branch above.
    kwargs['safe_attrs_only'] = False

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = clean.Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
    except etree.ParserError as e:
        # an "empty document" parser error simply means there is nothing to keep
        if 'empty' in str(e):
            return ""
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>Unknown error when sanitizing</p>'
    return cleaned
99 #----------------------------------------------------------
101 #----------------------------------------------------------
def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_options=None,
                     protect_sections=False):
    """ html_email_clean: clean the html by doing the following steps:

     - try to strip email quotes, by removing blockquotes or having some client-
       specific heuristics
     - try to strip signatures
     - shorten the html to a maximum number of characters if requested

    Some specific use case:

     - MsOffice: ``div.style = border-top:solid;`` delimitates the beginning of
       a quote; detecting by finding WordSection1 or MsoNormal
     - Hotmail: ``hr.stopSpelling`` delimitates the beginning of a quote; detect
       Hotmail by finding ``SkyDrivePlaceholder``

    :param string html: sanitized html; tags like html or head should not
                        be present in the html string. This method therefore
                        takes as input html code coming from a sanitized source.
    :param boolean remove: remove the html code that is unwanted; otherwise it
                           is only flagged and tagged (class ``oe_mail_cleaned``)
    :param boolean shorten: shorten the html; every excess content will
                            be flagged as to remove
    :param int max_length: if shortening, maximum number of characters before
                           shortening
    :param dict expand_options: options for the read more link when shortening
                                the content. The used keys are the following:

                                 - oe_expand_container_tag: tag of the
                                   container of the whole read more link
                                   (default: span)
                                 - oe_expand_container_class: class applied to the
                                   link container (default: oe_mail_expand)
                                 - oe_expand_container_content: content of the
                                   container (default: ...)
                                 - oe_expand_separator_node: optional separator, like
                                   adding ... <br /><br /> <a ...>read more</a> (default: void)
                                 - oe_expand_a_href: href of the read more link itself
                                   (default: #)
                                 - oe_expand_a_class: class applied to the <a> containing
                                   the link itself (default: oe_mail_expand)
                                 - oe_expand_a_content: content of the <a> (default: read more)

                                The formatted read more link is the following:
                                <cont_tag class="oe_expand_container_class">
                                    oe_expand_container_content
                                    if expand_options.get('oe_expand_separator_node'):
                                        <oe_expand_separator_node/>
                                    <a href="oe_expand_a_href" class="oe_expand_a_class">
                                        oe_expand_a_content
                                    </a>
                                </cont_tag>
    :param boolean protect_sections: when shortening, do not cut inside a
                                     ``section`` element: the truncation is
                                     moved up to the enclosing section
    :return: the cleaned html; ``html`` itself when it is falsy or not a string
    """
    def _replace_matching_regex(regex, source, replace=''):
        """ Replace all matching expressions in source by replace """
        if not source:
            return source
        # a callable replacement keeps ``replace`` literal (no backreference expansion)
        return re.sub(regex, lambda _match: replace, source)

    def _create_node(tag, text, tail=None, attrs=None):
        """ Build a detached element with the given text, tail and attributes """
        new_node = etree.Element(tag)
        new_node.text = text
        new_node.tail = tail
        for key, val in (attrs or {}).items():
            new_node.set(key, val)
        return new_node

    def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs=None):
        """ Create a node and insert it as child of ``node`` at ``index`` """
        new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs)
        node.insert(index, new_node)
        return new_node

    def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs=None):
        """ Split ``node.text`` around the matches of ``regex``: each match is
        moved into a new child element carrying ``new_node_attrs``, the text
        in between is moved into plain child elements. """
        text = node.text or ''
        if not re.search(regex, text):
            return

        cur_node = node
        node.text = ''
        idx, iteration = 0, 0
        for item in re.finditer(regex, text):
            if iteration == 0:
                cur_node.text = text[idx:item.start()]
            else:
                _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
            new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)

            cur_node = new_node
            idx = item.end()
            iteration += 1
        # remaining text after the last match
        _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})

    def _truncate_node(node, position, simplify_whitespaces=True):
        """ Truncate a node text at a given position. This algorithm will shorten
        at the end of the word whose ending character exceeds position.

            :param bool simplify_whitespaces: whether to try to count all successive
                                              whitespaces as one character. This
                                              option should not be True when trying
                                              to keep 'pre' consistency.
        """
        if node.text is None:
            node.text = ''

        truncate_idx = -1
        if simplify_whitespaces:
            cur_char_nbr = 0
            # walk the words in order so that the cut lands at the end of the
            # word that actually crosses ``position`` (and not at an earlier
            # occurrence of the same word); whitespaces are not counted
            for word_match in re.finditer(r'\S+', node.text, re.UNICODE):
                cur_char_nbr += len(word_match.group(0))
                truncate_idx = word_match.end()
                if cur_char_nbr >= position:
                    break
        else:
            truncate_idx = position
        if truncate_idx == -1 or truncate_idx > len(node.text):
            truncate_idx = len(node.text)

        # compose new text bits
        innertext = node.text[0:truncate_idx]
        outertext = node.text[truncate_idx:]
        node.text = innertext

        # create <span> ... <a href="#">read more</a></span> node
        read_more_node = _create_node(
            expand_options.get('oe_expand_container_tag', 'span'),
            expand_options.get('oe_expand_container_content', ' ... '),
            None,
            {'class': expand_options.get('oe_expand_container_class', 'oe_mail_expand')}
        )
        if expand_options.get('oe_expand_separator_node'):
            read_more_separator_node = _create_node(
                expand_options.get('oe_expand_separator_node'),
                '',
                None,
                {}
            )
            read_more_node.append(read_more_separator_node)
        read_more_link_node = _create_node(
            'a',
            expand_options.get('oe_expand_a_content', 'read more'),
            None,
            {
                'href': expand_options.get('oe_expand_a_href', '#'),
                'class': expand_options.get('oe_expand_a_class', 'oe_mail_expand'),
            }
        )
        read_more_node.append(read_more_link_node)
        # create outertext node
        overtext_node = _create_node('span', outertext)
        # tag node
        overtext_node.set('in_overlength', '1')
        # add newly created nodes in dom
        node.append(read_more_node)
        node.append(overtext_node)

    if expand_options is None:
        expand_options = {}

    if not html or not isinstance(html, basestring):
        return html
    html = ustr(html)

    # Pre processing
    # ------------------------------------------------------------
    # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'

    # html: remove encoding attribute inside tags
    doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
    html = doctype.sub(r"", html)

    # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
    br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
    html = _replace_matching_regex(br_div_tags, html, '<br />')

    # form a tree
    root = lxml.html.fromstring(html)
    if not len(root) and root.text is None and root.tail is None:
        html = '<div>%s</div>' % html
        root = lxml.html.fromstring(html)

    quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
    signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[\s\S]+)')
    for node in root.iter():
        # remove all tails and replace them by a span element, because managing text and tails can be a pain in the ass
        if node.tail:
            tail_node = _create_node('span', node.tail)
            node.tail = None
            node.addnext(tail_node)

        # form node and tag text-based quotes and signature
        _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
        _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})

    # Processing
    # ------------------------------------------------------------

    # tree: tag nodes
    # signature_begin = False  # try dynamic signature recognition
    quote_begin = False
    overlength = False
    overlength_section_id = None
    overlength_section_count = 0
    cur_char_nbr = 0
    for node in root.iter():
        # root: try to tag the client used to write the html
        if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
            root.set('msoffice', '1')
        if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
            root.set('hotmail', '1')

        # protect sections by tagging section limits and blocks contained inside sections, using an increasing id to re-find them later
        if node.tag == 'section':
            overlength_section_count += 1
            node.set('section_closure', str(overlength_section_count))
        if node.getparent() is not None and (node.getparent().get('section_closure') or node.getparent().get('section_inner')):
            node.set('section_inner', str(overlength_section_count))

        # state of the parsing: flag quotes and tails to remove
        if quote_begin:
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        # state of the parsing: flag when being in over-length content, depending on section content if defined (only when having protect_sections)
        if overlength:
            if not overlength_section_id or int(node.get('section_inner', overlength_section_count + 1)) > overlength_section_count:
                node.set('in_overlength', '1')
                node.set('tail_remove', '1')

        # find quote in msoffice / hotmail / blockquote / text quote and signatures
        if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
            node.set('in_quote', '1')

        # shorten:
        # if protect section:
        #   1/ find the first parent not being inside a section
        #   2/ add the read more link
        # else:
        #   1/ truncate the text at the next available space
        #   2/ create a 'read more' node, next to current node
        #   3/ add the truncated text in a new node, next to 'read more' node
        node_text = (node.text or '').strip().strip('\n').strip()
        if shorten and not overlength and cur_char_nbr + len(node_text) > max_length:
            node_to_truncate = node
            while node_to_truncate.getparent() is not None:
                if node_to_truncate.get('in_quote'):
                    node_to_truncate = node_to_truncate.getparent()
                elif protect_sections and (node_to_truncate.getparent().get('section_inner') or node_to_truncate.getparent().get('section_closure')):
                    node_to_truncate = node_to_truncate.getparent()
                    overlength_section_id = node_to_truncate.get('section_closure')
                else:
                    break

            overlength = True
            node_to_truncate.set('truncate', '1')
            if node_to_truncate == node:
                node_to_truncate.set('truncate_position', str(max_length - cur_char_nbr))
            else:
                node_to_truncate.set('truncate_position', str(len(node.text or '')))
        cur_char_nbr += len(node_text)

    # Tree modification
    # ------------------------------------------------------------

    for node in root.iter():
        if node.get('truncate'):
            _truncate_node(node, int(node.get('truncate_position', '0')), node.tag != 'pre')

    # Post processing
    # ------------------------------------------------------------

    to_remove = []
    for node in root.iter():
        if node.get('in_quote') or node.get('in_overlength'):
            # prepend the node tail to the tail of its parent
            if node.tail and not node.get('tail_remove'):
                parent = node.getparent()
                parent.tail = node.tail + (parent.tail or '')
            to_remove.append(node)
        if node.get('tail_remove'):
            node.tail = ''
        # clean node: drop the temporary working attributes
        for attribute_name in ['in_quote', 'tail_remove', 'in_overlength', 'msoffice', 'hotmail', 'truncate', 'truncate_position']:
            node.attrib.pop(attribute_name, None)
    for node in to_remove:
        if remove:
            node.getparent().remove(node)
        else:
            if expand_options.get('oe_expand_a_class', 'oe_mail_expand') not in node.get('class', ''):  # trick: read more link should be displayed even if it's in overlength
                node_class = node.get('class', '') + ' oe_mail_cleaned'
                node.set('class', node_class)

    # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
    html = etree.tostring(root, pretty_print=False)
    linebreaks = re.compile(r'<span[^>]*>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
    html = _replace_matching_regex(linebreaks, html, '\n')

    return html
421 #----------------------------------------------------------
422 # HTML/Text management
423 #----------------------------------------------------------
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.

    Links are replaced by their text followed by a ``[n]`` marker and the
    matching urls are listed at the end of the returned text.

    :param html: html source to convert
    :param body_id: value of the ``id`` attribute of the element to convert;
                    when None the ``<body>`` element is used
    :param str encoding: encoding used to serialize the tree
    :return: the plain text version of ``html``
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <peter@fry-it.com>
    ## download here: http://www.peterbe.com/plog/html2plaintext

    html = ustr(html)
    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        # Older callers may pass an already quoted id (it used to be
        # interpolated verbatim in the expression): drop those quotes.
        if isinstance(body_id, basestring) and len(body_id) >= 2 \
                and body_id[0] == body_id[-1] and body_id[0] in '"\'':
            body_id = body_id[1:-1]
        # bind the id as an XPath variable instead of building the expression
        # by hand, so that plain string ids match and need no escaping
        source = tree.xpath('//*[@id=$body_id]', body_id=body_id)
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            # link.text is None when the anchor starts with a child element
            link.text = '%s [%s]' % (link.text or '', i)
            url_index.append(url)

    html = ustr(etree.tostring(tree, encoding=encoding))
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub(r'<br\s*/?>', '\n', html)
    html = re.sub(r'<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    # list the collected urls after the text
    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (i + 1, url)

    return html
def plaintext2html(text, container_tag=False):
    """ Convert plaintext into html. Content of the text is escaped to manage
    html entities, using cgi.escape().
    - all \\n,\\r are replaced by <br />
    - enclose content into <p>
    - 2 or more consecutive <br /> are considered as paragraph breaks

    :param string container_tag: tag name used to wrap the resulting html;
                                 when falsy (default) no container is added
    :return: the html, as unicode
    """
    text = cgi.escape(ustr(text))

    # 1. replace \n and \r
    text = text.replace('\n', '<br/>')
    text = text.replace('\r', '<br/>')

    # 2-3: form paragraphs
    idx = 0
    final = '<p>'
    br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
    for item in re.finditer(br_tags, text):
        final += text[idx:item.start()] + '</p><p>'
        idx = item.end()
    final += text[idx:] + '</p>'

    # 4. container
    if container_tag:
        final = '<%s>%s</%s>' % (container_tag, final, container_tag)
    return ustr(final)
def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
    """ Append extra content at the end of an HTML snippet, trying
        to locate the end of the HTML document (</body>, </html>, or
        EOF), and converting the provided content in html unless ``plaintext``
        is False.
        Content conversion can be done in two ways:
        - wrapping it into a pre (preserve=True)
        - use plaintext2html (preserve=False, using container_tag to wrap the
            whole content)
        A side-effect of this method is to coerce all HTML tags to
        lowercase in ``html``, and strip enclosing <html> or <body> tags in
        content if ``plaintext`` is False.

        :param str html: html tagsoup (doesn't have to be XHTML)
        :param str content: extra content to append
        :param bool plaintext: whether content is plaintext and must be
            converted to html before being appended
        :param bool preserve: if content is plaintext, wrap it into a <pre>
            instead of converting it into html
        :param container_tag: passed to plaintext2html when content is
            converted (plaintext=True, preserve=False)
        :return: ``html`` with the content inserted before ``</body>``, else
            before ``</html>``, else appended at the end
    """
    html = ustr(html)
    if plaintext and preserve:
        content = u'\n<pre>%s</pre>\n' % ustr(content)
    elif plaintext:
        content = '\n%s\n' % plaintext2html(content, container_tag)
    else:
        # Strip only the html/body/doctype tags themselves: ``[^>]*`` stops at
        # the end of each tag, whereas a greedy ``.*`` would also swallow
        # everything between the first and the last tag of a line.
        content = re.sub(r'(?i)(</?html\b[^>]*>|</?body\b[^>]*>|<!\W*DOCTYPE[^>]*>)', '', content)
        content = u'\n%s\n' % ustr(content)
    # Force all tags to lowercase
    html = re.sub(r'(</?)\W*(\w+)([ >])',
        lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
    insert_location = html.find('</body>')
    if insert_location == -1:
        insert_location = html.find('</html>')
    if insert_location == -1:
        return '%s%s' % (html, content)
    return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
549 #----------------------------------------------------------
551 #----------------------------------------------------------
# matches any email in a body of text
email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", re.VERBOSE)

# matches a string containing only one email
single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$""", re.VERBOSE)

# matches a bracketed number such as [42]; group(1) = the number
res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
# matches a "Set-<name>: <value>" line; group(1) = the name ; group(2) = the value
command_re = re.compile(r"^Set-([a-z]+) *: *(.+)$", re.I | re.UNICODE)

# Updated in 7.0 to match the model name as well
# Typical form of references is <timestamp-openerp-record_id-model_name@domain>
# group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
reference_re = re.compile(r"<.*-open(?:object|erp)-(\d+)(?:-([\w.]+))?.*@(.*)>", re.UNICODE)

# Typical form of bounce is bounce-128-crm.lead-34@domain
# group(1) = the mail ID; group(2) = the model (if any); group(3) = the record ID
bounce_re = re.compile(r"[\w]+-(\d+)-?([\w.]+)?-?(\d+)?", re.UNICODE)
def generate_tracking_message_id(res_id):
    """Returns a string that can be used in the Message-ID RFC822 header field

       Used to track the replies related to a given object thanks to the "In-Reply-To"
       or "References" fields that Mail User Agents will set.

       :param res_id: identifier embedded in the message id
       :return: ``<timestamp.random-openerp-res_id@hostname>``
    """
    try:
        rnd = random.SystemRandom().random()
    except NotImplementedError:
        # fall back on the default generator
        rnd = random.random()
    # keep only the decimals of the random number
    rndstr = ("%.15f" % rnd)[2:]
    return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
               attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
               smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
    """Low-level function for sending an email (deprecated).

    :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
    :param email_from: A string used to fill the `From` header, if falsy,
                       config['email_from'] is used instead.  Also used for
                       the `Reply-To` header if `reply_to` is not provided
    :param email_to: a sequence of addresses to send the mail to.
    :param cr: database cursor; when not given, a cursor is opened on the
               database named by the ``dbname`` attribute of the current
               thread and closed before returning
    :param uid: user id used to send the email (1 when not given)
    :return: the result of ``ir.mail_server.send_email()``, or False when
             building or sending the email failed (the error is logged)
    :raise Exception: when no cursor is given and none can be obtained from
                      the current thread
    """

    # If not cr, get cr from current thread database
    local_cr = None
    if not cr:
        db_name = getattr(threading.current_thread(), 'dbname', None)
        if db_name:
            local_cr = cr = openerp.registry(db_name).db.cursor()
        else:
            raise Exception("No database cursor found, please pass one explicitly")

    # Send Email
    try:
        mail_server_pool = openerp.registry(cr.dbname)['ir.mail_server']
        res = False
        # Pack Message into MIME Object
        email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
                   attachments, message_id, references, openobject_id, subtype, headers=headers)

        res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
                       smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
                       smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
    except Exception:
        # best effort: failures are logged and reported as False
        _logger.exception("tools.email_send failed to deliver email")
        return False
    finally:
        # only close the cursor opened by this function
        if local_cr:
            cr.close()
    return res
def email_split(text):
    """ Return a list of the email addresses found in ``text``

    :param text: text to scan; a falsy value gives an empty list
    :return: list of the addresses, in order of appearance
    """
    if not text:
        return []
    return re.findall(r'([^ ,<@]+@[^> ,]+)', text)