1 # -*- coding: utf-8 -*-
2 ##############################################################################
4 # OpenERP, Open Source Business Applications
5 # Copyright (C) 2012-TODAY OpenERP S.A. (<http://openerp.com>).
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as
9 # published by the Free Software Foundation, either version 3 of the
10 # License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ##############################################################################
import cgi
import logging
import random
import re
import socket
import threading
import time
from email.utils import getaddresses

import lxml.html
import lxml.html.clean as clean
from lxml import etree

import openerp
from openerp.loglevels import ustr
_logger = logging.getLogger(__name__)


#----------------------------------------------------------
# HTML Sanitizer
#----------------------------------------------------------

# Passed to the lxml Cleaner as ``kill_tags`` when the installed lxml supports it.
tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
# Passed to the lxml Cleaner as ``remove_tags``.
tags_to_remove = ['html', 'body', 'font']

# allow new semantic HTML5 tags
allowed_tags = clean.defs.tags | frozenset('article section header footer hgroup nav aside figure main'.split() + [etree.Comment])
# Attributes kept when sanitizing in strict mode (``safe_attrs_only``).
# NOTE(review): 'style' is listed because the strict branch of html_sanitize
# relies on safe_attrs "in order to keep style" -- confirm against upstream.
safe_attrs = clean.defs.safe_attrs | frozenset([
    'style',
    'data-oe-model', 'data-oe-id', 'data-oe-field', 'data-oe-type', 'data-oe-expression', 'data-oe-translate', 'data-oe-nodeid',
    'data-snippet-id', 'data-publish', 'data-id', 'data-res_id', 'data-member_id', 'data-view-id',
])
def html_sanitize(src, silent=True, strict=False):
    """ Sanitize an html string with the lxml ``Cleaner``.

    Email-like pseudo tags (``<john@doe.com>``) and mako delimiters
    (``<% ... %>``) are html-escaped before cleaning so that the cleaner does
    not strip them; mako delimiters and a few url-quoted characters are
    restored afterwards.

    :param src: html to sanitize; falsy values are returned unchanged
    :param bool silent: when True, errors are logged and replaced by a
        placeholder paragraph instead of being raised
    :param bool strict: when True (and lxml >= 3.1.0), only attributes listed
        in ``safe_attrs`` are kept
    :return: the sanitized html
    """
    if not src:
        return src
    src = ustr(src, errors='replace')

    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)
    # html encode mako tags <% ... %> to decode them later and keep them alive, otherwise they are stripped by the cleaner
    src = src.replace('<%', cgi.escape('<%'))
    src = src.replace('%>', cgi.escape('%>'))

    kwargs = {
        'page_structure': True,
        'style': False,             # do not remove style attributes
        'forms': True,              # remove form tags
        'remove_unknown_tags': False,
        'allow_tags': allowed_tags,
        'comments': False,          # allowed_tags contains etree.Comment: keep comments
        'processing_instructions': False
    }
    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        kwargs.update({
            'kill_tags': tags_to_kill,
            'remove_tags': tags_to_remove,
        })
    else:
        kwargs['remove_tags'] = tags_to_kill + tags_to_remove

    if strict:
        if etree.LXML_VERSION >= (3, 1, 0):
            # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
            kwargs.update({
                'safe_attrs_only': True,
                'safe_attrs': safe_attrs,
            })
    else:
        kwargs['safe_attrs_only'] = False  # keep oe-data attributes + style
        # NOTE(review): the original statement ended with a stray comma, so the
        # value has always been the (truthy) tuple ``(False,)`` although the
        # intent was "do not remove frames (embbed video in CMS blogs)".
        # The runtime value is kept on purpose: switching to a real ``False``
        # makes the sanitizer more permissive and needs a security sign-off.
        kwargs['frames'] = (False,)

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = clean.Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
        # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
        cleaned = cleaned.replace('%24', '$')
        cleaned = cleaned.replace('%7B', '{')
        cleaned = cleaned.replace('%7D', '}')
        cleaned = cleaned.replace('%20', ' ')
        cleaned = cleaned.replace('%5B', '[')
        cleaned = cleaned.replace('%5D', ']')
        # decode the mako delimiters that were html-escaped before cleaning
        cleaned = cleaned.replace('&lt;%', '<%')
        cleaned = cleaned.replace('%&gt;', '%>')
    except etree.ParserError as e:
        if 'empty' in str(e):
            return ""
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>Unknown error when sanitizing</p>'

    # this is ugly, but lxml/etree tostring want to put everything in a 'div' that breaks the editor -> remove that
    if cleaned.startswith('<div>') and cleaned.endswith('</div>'):
        cleaned = cleaned[5:-6]

    return cleaned
#----------------------------------------------------------
# HTML Cleaner
#----------------------------------------------------------
def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_options=None,
                     protect_sections=False):
    """ html_email_clean: clean the html by doing the following steps:

     - try to strip email quotes, by removing blockquotes or having some client-
       specific heuristics
     - try to strip signatures
     - shorten the html to a maximum number of characters if requested

    Some specific use case:

     - MsOffice: ``div.style = border-top:solid;`` delimits the beginning of
       a quote; detecting by finding WordSection1 or MsoNormal
     - Hotmail: ``hr.stopSpelling`` delimits the beginning of a quote; detect
       Hotmail by finding ``SkyDrivePlaceholder``

    :param string html: sanitized html; tags like html or head should not
                        be present in the html string. This method therefore
                        takes as input html code coming from a sanitized source.
    :param boolean remove: remove the html code that is unwanted; otherwise it
                           is only flagged and tagged
    :param boolean shorten: shorten the html; every exceeding content will
                            be flagged as to remove
    :param int max_length: if shortening, maximum number of characters before
                           shortening
    :param dict expand_options: options for the read more link when shortening
                                the content. The used keys are the following:

                                 - oe_expand_container_tag: tag of the
                                   container of the whole read more link
                                   (default: span)
                                 - oe_expand_container_class: class applied to the
                                   link container (default: oe_mail_expand)
                                 - oe_expand_container_content: content of the
                                   container (default: ...)
                                 - oe_expand_separator_node: optional separator, like
                                   adding ... <br /><br /> <a ...>read more</a> (default: void)
                                 - oe_expand_a_href: href of the read more link itself
                                   (default: #)
                                 - oe_expand_a_class: class applied to the <a> containing
                                   the link itself (default: oe_mail_expand)
                                 - oe_expand_a_content: content of the <a> (default: read more)

                                The formatted read more link is the following:
                                <cont_tag class="oe_expand_container_class">
                                    oe_expand_container_content
                                    if expand_options.get('oe_expand_separator_node'):
                                        <oe_expand_separator_node/>
                                    <a href="oe_expand_a_href" class="oe_expand_a_class">
                                        oe_expand_a_content
                                    </a>
                                </cont_tag>
    :param boolean protect_sections: when shortening, do not cut inside a
                                     <section>; truncate at the first parent
                                     that is outside of any section instead
    :return: the cleaned html; non-string or empty input is returned as is
    """
    def _replace_matching_regex(regex, source, replace=''):
        """ Replace all matching expressions in source by replace """
        if not source:
            return source
        # a callable replacement keeps ``replace`` literal (no backslash processing)
        return re.sub(regex, lambda match: replace, source)

    def _create_node(tag, text, tail=None, attrs=None):
        new_node = etree.Element(tag)
        new_node.text = text
        new_node.tail = tail
        for key, val in (attrs or {}).items():
            new_node.set(key, val)
        return new_node

    def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs=None):
        new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs)
        node.insert(index, new_node)
        return new_node

    def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs=None):
        """ Split the text of node around every match of regex: matching bits
        go into new child nodes carrying new_node_attrs, the text in between
        goes into plain child nodes. """
        text = node.text or ''
        if not re.search(regex, text):
            return

        cur_node = node
        node.text = ''
        idx, iteration = 0, 0
        for item in re.finditer(regex, text):
            if iteration == 0:
                cur_node.text = text[idx:item.start()]
            else:
                _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
            new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)

            cur_node = new_node
            idx = item.end()
            iteration += 1
        _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})

    def _truncate_node(node, position, simplify_whitespaces=True):
        """ Truncate a node text at a given position. This algorithm will shorten
        at the end of the word whose ending character exceeds position.

            :param bool simplify_whitespaces: whether to try to count all successive
                                              whitespaces as one character. This
                                              option should not be True when trying
                                              to keep 'pre' consistency.
        """
        if node.text is None:
            node.text = ''

        truncate_idx = -1
        if simplify_whitespaces:
            cur_char_nbr = 0
            search_from = 0
            for word in node.text.strip(' \t\r\n').split():
                # Locate this very occurrence of the word: searching from the
                # beginning of the text would cut too early when the same word
                # appears several times.
                truncate_idx = node.text.find(word, search_from) + len(word)
                search_from = truncate_idx
                cur_char_nbr += len(word)
                if cur_char_nbr >= position:
                    break
        else:
            truncate_idx = position
        if truncate_idx == -1 or truncate_idx > len(node.text):
            truncate_idx = len(node.text)

        # compose new text bits
        innertext = node.text[0:truncate_idx]
        outertext = node.text[truncate_idx:]
        node.text = innertext

        # create <span> ... <a href="#">read more</a></span> node
        read_more_node = _create_node(
            expand_options.get('oe_expand_container_tag', 'span'),
            expand_options.get('oe_expand_container_content', ' ... '),
            None,
            {'class': expand_options.get('oe_expand_container_class', 'oe_mail_expand')}
        )
        if expand_options.get('oe_expand_separator_node'):
            read_more_separator_node = _create_node(
                expand_options.get('oe_expand_separator_node'),
                '',
                None,
                {}
            )
            read_more_node.append(read_more_separator_node)
        read_more_link_node = _create_node(
            'a',
            expand_options.get('oe_expand_a_content', 'read more'),
            None,
            {
                'href': expand_options.get('oe_expand_a_href', '#'),
                'class': expand_options.get('oe_expand_a_class', 'oe_mail_expand'),
            }
        )
        read_more_node.append(read_more_link_node)
        # create outertext node
        overtext_node = _create_node('span', outertext)
        # tag node
        overtext_node.set('in_overlength', '1')
        # add newly created nodes in dom
        node.append(read_more_node)
        node.append(overtext_node)

    if expand_options is None:
        expand_options = {}

    if not html or not isinstance(html, basestring):
        return html
    html = ustr(html)

    # Pre processing
    # ------------------------------------------------------------
    # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'

    # html: remove encoding attribute inside tags
    doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
    html = doctype.sub(r"", html)

    # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
    br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
    html = _replace_matching_regex(br_div_tags, html, '<br />')

    # form a tree
    root = lxml.html.fromstring(html)
    if not len(root) and root.text is None and root.tail is None:
        html = '<div>%s</div>' % html
        root = lxml.html.fromstring(html)

    quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
    signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[\s\S]+)')
    for node in root.iter():
        # remove all tails and replace them by a span element, because managing text and tails is error-prone
        if node.tail:
            tail_node = _create_node('span', node.tail)
            node.tail = None
            node.addnext(tail_node)

        # form node and tag text-based quotes and signature
        _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
        _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})

    # Processing
    # ------------------------------------------------------------

    # tree: tag nodes
    # signature_begin = False  # try dynamic signature recognition
    quote_begin = False
    overlength = False
    overlength_section_id = None
    overlength_section_count = 0
    cur_char_nbr = 0
    for node in root.iter():
        # comments do not need processing
        # note: bug in node.get(value, default) for HtmlComments, default never returned
        if node.tag == etree.Comment:
            continue

        # root: try to tag the client used to write the html
        if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
            root.set('msoffice', '1')
        if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
            root.set('hotmail', '1')

        # protect sections by tagging section limits and blocks contained inside sections, using an increasing id to re-find them later
        if node.tag == 'section':
            overlength_section_count += 1
            node.set('section_closure', str(overlength_section_count))
        if node.getparent() is not None and (node.getparent().get('section_closure') or node.getparent().get('section_inner')):
            node.set('section_inner', str(overlength_section_count))

        # state of the parsing: flag quotes and tails to remove
        if quote_begin:
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        # state of the parsing: flag when being in over-length content, depending on section content if defined (only when having protect_sections)
        if overlength:
            if not overlength_section_id or int(node.get('section_inner', overlength_section_count + 1)) > overlength_section_count:
                node.set('in_overlength', '1')
                node.set('tail_remove', '1')

        # find quote in msoffice / hotmail / blockquote / text quote and signatures
        if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
            node.set('in_quote', '1')

        # shorten:
        # if protect section:
        #   1/ find the first parent not being inside a section
        #   2/ add the read more link
        # else:
        #   1/ truncate the text at the next available space
        #   2/ create a 'read more' node, next to current node
        #   3/ add the truncated text in a new node, next to 'read more' node
        node_text = (node.text or '').strip().strip('\n').strip()
        if shorten and not overlength and cur_char_nbr + len(node_text) > max_length:
            node_to_truncate = node
            while node_to_truncate.getparent() is not None:
                if node_to_truncate.get('in_quote'):
                    node_to_truncate = node_to_truncate.getparent()
                elif protect_sections and (node_to_truncate.getparent().get('section_inner') or node_to_truncate.getparent().get('section_closure')):
                    node_to_truncate = node_to_truncate.getparent()
                    overlength_section_id = node_to_truncate.get('section_closure')
                else:
                    break

            overlength = True
            node_to_truncate.set('truncate', '1')
            if node_to_truncate == node:
                node_to_truncate.set('truncate_position', str(max_length - cur_char_nbr))
            else:
                node_to_truncate.set('truncate_position', str(len(node.text or '')))
        cur_char_nbr += len(node_text)

    # Tree modification
    # ------------------------------------------------------------

    for node in root.iter():
        if node.get('truncate'):
            _truncate_node(node, int(node.get('truncate_position', '0')), node.tag != 'pre')

    # Post processing
    # ------------------------------------------------------------

    to_remove = []
    for node in root.iter():
        if node.get('in_quote') or node.get('in_overlength'):
            # copy the node tail into parent text
            if node.tail and not node.get('tail_remove'):
                parent = node.getparent()
                parent.tail = node.tail + (parent.tail or '')
            to_remove.append(node)
        if node.get('tail_remove'):
            node.tail = ''
        # clean node: drop the working attributes set during processing
        for attribute_name in ['in_quote', 'tail_remove', 'in_overlength', 'msoffice', 'hotmail', 'truncate', 'truncate_position']:
            node.attrib.pop(attribute_name, None)
    for node in to_remove:
        if remove:
            node.getparent().remove(node)
        elif expand_options.get('oe_expand_a_class', 'oe_mail_expand') not in node.get('class', ''):  # trick: read more link should be displayed even if it's in overlength
            node_class = node.get('class', '') + ' oe_mail_cleaned'
            node.set('class', node_class)

    # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
    html = etree.tostring(root, pretty_print=False)
    linebreaks = re.compile(r'<span[^>]*>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
    html = _replace_matching_regex(linebreaks, html, '\n')

    return html
458 #----------------------------------------------------------
459 # HTML/Text management
460 #----------------------------------------------------------
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.

    Links are replaced by their text followed by a ``[n]`` marker, and the
    matching urls are listed at the end of the returned text.

    :param html: html source to convert
    :param body_id: value compared to the ``id`` attribute to find the root
        of the content; when None, ``<body>`` is used
    :param str encoding: encoding used to serialize the tree
    :return: the plain text version of ``html``
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <peter@fry-it.com>
    ## download here: http://www.peterbe.com/plog/html2plaintext

    html = ustr(html)
    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        # NOTE(review): body_id is interpolated unquoted into the XPath, so a
        # plain string id only matches if the caller passes it already quoted
        # (e.g. "'content'"). Left unchanged for backward compatibility.
        source = tree.xpath('//*[@id=%s]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            url_index.append(url)
            # links without text (e.g. wrapping an image) must not print "None"
            link.text = '%s [%s]' % (link.text or '', len(url_index))

    html = ustr(etree.tostring(tree, encoding=encoding))
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub(r'<br\s*/?>', '\n', html)
    html = re.sub(r'<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')
    # decode the entities produced by the serializer; '&amp;' goes last so
    # that '&amp;lt;' ends up as the literal text '&lt;'
    html = html.replace('&gt;', '>')
    html = html.replace('&lt;', '<')
    html = html.replace('&amp;', '&')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    if url_index:
        html += '\n\n'
    for position, url in enumerate(url_index):
        html += ustr('[%s] %s\n') % (position + 1, url)

    return html
def plaintext2html(text, container_tag=False):
    """ Convert plaintext into html. Content of the text is escaped to manage
        html entities, using cgi.escape().
        - all \\n,\\r are replaced by <br />
        - enclose content into <p>
        - 2 or more consecutive <br /> are considered as paragraph breaks

        :param text: plaintext to convert
        :param string container_tag: tag wrapped around the html; by default
            (falsy value) the content is not wrapped
        :return: the html, as returned by ``ustr``
    """
    text = cgi.escape(ustr(text))

    # 1. replace \n and \r
    text = text.replace('\n', '<br/>')
    text = text.replace('\r', '<br/>')

    # 2-3: form paragraphs: every run of 2+ <br/> closes the current
    # paragraph and opens a new one
    br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
    final = '<p>' + br_tags.sub('</p><p>', text) + '</p>'

    # 4. container
    if container_tag:
        final = '<%s>%s</%s>' % (container_tag, final, container_tag)
    return ustr(final)
def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
    """ Append extra content at the end of an HTML snippet, trying
        to locate the end of the HTML document (</body>, </html>, or
        EOF), and converting the provided content in html unless ``plaintext``
        is False.
        Content conversion can be done in two ways:
        - wrapping it into a pre (preserve=True)
        - use plaintext2html (preserve=False, using container_tag to wrap the
            whole content)
        A side-effect of this method is to coerce all HTML tags to
        lowercase in ``html``, and strip enclosing <html> or <body> tags in
        content if ``plaintext`` is False.

        :param str html: html tagsoup (doesn't have to be XHTML)
        :param str content: extra content to append
        :param bool plaintext: whether content is plaintext and should
            be converted (or wrapped in a <pre/> tag, see ``preserve``).
        :param bool preserve: if content is plaintext, wrap it into a <pre>
            instead of converting it into html
        :param container_tag: forwarded to plaintext2html when
            ``preserve`` is False
        :return: ``html`` with the content inserted before </body>, else
            before </html>, else appended at the end
    """
    html = ustr(html)
    if plaintext and preserve:
        # NOTE(review): content is inserted without html-escaping here;
        # callers are expected to pass trusted or pre-escaped text.
        content = u'\n<pre>%s</pre>\n' % ustr(content)
    elif plaintext:
        content = '\n%s\n' % plaintext2html(content, container_tag)
    else:
        content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
        content = u'\n%s\n' % ustr(content)
    # Force all tags to lowercase
    html = re.sub(r'(</?)\W*(\w+)([ >])',
                  lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
    insert_location = html.find('</body>')
    if insert_location == -1:
        insert_location = html.find('</html>')
    if insert_location == -1:
        return '%s%s' % (html, content)
    return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
#----------------------------------------------------------
# Emails
#----------------------------------------------------------
# matches any email in a body of text
email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", re.VERBOSE)

# matches a string containing only one email
single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$""", re.VERBOSE)

# matches a bracketed number such as "[42]"; group(1) = the number
res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
# matches a "Set-<name>: <value>" line; group(1) = name ; group(2) = value
command_re = re.compile(r"^Set-([a-z]+) *: *(.+)$", re.I + re.UNICODE)

# Updated in 7.0 to match the model name as well
# Typical form of references is <timestamp-openerp-record_id-model_name@domain>
# group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
reference_re = re.compile(r"<.*-open(?:object|erp)-(\d+)(?:-([\w.]+))?.*@(.*)>", re.UNICODE)

# Bounce regex
# Typical form of bounce is bounce-128-crm.lead-34@domain
# group(1) = the mail ID; group(2) = the model (if any); group(3) = the record ID
bounce_re = re.compile(r"[\w]+-(\d+)-?([\w.]+)?-?(\d+)?", re.UNICODE)
def generate_tracking_message_id(res_id):
    """Returns a string that can be used in the Message-ID RFC822 header field

       Used to track the replies related to a given object thanks to the "In-Reply-To"
       or "References" fields that Mail User Agents will set.

       :param res_id: identifier embedded in the message id
       :return: ``<timestamp.random-openerp-res_id@hostname>``
    """
    try:
        rnd = random.SystemRandom().random()
    except NotImplementedError:
        # no OS-level randomness source available: use the default generator
        rnd = random.random()
    # keep only the 15 decimals, dropping the leading "0."
    rndstr = ("%.15f" % rnd)[2:]
    return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
               attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
               smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
    """Low-level function for sending an email (deprecated).

    :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
    :param email_from: A string used to fill the `From` header, if falsy,
                       config['email_from'] is used instead.  Also used for
                       the `Reply-To` header if `reply_to` is not provided
    :param email_to: a sequence of addresses to send the mail to.
    :param cr: database cursor; when missing, a cursor is opened on the
               database named by the ``dbname`` attribute of the current
               thread and closed before returning
    :param uid: user id used to send the email (1 when falsy)
    :return: the result of ``ir.mail_server.send_email()``, or False when
             building or sending the email raised an exception
    :raise Exception: if no cursor is given and none can be opened
    """

    # If not cr, get cr from current thread database
    local_cr = None
    if not cr:
        db_name = getattr(threading.current_thread(), 'dbname', None)
        if db_name:
            local_cr = cr = openerp.registry(db_name).cursor()
        else:
            raise Exception("No database cursor found, please pass one explicitly")

    # Send Email
    try:
        mail_server_pool = openerp.registry(cr.dbname)['ir.mail_server']
        # Pack Message into MIME Object
        email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
                                                 attachments, message_id, references, openobject_id, subtype, headers=headers)

        res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
                                          smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
                                          smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
    except Exception:
        # deliberate best-effort: delivery failures are logged, not raised
        _logger.exception("tools.email_send failed to deliver email")
        return False
    finally:
        # only close the cursor opened here, never the caller's one
        if local_cr:
            local_cr.close()
    return res
664 def email_split(text):
665 """ Return a list of the email addresses found in ``text`` """
668 return [addr[1] for addr in getaddresses([text])
669 # getaddresses() returns '' when email parsing fails, and
670 # sometimes returns emails without at least '@'. The '@'
671 # is strictly required in RFC2822's `addr-spec`.