1 # -*- coding: utf-8 -*-
2 ##############################################################################
4 # OpenERP, Open Source Business Applications
5 # Copyright (C) 2012-TODAY OpenERP S.A. (<http://openerp.com>).
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as
9 # published by the Free Software Foundation, either version 3 of the
10 # License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ##############################################################################
22 from lxml import etree
26 import lxml.html.clean as clean
32 from email.utils import getaddresses
35 from openerp.loglevels import ustr
37 _logger = logging.getLogger(__name__)
40 #----------------------------------------------------------
42 #----------------------------------------------------------
# Elements handed to the cleaner as ``kill_tags`` (see html_sanitize).
tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
# Elements handed to the cleaner as ``remove_tags`` (see html_sanitize).
tags_to_remove = ['html', 'body', 'font']

# allow new semantic HTML5 tags
allowed_tags = clean.defs.tags | frozenset('article section header footer hgroup nav aside figure main'.split() + [etree.Comment])
# Attributes kept when sanitizing in strict mode, on top of lxml's defaults.
# NOTE(review): 'style' is listed because html_sanitize's comments state that
# strict mode must keep inline styles -- confirm against the upstream list.
safe_attrs = clean.defs.safe_attrs | frozenset(
    ['style',
     'data-oe-model', 'data-oe-id', 'data-oe-field', 'data-oe-type', 'data-oe-expression', 'data-oe-translate', 'data-oe-nodeid',
     'data-snippet-id', 'data-publish', 'data-id', 'data-res_id', 'data-member_id', 'data-view-id'
     ])
def html_sanitize(src, silent=True, strict=False):
    """ Sanitize an HTML fragment with ``lxml.html.clean.Cleaner``.

    Email-like pseudo tags (``<user@example.com>``) and mako delimiters
    (``<%`` / ``%>``) are HTML-escaped before cleaning so that the cleaner
    does not strip them; the mako delimiters and a few URL-quoted characters
    are restored afterwards.

    :param src: HTML source; a falsy value is returned unchanged
    :param bool silent: when True a failure is logged and a placeholder
                        paragraph is returned; when False it is re-raised
    :param bool strict: when True (and lxml >= 3.1.0) only the attributes
                        listed in ``safe_attrs`` are kept
    :return: the sanitized HTML
    """
    if not src:
        return src
    src = ustr(src, errors='replace')

    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)
    # html encode mako tags <% ... %> to decode them later and keep them alive, otherwise they are stripped by the cleaner
    src = src.replace('<%', cgi.escape('<%'))
    src = src.replace('%>', cgi.escape('%>'))

    kwargs = {
        'page_structure': True,
        'style': False,             # do not remove style attributes
        'forms': True,              # remove form tags
        'remove_unknown_tags': False,
        'allow_tags': allowed_tags,
        'comments': False,
        'processing_instructions': False
    }
    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        kwargs.update({
            'kill_tags': tags_to_kill,
            'remove_tags': tags_to_remove,
        })
    else:
        kwargs['remove_tags'] = tags_to_kill + tags_to_remove

    if strict:
        if etree.LXML_VERSION >= (3, 1, 0):
            # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
            kwargs.update({
                'safe_attrs_only': True,
                'safe_attrs': safe_attrs,
            })
    else:
        kwargs['safe_attrs_only'] = False  # keep oe-data attributes + style
        # do not remove frames (embedded video in CMS blogs). This must be a
        # plain boolean: a trailing comma would store the truthy tuple
        # (False,) and enable frame removal instead of disabling it.
        kwargs['frames'] = False

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = clean.Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
        # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
        for quoted, plain in (('%24', '$'), ('%7B', '{'), ('%7D', '}'),
                              ('%20', ' '), ('%5B', '['), ('%5D', ']')):
            cleaned = cleaned.replace(quoted, plain)
        # restore the mako delimiters that were html-escaped before cleaning
        cleaned = cleaned.replace('&lt;%', '<%')
        cleaned = cleaned.replace('%&gt;', '%>')
    except etree.ParserError as e:
        # an empty document is not an error: there is simply nothing to keep
        if 'empty' in str(e):
            return ""
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>Unknown error when sanitizing</p>'

    # this is ugly, but lxml/etree tostring want to put everything in a 'div' that breaks the editor -> remove that
    if cleaned.startswith('<div>') and cleaned.endswith('</div>'):
        cleaned = cleaned[5:-6]

    return cleaned
132 #----------------------------------------------------------
134 #----------------------------------------------------------
def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_options=None,
                     protect_sections=False):
    """ html_email_clean: clean the html by doing the following steps:

     - try to strip email quotes, by removing blockquotes or having some client-
       specific heuristics
     - try to strip signatures
     - shorten the html to a maximum number of characters if requested

    Some specific use case:

     - MsOffice: ``div.style = border-top:solid;`` delimitates the beginning of
       a quote; detected by finding WordSection1 or MsoNormal
     - Hotmail: ``hr.stopSpelling`` delimitates the beginning of a quote; detect
       Hotmail by finding ``SkyDrivePlaceholder``

    :param string html: sanitized html; tags like html or head should not
                        be present in the html string. This method therefore
                        takes as input html code coming from a sanitized source,
                        like fields.html.
    :param boolean remove: remove the html code that is unwanted; otherwise it
                           is only flagged with the ``oe_mail_cleaned`` class
    :param boolean shorten: shorten the html; every excess content will
                            be flagged as to remove
    :param int max_length: if shortening, maximum number of characters before
                           shortening
    :param dict expand_options: options for the read more link when shortening
                                the content. The used keys are the following:

                                 - oe_expand_container_tag: tag of the
                                   container of the whole read more link
                                   (default: span)
                                 - oe_expand_container_class: class applied to the
                                   link container (default: oe_mail_expand)
                                 - oe_expand_container_content: content of the
                                   container (default: ...)
                                 - oe_expand_separator_node: optional separator, like
                                   adding ... <br /><br /> <a ...>read more</a> (default: void)
                                 - oe_expand_a_href: href of the read more link itself
                                   (default: #)
                                 - oe_expand_a_class: class applied to the <a> containing
                                   the link itself (default: oe_mail_expand)
                                 - oe_expand_a_content: content of the <a> (default: read more)

                                The formatted read more link is the following:
                                <cont_tag class="oe_expand_container_class">
                                    oe_expand_container_content
                                    if expand_options.get('oe_expand_separator_node'):
                                        <oe_expand_separator_node/>
                                    <a href="oe_expand_a_href" class="oe_expand_a_class">
                                        oe_expand_a_content
                                    </a>
                                </cont_tag>
    :param boolean protect_sections: when shortening, do not cut inside a
                                     <section>; truncate its outermost
                                     enclosing node instead
    :return: the cleaned html, or ``html`` unchanged when it is empty or not
             a string
    """
    def _replace_matching_regex(regex, source, replace=''):
        """ Replace all matching expressions in source by replace """
        if not source:
            return source
        parts = []
        idx = 0
        for item in re.finditer(regex, source):
            parts.append(source[idx:item.start()])
            parts.append(replace)
            idx = item.end()
        parts.append(source[idx:])
        return ''.join(parts)

    def _create_node(tag, text, tail=None, attrs=None):
        """ Build a detached element with the given text, tail and attributes """
        new_node = etree.Element(tag)
        new_node.text = text
        new_node.tail = tail
        for key, val in (attrs or {}).items():
            new_node.set(key, val)
        return new_node

    def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs=None):
        """ Create a node and insert it at ``index`` among the children of ``node`` """
        new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs)
        node.insert(index, new_node)
        return new_node

    def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs=None):
        """ Split ``node.text`` around every match of ``regex``: each match is
        moved into a new child carrying ``new_node_attrs``, the text between
        matches into plain children. """
        text = node.text or ''
        if not re.search(regex, text):
            return

        cur_node = node
        node.text = ''
        idx, iteration = 0, 0
        for item in re.finditer(regex, text):
            if iteration == 0:
                cur_node.text = text[idx:item.start()]
            else:
                _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
            new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)

            cur_node = new_node
            idx = item.end()
            iteration += 1
        _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})

    def _truncate_node(node, position, simplify_whitespaces=True):
        """ Truncate a node text at a given position. This algorithm will shorten
        at the end of the word whose ending character exceeds position.

            :param bool simplify_whitespaces: whether to try to count all successive
                                              whitespaces as one character. This
                                              option should not be True when trying
                                              to keep 'pre' consistency.
        """
        if node.text is None:
            node.text = ''

        truncate_idx = -1
        if simplify_whitespaces:
            cur_char_nbr = 0
            search_from = 0
            for word in node.text.split():
                # locate this very occurrence of the word: searching from the
                # start of the text would cut at an earlier duplicate
                search_from = node.text.find(word, search_from) + len(word)
                truncate_idx = search_from
                cur_char_nbr += len(word)
                if cur_char_nbr >= position:
                    break
        else:
            truncate_idx = position
        if truncate_idx == -1 or truncate_idx > len(node.text):
            truncate_idx = len(node.text)

        # compose new text bits
        innertext = node.text[0:truncate_idx]
        outertext = node.text[truncate_idx:]
        node.text = innertext

        # create <span> ... <a href="#">read more</a></span> node
        read_more_node = _create_node(
            expand_options.get('oe_expand_container_tag', 'span'),
            expand_options.get('oe_expand_container_content', ' ... '),
            None,
            {'class': expand_options.get('oe_expand_container_class', 'oe_mail_expand')}
        )
        if expand_options.get('oe_expand_separator_node'):
            read_more_separator_node = _create_node(
                expand_options.get('oe_expand_separator_node'),
                '',
                None,
                {}
            )
            read_more_node.append(read_more_separator_node)
        read_more_link_node = _create_node(
            'a',
            expand_options.get('oe_expand_a_content', 'read more'),
            None,
            {
                'href': expand_options.get('oe_expand_a_href', '#'),
                'class': expand_options.get('oe_expand_a_class', 'oe_mail_expand'),
            }
        )
        read_more_node.append(read_more_link_node)
        # create outertext node
        overtext_node = _create_node('span', outertext)
        # tag node
        overtext_node.set('in_overlength', '1')
        # add newly created nodes in dom
        node.append(read_more_node)
        node.append(overtext_node)

    if expand_options is None:
        expand_options = {}

    if not html or not isinstance(html, basestring):
        return html
    html = ustr(html)

    # Pre processing
    # ------------------------------------------------------------
    # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'

    # html: remove encoding attribute inside tags
    doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
    html = doctype.sub(r"", html)

    # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
    br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
    html = _replace_matching_regex(br_div_tags, html, '<br />')

    # form a tree
    root = lxml.html.fromstring(html)
    if not len(root) and root.text is None and root.tail is None:
        html = '<div>%s</div>' % html
        root = lxml.html.fromstring(html)

    quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
    signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[\s\S]+)')
    for node in root.iter():
        # remove all tails and replace them by a span element, because
        # managing both text and tails is error-prone
        if node.tail:
            tail_node = _create_node('span', node.tail)
            node.tail = None
            node.addnext(tail_node)

        # form node and tag text-based quotes and signature
        _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
        _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})

    # Processing
    # ------------------------------------------------------------

    # tree: tag nodes
    # signature_begin = False  # try dynamic signature recognition
    quote_begin = False
    overlength = False
    overlength_section_id = None
    overlength_section_count = 0
    cur_char_nbr = 0
    for node in root.iter():
        # comments do not need processing
        # note: bug in node.get(value, default) for HtmlComments, default never returned
        if node.tag == etree.Comment:
            continue

        # root: try to tag the client used to write the html
        if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
            root.set('msoffice', '1')
        if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
            root.set('hotmail', '1')

        # protect sections by tagging section limits and blocks contained inside sections, using an increasing id to re-find them later
        if node.tag == 'section':
            overlength_section_count += 1
            node.set('section_closure', str(overlength_section_count))
        if node.getparent() is not None and (node.getparent().get('section_closure') or node.getparent().get('section_inner')):
            node.set('section_inner', str(overlength_section_count))

        # state of the parsing: flag quotes and tails to remove
        if quote_begin:
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        # state of the parsing: flag when being in over-length content, depending on section content if defined (only when having protect_sections)
        if overlength:
            if not overlength_section_id or int(node.get('section_inner', overlength_section_count + 1)) > overlength_section_count:
                node.set('in_overlength', '1')
                node.set('tail_remove', '1')

        # find quote in msoffice / hotmail / blockquote / text quote and signatures
        if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
            # here no quote_begin because we want to be able to remove some quoted
            # text without removing all the remaining context
            node.set('in_quote', '1')
        if node.getparent() is not None and node.getparent().get('in_quote'):
            # inside a block of removed text but not in quote_begin (see above)
            node.set('in_quote', '1')

        # shorten:
        # if protect section:
        #   1/ find the first parent not being inside a section
        #   2/ add the read more link
        # else:
        #   1/ truncate the text at the next available space
        #   2/ create a 'read more' node, next to current node
        #   3/ add the truncated text in a new node, next to 'read more' node
        node_text = (node.text or '').strip().strip('\n').strip()
        if shorten and not overlength and cur_char_nbr + len(node_text) > max_length:
            node_to_truncate = node
            while node_to_truncate.getparent() is not None:
                if node_to_truncate.get('in_quote'):
                    node_to_truncate = node_to_truncate.getparent()
                elif protect_sections and (node_to_truncate.getparent().get('section_inner') or node_to_truncate.getparent().get('section_closure')):
                    node_to_truncate = node_to_truncate.getparent()
                    overlength_section_id = node_to_truncate.get('section_closure')
                else:
                    break

            overlength = True
            node_to_truncate.set('truncate', '1')
            if node_to_truncate == node:
                node_to_truncate.set('truncate_position', str(max_length - cur_char_nbr))
            else:
                node_to_truncate.set('truncate_position', str(len(node.text or '')))
        cur_char_nbr += len(node_text)

    # Tree modification
    # ------------------------------------------------------------

    for node in root.iter():
        if node.get('truncate'):
            _truncate_node(node, int(node.get('truncate_position', '0')), node.tag != 'pre')

    # Post processing
    # ------------------------------------------------------------

    to_remove = []
    for node in root.iter():
        if node.get('in_quote') or node.get('in_overlength'):
            # copy the node tail into parent text
            if node.tail and not node.get('tail_remove'):
                parent = node.getparent()
                parent.tail = node.tail + (parent.tail or '')
            to_remove.append(node)
        if node.get('tail_remove'):
            node.tail = ''
        # clean node: drop the working attributes set during processing
        for attribute_name in ['in_quote', 'tail_remove', 'in_overlength', 'msoffice', 'hotmail', 'truncate', 'truncate_position']:
            node.attrib.pop(attribute_name, None)
    for node in to_remove:
        if remove:
            node.getparent().remove(node)
        else:
            if not expand_options.get('oe_expand_a_class', 'oe_mail_expand') in node.get('class', ''):  # trick: read more link should be displayed even if it's in overlength
                node_class = node.get('class', '') + ' oe_mail_cleaned'
                node.set('class', node_class)

    # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
    html = etree.tostring(root, pretty_print=False)
    linebreaks = re.compile(r'<span[^>]*>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
    html = _replace_matching_regex(linebreaks, html, '\n')

    return html
463 #----------------------------------------------------------
464 # HTML/Text management
465 #----------------------------------------------------------
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.

    Links are replaced by their text followed by ``[n]``; the matching urls
    are listed as ``[n] url`` at the end of the returned text.

    :param html: html source to convert
    :param body_id: optional ``id`` attribute of the element to convert
    :param str encoding: encoding used to serialize the tree
    :return: the plain text version of the html
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <peter@fry-it.com>
    ## download here: http://www.peterbe.com/plog/html2plaintext

    html = ustr(html)
    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        # pass the id as an XPath variable: interpolating it unquoted makes
        # XPath compare @id with a child element of that name, never the value
        source = tree.xpath('//*[@id=$body_id]', body_id=body_id)
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            # a link without direct text (e.g. wrapping an image) has text None
            link.text = '%s [%s]' % (link.text or '', i)
            url_index.append(url)

    html = ustr(etree.tostring(tree, encoding=encoding))
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub(r'<br\s*/?>', '\n', html)
    html = re.sub(r'<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')
    # decode the basic entities; '&amp;' goes last so that an escaped entity
    # such as '&amp;lt;' is not decoded twice
    html = html.replace('&gt;', '>')
    html = html.replace('&lt;', '<')
    html = html.replace('&amp;', '&')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (i + 1, url)

    return html
def plaintext2html(text, container_tag=False):
    """ Convert plaintext into html. Content of the text is escaped to manage
        html entities, using cgi.escape().
        - every line break (\\r\\n, \\r or \\n) is replaced by <br/>
        - enclose content into <p>
        - 2 or more consecutive <br /> are considered as paragraph breaks

        :param string container_tag: tag used to wrap the whole html; when
            falsy (the default) no container is added
        :return: the html as a unicode string
    """
    text = cgi.escape(ustr(text))

    # 1. replace line breaks; '\r\n' counts as ONE break, otherwise every
    # Windows-style newline would be seen as a paragraph break below
    text = re.sub(r'\r\n|\r|\n', '<br/>', text)

    # 2-3: form paragraphs
    idx = 0
    final = '<p>'
    br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
    for item in re.finditer(br_tags, text):
        final += text[idx:item.start()] + '</p><p>'
        idx = item.end()
    final += text[idx:] + '</p>'

    # 4. container
    if container_tag:
        final = '<%s>%s</%s>' % (container_tag, final, container_tag)
    return ustr(final)
def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
    """ Append extra content at the end of an HTML snippet, trying
        to locate the end of the HTML document (</body>, </html>, or
        EOF), and converting the provided content in html unless ``plaintext``
        is False.
        Content conversion can be done in two ways:
        - wrapping it into a pre (preserve=True)
        - use plaintext2html (preserve=False, using container_tag to wrap the
            whole content)
        A side-effect of this method is to coerce all HTML tags to
        lowercase in ``html``, and strip enclosing <html> or <body> tags in
        content if ``plaintext`` is False.

        :param str html: html tagsoup (doesn't have to be XHTML)
        :param str content: extra content to append
        :param bool plaintext: whether content is plaintext and must be
            converted to html before being appended
        :param bool preserve: if content is plaintext, wrap it into a <pre>
            instead of converting it into html
        :param container_tag: passed to plaintext2html when it is used
        :return: ``html`` with the content inserted before </body>, else
            before </html>, else appended at the end
    """
    html = ustr(html)
    if plaintext and preserve:
        # plaintext is not markup: escape it so that '<', '>' and '&' are
        # displayed literally inside the <pre> instead of being parsed
        content = u'\n<pre>%s</pre>\n' % cgi.escape(ustr(content))
    elif plaintext:
        content = '\n%s\n' % plaintext2html(content, container_tag)
    else:
        # '[^>]*' stops at the end of the tag; a greedy '.*' would also drop
        # everything that follows the tag on the same line
        content = re.sub(r'(?i)(</?html[^>]*>|</?body[^>]*>|<!\W*DOCTYPE[^>]*>)', '', content)
        content = u'\n%s\n' % ustr(content)
    # Force all tags to lowercase
    html = re.sub(r'(</?)\W*(\w+)([ >])',
                  lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
    insert_location = html.find('</body>')
    if insert_location == -1:
        insert_location = html.find('</html>')
    if insert_location == -1:
        return '%s%s' % (html, content)
    return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
594 #----------------------------------------------------------
596 #----------------------------------------------------------
# matches any email in a body of text (top-level domain limited to 2-6 letters)
email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", re.VERBOSE)

# matches a string containing only one email
single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$""", re.VERBOSE)

# matches a bracketed number such as "[42]"; group(1) = the number
res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
# matches a "Set-<name>: <value>" line; group(1) = name ; group(2) = value
command_re = re.compile(r"^Set-([a-z]+) *: *(.+)$", re.I | re.UNICODE)

# Updated in 7.0 to match the model name as well
# Typical form of references is <timestamp-openerp-record_id-model_name@domain>
# group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
reference_re = re.compile(r"<.*-open(?:object|erp)-(\d+)(?:-([\w.]+))?[^>]*@([^>]*)>", re.UNICODE)
def generate_tracking_message_id(res_id):
    """Returns a string that can be used in the Message-ID RFC822 header field

       Used to track the replies related to a given object thanks to the "In-Reply-To"
       or "References" fields that Mail User Agents will set.

       :param res_id: identifier embedded in the generated message id
       :return: ``<timestamp.random-openerp-res_id@hostname>``
    """
    try:
        rnd = random.SystemRandom().random()
    except NotImplementedError:
        # no OS-level randomness source: fall back on the default generator
        rnd = random.random()
    # keep only the 15 decimals of the random number ("0." is dropped)
    rndstr = ("%.15f" % rnd)[2:]
    return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
               attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
               smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
    """Low-level function for sending an email (deprecated).

    :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
    :param email_from: A string used to fill the `From` header, if falsy,
                       config['email_from'] is used instead.  Also used for
                       the `Reply-To` header if `reply_to` is not provided
    :param email_to: a sequence of addresses to send the mail to.
    :param cr: database cursor; when missing, one is opened on the database
               named by the ``dbname`` attribute of the current thread and
               closed before returning
    :param uid: user id used to send the email (1 when falsy)
    :return: the result of ``ir.mail_server.send_email()``, or False when
             building or sending the email failed
    :raise Exception: when no cursor is given and none can be opened
    """

    # If not cr, get cr from current thread database
    local_cr = None
    if not cr:
        db_name = getattr(threading.current_thread(), 'dbname', None)
        if db_name:
            local_cr = cr = openerp.registry(db_name).cursor()
        else:
            raise Exception("No database cursor found, please pass one explicitly")

    # Send Email
    try:
        mail_server_pool = openerp.registry(cr.dbname)['ir.mail_server']
        # Pack Message into MIME Object
        email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
                                                 attachments, message_id, references, openobject_id, subtype, headers=headers)

        res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
                                          smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
                                          smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
    except Exception:
        # best effort: the failure is logged and reported as False
        _logger.exception("tools.email_send failed to deliver email")
        return False
    finally:
        # only close the cursor opened here, never the caller's one
        if local_cr:
            cr.close()
    return res
666 def email_split(text):
667 """ Return a list of the email addresses found in ``text`` """
670 return [addr[1] for addr in getaddresses([text])
671 # getaddresses() returns '' when email parsing fails, and
672 # sometimes returns emails without at least '@'. The '@'
673 # is strictly required in RFC2822's `addr-spec`.