1 # -*- coding: utf-8 -*-
2 ##############################################################################
4 # OpenERP, Open Source Business Applications
5 # Copyright (C) 2012-TODAY OpenERP S.A. (<http://openerp.com>).
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as
9 # published by the Free Software Foundation, either version 3 of the
10 # License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ##############################################################################
import cgi
import logging
import random
import re
import socket
import threading
import time
import xml.sax.saxutils
from email.utils import getaddresses

import lxml.html
import lxml.html.clean as clean
from lxml import etree

import openerp
from openerp.loglevels import ustr

_logger = logging.getLogger(__name__)
41 #----------------------------------------------------------
43 #----------------------------------------------------------
# Tags handed to the lxml Cleaner as ``kill_tags`` by html_sanitize().
tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
# Tags handed to the lxml Cleaner as ``remove_tags`` by html_sanitize().
tags_to_remove = ['html', 'body', 'font']

# allow new semantic HTML5 tags
allowed_tags = clean.defs.tags | frozenset('article section header footer hgroup nav aside figure main'.split())
# Attributes kept when sanitizing in strict mode: lxml's defaults, the
# ``style`` attribute and the data-* attributes used by the editor.
safe_attrs = clean.defs.safe_attrs | frozenset([
    'style',
    'data-oe-model', 'data-oe-id', 'data-oe-field', 'data-oe-type', 'data-oe-expression', 'data-oe-translate', 'data-oe-nodeid',
    'data-snippet-id', 'data-publish', 'data-id', 'data-res_id', 'data-member_id', 'data-view-id',
])
def html_sanitize(src, silent=True, strict=False):
    """ Sanitize an html fragment using ``lxml.html.clean.Cleaner``.

    :param src: html to sanitize; a falsy value is returned unchanged
    :param bool silent: if True, errors raised while cleaning are logged and
        a placeholder paragraph is returned; if False they are re-raised
    :param bool strict: if True (and lxml >= 3.1.0), keep only the attributes
        listed in ``safe_attrs``; if False, keep every attribute
    :return: the sanitized html; ``""`` when lxml raises a ParserError whose
        message contains 'empty'
    """
    if not src:
        return src
    src = ustr(src, errors='replace')

    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)

    kwargs = {
        'page_structure': True,
        'style': False,             # do not remove style attributes
        'forms': True,              # remove form tags
        'remove_unknown_tags': False,
        'allow_tags': allowed_tags,
    }
    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        kwargs.update({
            'kill_tags': tags_to_kill,
            'remove_tags': tags_to_remove,
        })
    else:
        kwargs['remove_tags'] = tags_to_kill + tags_to_remove

    if strict:
        if etree.LXML_VERSION >= (3, 1, 0):
            # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
            kwargs.update({
                'safe_attrs_only': True,
                'safe_attrs': safe_attrs,
            })
    else:
        kwargs['safe_attrs_only'] = False  # keep oe-data attributes + style
        # do not remove frames (embed video in CMS blogs). This used to be
        # written ``False,`` i.e. the truthy tuple (False,), which enabled the
        # option instead of disabling it.
        kwargs['frames'] = False

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = clean.Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
        # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
        cleaned = xml.sax.saxutils.unescape(cleaned, {'%24': '$', '%7B': '{', '%7D': '}', '%20': ' '})
    except etree.ParserError as e:
        if 'empty' in str(e):
            return ""
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>Unknown error when sanitizing</p>'

    # this is ugly, but lxml/etree tostring want to put everything in a 'div' that breaks the editor -> remove that
    if cleaned.startswith('<div>') and cleaned.endswith('</div>'):
        cleaned = cleaned[5:-6]

    return cleaned
121 #----------------------------------------------------------
123 #----------------------------------------------------------
def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_options=None,
                     protect_sections=False):
    """ html_email_clean: clean the html by doing the following steps:

     - try to strip email quotes, by removing blockquotes or having some client-
       specific heuristics
     - try to strip signatures
     - shorten the html to a maximum number of characters if requested

    Some specific use case:

     - MsOffice: ``div.style = border-top:solid;`` delimits the beginning of
       a quote; detected by finding WordSection1 or MsoNormal
     - Hotmail: ``hr.stopSpelling`` delimits the beginning of a quote; detect
       Hotmail by finding ``SkyDrivePlaceholder``

    :param string html: sanitized html; tags like html or head should not
                        be present in the html string. This method therefore
                        takes as input html code coming from a sanitized source,
                        like fields.html.
    :param boolean remove: remove the html code that is unwanted; otherwise it
                           is only flagged and tagged
    :param boolean shorten: shorten the html; every exceeding content will
                            be flagged as to remove
    :param int max_length: if shortening, maximum number of characters before
                           shortening
    :param dict expand_options: options for the read more link when shortening
                                the content. The used keys are the following:

                                 - oe_expand_container_tag: tag of the
                                   container of the whole read more link
                                   (default: span)
                                 - oe_expand_container_class: class applied to the
                                   link container (default: oe_mail_expand)
                                 - oe_expand_container_content: content of the
                                   container (default: ...)
                                 - oe_expand_separator_node: optional separator, like
                                   adding ... <br /><br /> <a ...>read more</a> (default: void)
                                 - oe_expand_a_href: href of the read more link itself
                                   (default: #)
                                 - oe_expand_a_class: class applied to the <a> containing
                                   the link itself (default: oe_mail_expand)
                                 - oe_expand_a_content: content of the <a> (default: read more)

                                The formatted read more link is the following:
                                <cont_tag class="oe_expand_container_class">
                                    oe_expand_container_content
                                    if expand_options.get('oe_expand_separator_node'):
                                        <oe_expand_separator_node/>
                                    <a href="oe_expand_a_href" class="oe_expand_a_class">
                                        oe_expand_a_content
                                    </a>
                                </cont_tag>
    :param boolean protect_sections: when shortening, do not cut inside a
                                     <section>; the enclosing block is
                                     truncated instead
    :return: the cleaned html; a falsy or non-string ``html`` is returned as is
    """
    def _replace_matching_regex(regex, source, replace=''):
        """ Replace all matching expressions in source by replace """
        if not source:
            return source
        # a callable replacement keeps ``replace`` literal (no backslash or
        # group-reference expansion), like the previous manual concatenation
        return re.sub(regex, lambda match: replace, source)

    def _create_node(tag, text, tail=None, attrs=None):
        """ Build a detached element with the given text, tail and attributes """
        new_node = etree.Element(tag)
        new_node.text = text
        new_node.tail = tail
        for key, val in (attrs or {}).items():
            new_node.set(key, val)
        return new_node

    def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs=None):
        """ Create a node and insert it at ``index`` among the children of ``node`` """
        new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs)
        node.insert(index, new_node)
        return new_node

    def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs=None):
        """ Move every part of ``node.text`` matching ``regex`` into a new
        child node carrying ``new_node_attrs``; the text between matches is
        moved into plain child nodes. """
        text = node.text or ''
        if not re.search(regex, text):
            return

        cur_node = node
        node.text = ''
        idx, iteration = 0, 0
        for item in re.finditer(regex, text):
            if iteration == 0:
                cur_node.text = text[idx:item.start()]
            else:
                _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
            new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)

            cur_node = new_node
            idx = item.end()
            iteration += 1
        # NOTE(review): index -1 follows list semantics, i.e. the remaining
        # text lands before the current last child - confirm this is intended
        _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})

    def _truncate_node(node, position, simplify_whitespaces=True):
        """ Truncate a node text at a given position. This algorithm will shorten
        at the end of the word whose ending character exceeds position.

            :param bool simplify_whitespaces: whether to try to count all successive
                                              whitespaces as one character. This
                                              option should not be True when trying
                                              to keep 'pre' consistency.
        """
        if node.text is None:
            node.text = ''

        truncate_idx = -1
        if simplify_whitespaces:
            cur_char_nbr = 0
            # walk the words together with their offsets: looking the word up
            # again with find() would hit its first occurrence, possibly
            # inside another word, and cut the text far too early
            for word_match in re.finditer(r'\S+', node.text, re.UNICODE):
                cur_char_nbr += len(word_match.group())
                truncate_idx = word_match.end()
                if cur_char_nbr >= position:
                    break
        else:
            truncate_idx = position
        if truncate_idx == -1 or truncate_idx > len(node.text):
            truncate_idx = len(node.text)

        # compose new text bits
        innertext = node.text[0:truncate_idx]
        outertext = node.text[truncate_idx:]
        node.text = innertext

        # create <span> ... <a href="#">read more</a></span> node
        read_more_node = _create_node(
            expand_options.get('oe_expand_container_tag', 'span'),
            expand_options.get('oe_expand_container_content', ' ... '),
            None,
            {'class': expand_options.get('oe_expand_container_class', 'oe_mail_expand')}
        )
        if expand_options.get('oe_expand_separator_node'):
            read_more_separator_node = _create_node(
                expand_options.get('oe_expand_separator_node'),
                '',
                None,
                {}
            )
            read_more_node.append(read_more_separator_node)
        read_more_link_node = _create_node(
            'a',
            expand_options.get('oe_expand_a_content', 'read more'),
            None,
            {
                'href': expand_options.get('oe_expand_a_href', '#'),
                'class': expand_options.get('oe_expand_a_class', 'oe_mail_expand'),
            }
        )
        read_more_node.append(read_more_link_node)
        # create outertext node
        overtext_node = _create_node('span', outertext)
        # tag node
        overtext_node.set('in_overlength', '1')
        # add newly created nodes in dom
        node.append(read_more_node)
        node.append(overtext_node)

    if expand_options is None:
        expand_options = {}

    if not html or not isinstance(html, basestring):
        return html
    html = ustr(html)

    # Pre processing
    # ------------------------------------------------------------
    # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'

    # html: remove encoding attribute inside tags
    doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
    html = doctype.sub(r"", html)

    # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
    br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
    html = _replace_matching_regex(br_div_tags, html, '<br />')

    # form a tree
    root = lxml.html.fromstring(html)
    if not len(root) and root.text is None and root.tail is None:
        html = '<div>%s</div>' % html
        root = lxml.html.fromstring(html)

    quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
    signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[\s\S]+)')
    for node in root.iter():
        # remove all tails and replace them by a span element, because managing text and tails can be a pain
        if node.tail:
            tail_node = _create_node('span', node.tail)
            node.tail = None
            node.addnext(tail_node)

        # form node and tag text-based quotes and signature
        _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
        _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})

    # Processing
    # ------------------------------------------------------------

    # tree: tag nodes
    # signature_begin = False  # try dynamic signature recognition
    quote_begin = False
    overlength = False
    overlength_section_id = None
    overlength_section_count = 0
    cur_char_nbr = 0
    for node in root.iter():
        # root: try to tag the client used to write the html
        if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
            root.set('msoffice', '1')
        if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
            root.set('hotmail', '1')

        # protect sections by tagging section limits and blocks contained inside sections, using an increasing id to re-find them later
        if node.tag == 'section':
            overlength_section_count += 1
            node.set('section_closure', str(overlength_section_count))
        if node.getparent() is not None and (node.getparent().get('section_closure') or node.getparent().get('section_inner')):
            node.set('section_inner', str(overlength_section_count))

        # state of the parsing: flag quotes and tails to remove
        if quote_begin:
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        # state of the parsing: flag when being in over-length content, depending on section content if defined (only when having protect_sections)
        if overlength:
            if not overlength_section_id or int(node.get('section_inner', overlength_section_count + 1)) > overlength_section_count:
                node.set('in_overlength', '1')
                node.set('tail_remove', '1')

        # find quote in msoffice / hotmail / blockquote / text quote and signatures
        if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
            node.set('in_quote', '1')

        # shorten:
        # if protect section:
        #   1/ find the first parent not being inside a section
        #   2/ add the read more link
        # else:
        #   1/ truncate the text at the next available space
        #   2/ create a 'read more' node, next to current node
        #   3/ add the truncated text in a new node, next to 'read more' node
        node_text = (node.text or '').strip().strip('\n').strip()
        if shorten and not overlength and cur_char_nbr + len(node_text) > max_length:
            node_to_truncate = node
            while node_to_truncate.getparent() is not None:
                if node_to_truncate.get('in_quote'):
                    node_to_truncate = node_to_truncate.getparent()
                elif protect_sections and (node_to_truncate.getparent().get('section_inner') or node_to_truncate.getparent().get('section_closure')):
                    node_to_truncate = node_to_truncate.getparent()
                    overlength_section_id = node_to_truncate.get('section_closure')
                else:
                    break

            overlength = True
            node_to_truncate.set('truncate', '1')
            if node_to_truncate == node:
                node_to_truncate.set('truncate_position', str(max_length - cur_char_nbr))
            else:
                node_to_truncate.set('truncate_position', str(len(node.text or '')))
        cur_char_nbr += len(node_text)

    # Tree modification
    # ------------------------------------------------------------

    for node in root.iter():
        if node.get('truncate'):
            _truncate_node(node, int(node.get('truncate_position', '0')), node.tag != 'pre')

    # Post processing
    # ------------------------------------------------------------

    to_remove = []
    for node in root.iter():
        if node.get('in_quote') or node.get('in_overlength'):
            # copy the node tail into parent text
            if node.tail and not node.get('tail_remove'):
                parent = node.getparent()
                parent.tail = node.tail + (parent.tail or '')
            to_remove.append(node)
        if node.get('tail_remove'):
            node.tail = ''
        # clean node
        for attribute_name in ['in_quote', 'tail_remove', 'in_overlength', 'msoffice', 'hotmail', 'truncate', 'truncate_position']:
            node.attrib.pop(attribute_name, None)
    for node in to_remove:
        if remove:
            node.getparent().remove(node)
        else:
            if expand_options.get('oe_expand_a_class', 'oe_mail_expand') not in node.get('class', ''):  # trick: read more link should be displayed even if it's in overlength
                node_class = node.get('class', '') + ' oe_mail_cleaned'
                node.set('class', node_class)

    # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
    html = etree.tostring(root, pretty_print=False)
    linebreaks = re.compile(r'<span[^>]*>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
    html = _replace_matching_regex(linebreaks, html, '\n')

    return html
443 #----------------------------------------------------------
444 # HTML/Text management
445 #----------------------------------------------------------
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.

    :param html: html content to convert
    :param body_id: value of the ``id`` attribute of the element to use as
        body; when None, the <body> element is used if there is one
    :param str encoding: encoding used to serialize the tree
    :return: the plain text; links are replaced by ``text [n]`` and the
        urls are listed at the end as ``[n] url``
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <peter@fry-it.com>
    ## download here: http://www.peterbe.com/plog/html2plaintext

    html = ustr(html)
    if not html.strip():
        # the html parser refuses empty documents
        return ''
    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        # The id used to be interpolated unquoted into the expression, so
        # callers had to pass it pre-quoted: accept that form, then bind the
        # value as an XPath variable so it is always compared as a literal.
        if isinstance(body_id, basestring) and len(body_id) > 1 \
                and body_id[0] == body_id[-1] and body_id[0] in ('"', "'"):
            body_id = body_id[1:-1]
        source = tree.xpath('//*[@id=$body_id]', body_id=body_id)
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            url_index.append(url)
            link.tag = 'span'
            # link.text is None for links without leading text (e.g. images)
            link.text = '%s [%s]' % (link.text or '', len(url_index))

    html = ustr(etree.tostring(tree, encoding=encoding))
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub(r'<br\s*/?>', '\n', html)
    html = re.sub(r'<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    # append the list of urls referenced in the text
    if url_index:
        html += '\n\n'
    for position, url in enumerate(url_index):
        html += ustr('[%s] %s\n') % (position + 1, url)

    return html
def plaintext2html(text, container_tag=False):
    r""" Convert plaintext into html. Content of the text is escaped to manage
        html entities, using cgi.escape().
        - all \n,\r are replaced by <br /> (\r\n counts as one line break)
        - enclose content into <p>
        - 2 or more consecutive <br /> are considered as paragraph breaks

        :param text: plaintext to convert
        :param string container_tag: tag wrapping the whole html; by default
            (False) the paragraphs are not wrapped
        :return: the html, as unicode
    """
    text = cgi.escape(ustr(text))

    # 1. replace \n and \r; a Windows line ending is a single line break,
    # not two (which would turn every line into its own paragraph)
    text = text.replace('\r\n', '\n')
    text = text.replace('\n', '<br/>')
    text = text.replace('\r', '<br/>')

    # 2-3: form paragraphs
    br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
    final = '<p>' + br_tags.sub('</p><p>', text) + '</p>'

    # 4. container
    if container_tag:
        final = '<%s>%s</%s>' % (container_tag, final, container_tag)
    return ustr(final)
def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
    """ Append extra content at the end of an HTML snippet, trying
        to locate the end of the HTML document (</body>, </html>, or
        EOF), and converting the provided content in html unless ``plaintext``
        is False.
        Content conversion can be done in two ways:
        - wrapping it into a pre (preserve=True)
        - use plaintext2html (preserve=False, using container_tag to wrap the
            whole content)
        A side-effect of this method is to coerce all HTML tags to
        lowercase in ``html``, and strip enclosing <html> or <body> tags in
        content if ``plaintext`` is False.

        :param str html: html tagsoup (doesn't have to be XHTML)
        :param str content: extra content to append
        :param bool plaintext: whether content is plaintext and has to be
            converted (see ``preserve``); if False, content is html
        :param bool preserve: if content is plaintext, wrap it into a <pre>
            instead of converting it into html
        :param container_tag: container tag given to plaintext2html
        :return: ``html`` with the content inserted before </body> or
            </html>, or appended at the end when neither is found
    """
    html = ustr(html)
    if plaintext and preserve:
        # content is plain text: escape it, otherwise any markup it contains
        # would be injected as real html
        content = u'\n<pre>%s</pre>\n' % cgi.escape(ustr(content))
    elif plaintext:
        content = '\n%s\n' % plaintext2html(content, container_tag)
    else:
        # [^>]* stops at the end of the tag; a greedy .* would also eat the
        # actual content when the document is written on a single line
        content = re.sub(r'(?i)(</?html[^>]*>|</?body[^>]*>|<!\W*DOCTYPE[^>]*>)', '', content)
        content = u'\n%s\n' % ustr(content)
    # Force all tags to lowercase
    html = re.sub(r'(</?)\W*(\w+)([ >])',
                  lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
    insert_location = html.find('</body>')
    if insert_location == -1:
        insert_location = html.find('</html>')
    if insert_location == -1:
        return '%s%s' % (html, content)
    return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
571 #----------------------------------------------------------
573 #----------------------------------------------------------
# matches any email in a body of text
# (a top-level domain can be up to 63 characters long)
email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63})""", re.VERBOSE)

# matches a string containing only one email
single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63}$""", re.VERBOSE)

# matches a number between square brackets; group(1) = the number
res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
# matches a "Set-<name>: <value>" line; group(1) = the name ; group(2) = the value
command_re = re.compile("^Set-([a-z]+) *: *(.+)$", re.I | re.UNICODE)

# Updated in 7.0 to match the model name as well
# Typical form of references is <timestamp-openerp-record_id-model_name@domain>
# group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
reference_re = re.compile(r"<.*-open(?:object|erp)-(\d+)(?:-([\w.]+))?.*@(.*)>", re.UNICODE)

# Bounce regex
# Typical form of bounce is bounce-128-crm.lead-34@domain
# group(1) = the mail ID; group(2) = the model (if any); group(3) = the record ID
bounce_re = re.compile(r"[\w]+-(\d+)-?([\w.]+)?-?(\d+)?", re.UNICODE)
def generate_tracking_message_id(res_id):
    """Returns a string that can be used in the Message-ID RFC822 header field

       Used to track the replies related to a given object thanks to the "In-Reply-To"
       or "References" fields that Mail User Agents will set.

       :param res_id: identifier embedded in the message id
       :return: ``<timestamp.random-openerp-res_id@hostname>``
    """
    try:
        rnd = random.SystemRandom().random()
    except NotImplementedError:
        # no OS-level randomness source: fall back on the default generator
        rnd = random.random()
    # keep only the 15 decimals, dropping the leading "0."
    rndstr = ("%.15f" % rnd)[2:]
    return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
               attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
               smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
    """Low-level function for sending an email (deprecated).

    :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
    :param email_from: A string used to fill the `From` header, if falsy,
                       config['email_from'] is used instead.  Also used for
                       the `Reply-To` header if `reply_to` is not provided
    :param email_to: a sequence of addresses to send the mail to.
    :param cr: database cursor; when not given, a cursor is opened on the
               database named by the ``dbname`` attribute of the current
               thread and closed before returning
    :param uid: user id used to send the email; defaults to 1
    :return: the result of ``ir.mail_server.send_email()``, or False when
             building or sending the email raised an exception
    :raise Exception: when no cursor is given and the current thread has no
                      ``dbname``
    """

    # If not cr, get cr from current thread database
    local_cr = None
    if not cr:
        db_name = getattr(threading.current_thread(), 'dbname', None)
        if db_name:
            local_cr = cr = openerp.registry(db_name).db.cursor()
        else:
            raise Exception("No database cursor found, please pass one explicitly")

    # Send Email
    try:
        mail_server_pool = openerp.registry(cr.dbname)['ir.mail_server']
        res = False
        # Pack Message into MIME Object
        email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
                                                 attachments, message_id, references, openobject_id, subtype, headers=headers)

        res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
                                          smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
                                          smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
    except Exception:
        _logger.exception("tools.email_send failed to deliver email")
        return False
    finally:
        # only close the cursor opened here, never the caller's one
        if local_cr:
            cr.close()
    return res
647 def email_split(text):
648 """ Return a list of the email addresses found in ``text`` """
651 return [addr[1] for addr in getaddresses([text])
652 # getaddresses() returns '' when email parsing fails, and
653 # sometimes returns emails without at least '@'. The '@'
654 # is strictly required in RFC2822's `addr-spec`.