1 # -*- coding: utf-8 -*-
2 ##############################################################################
4 # OpenERP, Open Source Business Applications
5 # Copyright (C) 2012-TODAY OpenERP S.A. (<http://openerp.com>).
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as
9 # published by the Free Software Foundation, either version 3 of the
10 # License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ##############################################################################
22 from lxml import etree
26 import lxml.html.clean as clean
32 from email.utils import getaddresses
35 from openerp.loglevels import ustr
37 _logger = logging.getLogger(__name__)
40 #----------------------------------------------------------
42 #----------------------------------------------------------
# Tag names that html_sanitize hands to lxml's Cleaner as 'kill_tags'.
44 tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
# Tag names that html_sanitize hands to lxml's Cleaner as 'remove_tags'
# (on lxml older than 2.3.1 both lists are concatenated into 'remove_tags').
45 tags_to_remove = ['html', 'body', 'font']
47 # allow new semantic HTML5 tags
48 allowed_tags = clean.defs.tags | frozenset('article section header footer hgroup nav aside figure main'.split())
# Attribute whitelist: lxml's default safe attributes extended with the data-*
# attributes below; html_sanitize passes it as 'safe_attrs' when lxml >= 3.1.0.
# NOTE(review): the opening and closing of the literal given to frozenset() are
# not visible in this listing (embedded numbering skips 50 and 53) - confirm
# the full attribute list against the real file.
49 safe_attrs = clean.defs.safe_attrs | frozenset(
51 'data-oe-model', 'data-oe-id', 'data-oe-field', 'data-oe-type', 'data-oe-expression', 'data-oe-translate', 'data-oe-nodeid',
52 'data-snippet-id', 'data-publish', 'data-id', 'data-res_id', 'data-member_id', 'data-view-id'
# html_sanitize: clean an HTML fragment with lxml.html.clean.Cleaner.
#
# Visible steps: coerce `src` to unicode (undecodable bytes replaced), html-escape
# "<...@...>" sequences so that email addresses in angle brackets are not parsed
# as tags, build the Cleaner keyword arguments from the module-level tag and
# attribute lists, run the cleaner, undo a few percent-escapes that lxml applies,
# and strip the single enclosing <div> that lxml adds.
#
# NOTE(review): the lines that consume `silent` and `strict`, the `try:` header,
# the `kwargs = {` opener and the final return are not visible in this listing
# (the embedded line numbers have gaps). Presumably `silent` decides whether
# errors are re-raised and `strict` selects the safe_attrs branch - TODO confirm.
56 def html_sanitize(src, silent=True, strict=False):
59 src = ustr(src, errors='replace')
# dedicated child logger so sanitizing warnings can be filtered separately
61 logger = logging.getLogger(__name__ + '.html_sanitize')
63 # html encode email tags
# matches "<" ... "@" ... ">" unless the content starts with "a" followed by
# whitespace, so regular <a href="mailto:..."> anchors are left untouched
64 part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
65 src = part.sub(lambda m: cgi.escape(m.group(1)), src)
# base options given to clean.Cleaner(**kwargs)
68 'page_structure': True,
69 'style': False, # do not remove style attributes
70 'forms': True, # remove form tags
71 'remove_unknown_tags': False,
72 'allow_tags': allowed_tags,
74 if etree.LXML_VERSION >= (2, 3, 1):
75 # kill_tags attribute has been added in version 2.3.1
77 'kill_tags': tags_to_kill,
78 'remove_tags': tags_to_remove,
# older lxml: no kill_tags support, so both lists go through remove_tags
81 kwargs['remove_tags'] = tags_to_kill + tags_to_remove
84 if etree.LXML_VERSION >= (3, 1, 0):
85 # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
87 'safe_attrs_only': True,
88 'safe_attrs': safe_attrs,
91 kwargs['safe_attrs_only'] = False # keep oe-data attributes + style
# NOTE(review): the trailing comma makes the assigned value the tuple (False,),
# which is truthy - the Cleaner therefore receives frames=<truthy> and the
# option is NOT disabled, contrary to the inline comment. Dropping the comma
# would change what the sanitizer lets through, so confirm intent first.
92 kwargs['frames'] = False, # do not remove frames (embed video in CMS blogs)
95 # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
96 cleaner = clean.Cleaner(**kwargs)
97 cleaned = cleaner.clean_html(src)
98 # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
# the replacements below also restore spaces and square brackets
99 cleaned = cleaned.replace('%24', '$')
100 cleaned = cleaned.replace('%7B', '{')
101 cleaned = cleaned.replace('%7D', '}')
102 cleaned = cleaned.replace('%20', ' ')
103 cleaned = cleaned.replace('%5B', '[')
104 cleaned = cleaned.replace('%5D', ']')
105 except etree.ParserError, e:
# ParserError whose message mentions 'empty' is special-cased (handling line not visible)
106 if 'empty' in str(e):
110 logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
# the placeholder paragraph replaces the unparsable content
111 cleaned = '<p>ParserError when sanitizing</p>'
115 logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
116 cleaned = '<p>Unknown error when sanitizing</p>'
118 # this is ugly, but lxml/etree tostring want to put everything in a 'div' that breaks the editor -> remove that
# 5 == len('<div>'), 6 == len('</div>')
119 if cleaned.startswith('<div>') and cleaned.endswith('</div>'):
120 cleaned = cleaned[5:-6]
125 #----------------------------------------------------------
127 #----------------------------------------------------------
129 def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_options=None,
130 protect_sections=False):
131 """ html_email_clean: clean the html by doing the following steps:
133 - try to strip email quotes, by removing blockquotes or having some client-
135 - try to strip signatures
136 - shorten the html to a maximum number of characters if requested
138 Some specific use case:
140 - MsOffice: ``div.style = border-top:solid;`` delimitates the beginning of
141 a quote; detecting by finding WordSection1 or MsoNormal
142 - Hotmail: ``hr.stopSpelling`` delimitates the beginning of a quote; detect
143 Hotmail by finding ``SkyDrivePlaceholder``
145 :param string html: sanitized html; tags like html or head should not
146 be present in the html string. This method therefore
147 takes as input html code coming from a sanitized source,
149 :param boolean remove: remove the html code that is unwanted; otherwise it
150 is only flagged and tagged
151 :param boolean shorten: shorten the html; every excessing content will
152 be flagged as to remove
153 :param int max_length: if shortening, maximum number of characters before
155 :param dict expand_options: options for the read more link when shortening
156 the content. The used keys are the following:
158 - oe_expand_container_tag: tag name of the
159 container of the whole read more link (default: span)
160 - oe_expand_container_class: class applied to the
161 link container (default: oe_mail_expand)
162 - oe_expand_container_content: content of the
163 container (default: ...)
164 - oe_expand_separator_node: optional separator, like
165 adding ... <br /><br /> <a ...>read more</a> (default: void)
166 - oe_expand_a_href: href of the read more link itself
168 - oe_expand_a_class: class applied to the <a> containing
169 the link itself (default: oe_mail_expand)
170 - oe_expand_a_content: content of the <a> (default: read more)
172 The formatted read more link is the following:
173 <cont_tag class="oe_expand_container_class">
174 oe_expand_container_content
175 if expand_options.get('oe_expand_separator_node'):
176 <oe_expand_separator_node/>
177 <a href="oe_expand_a_href" class="oe_expand_a_class">
# NOTE(review): the end of the docstring and several statements of this function
# are not visible in this listing (the embedded line numbers have gaps); the
# comments added below only describe the lines that are shown.
182 def _replace_matching_regex(regex, source, replace=''):
183 """ Replace all matching expressions in source by replace """
188 for item in re.finditer(regex, source):
# copy the text preceding the match, then the replacement
189 dest += source[idx:item.start()] + replace
# helper: create an lxml element named `tag` and copy `attrs` onto it
# (attrs is only iterated in the visible lines; the text/tail assignments are
# presumably in the lines not shown)
194 def _create_node(tag, text, tail=None, attrs={}):
195 new_node = etree.Element(tag)
198 for key, val in attrs.iteritems():
199 new_node.set(key, val)
# helper: build a node with _create_node and insert it into `node` at `index`
202 def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs={}):
203 new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs)
204 node.insert(index, new_node)
# helper: split node.text around every match of `regex`; each match is moved
# into a new `new_node_tag` child carrying `new_node_attrs`, and the text
# between two matches is moved into plain `new_node_tag` children
207 def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs={}):
208 text = node.text or ''
209 if not re.search(regex, text):
214 idx, iteration = 0, 0
215 for item in re.finditer(regex, text):
217 cur_node.text = text[idx:item.start()]
219 _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
220 new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)
# remaining text after the last match (plus the tail of cur_node) goes into a last untagged node
225 new_node = _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})
227 def _truncate_node(node, position, simplify_whitespaces=True):
228 """ Truncate a node text at a given position. This algorithm will shorten
229 at the end of the word whose ending character exceeds position.
231 :param bool simplify_whitespaces: whether to try to count all successive
232 whitespaces as one character. This
233 option should not be True when trying
234 to keep 'pre' consistency.
236 if node.text is None:
240 if simplify_whitespaces:
243 node_words = node.text.strip(' \t\r\n').split()
244 for word in node_words:
245 cur_char_nbr += len(word)
246 if cur_char_nbr >= position:
# NOTE(review): str.find returns the FIRST occurrence of `word`; when the same
# word also appears earlier in node.text the cut lands at that earlier place.
249 truncate_idx = node.text.find(word) + len(word)
251 truncate_idx = position
# clamp the cut position to the text length
252 if truncate_idx == -1 or truncate_idx > len(node.text):
253 truncate_idx = len(node.text)
255 # compose new text bits
256 innertext = node.text[0:truncate_idx]
257 outertext = node.text[truncate_idx:]
258 node.text = innertext
260 # create <span> ... <a href="#">read more</a></span> node
261 read_more_node = _create_node(
262 expand_options.get('oe_expand_container_tag', 'span'),
263 expand_options.get('oe_expand_container_content', ' ... '),
265 {'class': expand_options.get('oe_expand_container_class', 'oe_mail_expand')}
267 if expand_options.get('oe_expand_separator_node'):
268 read_more_separator_node = _create_node(
269 expand_options.get('oe_expand_separator_node'),
274 read_more_node.append(read_more_separator_node)
275 read_more_link_node = _create_node(
277 expand_options.get('oe_expand_a_content', 'read more'),
280 'href': expand_options.get('oe_expand_a_href', '#'),
281 'class': expand_options.get('oe_expand_a_class', 'oe_mail_expand'),
284 read_more_node.append(read_more_link_node)
285 # create outertext node
286 overtext_node = _create_node('span', outertext)
# the cut-off text is kept in the tree, flagged as over-length content
288 overtext_node.set('in_overlength', '1')
289 # add newly created nodes in dom
290 node.append(read_more_node)
291 node.append(overtext_node)
293 if expand_options is None:
# only non-empty string input is processed (handling of the other case is not visible)
296 if not html or not isinstance(html, basestring):
301 # ------------------------------------------------------------
302 # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'
304 # html: remove encoding attribute inside tags
# NOTE(review): the substitution replaces the whole match, which runs from '<'
# to '>', so the entire tag holding an encoding attribute is dropped, not only
# the attribute itself.
305 doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
306 html = doctype.sub(r"", html)
308 # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
309 br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
310 html = _replace_matching_regex(br_div_tags, html, '<br />')
313 root = lxml.html.fromstring(html)
# a root with no children, no text and no tail gets re-parsed wrapped in a <div>
314 if not len(root) and root.text is None and root.tail is None:
315 html = '<div>%s</div>' % html
316 root = lxml.html.fromstring(html)
# text-based quote lines start with one or more '>' after a newline; a text
# signature starts with two or more dashes followed by a line break
318 quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
319 signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[\s\S]+)')
320 for node in root.iter():
321 # remove all tails and replace them by a span element, because managing text and tails can be a pain in the ass
323 tail_node = _create_node('span', node.tail)
325 node.addnext(tail_node)
327 # form node and tag text-based quotes and signature
328 _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
329 _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})
332 # ------------------------------------------------------------
335 # signature_begin = False # try dynamic signature recognition
338 overlength_section_id = None
339 overlength_section_count = 0
# tagging pass: mark nodes with temporary attributes (in_quote, in_overlength,
# tail_remove, truncate, section_*) that the later passes act upon
341 for node in root.iter():
342 # do not take into account multiple spaces that are displayed as max 1 space in html
343 node_text = ' '.join((node.text and node.text.strip(' \t\r\n') or '').split())
345 # root: try to tag the client used to write the html
346 if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
347 root.set('msoffice', '1')
348 if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
349 root.set('hotmail', '1')
351 # protect sections by tagging section limits and blocks contained inside sections, using an increasing id to re-find them later
352 if node.tag == 'section':
353 overlength_section_count += 1
354 node.set('section_closure', str(overlength_section_count))
355 if node.getparent() is not None and (node.getparent().get('section_closure') or node.getparent().get('section_inner')):
356 node.set('section_inner', str(overlength_section_count))
358 # state of the parsing: flag quotes and tails to remove
360 node.set('in_quote', '1')
361 node.set('tail_remove', '1')
362 # state of the parsing: flag when being in over-length content, depending on section content if defined (only when having protect_sections)
364 if not overlength_section_id or int(node.get('section_inner', overlength_section_count + 1)) > overlength_section_count:
365 node.set('in_overlength', '1')
366 node.set('tail_remove', '1')
368 # find quote in msoffice / hotmail / blockquote / text quote and signatures
369 if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
371 node.set('in_quote', '1')
372 node.set('tail_remove', '1')
373 if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
375 node.set('in_quote', '1')
376 node.set('tail_remove', '1')
377 if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
378 node.set('in_quote', '1')
381 # if protect section:
382 # 1/ find the first parent not being inside a section
383 # 2/ add the read more link
385 # 1/ truncate the text at the next available space
386 # 2/ create a 'read more' node, next to current node
387 # 3/ add the truncated text in a new node, next to 'read more' node
388 node_text = (node.text or '').strip().strip('\n').strip()
389 if shorten and not overlength and cur_char_nbr + len(node_text) > max_length:
390 node_to_truncate = node
# climb to an ancestor when the candidate is quoted content or sits inside a protected section
391 while node_to_truncate.getparent() is not None:
392 if node_to_truncate.get('in_quote'):
393 node_to_truncate = node_to_truncate.getparent()
394 elif protect_sections and (node_to_truncate.getparent().get('section_inner') or node_to_truncate.getparent().get('section_closure')):
395 node_to_truncate = node_to_truncate.getparent()
396 overlength_section_id = node_to_truncate.get('section_closure')
401 node_to_truncate.set('truncate', '1')
# same node: cut after the remaining character budget; an ancestor was chosen
# instead: the position is set to the full length of the current node's text
402 if node_to_truncate == node:
403 node_to_truncate.set('truncate_position', str(max_length - cur_char_nbr))
405 node_to_truncate.set('truncate_position', str(len(node.text or '')))
406 cur_char_nbr += len(node_text)
409 # ------------------------------------------------------------
# truncation pass: cut the text of nodes flagged 'truncate'; whitespace
# simplification is switched off for <pre> nodes
411 for node in root.iter():
412 if node.get('truncate'):
413 _truncate_node(node, int(node.get('truncate_position', '0')), node.tag != 'pre')
416 # ------------------------------------------------------------
# cleanup pass: collect flagged nodes and strip the temporary attributes
419 for node in root.iter():
420 if node.get('in_quote') or node.get('in_overlength'):
421 # copy the node tail into parent text
# NOTE(review): the comment above says "parent text" but the assignment
# targets parent.tail - confirm which one is intended.
422 if node.tail and not node.get('tail_remove'):
423 parent = node.getparent()
424 parent.tail = node.tail + (parent.tail or '')
425 to_remove.append(node)
426 if node.get('tail_remove'):
429 for attribute_name in ['in_quote', 'tail_remove', 'in_overlength', 'msoffice', 'hotmail', 'truncate', 'truncate_position']:
430 node.attrib.pop(attribute_name, None)
431 for node in to_remove:
433 node.getparent().remove(node)
435 if not expand_options.get('oe_expand_a_class', 'oe_mail_expand') in node.get('class', ''): # trick: read more link should be displayed even if it's in overlength
436 node_class = node.get('class', '') + ' oe_mail_cleaned'
437 node.set('class', node_class)
439 # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
440 html = etree.tostring(root, pretty_print=False)
441 linebreaks = re.compile(r'<span[^>]*>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
442 html = _replace_matching_regex(linebreaks, html, '\n')
447 #----------------------------------------------------------
448 # HTML/Text management
449 #----------------------------------------------------------
451 def html2plaintext(html, body_id=None, encoding='utf-8'):
452 """ From an HTML text, convert the HTML to plain text.
453 If @param body_id is provided then this is the tag where the
454 body (not necessarily <body>) starts.
456 ## (c) Fry-IT, www.fry-it.com, 2007
457 ## <peter@fry-it.com>
458 ## download here: http://www.peterbe.com/plog/html2plaintext
# NOTE(review): several statements of this function (docstring end, else
# branch, url_index / counter initialisation, return) are not visible in this
# listing; the comments below only describe the lines that are shown.
461 tree = etree.fromstring(html, parser=etree.HTMLParser())
463 if body_id is not None:
# NOTE(review): body_id is interpolated without quotes, so XPath reads it as an
# expression (a node test or a number) rather than a string literal; an id such
# as "main" would need '//*[@id="%s"]'. Confirm how callers pass body_id.
464 source = tree.xpath('//*[@id=%s]' % (body_id,))
466 source = tree.xpath('//body')
# links: append a numeric marker "[i]" to the link text and remember the url
472 for link in tree.findall('.//a'):
473 url = link.get('href')
477 link.text = '%s [%s]' % (link.text, i)
478 url_index.append(url)
480 html = ustr(etree.tostring(tree, encoding=encoding))
481 # \r char is converted into &#13;, must remove it
# NOTE(review): the literal searched for on the next line looks mangled in this
# listing (a blank where the character reference presumably was); as written it
# would strip every space. Presumably the original literal is '&#13;' - TODO
# confirm against the real file.
482 html = html.replace(' ', '')
# lightweight text markup for emphasis and headings
484 html = html.replace('<strong>', '*').replace('</strong>', '*')
485 html = html.replace('<b>', '*').replace('</b>', '*')
486 html = html.replace('<h3>', '*').replace('</h3>', '*')
487 html = html.replace('<h2>', '**').replace('</h2>', '**')
488 html = html.replace('<h1>', '**').replace('</h1>', '**')
489 html = html.replace('<em>', '/').replace('</em>', '/')
# row starts, paragraph ends and <br> become newlines; every other tag becomes a space
490 html = html.replace('<tr>', '\n')
491 html = html.replace('</p>', '\n')
492 html = re.sub('<br\s*/?>', '\n', html)
493 html = re.sub('<.*?>', ' ', html)
# single pass: each pair of spaces becomes one (longer runs are only halved)
494 html = html.replace(' ' * 2, ' ')
497 html = '\n'.join([x.strip() for x in html.splitlines()])
498 html = html.replace('\n' * 2, '\n')
# footer: one "[n] url" line per collected link, numbered from 1
500 for i, url in enumerate(url_index):
503 html += ustr('[%s] %s\n') % (i + 1, url)
# NOTE(review): the docstring below says the content is embedded into a <div> by
# default, but container_tag defaults to False and the only visible wrapping
# statement uses container_tag as the tag name - presumably no container is
# added by default; confirm against the lines not visible in this listing
# (initialisation of idx/final, the container test and the return).
507 def plaintext2html(text, container_tag=False):
508 """ Convert plaintext into html. Content of the text is escaped to manage
509 html entities, using cgi.escape().
510 - all \n,\r are replaced by <br />
511 - enclose content into <p>
512 - 2 or more consecutive <br /> are considered as paragraph breaks
514 :param string container_tag: container of the html; by default the
515 content is embedded into a <div>
517 text = cgi.escape(ustr(text))
519 # 1. replace \n and \r
# NOTE(review): \n and \r are replaced independently, so a Windows "\r\n" line
# ending yields two consecutive <br/> and is then treated as a paragraph break
# by step 2-3.
520 text = text.replace('\n', '<br/>')
521 text = text.replace('\r', '<br/>')
523 # 2-3: form paragraphs
# a run of two or more <br> tags (any case, optional slash/spaces) ends a paragraph
526 br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
527 for item in re.finditer(br_tags, text):
528 final += text[idx:item.start()] + '</p><p>'
530 final += text[idx:] + '</p>'
# wrap the paragraphs into <container_tag>...</container_tag>
534 final = '<%s>%s</%s>' % (container_tag, final, container_tag)
537 def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
538 """ Append extra content at the end of an HTML snippet, trying
539 to locate the end of the HTML document (</body>, </html>, or
540 EOF), and converting the provided content in html unless ``plaintext``
542 Content conversion can be done in two ways:
543 - wrapping it into a pre (preserve=True)
544 - use plaintext2html (preserve=False, using container_tag to wrap the
546 A side-effect of this method is to coerce all HTML tags to
547 lowercase in ``html``, and strip enclosing <html> or <body> tags in
548 content if ``plaintext`` is False.
550 :param str html: html tagsoup (doesn't have to be XHTML)
551 :param str content: extra content to append
552 :param bool plaintext: whether content is plaintext and should
553 be wrapped in a <pre/> tag.
554 :param bool preserve: if content is plaintext, wrap it into a <pre>
555 instead of converting it into html
# NOTE(review): the end of the docstring and the elif/else headers selecting
# between the three conversions below are not visible in this listing.
558 if plaintext and preserve:
559 content = u'\n<pre>%s</pre>\n' % ustr(content)
561 content = '\n%s\n' % plaintext2html(content, container_tag)
# NOTE(review): '.*' is greedy and '.' does not cross newlines, so on a single
# line such as '<body><p>x</p></body>' the match runs from '<body' to the LAST
# '>' of the line and the inner markup is removed as well, not only the
# <html>/<body>/doctype tags. A non-greedy '[^>]*' would limit it to the tag.
563 content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
564 content = u'\n%s\n' % ustr(content)
565 # Force all tags to lowercase
566 html = re.sub(r'(</?)\W*(\w+)([ >])',
567 lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
# insertion point: before </body>, else before </html>, else at the very end
568 insert_location = html.find('</body>')
569 if insert_location == -1:
570 insert_location = html.find('</html>')
571 if insert_location == -1:
572 return '%s%s' % (html, content)
573 return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
575 #----------------------------------------------------------
577 #----------------------------------------------------------
# Matches any email address embedded in a body of text; group(1) is the address.
# The final label accepts 2 to 63 letters (63 is the DNS label length limit) so
# that long TLDs such as ".photography" are matched in full instead of being
# truncated or rejected.
email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63})""", re.VERBOSE)

# Matches a string containing exactly one email address and nothing else.
single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63}$""", re.VERBOSE)

# Matches a bracketed integer such as "[42]"; group(1) = the digits.
res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
# Matches a "Set-<name>: <value>" line; group(1) = name, group(2) = value.
command_re = re.compile(r"^Set-([a-z]+) *: *(.+)$", re.I | re.UNICODE)

# Updated in 7.0 to match the model name as well
# Typical form of references is <timestamp-openerp-record_id-model_name@domain>
# group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
reference_re = re.compile(r"<.*-open(?:object|erp)-(\d+)(?:-([\w.]+))?.*@(.*)>", re.UNICODE)

# Typical form of bounce is bounce-128-crm.lead-34@domain
# group(1) = the mail ID; group(2) = the model (if any); group(3) = the record ID
bounce_re = re.compile(r"[\w]+-(\d+)-?([\w.]+)?-?(\d+)?", re.UNICODE)
def generate_tracking_message_id(res_id):
    """Return a string usable as the ``Message-ID`` RFC822 header field.

    Used to track the replies related to a given object thanks to the
    "In-Reply-To" or "References" fields that Mail User Agents will set.

    :param res_id: identifier of the tracked record, interpolated with ``%s``
    :return: a string shaped like
             ``<timestamp.random_digits-openerp-res_id@hostname>``
    """
    try:
        # prefer the OS entropy source when the platform provides one
        rnd = random.SystemRandom().random()
    except NotImplementedError:
        rnd = random.random()
    # keep only the 15 fractional digits (the leading "0." is dropped)
    rndstr = ("%.15f" % rnd)[2:]
    return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
611 def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
612 attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
613 smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
614 """Low-level function for sending an email (deprecated).
616 :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
617 :param email_from: A string used to fill the `From` header, if falsy,
618 config['email_from'] is used instead. Also used for
619 the `Reply-To` header if `reply_to` is not provided
620 :param email_to: a sequence of addresses to send the mail to.
# NOTE(review): the end of the docstring, the try/except structure, the cursor
# cleanup and the return value are not visible in this listing; the comments
# below only describe the lines that are shown.
623 # If not cr, get cr from current thread database
# the database name is read from a 'dbname' attribute of the current thread
626 db_name = getattr(threading.currentThread(), 'dbname', None)
# the new cursor is also kept in local_cr (presumably so that it can be closed
# once the mail is sent - TODO confirm)
628 local_cr = cr = openerp.registry(db_name).db.cursor()
630 raise Exception("No database cursor found, please pass one explicitly")
634 mail_server_pool = openerp.registry(cr.dbname)['ir.mail_server']
636 # Pack Message into MIME Object
637 email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
638 attachments, message_id, references, openobject_id, subtype, headers=headers)
# uid falls back to user id 1 when not given; the boolean `ssl` flag is
# translated into smtp_encryption='ssl' (None otherwise)
640 res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
641 smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
642 smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
# logs the delivery failure together with the active traceback
644 _logger.exception("tools.email_send failed to deliver email")
651 def email_split(text):
652 """ Return a list of the email addresses found in ``text`` """
655 return [addr[1] for addr in getaddresses([text])
656 # getaddresses() returns '' when email parsing fails, and
657 # sometimes returns emails without at least '@'. The '@'
658 # is strictly required in RFC2822's `addr-spec`.