1 # -*- coding: utf-8 -*-
2 ##############################################################################
4 # OpenERP, Open Source Business Applications
5 # Copyright (C) 2012-TODAY OpenERP S.A. (<http://openerp.com>).
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as
9 # published by the Free Software Foundation, either version 3 of the
10 # License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ##############################################################################
import cgi
import logging
import random
import re
import socket
import threading
import time
from email.utils import getaddresses

import lxml.html
import lxml.html.clean as clean
from lxml import etree

import openerp
from openerp.loglevels import ustr
37 _logger = logging.getLogger(__name__)
#----------------------------------------------------------
# HTML Sanitizer
#----------------------------------------------------------
# Tags removed together with their whole content.
tags_to_kill = ["script", "head", "meta", "title", "link", "style", "frame", "iframe", "base", "object", "embed"]
# Tags whose markup is dropped while their children are kept.
tags_to_remove = ['html', 'body', 'font']

# allow new semantic HTML5 tags
allowed_tags = clean.defs.tags | frozenset('article section header footer hgroup nav aside figure main'.split() + [etree.Comment])
# Attributes kept by the strict sanitizer, on top of lxml's own safe list.
# frozenset() takes a single iterable, hence the list literal.
# NOTE(review): 'style' is restored from a line missing in the reviewed copy;
# it matches the "keep style" comments in html_sanitize -- confirm.
safe_attrs = clean.defs.safe_attrs | frozenset(
    ['style',
     'data-oe-model', 'data-oe-id', 'data-oe-field', 'data-oe-type', 'data-oe-expression', 'data-oe-translate', 'data-oe-nodeid',
     'data-snippet-id', 'data-publish', 'data-id', 'data-res_id', 'data-member_id', 'data-view-id'
    ])
def html_sanitize(src, silent=True, strict=False):
    """Sanitize an HTML fragment using lxml's ``Cleaner``.

    :param src: HTML source; a falsy value is returned unchanged
    :param bool silent: when True, errors raised while cleaning are logged and
        a placeholder paragraph is returned instead; when False they are
        re-raised
    :param bool strict: when True (and lxml >= 3.1.0), only the attributes
        listed in ``safe_attrs`` are kept; when False all attributes are kept
    :return: the cleaned HTML, without the ``<div>`` wrapper that lxml adds;
        an empty string when lxml reports an empty document
    """
    if not src:
        return src
    src = ustr(src, errors='replace')

    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)

    kwargs = {
        'page_structure': True,
        'style': False,             # do not remove style attributes
        'forms': True,              # remove form tags
        'remove_unknown_tags': False,
        'allow_tags': allowed_tags,
        # NOTE(review): restored from a line missing in the reviewed copy;
        # consistent with etree.Comment being part of allowed_tags -- confirm.
        'comments': False,
        'processing_instructions': False
    }
    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        kwargs.update({
            'kill_tags': tags_to_kill,
            'remove_tags': tags_to_remove,
        })
    else:
        kwargs['remove_tags'] = tags_to_kill + tags_to_remove

    if strict:
        if etree.LXML_VERSION >= (3, 1, 0):
            # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
            kwargs.update({
                'safe_attrs_only': True,
                'safe_attrs': safe_attrs,
            })
    else:
        kwargs['safe_attrs_only'] = False  # keep oe-data attributes + style
        # do not remove frames (embed video in CMS blogs). This used to read
        # ``False,`` -- a one-element tuple, which is truthy and therefore
        # enabled the very option the comment meant to disable.
        kwargs['frames'] = False

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = clean.Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
        # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
        cleaned = cleaned.replace('%24', '$')
        cleaned = cleaned.replace('%7B', '{')
        cleaned = cleaned.replace('%7D', '}')
        cleaned = cleaned.replace('%20', ' ')
        cleaned = cleaned.replace('%5B', '[')
        cleaned = cleaned.replace('%5D', ']')
    except etree.ParserError as e:
        # an empty document is not an error: there is simply nothing to keep
        if 'empty' in str(e):
            return ""
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>Unknown error when sanitizing</p>'

    # this is ugly, but lxml/etree tostring want to put everything in a 'div' that breaks the editor -> remove that
    if cleaned.startswith('<div>') and cleaned.endswith('</div>'):
        cleaned = cleaned[5:-6]

    return cleaned
#----------------------------------------------------------
# HTML Cleaner
#----------------------------------------------------------
def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_options=None,
                     protect_sections=False):
    """ html_email_clean: clean the html by doing the following steps:

     - try to strip email quotes, by removing blockquotes or having some client-
       specific heuristics
     - try to strip signatures
     - shorten the html to a maximum number of characters if requested

    Some specific use case:

     - MsOffice: ``div.style = border-top:solid;`` delimitates the beginning of
       a quote; detecting by finding WordSection1 of MsoNormal
     - Hotmail: ``hr.stopSpelling`` delimitates the beginning of a quote; detect
       Hotmail by finding ``SkyDrivePlaceholder``

    :param string html: sanitized html; tags like html or head should not
                        be present in the html string. This method therefore
                        takes as input html code coming from a sanitized source,
                        like fields.html.
    :param boolean remove: remove the html code that is unwanted; otherwise it
                           is only flagged and tagged
    :param boolean shorten: shorten the html; every excessing content will
                            be flagged as to remove
    :param int max_length: if shortening, maximum number of characters before
                           shortening
    :param dict expand_options: options for the read more link when shortening
                                the content.The used keys are the following:

                                 - oe_expand_container_tag: tag of the
                                   container of the whole read more link
                                   (default: span)
                                 - oe_expand_container_class: class applied to the
                                   link container (default: oe_mail_expand)
                                 - oe_expand_container_content: content of the
                                   container (default: ...)
                                 - oe_expand_separator_node: optional separator, like
                                   adding ... <br /><br /> <a ...>read more</a> (default: void)
                                 - oe_expand_a_href: href of the read more link itself
                                   (default: #)
                                 - oe_expand_a_class: class applied to the <a> containing
                                   the link itself (default: oe_mail_expand)
                                 - oe_expand_a_content: content of the <a> (default: read more)

                                The formatted read more link is the following:
                                <cont_tag class="oe_expand_container_class">
                                    oe_expand_container_content
                                    if expand_options.get('oe_expand_separator_node'):
                                        <oe_expand_separator_node/>
                                    <a href="oe_expand_a_href" class="oe_expand_a_class">
                                        oe_expand_a_content
                                    </a>
                                </cont_tag>
    :param boolean protect_sections: when shortening, do not cut inside a
                                     ``section`` element; the truncation is
                                     moved up to the enclosing section instead
    :return: the cleaned html; a falsy or non-string input is returned as is
    """
    def _replace_matching_regex(regex, source, replace=''):
        """ Replace all matching expressions in source by replace """
        if not source:
            return source
        dest = ''
        idx = 0
        for item in re.finditer(regex, source):
            dest += source[idx:item.start()] + replace
            idx = item.end()
        dest += source[idx:]
        return dest

    def _create_node(tag, text, tail=None, attrs=None):
        """ Build a detached element with the given text, tail and attributes """
        new_node = etree.Element(tag)
        new_node.text = text
        new_node.tail = tail
        for key, val in (attrs or {}).items():
            new_node.set(key, val)
        return new_node

    def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs=None):
        """ Create a node and insert it as a child of ``node`` at ``index`` """
        new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs)
        node.insert(index, new_node)
        return new_node

    def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs=None):
        """ Split the text of ``node`` around every match of ``regex``: each
        match is moved into a new child carrying ``new_node_attrs``, the text
        in between into plain children. """
        text = node.text or ''
        if not re.search(regex, text):
            return

        cur_node = node
        node.text = ''
        idx, iteration = 0, 0
        for item in re.finditer(regex, text):
            if iteration == 0:
                # text before the first match stays the text of the node
                cur_node.text = text[idx:item.start()]
            else:
                _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
            new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)

            cur_node = new_node
            idx = item.end()
            iteration += 1
        new_node = _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})

    def _truncate_node(node, position, simplify_whitespaces=True):
        """ Truncate a node text at a given position. This algorithm will shorten
        at the end of the word whose ending character exceeds position.

            :param bool simplify_whitespaces: whether to try to count all successive
                                              whitespaces as one character. This
                                              option should not be True when trying
                                              to keep 'pre' consistency.
        """
        if node.text is None:
            node.text = ''

        truncate_idx = -1
        if simplify_whitespaces:
            cur_char_nbr = 0
            word = None
            node_words = node.text.strip(' \t\r\n').split()
            for word in node_words:
                cur_char_nbr += len(word)
                if cur_char_nbr >= position:
                    break
            if word:
                truncate_idx = node.text.find(word) + len(word)
        else:
            truncate_idx = position
        if truncate_idx == -1 or truncate_idx > len(node.text):
            truncate_idx = len(node.text)

        # compose new text bits
        innertext = node.text[0:truncate_idx]
        outertext = node.text[truncate_idx:]
        node.text = innertext

        # create <span> ... <a href="#">read more</a></span> node
        read_more_node = _create_node(
            expand_options.get('oe_expand_container_tag', 'span'),
            expand_options.get('oe_expand_container_content', ' ... '),
            None,
            {'class': expand_options.get('oe_expand_container_class', 'oe_mail_expand')}
        )
        if expand_options.get('oe_expand_separator_node'):
            read_more_separator_node = _create_node(
                expand_options.get('oe_expand_separator_node'),
                '',
                None,
                {}
            )
            read_more_node.append(read_more_separator_node)
        read_more_link_node = _create_node(
            'a',
            expand_options.get('oe_expand_a_content', 'read more'),
            None,
            {
                'href': expand_options.get('oe_expand_a_href', '#'),
                'class': expand_options.get('oe_expand_a_class', 'oe_mail_expand'),
            }
        )
        read_more_node.append(read_more_link_node)
        # create outertext node
        overtext_node = _create_node('span', outertext)
        # tag node
        overtext_node.set('in_overlength', '1')
        # add newly created nodes in dom
        node.append(read_more_node)
        node.append(overtext_node)

    if expand_options is None:
        expand_options = {}

    if not html or not isinstance(html, basestring):
        return html
    html = ustr(html)

    # Pre processing
    # ------------------------------------------------------------
    # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'

    # html: remove encoding attribute inside tags
    doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
    html = doctype.sub(r"", html)

    # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
    br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
    html = _replace_matching_regex(br_div_tags, html, '<br />')

    # form a tree
    root = lxml.html.fromstring(html)
    if not len(root) and root.text is None and root.tail is None:
        html = '<div>%s</div>' % html
        root = lxml.html.fromstring(html)

    quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
    signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[\s\S]+)')
    for node in root.iter():
        # remove all tails and replace them by a span element, because managing text and tails is painful
        if node.tail:
            tail_node = _create_node('span', node.tail)
            node.tail = None
            node.addnext(tail_node)

        # form node and tag text-based quotes and signature
        _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
        _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})

    # Processing
    # ------------------------------------------------------------

    # tree: tag nodes
    # signature_begin = False  # try dynamic signature recognition
    quote_begin = False
    overlength = False
    overlength_section_id = None
    overlength_section_count = 0
    cur_char_nbr = 0
    for node in root.iter():
        # comments do not need processing
        # note: bug in node.get(value, default) for HtmlComments, default never returned
        if node.tag == etree.Comment:
            continue
        # do not take into account multiple spaces that are displayed as max 1 space in html
        node_text = ' '.join((node.text and node.text.strip(' \t\r\n') or '').split())

        # root: try to tag the client used to write the html
        if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
            root.set('msoffice', '1')
        if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
            root.set('hotmail', '1')

        # protect sections by tagging section limits and blocks contained inside sections, using an increasing id to re-find them later
        if node.tag == 'section':
            overlength_section_count += 1
            node.set('section_closure', str(overlength_section_count))
        if node.getparent() is not None and (node.getparent().get('section_closure') or node.getparent().get('section_inner')):
            node.set('section_inner', str(overlength_section_count))

        # state of the parsing: flag quotes and tails to remove
        if quote_begin:
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        # state of the parsing: flag when being in over-length content, depending on section content if defined (only when having protect_sections)
        if overlength:
            if not overlength_section_id or int(node.get('section_inner', overlength_section_count + 1)) > overlength_section_count:
                node.set('in_overlength', '1')
                node.set('tail_remove', '1')

        # find quote in msoffice / hotmail / blockquote / text quote and signatures
        if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
            quote_begin = True
            node.set('in_quote', '1')
            node.set('tail_remove', '1')
        if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
            node.set('in_quote', '1')

        # shorten:
        # if protect section:
        #   1/ find the first parent not being inside a section
        #   2/ add the read more link
        # else:
        #   1/ truncate the text at the next available space
        #   2/ create a 'read more' node, next to current node
        #   3/ add the truncated text in a new node, next to 'read more' node
        node_text = (node.text or '').strip().strip('\n').strip()
        if shorten and not overlength and cur_char_nbr + len(node_text) > max_length:
            node_to_truncate = node
            while node_to_truncate.getparent() is not None:
                if node_to_truncate.get('in_quote'):
                    node_to_truncate = node_to_truncate.getparent()
                elif protect_sections and (node_to_truncate.getparent().get('section_inner') or node_to_truncate.getparent().get('section_closure')):
                    node_to_truncate = node_to_truncate.getparent()
                    overlength_section_id = node_to_truncate.get('section_closure')
                else:
                    break

            overlength = True
            node_to_truncate.set('truncate', '1')
            if node_to_truncate == node:
                node_to_truncate.set('truncate_position', str(max_length - cur_char_nbr))
            else:
                node_to_truncate.set('truncate_position', str(len(node.text or '')))
        cur_char_nbr += len(node_text)

    # Tree modification
    # ------------------------------------------------------------

    for node in root.iter():
        if node.get('truncate'):
            _truncate_node(node, int(node.get('truncate_position', '0')), node.tag != 'pre')

    # Post processing
    # ------------------------------------------------------------

    to_remove = []
    for node in root.iter():
        if node.get('in_quote') or node.get('in_overlength'):
            # move the node tail in front of the parent tail
            if node.tail and not node.get('tail_remove'):
                parent = node.getparent()
                parent.tail = node.tail + (parent.tail or '')
            to_remove.append(node)
        if node.get('tail_remove'):
            node.tail = ''
        # clean node
        for attribute_name in ['in_quote', 'tail_remove', 'in_overlength', 'msoffice', 'hotmail', 'truncate', 'truncate_position']:
            node.attrib.pop(attribute_name, None)
    for node in to_remove:
        if remove:
            node.getparent().remove(node)
        else:
            if expand_options.get('oe_expand_a_class', 'oe_mail_expand') not in node.get('class', ''):  # trick: read more link should be displayed even if it's in overlength
                node_class = node.get('class', '') + ' oe_mail_cleaned'
                node.set('class', node_class)

    # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
    html = etree.tostring(root, pretty_print=False)
    linebreaks = re.compile(r'<span[^>]*>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
    html = _replace_matching_regex(linebreaks, html, '\n')

    return html
453 #----------------------------------------------------------
454 # HTML/Text management
455 #----------------------------------------------------------
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.

    Links are replaced by their text followed by ``[n]``, and the matching
    ``[n] url`` list is appended at the end of the returned text.

    :param html: HTML source
    :param body_id: id of the element to start from; when None, ``<body>``
    :param str encoding: encoding used when serializing the tree
    :return: the plain text version of ``html``
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <peter@fry-it.com>
    ## download here: http://www.peterbe.com/plog/html2plaintext

    html = ustr(html)
    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        # NOTE(review): body_id is interpolated verbatim into the XPath
        # expression, so it apparently has to be passed already quoted --
        # confirm against callers before changing.
        source = tree.xpath('//*[@id=%s]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            # link.text is None when the anchor only wraps child elements
            link.text = '%s [%s]' % (link.text or '', i)
            url_index.append(url)

    html = ustr(etree.tostring(tree, encoding=encoding))
    # \r char is serialized as the numeric character reference 13, must remove it
    # (entity text is built with % so that it survives entity-decoding tools)
    html = html.replace('&#%d;' % 13, '')

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub(r'<br\s*/?>', '\n', html)
    html = re.sub(r'<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')
    # unescape the basic entities; "amp" goes last so that an escaped
    # entity is not unescaped twice
    for entity_name, char in (('gt', '>'), ('lt', '<'), ('amp', '&')):
        html = html.replace('&%s;' % entity_name, char)

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    for index, url in enumerate(url_index):
        if index == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (index + 1, url)

    return html
def plaintext2html(text, container_tag=False):
    """ Convert plaintext into html. Content of the text is escaped to manage
        html entities, using cgi.escape().
        - all newlines and carriage returns are replaced by <br />
        - enclose content into <p>
        - 2 or more consecutive <br /> are considered as paragraph breaks

        :param string container_tag: tag wrapping the generated html; when
            falsy (the default) the paragraphs are returned without wrapper
        :return: the generated html, as unicode
    """
    text = cgi.escape(ustr(text))

    # 1. replace \n and \r
    # NOTE(review): a Windows "\r\n" yields two <br/> and is therefore
    # treated as a paragraph break by step 2 -- confirm this is intended.
    text = text.replace('\n', '<br/>')
    text = text.replace('\r', '<br/>')

    # 2-3: form paragraphs
    idx = 0
    final = '<p>'
    br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
    for item in re.finditer(br_tags, text):
        final += text[idx:item.start()] + '</p><p>'
        idx = item.end()
    final += text[idx:] + '</p>'

    # 4. container
    if container_tag:
        final = '<%s>%s</%s>' % (container_tag, final, container_tag)
    return ustr(final)
def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
    """ Append extra content at the end of an HTML snippet, trying
    to locate the end of the HTML document (</body>, </html>, or
    EOF), and converting the provided content in html unless ``plaintext``
    is False.
    Content conversion can be done in two ways:
    - wrapping it into a pre (preserve=True)
    - use plaintext2html (preserve=False, using container_tag to wrap the
        generated html)
    A side-effect of this method is to coerce all HTML tags to
    lowercase in ``html``, and strip enclosing <html> or <body> tags in
    content if ``plaintext`` is False.

    :param str html: html tagsoup (doesn't have to be XHTML)
    :param str content: extra content to append
    :param bool plaintext: whether content is plaintext and should
        be converted (or wrapped in a <pre/> tag, see ``preserve``).
    :param bool preserve: if content is plaintext, wrap it into a <pre>
        instead of converting it into html
    :param container_tag: passed to plaintext2html when converting
    :return: ``html`` with ``content`` inserted before ``</body>``, else
        before ``</html>``, else appended at the end
    """
    html = ustr(html)
    if plaintext and preserve:
        content = u'\n<pre>%s</pre>\n' % ustr(content)
    elif plaintext:
        content = '\n%s\n' % plaintext2html(content, container_tag)
    else:
        content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
        content = u'\n%s\n' % ustr(content)
    # Force all tags to lowercase
    html = re.sub(r'(</?)\W*(\w+)([ >])',
        lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
    insert_location = html.find('</body>')
    if insert_location == -1:
        insert_location = html.find('</html>')
    if insert_location == -1:
        return '%s%s' % (html, content)
    return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
#----------------------------------------------------------
# Emails
#----------------------------------------------------------
# matches any email in a body of text; group(1) = the address
email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", re.VERBOSE)

# matches a string containing only one email
single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$""", re.VERBOSE)

# matches a bracketed number such as "[42]"; group(1) = the number
res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
# matches a "Set-<name>: <value>" line; group(1) = name ; group(2) = value
command_re = re.compile(r"^Set-([a-z]+) *: *(.+)$", re.I | re.UNICODE)

# Updated in 7.0 to match the model name as well
# Typical form of references is <timestamp-openerp-record_id-model_name@domain>
# group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
reference_re = re.compile(r"<.*-open(?:object|erp)-(\d+)(?:-([\w.]+))?[^>]*@([^>]*)>", re.UNICODE)

# Typical form of bounce is bounce-128-crm.lead-34@domain
# group(1) = the mail ID; group(2) = the model (if any); group(3) = the record ID
bounce_re = re.compile(r"[\w]+-(\d+)-?([\w.]+)?-?(\d+)?", re.UNICODE)
def generate_tracking_message_id(res_id):
    """Returns a string that can be used in the Message-ID RFC822 header field

       Used to track the replies related to a given object thanks to the "In-Reply-To"
       or "References" fields that Mail User Agents will set.

       :param res_id: identifier embedded in the message id after ``-openerp-``
       :return: ``<timestamp.random-openerp-res_id@hostname>``
    """
    try:
        rnd = random.SystemRandom().random()
    except NotImplementedError:
        # no OS-level randomness source available: use the default generator
        rnd = random.random()
    # keep only the 15 decimals, dropping the leading "0."
    rndstr = ("%.15f" % rnd)[2:]
    return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
               attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
               smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
    """Low-level function for sending an email (deprecated).

    :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
    :param email_from: A string used to fill the `From` header, if falsy,
                       config['email_from'] is used instead.  Also used for
                       the `Reply-To` header if `reply_to` is not provided
    :param email_to: a sequence of addresses to send the mail to.
    :param cr: database cursor; when missing, a cursor is opened on the
               database named by the ``dbname`` attribute of the current
               thread and closed before returning
    :param uid: user id passed to ``send_email``; defaults to 1
    :return: the result of ``ir.mail_server.send_email()``, or False when
             building or sending the message raised
    :raise Exception: when no cursor is given and none can be opened

    The remaining arguments are forwarded as is to ``build_email`` and
    ``send_email`` of ``ir.mail_server``.
    """

    # If not cr, get cr from current thread database
    local_cr = None
    if not cr:
        db_name = getattr(threading.current_thread(), 'dbname', None)
        if db_name:
            local_cr = cr = openerp.registry(db_name).cursor()
        else:
            raise Exception("No database cursor found, please pass one explicitly")

    # Send Email
    try:
        mail_server_pool = openerp.registry(cr.dbname)['ir.mail_server']
        res = False
        # Pack Message into MIME Object
        email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
                   attachments, message_id, references, openobject_id, subtype, headers=headers)

        res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
                       smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
                       smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
    except Exception:
        _logger.exception("tools.email_send failed to deliver email")
        return False
    finally:
        # only close the cursor opened here, never one owned by the caller
        if local_cr:
            local_cr.close()
    return res
660 def email_split(text):
661 """ Return a list of the email addresses found in ``text`` """
664 return [addr[1] for addr in getaddresses([text])
665 # getaddresses() returns '' when email parsing fails, and
666 # sometimes returns emails without at least '@'. The '@'
667 # is strictly required in RFC2822's `addr-spec`.