1 # -*- coding: utf-8 -*-
2 ##############################################################################
4 # OpenERP, Open Source Business Applications
5 # Copyright (C) 2012-2013 OpenERP S.A. (<http://openerp.com>).
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as
9 # published by the Free Software Foundation, either version 3 of the
10 # License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ##############################################################################
22 from lxml import etree
26 import lxml.html.clean as clean
34 from openerp.loglevels import ustr
36 _logger = logging.getLogger(__name__)
39 #----------------------------------------------------------
41 #----------------------------------------------------------
# Handed to the lxml cleaner as ``kill_tags`` by html_sanitize (or merged
# into ``remove_tags`` on lxml versions that predate ``kill_tags``).
tags_to_kill = 'script head meta title link style frame iframe base object embed'.split()
# Handed to the lxml cleaner as ``remove_tags`` by html_sanitize.
tags_to_remove = 'html body font'.split()

# allow new semantic HTML5 tags
allowed_tags = clean.defs.tags.union('article section header footer hgroup nav aside figure'.split())
# keep inline styling: 'style' is added on top of lxml's default safe attributes
safe_attrs = clean.defs.safe_attrs.union(['style'])
def html_sanitize(src, silent=True):
    """ Sanitize an html snippet using ``lxml.html.clean.Cleaner``.

        Tags that look like email addresses (e.g. ``<user@example.com>``)
        are html-escaped first, then the cleaner is configured according to
        the installed lxml version and applied to the source.

        :param src: html source to sanitize; a falsy value is returned as is
        :param bool silent: if True (default), a failure of the cleaner is
                            logged and a short ``<p>`` error message is
                            returned instead; if False the exception is
                            re-raised
        :return: the sanitized html
    """
    if not src:
        return src
    src = ustr(src, errors='replace')

    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)

    kwargs = {
        'page_structure': True,
        'style': False,             # do not remove style attributes
        'forms': True,              # remove form tags
        'remove_unknown_tags': False,
        'allow_tags': allowed_tags,
    }
    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        kwargs.update({
            'kill_tags': tags_to_kill,
            'remove_tags': tags_to_remove,
        })
    else:
        kwargs['remove_tags'] = tags_to_kill + tags_to_remove

    if etree.LXML_VERSION >= (3, 1, 0):
        kwargs.update({
            'safe_attrs_only': True,
            'safe_attrs': safe_attrs,
        })
    else:
        # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
        kwargs['safe_attrs_only'] = False

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = clean.Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
    except etree.ParserError:
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>ParserError when sanitizing</p>'
    except Exception:
        # boundary handler: anything else the cleaner raises is either
        # re-raised (silent=False) or logged and replaced by a placeholder
        if not silent:
            raise
        logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>Unknown error when sanitizing</p>'
    return cleaned
104 #----------------------------------------------------------
106 #----------------------------------------------------------
108 def html_email_clean(html, remove=False, shorten=False, max_length=300, expand_options=None,
109 protect_sections=False):
110 """ html_email_clean: clean the html by doing the following steps:
112 - try to strip email quotes, by removing blockquotes or having some client-
114 - try to strip signatures
115 - shorten the html to a maximum number of characters if requested
117 Some specific use case:
119 - MsOffice: ``div.style = border-top:solid;`` delimits the beginning of
120 a quote; MsOffice is detected by finding WordSection1 or MsoNormal
121 - Hotmail: ``hr.stopSpelling`` delimits the beginning of a quote; detect
122 Hotmail by finding ``SkyDrivePlaceholder``
124 :param string html: sanitized html; tags like html or head should not
125 be present in the html string. This method therefore
126 takes as input html code coming from a sanitized source,
128 :param boolean remove: remove the html code that is unwanted; otherwise it
129 is only flagged and tagged
130 :param boolean shorten: shorten the html; all excess content will
131 be flagged for removal
132 :param int max_length: if shortening, maximum number of characters before
134 :param dict expand_options: options for the read more link when shortening
135 the content. The keys used are the following:
137 - oe_expand_container_tag: tag name of the
138 container of the whole read more link (default: span)
139 - oe_expand_container_class: class applied to the
140 link container (default: oe_mail_expand)
141 - oe_expand_container_content: content of the
142 container (default: ...)
143 - oe_expand_separator_node: optional separator, like
144 adding ... <br /><br /> <a ...>read more</a> (default: void)
145 - oe_expand_a_href: href of the read more link itself
147 - oe_expand_a_class: class applied to the <a> containing
148 the link itself (default: oe_mail_expand)
149 - oe_expand_a_content: content of the <a> (default: read more)
151 The formatted read more link is the following:
152 <cont_tag class="oe_expand_container_class">
153 oe_expand_container_content
154 if expand_options.get('oe_expand_separator_node'):
155 <oe_expand_separator_node/>
156 <a href="oe_expand_a_href" class="oe_expand_a_class">
161 def _replace_matching_regex(regex, source, replace=''):
162 """ Replace all matching expressions in source by replace """
167 for item in re.finditer(regex, source):
168 dest += source[idx:item.start()] + replace
173 def _create_node(tag, text, tail=None, attrs={}):
174 new_node = etree.Element(tag)
177 for key, val in attrs.iteritems():
178 new_node.set(key, val)
181 def _insert_new_node(node, index, new_node_tag, new_node_text, new_node_tail=None, new_node_attrs={}):
182 new_node = _create_node(new_node_tag, new_node_text, new_node_tail, new_node_attrs)
183 node.insert(index, new_node)
186 def _tag_matching_regex_in_text(regex, node, new_node_tag='span', new_node_attrs={}):
187 text = node.text or ''
188 if not re.search(regex, text):
193 idx, iteration = 0, 0
194 for item in re.finditer(regex, text):
196 cur_node.text = text[idx:item.start()]
198 _insert_new_node(node, (iteration - 1) * 2 + 1, new_node_tag, text[idx:item.start()])
199 new_node = _insert_new_node(node, iteration * 2, new_node_tag, text[item.start():item.end()], None, new_node_attrs)
204 new_node = _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})
206 def _truncate_node(node, position, simplify_whitespaces=True):
207 """ Truncate a node text at a given position. This algorithm will shorten
208 at the end of the word whose ending character exceeds position.
210 :param bool simplify_whitespaces: whether to try to count all successive
211 whitespaces as one character. This
212 option should not be True when trying
213 to keep 'pre' consistency.
215 if node.text is None:
219 if simplify_whitespaces:
222 node_words = node.text.strip(' \t\r\n').split()
223 for word in node_words:
224 cur_char_nbr += len(word)
225 if cur_char_nbr >= position:
228 truncate_idx = node.text.find(word) + len(word)
230 truncate_idx = position
231 if truncate_idx == -1 or truncate_idx > len(node.text):
232 truncate_idx = len(node.text)
234 # compose new text bits
235 innertext = node.text[0:truncate_idx]
236 outertext = node.text[truncate_idx:]
237 node.text = innertext
239 # create <span> ... <a href="#">read more</a></span> node
240 read_more_node = _create_node(
241 expand_options.get('oe_expand_container_tag', 'span'),
242 expand_options.get('oe_expand_container_content', ' ... '),
244 {'class': expand_options.get('oe_expand_container_class', 'oe_mail_expand')}
246 if expand_options.get('oe_expand_separator_node'):
247 read_more_separator_node = _create_node(
248 expand_options.get('oe_expand_separator_node'),
253 read_more_node.append(read_more_separator_node)
254 read_more_link_node = _create_node(
256 expand_options.get('oe_expand_a_content', 'read more'),
259 'href': expand_options.get('oe_expand_a_href', '#'),
260 'class': expand_options.get('oe_expand_a_class', 'oe_mail_expand'),
263 read_more_node.append(read_more_link_node)
264 # create outertext node
265 overtext_node = _create_node('span', outertext)
267 overtext_node.set('in_overlength', '1')
268 # add newly created nodes in dom
269 node.append(read_more_node)
270 node.append(overtext_node)
272 if expand_options is None:
275 if not html or not isinstance(html, basestring):
280 # ------------------------------------------------------------
281 # TDE TODO: --- MAIL ORIGINAL ---: '[\-]{4,}([^\-]*)[\-]{4,}'
283 # html: remove encoding attribute inside tags
284 doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
285 html = doctype.sub(r"", html)
287 # html: ClEditor seems to love using <div><br /><div> -> replace with <br />
288 br_div_tags = re.compile(r'(<div>\s*<br\s*\/>\s*<\/div>)', re.IGNORECASE)
289 html = _replace_matching_regex(br_div_tags, html, '<br />')
292 root = lxml.html.fromstring(html)
293 if not len(root) and root.text is None and root.tail is None:
294 html = '<div>%s</div>' % html
295 root = lxml.html.fromstring(html)
297 quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
298 signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[\s\S]+)')
299 for node in root.iter():
300 # remove all tails and replace them by a span element, because managing text and tails can be a pain in the ass
302 tail_node = _create_node('span', node.tail)
304 node.addnext(tail_node)
306 # form node and tag text-based quotes and signature
307 _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
308 _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})
311 # ------------------------------------------------------------
314 # signature_begin = False # try dynamic signature recognition
317 overlength_section_id = None
318 overlength_section_count = 0
320 for node in root.iter():
321 # do not take into account multiple spaces that are displayed as max 1 space in html
322 node_text = ' '.join((node.text and node.text.strip(' \t\r\n') or '').split())
324 # root: try to tag the client used to write the html
325 if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
326 root.set('msoffice', '1')
327 if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
328 root.set('hotmail', '1')
330 # protect sections by tagging section limits and blocks contained inside sections, using an increasing id to re-find them later
331 if node.tag == 'section':
332 overlength_section_count += 1
333 node.set('section_closure', str(overlength_section_count))
334 if node.getparent() is not None and (node.getparent().get('section_closure') or node.getparent().get('section_inner')):
335 node.set('section_inner', str(overlength_section_count))
337 # state of the parsing: flag quotes and tails to remove
339 node.set('in_quote', '1')
340 node.set('tail_remove', '1')
341 # state of the parsing: flag when being in over-length content, depending on section content if defined (only when having protect_sections)
343 if not overlength_section_id or int(node.get('section_inner', overlength_section_count + 1)) > overlength_section_count:
344 node.set('in_overlength', '1')
345 node.set('tail_remove', '1')
347 # find quote in msoffice / hotmail / blockquote / text quote and signatures
348 if root.get('msoffice') and node.tag == 'div' and 'border-top:solid' in node.get('style', ''):
350 node.set('in_quote', '1')
351 node.set('tail_remove', '1')
352 if root.get('hotmail') and node.tag == 'hr' and ('stopSpelling' in node.get('class', '') or 'stopSpelling' in node.get('id', '')):
354 node.set('in_quote', '1')
355 node.set('tail_remove', '1')
356 if node.tag == 'blockquote' or node.get('text_quote') or node.get('text_signature'):
357 node.set('in_quote', '1')
360 # if protect section:
361 # 1/ find the first parent not being inside a section
362 # 2/ add the read more link
364 # 1/ truncate the text at the next available space
365 # 2/ create a 'read more' node, next to current node
366 # 3/ add the truncated text in a new node, next to 'read more' node
367 node_text = (node.text or '').strip().strip('\n').strip()
368 if shorten and not overlength and cur_char_nbr + len(node_text) > max_length:
369 node_to_truncate = node
370 while node_to_truncate.getparent() is not None:
371 if node_to_truncate.get('in_quote'):
372 node_to_truncate = node_to_truncate.getparent()
373 elif protect_sections and (node_to_truncate.getparent().get('section_inner') or node_to_truncate.getparent().get('section_closure')):
374 node_to_truncate = node_to_truncate.getparent()
375 overlength_section_id = node_to_truncate.get('section_closure')
380 node_to_truncate.set('truncate', '1')
381 if node_to_truncate == node:
382 node_to_truncate.set('truncate_position', str(max_length - cur_char_nbr))
384 node_to_truncate.set('truncate_position', str(len(node.text or '')))
385 cur_char_nbr += len(node_text)
388 # ------------------------------------------------------------
390 for node in root.iter():
391 if node.get('truncate'):
392 _truncate_node(node, int(node.get('truncate_position', '0')), node.tag != 'pre')
395 # ------------------------------------------------------------
398 for node in root.iter():
399 if node.get('in_quote') or node.get('in_overlength'):
400 # copy the node tail into parent text
401 if node.tail and not node.get('tail_remove'):
402 parent = node.getparent()
403 parent.tail = node.tail + (parent.tail or '')
404 to_remove.append(node)
405 if node.get('tail_remove'):
408 for attribute_name in ['in_quote', 'tail_remove', 'in_overlength', 'msoffice', 'hotmail', 'truncate', 'truncate_position']:
409 node.attrib.pop(attribute_name, None)
410 for node in to_remove:
412 node.getparent().remove(node)
414 if not expand_options.get('oe_expand_a_class', 'oe_mail_expand') in node.get('class', ''): # trick: read more link should be displayed even if it's in overlength
415 node_class = node.get('class', '') + ' oe_mail_cleaned'
416 node.set('class', node_class)
418 # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
419 html = etree.tostring(root, pretty_print=False)
420 linebreaks = re.compile(r'<span[^>]*>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
421 html = _replace_matching_regex(linebreaks, html, '\n')
426 #----------------------------------------------------------
427 # HTML/Text management
428 #----------------------------------------------------------
430 def html2plaintext(html, body_id=None, encoding='utf-8'):
431 """ From an HTML text, convert the HTML to plain text.
432 If @param body_id is provided then this is the tag where the
433 body (not necessarily <body>) starts.
435 ## (c) Fry-IT, www.fry-it.com, 2007
436 ## <peter@fry-it.com>
437 ## download here: http://www.peterbe.com/plog/html2plaintext
440 tree = etree.fromstring(html, parser=etree.HTMLParser())
442 if body_id is not None:
443 source = tree.xpath('//*[@id=%s]' % (body_id,))
445 source = tree.xpath('//body')
451 for link in tree.findall('.//a'):
452 url = link.get('href')
456 link.text = '%s [%s]' % (link.text, i)
457 url_index.append(url)
459 html = ustr(etree.tostring(tree, encoding=encoding))
460 # \r char is converted into , must remove it
461 html = html.replace(' ', '')
463 html = html.replace('<strong>', '*').replace('</strong>', '*')
464 html = html.replace('<b>', '*').replace('</b>', '*')
465 html = html.replace('<h3>', '*').replace('</h3>', '*')
466 html = html.replace('<h2>', '**').replace('</h2>', '**')
467 html = html.replace('<h1>', '**').replace('</h1>', '**')
468 html = html.replace('<em>', '/').replace('</em>', '/')
469 html = html.replace('<tr>', '\n')
470 html = html.replace('</p>', '\n')
471 html = re.sub('<br\s*/?>', '\n', html)
472 html = re.sub('<.*?>', ' ', html)
473 html = html.replace(' ' * 2, ' ')
476 html = '\n'.join([x.strip() for x in html.splitlines()])
477 html = html.replace('\n' * 2, '\n')
479 for i, url in enumerate(url_index):
482 html += ustr('[%s] %s\n') % (i + 1, url)
def plaintext2html(text, container_tag=False):
    """ Convert plaintext into html. Content of the text is escaped to manage
        html entities, using cgi.escape().
        - all \\n,\\r are replaced by <br />
        - enclose content into <p>
        - 2 or more consecutive <br /> are considered as paragraph breaks

        :param text: plaintext to convert
        :param string container_tag: tag name used to wrap the whole result;
                                     by default (False) the paragraphs are
                                     returned without any container
        :return: the html version of ``text``
    """
    text = cgi.escape(ustr(text))

    # 1. replace \n and \r
    text = text.replace('\n', '<br/>')
    text = text.replace('\r', '<br/>')

    # 2-3: form paragraphs
    # every run of 2+ <br/> closes the current paragraph and opens a new one;
    # the text between two runs is kept untouched
    br_tags = re.compile(r'(([<]\s*[bB][rR]\s*\/?[>]\s*){2,})')
    paragraphs = []
    idx = 0
    for item in br_tags.finditer(text):
        paragraphs.append(text[idx:item.start()])
        idx = item.end()
    paragraphs.append(text[idx:])
    final = '<p>%s</p>' % '</p><p>'.join(paragraphs)

    # 4. container
    if container_tag:
        final = '<%s>%s</%s>' % (container_tag, final, container_tag)
    return ustr(final)
def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=False):
    """ Append extra content at the end of an HTML snippet, trying
        to locate the end of the HTML document (</body>, </html>, or
        EOF), and converting the provided content in html unless ``plaintext``
        is False.

        Content conversion can be done in two ways:
          - wrapping it into a pre (preserve=True)
          - use plaintext2html (preserve=False, using container_tag to wrap the
            whole content)

        A side-effect of this method is to coerce all HTML tags to
        lowercase in ``html``, and strip enclosing <html> or <body> tags in
        content if ``plaintext`` is False.

        :param str html: html tagsoup (doesn't have to be XHTML)
        :param str content: extra content to append
        :param bool plaintext: whether content is plaintext and must be
                               converted before being appended
        :param bool preserve: if content is plaintext, wrap it into a <pre>
                              instead of converting it into html
        :param container_tag: passed to plaintext2html when content is
                              plaintext and ``preserve`` is False
        :return: ``html`` with the content inserted before ``</body>``, else
                 before ``</html>``, else appended at the end
    """
    # work on unicode so the snippet can be merged with the unicode content
    html = ustr(html)
    if plaintext and preserve:
        # NOTE(review): content is inserted into the <pre> without html
        # escaping -- confirm callers never pass untrusted text here
        content = u'\n<pre>%s</pre>\n' % ustr(content)
    elif plaintext:
        content = '\n%s\n' % plaintext2html(content, container_tag)
    else:
        # content is already html: drop its document-level wrappers
        content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
        content = u'\n%s\n' % ustr(content)
    # Force all tags to lowercase
    html = re.sub(r'(</?)\W*(\w+)([ >])',
                  lambda m: '%s%s%s' % (m.group(1), m.group(2).lower(), m.group(3)), html)
    insert_location = html.find('</body>')
    if insert_location == -1:
        insert_location = html.find('</html>')
    if insert_location == -1:
        return '%s%s' % (html, content)
    return '%s%s%s' % (html[:insert_location], content, html[insert_location:])
554 #----------------------------------------------------------
556 #----------------------------------------------------------
# matches any email in a body of text
email_re = re.compile(r"""([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", re.VERBOSE)

# matches a string containing only one email
single_email_re = re.compile(r"""^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}$""", re.VERBOSE)

# matches a number between square brackets, e.g. "[12]"; group(1) = the digits
res_re = re.compile(r"\[([0-9]+)\]", re.UNICODE)
# matches a "Set-<name>: <value>" line (case-insensitive)
# group(1) = the name ; group(2) = the value
command_re = re.compile(r"^Set-([a-z]+) *: *(.+)$", re.I | re.UNICODE)

# Updated in 7.0 to match the model name as well
# Typical form of references is <timestamp-openerp-record_id-model_name@domain>
# group(1) = the record ID ; group(2) = the model (if any) ; group(3) = the domain
reference_re = re.compile(r"<.*-open(?:object|erp)-(\d+)(?:-([\w.]+))?.*@(.*)>", re.UNICODE)

# Typical form of bounce is bounce-128-crm.lead-34@domain
# group(1) = the mail ID; group(2) = the model (if any); group(3) = the record ID
bounce_re = re.compile(r"[\w]+-(\d+)-?([\w.]+)?-?(\d+)?", re.UNICODE)
def generate_tracking_message_id(res_id):
    """Returns a string that can be used in the Message-ID RFC822 header field

       Used to track the replies related to a given object thanks to the "In-Reply-To"
       or "References" fields that Mail User Agents will set.

       :param res_id: value embedded after ``-openerp-`` in the generated id
       :return: a string of the form ``<time.random-openerp-res_id@hostname>``
    """
    try:
        rnd = random.SystemRandom().random()
    except NotImplementedError:
        # SystemRandom is not usable here: fall back on the default generator
        rnd = random.random()
    # keep only the 15 decimals of the random float ("0." prefix dropped)
    rndstr = ("%.15f" % rnd)[2:]
    return "<%.15f.%s-openerp-%s@%s>" % (time.time(), rndstr, res_id, socket.gethostname())
590 def email_send(email_from, email_to, subject, body, email_cc=None, email_bcc=None, reply_to=False,
591 attachments=None, message_id=None, references=None, openobject_id=False, debug=False, subtype='plain', headers=None,
592 smtp_server=None, smtp_port=None, ssl=False, smtp_user=None, smtp_password=None, cr=None, uid=None):
593 """Low-level function for sending an email (deprecated).
595 :deprecate: since OpenERP 6.1, please use ir.mail_server.send_email() instead.
596 :param email_from: A string used to fill the `From` header, if falsy,
597 config['email_from'] is used instead. Also used for
598 the `Reply-To` header if `reply_to` is not provided
599 :param email_to: a sequence of addresses to send the mail to.
602 # If not cr, get cr from current thread database
605 db_name = getattr(threading.currentThread(), 'dbname', None)
607 local_cr = cr = openerp.registry(db_name).db.cursor()
609 raise Exception("No database cursor found, please pass one explicitly")
613 mail_server_pool = openerp.registry(cr.dbname)['ir.mail_server']
615 # Pack Message into MIME Object
616 email_msg = mail_server_pool.build_email(email_from, email_to, subject, body, email_cc, email_bcc, reply_to,
617 attachments, message_id, references, openobject_id, subtype, headers=headers)
619 res = mail_server_pool.send_email(cr, uid or 1, email_msg, mail_server_id=None,
620 smtp_server=smtp_server, smtp_port=smtp_port, smtp_user=smtp_user, smtp_password=smtp_password,
621 smtp_encryption=('ssl' if ssl else None), smtp_debug=debug)
623 _logger.exception("tools.email_send failed to deliver email")
def email_split(text):
    """ Return a list of the email addresses found in ``text``

        :param text: string to scan; a falsy value (None, False, '') yields
                     an empty list
        :return: list of the addresses found, in order of appearance
    """
    if not text:
        # re.findall would raise TypeError on None/False
        return []
    return re.findall(r'([^ ,<@]+@[^> ,]+)', text)