tags_to_remove = ['html', 'body', 'font']
# allow new semantic HTML5 tags
-allowed_tags = clean.defs.tags | frozenset('article section header footer hgroup nav aside figure main'.split())
+allowed_tags = clean.defs.tags | frozenset('article section header footer hgroup nav aside figure main'.split() + [etree.Comment])
safe_attrs = clean.defs.safe_attrs | frozenset(
['style',
'data-oe-model', 'data-oe-id', 'data-oe-field', 'data-oe-type', 'data-oe-expression', 'data-oe-translate', 'data-oe-nodeid',
'forms': True, # remove form tags
'remove_unknown_tags': False,
'allow_tags': allowed_tags,
+ 'comments': False,
+ 'processing_instructions' : False
}
if etree.LXML_VERSION >= (2, 3, 1):
# kill_tags attribute has been added in version 2.3.1
overlength_section_count = 0
cur_char_nbr = 0
for node in root.iter():
+ # comments do not need processing
+ if node.tag == etree.Comment:
+ continue
# ignore repeated whitespace, which HTML displays as at most one space
node_text = ' '.join((node.text and node.text.strip(' \t\r\n') or '').split())
# root: try to tag the client used to write the html
+ # note: node.get(key, default) is buggy on HtmlComment nodes: the default is never returned
if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
root.set('msoffice', '1')
if 'SkyDrivePlaceholder' in node.get('class', '') or 'SkyDrivePlaceholder' in node.get('id', ''):
# sometimes returns emails without at least '@'. The '@'
# is strictly required in RFC2822's `addr-spec`.
if addr[1]
- if '@' in addr[1]]
\ No newline at end of file
+ if '@' in addr[1]]