[IMP] mail: parsing emails with several html parts
author Martin Trigaux <mat@openerp.com>
Thu, 9 Oct 2014 07:14:22 +0000 (09:14 +0200)
committer Martin Trigaux <mat@openerp.com>
Thu, 9 Oct 2014 11:53:23 +0000 (13:53 +0200)
If an email contains several text/html parts inside a multipart email, the previous code was only keeping the last content part.
The Content-Type multipart/mixed allows several independent parts (RFC 1341, section 7.2.2), so an email with two html parts is technically valid.
With this patch, the two parts are concatenated. (opw 614755)

Modify append_content_to_html regex to make sure the regex keeps the content of the html instead of removing it.
e.g.: "123 <html> 456 </html> 789" used to be stripped to "123  789" while we expect "123 456 789"

addons/mail/mail_thread.py
addons/mail/tests/test_mail_gateway.py
openerp/tools/mail.py

index 63cb3a8..ee03953 100644 (file)
@@ -801,9 +801,13 @@ class mail_thread(osv.AbstractModel):
                 body = tools.append_content_to_html(u'', body, preserve=True)
         else:
             alternative = False
+            mixed = False
+            html = u''
             for part in message.walk():
                 if part.get_content_type() == 'multipart/alternative':
                     alternative = True
+                if part.get_content_type() == 'multipart/mixed':
+                    mixed = True
                 if part.get_content_maintype() == 'multipart':
                     continue  # skip container
                 # part.get_filename returns decoded value if able to decode, coded otherwise.
@@ -830,8 +834,11 @@ class mail_thread(osv.AbstractModel):
                                                                          encoding, errors='replace'), preserve=True)
                 # 3) text/html -> raw
                 elif part.get_content_type() == 'text/html':
+                    # multipart/alternative has one text part and one html part: keep only the html one
+                    # multipart/mixed allows several html parts: append each html content to the body
+                    append_content = not alternative or (html and mixed)
                     html = tools.ustr(part.get_payload(decode=True), encoding, errors='replace')
-                    if alternative:
+                    if not append_content:
                         body = html
                     else:
                         body = tools.append_content_to_html(body, html, plaintext=False)
index bc2f207..c952c6a 100644 (file)
@@ -141,6 +141,53 @@ X-Attachment-Id: f_hkpb27k00
 dGVzdAo=
 --089e01536c4ed4d17204e49b8e96--"""
 
+MAIL_MULTIPART_MIXED_TWO = """X-Original-To: raoul@grosbedon.fr
+Delivered-To: raoul@grosbedon.fr
+Received: by mail1.grosbedon.com (Postfix, from userid 10002)
+    id E8166BFACA; Fri, 23 Aug 2013 13:18:01 +0200 (CEST)
+From: "Bruce Wayne" <bruce@wayneenterprises.com>
+Content-Type: multipart/alternative;
+ boundary="Apple-Mail=_9331E12B-8BD2-4EC7-B53E-01F3FBEC9227"
+Message-Id: <6BB1FAB2-2104-438E-9447-07AE2C8C4A92@sexample.com>
+Mime-Version: 1.0 (Mac OS X Mail 7.3 \(1878.6\))
+
+--Apple-Mail=_9331E12B-8BD2-4EC7-B53E-01F3FBEC9227
+Content-Transfer-Encoding: 7bit
+Content-Type: text/plain;
+    charset=us-ascii
+
+First and second part
+
+--Apple-Mail=_9331E12B-8BD2-4EC7-B53E-01F3FBEC9227
+Content-Type: multipart/mixed;
+ boundary="Apple-Mail=_CA6C687E-6AA0-411E-B0FE-F0ABB4CFED1F"
+
+--Apple-Mail=_CA6C687E-6AA0-411E-B0FE-F0ABB4CFED1F
+Content-Transfer-Encoding: 7bit
+Content-Type: text/html;
+    charset=us-ascii
+
+<html><head></head><body>First part</body></html>
+
+--Apple-Mail=_CA6C687E-6AA0-411E-B0FE-F0ABB4CFED1F
+Content-Disposition: inline;
+    filename=thetruth.pdf
+Content-Type: application/pdf;
+    name="thetruth.pdf"
+Content-Transfer-Encoding: base64
+
+SSBhbSB0aGUgQmF0TWFuCg==
+
+--Apple-Mail=_CA6C687E-6AA0-411E-B0FE-F0ABB4CFED1F
+Content-Transfer-Encoding: 7bit
+Content-Type: text/html;
+    charset=us-ascii
+
+<html><head></head><body>Second part</body></html>
+--Apple-Mail=_CA6C687E-6AA0-411E-B0FE-F0ABB4CFED1F--
+
+--Apple-Mail=_9331E12B-8BD2-4EC7-B53E-01F3FBEC9227--
+"""
 
 class TestMailgateway(TestMailBase):
 
@@ -202,6 +249,14 @@ class TestMailgateway(TestMailBase):
         self.assertIn('<div dir="ltr">Should create a multipart/mixed: from gmail, <b>bold</b>, with attachment.<br clear="all"><div><br></div>', res.get('body', ''),
                       'message_parse: html version should be in body after parsing multipart/mixed')
 
+        res = self.mail_thread.message_parse(cr, uid, MAIL_MULTIPART_MIXED_TWO)
+        self.assertNotIn('First and second part', res.get('body', ''),
+                         'message_parse: text version should not be in body after parsing multipart/mixed')
+        self.assertIn('First part', res.get('body', ''),
+                      'message_parse: first part of the html version should be in body after parsing multipart/mixed')
+        self.assertIn('Second part', res.get('body', ''),
+                      'message_parse: second part of the html version should be in body after parsing multipart/mixed')
+
     def test_10_message_process(self):
         """ Testing incoming emails processing. """
         cr, uid, user_raoul = self.cr, self.uid, self.user_raoul
index 49ec584..b0212db 100644 (file)
@@ -282,7 +282,7 @@ def append_content_to_html(html, content, plaintext=True, preserve=False, contai
     elif plaintext:
         content = '\n%s\n' % plaintext2html(content, container_tag)
     else:
-        content = re.sub(r'(?i)(</?html.*>|</?body.*>|<!\W*DOCTYPE.*>)', '', content)
+        content = re.sub(r'(?i)(</?(?:html|body|head|!\s*DOCTYPE)[^>]*>)', '', content)
         content = u'\n%s\n' % ustr(content)
     # Force all tags to lowercase
     html = re.sub(r'(</?)\W*(\w+)([ >])',