[FIX] tools: mail: fixed shortening of html content.

author Thibault Delavallée <tde@openerp.com>

Tue, 22 Oct 2013 13:50:37 +0000 (15:50 +0200)

committer Thibault Delavallée <tde@openerp.com>

Tue, 22 Oct 2013 13:50:37 +0000 (15:50 +0200)
author Thibault Delavallée <tde@openerp.com>
Tue, 22 Oct 2013 13:50:37 +0000 (15:50 +0200)
committer Thibault Delavallée <tde@openerp.com>
Tue, 22 Oct 2013 13:50:37 +0000 (15:50 +0200)
diff --git a/openerp/tests/test_mail.py b/openerp/tests/test_mail.py

index c8bbc0c..df1b5e1 100755 (executable)
--- a/openerp/tests/test_mail.py
+++ b/openerp/tests/test_mail.py
@@ -149,6 +149,59 @@ class TestCleaner(unittest2.TestCase):
              for text in out_lst:
                  self.assertNotIn(text, new_html, 'html_email_cleaner did not remove unwanted content')
  
+    def test_05_shorten(self):
+        # TEST: shorten length
+        test_str = '''<div>
+        <span>
+        </span>
+        <p>Hello, <span>Raoul</span> 
+    <bold>You</bold> are 
+    pretty</p>
+<span>Really</span>
+</div>
+'''
+        # shorten at 'H' of Hello -> should shorten after Hello,
+        html = html_email_clean(test_str, shorten=True, max_length=1, remove=True)
+        self.assertIn('Hello,', html, 'html_email_cleaner: shorten error or too short')
+        self.assertNotIn('Raoul', html, 'html_email_cleaner: shorten error or too long')
+        self.assertIn('read more', html, 'html_email_cleaner: shorten error about read more inclusion')
+        # shorten at 'are' -> should shorten after are
+        html = html_email_clean(test_str, shorten=True, max_length=17, remove=True)
+        self.assertIn('Hello,', html, 'html_email_cleaner: shorten error or too short')
+        self.assertIn('Raoul', html, 'html_email_cleaner: shorten error or too short')
+        self.assertIn('are', html, 'html_email_cleaner: shorten error or too short')
+        self.assertNotIn('pretty', html, 'html_email_cleaner: shorten error or too long')
+        self.assertNotIn('Really', html, 'html_email_cleaner: shorten error or too long')
+        self.assertIn('read more', html, 'html_email_cleaner: shorten error about read more inclusion')
+
+        # TEST: shorten in quote
+        test_str = '''<div> Blahble         
+            bluih      blouh   
+        <blockquote>This is a quote
+        <span>And this is quite a long quote, after all.</span>
+        </blockquote>
+</div>'''
+        # shorten in the quote
+        html = html_email_clean(test_str, shorten=True, max_length=25, remove=True)
+        self.assertIn('Blahble', html, 'html_email_cleaner: shorten error or too short')
+        self.assertIn('bluih', html, 'html_email_cleaner: shorten error or too short')
+        self.assertIn('blouh', html, 'html_email_cleaner: shorten error or too short')
+        self.assertNotIn('quote', html, 'html_email_cleaner: shorten error or too long')
+        self.assertIn('read more', html, 'html_email_cleaner: shorten error about read more inclusion')
+        # shorten in second word
+        html = html_email_clean(test_str, shorten=True, max_length=9, remove=True)
+        self.assertIn('Blahble', html, 'html_email_cleaner: shorten error or too short')
+        self.assertIn('bluih', html, 'html_email_cleaner: shorten error or too short')
+        self.assertNotIn('blouh', html, 'html_email_cleaner: shorten error or too short')
+        self.assertNotIn('quote', html, 'html_email_cleaner: shorten error or too long')
+        self.assertIn('read more', html, 'html_email_cleaner: shorten error about read more inclusion')
+        # shorten waaay too large
+        html = html_email_clean(test_str, shorten=True, max_length=900, remove=True)
+        self.assertIn('Blahble', html, 'html_email_cleaner: shorten error or too short')
+        self.assertIn('bluih', html, 'html_email_cleaner: shorten error or too short')
+        self.assertIn('blouh', html, 'html_email_cleaner: shorten error or too short')
+        self.assertNotIn('quote', html, 'html_email_cleaner: shorten error or too long')
+
      def test_10_email_text(self):
          """ html_email_clean test for text-based emails """
          new_html = html_email_clean(test_mail_examples.TEXT_1, remove=True)
@@ -230,7 +283,7 @@ class TestCleaner(unittest2.TestCase):
          for ext in test_mail_examples.BUG_1_OUT:
              self.assertNotIn(ext, new_html, 'html_email_cleaner did not removed invalid content')
  
-        new_html = html_email_clean(test_mail_examples.BUG2, remove=True, shorten=True, max_length=4000)
+        new_html = html_email_clean(test_mail_examples.BUG2, remove=True, shorten=True, max_length=250)
          for ext in test_mail_examples.BUG_2_IN:
              self.assertIn(ext, new_html, 'html_email_cleaner wrongly removed valid content')
          for ext in test_mail_examples.BUG_2_OUT:
diff --git a/openerp/tools/mail.py b/openerp/tools/mail.py

index b287ded..2e84f0b 100644 (file)
--- a/openerp/tools/mail.py
+++ b/openerp/tools/mail.py
@@ -175,20 +175,38 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300):
              iteration += 1
          new_node = _insert_new_node(node, -1, new_node_tag, text[idx:] + (cur_node.tail or ''), None, {})
  
-    def _truncate_node(node, position, find_first_blank=True):
+    def _truncate_node(node, position, simplify_whitespaces=True):
+        """ Truncate a node text at a given position. This algorithm will shorten
+        at the end of the word whose ending character exceeds position.
+
+            :param bool simplify_whitespaces: whether to try to count all successive
+                                              whitespaces as one character. This
+                                              option should not be True when trying
+                                              to keep 'pre' consistency.
+        """
          if node.text is None:
              node.text = ''
-        # truncate text
-        end_position = position if len(node.text) >= position else len(node.text)
-        innertext = node.text[0:end_position]
-        outertext = node.text[end_position:]
-        if find_first_blank:
-            stop_idx = outertext.find(' ')
+
+        if simplify_whitespaces:
+            cur_char_nbr = 0
+            word = None
+            node_words = node.text.strip(' \t\r\n').split()
+            for word in node_words:
+                cur_char_nbr += len(word)
+                if cur_char_nbr >= position:
+                    break
+            stop_idx = node.text.find(word) + len(word)
              if stop_idx == -1:
-                stop_idx = len(outertext)
+                stop_idx = len(node.text)
          else:
-            stop_idx = 0
-        node.text = innertext + outertext[0:stop_idx]
+            stop_idx = position
+        stop_idx = stop_idx if len(node.text) >= stop_idx else len(node.text)
+
+        # compose new text bits
+        innertext = node.text[0:stop_idx]
+        outertext = node.text[stop_idx:]
+        node.text = innertext
+
          # create <span> ... <a href="#">read more</a></span> node
          read_more_node = _create_node('span', ' ... ', None, {'class': 'oe_mail_expand'})
          read_more_link_node = _create_node('a', 'read more', None, {'href': '#', 'class': 'oe_mail_expand'})
@@ -223,17 +241,16 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300):
          html = '<div>%s</div>' % html
          root = lxml.html.fromstring(html)
  
-    # remove all tails and replace them by a span element, because managing text and tails can be a pain in the ass
-    for node in root.getiterator():
+    quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
+    signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[\s\S]+)')
+    for node in root.iter():
+        # remove all tails and replace them by a span element, because managing text and tails can be a pain in the ass
          if node.tail:
              tail_node = _create_node('span', node.tail)
              node.tail = None
              node.addnext(tail_node)
  
-    # form node and tag text-based quotes and signature
-    quote_tags = re.compile(r'(\n(>)+[^\n\r]*)')
-    signature = re.compile(r'([-]{2,}[\s]?[\r\n]{1,2}[\s\S]+)')
-    for node in root.getiterator():
+        # form node and tag text-based quotes and signature
          _tag_matching_regex_in_text(quote_tags, node, 'span', {'text_quote': '1'})
          _tag_matching_regex_in_text(signature, node, 'span', {'text_signature': '1'})
  
@@ -245,7 +262,10 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300):
      quote_begin = False
      overlength = False
      cur_char_nbr = 0
-    for node in root.getiterator():
+    for node in root.iter():
+        # node_text = re.sub('\s{2,}', ' ', node.text and node.text.strip(' \t\r\n') or '')  # do not take into account multiple spaces that are displayed as max 1 space in html
+        node_text = ' '.join((node.text and node.text.strip(' \t\r\n') or '').split())
+
          # root: try to tag the client used to write the html
          if 'WordSection1' in node.get('class', '') or 'MsoNormal' in node.get('class', ''):
              root.set('msoffice', '1')
@@ -277,27 +297,30 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300):
          # 1/ truncate the text at the next available space
          # 2/ create a 'read more' node, next to current node
          # 3/ add the truncated text in a new node, next to 'read more' node
-        if shorten and not overlength and cur_char_nbr + len(node.text or '') > max_length:
+        if shorten and not overlength and cur_char_nbr + len(node_text) > max_length:
              node_to_truncate = node
              while node_to_truncate.get('in_quote') and node_to_truncate.getparent() is not None:
                  node_to_truncate = node_to_truncate.getparent()
              overlength = True
              node_to_truncate.set('truncate', '1')
-            node_to_truncate.set('truncate_position', str(max_length - cur_char_nbr))
-        cur_char_nbr += len(node.text or '')
+            if node_to_truncate == node:
+                node_to_truncate.set('truncate_position', str(max_length - cur_char_nbr))
+            else:
+                node_to_truncate.set('truncate_position', str(len(node.text or '')))
+        cur_char_nbr += len(node_text)
  
      # Tree modification
      # ------------------------------------------------------------
  
      for node in root.iter():
          if node.get('truncate'):
-            _truncate_node(node, int(node.get('truncate_position', '0')))
+            _truncate_node(node, int(node.get('truncate_position', '0')), node.tag != 'pre')
  
      # Post processing
      # ------------------------------------------------------------
  
      to_remove = []
-    for node in root.getiterator():
+    for node in root.iter():
          if node.get('in_quote') or node.get('in_overlength'):
              # copy the node tail into parent text
              if node.tail and not node.get('tail_remove'):
@@ -306,17 +329,20 @@ def html_email_clean(html, remove=False, shorten=False, max_length=300):
              to_remove.append(node)
          if node.get('tail_remove'):
              node.tail = ''
+        # clean node
+        for attribute_name in ['in_quote', 'tail_remove', 'in_overlength', 'msoffice', 'hotmail', 'truncate', 'truncate_position']:
+            node.attrib.pop(attribute_name, None)
      for node in to_remove:
          if remove:
              node.getparent().remove(node)
          else:
              if not 'oe_mail_expand' in node.get('class', ''):  # trick: read more link should be displayed even if it's in overlength
-                node_class = node.get('class', '') + ' ' + 'oe_mail_cleaned'
+                node_class = node.get('class', '') + ' oe_mail_cleaned'
                  node.set('class', node_class)
  
      # html: \n that were tail of elements have been encapsulated into <span> -> back to \n
      html = etree.tostring(root, pretty_print=False)
-    linebreaks = re.compile(r'<span>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
+    linebreaks = re.compile(r'<span[^>]*>([\s]*[\r\n]+[\s]*)<\/span>', re.IGNORECASE | re.DOTALL)
      html = _replace_matching_regex(linebreaks, html, '\n')
  
      return html
author	Thibault Delavallée <tde@openerp.com>
	Tue, 22 Oct 2013 13:50:37 +0000 (15:50 +0200)
committer	Thibault Delavallée <tde@openerp.com>
	Tue, 22 Oct 2013 13:50:37 +0000 (15:50 +0200)
openerp/tests/test_mail.py		patch \| blob \| history
openerp/tools/mail.py		patch \| blob \| history