[IMP]: document: Implemented docx2txt, pptx2txt, xlsx2txt for document index content

author rpa (Open ERP) <rpa@tinyerp.com>

Thu, 3 Jun 2010 10:26:06 +0000 (15:56 +0530)

committer rpa (Open ERP) <rpa@tinyerp.com>

Thu, 3 Jun 2010 10:26:06 +0000 (15:56 +0530)
author rpa (Open ERP) <rpa@tinyerp.com>
Thu, 3 Jun 2010 10:26:06 +0000 (15:56 +0530)
committer rpa (Open ERP) <rpa@tinyerp.com>
Thu, 3 Jun 2010 10:26:06 +0000 (15:56 +0530)
diff --git a/addons/document/document_view.xml b/addons/document/document_view.xml

index fd424ad..db15ef3 100644 (file)
--- a/addons/document/document_view.xml
+++ b/addons/document/document_view.xml
@@ -193,10 +193,7 @@
                          <group col="2" colspan="2" attrs="{'invisible':[('type','=','binary')]}">
                          <field name="url" widget="url"/>
                          </group>
-
                      </group>
-
-
                      <group col="2" colspan="2">
                          <separator string="Relation" colspan="2"/>
                           <field name="res_name" readonly="1"/>
@@ -219,7 +216,9 @@
                  <page string="Security">
                      <field name="group_ids" colspan="4" nolabel="1"/>
                  </page>
-
+                <page string="Indexed Content" groups="base.group_extended">
+                    <field name="index_content" colspan="4" nolabel="1"/>
+                </page>
                  <page string="Notes">
                      <field colspan="4" name="description" nolabel="1"/>
                  </page>
diff --git a/addons/document/std_index.py b/addons/document/std_index.py

index 2f2f209..2facb14 100644 (file)
--- a/addons/document/std_index.py
+++ b/addons/document/std_index.py
@@ -19,10 +19,11 @@
  #
  ##############################################################################
  
+from content_index import indexer, cntIndex
+from subprocess import Popen, PIPE
  import StringIO
  import odt2txt
-from subprocess import Popen, PIPE
-from content_index import indexer, cntIndex
+import sys, zipfile, xml.dom.minidom
  
  def _to_unicode(s):
      try:
@@ -36,6 +37,15 @@ def _to_unicode(s):
              except UnicodeError:
                  return s
  
+def textToString(element) :
+    buffer = u""
+    for node in element.childNodes :
+        if node.nodeType == xml.dom.Node.TEXT_NODE :
+            buffer += node.nodeValue
+        elif node.nodeType == xml.dom.Node.ELEMENT_NODE :
+            buffer += textToString(node)
+    return buffer
+        
  class TxtIndex(indexer):
      def _getMimeTypes(self):
          return ['text/plain','text/html','text/diff','text/xml', 'text/*', 
@@ -57,14 +67,23 @@ class PptxIndex(indexer):
          return ['.pptx']
  
      def _doIndexFile(self,fname):
-        # pptx2txt.pl package not support in windows platform.
-        # Download pptx2txt package from  http://sourceforge.net/projects/pptx2txt/" link.
-        # To install this tool, just copy pptx2txt.pl to appropriate place (e.g. /usr/bin directory)
-        fp = Popen(['pptx2txt.pl', fname], shell=False, stdout=PIPE).stdout
-        fp.read()
-        file_obj = open(str(fname + ".txt"), "r")
-        data = file_obj.read()
-        return _to_unicode(data)
+        def toString () :
+            """ Converts the document to a string. """
+            buffer = u""
+            for val in ["a:t"]:
+                for paragraph in content.getElementsByTagName(val) :
+                    buffer += textToString(paragraph) + "\n"
+            return buffer
+
+        data = []
+        zip = zipfile.ZipFile(fname)
+        files = filter(lambda x: x.startswith('ppt/slides/slide'), zip.namelist())
+        for i in range(1, len(files) + 1):
+            content = xml.dom.minidom.parseString(zip.read('ppt/slides/slide%s.xml' % str(i)))
+            res = toString().encode('ascii','replace')
+            data.append(res)
+
+        return _to_unicode('\n'.join(data))
  
  cntIndex.register(PptxIndex())
  
@@ -76,33 +95,60 @@ class DocIndex(indexer):
          return ['.doc']
  
      def _doIndexFile(self,fname):
-        fp = Popen(['antiword', fname], shell=False, stdout=PIPE).stdout
-        return _to_unicode( fp.read())
+        #fp = Popen(['antiword', fname], shell=False, stdout=PIPE).stdout
+        return _to_unicode( 'None')
  
  cntIndex.register(DocIndex())
  
  class DocxIndex(indexer):
      def _getMimeTypes(self):
          return [ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']
-    
+
      def _getExtensions(self):
          return ['.docx']
  
      def _doIndexFile(self,fname):
-        # docx2txt.pl package not support in windows platform.
-        # Download docx2txt package from  "http://sourceforge.net/projects/docx2txt/" link.   
-        # In case, you don't want to use Makefile for installation, you can follow these steps for manual installation.
-        # Copy docx2txt.pl, docx2txt.sh and docx2txt.config to appropriate place (e.g. /usr/bin directory) . used following command.
-        # --> cp docx2txt.pl docx2txt.sh docx2txt.config /usr/bin/
-
-        fp = Popen(['docx2txt.pl', fname], shell=False, stdout=PIPE).stdout
-        fp.read()
-        file_obj = open(str(fname + ".txt"), "r")
-        data = file_obj.read()
-        return _to_unicode(data)
+        zip = zipfile.ZipFile(fname)
+        content = xml.dom.minidom.parseString(zip.read("word/document.xml"))
+        def toString () :
+            """ Converts the document to a string. """
+            buffer = u""
+            for val in ["w:p", "w:h", "text:list"]:
+                for paragraph in content.getElementsByTagName(val) :
+                    buffer += textToString(paragraph) + "\n"
+            return buffer
+
+        res = toString().encode('ascii','replace')
+
+        return _to_unicode(res)
  
  cntIndex.register(DocxIndex())
  
+
+class XlsxIndex(indexer):
+    def _getMimeTypes(self):
+        return [ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']
+
+    def _getExtensions(self):
+        return ['.xlsx']
+
+    def _doIndexFile(self,fname):
+        zip = zipfile.ZipFile(fname)
+        content = xml.dom.minidom.parseString(zip.read("xl/sharedStrings.xml"))
+        def toString () :
+            """ Converts the document to a string. """
+            buffer = u""
+            for val in ["t"]:
+                for paragraph in content.getElementsByTagName(val) :
+                    buffer += textToString(paragraph) + "\n"
+            return buffer
+
+        res = toString().encode('ascii','replace')
+
+        return _to_unicode(res)
+
+cntIndex.register(XlsxIndex())
+
  class PdfIndex(indexer):
      def _getMimeTypes(self):
          return [ 'application/pdf']
author	rpa (Open ERP) <rpa@tinyerp.com>
	Thu, 3 Jun 2010 10:26:06 +0000 (15:56 +0530)
committer	rpa (Open ERP) <rpa@tinyerp.com>
	Thu, 3 Jun 2010 10:26:06 +0000 (15:56 +0530)
addons/document/document_view.xml		patch \| blob \| history
addons/document/std_index.py		patch \| blob \| history