#
##############################################################################
+from content_index import indexer, cntIndex
+from subprocess import Popen, PIPE
import StringIO
import odt2txt
-from subprocess import Popen, PIPE
-from content_index import indexer, cntIndex
+import sys, zipfile, xml.dom.minidom
def _to_unicode(s):
try:
except UnicodeError:
return s
+def textToString(element) :
+ buffer = u""
+ for node in element.childNodes :
+ if node.nodeType == xml.dom.Node.TEXT_NODE :
+ buffer += node.nodeValue
+ elif node.nodeType == xml.dom.Node.ELEMENT_NODE :
+ buffer += textToString(node)
+ return buffer
+
class TxtIndex(indexer):
def _getMimeTypes(self):
return ['text/plain','text/html','text/diff','text/xml', 'text/*',
return ['.pptx']
def _doIndexFile(self,fname):
- # pptx2txt.pl package not support in windows platform.
- # Download pptx2txt package from http://sourceforge.net/projects/pptx2txt/" link.
- # To install this tool, just copy pptx2txt.pl to appropriate place (e.g. /usr/bin directory)
- fp = Popen(['pptx2txt.pl', fname], shell=False, stdout=PIPE).stdout
- fp.read()
- file_obj = open(str(fname + ".txt"), "r")
- data = file_obj.read()
- return _to_unicode(data)
+ def toString () :
+ """ Converts the document to a string. """
+ buffer = u""
+ for val in ["a:t"]:
+ for paragraph in content.getElementsByTagName(val) :
+ buffer += textToString(paragraph) + "\n"
+ return buffer
+
+ data = []
+ zip = zipfile.ZipFile(fname)
+ files = filter(lambda x: x.startswith('ppt/slides/slide'), zip.namelist())
+ for i in range(1, len(files) + 1):
+ content = xml.dom.minidom.parseString(zip.read('ppt/slides/slide%s.xml' % str(i)))
+ res = toString().encode('ascii','replace')
+ data.append(res)
+
+ return _to_unicode('\n'.join(data))
cntIndex.register(PptxIndex())
return ['.doc']
def _doIndexFile(self,fname):
- fp = Popen(['antiword', fname], shell=False, stdout=PIPE).stdout
- return _to_unicode( fp.read())
+ #fp = Popen(['antiword', fname], shell=False, stdout=PIPE).stdout
+ return _to_unicode( 'None')
cntIndex.register(DocIndex())
class DocxIndex(indexer):
def _getMimeTypes(self):
return [ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']
-
+
def _getExtensions(self):
return ['.docx']
def _doIndexFile(self,fname):
- # docx2txt.pl package not support in windows platform.
- # Download docx2txt package from "http://sourceforge.net/projects/docx2txt/" link.
- # In case, you don't want to use Makefile for installation, you can follow these steps for manual installation.
- # Copy docx2txt.pl, docx2txt.sh and docx2txt.config to appropriate place (e.g. /usr/bin directory) . used following command.
- # --> cp docx2txt.pl docx2txt.sh docx2txt.config /usr/bin/
-
- fp = Popen(['docx2txt.pl', fname], shell=False, stdout=PIPE).stdout
- fp.read()
- file_obj = open(str(fname + ".txt"), "r")
- data = file_obj.read()
- return _to_unicode(data)
+ zip = zipfile.ZipFile(fname)
+ content = xml.dom.minidom.parseString(zip.read("word/document.xml"))
+ def toString () :
+ """ Converts the document to a string. """
+ buffer = u""
+ for val in ["w:p", "w:h", "text:list"]:
+ for paragraph in content.getElementsByTagName(val) :
+ buffer += textToString(paragraph) + "\n"
+ return buffer
+
+ res = toString().encode('ascii','replace')
+
+ return _to_unicode(res)
cntIndex.register(DocxIndex())
+
+class XlsxIndex(indexer):
+ def _getMimeTypes(self):
+ return [ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']
+
+ def _getExtensions(self):
+ return ['.xlsx']
+
+ def _doIndexFile(self,fname):
+ zip = zipfile.ZipFile(fname)
+ content = xml.dom.minidom.parseString(zip.read("xl/sharedStrings.xml"))
+ def toString () :
+ """ Converts the document to a string. """
+ buffer = u""
+ for val in ["t"]:
+ for paragraph in content.getElementsByTagName(val) :
+ buffer += textToString(paragraph) + "\n"
+ return buffer
+
+ res = toString().encode('ascii','replace')
+
+ return _to_unicode(res)
+
+cntIndex.register(XlsxIndex())
+
class PdfIndex(indexer):
def _getMimeTypes(self):
return [ 'application/pdf']