addons/document/std_index.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Management Solution
   5 #    Copyright (C) 2004-2010 Tiny SPRL (<http://tiny.be>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21
  22 from content_index import indexer, cntIndex
  23 from subprocess import Popen, PIPE
  24 import StringIO
  25 import odt2txt
  26 import sys, zipfile, xml.dom.minidom
  27 import logging
  28
  29 def _to_unicode(s):
  30     try:
  31         return s.decode('utf-8')
  32     except UnicodeError:
  33         try:
  34             return s.decode('latin')
  35         except UnicodeError:
  36             try:
  37                 return s.encode('ascii')
  38             except UnicodeError:
  39                 return s
  40
  41 def textToString(element) :
  42     buffer = u""
  43     for node in element.childNodes :
  44         if node.nodeType == xml.dom.Node.TEXT_NODE :
  45             buffer += node.nodeValue
  46         elif node.nodeType == xml.dom.Node.ELEMENT_NODE :
  47             buffer += textToString(node)
  48     return buffer
  49
  50 class TxtIndex(indexer):
  51     def _getMimeTypes(self):
  52         return ['text/plain','text/html','text/diff','text/xml', 'text/*',
  53             'application/xml']
  54
  55     def _getExtensions(self):
  56         return ['.txt', '.py']
  57
  58     def _doIndexContent(self,content):
  59         return content
  60
  61 cntIndex.register(TxtIndex())
  62
  63 class PptxIndex(indexer):
  64     def _getMimeTypes(self):
  65         return [ 'application/vnd.openxmlformats-officedocument.presentationml.presentation']
  66
  67     def _getExtensions(self):
  68         return ['.pptx']
  69
  70     def _doIndexFile(self,fname):
  71         def toString () :
  72             """ Converts the document to a string. """
  73             buffer = u""
  74             for val in ["a:t"]:
  75                 for paragraph in content.getElementsByTagName(val) :
  76                     buffer += textToString(paragraph) + "\n"
  77             return buffer
  78
  79         data = []
  80         zip = zipfile.ZipFile(fname)
  81         files = filter(lambda x: x.startswith('ppt/slides/slide'), zip.namelist())
  82         for i in range(1, len(files) + 1):
  83             content = xml.dom.minidom.parseString(zip.read('ppt/slides/slide%s.xml' % str(i)))
  84             res = toString().encode('ascii','replace')
  85             data.append(res)
  86
  87         return _to_unicode('\n'.join(data))
  88
  89 cntIndex.register(PptxIndex())
  90
  91 class DocIndex(indexer):
  92     def _getMimeTypes(self):
  93         return [ 'application/ms-word']
  94
  95     def _getExtensions(self):
  96         return ['.doc']
  97
  98     def _doIndexFile(self,fname):
  99         try:
 100             pop = Popen(['antiword', fname], shell=False, stdout=PIPE)
 101             (data, _) = pop.communicate()
 102             return _to_unicode(data)
 103         except OSError:
 104             logger = logging.getLogger('document.DocIndex')
 105             logger.warn("Failed attempt to execute antiword (MS Word reader). Antiword is necessary to index the file %s of MIME type %s. Detailed error available at DEBUG level.", fname, self._getMimeTypes()[0])
 106             logger.debug("Trace of the failed file indexing attempt: ", exc_info=True)
 107             return False
 108
 109 cntIndex.register(DocIndex())
 110
 111 class DocxIndex(indexer):
 112     def _getMimeTypes(self):
 113         return [ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']
 114
 115     def _getExtensions(self):
 116         return ['.docx']
 117
 118     def _doIndexFile(self,fname):
 119         zip = zipfile.ZipFile(fname)
 120         content = xml.dom.minidom.parseString(zip.read("word/document.xml"))
 121         def toString () :
 122             """ Converts the document to a string. """
 123             buffer = u""
 124             for val in ["w:p", "w:h", "text:list"]:
 125                 for paragraph in content.getElementsByTagName(val) :
 126                     buffer += textToString(paragraph) + "\n"
 127             return buffer
 128
 129         res = toString().encode('ascii','replace')
 130
 131         return _to_unicode(res)
 132
 133 cntIndex.register(DocxIndex())
 134
 135
 136 class XlsxIndex(indexer):
 137     def _getMimeTypes(self):
 138         return [ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']
 139
 140     def _getExtensions(self):
 141         return ['.xlsx']
 142
 143     def _doIndexFile(self,fname):
 144         zip = zipfile.ZipFile(fname)
 145         content = xml.dom.minidom.parseString(zip.read("xl/sharedStrings.xml"))
 146         def toString () :
 147             """ Converts the document to a string. """
 148             buffer = u""
 149             for val in ["t"]:
 150                 for paragraph in content.getElementsByTagName(val) :
 151                     buffer += textToString(paragraph) + "\n"
 152             return buffer
 153
 154         res = toString().encode('ascii','replace')
 155
 156         return _to_unicode(res)
 157
 158 cntIndex.register(XlsxIndex())
 159
 160 class PdfIndex(indexer):
 161     def _getMimeTypes(self):
 162         return [ 'application/pdf']
 163
 164     def _getExtensions(self):
 165         return ['.pdf']
 166
 167     def _doIndexFile(self,fname):
 168         pop = Popen(['pdftotext', '-enc', 'UTF-8', '-nopgbrk', fname, '-'], shell=False, stdout=PIPE)
 169         (data, _) = pop.communicate()
 170         return _to_unicode(data)
 171
 172 cntIndex.register(PdfIndex())
 173
 174 class ImageNoIndex(indexer):
 175     def _getMimeTypes(self):
 176         return [ 'image/*']
 177
 178     def _getExtensions(self):
 179         #better return no extension, and let 'file' do its magic
 180         return []
 181         #return ['.png','.jpg','.gif','.jpeg','.bmp','.tiff']
 182
 183     def _doIndexContent(self,content):
 184         return 'image'
 185
 186
 187 cntIndex.register(ImageNoIndex())
 188
 189 # other opendocument formats:
 190 # chart-template chart database
 191 # formula-template formula graphics-template graphics
 192 # image
 193 # presentation-template presentation spreadsheet-template spreadsheet
 194
 195 class OpenDoc(indexer):
 196     """ Index OpenDocument files.
 197
 198         Q: is it really worth it to index spreadsheets, or do we only get a
 199         meaningless list of numbers (cell contents) ?
 200         """
 201     def _getMimeTypes(self):
 202         otypes = [ 'text', 'text-web', 'text-template', 'text-master' ]
 203         return map(lambda a: 'application/vnd.oasis.opendocument.'+a, otypes)
 204
 205     def _getExtensions(self):
 206         return ['.odt', '.ott', ] # '.ods'
 207
 208     def _doIndexContent(self, content):
 209         s = StringIO.StringIO(content)
 210         o = odt2txt.OpenDocumentTextFile(s)
 211         result = _to_unicode(o.toString())
 212         s.close()
 213         return result
 214
 215 cntIndex.register(OpenDoc())
 216
 217
 218 #eof
 219
 220 # vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4: