addons/document/std_index.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Management Solution
   5 #    Copyright (C) 2004-2010 Tiny SPRL (<http://tiny.be>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21
  22 from content_index import indexer, cntIndex
  23 from subprocess import Popen, PIPE
  24 import StringIO
  25 import odt2txt
  26 import sys, zipfile, xml.dom.minidom
  27 import logging
  28 _logger = logging.getLogger(__name__)
  29
  30 def _to_unicode(s):
  31     try:
  32         return s.decode('utf-8')
  33     except UnicodeError:
  34         try:
  35             return s.decode('latin')
  36         except UnicodeError:
  37             try:
  38                 return s.encode('ascii')
  39             except UnicodeError:
  40                 return s
  41
  42 def textToString(element) :
  43     buffer = u""
  44     for node in element.childNodes :
  45         if node.nodeType == xml.dom.Node.TEXT_NODE :
  46             buffer += node.nodeValue
  47         elif node.nodeType == xml.dom.Node.ELEMENT_NODE :
  48             buffer += textToString(node)
  49     return buffer
  50
  51 class TxtIndex(indexer):
  52     def _getMimeTypes(self):
  53         return ['text/plain','text/html','text/diff','text/xml', 'text/*',
  54             'application/xml']
  55
  56     def _getExtensions(self):
  57         return ['.txt', '.py']
  58
  59     def _doIndexContent(self,content):
  60         return content
  61
  62 cntIndex.register(TxtIndex())
  63
  64 class PptxIndex(indexer):
  65     def _getMimeTypes(self):
  66         return [ 'application/vnd.openxmlformats-officedocument.presentationml.presentation']
  67
  68     def _getExtensions(self):
  69         return ['.pptx']
  70
  71     def _doIndexFile(self,fname):
  72         def toString () :
  73             """ Converts the document to a string. """
  74             buffer = u""
  75             for val in ["a:t"]:
  76                 for paragraph in content.getElementsByTagName(val) :
  77                     buffer += textToString(paragraph) + "\n"
  78             return buffer
  79
  80         data = []
  81         zip = zipfile.ZipFile(fname)
  82         files = filter(lambda x: x.startswith('ppt/slides/slide'), zip.namelist())
  83         for i in range(1, len(files) + 1):
  84             content = xml.dom.minidom.parseString(zip.read('ppt/slides/slide%s.xml' % str(i)))
  85             res = toString().encode('ascii','replace')
  86             data.append(res)
  87
  88         return _to_unicode('\n'.join(data))
  89
  90 cntIndex.register(PptxIndex())
  91
  92 class DocIndex(indexer):
  93     def _getMimeTypes(self):
  94         return [ 'application/ms-word']
  95
  96     def _getExtensions(self):
  97         return ['.doc']
  98
  99     def _doIndexFile(self,fname):
 100         try:
 101             pop = Popen(['antiword', fname], shell=False, stdout=PIPE)
 102             (data, _) = pop.communicate()
 103             return _to_unicode(data)
 104         except OSError:
 105
 106             _logger.warn("Failed attempt to execute antiword (MS Word reader). Antiword is necessary to index the file %s of MIME type %s. Detailed error available at DEBUG level.", fname, self._getMimeTypes()[0])
 107             _logger.debug("Trace of the failed file indexing attempt.", exc_info=True)
 108             return False
 109
 110 cntIndex.register(DocIndex())
 111
 112 class DocxIndex(indexer):
 113     def _getMimeTypes(self):
 114         return [ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']
 115
 116     def _getExtensions(self):
 117         return ['.docx']
 118
 119     def _doIndexFile(self,fname):
 120         zip = zipfile.ZipFile(fname)
 121         content = xml.dom.minidom.parseString(zip.read("word/document.xml"))
 122         def toString () :
 123             """ Converts the document to a string. """
 124             buffer = u""
 125             for val in ["w:p", "w:h", "text:list"]:
 126                 for paragraph in content.getElementsByTagName(val) :
 127                     buffer += textToString(paragraph) + "\n"
 128             return buffer
 129
 130         res = toString().encode('ascii','replace')
 131
 132         return _to_unicode(res)
 133
 134 cntIndex.register(DocxIndex())
 135
 136
 137 class XlsxIndex(indexer):
 138     def _getMimeTypes(self):
 139         return [ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']
 140
 141     def _getExtensions(self):
 142         return ['.xlsx']
 143
 144     def _doIndexFile(self,fname):
 145         zip = zipfile.ZipFile(fname)
 146         content = xml.dom.minidom.parseString(zip.read("xl/sharedStrings.xml"))
 147         def toString () :
 148             """ Converts the document to a string. """
 149             buffer = u""
 150             for val in ["t"]:
 151                 for paragraph in content.getElementsByTagName(val) :
 152                     buffer += textToString(paragraph) + "\n"
 153             return buffer
 154
 155         res = toString().encode('ascii','replace')
 156
 157         return _to_unicode(res)
 158
 159 cntIndex.register(XlsxIndex())
 160
 161 class PdfIndex(indexer):
 162     def _getMimeTypes(self):
 163         return [ 'application/pdf']
 164
 165     def _getExtensions(self):
 166         return ['.pdf']
 167
 168     def _doIndexFile(self,fname):
 169         pop = Popen(['pdftotext', '-enc', 'UTF-8', '-nopgbrk', fname, '-'], shell=False, stdout=PIPE)
 170         (data, _) = pop.communicate()
 171         return _to_unicode(data)
 172
 173 cntIndex.register(PdfIndex())
 174
 175 class ImageNoIndex(indexer):
 176     def _getMimeTypes(self):
 177         return [ 'image/*']
 178
 179     def _getExtensions(self):
 180         #better return no extension, and let 'file' do its magic
 181         return []
 182         #return ['.png','.jpg','.gif','.jpeg','.bmp','.tiff']
 183
 184     def _doIndexContent(self,content):
 185         return 'image'
 186
 187
 188 cntIndex.register(ImageNoIndex())
 189
 190 # other opendocument formats:
 191 # chart-template chart database
 192 # formula-template formula graphics-template graphics
 193 # image
 194 # presentation-template presentation spreadsheet-template spreadsheet
 195
 196 class OpenDoc(indexer):
 197     """ Index OpenDocument files.
 198
 199         Q: is it really worth it to index spreadsheets, or do we only get a
 200         meaningless list of numbers (cell contents) ?
 201         """
 202     def _getMimeTypes(self):
 203         otypes = [ 'text', 'text-web', 'text-template', 'text-master' ]
 204         return map(lambda a: 'application/vnd.oasis.opendocument.'+a, otypes)
 205
 206     def _getExtensions(self):
 207         return ['.odt', '.ott', ] # '.ods'
 208
 209     def _doIndexContent(self, content):
 210         s = StringIO.StringIO(content)
 211         o = odt2txt.OpenDocumentTextFile(s)
 212         result = _to_unicode(o.toString())
 213         s.close()
 214         return result
 215
 216 cntIndex.register(OpenDoc())
 217
 218
 219 #eof
 220
 221 # vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4: