addons/document/std_index.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Management Solution
   5 #    Copyright (C) 2004-2010 Tiny SPRL (<http://tiny.be>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21
  22 from content_index import indexer, cntIndex
  23 from subprocess import Popen, PIPE
  24 import StringIO
  25 import odt2txt
  26 import sys, zipfile, xml.dom.minidom
  27
  28 def _to_unicode(s):
  29     try:
  30         return s.decode('utf-8')
  31     except UnicodeError:
  32         try:
  33             return s.decode('latin')
  34         except UnicodeError:
  35             try:
  36                 return s.encode('ascii')
  37             except UnicodeError:
  38                 return s
  39
  40 def textToString(element) :
  41     buffer = u""
  42     for node in element.childNodes :
  43         if node.nodeType == xml.dom.Node.TEXT_NODE :
  44             buffer += node.nodeValue
  45         elif node.nodeType == xml.dom.Node.ELEMENT_NODE :
  46             buffer += textToString(node)
  47     return buffer
  48
  49 class TxtIndex(indexer):
  50     def _getMimeTypes(self):
  51         return ['text/plain','text/html','text/diff','text/xml', 'text/*',
  52             'application/xml']
  53
  54     def _getExtensions(self):
  55         return ['.txt', '.py']
  56
  57     def _doIndexContent(self,content):
  58         return content
  59
  60 cntIndex.register(TxtIndex())
  61
  62 class PptxIndex(indexer):
  63     def _getMimeTypes(self):
  64         return [ 'application/vnd.openxmlformats-officedocument.presentationml.presentation']
  65
  66     def _getExtensions(self):
  67         return ['.pptx']
  68
  69     def _doIndexFile(self,fname):
  70         def toString () :
  71             """ Converts the document to a string. """
  72             buffer = u""
  73             for val in ["a:t"]:
  74                 for paragraph in content.getElementsByTagName(val) :
  75                     buffer += textToString(paragraph) + "\n"
  76             return buffer
  77
  78         data = []
  79         zip = zipfile.ZipFile(fname)
  80         files = filter(lambda x: x.startswith('ppt/slides/slide'), zip.namelist())
  81         for i in range(1, len(files) + 1):
  82             content = xml.dom.minidom.parseString(zip.read('ppt/slides/slide%s.xml' % str(i)))
  83             res = toString().encode('ascii','replace')
  84             data.append(res)
  85
  86         return _to_unicode('\n'.join(data))
  87
  88 cntIndex.register(PptxIndex())
  89
  90 class DocIndex(indexer):
  91     def _getMimeTypes(self):
  92         return [ 'application/ms-word']
  93
  94     def _getExtensions(self):
  95         return ['.doc']
  96
  97     def _doIndexFile(self,fname):
  98         pop = Popen(['antiword', fname], shell=False, stdout=PIPE)
  99         (data, _) = pop.communicate()
 100         return _to_unicode(data)
 101
 102 cntIndex.register(DocIndex())
 103
 104 class DocxIndex(indexer):
 105     def _getMimeTypes(self):
 106         return [ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']
 107
 108     def _getExtensions(self):
 109         return ['.docx']
 110
 111     def _doIndexFile(self,fname):
 112         zip = zipfile.ZipFile(fname)
 113         content = xml.dom.minidom.parseString(zip.read("word/document.xml"))
 114         def toString () :
 115             """ Converts the document to a string. """
 116             buffer = u""
 117             for val in ["w:p", "w:h", "text:list"]:
 118                 for paragraph in content.getElementsByTagName(val) :
 119                     buffer += textToString(paragraph) + "\n"
 120             return buffer
 121
 122         res = toString().encode('ascii','replace')
 123
 124         return _to_unicode(res)
 125
 126 cntIndex.register(DocxIndex())
 127
 128
 129 class XlsxIndex(indexer):
 130     def _getMimeTypes(self):
 131         return [ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']
 132
 133     def _getExtensions(self):
 134         return ['.xlsx']
 135
 136     def _doIndexFile(self,fname):
 137         zip = zipfile.ZipFile(fname)
 138         content = xml.dom.minidom.parseString(zip.read("xl/sharedStrings.xml"))
 139         def toString () :
 140             """ Converts the document to a string. """
 141             buffer = u""
 142             for val in ["t"]:
 143                 for paragraph in content.getElementsByTagName(val) :
 144                     buffer += textToString(paragraph) + "\n"
 145             return buffer
 146
 147         res = toString().encode('ascii','replace')
 148
 149         return _to_unicode(res)
 150
 151 cntIndex.register(XlsxIndex())
 152
 153 class PdfIndex(indexer):
 154     def _getMimeTypes(self):
 155         return [ 'application/pdf']
 156
 157     def _getExtensions(self):
 158         return ['.pdf']
 159
 160     def _doIndexFile(self,fname):
 161         pop = Popen(['pdftotext', '-enc', 'UTF-8', '-nopgbrk', fname, '-'], shell=False, stdout=PIPE)
 162         (data, _) = pop.communicate()
 163         return _to_unicode(data)
 164
 165 cntIndex.register(PdfIndex())
 166
 167 class ImageNoIndex(indexer):
 168     def _getMimeTypes(self):
 169         return [ 'image/*']
 170
 171     def _getExtensions(self):
 172         #better return no extension, and let 'file' do its magic
 173         return []
 174         #return ['.png','.jpg','.gif','.jpeg','.bmp','.tiff']
 175
 176     def _doIndexContent(self,content):
 177         return 'image'
 178
 179
 180 cntIndex.register(ImageNoIndex())
 181
# Other OpenDocument formats, which the indexer below does not handle:
#   chart-template, chart, database,
#   formula-template, formula, graphics-template, graphics,
#   image,
#   presentation-template, presentation, spreadsheet-template, spreadsheet
 187
 188 class OpenDoc(indexer):
 189     """ Index OpenDocument files.
 190
 191         Q: is it really worth it to index spreadsheets, or do we only get a
 192         meaningless list of numbers (cell contents) ?
 193         """
 194     def _getMimeTypes(self):
 195         otypes = [ 'text', 'text-web', 'text-template', 'text-master' ]
 196         return map(lambda a: 'application/vnd.oasis.opendocument.'+a, otypes)
 197
 198     def _getExtensions(self):
 199         return ['.odt', '.ott', ] # '.ods'
 200
 201     def _doIndexContent(self, content):
 202         s = StringIO.StringIO(content)
 203         o = odt2txt.OpenDocumentTextFile(s)
 204         result = _to_unicode(o.toString())
 205         s.close()
 206         return result
 207
 208 cntIndex.register(OpenDoc())
 209
 210
 211 #eof