addons/document/std_index.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Management Solution
   5 #    Copyright (C) 2004-2010 Tiny SPRL (<http://tiny.be>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21
  22 import StringIO
  23 import odt2txt
  24 from subprocess import Popen, PIPE
  25 from content_index import indexer, cntIndex
  26
  27 def _to_unicode(s):
  28     try:
  29         return s.decode('utf-8')
  30     except UnicodeError:
  31         try:
  32             return s.decode('latin')
  33         except UnicodeError:
  34             try:
  35                 return s.encode('ascii')
  36             except UnicodeError:
  37                 return s
  38
  39 class TxtIndex(indexer):
  40     def _getMimeTypes(self):
  41         return ['text/plain','text/html','text/diff','text/xml', 'text/*',
  42             'application/xml']
  43
  44     def _getExtensions(self):
  45         return ['.txt', '.py']
  46
  47     def _doIndexContent(self, content):
  48         return content
  49
  50 cntIndex.register(TxtIndex())
  51
  52 class PptxIndex(indexer):
  53     def _getMimeTypes(self):
  54         return [ 'application/vnd.openxmlformats-officedocument.presentationml.presentation']
  55
  56     def _getExtensions(self):
  57         return ['.pptx']
  58
  59     def _doIndexFile(self, fname):
  60         # pptx2txt.pl package not support in windows platform.
  61         # Download pptx2txt package from  http://sourceforge.net/projects/pptx2txt/" link.
  62         # To install this tool, just copy pptx2txt.pl to appropriate place (e.g. /usr/bin directory)
  63         fp = Popen(['pptx2txt.pl', fname], shell=False, stdout=PIPE).stdout
  64         fp.read()
  65         file_obj = open(str(fname + ".txt"), "r")
  66         data = file_obj.read()
  67         return _to_unicode(data)
  68
  69 cntIndex.register(PptxIndex())
  70
  71 class DocIndex(indexer):
  72     def _getMimeTypes(self):
  73         return [ 'application/ms-word']
  74
  75     def _getExtensions(self):
  76         return ['.doc']
  77
  78     def _doIndexFile(self,fname):
  79         fp = Popen(['antiword', fname], shell=False, stdout=PIPE).stdout
  80         return _to_unicode( fp.read())
  81
  82 cntIndex.register(DocIndex())
  83
  84 class DocxIndex(indexer):
  85     def _getMimeTypes(self):
  86         return [ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']
  87
  88     def _getExtensions(self):
  89         return ['.docx']
  90
  91     def _doIndexFile(self, fname):
  92         # docx2txt.pl package not support in windows platform.
  93         # Download docx2txt package from  "http://sourceforge.net/projects/docx2txt/" link.
  94         # In case, you don't want to use Makefile for installation, you can follow these steps for manual installation.
  95         # Copy docx2txt.pl, docx2txt.sh and docx2txt.config to appropriate place (e.g. /usr/bin directory) . used following command.
  96         # --> cp docx2txt.pl docx2txt.sh docx2txt.config /usr/bin/
  97
  98         fp = Popen(['docx2txt.pl', fname], shell=False, stdout=PIPE).stdout
  99         fp.read()
 100         file_obj = open(str(fname + ".txt"), "r")
 101         data = file_obj.read()
 102         return _to_unicode(data)
 103
 104 cntIndex.register(DocxIndex())
 105
 106 class PdfIndex(indexer):
 107     def _getMimeTypes(self):
 108         return [ 'application/pdf']
 109
 110     def _getExtensions(self):
 111         return ['.pdf']
 112
 113     def _doIndexFile(self,fname):
 114         fp = Popen(['pdftotext', '-enc', 'UTF-8', '-nopgbrk', fname, '-'], shell=False, stdout=PIPE).stdout
 115         return _to_unicode( fp.read())
 116
 117 cntIndex.register(PdfIndex())
 118
 119 class ImageNoIndex(indexer):
 120     def _getMimeTypes(self):
 121         return [ 'image/*']
 122
 123     def _getExtensions(self):
 124         #better return no extension, and let 'file' do its magic
 125         return []
 126         #return ['.png','.jpg','.gif','.jpeg','.bmp','.tiff']
 127
 128     def _doIndexContent(self,content):
 129         return 'image'
 130
 131
 132 cntIndex.register(ImageNoIndex())
 133
 134 # other opendocument formats:
 135 #vnd.oasis.opendocument.chart-template
 136 #vnd.oasis.opendocument.chart
 137 #vnd.oasis.opendocument.database
 138 #vnd.oasis.opendocument.formula-template
 139 #vnd.oasis.opendocument.formula
 140 #vnd.oasis.opendocument.graphics-template
 141 #vnd.oasis.opendocument.graphics
 142 #vnd.oasis.opendocument.image
 143 #vnd.oasis.opendocument.presentation-template
 144 #vnd.oasis.opendocument.presentation
 145 #vnd.oasis.opendocument.spreadsheet-template
 146 #vnd.oasis.opendocument.spreadsheet
 147
 148 class OpenDoc(indexer):
 149     """ Index OpenDocument files.
 150
 151         Q: is it really worth it to index spreadsheets, or do we only get a
 152         meaningless list of numbers (cell contents) ?
 153         """
 154     def _getMimeTypes(self):
 155         otypes = [ 'text', 'text-web', 'text-template', 'text-master' ]
 156         return map(lambda a: 'application/vnd.oasis.opendocument.'+a, otypes)
 157
 158     def _getExtensions(self):
 159         return ['.odt', '.ott', ] # '.ods'
 160
 161     def _doIndexContent(self, content):
 162         s = StringIO.StringIO(content)
 163         o = odt2txt.OpenDocumentTextFile(s)
 164         result = _to_unicode(o.toString())
 165         s.close()
 166         return result
 167
 168 cntIndex.register(OpenDoc())
 169
 170
 171 #eof