addons/document/std_index.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Management Solution
   5 #    Copyright (C) 2004-2010 Tiny SPRL (<http://tiny.be>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21
  22 import StringIO
  23 import odt2txt
  24
  25 from content_index import indexer, cntIndex
  26 from subprocess import Popen, PIPE
  27
  28 def _to_unicode(s):
  29     try:
  30         return s.decode('utf-8')
  31     except UnicodeError:
  32         try:
  33             return s.decode('latin')
  34         except UnicodeError:
  35             try:
  36                 return s.encode('ascii')
  37             except UnicodeError:
  38                 return s
  39
  40 class TxtIndex(indexer):
  41     def _getMimeTypes(self):
  42         return ['text/plain','text/html','text/diff','text/xml', 'text/*',
  43         'application/xml']
  44
  45     def _getExtensions(self):
  46         return ['.txt', '.py']
  47
  48     def _doIndexContent(self,content):
  49         return content
  50
  51 cntIndex.register(TxtIndex())
  52
  53 class DocIndex(indexer):
  54     def _getMimeTypes(self):
  55         return [ 'application/ms-word']
  56
  57     def _getExtensions(self):
  58         return ['.doc']
  59
  60     def _doIndexFile(self,fname):
  61         fp = Popen(['antiword',fname], shell=False, stdout=PIPE).stdout
  62         return _to_unicode( fp.read())
  63
  64 cntIndex.register(DocIndex())
  65
  66 class PdfIndex(indexer):
  67     def _getMimeTypes(self):
  68         return [ 'application/pdf']
  69
  70     def _getExtensions(self):
  71         return ['.pdf']
  72
  73     def _doIndexFile(self,fname):
  74         fp = Popen(['pdftotext', '-enc', 'UTF-8', '-nopgbrk', fname, '-'], shell=False, stdout=PIPE).stdout
  75         return _to_unicode( fp.read())
  76
  77 cntIndex.register(PdfIndex())
  78
  79 class ImageNoIndex(indexer):
  80     def _getMimeTypes(self):
  81         return [ 'image/*']
  82
  83     def _getExtensions(self):
  84         #better return no extension, and let 'file' do its magic
  85         return []
  86         #return ['.png','.jpg','.gif','.jpeg','.bmp','.tiff']
  87
  88     def _doIndexContent(self,content):
  89         return 'image'
  90
  91
  92 cntIndex.register(ImageNoIndex())
  93
  94 #class Doc(indexer):
  95     #def _getDefMime(self,ext):
  96
  97 #def content_index(content, filename=None, content_type=None):
  98     #fname,ext = os.path.splitext(filename)
  99     #result = ''
 100     #elif ext in ('.xls','.ods','.odt','.odp'):
 101         #s = StringIO.StringIO(content)
 102         #o = odt2txt.OpenDocumentTextFile(s)
 103         #result = _to_unicode(o.toString())
 104         #s.close()
 105     #elif ext in ('.txt','.py','.patch','.html','.csv','.xml'):
 106         #result = content
 107     #else:
 108         #result = content
 109     #return result
 110
 111 #eof