addons/document/content_index.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Management Solution
   5 #    Copyright (C) 2004-2010 Tiny SPRL (<http://tiny.be>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21 import logging
  22 import os
  23 import tempfile
  24 from subprocess import Popen, PIPE
  25
  26 class NhException(Exception):
  27     pass
  28
  29
  30 class indexer(object):
  31     """ An indexer knows how to parse the content of some file.
  32
  33         Typically, one indexer should be instantiated per file
  34         type.
  35         Override this class to add more functionality. Note that
  36         you should only override the Content or the File methods
  37         that give an optimal result. """
  38
  39     def _getMimeTypes(self):
  40         """ Return supported mimetypes """
  41         return []
  42
  43     def _getExtensions(self):
  44         return []
  45
  46     def _getDefMime(self, ext):
  47         """ Return a mimetype for this document type, ideally the
  48             closest to the extension ext. """
  49         mts = self._getMimeTypes();
  50         if len (mts):
  51             return mts[0]
  52         return None
  53
  54     def indexContent(self, content, filename=None, realfile = None):
  55         """ Use either content or the real file, to index.
  56             Some parsers will work better with the actual
  57             content, others parse a file easier. Try the
  58             optimal.
  59         """
  60         res = ''
  61         try:
  62             if content != None:
  63                 return self._doIndexContent(content)
  64         except NhException:
  65             pass
  66
  67         if realfile != None:
  68             try:
  69                 return self._doIndexFile(realfile)
  70             except NhException:
  71                 pass
  72
  73             fp = open(realfile,'rb')
  74             content2 = fp.read()
  75             fp.close()
  76
  77             # The not-handled exception may be raised here
  78             return self._doIndexContent(content2)
  79
  80
  81         # last try, with a tmp file
  82         if content:
  83             try:
  84                 fname,ext = filename and os.path.splitext(filename) or ('','')
  85                 fd, rfname = tempfile.mkstemp(suffix=ext)
  86                 os.write(fd, content)
  87                 os.close(fd)
  88                 res = self._doIndexFile(rfname)
  89                 os.unlink(rfname)
  90                 return res
  91             except NhException:
  92                 pass
  93
  94         raise NhException('No appropriate method to index file')
  95
  96     def _doIndexContent(self,content):
  97         raise NhException("Content not handled here")
  98
  99     def _doIndexFile(self,fpath):
 100         raise NhException("Content not handled here")
 101
 102     def __repr__(self):
 103         return "<indexer %s.%s>" %(self.__module__, self.__class__.__name__)
 104
 105
 106 def mime_match(mime, mdict):
 107     if mdict.has_key(mime):
 108         return (mime, mdict[mime])
 109     if '/' in mime:
 110         mpat = mime.split('/')[0]+'/*'
 111         if mdict.has_key(mpat):
 112             return (mime, mdict[mpat])
 113
 114     return (None, None)
 115
 116 class contentIndex(object):
 117     __logger = logging.getLogger('addons.document.content_index')
 118     def __init__(self):
 119         self.mimes = {}
 120         self.exts = {}
 121
 122     def register(self, obj):
 123         f = False
 124         for mime in obj._getMimeTypes():
 125             self.mimes[mime] = obj
 126             f = True
 127
 128         for ext in obj._getExtensions():
 129             self.exts[ext] = obj
 130             f = True
 131
 132         if f:
 133             self.__logger.debug('Register content indexer: %r', obj)
 134         if not f:
 135             raise Exception("Your indexer should at least suport a mimetype or extension")
 136
 137     def doIndex(self, content, filename=None, content_type=None, realfname = None, debug=False):
 138         fobj = None
 139         fname = None
 140         mime = None
 141         if content_type and self.mimes.has_key(content_type):
 142             mime = content_type
 143             fobj = self.mimes[content_type]
 144         elif filename:
 145             bname,ext = os.path.splitext(filename)
 146             if self.exts.has_key(ext):
 147                 fobj = self.exts[ext]
 148                 mime = fobj._getDefMime(ext)
 149
 150         if content_type and not fobj:
 151             mime,fobj = mime_match(content_type, self.mimes)
 152
 153         if not fobj:
 154             try:
 155                 if realfname :
 156                     fname = realfname
 157                 else:
 158                     try:
 159                         bname,ext = os.path.splitext(filename or 'test.tmp')
 160                     except Exception:
 161                         bname, ext = filename, 'tmp'
 162                     fd, fname = tempfile.mkstemp(suffix=ext)
 163                     os.write(fd, content)
 164                     os.close(fd)
 165
 166                 fp = Popen(['file','-b','--mime',fname], shell=False, stdout=PIPE).stdout
 167                 result = fp.read()
 168                 fp.close()
 169                 mime2 = result.split(';')[0]
 170                 self.__logger.debug('File gave us: %s', mime2)
 171                 # Note that the temporary file still exists now.
 172                 mime,fobj = mime_match(mime2, self.mimes)
 173                 if not mime:
 174                     mime = mime2
 175             except Exception:
 176                 self.__logger.exception('Cannot determine mime type')
 177
 178         try:
 179             if fobj:
 180                 res = (mime, fobj.indexContent(content,filename,fname or realfname) )
 181             else:
 182                 self.__logger.debug("Have no object, return (%s, None)", mime)
 183                 res = (mime, None )
 184         except Exception:
 185             self.__logger.exception("Could not index file %s (%s)",
 186                                     filename, fname or realfname)
 187             res = None
 188
 189         # If we created a tmp file, unlink it now
 190         if not realfname and fname:
 191             try:
 192                 os.unlink(fname)
 193             except Exception:
 194                 self.__logger.exception("Could not unlink %s", fname)
 195
 196         return res
 197
# Module-level contentIndex instance, created when this module is imported.
cntIndex = contentIndex()
 199
 200 # vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4: