addons/document/content_index.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Management Solution
   5 #    Copyright (C) 2004-2010 Tiny SPRL (<http://tiny.be>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21 import logging
  22 import os
  23 import tempfile
  24 from subprocess import Popen, PIPE
  25
  26 class NhException(Exception):
  27     pass
  28
  29
  30 class indexer(object):
  31     """ An indexer knows how to parse the content of some file.
  32
  33         Typically, one indexer should be instantiated per file
  34         type.
  35         Override this class to add more functionality. Note that
  36         you should only override the Content or the File methods
  37         that give an optimal result. """
  38
  39     def _getMimeTypes(self):
  40         """ Return supported mimetypes """
  41         return []
  42
  43     def _getExtensions(self):
  44         return []
  45
  46     def _getDefMime(self, ext):
  47         """ Return a mimetype for this document type, ideally the
  48             closest to the extension ext. """
  49         mts = self._getMimeTypes();
  50         if len (mts):
  51             return mts[0]
  52         return None
  53
  54     def indexContent(self, content, filename=None, realfile = None):
  55         """ Use either content or the real file, to index.
  56             Some parsers will work better with the actual
  57             content, others parse a file easier. Try the
  58             optimal.
  59         """
  60         res = ''
  61         try:
  62             if content != None:
  63                 return self._doIndexContent(content)
  64         except NhException:
  65             pass
  66
  67         if realfile != None:
  68             try:
  69                 return self._doIndexFile(realfile)
  70             except NhException:
  71                 pass
  72
  73             fp = open(realfile,'rb')
  74             try:
  75                 content2 = fp.read()
  76             finally:
  77                 fp.close()
  78
  79             # The not-handled exception may be raised here
  80             return self._doIndexContent(content2)
  81
  82
  83         # last try, with a tmp file
  84         if content:
  85             try:
  86                 fname,ext = filename and os.path.splitext(filename) or ('','')
  87                 fd, rfname = tempfile.mkstemp(suffix=ext)
  88                 os.write(fd, content)
  89                 os.close(fd)
  90                 res = self._doIndexFile(rfname)
  91                 os.unlink(rfname)
  92                 return res
  93             except NhException:
  94                 pass
  95
  96         raise NhException('No appropriate method to index file')
  97
  98     def _doIndexContent(self,content):
  99         raise NhException("Content not handled here")
 100
 101     def _doIndexFile(self,fpath):
 102         raise NhException("Content not handled here")
 103
 104     def __repr__(self):
 105         return "<indexer %s.%s>" %(self.__module__, self.__class__.__name__)
 106
 107
 108 def mime_match(mime, mdict):
 109     if mdict.has_key(mime):
 110         return (mime, mdict[mime])
 111     if '/' in mime:
 112         mpat = mime.split('/')[0]+'/*'
 113         if mdict.has_key(mpat):
 114             return (mime, mdict[mpat])
 115
 116     return (None, None)
 117
 118 class contentIndex(object):
 119     __logger = logging.getLogger('addons.document.content_index')
 120     def __init__(self):
 121         self.mimes = {}
 122         self.exts = {}
 123
 124     def register(self, obj):
 125         f = False
 126         for mime in obj._getMimeTypes():
 127             self.mimes[mime] = obj
 128             f = True
 129
 130         for ext in obj._getExtensions():
 131             self.exts[ext] = obj
 132             f = True
 133
 134         if f:
 135             self.__logger.debug('Register content indexer: %r', obj)
 136         if not f:
 137             raise Exception("Your indexer should at least suport a mimetype or extension")
 138
 139     def doIndex(self, content, filename=None, content_type=None, realfname = None, debug=False):
 140         fobj = None
 141         fname = None
 142         mime = None
 143         if content_type and self.mimes.has_key(content_type):
 144             mime = content_type
 145             fobj = self.mimes[content_type]
 146         elif filename:
 147             bname,ext = os.path.splitext(filename)
 148             if self.exts.has_key(ext):
 149                 fobj = self.exts[ext]
 150                 mime = fobj._getDefMime(ext)
 151
 152         if content_type and not fobj:
 153             mime,fobj = mime_match(content_type, self.mimes)
 154
 155         if not fobj:
 156             try:
 157                 if realfname :
 158                     fname = realfname
 159                 else:
 160                     try:
 161                         bname,ext = os.path.splitext(filename or 'test.tmp')
 162                     except Exception:
 163                         bname, ext = filename, 'tmp'
 164                     fd, fname = tempfile.mkstemp(suffix=ext)
 165                     os.write(fd, content)
 166                     os.close(fd)
 167
 168                 fp = Popen(['file','-b','--mime',fname], shell=False, stdout=PIPE).stdout
 169                 try:
 170                     result = fp.read()
 171                 finally:
 172                     fp.close()
 173                 mime2 = result.split(';')[0]
 174                 self.__logger.debug('File gave us: %s', mime2)
 175                 # Note that the temporary file still exists now.
 176                 mime,fobj = mime_match(mime2, self.mimes)
 177                 if not mime:
 178                     mime = mime2
 179             except Exception:
 180                 self.__logger.exception('Cannot determine mime type')
 181
 182         try:
 183             if fobj:
 184                 res = (mime, fobj.indexContent(content,filename,fname or realfname) )
 185             else:
 186                 self.__logger.debug("Have no object, return (%s, None)", mime)
 187                 res = (mime, None )
 188         except Exception:
 189             self.__logger.exception("Could not index file %s (%s)",
 190                                     filename, fname or realfname)
 191             res = None
 192
 193         # If we created a tmp file, unlink it now
 194         if not realfname and fname:
 195             try:
 196                 os.unlink(fname)
 197             except Exception:
 198                 self.__logger.exception("Could not unlink %s", fname)
 199
 200         return res
 201
# Module-level contentIndex instance, created at import time.
# NOTE(review): presumably the shared registry that indexer modules import
# and call register() on -- confirm against callers.
cntIndex = contentIndex()
 203
 204 # vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4: