addons/document/content_index.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Management Solution
   5 #    Copyright (C) 2004-2010 Tiny SPRL (<http://tiny.be>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21 import logging
  22 import os
  23 import tempfile
  24
  25 class NhException(Exception):
  26     pass
  27
  28 from subprocess import Popen, PIPE
  29
  30 class indexer(object):
  31     """ An indexer knows how to parse the content of some file.
  32
  33         Typically, one indexer should be instantiated per file
  34         type.
  35         Override this class to add more functionality. Note that
  36         you should only override the Content or the File methods
  37         that give an optimal result. """
  38
  39     def _getMimeTypes(self):
  40         """ Return supported mimetypes """
  41         return []
  42
  43     def _getExtensions(self):
  44         return []
  45
  46     def _getDefMime(self, ext):
  47         """ Return a mimetype for this document type, ideally the
  48             closest to the extension ext. """
  49         mts = self._getMimeTypes();
  50         if len (mts):
  51             return mts[0]
  52         return None
  53
  54     def indexContent(self, content, filename=None, realfile = None):
  55         """ Use either content or the real file, to index.
  56             Some parsers will work better with the actual
  57             content, others parse a file easier. Try the
  58             optimal.
  59         """
  60         res = ''
  61         try:
  62             if content != None:
  63                 return self._doIndexContent(content)
  64         except NhException:
  65             pass
  66
  67         if realfile != None:
  68             try:
  69                 return self._doIndexFile(realfile)
  70             except NhException:
  71                 pass
  72
  73             fp = open(realfile,'rb')
  74             content2 = fp.read()
  75             fp.close()
  76
  77             # The not-handled exception may be raised here
  78             return self._doIndexContent(content2)
  79
  80
  81         # last try, with a tmp file
  82         if content:
  83             try:
  84                 fname,ext = filename and os.path.splitext(filename) or ('','')
  85                 fd, rfname = tempfile.mkstemp(suffix=ext)
  86                 os.write(fd, content)
  87                 os.close(fd)
  88                 res = self._doIndexFile(rfname)
  89                 os.unlink(rfname)
  90                 return res
  91             except NhException:
  92                 pass
  93
  94         raise NhException('No appropriate method to index file')
  95
  96     def _doIndexContent(self,content):
  97         raise NhException("Content not handled here")
  98
  99     def _doIndexFile(self,fpath):
 100         raise NhException("Content not handled here")
 101
 102
 103
 104 def mime_match(mime, mdict):
 105     if mdict.has_key(mime):
 106         return (mime, mdict[mime])
 107     if '/' in mime:
 108         mpat = mime.split('/')[0]+'/*'
 109         if mdict.has_key(mpat):
 110             return (mime, mdict[mpat])
 111
 112     return (None, None)
 113
 114 class contentIndex(object):
 115     __logger = logging.getLogger('addons.document.content_index')
 116     def __init__(self):
 117         self.mimes = {}
 118         self.exts = {}
 119
 120     def register(self, obj):
 121         f = False
 122         for mime in obj._getMimeTypes():
 123             self.mimes[mime] = obj
 124             f = True
 125
 126         for ext in obj._getExtensions():
 127             self.exts[ext] = obj
 128             f = True
 129
 130         if f:
 131             self.__logger.debug('Register content indexer: %s', obj)
 132         if not f:
 133             raise Exception("Your indexer should at least suport a mimetype or extension")
 134
 135     def doIndex(self, content, filename=None, content_type=None, realfname = None, debug=False):
 136         fobj = None
 137         fname = None
 138         mime = None
 139         if content_type and self.mimes.has_key(content_type):
 140             mime = content_type
 141             fobj = self.mimes[content_type]
 142         elif filename:
 143             bname,ext = os.path.splitext(filename)
 144             if self.exts.has_key(ext):
 145                 fobj = self.exts[ext]
 146                 mime = fobj._getDefMime(ext)
 147
 148         if content_type and not fobj:
 149             mime,fobj = mime_match(content_type, self.mimes)
 150
 151         if not fobj:
 152             try:
 153                 if realfname :
 154                     fname = realfname
 155                 else:
 156                     try:
 157                         bname,ext = os.path.splitext(filename or 'test.tmp')
 158                     except Exception:
 159                         bname, ext = filename, 'tmp'
 160                     fd, fname = tempfile.mkstemp(suffix=ext)
 161                     os.write(fd, content)
 162                     os.close(fd)
 163
 164                 fp = Popen(['file','-b','--mime',fname], shell=False, stdout=PIPE).stdout
 165                 result = fp.read()
 166                 fp.close()
 167                 mime2 = result.split(';')[0]
 168                 self.__logger.debug('File gave us: %s', mime2)
 169                 # Note that the temporary file still exists now.
 170                 mime,fobj = mime_match(mime2, self.mimes)
 171                 if not mime:
 172                     mime = mime2
 173             except Exception:
 174                 self.__logger.exception('Cannot determine mime type')
 175
 176         try:
 177             if fobj:
 178                 res = (mime, fobj.indexContent(content,filename,fname or realfname) )
 179             else:
 180                 self.__logger.debug("Have no object, return (%s, None)", mime)
 181                 res = (mime, None )
 182         except Exception:
 183             self.__logger.exception("Could not index file %s (%s)",
 184                                     filename, fname or realfname)
 185             res = None
 186
 187         # If we created a tmp file, unlink it now
 188         if not realfname and fname:
 189             try:
 190                 os.unlink(fname)
 191             except Exception:
 192                 self.__logger.exception("Could not unlink %s", fname)
 193
 194         return res
 195
# Module-level contentIndex instance, created when this module is imported.
cntIndex = contentIndex()
 197
 198 # vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4: