addons/document/content_index.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Management Solution
   5 #    Copyright (C) 2004-2010 Tiny SPRL (<http://tiny.be>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21 import os
  22 import tempfile
  23
  24 # A quick hack: if netsvc is not there, emulate it. Thus, work offline, too
  25 try:
  26     import netsvc
  27     def log(lvl,msg):
  28         netsvc.Logger().notifyChannel("index",lvl,msg)
  29 except:
  30     class netsvc:
  31         LOG_NOTSET = 'notset'
  32         LOG_DEBUG_RPC = 'debug_rpc'
  33         LOG_DEBUG = 'debug'
  34         LOG_INFO = 'info'
  35         LOG_WARNING = 'warn'
  36         LOG_ERROR = 'error'
  37         LOG_CRITICAL = 'critical'
  38
  39     def log(lvl,msg):
  40         print msg
  41
  42
  43 class NhException(Exception):
  44     pass
  45
  46 from subprocess import Popen, PIPE
  47
  48 class indexer():
  49     """ An indexer knows how to parse the content of some file.
  50
  51         Typically, one indexer should be instantiated per file
  52         type.
  53         Override this class to add more functionality. Note that
  54         you should only override the Content or the File methods
  55         that give an optimal result. """
  56
  57     def _getMimeTypes(self):
  58         """ Return supported mimetypes """
  59         return []
  60
  61     def _getExtensions(self):
  62         return []
  63
  64     def _getDefMime(self,ext):
  65         """ Return a mimetype for this document type, ideally the
  66             closest to the extension ext. """
  67         mts = self._getMimeTypes();
  68         if len (mts):
  69             return mts[0]
  70         return None
  71
  72     def indexContent(self,content,filename=None, realfile = None):
  73         """ Use either content or the real file, to index.
  74             Some parsers will work better with the actual
  75             content, others parse a file easier. Try the
  76             optimal.
  77         """
  78         res = ''
  79         try:
  80             if content != None:
  81                 return self._doIndexContent(content)
  82         except NhException:
  83             pass
  84
  85         if realfile != None:
  86             try:
  87                 return self._doIndexFile(realfile)
  88             except NhException:
  89                 pass
  90
  91             fp = open(realfile,'rb')
  92             content2 = fp.read()
  93             fp.close()
  94
  95             # The not-handled exception may be raised here
  96             return self._doIndexContent(content2)
  97
  98
  99         # last try, with a tmp file
 100         if content:
 101             try:
 102                 fname,ext = filename and os.path.splitext(filename) or ('','')
 103                 fd, rfname = tempfile.mkstemp(suffix=ext)
 104                 os.write(fd, content)
 105                 os.close(fd)
 106                 res = self._doIndexFile(rfname)
 107                 os.unlink(rfname)
 108                 return res
 109             except NhException:
 110                 pass
 111
 112         raise NhException('No appropriate method to index file')
 113
 114     def _doIndexContent(self,content):
 115         raise NhException("Content not handled here")
 116
 117     def _doIndexFile(self,fpath):
 118         raise NhException("Content not handled here")
 119
 120
 121
 122 def mime_match(mime, mdict):
 123     if mdict.has_key(mime):
 124         return (mime, mdict[mime])
 125     if '/' in mime:
 126         mpat = mime.split('/')[0]+'/*'
 127         if mdict.has_key(mpat):
 128             return (mime, mdict[mpat])
 129
 130     return (None, None)
 131
 132 class contentIndex() :
 133     def __init__(self):
 134         self.mimes = {}
 135         self.exts = {}
 136
 137     def register(self, obj):
 138         f = False
 139         for mime in obj._getMimeTypes():
 140             self.mimes[mime] = obj
 141             f = True
 142
 143         for ext in obj._getExtensions():
 144             self.exts[ext] = obj
 145             f = True
 146
 147         if f:
 148             log(netsvc.LOG_DEBUG, "Register content indexer: %r" % obj)
 149         if not f:
 150             raise Exception("Your indexer should at least suport a mimetype or extension")
 151
 152     def doIndex(self,content, filename=None, content_type=None, realfname = None, debug=False):
 153         fobj = None
 154         fname = None
 155         mime = None
 156         if content_type and self.mimes.has_key(content_type):
 157             mime = content_type
 158             fobj = self.mimes[content_type]
 159         elif filename:
 160             bname,ext = os.path.splitext(filename)
 161             if self.exts.has_key(ext):
 162                 fobj = self.exts[ext]
 163                 mime = fobj._getDefMime(ext)
 164
 165         if content_type and not fobj:
 166             mime,fobj = mime_match(content_type, self.mimes)
 167
 168         if not fobj:
 169             try:
 170                 if realfname :
 171                     fname = realfname
 172                 else:
 173                     bname,ext = os.path.splitext(filename)
 174                     fd, fname = tempfile.mkstemp(suffix=ext)
 175                     os.write(fd, content)
 176                     os.close(fd)
 177
 178                 fp = Popen(['file','-b','--mime-type',fname], shell=False, stdout=PIPE).stdout
 179                 result = fp.read()
 180                 fp.close()
 181                 mime2 = result.strip()
 182                 log(netsvc.LOG_DEBUG,"File gave us: %s" % mime2)
 183                 # Note that the temporary file still exists now.
 184                 mime,fobj = mime_match(mime2, self.mimes)
 185                 if not mime:
 186                     mime = mime2
 187             except Exception, e:
 188                 log(netsvc.LOG_WARNING,"Cannot determine mime type: %s" % str(e))
 189
 190         try:
 191             if fobj:
 192                 res = (mime, fobj.indexContent(content,filename,fname or realfname) )
 193             else:
 194                 log(netsvc.LOG_DEBUG,"Have no object, return (%s, None)" % mime)
 195                 res = (mime, None )
 196         except Exception, e:
 197             log(netsvc.LOG_WARNING,"Could not index file, %s" % e)
 198             res = None
 199
 200         # If we created a tmp file, unlink it now
 201         if not realfname and fname:
 202             try:
 203                 os.unlink(fname)
 204             except Exception, e:
 205                 log(netsvc.LOG_WARNING,"Could not unlink %s, %s" %(fname, e))
 206
 207         return res
 208
# Module-level contentIndex instance, created when this module is imported.
# NOTE(review): presumably the shared registry on which indexer modules call
# register() -- confirm against the callers.
cntIndex = contentIndex()
 210
 211 # vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4: