addons/document/content_index.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 #    OpenERP, Open Source Management Solution
   5 #    Copyright (C) 2004-2009 Tiny SPRL (<http://tiny.be>).
   6 #
   7 #    This program is free software: you can redistribute it and/or modify
   8 #    it under the terms of the GNU Affero General Public License as
   9 #    published by the Free Software Foundation, either version 3 of the
  10 #    License, or (at your option) any later version.
  11 #
  12 #    This program is distributed in the hope that it will be useful,
  13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 #    GNU Affero General Public License for more details.
  16 #
  17 #    You should have received a copy of the GNU Affero General Public License
  18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 #
  20 ##############################################################################
  21 import os
  22 import tempfile
  23
  24 # A quick hack: if netsvc is not there, emulate it. Thus, work offline, too
  25 try:
  26     import netsvc
  27     def log(lvl,msg):
  28         netsvc.Logger().notifyChannel("index",lvl,msg)
  29 except:
  30     class netsvc:
  31         LOG_NOTSET = 'notset'
  32         LOG_DEBUG_RPC = 'debug_rpc'
  33         LOG_DEBUG = 'debug'
  34         LOG_DEBUG2 = 'debug2'
  35         LOG_INFO = 'info'
  36         LOG_WARNING = 'warn'
  37         LOG_ERROR = 'error'
  38         LOG_CRITICAL = 'critical'
  39
  40     def log(lvl,msg):
  41         print msg
  42
  43
  44 class NhException(Exception):
  45     pass
  46
  47 from subprocess import Popen, PIPE
  48
  49 class indexer():
  50     """ An indexer knows how to parse the content of some file.
  51
  52         Typically, one indexer should be instantiated per file
  53         type.
  54         Override this class to add more functionality. Note that
  55         you should only override the Content or the File methods
  56         that give an optimal result. """
  57
  58     def _getMimeTypes(self):
  59         """ Return supported mimetypes """
  60         return []
  61
  62     def _getExtensions(self):
  63         return []
  64
  65     def _getDefMime(self,ext):
  66         """ Return a mimetype for this document type, ideally the
  67             closest to the extension ext. """
  68         mts = self._getMimeTypes();
  69         if len (mts):
  70             return mts[0]
  71         return None
  72
  73     def indexContent(self,content,filename=None, realfile = None):
  74         """ Use either content or the real file, to index.
  75             Some parsers will work better with the actual
  76             content, others parse a file easier. Try the
  77             optimal.
  78         """
  79         res = ''
  80         try:
  81             if content != None:
  82                 return self._doIndexContent(content)
  83         except NhException:
  84             pass
  85
  86         if realfile != None:
  87             try:
  88                 return self._doIndexFile(realfile)
  89             except NhException:
  90                 pass
  91
  92             fp = open(realfile,'rb')
  93             content2 = fp.read()
  94             fp.close()
  95
  96             # The not-handled exception may be raised here
  97             return self._doIndexContent(content2)
  98
  99
 100         # last try, with a tmp file
 101         if content:
 102             try:
 103                 fname,ext = filename and os.path.splitext(filename) or ('','')
 104                 fd, rfname = tempfile.mkstemp(suffix=ext)
 105                 os.write(fd, content)
 106                 os.close(fd)
 107                 res = self._doIndexFile(rfname)
 108                 os.unlink(rfname)
 109                 return res
 110             except NhException:
 111                 pass
 112
 113         raise NhException('No appropriate method to index file')
 114
 115     def _doIndexContent(self,content):
 116         raise NhException("Content not handled here")
 117
 118     def _doIndexFile(self,fpath):
 119         raise NhException("Content not handled here")
 120
 121
 122
 123 def mime_match(mime, mdict):
 124     if mdict.has_key(mime):
 125         return (mime, mdict[mime])
 126     if '/' in mime:
 127         mpat = mime.split('/')[0]+'/*'
 128         if mdict.has_key(mpat):
 129             return (mime, mdict[mpat])
 130
 131     return (None, None)
 132
 133 class contentIndex() :
 134     def __init__(self):
 135         self.mimes = {}
 136         self.exts = {}
 137
 138     def register(self, obj):
 139         f = False
 140         for mime in obj._getMimeTypes():
 141             self.mimes[mime] = obj
 142             f = True
 143
 144         for ext in obj._getExtensions():
 145             self.exts[ext] = obj
 146             f = True
 147
 148         if f:
 149             log(netsvc.LOG_DEBUG, "Register content indexer: %r" % obj)
 150         if not f:
 151             raise Exception("Your indexer should at least suport a mimetype or extension")
 152
 153     def doIndex(self,content, filename=None, content_type=None, realfname = None, debug=False):
 154         fobj = None
 155         fname = None
 156         mime = None
 157         if content_type and self.mimes.has_key(content_type):
 158             mime = content_type
 159             fobj = self.mimes[content_type]
 160         elif filename:
 161             bname,ext = os.path.splitext(filename)
 162             if self.exts.has_key(ext):
 163                 fobj = self.exts[ext]
 164                 mime = fobj._getDefMime(ext)
 165
 166         if content_type and not fobj:
 167             mime,fobj = mime_match(content_type, self.mimes)
 168
 169         if not fobj:
 170             try:
 171                 if realfname :
 172                     fname = realfname
 173                 else:
 174                     bname,ext = os.path.splitext(filename)
 175                     fd, fname = tempfile.mkstemp(suffix=ext)
 176                     os.write(fd, content)
 177                     os.close(fd)
 178
 179                 fp = Popen(['file','-b','--mime-type',fname], shell=False, stdout=PIPE).stdout
 180                 result = fp.read()
 181                 fp.close()
 182                 mime2 = result.strip()
 183                 log(netsvc.LOG_DEBUG,"File gave us: %s" % mime2)
 184                 # Note that the temporary file still exists now.
 185                 mime,fobj = mime_match(mime2, self.mimes)
 186                 if not mime:
 187                     mime = mime2
 188             except Exception, e:
 189                 log(netsvc.LOG_WARNING,"Cannot determine mime type: %s" % str(e))
 190
 191         try:
 192             if fobj:
 193                 res = (mime, fobj.indexContent(content,filename,fname or realfname) )
 194             else:
 195                 log(netsvc.LOG_DEBUG,"Have no object, return (%s, None)" % mime)
 196                 res = (mime, None )
 197         except Exception, e:
 198             log(netsvc.LOG_WARNING,"Could not index file, %s" % e)
 199             res = None
 200
 201         # If we created a tmp file, unlink it now
 202         if not realfname and fname:
 203             try:
 204                 os.unlink(fname)
 205             except Exception, e:
 206                 log(netsvc.LOG_WARNING,"Could not unlink %s, %s" %(fname, e))
 207
 208         return res
 209
# Module-level contentIndex instance, created when this module is imported
# (presumably the shared registry that indexers get registered with --
# verify against the callers).
cntIndex = contentIndex()
 211
 212 # vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4: