1 # -*- coding: utf-8 -*-
2 ##############################################################################
4 # OpenERP, Open Source Management Solution
5 # Copyright (C) 2004-2010 Tiny SPRL (<http://tiny.be>).
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as
9 # published by the Free Software Foundation, either version 3 of the
10 # License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ##############################################################################
24 from subprocess import Popen, PIPE
26 class NhException(Exception):
30 class indexer(object):
31 """ An indexer knows how to parse the content of some file.
33 Typically, one indexer should be instantiated per file
35 Override this class to add more functionality. Note that
36 you should only override the Content or the File methods
37 that give an optimal result. """
39 def _getMimeTypes(self):
40 """ Return supported mimetypes """
43 def _getExtensions(self):
46 def _getDefMime(self, ext):
47 """ Return a mimetype for this document type, ideally the
48 closest to the extension ext. """
49 mts = self._getMimeTypes();
54 def indexContent(self, content, filename=None, realfile = None):
55 """ Use either content or the real file, to index.
56 Some parsers will work better with the actual
57 content, others parse a file easier. Try the
63 return self._doIndexContent(content)
69 return self._doIndexFile(realfile)
73 fp = open(realfile,'rb')
77 # The not-handled exception may be raised here
78 return self._doIndexContent(content2)
81 # last try, with a tmp file
84 fname,ext = filename and os.path.splitext(filename) or ('','')
85 fd, rfname = tempfile.mkstemp(suffix=ext)
88 res = self._doIndexFile(rfname)
94 raise NhException('No appropriate method to index file')
def _doIndexContent(self, content):
    """ Index from the in-memory content.

        This implementation handles nothing: it unconditionally raises
        NhException, whatever `content` is.
    """
    message = "Content not handled here"
    raise NhException(message)
def _doIndexFile(self, fpath):
    """ Index from the file found at path `fpath`.

        This implementation handles nothing: it unconditionally raises
        NhException, whatever `fpath` is.
    """
    message = "Content not handled here"
    raise NhException(message)
103 return "<indexer %s.%s>" %(self.__module__, self.__class__.__name__)
def mime_match(mime, mdict):
    """ Find the entry of mdict that applies to a mimetype.

        An exact key for `mime` wins. Otherwise the wildcard key of
        its major type is tried, e.g. 'image/*' for 'image/png'.

        @param mime   mimetype string, like 'text/plain'
        @param mdict  dict keyed by mimetype or by 'major/*' pattern
        @return tuple (mime, value from mdict), or (None, None) when
                neither the exact nor the wildcard key is present
    """
    # `in` instead of dict.has_key(): same result, and it also works
    # on interpreters where has_key() no longer exists.
    if mime in mdict:
        return (mime, mdict[mime])

    # None or '' has no major type to build a wildcard from.
    if not mime:
        return (None, None)

    mpat = mime.split('/')[0] + '/*'
    if mpat in mdict:
        return (mime, mdict[mpat])

    # Callers unpack the result into two names, so always hand back a pair.
    return (None, None)
116 class contentIndex(object):
117 __logger = logging.getLogger('addons.document.content_index')
122 def register(self, obj):
124 for mime in obj._getMimeTypes():
125 self.mimes[mime] = obj
128 for ext in obj._getExtensions():
133 self.__logger.debug('Register content indexer: %r', obj)
135 raise Exception("Your indexer should at least suport a mimetype or extension")
137 def doIndex(self, content, filename=None, content_type=None, realfname = None, debug=False):
141 if content_type and self.mimes.has_key(content_type):
143 fobj = self.mimes[content_type]
145 bname,ext = os.path.splitext(filename)
146 if self.exts.has_key(ext):
147 fobj = self.exts[ext]
148 mime = fobj._getDefMime(ext)
150 if content_type and not fobj:
151 mime,fobj = mime_match(content_type, self.mimes)
159 bname,ext = os.path.splitext(filename or 'test.tmp')
161 bname, ext = filename, 'tmp'
162 fd, fname = tempfile.mkstemp(suffix=ext)
163 os.write(fd, content)
166 fp = Popen(['file','-b','--mime',fname], shell=False, stdout=PIPE).stdout
169 mime2 = result.split(';')[0]
170 self.__logger.debug('File gave us: %s', mime2)
171 # Note that the temporary file still exists now.
172 mime,fobj = mime_match(mime2, self.mimes)
176 self.__logger.exception('Cannot determine mime type')
180 res = (mime, fobj.indexContent(content,filename,fname or realfname) )
182 self.__logger.debug("Have no object, return (%s, None)", mime)
185 self.__logger.exception("Could not index file %s (%s)",
186 filename, fname or realfname)
189 # If we created a tmp file, unlink it now
190 if not realfname and fname:
194 self.__logger.exception("Could not unlink %s", fname)
# Module-level contentIndex instance, created when this module is imported.
cntIndex = contentIndex()
200 # vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4: