[IMP] use const SUPERUSER_ID insteand of int 1
[odoo/odoo.git] / addons / document / content_index.py
index 6c0f083..f24c033 100644 (file)
@@ -1,8 +1,8 @@
 # -*- coding: utf-8 -*-
 ##############################################################################
-#    
+#
 #    OpenERP, Open Source Management Solution
-#    Copyright (C) 2004-2009 Tiny SPRL (<http://tiny.be>).
+#    Copyright (C) 2004-2010 Tiny SPRL (<http://tiny.be>).
 #
 #    This program is free software: you can redistribute it and/or modify
 #    it under the terms of the GNU Affero General Public License as
 #    GNU Affero General Public License for more details.
 #
 #    You should have received a copy of the GNU Affero General Public License
-#    along with this program.  If not, see <http://www.gnu.org/licenses/>.     
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 ##############################################################################
-
-import time
+import logging
 import os
-import StringIO
-import odt2txt
 import tempfile
+from subprocess import Popen, PIPE
+_logger = logging.getLogger(__name__)
+class NhException(Exception):
+    pass
 
-#
-# This should be the indexer
-#
-def _to_unicode(s):
-    try:
-        return s.decode('utf-8')
-    except UnicodeError:
+
+class indexer(object):
+    """ An indexer knows how to parse the content of some file.
+
+        Typically, one indexer should be instantiated per file
+        type.
+        Override this class to add more functionality. Note that
+        you should only override the Content or the File methods
+        that give an optimal result. """
+
+    def _getMimeTypes(self):
+        """ Return supported mimetypes """
+        return []
+
+    def _getExtensions(self):
+        return []
+
+    def _getDefMime(self, ext):
+        """ Return a mimetype for this document type, ideally the
+            closest to the extension ext. """
+        mts = self._getMimeTypes();
+        if len (mts):
+            return mts[0]
+        return None
+
+    def indexContent(self, content, filename=None, realfile = None):
+        """ Use either content or the real file, to index.
+            Some parsers will work better with the actual
+            content, others parse a file easier. Try the
+            optimal.
+        """
+        res = ''
         try:
-            return s.decode('latin')
-        except UnicodeError:
+            if content != None:
+                return self._doIndexContent(content)
+        except NhException:
+            pass
+
+        if realfile != None:
+            try:
+                return self._doIndexFile(realfile)
+            except NhException:
+                pass
+
+            fp = open(realfile,'rb')
+            try:
+                content2 = fp.read()
+            finally:
+                fp.close()
+
+            # The not-handled exception may be raised here
+            return self._doIndexContent(content2)
+
+
+        # last try, with a tmp file
+        if content:
             try:
-                return s.encode('ascii')
-            except UnicodeError:
-                return s
-
-
-def content_index(content, filename=None, content_type=None):
-    fname,ext = os.path.splitext(filename)
-    result = ''
-    if ext in ('.doc'): #or content_type ?
-        (stdin,stdout) = os.popen2('antiword -', 'b')
-        stdin.write(content)
-        stdin.close()
-        result = _to_unicode(stdout.read())
-    elif ext == '.pdf':
-        file_descriptor, file_name = tempfile.mkstemp(suffix=ext)
-        os.write(file_descriptor, content)
-        os.close(file_descriptor)
-        fp = os.popen('pdftotext -enc UTF-8 -nopgbrk '+file_name+' -', 'r')
-        result = fp.read()
-        fp.close()
-    elif ext in ('.xls','.ods','.odt','.odp'):
-        s = StringIO.StringIO(content)
-        o = odt2txt.OpenDocumentTextFile(s)
-        result = _to_unicode(o.toString())
-        s.close()
-    elif ext in ('.txt','.py','.patch','.html','.csv','.xml'):
-        result = content
-    #else:
-    #    result = content
-    return result
+                fname,ext = filename and os.path.splitext(filename) or ('','')
+                fd, rfname = tempfile.mkstemp(suffix=ext)
+                os.write(fd, content)
+                os.close(fd)
+                res = self._doIndexFile(rfname)
+                os.unlink(rfname)
+                return res
+            except NhException:
+                pass
+
+        raise NhException('No appropriate method to index file')
+
+    def _doIndexContent(self,content):
+        raise NhException("Content not handled here")
+
+    def _doIndexFile(self,fpath):
+        raise NhException("Content not handled here")
+
+    def __repr__(self):
+        return "<indexer %s.%s>" %(self.__module__, self.__class__.__name__)
+
+
+def mime_match(mime, mdict):
+    if mdict.has_key(mime):
+        return (mime, mdict[mime])
+    if '/' in mime:
+        mpat = mime.split('/')[0]+'/*'
+        if mdict.has_key(mpat):
+            return (mime, mdict[mpat])
+
+    return (None, None)
+
+class contentIndex(object):
+    
+    def __init__(self):
+        self.mimes = {}
+        self.exts = {}
+
+    def register(self, obj):
+        f = False
+        for mime in obj._getMimeTypes():
+            self.mimes[mime] = obj
+            f = True
+
+        for ext in obj._getExtensions():
+            self.exts[ext] = obj
+            f = True
+
+        if f:
+            _logger.debug('Register content indexer: %r', obj)
+        if not f:
+            raise Exception("Your indexer should at least suport a mimetype or extension")
+
+    def doIndex(self, content, filename=None, content_type=None, realfname = None, debug=False):
+        fobj = None
+        fname = None
+        mime = None
+        if content_type and self.mimes.has_key(content_type):
+            mime = content_type
+            fobj = self.mimes[content_type]
+        elif filename:
+            bname,ext = os.path.splitext(filename)
+            if self.exts.has_key(ext):
+                fobj = self.exts[ext]
+                mime = fobj._getDefMime(ext)
+
+        if content_type and not fobj:
+            mime,fobj = mime_match(content_type, self.mimes)
+
+        if not fobj:
+            try:
+                if realfname :
+                    fname = realfname
+                else:
+                    try:
+                        bname,ext = os.path.splitext(filename or 'test.tmp')
+                    except Exception:
+                        bname, ext = filename, 'tmp'
+                    fd, fname = tempfile.mkstemp(suffix=ext)
+                    os.write(fd, content)
+                    os.close(fd)
+
+                pop = Popen(['file','-b','--mime',fname], shell=False, stdout=PIPE)
+                (result, _) = pop.communicate()
+
+                mime2 = result.split(';')[0]
+                _logger.debug('File gave us: %s', mime2)
+                # Note that the temporary file still exists now.
+                mime,fobj = mime_match(mime2, self.mimes)
+                if not mime:
+                    mime = mime2
+            except Exception:
+                _logger.exception('Cannot determine mime type')
+
+        try:
+            if fobj:
+                res = (mime, fobj.indexContent(content,filename,fname or realfname) )
+            else:
+                _logger.debug("Have no object, return (%s, None)", mime)
+                res = (mime, None )
+        except Exception:
+            _logger.exception("Could not index file %s (%s)",
+                                    filename, fname or realfname)
+            res = None
+
+        # If we created a tmp file, unlink it now
+        if not realfname and fname:
+            try:
+                os.unlink(fname)
+            except Exception:
+                _logger.exception("Could not unlink %s", fname)
+        return res
+
+cntIndex = contentIndex()
 
 # vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4: