[IMP]document : get improvement of document from xrg-addons branch who made nice...
[odoo/odoo.git] / addons / document / std_index.py
1 # -*- coding: utf-8 -*-
2 ##############################################################################
3 #    
4 #    OpenERP, Open Source Management Solution
5 #    Copyright (C) 2004-2009 Tiny SPRL (<http://tiny.be>).
6 #
7 #    This program is free software: you can redistribute it and/or modify
8 #    it under the terms of the GNU Affero General Public License as
9 #    published by the Free Software Foundation, either version 3 of the
10 #    License, or (at your option) any later version.
11 #
12 #    This program is distributed in the hope that it will be useful,
13 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
14 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 #    GNU Affero General Public License for more details.
16 #
17 #    You should have received a copy of the GNU Affero General Public License
18 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.     
19 #
20 ##############################################################################
21
import StringIO
import subprocess

import odt2txt

from content_index import indexer, cntIndex
26
27
28 def _to_unicode(s):
29     try:
30         return s.decode('utf-8')
31     except UnicodeError:
32         try:
33             return s.decode('latin')
34         except UnicodeError:
35             try:
36                 return s.encode('ascii')
37             except UnicodeError:
38                 return s
39
class TxtIndex(indexer):
    """Indexer for textual documents: the content is indexed verbatim."""

    def _getMimeTypes(self):
        """Return the list of MIME types this indexer declares."""
        text_types = ['text/plain', 'text/html', 'text/diff', 'text/xml',
                      'text/*']
        return text_types + ['application/xml']

    def _getExtensions(self):
        """Return the list of file extensions this indexer declares."""
        return ['.txt', '.py']

    def _doIndexContent(self, content):
        """Return ``content`` unchanged, since it already is plain text."""
        return content


cntIndex.register(TxtIndex())
52
class DocIndex(indexer):
    """Indexer for MS Word files, converted to text by ``antiword``."""

    def _getMimeTypes(self):
        """Return the list of MIME types this indexer declares."""
        return ['application/ms-word']

    def _getExtensions(self):
        """Return the list of file extensions this indexer declares."""
        return ['.doc']

    def _doIndexFile(self, fname):
        """Return the text that ``antiword`` extracts from the file ``fname``.

        The external ``antiword`` program is run on ``fname`` and whatever
        it writes to its standard output is converted with ``_to_unicode``.
        The exit status of the program is not checked.

        Raises OSError if the ``antiword`` executable cannot be started.
        """
        proc = subprocess.Popen(['antiword', fname], shell=False,
                                stdout=subprocess.PIPE)
        # communicate() reads stdout up to EOF, closes the pipe and waits
        # for the child, so neither a zombie process nor an open file
        # descriptor is left behind.
        output = proc.communicate()[0]
        return _to_unicode(output)


cntIndex.register(DocIndex())
63
64 cntIndex.register(DocIndex())
65
class PdfIndex(indexer):
    """Indexer for PDF files, converted to text by ``pdftotext``."""

    def _getMimeTypes(self):
        """Return the list of MIME types this indexer declares."""
        return ['application/pdf']

    def _getExtensions(self):
        """Return the list of file extensions this indexer declares."""
        return ['.pdf']

    def _doIndexFile(self, fname):
        """Return the text that ``pdftotext`` extracts from the file ``fname``.

        The external ``pdftotext`` program is asked for UTF-8 output without
        page breaks, written to its standard output ('-'); that output is
        then converted with ``_to_unicode``. The exit status of the program
        is not checked.

        Raises OSError if the ``pdftotext`` executable cannot be started.
        """
        command = ['pdftotext', '-enc', 'UTF-8', '-nopgbrk', fname, '-']
        proc = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE)
        # communicate() reads stdout up to EOF, closes the pipe and waits
        # for the child, so neither a zombie process nor an open file
        # descriptor is left behind.
        output = proc.communicate()[0]
        return _to_unicode(output)


cntIndex.register(PdfIndex())
78
class ImageNoIndex(indexer):
    """Indexer for images: no content is extracted, only a fixed keyword."""

    def _getMimeTypes(self):
        """Return the list of MIME types this indexer declares."""
        return ['image/*']

    def _getExtensions(self):
        """Return an empty list: no file extension is declared."""
        # Better to return no extension and let 'file' do its magic,
        # rather than listing:
        #   ['.png', '.jpg', '.gif', '.jpeg', '.bmp', '.tiff']
        no_extensions = []
        return no_extensions

    def _doIndexContent(self, content):
        """Return the constant string 'image', whatever ``content`` holds."""
        return 'image'


cntIndex.register(ImageNoIndex())
93
94 #class Doc(indexer):
95         #def _getDefMime(self,ext):
96
97 #def content_index(content, filename=None, content_type=None):
98     #fname,ext = os.path.splitext(filename)
99     #result = ''
100     #elif ext in ('.xls','.ods','.odt','.odp'):
101         #s = StringIO.StringIO(content)
102         #o = odt2txt.OpenDocumentTextFile(s)
103         #result = _to_unicode(o.toString())
104         #s.close()
105     #elif ext in ('.txt','.py','.patch','.html','.csv','.xml'):
106         #result = content
107     #else:
108         #result = content
109     #return result
110
111 #eof