1 # -*- coding: utf-8 -*-
2 ##############################################################################
4 # OpenERP, Open Source Management Solution
5 # Copyright (C) 2004-2010 Tiny SPRL (<http://tiny.be>).
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as
9 # published by the Free Software Foundation, either version 3 of the
10 # License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ##############################################################################
24 from subprocess import Popen, PIPE
25 from content_index import indexer, cntIndex
29 return s.decode('utf-8')
32 return s.decode('latin')
35 return s.encode('ascii')
39 class TxtIndex(indexer):
40 def _getMimeTypes(self):
41 return ['text/plain','text/html','text/diff','text/xml', 'text/*',
44 def _getExtensions(self):
45 return ['.txt', '.py']
47 def _doIndexContent(self, content):
# Create one TxtIndex instance and pass it to cntIndex.register().
50 cntIndex.register(TxtIndex())
52 class PptxIndex(indexer):
53 def _getMimeTypes(self):
54 return [ 'application/vnd.openxmlformats-officedocument.presentationml.presentation']
56 def _getExtensions(self):
def _doIndexFile(self, fname):
    """Extract the text of a file with the external ``pptx2txt.pl`` tool.

    The converter is run on ``fname``; the text is then read from
    ``fname + ".txt"`` and passed through ``_to_unicode``.

    pptx2txt.pl is not supported on the Windows platform. It can be
    downloaded from http://sourceforge.net/projects/pptx2txt/ ; to install
    it, copy pptx2txt.pl to an appropriate place (e.g. the /usr/bin
    directory).

    :param fname: path of the file to convert
    :return: the result of ``_to_unicode`` applied to the converted text
    :raises OSError: if ``pptx2txt.pl`` cannot be executed
    :raises IOError: if ``fname + ".txt"`` cannot be opened
    """
    converter = Popen(['pptx2txt.pl', fname], shell=False, stdout=PIPE)
    # Drain stdout and wait for the converter to exit. This guarantees the
    # output file is completely written before it is opened, and avoids
    # leaving a zombie process or an open pipe behind.
    converter.communicate()

    # NOTE(review): the generated ".txt" file is not removed by this method.
    file_obj = open(str(fname + ".txt"), "r")
    try:
        data = file_obj.read()
    finally:
        # Always release the file handle, even if the read fails.
        file_obj.close()
    return _to_unicode(data)
# Create one PptxIndex instance and pass it to cntIndex.register().
69 cntIndex.register(PptxIndex())
71 class DocIndex(indexer):
72 def _getMimeTypes(self):
73 return [ 'application/ms-word']
75 def _getExtensions(self):
def _doIndexFile(self, fname):
    """Extract the text of a file with the external ``antiword`` tool.

    ``antiword`` is run on ``fname`` and everything it writes to its
    standard output is passed through ``_to_unicode``.

    :param fname: path of the file to convert
    :return: the result of ``_to_unicode`` applied to the tool's output
    :raises OSError: if ``antiword`` cannot be executed
    """
    converter = Popen(['antiword', fname], shell=False, stdout=PIPE)
    # communicate() reads stdout to EOF, closes the pipe and waits for the
    # child, so no zombie process or open file descriptor is left behind.
    output, _unused_err = converter.communicate()
    return _to_unicode(output)
# Create one DocIndex instance and pass it to cntIndex.register().
82 cntIndex.register(DocIndex())
84 class DocxIndex(indexer):
85 def _getMimeTypes(self):
86 return [ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']
88 def _getExtensions(self):
def _doIndexFile(self, fname):
    """Extract the text of a file with the external ``docx2txt.pl`` tool.

    The converter is run on ``fname``; the text is then read from
    ``fname + ".txt"`` and passed through ``_to_unicode``.

    docx2txt.pl is not supported on the Windows platform. It can be
    downloaded from http://sourceforge.net/projects/docx2txt/ . If you do
    not want to use its Makefile, install it manually by copying
    docx2txt.pl, docx2txt.sh and docx2txt.config to an appropriate place
    (e.g. the /usr/bin directory):

        cp docx2txt.pl docx2txt.sh docx2txt.config /usr/bin/

    :param fname: path of the file to convert
    :return: the result of ``_to_unicode`` applied to the converted text
    :raises OSError: if ``docx2txt.pl`` cannot be executed
    :raises IOError: if ``fname + ".txt"`` cannot be opened
    """
    converter = Popen(['docx2txt.pl', fname], shell=False, stdout=PIPE)
    # Drain stdout and wait for the converter to exit. This guarantees the
    # output file is completely written before it is opened, and avoids
    # leaving a zombie process or an open pipe behind.
    converter.communicate()

    # NOTE(review): the generated ".txt" file is not removed by this method.
    file_obj = open(str(fname + ".txt"), "r")
    try:
        data = file_obj.read()
    finally:
        # Always release the file handle, even if the read fails.
        file_obj.close()
    return _to_unicode(data)
# Create one DocxIndex instance and pass it to cntIndex.register().
104 cntIndex.register(DocxIndex())
106 class PdfIndex(indexer):
107 def _getMimeTypes(self):
108 return [ 'application/pdf']
110 def _getExtensions(self):
def _doIndexFile(self, fname):
    """Extract the text of a file with the external ``pdftotext`` tool.

    ``pdftotext`` is run with ``-enc UTF-8 -nopgbrk`` and ``-`` as the
    output argument, and everything it writes to its standard output is
    passed through ``_to_unicode``.

    :param fname: path of the file to convert
    :return: the result of ``_to_unicode`` applied to the tool's output
    :raises OSError: if ``pdftotext`` cannot be executed
    """
    command = ['pdftotext', '-enc', 'UTF-8', '-nopgbrk', fname, '-']
    converter = Popen(command, shell=False, stdout=PIPE)
    # communicate() reads stdout to EOF, closes the pipe and waits for the
    # child, so no zombie process or open file descriptor is left behind.
    output, _unused_err = converter.communicate()
    return _to_unicode(output)
# Create one PdfIndex instance and pass it to cntIndex.register().
117 cntIndex.register(PdfIndex())
119 class ImageNoIndex(indexer):
120 def _getMimeTypes(self):
123 def _getExtensions(self):
124 #better return no extension, and let 'file' do its magic
126 #return ['.png','.jpg','.gif','.jpeg','.bmp','.tiff']
128 def _doIndexContent(self,content):
# Create one ImageNoIndex instance and pass it to cntIndex.register().
132 cntIndex.register(ImageNoIndex())
134 # other opendocument formats:
135 #vnd.oasis.opendocument.chart-template
136 #vnd.oasis.opendocument.chart
137 #vnd.oasis.opendocument.database
138 #vnd.oasis.opendocument.formula-template
139 #vnd.oasis.opendocument.formula
140 #vnd.oasis.opendocument.graphics-template
141 #vnd.oasis.opendocument.graphics
142 #vnd.oasis.opendocument.image
143 #vnd.oasis.opendocument.presentation-template
144 #vnd.oasis.opendocument.presentation
145 #vnd.oasis.opendocument.spreadsheet-template
146 #vnd.oasis.opendocument.spreadsheet
148 class OpenDoc(indexer):
149 """ Index OpenDocument files.
151 Q: is it really worth it to index spreadsheets, or do we only get a
152 meaningless list of numbers (cell contents) ?
154 def _getMimeTypes(self):
155 otypes = [ 'text', 'text-web', 'text-template', 'text-master' ]
156 return map(lambda a: 'application/vnd.oasis.opendocument.'+a, otypes)
158 def _getExtensions(self):
159 return ['.odt', '.ott', ] # '.ods'
161 def _doIndexContent(self, content):
162 s = StringIO.StringIO(content)
163 o = odt2txt.OpenDocumentTextFile(s)
164 result = _to_unicode(o.toString())
# Create one OpenDoc instance and pass it to cntIndex.register().
168 cntIndex.register(OpenDoc())