1 # -*- coding: utf-8 -*-
2 ##############################################################################
4 # OpenERP, Open Source Management Solution
5 # Copyright (C) 2004-2010 Tiny SPRL (<http://tiny.be>).
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as
9 # published by the Free Software Foundation, either version 3 of the
10 # License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ##############################################################################
22 from content_index import indexer, cntIndex
23 from subprocess import Popen, PIPE
26 import sys, zipfile, xml.dom.minidom
30 return s.decode('utf-8')
33 return s.decode('latin')
36 return s.encode('ascii')
def textToString(element):
    """Return the concatenated text found beneath a DOM element.

    Walks ``element.childNodes`` in document order: the value of every
    text node is kept, every element node is descended into recursively,
    and any other node type (comments, processing instructions, ...) is
    ignored.

    :param element: a DOM node exposing ``childNodes``.
    :return: the collected text as one string; empty when the element
        holds no text.
    """
    # Collect the pieces and join once instead of growing a string with
    # repeated ``+=``.
    pieces = []
    for node in element.childNodes:
        if node.nodeType == xml.dom.Node.TEXT_NODE:
            pieces.append(node.nodeValue)
        elif node.nodeType == xml.dom.Node.ELEMENT_NODE:
            pieces.append(textToString(node))
    return u"".join(pieces)
# Indexer declared for text-like content: see the MIME types and file
# extensions listed by the two accessors below.
49 class TxtIndex(indexer):
50 def _getMimeTypes(self):
# NOTE(review): the list literal below is not closed in the visible text;
# whether further MIME types follow cannot be confirmed from here.
51 return ['text/plain','text/html','text/diff','text/xml', 'text/*',
54 def _getExtensions(self):
55 return ['.txt', '.py']
57 def _doIndexContent(self,content):
# NOTE(review): no body is visible for _doIndexContent -- confirm its return
# value against the complete file.
# Hand one TxtIndex instance to cntIndex.register().
60 cntIndex.register(TxtIndex())
class PptxIndex(indexer):
    """Indexer for PresentationML (PowerPoint 2007+) files.

    The file is opened as a zip archive and the text of every
    ``ppt/slides/slide<N>.xml`` member is extracted.
    """

    def _getMimeTypes(self):
        return ['application/vnd.openxmlformats-officedocument.presentationml.presentation']

    def _getExtensions(self):
        # NOTE(review): the original return value is not visible in the
        # reviewed excerpt; '.pptx' is inferred from the MIME type above
        # -- confirm against the complete file.
        return ['.pptx']

    def _doIndexFile(self, fname):
        """Return the text of all slides of the presentation ``fname``.

        :param fname: path of the presentation file (a zip archive).
        :return: unicode text; every text element is followed by a
            newline and consecutive slides are separated by a newline.
        """
        # NOTE(review): the tag list is not visible in the reviewed
        # excerpt; "a:t" is the DrawingML text-run element that carries
        # slide text -- confirm against the complete file.
        text_tags = ["a:t"]
        prefix = 'ppt/slides/slide'
        suffix = '.xml'

        archive = zipfile.ZipFile(fname)
        try:
            # Use the member names actually present, ordered by slide
            # number, rather than assuming slides are numbered 1..N
            # without gaps.
            slides = []
            for member in archive.namelist():
                if member.startswith(prefix) and member.endswith(suffix):
                    number = member[len(prefix):-len(suffix)]
                    if number.isdigit():
                        slides.append((int(number), member))
            slides.sort()

            data = []
            for _number, member in slides:
                content = xml.dom.minidom.parseString(archive.read(member))
                runs = []
                for tag in text_tags:
                    for paragraph in content.getElementsByTagName(tag):
                        runs.append(textToString(paragraph) + u"\n")
                data.append(u"".join(runs))
        finally:
            # Always release the archive's file handle.
            archive.close()

        # The DOM text is already unicode; it is returned as-is rather
        # than being forced through ASCII, which replaced every
        # non-ASCII character with '?'.
        return u"\n".join(data)

cntIndex.register(PptxIndex())
class DocIndex(indexer):
    """Indexer for 'application/ms-word' files, using the external
    ``antiword`` program to extract their text.
    """

    def _getMimeTypes(self):
        return ['application/ms-word']

    def _getExtensions(self):
        # NOTE(review): the original return value is not visible in the
        # reviewed excerpt; '.doc' is inferred from the MIME type above
        # -- confirm against the complete file.
        return ['.doc']

    def _doIndexFile(self, fname):
        """Run ``antiword`` on ``fname`` and return its output as text.

        :param fname: path of the document to convert.
        :return: whatever ``_to_unicode`` makes of the program's stdout.
        """
        proc = Popen(['antiword', fname], shell=False, stdout=PIPE)
        # communicate() reads stdout to the end, closes the pipe and
        # waits for the child, so no zombie process or open descriptor
        # is left behind.
        output, _unused = proc.communicate()
        return _to_unicode(output)

cntIndex.register(DocIndex())
class DocxIndex(indexer):
    """Indexer for WordprocessingML (Word 2007+) files.

    The file is opened as a zip archive and text is extracted from its
    ``word/document.xml`` member.
    """

    def _getMimeTypes(self):
        return ['application/vnd.openxmlformats-officedocument.wordprocessingml.document']

    def _getExtensions(self):
        # NOTE(review): the original return value is not visible in the
        # reviewed excerpt; '.docx' is inferred from the MIME type above
        # -- confirm against the complete file.
        return ['.docx']

    def _doIndexFile(self, fname):
        """Return the text of the document ``fname``.

        :param fname: path of the document file (a zip archive).
        :return: unicode text, each matched element followed by a newline.
        """
        archive = zipfile.ZipFile(fname)
        try:
            content = xml.dom.minidom.parseString(archive.read("word/document.xml"))
        finally:
            # Always release the archive's file handle.
            archive.close()

        lines = []
        for tag in ["w:p", "w:h", "text:list"]:
            for paragraph in content.getElementsByTagName(tag):
                lines.append(textToString(paragraph) + u"\n")

        # The DOM text is already unicode; it is returned as-is rather
        # than being forced through ASCII, which replaced every
        # non-ASCII character with '?'.
        return u"".join(lines)

cntIndex.register(DocxIndex())
class XlsxIndex(indexer):
    """Indexer for SpreadsheetML (Excel 2007+) files.

    The file is opened as a zip archive and text is extracted from its
    ``xl/sharedStrings.xml`` member.
    """

    def _getMimeTypes(self):
        return ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']

    def _getExtensions(self):
        # NOTE(review): the original return value is not visible in the
        # reviewed excerpt; '.xlsx' is inferred from the MIME type above
        # -- confirm against the complete file.
        return ['.xlsx']

    def _doIndexFile(self, fname):
        """Return the shared-string text of the workbook ``fname``.

        :param fname: path of the workbook file (a zip archive).
        :return: unicode text, each matched element followed by a newline.
        """
        # NOTE(review): the tag list is not visible in the reviewed
        # excerpt; "t" is the SpreadsheetML text element used inside
        # sharedStrings.xml -- confirm against the complete file.
        text_tags = ["t"]

        archive = zipfile.ZipFile(fname)
        try:
            content = xml.dom.minidom.parseString(archive.read("xl/sharedStrings.xml"))
        finally:
            # Always release the archive's file handle.
            archive.close()

        lines = []
        for tag in text_tags:
            for paragraph in content.getElementsByTagName(tag):
                lines.append(textToString(paragraph) + u"\n")

        # The DOM text is already unicode; it is returned as-is rather
        # than being forced through ASCII, which replaced every
        # non-ASCII character with '?'.
        return u"".join(lines)

cntIndex.register(XlsxIndex())
class PdfIndex(indexer):
    """Indexer for 'application/pdf' files, using the external
    ``pdftotext`` program to extract their text.
    """

    def _getMimeTypes(self):
        return ['application/pdf']

    def _getExtensions(self):
        # NOTE(review): the original return value is not visible in the
        # reviewed excerpt; '.pdf' is inferred from the MIME type above
        # -- confirm against the complete file.
        return ['.pdf']

    def _doIndexFile(self, fname):
        """Run ``pdftotext`` on ``fname`` and return its output as text.

        The program is asked for UTF-8 output without page-break
        characters, written to stdout (the trailing ``-`` argument).

        :param fname: path of the PDF file to convert.
        :return: whatever ``_to_unicode`` makes of the program's stdout.
        """
        proc = Popen(['pdftotext', '-enc', 'UTF-8', '-nopgbrk', fname, '-'],
                     shell=False, stdout=PIPE)
        # communicate() reads stdout to the end, closes the pipe and
        # waits for the child, so no zombie process or open descriptor
        # is left behind.
        output, _unused = proc.communicate()
        return _to_unicode(output)

cntIndex.register(PdfIndex())
# Indexer class presumably meant for image files (going by its name and the
# commented-out extension list below).
# NOTE(review): none of the three method bodies is visible in this excerpt,
# so the MIME types, extensions and indexed content it returns cannot be
# stated from here -- confirm against the complete file.
175 class ImageNoIndex(indexer):
176 def _getMimeTypes(self):
179 def _getExtensions(self):
180 #better return no extension, and let 'file' do its magic
182 #return ['.png','.jpg','.gif','.jpeg','.bmp','.tiff']
184 def _doIndexContent(self,content):
# Hand one ImageNoIndex instance to cntIndex.register().
188 cntIndex.register(ImageNoIndex())
190 # other opendocument formats:
191 # chart-template chart database
192 # formula-template formula graphics-template graphics
194 # presentation-template presentation spreadsheet-template spreadsheet
class OpenDoc(indexer):
    """ Index OpenDocument files.

        Q: is it really worth it to index spreadsheets, or do we only get a
        meaningless list of numbers (cell contents) ?
    """

    def _getMimeTypes(self):
        otypes = ['text', 'text-web', 'text-template', 'text-master']
        # A list comprehension always yields a real list, whereas map()
        # returns a lazy iterator on newer interpreters.
        return ['application/vnd.oasis.opendocument.' + a for a in otypes]

    def _getExtensions(self):
        return ['.odt', '.ott', ]     # '.ods'

    def _doIndexContent(self, content):
        """Return the text of an OpenDocument file given its raw content.

        :param content: the file's content as an in-memory string; it is
            wrapped in a StringIO object for ``odt2txt``.
        :return: whatever ``_to_unicode`` makes of the extracted text.
        """
        s = StringIO.StringIO(content)
        o = odt2txt.OpenDocumentTextFile(s)
        result = _to_unicode(o.toString())
        # The computed text is the indexed content; hand it back to the
        # caller instead of discarding it.
        return result

cntIndex.register(OpenDoc())