1 # -*- coding: utf-8 -*-
2 ##############################################################################
4 # OpenERP, Open Source Management Solution
5 # Copyright (C) 2004-2010 Tiny SPRL (<http://tiny.be>).
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as
9 # published by the Free Software Foundation, either version 3 of the
10 # License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 ##############################################################################
# NOTE(review): `logging`, `StringIO` and `odt2txt` are referenced further
# down (the module logger and the OpenDoc indexer) but their import lines are
# absent from this listing; they are restored here -- confirm against upstream.
import logging
import StringIO
import sys, zipfile, xml.dom.minidom
from subprocess import Popen, PIPE

import odt2txt
from content_index import indexer, cntIndex

_logger = logging.getLogger(__name__)
def _to_unicode(s):
    """Best-effort conversion of extracted document content to text.

    The conversions are tried in order and the first one that succeeds wins:
    decode as UTF-8, decode as Latin-1, then encode as ASCII.

    :param s: byte string (or already-decoded text) produced by an indexer.
    :return: the converted value, or ``s`` itself when nothing applies.
    """
    # Text needs no decoding.  Returning it directly also keeps the helper
    # usable on interpreters where text objects have no ``decode`` method.
    if isinstance(s, type(u"")):
        return s
    try:
        return s.decode('utf-8')
    except UnicodeError:
        pass
    try:
        return s.decode('latin')
    except UnicodeError:
        pass
    try:
        return s.encode('ascii')
    except UnicodeError:
        # NOTE(review): the final fallback is not visible in this listing;
        # handing the input back unchanged is assumed -- confirm.
        return s
def textToString(element):
    """Return the text held by a DOM element, descendants included.

    ``element.childNodes`` is walked in document order: the value of each
    text node is taken as-is and each child element is processed
    recursively.  Any other kind of node (comments, processing
    instructions, ...) contributes nothing.

    :param element: a DOM node exposing ``childNodes``.
    :return: the concatenated text; an empty string when there is none.
    """
    # The listing neither initialises nor returns the accumulator; both are
    # required for the recursive call below to be usable at all.
    pieces = []
    for node in element.childNodes:
        if node.nodeType == xml.dom.Node.TEXT_NODE:
            pieces.append(node.nodeValue)
        elif node.nodeType == xml.dom.Node.ELEMENT_NODE:
            pieces.append(textToString(node))
    return u"".join(pieces)
class TxtIndex(indexer):
    """Indexer for plain-text style content."""

    def _getMimeTypes(self):
        """Return the MIME types handled by this indexer."""
        # NOTE(review): in this listing the list literal is left open, so it
        # continued on a line that is not visible.  Only the visible entries
        # are kept; restore any further ones from upstream.
        return ['text/plain', 'text/html', 'text/diff', 'text/xml', 'text/*']

    def _getExtensions(self):
        """Return the file extensions handled by this indexer."""
        return ['.txt', '.py']

    def _doIndexContent(self, content):
        """Return the text to index for ``content``."""
        # NOTE(review): the body is missing from this listing; textual
        # content is assumed to be indexable as-is -- confirm.
        return content

cntIndex.register(TxtIndex())
class PptxIndex(indexer):
    """Indexer for Office Open XML presentations."""

    def _getMimeTypes(self):
        """Return the MIME types handled by this indexer."""
        return ['application/vnd.openxmlformats-officedocument.presentationml.presentation']

    def _getExtensions(self):
        """Return the file extensions handled by this indexer."""
        # NOTE(review): the body is missing from this listing; '.pptx' is
        # inferred from the MIME type above -- confirm.
        return ['.pptx']

    def _doIndexFile(self, fname):
        """Extract the text of every slide of the presentation ``fname``.

        :param fname: path of the .pptx archive.
        :return: the slide texts, one slide after the other, separated by
                 newlines; non-ASCII characters are replaced by ``?``.
        """
        def toString(content):
            """ Converts the document to a string. """
            # NOTE(review): the tag list is missing from this listing.
            # 'a:t' is the DrawingML text-run element that carries slide
            # text -- confirm against upstream.
            buffer = u""
            for val in ["a:t"]:
                for paragraph in content.getElementsByTagName(val):
                    buffer += textToString(paragraph) + "\n"
            return buffer

        data = []
        archive = zipfile.ZipFile(fname)
        try:
            # Read the slide parts actually present instead of assuming they
            # are numbered 1..N without gaps, which raised KeyError on decks
            # with non-contiguous slide numbers.
            slides = [name for name in archive.namelist()
                      if name.startswith('ppt/slides/slide')
                      and name.endswith('.xml')]
            # Shorter names first, then alphabetical: slide2.xml is read
            # before slide10.xml.
            slides.sort(key=lambda name: (len(name), name))
            for name in slides:
                content = xml.dom.minidom.parseString(archive.read(name))
                data.append(toString(content).encode('ascii', 'replace'))
        finally:
            # The archive handle was never released in the listing.
            archive.close()

        return _to_unicode('\n'.join(data))

cntIndex.register(PptxIndex())
class DocIndex(indexer):
    """Indexer for MS Word (.doc) files, relying on the ``antiword`` tool."""

    def _getMimeTypes(self):
        """Return the MIME types handled by this indexer."""
        return ['application/ms-word']

    def _getExtensions(self):
        """Return the file extensions handled by this indexer."""
        # NOTE(review): the body is missing from this listing; '.doc' is
        # inferred from the MIME type above -- confirm.
        return ['.doc']

    def _doIndexFile(self, fname):
        """Run ``antiword`` on ``fname`` and return its output as text.

        :param fname: path of the document to convert.
        :return: the converted text; a false value when antiword could not
                 be executed.
        """
        try:
            pop = Popen(['antiword', fname], shell=False, stdout=PIPE)
            (data, _) = pop.communicate()
            return _to_unicode(data)
        except OSError:
            # Popen raises OSError when the executable cannot be started.
            # The handler line itself is not visible in this listing; the
            # exception type is taken from the log messages' intent.
            _logger.warning("Failed attempt to execute antiword (MS Word reader). Antiword is necessary to index the file %s of MIME type %s. Detailed error available at DEBUG level.", fname, self._getMimeTypes()[0])
            _logger.debug("Trace of the failed file indexing attempt.", exc_info=True)
            # NOTE(review): the failure return value is not visible in this
            # listing; a false value is assumed -- confirm.
            return False

cntIndex.register(DocIndex())
class DocxIndex(indexer):
    """Indexer for Office Open XML word-processing documents."""

    def _getMimeTypes(self):
        """Return the MIME types handled by this indexer."""
        return ['application/vnd.openxmlformats-officedocument.wordprocessingml.document']

    def _getExtensions(self):
        """Return the file extensions handled by this indexer."""
        # NOTE(review): the body is missing from this listing; '.docx' is
        # inferred from the MIME type above -- confirm.
        return ['.docx']

    def _doIndexFile(self, fname):
        """Extract the text of the main document part of ``fname``.

        :param fname: path of the .docx archive.
        :return: the text of the matched elements, one per line; non-ASCII
                 characters are replaced by ``?``.
        """
        archive = zipfile.ZipFile(fname)
        try:
            content = xml.dom.minidom.parseString(archive.read("word/document.xml"))
        finally:
            # The archive handle was never released in the listing.
            archive.close()

        def toString():
            """ Converts the document to a string. """
            buffer = u""
            for val in ["w:p", "w:h", "text:list"]:
                for paragraph in content.getElementsByTagName(val):
                    buffer += textToString(paragraph) + "\n"
            return buffer

        res = toString().encode('ascii', 'replace')

        return _to_unicode(res)

cntIndex.register(DocxIndex())
class XlsxIndex(indexer):
    """Indexer for Office Open XML spreadsheets."""

    def _getMimeTypes(self):
        """Return the MIME types handled by this indexer."""
        return ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet']

    def _getExtensions(self):
        """Return the file extensions handled by this indexer."""
        # NOTE(review): the body is missing from this listing; '.xlsx' is
        # inferred from the MIME type above -- confirm.
        return ['.xlsx']

    def _doIndexFile(self, fname):
        """Extract the shared strings of the workbook ``fname``.

        Only ``xl/sharedStrings.xml`` is read.

        :param fname: path of the .xlsx archive.
        :return: the text of the matched elements, one per line; non-ASCII
                 characters are replaced by ``?``.
        """
        archive = zipfile.ZipFile(fname)
        try:
            content = xml.dom.minidom.parseString(archive.read("xl/sharedStrings.xml"))
        finally:
            # The archive handle was never released in the listing.
            archive.close()

        def toString():
            """ Converts the document to a string. """
            # NOTE(review): the tag list is missing from this listing.  't'
            # is the SpreadsheetML element that holds shared-string text --
            # confirm against upstream.
            buffer = u""
            for val in ["t"]:
                for paragraph in content.getElementsByTagName(val):
                    buffer += textToString(paragraph) + "\n"
            return buffer

        res = toString().encode('ascii', 'replace')

        return _to_unicode(res)

cntIndex.register(XlsxIndex())
class PdfIndex(indexer):
    """Indexer for PDF files, relying on the ``pdftotext`` tool."""

    def _getMimeTypes(self):
        """Return the MIME types handled by this indexer."""
        return ['application/pdf']

    def _getExtensions(self):
        """Return the file extensions handled by this indexer."""
        # NOTE(review): the body is missing from this listing; '.pdf' is
        # inferred from the MIME type above -- confirm.
        return ['.pdf']

    def _doIndexFile(self, fname):
        """Run ``pdftotext`` on ``fname`` and return its output as text.

        The tool is asked for UTF-8 output, without page breaks, written to
        standard output.

        :param fname: path of the PDF file to convert.
        :return: the converted text; a false value when pdftotext could not
                 be executed.
        """
        try:
            pop = Popen(['pdftotext', '-enc', 'UTF-8', '-nopgbrk', fname, '-'], shell=False, stdout=PIPE)
            (data, _) = pop.communicate()
            return _to_unicode(data)
        except OSError:
            # Popen raises OSError when the executable cannot be started.
            # The handler line itself is not visible in this listing; the
            # exception type is taken from the log messages' intent.
            _logger.warning("Failed attempt to execute pdftotext. This program is necessary to index the file %s of MIME type %s. Detailed error available at DEBUG level.", fname, self._getMimeTypes()[0])
            _logger.debug("Trace of the failed file indexing attempt.", exc_info=True)
            # NOTE(review): the failure return value is not visible in this
            # listing; a false value is assumed -- confirm.
            return False

cntIndex.register(PdfIndex())
class ImageNoIndex(indexer):
    """Registered handler for images, whose content is not text-indexed."""

    def _getMimeTypes(self):
        """Return the MIME types handled by this indexer."""
        # NOTE(review): the body is missing from this listing; a wildcard
        # image type is assumed from the class name and from the extension
        # list kept in the comment below -- confirm.
        return ['image/*']

    def _getExtensions(self):
        """Return the file extensions handled by this indexer (none)."""
        #better return no extension, and let 'file' do its magic
        # NOTE(review): the return statement is missing from this listing;
        # an empty list is what the comment above describes -- confirm.
        return []
        #return ['.png','.jpg','.gif','.jpeg','.bmp','.tiff']

    def _doIndexContent(self, content):
        """Return the index text for an image, ignoring ``content``."""
        # NOTE(review): the body is missing from this listing; presumably a
        # constant placeholder is returned rather than anything derived from
        # the binary data.  'image' is assumed -- verify against upstream.
        return 'image'

cntIndex.register(ImageNoIndex())
195 # other opendocument formats:
196 # chart-template chart database
197 # formula-template formula graphics-template graphics
199 # presentation-template presentation spreadsheet-template spreadsheet
class OpenDoc(indexer):
    """ Index OpenDocument files.

        Q: is it really worth it to index spreadsheets, or do we only get a
        meaningless list of numbers (cell contents) ?
    """

    def _getMimeTypes(self):
        """Return the OpenDocument text MIME types handled by this indexer."""
        otypes = ['text', 'text-web', 'text-template', 'text-master']
        # A real list (not a lazy map object) so callers can index or
        # re-iterate it regardless of the interpreter version.
        return ['application/vnd.oasis.opendocument.' + a for a in otypes]

    def _getExtensions(self):
        """Return the file extensions handled by this indexer."""
        return ['.odt', '.ott', ] # '.ods'

    def _doIndexContent(self, content):
        """Convert the OpenDocument data in ``content`` to text.

        :param content: raw bytes of the document.
        :return: the text produced by ``odt2txt``, passed through
                 ``_to_unicode``.
        """
        s = StringIO.StringIO(content)
        try:
            o = odt2txt.OpenDocumentTextFile(s)
            result = _to_unicode(o.toString())
        finally:
            # Release the in-memory buffer even when the conversion fails.
            s.close()
        # NOTE(review): the return statement is missing from this listing;
        # returning the computed text is assumed -- confirm.
        return result

cntIndex.register(OpenDoc())
226 # vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4: