addons/base_report_designer/openerp_sxw2rml/openerp_sxw2rml.py

   1 # -*- coding: utf-8 -*-
   2 ##############################################################################
   3 #
   4 # Copyright (c):
   5 #
   6 #     2005 pyopenoffice.py Martin Simon (http://www.bezirksreiter.de)
   7 #     2005 Fabien Pinckaers, TINY SPRL. (http://tiny.be)
   8 #
   9 #    This program is free software: you can redistribute it and/or modify
  10 #    it under the terms of the GNU Affero General Public License as
  11 #    published by the Free Software Foundation, either version 3 of the
  12 #    License, or (at your option) any later version.
  13 #
  14 #    This program is distributed in the hope that it will be useful,
  15 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 #    GNU Affero General Public License for more details.
  18 #
  19 #    You should have received a copy of the GNU Affero General Public License
  20 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  21 #
  22 ##############################################################################
  23 #!/usr/bin/python
  24 """
  25 OpenERP SXW2RML - The OpenERP's report engine
  26
  27 OpenERP SXW2RML is part of the OpenERP Report Project.
  28 OpenERP Report is a module that allows you to render high quality PDF documents
  29 from an OpenOffice template (.sxw) and any relational database.
  30 """
  31 __version__ = '0.9'
  32
  33
  34 import re
  35 import string
  36 import os
  37 import zipfile
  38 import xml.dom.minidom
  39 from reportlab.lib.units import toLength
  40 import base64
  41 import copy
  42
  43 class DomApiGeneral:
  44     """General DOM API utilities."""
  45     def __init__(self, content_string="", file=""):
  46         self.content_string = content_string
  47         self.re_digits = re.compile(r"(.*?\d)(pt|cm|mm|inch|in)")
  48
  49     def _unitTuple(self, string):
  50         """Split values and units to a tuple."""
  51         temp = self.re_digits.findall(string)
  52         if not temp:
  53             return (string,"")
  54         else:
  55             return (temp[0])
  56
  57     def stringPercentToFloat(self, string):
  58         temp = string.replace("""%""","")
  59         return float(temp)/100
  60
  61     def findChildrenByName(self, parent, name, attr_dict=None):
  62         """Helper functions. Does not work recursively.
  63         Optional: also test for certain attribute/value pairs."""
  64         if attr_dict is None:
  65             attr_dict = {}
  66         children = []
  67         for c in parent.childNodes:
  68             if c.nodeType == c.ELEMENT_NODE and c.nodeName == name:
  69                 children.append(c)
  70         if attr_dict == {}:
  71             return children
  72         else:
  73             return self._selectForAttributes(nodelist=children,attr_dict=attr_dict)
  74
  75     def _selectForAttributes(self, nodelist, attr_dict):
  76         "Helper function."""
  77         selected_nodes = []
  78         for n in nodelist:
  79             check = 1
  80             for a in attr_dict.keys():
  81                 if n.getAttribute(a) != attr_dict[a]:
  82                     # at least one incorrect attribute value?
  83                     check = 0
  84             if check:
  85                 selected_nodes.append(n)
  86         return selected_nodes
  87
  88     def _stringToTuple(self, s):
  89         """Helper function."""
  90         try:
  91             temp = string.split(s,",")
  92             return int(temp[0]),int(temp[1])
  93         except:
  94             return None
  95
  96     def _tupleToString(self, t):
  97         try:
  98             return self.openOfficeStringUtf8("%s,%s" % (t[0],t[1]))
  99         except:
 100             return None
 101
 102     def _lengthToFloat(self, value):
 103         v = value
 104         if not self.re_digits.search(v):
 105             return v
 106         try:
 107             if v[-4:] == "inch":
 108                 # OO files use "inch" instead of "in" in Reportlab units
 109                 v = v[:-2]
 110         except:
 111             pass
 112         try:
 113             c = round(toLength(v))
 114             return c
 115         except:
 116             return v
 117
 118     def openOfficeStringUtf8(self, string):
 119         if type(string) == unicode:
 120             return string.encode("utf-8")
 121         tempstring = unicode(string,"cp1252").encode("utf-8")
 122         return tempstring
 123
 124 class DomApi(DomApiGeneral):
 125     """This class provides a DOM-API for XML-Files from an SXW-Archive."""
 126     def __init__(self, xml_content, xml_styles):
 127         DomApiGeneral.__init__(self)
 128         self.content_dom = xml.dom.minidom.parseString(xml_content)
 129         self.styles_dom = xml.dom.minidom.parseString(xml_styles)
 130         body = self.content_dom.getElementsByTagName("office:body")
 131         self.body = body and body[0]
 132
 133         # TODO:
 134         self.style_dict = {}
 135         self.style_properties_dict = {}
 136
 137         # ******** always use the following order:
 138         self.buildStyleDict()
 139         self.buildStylePropertiesDict()
 140         if self.styles_dom.getElementsByTagName("style:page-master").__len__()<>0:
 141             self.page_master = self.styles_dom.getElementsByTagName("style:page-master")[0]
 142         if  self.styles_dom.getElementsByTagName("style:page-layout").__len__()<>0 :
 143                         self.page_master = self.styles_dom.getElementsByTagName("style:page-layout")[0]
 144         self.document = self.content_dom.getElementsByTagName("office:document-content")[0]
 145
 146     def buildStylePropertiesDict(self):
 147         for s in self.style_dict.keys():
 148             self.style_properties_dict[s] = self.getStylePropertiesDict(s)
 149
 150     def updateWithPercents(self, dict, updatedict):
 151         """Sometimes you find values like "115%" in the style hierarchy."""
 152         if not updatedict:
 153             # no style hierarchies for this style? =>
 154             return
 155         new_updatedict = copy.copy(updatedict)
 156         for u in new_updatedict.keys():
 157             try:
 158                 if new_updatedict[u].find("""%""") != -1 and dict.has_key(u):
 159                     number = float(self.re_digits.search(dict[u]).group(1))
 160                     unit = self.re_digits.search(dict[u]).group(2)
 161                     new_number = self.stringPercentToFloat(new_updatedict[u]) * number
 162                     if unit == "pt":
 163                         new_number = int(new_number)
 164                         # no floats allowed for "pt"
 165                         # OOo just takes the int, does not round (try it out!)
 166                     new_updatedict[u] = "%s%s" % (new_number,unit)
 167                 else:
 168                     dict[u] = new_updatedict[u]
 169             except:
 170                 dict[u] = new_updatedict[u]
 171         dict.update(new_updatedict)
 172
 173     def normalizeStyleProperties(self):
 174         """Transfer all style:style-properties attributes from the
 175         self.style_properties_hierarchical dict to the automatic-styles
 176         from content.xml. Use this function to preprocess content.xml for
 177         XSLT transformations etc.Do not try to implement this function
 178         with XSlT - believe me, it's a terrible task..."""
 179         styles_styles = self.styles_dom.getElementsByTagName("style:style")
 180         automatic_styles = self.content_dom.getElementsByTagName("office:automatic-styles")[0]
 181         for s in styles_styles:
 182             automatic_styles.appendChild(s.cloneNode(deep=1))
 183         content_styles = self.content_dom.getElementsByTagName("style:style")
 184         # these are the content_styles with styles_styles added!!!
 185         for s in content_styles:
 186             c = self.findChildrenByName(s,"style:properties")
 187             if c == []:
 188                 # some derived automatic styles do not have "style:properties":
 189                 temp = self.content_dom.createElement("style:properties")
 190                 s.appendChild(temp)
 191                 c = self.findChildrenByName(s,"style:properties")
 192             c = c[0]
 193             dict = self.style_properties_dict[(s.getAttribute("style:name")).encode("utf-8")] or {}
 194             for attribute in dict.keys():
 195                 c.setAttribute(self.openOfficeStringUtf8(attribute),self.openOfficeStringUtf8(dict[attribute]))
 196
 197     def transferStylesXml(self):
 198         """Transfer certain sub-trees from styles.xml to the normalized content.xml
 199         (see above). It is not necessary to do this - for example - with paragraph styles.
 200         the "normalized" style properties contain all information needed for
 201         further processing."""
 202         # TODO: What about table styles etc.?
 203         outline_styles = self.styles_dom.getElementsByTagName("text:outline-style")
 204         t = self.content_dom.createElement("transferredfromstylesxml")
 205         self.document.insertBefore(t,self.body)
 206         t_new = self.body.previousSibling
 207         try:
 208             page_master = self.page_master
 209             t_new.appendChild(page_master.cloneNode(deep=1))
 210             t_new.appendChild(outline_styles[0].cloneNode(deep=1))
 211         except:
 212             pass
 213
 214     def normalizeLength(self):
 215         """Normalize all lengthes to floats (i.e: 1 inch = 72).
 216         Always use this after "normalizeContent" and "transferStyles"!"""
 217         # TODO: The complex attributes of table cell styles are not transferred yet.
 218         #all_styles = self.content_dom.getElementsByTagName("style:properties")
 219         #all_styles += self.content_dom.getElementsByTagName("draw:image")
 220         all_styles = self.content_dom.getElementsByTagName("*")
 221         for s in all_styles:
 222             for x in s._attrs.keys():
 223                 v = s.getAttribute(x)
 224                 s.setAttribute(x,"%s" % self._lengthToFloat(v))
 225                 # convert float to string first!
 226
 227     def normalizeTableColumns(self):
 228         """Handle this strange table:number-columns-repeated attribute."""
 229         columns = self.content_dom.getElementsByTagName("table:table-column")
 230         for c in columns:
 231             if c.hasAttribute("table:number-columns-repeated"):
 232                 number = int(c.getAttribute("table:number-columns-repeated"))
 233                 c.removeAttribute("table:number-columns-repeated")
 234                 for i in range(number-1):
 235                     (c.parentNode).insertBefore(c.cloneNode(deep=1),c)
 236
 237     def buildStyleDict(self):
 238         """Store all style:style-nodes from content.xml and styles.xml in self.style_dict.
 239         Caution: in this dict the nodes from two dom apis are merged!"""
 240         for st in (self.styles_dom,self.content_dom):
 241             for s in st.getElementsByTagName("style:style"):
 242                 name = s.getAttribute("style:name").encode("utf-8")
 243                 self.style_dict[name] = s
 244         return True
 245
 246     def toxml(self):
 247         return self.content_dom.toxml(encoding="utf-8")
 248
 249     def getStylePropertiesDict(self, style_name):
 250         res = {}
 251
 252         if self.style_dict[style_name].hasAttribute("style:parent-style-name"):
 253             parent = self.style_dict[style_name].getAttribute("style:parent-style-name").encode("utf-8")
 254             res = self.getStylePropertiesDict(parent)
 255
 256         children = self.style_dict[style_name].childNodes
 257         for c in children:
 258             if c.nodeType == c.ELEMENT_NODE and c.nodeName.find("properties")>0 :
 259                 for attr in c._attrs.keys():
 260                     res[attr] = c.getAttribute(attr).encode("utf-8")
 261         return res
 262
 263 class PyOpenOffice(object):
 264     """This is the main class which provides all functionality."""
 265     def __init__(self, path='.', save_pict=False):
 266         self.path = path
 267         self.save_pict = save_pict
 268         self.images = {}
 269
 270     def oo_read(self, fname):
 271         z = zipfile.ZipFile(fname,"r")
 272         content = z.read('content.xml')
 273         style = z.read('styles.xml')
 274         all = z.namelist()
 275         for a in all:
 276             if a[:9]=='Pictures/' and len(a)>10:
 277                 pic_content = z.read(a)
 278                 self.images[a[9:]] = pic_content
 279                 if self.save_pict:
 280                     f=open(os.path.join(self.path, os.path.basename(a)),"wb")
 281                     f.write(pic_content)
 282                     f.close()
 283         z.close()
 284         return content,style
 285
 286     def oo_replace(self, content):
 287         regex = [
 288             (r"<para[^>]*/>", ""),
 289             (r"<para(.*)>(.*?)<text:line-break[^>]*/>", "<para$1>$2</para><para$1>"),
 290         ]
 291         for key,val in regex:
 292             content = re.sub(key, val, content)
 293         return content
 294
 295     def unpackNormalize(self, sourcefile):
 296         c,s = self.oo_read(sourcefile)
 297         c = self.oo_replace(c)
 298         dom = DomApi(c,s)
 299         dom.normalizeStyleProperties()
 300         dom.transferStylesXml()
 301         dom.normalizeLength()
 302         dom.normalizeTableColumns()
 303         new_c = dom.toxml()
 304         return new_c
 305
 306 def sxw2rml(sxw_file, xsl, output='.', save_pict=False):
 307     from lxml import etree
 308     from StringIO import StringIO
 309
 310     tool = PyOpenOffice(output, save_pict = save_pict)
 311     res = tool.unpackNormalize(sxw_file)
 312
 313     f = StringIO(xsl)
 314     styledoc = etree.parse(f)
 315     style = etree.XSLT(styledoc)
 316
 317     f = StringIO(res)
 318     doc = etree.parse(f)
 319     result = style(doc)
 320     root = etree.XPathEvaluator(result)("/document/stylesheet")
 321
 322     if root:
 323         root=root[0]
 324         images = etree.Element("images")
 325         for img in tool.images:
 326             node = etree.Element('image', name=img)
 327             node.text = base64.encodestring(tool.images[img])
 328             images.append(node)
 329         root.append(images)
 330
 331     try:
 332         xml = str(result)
 333         return xml
 334     except:
 335         return result
 336
 337 if __name__ == "__main__":
 338     import optparse
 339     parser = optparse.OptionParser(
 340         version="OpenERP Report v%s" % __version__,
 341         usage = 'openerp_sxw2rml.py [options] file.sxw')
 342     parser.add_option("-v", "--verbose", default=False, dest="verbose", help="enable basic debugging")
 343     parser.add_option("-o", "--output", dest="output", default='.', help="directory of image output")
 344     (opt, args) = parser.parse_args()
 345     if len(args) != 1:
 346         parser.error("Incorrect number of arguments.")
 347
 348     import sys
 349
 350     fname = sys.argv[1]
 351     f = fname
 352     xsl_file = 'normalized_oo2rml.xsl'
 353     z = zipfile.ZipFile(fname,"r")
 354     mimetype = z.read('mimetype')
 355     if mimetype.split('/')[-1] == 'vnd.oasis.opendocument.text' :
 356                 xsl_file = 'normalized_odt2rml.xsl'
 357     xsl = file(os.path.join(os.getcwd(), os.path.dirname(sys.argv[0]), xsl_file)).read()
 358     result = sxw2rml(f, xsl, output=opt.output, save_pict=False)
 359
 360     print result
 361 # vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4:
 362