4 from generic import PdfObject
\r
5 from xml.dom import getDOMImplementation
\r
6 from xml.dom.minidom import parseString
\r
# XML namespace URIs of the schemas that are looked up in an XMP packet.
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"  # RDF syntax
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"  # Dublin Core (dc_*)
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"  # XMP basic (xmp_*)
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"  # Adobe PDF (pdf_*)
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"  # XMP media management (xmpmm_*)
\r
14 # What is the PDFX namespace, you might ask? I might ask that too. It's
\r
15 # a completely undocumented namespace used to place "custom metadata"
\r
16 # properties, which are arbitrary metadata properties with no semantic or
\r
17 # documented meaning. Elements in the namespace are key/value-style storage,
\r
18 # where the element name is the key and the content is the value. The keys
\r
19 # are transformed into valid XML identifiers by substituting an invalid
\r
20 # identifier character with \u2182 followed by the unicode hex ID of the
\r
21 # original character. A key like "my car" is therefore "my\u21820020car".
\r
23 # \u2182, in case you're wondering, is the unicode character
\r
24 # \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for
\r
25 # escaping characters.
\r
# Intentional use of the pdfx namespace is strongly discouraged. A
# custom data schema and sensible XML elements could be used instead, as is
# suggested by Adobe's own documentation on XMP (under "Extensibility of
# Schemas").
\r
32 # Information presented here on the /pdfx/ schema is a result of limited
\r
33 # reverse engineering, and does not constitute a full specification.
\r
# Namespace URI of the pdfx custom-metadata schema; nodes in this namespace
# are collected and key-decoded by XmpInformation.custom_properties.
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"
\r
# Pattern for the date strings converted by XmpInformation._converter_date:
#     YYYY[-MM[-DD[Thh:mm[:ss[.fraction]]TZD]]]
# where TZD is "Z" or a "+hh:mm" / "-hh:mm" offset.  Every part after the
# year is optional; groups that did not take part in the match are None.
# The pattern is compiled with re.VERBOSE, so the whitespace used for layout
# below is not part of the expression.  The fraction separator is an escaped
# literal dot: unescaped, it would accept any character (e.g. "05x25").
iso8601 = re.compile(r"""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
        """, re.VERBOSE)
\r
53 # An object that represents Adobe XMP metadata.
\r
class XmpInformation(PdfObject):
    """An object that represents Adobe XMP metadata.

    The XMP packet is obtained from ``stream.getData()`` and parsed once
    with ``xml.dom.minidom``.  The typed properties defined below
    (``dc_*``, ``pdf_*``, ``xmp_*``, ``xmpmm_*``) read their values from the
    ``rdf:Description`` elements whose ``rdf:about`` attribute is the empty
    string and memoize the converted result in ``self.cache``.
    """

    def __init__(self, stream):
        """Parse the XMP packet held by ``stream``.

        ``stream`` must provide ``getData()`` (the XML packet) and
        ``writeToStream()`` (used by ``writeToStream`` below).

        Raises IndexError if the packet contains no ``rdf:RDF`` element.
        """
        self.stream = stream
        docRoot = parseString(self.stream.getData())
        self.rdfRoot = docRoot.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")[0]
        # namespace URI -> {property name -> converted value}
        self.cache = {}

    def writeToStream(self, stream, encryption_key):
        """Serialize by delegating to the wrapped stream object."""
        self.stream.writeToStream(stream, encryption_key)

    def getElement(self, aboutUri, namespace, name):
        """Yield the DOM nodes that hold property ``name`` of ``namespace``.

        Only ``rdf:Description`` elements whose ``rdf:about`` equals
        ``aboutUri`` are searched.  For each of them the matching attribute
        node (if present) is yielded first, followed by every matching
        descendant element.
        """
        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") != aboutUri:
                continue
            attr = desc.getAttributeNodeNS(namespace, name)
            if attr is not None:
                yield attr
            for element in desc.getElementsByTagNameNS(namespace, name):
                yield element

    def getNodesInNamespace(self, aboutUri, namespace):
        """Yield the attribute and child nodes that belong to ``namespace``.

        Only ``rdf:Description`` elements whose ``rdf:about`` equals
        ``aboutUri`` are searched.  Attributes are yielded before direct
        child nodes; deeper descendants are not visited.
        """
        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") != aboutUri:
                continue
            for i in range(desc.attributes.length):
                attr = desc.attributes.item(i)
                if attr.namespaceURI == namespace:
                    yield attr
            for child in desc.childNodes:
                if child.namespaceURI == namespace:
                    yield child

    def _getText(self, element):
        """Return the concatenated data of the direct text children of ``element``."""
        return "".join(
            child.data
            for child in element.childNodes
            if child.nodeType == child.TEXT_NODE
        )

    # NOTE: the helpers from here down to _getter_single are plain functions
    # that are called while the class body executes (to build the properties
    # further below); they are not meant to be used as instance methods.

    def _converter_string(value):
        """Identity converter used for plain-text properties."""
        return value

    def _converter_date(value):
        """Convert a date string matched by ``iso8601`` to a naive UTC datetime.

        Missing month/day default to 1 and missing time fields to 0.  A time
        zone designator other than "Z" is removed by shifting the result to
        UTC.  Fractions of a second finer than a microsecond are truncated.

        Raises ValueError if ``value`` does not start with a recognizable date.
        """
        m = iso8601.match(value)
        if m is None:
            raise ValueError("Invalid XMP date: %r" % (value,))
        year = int(m.group("year"))
        month = int(m.group("month") or "1")
        day = int(m.group("day") or "1")
        hour = int(m.group("hour") or "0")
        minute = int(m.group("minute") or "0")
        # Decimal keeps the fractional part exact while it is split off.
        second = decimal.Decimal(m.group("second") or "0")
        whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
        microseconds = (second - whole_seconds) * 1000000
        # datetime.datetime() only accepts integers, not Decimal instances.
        dt = datetime.datetime(
            year, month, day, hour, minute, int(whole_seconds), int(microseconds)
        )
        tzd = m.group("tzd") or "Z"
        if tzd != "Z":
            # Take the sign from the designator itself: int("+00") and
            # int("-00") are both 0, which would lose the direction of
            # offsets such as "+00:30".
            sign = -1 if tzd[0] == "-" else 1
            tzd_hours, tzd_minutes = [int(x) for x in tzd[1:].split(":")]
            offset = datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
            # local time = UTC + offset, hence UTC = local time - offset
            dt = dt - sign * offset
        return dt
    _test_converter_date = staticmethod(_converter_date)

    def _getter_bag(namespace, name, converter):
        """Build the getter for an ``rdf:Bag`` (unordered array) property.

        The getter returns a list with the converted text of every
        ``rdf:li`` item found inside the property's ``rdf:Bag`` containers.
        """
        def get(self):
            cached = self.cache.get(namespace, {}).get(name)
            if cached is not None:
                return cached
            retval = []
            for element in self.getElement("", namespace, name):
                for bag in element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag"):
                    for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        retval.append(converter(self._getText(item)))
            self.cache.setdefault(namespace, {})[name] = retval
            return retval
        return get

    def _getter_seq(namespace, name, converter):
        """Build the getter for an ``rdf:Seq`` (ordered array) property.

        The getter returns a list with the converted text of every
        ``rdf:li`` item.  An element without any ``rdf:Seq`` container
        contributes its own text as a single item.
        """
        def get(self):
            cached = self.cache.get(namespace, {}).get(name)
            if cached is not None:
                return cached
            retval = []
            for element in self.getElement("", namespace, name):
                seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq")
                if len(seqs):
                    for seq in seqs:
                        for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            retval.append(converter(self._getText(item)))
                else:
                    retval.append(converter(self._getText(element)))
            self.cache.setdefault(namespace, {})[name] = retval
            return retval
        return get

    def _getter_langalt(namespace, name, converter):
        """Build the getter for an ``rdf:Alt`` (language alternative) property.

        The getter returns a dictionary that maps the ``xml:lang`` attribute
        of each ``rdf:li`` item to its converted text.  An element without
        any ``rdf:Alt`` container is stored under the key "x-default".
        """
        def get(self):
            cached = self.cache.get(namespace, {}).get(name)
            if cached is not None:
                return cached
            retval = {}
            for element in self.getElement("", namespace, name):
                alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
                if len(alts):
                    for alt in alts:
                        for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            value = converter(self._getText(item))
                            retval[item.getAttribute("xml:lang")] = value
                else:
                    retval["x-default"] = converter(self._getText(element))
            self.cache.setdefault(namespace, {})[name] = retval
            return retval
        return get

    def _getter_single(namespace, name, converter):
        """Build the getter for a single-valued property.

        The getter uses the first node returned by ``getElement`` (attribute
        value or element text) and returns its converted value, or None when
        the property is absent.
        """
        def get(self):
            cached = self.cache.get(namespace, {}).get(name)
            if cached is not None:
                return cached
            value = None
            for element in self.getElement("", namespace, name):
                if element.nodeType == element.ATTRIBUTE_NODE:
                    value = element.nodeValue
                else:
                    value = self._getText(element)
                break  # only the first occurrence is used
            if value is not None:
                value = converter(value)
            self.cache.setdefault(namespace, {})[name] = value
            return value
        return get

    ##
    # Contributors to the resource (other than the authors).  An unsorted
    # array of names.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string))

    ##
    # Text describing the extent or scope of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string))

    ##
    # A sorted array of names of the authors of the resource, listed in order
    # of precedence.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string))

    ##
    # A sorted array of dates (datetime.datetime instances) of significance to
    # the resource.  The dates and times are in UTC.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))

    ##
    # A language-keyed dictionary of textual descriptions of the content of the
    # resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string))

    ##
    # The mime-type of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string))

    ##
    # Unique identifier of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string))

    ##
    # An unordered array specifying the languages used in the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string))

    ##
    # An unordered array of publisher names.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string))

    ##
    # An unordered array of text descriptions of relationships to other
    # documents.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string))

    ##
    # A language-keyed dictionary of textual descriptions of the rights the
    # user has to this resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string))

    ##
    # Unique identifier of the work from which this resource was derived.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string))

    ##
    # An unordered array of descriptive phrases or keywords that specify the
    # topic of the content of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string))

    ##
    # A language-keyed dictionary of the title of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string))

    ##
    # An unordered array of textual descriptions of the document type.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string))

    ##
    # An unformatted text string representing document keywords.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string))

    ##
    # The PDF file version, for example 1.0, 1.3.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string))

    ##
    # The name of the tool that created the PDF document.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string))

    ##
    # The date and time the resource was originally created.  The date and
    # time are returned as a UTC datetime.datetime object.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date))

    ##
    # The date and time the resource was last modified.  The date and time
    # are returned as a UTC datetime.datetime object.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date))

    ##
    # The date and time that any metadata for this resource was last
    # changed.  The date and time are returned as a UTC datetime.datetime
    # object.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date))

    ##
    # The name of the first known tool used to create the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string))

    ##
    # The common identifier for all versions and renditions of this resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string))

    ##
    # An identifier for a specific incarnation of a document, updated each
    # time a file is saved.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string))

    def _decode_pdfx_key(key):
        """Undo the pdfx key escaping.

        Each U+2182 character followed by four hexadecimal digits is
        replaced by the character with that code point.

        Raises ValueError if U+2182 is not followed by hexadecimal digits.
        """
        start = 0
        while True:
            idx = key.find(u"\u2182", start)
            if idx == -1:
                return key
            key = key[:idx] + chr(int(key[idx + 1:idx + 5], base=16)) + key[idx + 5:]
            # Resume after the decoded character so that a decoded U+2182 is
            # not mistaken for the start of another escape sequence.
            start = idx + 1
    _decode_pdfx_key = staticmethod(_decode_pdfx_key)

    def custom_properties(self):
        """Collect (once) and return the decoded pdfx key/value pairs."""
        if not hasattr(self, "_custom_properties"):
            self._custom_properties = {}
            for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
                key = self._decode_pdfx_key(node.localName)
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._getText(node)
                self._custom_properties[key] = value
        return self._custom_properties

    ##
    # Retrieves custom metadata properties defined in the undocumented pdfx
    # metadata schema.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    # @return Returns a dictionary of key/value items for custom metadata
    # properties.
    custom_properties = property(custom_properties)
\r
357 # vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4: