import sys
import warnings
import xmlrpclib
+import re
+import base64
from psycopg2 import Binary
import osv
size = 0
return (x, tools.human_size(size))
+
+# See http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char
+# and http://bugs.python.org/issue10066
+invalid_xml_low_bytes = re.compile(r'[\x00-\x08\x0b-\x0c\x0e-\x1f]')
+
def sanitize_binary_value(dict_item):
# binary fields should be 7-bit ASCII base64-encoded data,
# but we do additional sanity checks to make sure the values
- # are not something else that won't pass via xmlrpc
+ # are not something else that won't pass via XML-RPC
index, value = dict_item
if isinstance(value, (xmlrpclib.Binary, tuple, list, dict)):
# these builtin types are meant to pass untouched
return index, value
- # For all other cases, handle the value as a binary string:
- # it could be a 7-bit ASCII string (e.g base64 data), but also
- # any 8-bit content from files, with byte values that cannot
- # be passed inside XML!
- # See for more info:
+ # Handle invalid bytes values that will cause problems
+ # for XML-RPC. See for more info:
# - http://bugs.python.org/issue10066
# - http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char
- #
- # One solution is to convert the byte-string to unicode,
- # so it gets serialized as utf-8 encoded data (always valid XML)
- # If invalid XML byte values were present, tools.ustr() uses
- # the Latin-1 codec as fallback, which converts any 8-bit
- # byte value, resulting in valid utf-8-encoded bytes
- # in the end:
- # >>> unicode('\xe1','latin1').encode('utf8') == '\xc3\xa1'
- # Note: when this happens, decoding on the other endpoint
- # is not likely to produce the expected output, but this is
- # just a safety mechanism (in these cases base64 data or
- # xmlrpc.Binary values should be used instead)
- return index, tools.ustr(value)
+
+ # Coercing to unicode would normally allow it to properly pass via
+ # XML-RPC, transparently encoded as UTF-8 by xmlrpclib.
+ # (this works for _any_ byte values, thanks to the fallback
+ # to latin-1 passthrough encoding when decoding to unicode)
+ value = tools.ustr(value)
+
+ # Due to Python bug #10066 this could still yield invalid XML
+ # bytes, specifically in the low byte range, that will crash
+ # the decoding side: [\x00-\x08\x0b-\x0c\x0e-\x1f]
+ # So check for low bytes values, and if any, perform
+ # base64 encoding - not very smart or useful, but this is
+ # our last resort to avoid crashing the request.
+ if invalid_xml_low_bytes.search(value):
+ # b64-encode after restoring the pure bytes with latin-1
+ # passthrough encoding
+ value = base64.b64encode(value.encode('latin-1'))
+
+ return index, value
+
# ---------------------------------------------------------