C
Chris McDonough
ElementTree's XML serialization routine implied by tree._write(file,
node, encoding, namespaces) looks like this (elided):
def _write(self, file, node, encoding, namespaces):
# write XML to file
tag = node.tag
if tag is Comment:
file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
elif tag is ProcessingInstruction:
file.write("<?%s?>" % _escape_cdata(node.text, encoding))
else:
....
file.write("<" + _encode(tag, encoding))
if items or xmlns_items:
items.sort() # lexical order
Note that "_escape_cdata" (which also performs encoding) and "_encode"
are called for pcdata (and attribute values) only, but not for the tag
literals like "<" and "<?%s?>".
In some profiling I've done, I believe encoding during recursion makes
serialization slightly slower than it could be if we could get away with
not encoding any pcdata or attribute values during recursion.
Instead, we might be able to get away with encoding everything just once
at the end. But I don't know if this is kosher. Is there any reason to
not also encode tag literals and quotation marks that are attribute
containers, just once, at the end of serialization?
Even if that's not acceptable in general because tag literals cannot be
encoded, would it be acceptable for "ascii-compatible" encodings like
utf-8, latin-1, and friends?
Something like:
def _escape_cdata(text, encoding=None, replace=string.replace):
# doesn't do any encoding
text = replace(text, "&", "&")
text = replace(text, "<", "<")
text = replace(text, ">", ">")
return text
class _ElementInterface:
...
def write(self, file, encoding="us-ascii"):
    """Serialize the tree as XML and write it to *file*.

    The document is first built as text in an in-memory buffer and is
    encoded exactly once, after serialization has finished.

    file     -- an object with a ``write`` method that accepts the
                encoded data, or a file name to open in binary mode.
    encoding -- output encoding; a false value means "us-ascii".  An XML
                declaration is emitted for every encoding other than
                "utf-8" and "us-ascii".
    """
    assert self._root is not None
    if not encoding:
        encoding = "us-ascii"
    tmp = StringIO()
    if encoding != "utf-8" and encoding != "us-ascii":
        # The declaration goes through the same buffer as the body so
        # that it is encoded together with the rest of the document.
        tmp.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
    self._write(tmp, self._root, encoding, {})
    data = tmp.getvalue().encode(encoding)
    if hasattr(file, "write"):
        file.write(data)
        return
    # Open the target only after serialization succeeded, and always
    # close a file that this method opened itself.
    out = open(file, "wb")
    try:
        out.write(data)
    finally:
        out.close()
def _write(self, file, node, encoding, namespaces):
# write XML to file
tag = node.tag
if tag is Comment:
file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
elif tag is ProcessingInstruction:
file.write("<?%s?>" % _escape_cdata(node.text, encoding))
else:
items = node.items()
xmlns_items = [] # new namespaces in this scope
try:
if isinstance(tag, QName) or tag[:1] == "{":
tag, xmlns = fixtag(tag, namespaces)
if xmlns: xmlns_items.append(xmlns)
except TypeError:
_raise_serialization_error(tag)
file.write("<" + tag)
I smell the mention of a Byte Order Mark coming on. ;-)
ElementTree's XML serialization routine implied by tree._write(file,
node, encoding, namespaces) looks like this (elided):
def _write(self, file, node, encoding, namespaces):
# write XML to file
tag = node.tag
if tag is Comment:
file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
elif tag is ProcessingInstruction:
file.write("<?%s?>" % _escape_cdata(node.text, encoding))
else:
....
file.write("<" + _encode(tag, encoding))
if items or xmlns_items:
items.sort() # lexical order
Note that "_escape_cdata" (which also performs encoding) and "_encode"
are called for pcdata (and attribute values) only, but not for the tag
literals like "<" and "<?%s?>".
In some profiling I've done, I believe encoding during recursion makes
serialization slightly slower than it could be if we could get away with
not encoding any pcdata or attribute values during recursion.
Instead, we might be able to get away with encoding everything just once
at the end. But I don't know if this is kosher. Is there any reason to
not also encode tag literals and quotation marks that are attribute
containers, just once, at the end of serialization?
Even if that's not acceptable in general because tag literals cannot be
encoded, would it be acceptable for "ascii-compatible" encodings like
utf-8, latin-1, and friends?
Something like:
def _escape_cdata(text, encoding=None, replace=string.replace):
# doesn't do any encoding
text = replace(text, "&", "&")
text = replace(text, "<", "<")
text = replace(text, ">", ">")
return text
class _ElementInterface:
...
def write(self, file, encoding="us-ascii"):
    """Serialize the tree as XML and write it to *file*.

    The document is first built as text in an in-memory buffer and is
    encoded exactly once, after serialization has finished.

    file     -- an object with a ``write`` method that accepts the
                encoded data, or a file name to open in binary mode.
    encoding -- output encoding; a false value means "us-ascii".  An XML
                declaration is emitted for every encoding other than
                "utf-8" and "us-ascii".
    """
    assert self._root is not None
    if not encoding:
        encoding = "us-ascii"
    tmp = StringIO()
    if encoding != "utf-8" and encoding != "us-ascii":
        # The declaration goes through the same buffer as the body so
        # that it is encoded together with the rest of the document.
        tmp.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
    self._write(tmp, self._root, encoding, {})
    data = tmp.getvalue().encode(encoding)
    if hasattr(file, "write"):
        file.write(data)
        return
    # Open the target only after serialization succeeded, and always
    # close a file that this method opened itself.
    out = open(file, "wb")
    try:
        out.write(data)
    finally:
        out.close()
def _write(self, file, node, encoding, namespaces):
# write XML to file
tag = node.tag
if tag is Comment:
file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
elif tag is ProcessingInstruction:
file.write("<?%s?>" % _escape_cdata(node.text, encoding))
else:
items = node.items()
xmlns_items = [] # new namespaces in this scope
try:
if isinstance(tag, QName) or tag[:1] == "{":
tag, xmlns = fixtag(tag, namespaces)
if xmlns: xmlns_items.append(xmlns)
except TypeError:
_raise_serialization_error(tag)
file.write("<" + tag)
I smell the mention of a Byte Order Mark coming on. ;-)