Benchmarking stripping of Unicode characters which are invalid XML

Alex Willmer · Mar 18, 2012

Last week I was surprised to discover that there are Unicode characters that aren't valid in an XML document. That is regardless of escaping (e.g. as a numeric character reference such as &#x13;) and Unicode encoding (e.g. UTF-8) - not every Unicode string can be stored in XML. The valid characters are (as of XML 1.0) #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]. Others such as #x13 must be stripped, replaced or placed inside a wrapper such as base64.

I didn't find an existing function to strip these so I wrote some and benchmarked them. I'd be interested in thoughts, suggestions and improvements.

regsub_p2 was the fastest on a string containing mostly printable-ascii.

regsub_p1 0.422097921371 True
regsub_p2 0.353546857834 True
regsub_p3 0.697242021561 True
regsub_p4 0.677567005157 True
genexp_p1 6.43633103371 True
genexp_p2 6.43329787254 True
genexp_p3 6.80837488174 True
genexp_p4 6.81470417976 True
filter_p1 7.21444416046 True
filter_p2 7.46805095673 True
filter_p3 7.37018704414 True
filter_p4 7.03261303902 True
genexp_f1 12.8470640182 True
genexp_f2 5.43630099297 True
genexp_f3 4.9708840847 True
genexp_f4 12.2384109497 True
genexp_f5 6.95861411095 True
genexp_f6 4.7168610096 True
genexp_f7 20.2065701485 True
genexp_f8 21.1112251282 True

Regards, Alex
#!/usr/bin/python
# Valid XML 1.0 characters are
# #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
# http://www.w3.org/TR/2008/PER-xml-20080205/#charsets
#
# Before passing an arbitrary unicode string to an XML encoder invalid characters
# must be stripped or replaced. Escaping them doesn't help - they're simply not
# allowed in a well formed XML 1.0 document.

# The following script benchmarks several functions that strip them

import re
import string
import timeit

# p1 and p2 each match one character that is NOT a valid XML 1.0 character.
# They are the same character class; only the position of the tab/LF/CR
# escapes inside the class differs.
p1 = re.compile(
    u'[^\x09\x0A\x0D\u0020-\uD7FF'
    u'\uE000-\uFFFD\U00010000-\U0010FFFF]',
    flags=re.U
)

p2 = re.compile(
    u'[^\u0020-\uD7FF\x09\x0A\x0D'
    u'\uE000-\uFFFD\U00010000-\U0010FFFF]',
    flags=re.U
)

# p3 and p4 are the "one or more" variants of p1 and p2 respectively.
p3, p4 = (re.compile(single.pattern + u'+', re.U) for single in (p1, p2))

def regsub_p1(s):
    """Return *s* with every match of ``p1`` replaced by the empty string."""
    stripped = p1.sub(u'', s)
    return stripped
def regsub_p2(s):
    """Return *s* with every match of ``p2`` replaced by the empty string."""
    stripped = p2.sub(u'', s)
    return stripped
def regsub_p3(s):
    """Return *s* with every match of ``p3`` replaced by the empty string."""
    stripped = p3.sub(u'', s)
    return stripped
def regsub_p4(s):
    """Return *s* with every match of ``p4`` replaced by the empty string."""
    stripped = p4.sub(u'', s)
    return stripped

def genexp_p1(s):
    """Rebuild *s* from the characters that ``p1`` does not match.

    Each character is tested on its own inside a generator expression.
    """
    kept = (ch for ch in s if p1.match(ch) is None)
    return u''.join(kept)
def genexp_p2(s):
    """Rebuild *s* from the characters that ``p2`` does not match.

    Each character is tested on its own inside a generator expression.
    """
    kept = (ch for ch in s if p2.match(ch) is None)
    return u''.join(kept)
def genexp_p3(s):
    """Rebuild *s* from the characters that ``p3`` does not match.

    Each character is tested on its own inside a generator expression.
    """
    kept = (ch for ch in s if p3.match(ch) is None)
    return u''.join(kept)
def genexp_p4(s):
    """Rebuild *s* from the characters that ``p4`` does not match.

    Each character is tested on its own inside a generator expression.
    """
    kept = (ch for ch in s if p4.match(ch) is None)
    return u''.join(kept)

def filter_p1(s):
    """Rebuild *s* via ``filter``, keeping characters ``p1`` does not match."""
    survivors = filter(lambda ch: p1.match(ch) is None, s)
    return u''.join(survivors)
def filter_p2(s):
    """Rebuild *s* via ``filter``, keeping characters ``p2`` does not match."""
    survivors = filter(lambda ch: p2.match(ch) is None, s)
    return u''.join(survivors)
def filter_p3(s):
    """Rebuild *s* via ``filter``, keeping characters ``p3`` does not match."""
    survivors = filter(lambda ch: p3.match(ch) is None, s)
    return u''.join(survivors)
def filter_p4(s):
    """Rebuild *s* via ``filter``, keeping characters ``p4`` does not match."""
    survivors = filter(lambda ch: p4.match(ch) is None, s)
    return u''.join(survivors)

def f1(c):
    """Return True if the single character *c* is a valid XML 1.0 character.

    The test is done on the integer code point: tab/LF/CR first, then the
    three permitted ranges.
    """
    i = ord(c)
    # Membership is tested against a constant tuple; the earlier
    # set([...]) was constructed afresh on every call.
    return (i in (0x09, 0x0A, 0x0D) or 0x0020 <= i <= 0xD7FF
            or 0xE000 <= i <= 0xFFFD or 0x00010000 <= i <= 0x0010FFFF)

def f2(c):
    """Return True if the single character *c* is a valid XML 1.0 character.

    Same code-point test as ``f1`` but the U+0020..U+D7FF range is checked
    before tab/LF/CR.
    """
    i = ord(c)
    # Membership is tested against a constant tuple; the earlier
    # set([...]) was constructed afresh whenever this branch was reached.
    return (0x0020 <= i <= 0xD7FF or i in (0x09, 0x0A, 0x0D)
            or 0xE000 <= i <= 0xFFFD or 0x00010000 <= i <= 0x0010FFFF)

def f3(c):
    """Return True if the single character *c* is a valid XML 1.0 character.

    Compares the character itself (no ``ord``); the U+0020..U+D7FF range
    is checked before tab/LF/CR.
    """
    # Membership is tested against a constant tuple; the earlier
    # set([...]) was constructed afresh whenever this branch was reached.
    return (u'\u0020' <= c <= u'\uD7FF' or c in (u'\x09', u'\x0A', u'\x0D')
            or u'\uE000' <= c <= u'\uFFFD' or u'\U00010000' <= c <= u'\U0010FFFF')

def f4(c):
    """Return True if the single character *c* is a valid XML 1.0 character.

    Compares the character itself (no ``ord``); tab/LF/CR are checked
    before the three permitted ranges.
    """
    # Membership is tested against a constant tuple; the earlier
    # set([...]) was constructed afresh on every call.
    return (c in (u'\x09', u'\x0A', u'\x0D') or u'\u0020' <= c <= u'\uD7FF'
            or u'\uE000' <= c <= u'\uFFFD' or u'\U00010000' <= c <= u'\U0010FFFF')

def f5(c):
    """Return True if the single character *c* is a valid XML 1.0 character.

    Tab, LF and CR are tested by equality first, then the three permitted
    ranges in ascending order.
    """
    if c == u'\x09' or c == u'\x0A' or c == u'\x0D':
        return True
    if u'\u0020' <= c <= u'\uD7FF':
        return True
    if u'\uE000' <= c <= u'\uFFFD':
        return True
    return u'\U00010000' <= c <= u'\U0010FFFF'

def f6(c):
    """Return True if the single character *c* is a valid XML 1.0 character.

    The U+0020..U+D7FF range is tested first, then tab/LF/CR by equality,
    then the two remaining ranges.
    """
    if u'\u0020' <= c <= u'\uD7FF':
        return True
    if c == u'\x09' or c == u'\x0A' or c == u'\x0D':
        return True
    if u'\uE000' <= c <= u'\uFFFD':
        return True
    return u'\U00010000' <= c <= u'\U0010FFFF'

# Lookup data for f7/f8: split the 256 code points U+0000..U+00FF into those
# f1 accepts and those it rejects, then prepare byte-level helpers for the
# rejected ones.
every_8bit = u''.join(map(unichr, range(256)))
valid_8bit = u''.join(ch for ch in every_8bit if f1(ch))
invalid_8bit = u''.join(ch for ch in every_8bit if not f1(ch))
invalid_8bit_iso88591 = invalid_8bit.encode('iso-8859-1')
# Translation table that turns every rejected byte into NUL.
translator = string.maketrans(
    invalid_8bit_iso88591,
    len(invalid_8bit_iso88591) * '\x00')

def f7(c):
    """Return True if the single character *c* is a valid XML 1.0 character.

    Characters up to U+00FF are looked up through the module-level
    ``translator`` table; everything above is checked by range comparison.
    """
    if c <= u'\xff':
        # translator maps each invalid 8-bit byte to NUL, so a non-zero
        # ordinal means the byte passed through unchanged.
        return ord(string.translate(c.encode('iso-8859-1'), translator)) != 0
    # c is above U+00FF here, so ``c <= U+D7FF`` covers U+0100..U+D7FF.
    # That range was previously missing, which made f7 reject valid
    # characters such as Greek, Cyrillic and CJK letters.
    return (c <= u'\uD7FF' or u'\uE000' <= c <= u'\uFFFD'
            or u'\U00010000' <= c <= u'\U0010FFFF')

def f8(c):
    """Return True if the single character *c* is a valid XML 1.0 character.

    Characters up to U+00FF are checked by deleting the module-level
    ``invalid_8bit_iso88591`` bytes from their ISO-8859-1 encoding;
    everything above is checked by range comparison.
    """
    if c <= u'\xff':
        # An invalid byte is deleted, leaving an empty (falsy) string.
        return bool(string.translate(c.encode('iso-8859-1'), None,
                                     invalid_8bit_iso88591))
    # c is above U+00FF here, so ``c <= U+D7FF`` covers U+0100..U+D7FF.
    # That range was previously missing, which made f8 reject valid
    # characters such as Greek, Cyrillic and CJK letters.
    return (c <= u'\uD7FF' or u'\uE000' <= c <= u'\uFFFD'
            or u'\U00010000' <= c <= u'\U0010FFFF')

def genexp_f1(s):
    """Rebuild *s* from the characters for which ``f1`` returns a true value."""
    kept = (ch for ch in s if f1(ch))
    return u''.join(kept)
def genexp_f2(s):
    """Rebuild *s* from the characters for which ``f2`` returns a true value."""
    kept = (ch for ch in s if f2(ch))
    return u''.join(kept)
def genexp_f3(s):
    """Rebuild *s* from the characters for which ``f3`` returns a true value."""
    kept = (ch for ch in s if f3(ch))
    return u''.join(kept)
def genexp_f4(s):
    """Rebuild *s* from the characters for which ``f4`` returns a true value."""
    kept = (ch for ch in s if f4(ch))
    return u''.join(kept)
def genexp_f5(s):
    """Rebuild *s* from the characters for which ``f5`` returns a true value."""
    kept = (ch for ch in s if f5(ch))
    return u''.join(kept)
def genexp_f6(s):
    """Rebuild *s* from the characters for which ``f6`` returns a true value."""
    kept = (ch for ch in s if f6(ch))
    return u''.join(kept)
def genexp_f7(s):
    """Rebuild *s* from the characters for which ``f7`` returns a true value."""
    kept = (ch for ch in s if f7(ch))
    return u''.join(kept)
def genexp_f8(s):
    """Rebuild *s* from the characters for which ``f8`` returns a true value."""
    kept = (ch for ch in s if f8(ch))
    return u''.join(kept)

if __name__ == '__main__':
    # Benchmark driver.  For each stripping function, print one line:
    #   <name> <best of 3 runs of 100000 calls, in seconds> <output correct?>

    # Input containing three invalid characters (\x00, \x13, \ufffe) mixed
    # with valid ones (tab, CR, newline, \xf7 and printable ASCII).
    sample_in = u'''Lorem ipsum dolor sit amet\x00, consectetur adipisicing
elit, \tsed \rdo eiusmod tempor incididunt \x13ut labore et dolore magna
\xf7aliqua.\ufffe'''

    # The same text with only the invalid characters removed.
    expected_out = u'''Lorem ipsum dolor sit amet, consectetur adipisicing
elit, \tsed \rdo eiusmod tempor incididunt ut labore et dolore magna
\xf7aliqua.'''

    # The helper pattern/predicate previously paired with each entry was
    # never used by the loop, so only the functions are listed.
    candidates = [
        regsub_p1, regsub_p2, regsub_p3, regsub_p4,
        genexp_p1, genexp_p2, genexp_p3, genexp_p4,
        filter_p1, filter_p2, filter_p3, filter_p4,
        genexp_f1, genexp_f2, genexp_f3, genexp_f4,
        genexp_f5, genexp_f6, genexp_f7, genexp_f8,
    ]

    for func in candidates:
        name = func.__name__
        # The sample is embedded as a literal in the timed statement; only
        # the function itself is imported in the setup.
        timer = timeit.Timer('%s(%r)' % (name, sample_in),
                             'from __main__ import %s' % (name,))
        best = min(timer.repeat(3, 100000))
        is_correct = func(sample_in) == expected_out
        # One pre-formatted argument keeps the output line the same whether
        # print is a statement or a function.
        print('%s %s %s' % (name, best, is_correct))

Python point location of intersect between two lines	0	Feb 28, 2018
Unicode characters, XML/RSS	1	Jul 31, 2008
Regex for unicode letter characters	4	Jan 10, 2009
Question about Tashaphyne package in python	1	Mar 2, 2013
Trouble with UnicodeEncodeError and email	0	Jan 8, 2014
Python code problem	2	Apr 23, 2023
Unicode: matching a word and unaccenting characters	2	Nov 14, 2007
ChatBot	4	Jan 19, 2021

Benchmarking stripping of Unicode characters which are invalid XML

Alex Willmer

Ask a Question

Similar Threads

Members online

Forum statistics

Latest Threads