simple ElementTree based parser that allows entity definition map


R

Robin Becker

I'm tasked with writing a 'simple' ElementTree based parser with support for
unknown entities eg &foo;.

This code derived from FL's old documentation fails in both python 2 and 3.
########################
import xml.etree.ElementTree as ET
try:
ascii
except:
from future_builtins import ascii

class EchoTarget:
def start(self, tag, attrib):
print("start %s %s"%(tag, ascii(attrib)))
def end(self, tag):
print("end %s"%tag)
def data(self, data):
print("data %s" % ascii(data))
def close(self):
print("close")

def __getattr__(self,a):
print('target attempting to get attribute %s' % a)

target = EchoTarget()
parser = ET.XMLParser(target=target)
parser.entity['foo'] = b'AAAA&fum;BBBB'
parser.entity['fum'] = b'CCCC'
print("parser.entity=%s" % ascii(parser.entity))
parser.feed("<element>some text &foo;</element>")
parser.feed("")
parser.close()
########################

The entity value doesn't seem to get referenced.



I tried this derived from
http://stackoverflow.com/questions/...tree-support-for-parsing-unknown-xml-entities

########################
__all__=tuple(filter(None,'''
Xml2TT
EntityMap
'''.split()))
import xml.etree.ElementTree as ET
try:
from StringIO import StringIO
except ImportError:
from io import StringIO

class EntityMap(dict):
def __getitem__(self,key):
try:
r = dict.__getitem__(self,key)
except:
r = '&amp;' + key +';'
return r

class Xml2TT:
'''
create a callable object that can turns xml into a tupletree
if mutable is set to True then it's really a list tree
'''
def __init__(self,mutable=False,entityMap=None):
self._mutable = mutable
self._parser = parser = ET.XMLParser()
parser.parser.UseForeignDTD(True)
parser.entity = self._entityMap = entityMap

def __call__(self,xml):
r = self._mtt(ET.ElementTree().parse(StringIO(xml.strip()),
parser=self._parser))
return r[0]

def _mtt(self,node):
t = [node.text] if node.text else []
e = t.extend
for c in node:
e(self._mtt(c))
t = (node.tag,node.attrib,t,None)
if self._mutable:
t = list(t)
return [t,node.tail] if node.tail else [t]

if __name__=='__main__':
print(repr(Xml2TT()('<a>aaaaa<b>bbbb<c ca="123"/>22</b></a>')))
print(repr(Xml2TT()('<a>aaaaa=&amp;=bbbbb&lt; &gt;</a>')))
print(repr(Xml2TT(entityMap=EntityMap({'mu': '…','foo':
'AAA&fum;BBB','fum':'CCC'}))('<a>amp=&amp; moo=&moo; lt=&lt; gt=&gt; mu=&mu;
foo=&foo;</a>')))
########################

and it sort of works in python2, fails in python3 with

AttributeError: 'xml.etree.ElementTree.XMLParser' object has no attribute
'parser'

Even in python 2 there's a subtle bug as the output is

('a', {}, ['aaaaa', ('b', {}, ['bbbb', ('c', {'ca': '123'}, [], None), '22'],
None)], None)
('a', {}, ['aaaaa=&=bbbbb< >'], None)
('a', {}, [u'amp=& moo=&amp;moo; lt=< gt=> mu=… foo=AAA&fum;BBB'], None)

ie the result of the &foo; lookup is not re-parsed so &fum; is not translated.

Is there a way to get a simple ElementTree based parser that can do what I want?
I have several hundred entities and the size of the DTD would probably be larger
than 99% of the strings I need to parse. I think I can live with the
non-reparsing of the map output, but can I get Python 3 to do the UseForeignDTD
thing?
 
Ad

Advertisements


Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only take a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Top