I
IamIan
I am using SAX to parse XML that has numeric html entities I need to
convert and feed to JavaScript as part of a CGI. I can get the
characters to print correctly, but not without being surrounded by
linebreaks:
from xml.sax import make_parser
from xml.sax.handler import ContentHandler
import htmlentitydefs, re
def unescape_charref(ref):
name = ref[2:-1]
base = 10
if name.startswith("x"):
name = name[1:]
base = 16
return unichr(int(name, base))
def replace_entities(match):
ent = match.group()
if ent[1] == "#":
return unescape_charref(ent)
repl = htmlentitydefs.name2codepoint.get(ent[1:-1])
if repl is not None:
repl = unichr(repl)
else:
repl = ent
return repl
def unescape(data):
return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
class newsHandler(ContentHandler):
def __init__(self):
self.isNews = 0
def startElement(self, name, attrs):
if name == 'title':
self.isNews = 1
def characters(self, ch):
if self.is ch = unescape(ch)
print ch
def endElement(self, name):
if name == 'title':
self.isNews = 0
parser = make_parser()
parser.setContentHandler(newsHandler())
parser.parse('http://www.some.com/rss/rss.xml')
For a line like 'Mark à Capbreton'
my results print as:
'Mark
à
Capbreton'
Is this another SAX quirk? I've already had to hack my way around SAX
not being able to split results on a colon. No matter if I try strip,
etc the results are always the same: newlines surrounding the html
entities. I'm using version 2.3.5 and need to stick to the standard
libraries. Thanks.
convert and feed to JavaScript as part of a CGI. I can get the
characters to print correctly, but not without being surrounded by
linebreaks:
from xml.sax import make_parser
from xml.sax.handler import ContentHandler
import htmlentitydefs, re
def unescape_charref(ref):
name = ref[2:-1]
base = 10
if name.startswith("x"):
name = name[1:]
base = 16
return unichr(int(name, base))
def replace_entities(match):
ent = match.group()
if ent[1] == "#":
return unescape_charref(ent)
repl = htmlentitydefs.name2codepoint.get(ent[1:-1])
if repl is not None:
repl = unichr(repl)
else:
repl = ent
return repl
def unescape(data):
return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
class newsHandler(ContentHandler):
def __init__(self):
self.isNews = 0
def startElement(self, name, attrs):
if name == 'title':
self.isNews = 1
def characters(self, ch):
if self.is ch = unescape(ch)
print ch
def endElement(self, name):
if name == 'title':
self.isNews = 0
parser = make_parser()
parser.setContentHandler(newsHandler())
parser.parse('http://www.some.com/rss/rss.xml')
For a line like 'Mark à Capbreton'
my results print as:
'Mark
à
Capbreton'
Is this another SAX quirk? I've already had to hack my way around SAX
not being able to split results on a colon. No matter if I try strip,
etc the results are always the same: newlines surrounding the html
entities. I'm using version 2.3.5 and need to stick to the standard
libraries. Thanks.