D
Dean A. Hoover
I am writing a parser for xml that will not have
an associated DTD. I want to be able to handle
certain character references (e.g., &copy;) in
the program.
When I run the following against a chunk of xml
containing &copy;, I get the following:
org.xml.sax.SAXParseException: Reference to undefined entity "&copy;".
at org.apache.crimson.parser.Parser2.fatal(Parser2.java:3182)
at org.apache.crimson.parser.Parser2.fatal(Parser2.java:3176)
at
org.apache.crimson.parser.Parser2.expandEntityInContent(Parser2.java:2513)
at
org.apache.crimson.parser.Parser2.maybeReferenceInContent(Parser2.java:2422)
at org.apache.crimson.parser.Parser2.content(Parser2.java:1833)
at org.apache.crimson.parser.Parser2.maybeElement(Parser2.java:1507)
at org.apache.crimson.parser.Parser2.content(Parser2.java:1779)
at org.apache.crimson.parser.Parser2.maybeElement(Parser2.java:1507)
at org.apache.crimson.parser.Parser2.content(Parser2.java:1779)
at org.apache.crimson.parser.Parser2.maybeElement(Parser2.java:1507)
at org.apache.crimson.parser.Parser2.parseInternal(Parser2.java:500)
at org.apache.crimson.parser.Parser2.parse(Parser2.java:305)
at org.apache.crimson.parser.XMLReaderImpl.parse(XMLReaderImpl.java:442)
at javax.xml.parsers.SAXParser.parse(SAXParser.java:345)
at javax.xml.parsers.SAXParser.parse(SAXParser.java:281)
at Article.main(Article.java:18)
What can I do to catch these references in my code and output replacement
text for them?
Thanks.
Dean Hoover
Here are the two Java files:
---
import java.io.*;
import javax.xml.parsers.*;
import org.xml.sax.*;
import org.xml.sax.helpers.*;
/**
 * Command-line driver: parses the XML file named by the first argument with
 * a JAXP SAX parser and lets a {@link LoadXML} handler write its HTML
 * rendering to standard output.
 */
public class Article
{
    /**
     * Program entry point.
     *
     * @param argv command-line arguments; argv[0] is the path of the XML
     *             file to convert
     */
    public static void main(String argv[])
    {
        // Fail with a usage message instead of an
        // ArrayIndexOutOfBoundsException when no file is given.
        if (argv.length < 1)
        {
            System.err.println("usage: java Article <xml-file>");
            System.exit(1);
        }

        String file = argv[0];
        PrintWriter pw = new PrintWriter(System.out);
        DefaultHandler handler = new LoadXML(pw, LoadXML.TYPE_HTML);
        SAXParserFactory factory = SAXParserFactory.newInstance();
        try
        {
            SAXParser reader = factory.newSAXParser();
            reader.parse(new File(file), handler);
        }
        catch (Exception e)
        {
            // Top-level boundary: report any parser/IO failure to stderr.
            e.printStackTrace();
        }
        finally
        {
            // Flush on both the success and the failure path so that output
            // produced before a parse error is not lost in the writer's buffer.
            pw.flush();
        }
    }
}
---
import java.io.*;
import java.util.*;
import javax.xml.parsers.*;
import org.xml.sax.*;
import org.xml.sax.helpers.*;
public class LoadXML extends DefaultHandler
{
public static final int TYPE_HTML = 1;
public static final int TYPE_TEXT = 2;
public LoadXML
(
java.io.Writer writer,
int type
)
{
elements_ = new Stack();
writer_ = writer;
type_ = type;
}
public InputSource resolveEntity
(
String publicId,
String systemId
) throws SAXException
{
String s = "stuff";
return new InputSource(new CharArrayReader(s.toCharArray()));
}
public void startDocument() throws SAXException
{
}
public void endDocument() throws SAXException
{
}
public void startElement
(
String uri,
String localName,
String qName,
Attributes attributes
) throws SAXException
{
String elementName = qName;
elements_.push(elementName);
try
{
if (elementName.equals("p"))
{
if (type_ == TYPE_HTML)
writer_.write("<p class=\"article-text\">");
}
else if (elementName.equals("title"))
{
if (type_ == TYPE_HTML)
writer_.write("<p class=\"article-title\">");
}
else if (elementName.equals("by"))
{
if (type_ == TYPE_HTML)
writer_.write("<p class=\"article-by\">");
}
else if (elementName.equals("copyright"))
{
if (type_ == TYPE_HTML)
writer_.write("<p class=\"article-copyright\">");
}
}
catch (IOException e)
{
throw new SAXException(e);
}
}
public void endElement
(
String uri,
String localName,
String qName
) throws SAXException
{
String elementName = qName;
elements_.pop();
try
{
if (type_ == TYPE_HTML)
{
if (elementName.equals("p") || elementName.equals("title") ||
elementName.equals("by") || elementName.equals("copyright"))
{
writer_.write("</p>\n");
}
else if (elementName.equals("br"))
{
writer_.write("<br/>\n");
}
}
}
catch (IOException e)
{
throw new SAXException(e);
}
}
public void characters
(
char[] ch,
int start,
int length
) throws SAXException
{
try
{
String content = new String(ch, start, length);
String top = (String)elements_.peek();
String text =
content.replaceAll("\n", " ").replaceAll(" +", " ").trim();
if (text.length() == 0)
return;
if (type_ == TYPE_HTML)
{
if (top.equals("p") || top.equals("title") ||
top.equals("by") || top.equals("copyright"))
writer_.write(text);
}
}
catch (IOException e)
{
throw new SAXException(e);
}
}
private Stack elements_;
private java.io.Writer writer_;
private int type_;
}
an associated DTD. I want to be able to handle
certain character references (e.g., &copy;) in
the program.
When I run the following against a chunk of xml
containing &copy;, I get the following:
org.xml.sax.SAXParseException: Reference to undefined entity "&copy;".
at org.apache.crimson.parser.Parser2.fatal(Parser2.java:3182)
at org.apache.crimson.parser.Parser2.fatal(Parser2.java:3176)
at
org.apache.crimson.parser.Parser2.expandEntityInContent(Parser2.java:2513)
at
org.apache.crimson.parser.Parser2.maybeReferenceInContent(Parser2.java:2422)
at org.apache.crimson.parser.Parser2.content(Parser2.java:1833)
at org.apache.crimson.parser.Parser2.maybeElement(Parser2.java:1507)
at org.apache.crimson.parser.Parser2.content(Parser2.java:1779)
at org.apache.crimson.parser.Parser2.maybeElement(Parser2.java:1507)
at org.apache.crimson.parser.Parser2.content(Parser2.java:1779)
at org.apache.crimson.parser.Parser2.maybeElement(Parser2.java:1507)
at org.apache.crimson.parser.Parser2.parseInternal(Parser2.java:500)
at org.apache.crimson.parser.Parser2.parse(Parser2.java:305)
at org.apache.crimson.parser.XMLReaderImpl.parse(XMLReaderImpl.java:442)
at javax.xml.parsers.SAXParser.parse(SAXParser.java:345)
at javax.xml.parsers.SAXParser.parse(SAXParser.java:281)
at Article.main(Article.java:18)
What can I do to catch these references in my code and output replacement
text for them?
Thanks.
Dean Hoover
Here are the two Java files:
---
import java.io.*;
import javax.xml.parsers.*;
import org.xml.sax.*;
import org.xml.sax.helpers.*;
/**
 * Command-line driver: parses the XML file named by the first argument with
 * a JAXP SAX parser and lets a {@link LoadXML} handler write its HTML
 * rendering to standard output.
 */
public class Article
{
    /**
     * Program entry point.
     *
     * @param argv command-line arguments; argv[0] is the path of the XML
     *             file to convert
     */
    public static void main(String argv[])
    {
        // Fail with a usage message instead of an
        // ArrayIndexOutOfBoundsException when no file is given.
        if (argv.length < 1)
        {
            System.err.println("usage: java Article <xml-file>");
            System.exit(1);
        }

        String file = argv[0];
        PrintWriter pw = new PrintWriter(System.out);
        DefaultHandler handler = new LoadXML(pw, LoadXML.TYPE_HTML);
        SAXParserFactory factory = SAXParserFactory.newInstance();
        try
        {
            SAXParser reader = factory.newSAXParser();
            reader.parse(new File(file), handler);
        }
        catch (Exception e)
        {
            // Top-level boundary: report any parser/IO failure to stderr.
            e.printStackTrace();
        }
        finally
        {
            // Flush on both the success and the failure path so that output
            // produced before a parse error is not lost in the writer's buffer.
            pw.flush();
        }
    }
}
---
import java.io.*;
import java.util.*;
import javax.xml.parsers.*;
import org.xml.sax.*;
import org.xml.sax.helpers.*;
public class LoadXML extends DefaultHandler
{
public static final int TYPE_HTML = 1;
public static final int TYPE_TEXT = 2;
public LoadXML
(
java.io.Writer writer,
int type
)
{
elements_ = new Stack();
writer_ = writer;
type_ = type;
}
public InputSource resolveEntity
(
String publicId,
String systemId
) throws SAXException
{
String s = "stuff";
return new InputSource(new CharArrayReader(s.toCharArray()));
}
public void startDocument() throws SAXException
{
}
public void endDocument() throws SAXException
{
}
public void startElement
(
String uri,
String localName,
String qName,
Attributes attributes
) throws SAXException
{
String elementName = qName;
elements_.push(elementName);
try
{
if (elementName.equals("p"))
{
if (type_ == TYPE_HTML)
writer_.write("<p class=\"article-text\">");
}
else if (elementName.equals("title"))
{
if (type_ == TYPE_HTML)
writer_.write("<p class=\"article-title\">");
}
else if (elementName.equals("by"))
{
if (type_ == TYPE_HTML)
writer_.write("<p class=\"article-by\">");
}
else if (elementName.equals("copyright"))
{
if (type_ == TYPE_HTML)
writer_.write("<p class=\"article-copyright\">");
}
}
catch (IOException e)
{
throw new SAXException(e);
}
}
public void endElement
(
String uri,
String localName,
String qName
) throws SAXException
{
String elementName = qName;
elements_.pop();
try
{
if (type_ == TYPE_HTML)
{
if (elementName.equals("p") || elementName.equals("title") ||
elementName.equals("by") || elementName.equals("copyright"))
{
writer_.write("</p>\n");
}
else if (elementName.equals("br"))
{
writer_.write("<br/>\n");
}
}
}
catch (IOException e)
{
throw new SAXException(e);
}
}
public void characters
(
char[] ch,
int start,
int length
) throws SAXException
{
try
{
String content = new String(ch, start, length);
String top = (String)elements_.peek();
String text =
content.replaceAll("\n", " ").replaceAll(" +", " ").trim();
if (text.length() == 0)
return;
if (type_ == TYPE_HTML)
{
if (top.equals("p") || top.equals("title") ||
top.equals("by") || top.equals("copyright"))
writer_.write(text);
}
}
catch (IOException e)
{
throw new SAXException(e);
}
}
private Stack elements_;
private java.io.Writer writer_;
private int type_;
}