--- src/java/org/apache/nutch/searcher/OpenSearchServlet.java.orig 2006-05-25 17:53:06.000000000 +0200
+++ src/java/org/apache/nutch/searcher/OpenSearchServlet.java 2006-05-25 18:04:22.000000000 +0200
@@ -278,14 +278,14 @@
private static void addNode(Document doc, Node parent,
String name, String text) {
Element child = doc.createElement(name);
- child.appendChild(doc.createTextNode(text));
+ child.appendChild(doc.createTextNode(getLegalXml(text)));
parent.appendChild(child);
}
private static Element addNode(Document doc, Node parent,
String ns, String name, String text) {
Element child = doc.createElementNS((String)NS_MAP.get(ns), ns+":"+name);
- child.appendChild(doc.createTextNode(text));
+ child.appendChild(doc.createTextNode(getLegalXml(text)));
parent.appendChild(child);
return child;
}
@@ -293,9 +293,50 @@
private static void addAttribute(Document doc, Element node,
String name, String value) {
Attr attribute = doc.createAttribute(name);
- attribute.setValue(value);
+ attribute.setValue(getLegalXml(value));
node.getAttributes().setNamedItem(attribute);
}
+  /*
+   * Ensure string is legal xml.
+   * First look to see if string has illegal characters. If it doesn't, just
+   * return it. Otherwise, create a new string with illegal characters removed.
+   * @param text String to verify.
+   * @return The passed text unchanged, or a new string with the illegal
+   * characters removed if any were found in the passed text.
+   * @see http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char
+   */
+  private static String getLegalXml(final String text) {
+    if (text == null) {
+      return null;
+    }
+    // Walk forward until the first char that isLegalXml rejects, if any.
+    final int length = text.length();
+    int pos = 0;
+    while (pos < length && isLegalXml(text.charAt(pos))) {
+      pos++;
+    }
+    // Reaching the end means nothing was rejected: hand back the same instance.
+    return (pos == length) ? text : createLegalXml(text);
+  }
+
+  private static String createLegalXml(final String text) { // null in, null out
+    if (text == null) { return null; }
+    StringBuffer buffer = new StringBuffer(text.length());
+    for (int i = 0; i < text.length(); i++) {
+      char c = text.charAt(i), next = i + 1 < text.length() ? text.charAt(i + 1) : 0;
+      if (c >= 0xd800 && c <= 0xdbff && next >= 0xdc00 && next <= 0xdfff) {
+        buffer.append(c).append(text.charAt(++i)); // valid pair = U+10000..U+10FFFF
+      } else if (isLegalXml(c)) { // lone surrogates and other illegal chars are dropped
+        buffer.append(c);
+      }
+    }
+    return buffer.toString();
+  }
+
+  private static boolean isLegalXml(final char c) { // XML 1.0 Char, one UTF-16 unit
+    if (c < 0x20) { return c == 0x9 || c == 0xa || c == 0xd; } // TAB, LF, CR only
+    return c <= 0xd7ff || (c >= 0xe000 && c <= 0xfffd); // no surrogates, FFFE, FFFF
+  }
}