package com.mockrunner.util.web;

import java.io.StringReader;
import java.util.List;

import org.apache.xerces.parsers.DOMParser;
import org.cyberneko.html.HTMLConfiguration;
import org.jdom.Element;
import org.jdom.input.DOMBuilder;
import org.jdom.output.XMLOutputter;
import org.xml.sax.InputSource;

import com.mockrunner.base.NestedApplicationException;

/**
 * Util class for HTML and XML parsing.
 */
public class XmlUtil
{
    /**
     * Convenience method for HTML fragments. Returns the first child
     * element of the body as JDOM <code>Element</code>.
     *
     * If an HTML document looks like this:
     * <pre>
     * &lt;html&gt;
     * &lt;head&gt;
     * &lt;/head&gt;
     * &lt;body&gt;
     * &lt;h1&gt;
     * &lt;/h1&gt;
     * &lt;/body&gt;
     * &lt;/html&gt;
     * </pre>
     *
     * the method returns the h1 tag as <code>Element</code>.
     * The body element is looked up as a direct child of the root
     * element, first as <code>BODY</code>, then as <code>body</code>
     * and finally with any other casing (e.g. <code>Body</code>).
     * @param document the <code>org.jdom.Document</code>
     * @return the first child <code>Element</code> of the body or
     *         <code>null</code>, if there is no body element or the
     *         body has no child elements
     */
    public static Element getBodyFragmentFromJDOMDocument(org.jdom.Document document)
    {
        Element root = document.getRootElement();
        Element body = root.getChild("BODY");
        if(null == body)
        {
            body = root.getChild("body");
        }
        if(null == body)
        {
            // The parser returned by getHTMLParser() keeps tag names in the
            // case used in the source, so mixed-case tags have to be found, too.
            body = findChildIgnoreCase(root, "body");
        }
        if(null == body)
        {
            return null;
        }
        List children = body.getChildren();
        if(null != children && children.size() > 0)
        {
            return (Element)children.get(0);
        }
        return null;
    }

    /**
     * Returns the first direct child of the specified element whose
     * name equals the specified name ignoring case. Like
     * <code>Element.getChild(String)</code>, only children that are
     * in no namespace are considered.
     */
    private static Element findChildIgnoreCase(Element parent, String name)
    {
        List children = parent.getChildren();
        if(null == children)
        {
            return null;
        }
        for(int ii = 0; ii < children.size(); ii++)
        {
            Element child = (Element)children.get(ii);
            boolean noNamespace = "".equals(child.getNamespaceURI());
            if(noNamespace && name.equalsIgnoreCase(child.getName()))
            {
                return child;
            }
        }
        return null;
    }

    /**
     * @deprecated use {@link #getBodyFragmentFromJDOMDocument}
     */
    public static Element getBodyFragmentJDOMDocument(org.jdom.Document document)
    {
        return getBodyFragmentFromJDOMDocument(document);
    }

    /**
     * Returns the document's XML content as a string.
     * @param document the <code>org.jdom.Document</code>
     * @return the output as string
     * @throws NestedApplicationException if the output fails
     */
    public static String createStringFromJDOMDocument(org.jdom.Document document)
    {
        try
        {
            return new XMLOutputter().outputString(document);
        }
        catch(Exception exc)
        {
            throw new NestedApplicationException(exc);
        }
    }

    /**
     * Creates a JDOM <code>Document</code> from a specified
     * W3C <code>Document</code>.
     * @param document the <code>org.w3c.dom.Document</code>
     * @return the <code>org.jdom.Document</code>
     */
    public static org.jdom.Document createJDOMDocument(org.w3c.dom.Document document)
    {
        return new DOMBuilder().build(document);
    }

    /**
     * Returns a parser suitable for parsing HTML documents.
     * The NekoHTML parser is used. The element name handling
     * is set to <code>match</code> and the attribute name handling
     * is set to <code>no-change</code> in order to preserve the
     * case of the names.
     * NOTE(review): earlier documentation of this method also
     * mentioned that namespace processing is disabled, but no
     * such setting is made here - confirm if it is required.
     * This method is used by {@link #parseHTML}.
     * @return instance of <code>org.apache.xerces.parsers.DOMParser</code>
     *         with Neko configuration
     * @throws NestedApplicationException if the parser cannot be configured
     */
    public static DOMParser getHTMLParser()
    {
        try
        {
            HTMLConfiguration config = new HTMLConfiguration();
            config.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
            config.setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change");
            return new DOMParser(config);
        }
        catch(Exception exc)
        {
            throw new NestedApplicationException(exc);
        }
    }

    /**
     * Parses the specified HTML with the NekoHTML parser.
     * If you want to use another HTML parser or configure
     * the NekoHTML parser with special features, you can use
     * the <code>parse</code> method.
     * @param source the HTML as String
     * @return the parsed document as org.w3c.dom.Document
     * @throws NestedApplicationException if the parser cannot be
     *         created or the parsing fails
     */
    public static org.w3c.dom.Document parseHTML(String source)
    {
        // getHTMLParser() and parse() already wrap every exception in a
        // NestedApplicationException, so there is no need to catch and
        // wrap it a second time here.
        return parse(getHTMLParser(), source);
    }

    /**
     * Parses the specified XML with the specified parser.
     * The main purpose of this method is to use the NekoHTML
     * parser with custom features and properties. If you can live
     * with the settings provided by Mockrunner, you can use
     * {@link #parseHTML}.
     * @param parser the parser (must extend
     *        <code>org.apache.xerces.parsers.DOMParser</code>),
     *        e.g. the one returned by {@link #getHTMLParser}
     * @param source the XML as String
     * @return the parsed document as org.w3c.dom.Document
     * @throws NestedApplicationException if the parsing fails
     */
    public static org.w3c.dom.Document parse(DOMParser parser, String source)
    {
        try
        {
            parser.parse(new InputSource(new StringReader(source)));
            return parser.getDocument();
        }
        catch(Exception exc)
        {
            throw new NestedApplicationException(exc);
        }
    }
}