001    package com.mockrunner.util.web;
002    
003    import java.io.StringReader;
004    import java.util.List;
005    
006    import org.apache.xerces.parsers.DOMParser;
007    import org.cyberneko.html.HTMLConfiguration;
008    import org.jdom.Element;
009    import org.jdom.input.DOMBuilder;
010    import org.jdom.output.XMLOutputter;
011    import org.xml.sax.InputSource;
012    
013    import com.mockrunner.base.NestedApplicationException;
014    
015    /**
016     * Util class for HTML and XML parsing.
017     */
018    public class XmlUtil
019    { 
020        /**
021         * Convinience method for HTML fragments. Returns the body
022         * as JDOM <code>Element</code>.
023         * 
024         * If an HTML documents looks like this:
025         * <pre>
026         * &lt;html&gt;
027         * &lt;head&gt;
028         * &lt;/head&gt;
029         * &lt;body&gt;
030         * &lt;h1&gt;
031         * &lt;/h1&gt;
032         * &lt;/body&gt;
033         * &lt;/html&gt;
034         * </pre>
035         * 
036         * the method returns the h1 tag as <code>Element</code>.
037         * @param document the <code>org.jdom.Document</code>
038         * @return the body <code>Element</code>
039         */
040        public static Element getBodyFragmentFromJDOMDocument(org.jdom.Document document)
041        {
042            Element element = document.getRootElement().getChild("BODY");
043            if(null == element)
044            {
045                element = document.getRootElement().getChild("body");
046            }
047            if(null != element)
048            {
049                List childs = element.getChildren();
050                if(null != childs && childs.size() > 0) return (Element)childs.get(0);
051            }
052            return null;
053        }
054        
055        /**
056         * @deprecated use {@link #getBodyFragmentFromJDOMDocument}
057         */
058        public static Element getBodyFragmentJDOMDocument(org.jdom.Document document)
059        {
060            return getBodyFragmentFromJDOMDocument(document);
061        }
062        
063        /**
064         * Returns the documents XML content as a string.
065         * @param document the <code>org.jdom.Document</code>
066         * @return the output as string
067         */
068        public static String createStringFromJDOMDocument(org.jdom.Document document)
069        {
070            try
071            {
072                return new XMLOutputter().outputString(document);
073            }
074            catch(Exception exc)
075            {
076                throw new NestedApplicationException(exc);
077            }
078        }
079        
080        /**
081         * Creates a JDOM <code>Document</code> from a specified
082         * W3C <code>Document</code>.
083         * @param document the <code>org.w3c.dom.Document</code>
084         * @return the <code>org.jdom.Document</code>
085         */
086        public static org.jdom.Document createJDOMDocument(org.w3c.dom.Document document)
087        {
088            return new DOMBuilder().build(document);
089        }
090        
091        /**
092         * Returns a parser suitable for parsing HTML documents.
093         * The NekoHTML parser is used with some settings to
094         * preserve case of tag names and disable namespace processing. 
095         * This method is used by {@link #parseHTML}.
096         * @return instance of <code>org.apache.xerces.parsers.DOMParser</code>
097         *         with Neko configuration
098         */
099        public static DOMParser getHTMLParser()
100        {
101            try
102            {
103                HTMLConfiguration config = new HTMLConfiguration();
104                config.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
105                config.setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change");
106                DOMParser parser = new DOMParser(config);
107                return parser;
108            }
109            catch(Exception exc)
110            {
111                throw new NestedApplicationException(exc);
112            }
113        }
114        
115        /**
116         * Parses the specified HTML with the NekoHTML parser.
117         * If you want to use another HTML parser or configure
118         * the NekoHTML parser with special features, you can use
119         * the <code>parse</code> method.
120         * @param source the HTML as String
121         * @return the parsed document as org.w3c.dom.Document
122         */
123        public static org.w3c.dom.Document parseHTML(String source)
124        {
125            try
126            {
127                return parse(getHTMLParser(), source);
128            }
129            catch(Exception exc)
130            {
131                throw new NestedApplicationException(exc);
132            }
133        }
134        
135        /**
136         * Parses the specified XML with the specified parser.
137         * The main purpose of this method is to use the NekoHTML 
138         * parser with custom features and properties. If you can live
139         * with the settings provided by Mockrunner, you can use 
140         * {@link #parseHTML}.
141         * @param parser the parser (must extend 
142         *               <code>org.apache.xerces.parsers.DOMParser</code>), 
143         *               e.g. the one returned by {@link #getHTMLParser}
144         * @param source the XML as String
145         * @return the parsed document as org.w3c.dom.Document
146         */
147        public static org.w3c.dom.Document parse(DOMParser parser, String source)
148        {
149            try
150            {
151                parser.parse(new InputSource(new StringReader(source)));
152                return parser.getDocument();
153            }
154            catch(Exception exc)
155            {
156                throw new NestedApplicationException(exc);
157            }
158        }
159    }