2

Can anyone suggest the best approach for converting HTML to XML using Java? Is there an API available for that? The HTML might also contain JavaScript code.

I have tried below code:

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.jdom.output.XMLOutputter;
import java.io.IOException;

/**
 * Converts the HTML file {@code D:\Second.html} into XML and writes the result
 * both to standard output and to {@code D:\Second.xml}.
 *
 * <p>The HTML is parsed through JDOM's {@code SAXBuilder} configured with the
 * {@code org.ccil.cowan.tagsoup.Parser} SAX driver, and serialized with
 * {@code XMLOutputter} using the {@code iso-8859-2} encoding.
 */
class HTML2XML {
    /**
     * Runs the conversion.
     *
     * @param args ignored
     * @throws JDOMException if the document cannot be built from the input
     */
    public static void main(String args[]) throws JDOMException {
        // try-with-resources closes the reader on every path, including when
        // the parse throws JDOMException.
        try (BufferedReader brInHtml = new BufferedReader(new FileReader("D:\\Second.html"))) {
            SAXBuilder saxBuilder = new SAXBuilder(
                    "org.ccil.cowan.tagsoup.Parser", false);
            org.jdom.Document jdomDocument = saxBuilder.build(brInHtml);

            XMLOutputter outputter = new XMLOutputter();
            org.jdom.output.Format newFormat = outputter.getFormat();
            String encoding = "iso-8859-2";
            newFormat.setEncoding(encoding);
            outputter.setFormat(newFormat);

            outputter.output(jdomDocument, System.out);

            // Hand the outputter a byte stream rather than a FileWriter: a
            // FileWriter encodes with the platform default charset, which can
            // disagree with the encoding declared in the XML prolog above.
            // The output file is opened only after a successful parse so a
            // failed parse does not truncate an existing file.
            try (FileOutputStream fosOutXml = new FileOutputStream("D:\\Second.xml")) {
                outputter.output(jdomDocument, fosOutXml);
            }
        } catch (IOException e) {
            // Report the failure instead of silently swallowing it.
            System.err.println("HTML to XML conversion failed: " + e);
        } finally {
            System.out.flush();
        }
    }
}

But it's not working as expected.

Clyde Lobo
  • 9,126
  • 7
  • 34
  • 61
suresh
  • 35
  • 1
  • 1
  • 4
  • Do you mean XHTML? And what about this Javascript code, what do you want to do with that? – GolezTrol Oct 21 '13 at 08:49
  • I have to convert a normal HTML file to XML – suresh Oct 21 '13 at 08:52
  • Do you need to convert them *to* XHTML? XHTML is an XML representation of HTML. 'Just' XML can be anything. – GolezTrol Oct 21 '13 at 08:54
  • Have you tried http://jtidy.sourceforge.net/? – Clyde Lobo Oct 21 '13 at 08:56
  • Otherwise you can just embed the entire HTML document into a single XML element, as proven [in this question](http://stackoverflow.com/questions/4412395/is-it-possible-to-insert-html-content-in-xml-document). That is probably not what you want, but we need more info. – GolezTrol Oct 21 '13 at 08:56

3 Answers

3

Try jTidy

JTidy can be used as a tool for cleaning up malformed and faulty HTML

Clyde Lobo
  • 9,126
  • 7
  • 34
  • 61
2

If you want to parse HTML, then rather than converting HTML to XML you can use an HTML parser: http://www.mkyong.com/java/jsoup-html-parser-hello-world-examples/ or http://htmlparser.sourceforge.net/javadoc/doc-files/using.html. I hope it helps you.

Rajj
  • 101
  • 7
1

HTML is not the same as XML unless it is conforming XHTML or HTML5 in XML mode.

I suggest using an HTML parser to read the HTML and then transform it to XML – or process it directly.

Ahsan Shah
  • 3,931
  • 1
  • 34
  • 48