Is it possible, while extracting the full text of an HTML page (with Tika or jsoup), to get a line break between each 'li' element?
Currently all the text comes out compacted together, with no line breaks.
Thanks
Is it possible, while extracting the full text of an HTML page (with Tika or jsoup), to get a line break between each 'li' element?
Currently all the text comes out compacted together, with no line breaks.
Thanks
Here is an improved version of Andrew Phillips's answer.
Java
package com.github.davidepastore.stackoverflow33947074;
import java.io.IOException;
import java.io.InputStream;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
/**
 * Stackoverflow 33947074: prints the text of an HTML document, starting a new
 * line before the text of every {@code li} element.
 */
public class App
{
    /** Name of the classpath resource that holds the HTML to convert. */
    private static final String RESOURCE_NAME = "file.html";

    /**
     * Loads {@code file.html} from the classpath, parses it with jsoup and prints
     * the extracted text to standard output, prefixed with {@code "Result: "}.
     * @param args Command line arguments (unused).
     * @throws IOException If the resource cannot be found or cannot be read.
     */
    public static void main( String[] args ) throws IOException {
        ClassLoader classloader = Thread.currentThread()
                .getContextClassLoader();
        // try-with-resources guarantees the stream is released even when parsing
        // fails; closing an already-closed stream is harmless.
        try (InputStream is = classloader.getResourceAsStream(RESOURCE_NAME)) {
            // getResourceAsStream signals a missing resource with null, not an exception.
            if (is == null) {
                throw new IOException("Resource not found on classpath: " + RESOURCE_NAME);
            }
            Document document = Jsoup.parse(is, "UTF-8", "");
            Element element = document.select("html").first();
            String text = getText(element);
            System.out.println("Result: " + text);
        }
    }

    /**
     * Get the custom text from the given {@link Element}.
     * <p>
     * The result is the text of all {@link TextNode}s below the element, in
     * document order, with a {@code '\n'} inserted before the content of every
     * {@code li} element.
     * @param element The {@link Element} from which get the custom text.
     * @return Returns the custom text.
     */
    private static String getText(Element element) {
        // A single builder is shared by the whole recursion so the text is not
        // re-copied at every nesting level.
        StringBuilder text = new StringBuilder();
        appendText(element, text);
        return text.toString();
    }

    /**
     * Appends the custom text of the given {@link Element} to the given builder.
     * @param element The {@link Element} whose child nodes are visited recursively.
     * @param out The builder that receives the text.
     */
    private static void appendText(Element element, StringBuilder out) {
        for (Node child : element.childNodes()) {
            if (child instanceof TextNode) {
                out.append(((TextNode) child).text());
            } else if (child instanceof Element) {
                Element childElement = (Element) child;
                // Start a new line before each list item's content.
                if (childElement.tag().getName().equalsIgnoreCase("li")) {
                    out.append("\n");
                }
                appendText(childElement, out);
            }
            // Other node kinds (anything that is neither text nor element) add no text.
        }
    }
}
file.html
<html>
<head>
<title>Try jsoup</title>
</head>
<body>
<p>This is <a href="http://jsoup.org/">jsoup</a>.</p>
<ul>
<li>First element</li>
<li><a href="#">Second element</a></li>
<li>Third element <b>Additional for third element</b></li>
</ul>
</body>
</html>
Output
Result: Try jsoup This is jsoup.
First element
Second element
Third element Additional for third element