So here is the plan.
- Get the sourcecode from a weburl's html page e.g. https://stackoverflow.com
- Search through it with a pattern e.g. for links
- Get a hashmap/list of the results in the correct order of appearance
In PHP very simple to make, but it seems to me impossible with native Java.
How it would look like in PHP:
<?php
//the html code, maybe multilined
$htmlCode = file_get_contents("https://stackoverflow.com");
$replacement = Array("\n", "\cr", "\r", "\c", "\t");
//to make it single lined like a long string
$htmlCode = str_replace($replacement, "", $htmlCode);
//Regex part
$pattern = '#<a href="(.*)">.*</a>#siU';
preg_match_all($pattern, $htmlCode, $results, PREG_SET_ORDER);
//Print it out
echo "<pre>Results: ".print_r($results,true)."</pre>";
?>
Results would look like:
Array = ( [0] = "http://abc.net", [1] = "http://test.com", ... );
How can I do the same in Java?
(I already tried it my own with a way I thought how it could work, but it I think it's to overblown. Did it with JerichoHTMLParser.)
Init.java
package test.java.regex;
import java.awt.*;
import java.awt.event.*;
import javax.swing.*;
import net.htmlparser.jericho.*;
import java.net.*;
import java.io.*;
import java.util.*;
import java.util.regex.*;
public class init extends JFrame {
private JLabel lblKeyword;
private JTextField keyword;
private JButton exec;
private JScrollPane sp;
private JTextArea output;
private Pattern pattern;
private Matcher matcher;
private static final String thePATTERN = "<a href=\"(http://[^\"]+)\".*</a>";
public init(){
this.pattern = Pattern.compile(thePATTERN);
this.setTitle("HTML Element Link Position");
this.setResizable(true);
this.setLayout(new GridBagLayout());
GridBagConstraints gbc;
this.lblKeyword = new JLabel("Search for:");
gbc=makeGBC(0, 0, 1, 1);
gbc.anchor = GridBagConstraints.WEST;
this.add(this.lblKeyword,gbc);
this.keyword = new JTextField();
this.keyword.setText("test");
gbc=makeGBC(0, 1, 1, 1);
gbc.fill=GridBagConstraints.HORIZONTAL;
this.add(this.keyword, gbc);
this.exec = new JButton("Execute");
gbc=makeGBC(0, 2, 0, 1);
gbc.anchor = GridBagConstraints.EAST;
gbc.fill=GridBagConstraints.BOTH;
gbc.weightx=1.0;
gbc.weighty=0.1;
this.add(this.exec, gbc);
this.output = new JTextArea();
gbc=makeGBC(0, 3, 1, 2);
gbc.fill=GridBagConstraints.BOTH;
gbc.weightx=1.0;
gbc.weighty=1.0;
this.add(this.output, gbc);
this.sp = new JScrollPane(this.output);
this.sp.setPreferredSize(new Dimension(500,100));
gbc = makeGBC(0, 3, 1, 1);
gbc.fill=GridBagConstraints.BOTH;
gbc.weightx=1.0;
gbc.weighty=1.0;
this.add(this.sp, gbc);
this.exec.addActionListener(
new ActionListener() {
public void actionPerformed(ActionEvent e) {
String pos = getPosition();
System.out.println("Position: "+pos);
//output.setText( pos );
}
});
this.addWindowListener(new WindowAdapter() {
public void windowClosing(WindowEvent e) {
System.exit(0);
}
});
this.setPreferredSize(new Dimension(700,600));
this.pack();
this.setLocation
(
(Toolkit.getDefaultToolkit().getScreenSize().width-this.getWidth())/2,
(Toolkit.getDefaultToolkit().getScreenSize().height-this.getHeight())/2
);
this.setVisible(true);
}
private String getPosition()
{
String urlString = "";
String result="";
if( !this.keyword.getText().isEmpty() )
{
URL url;
URLConnection uc;
StringBuilder parsedContentFromUrl = new StringBuilder();
urlString = "http://stackoverflow.com";
MicrosoftConditionalCommentTagTypes.register();
PHPTagTypes.register();
PHPTagTypes.PHP_SHORT.deregister();
MasonTagTypes.register();
try{
url = new URL( urlString );
uc = url.openConnection();
uc.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20100101 Firefox/5.0");
uc.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
uc.connect();
uc.getInputStream();
BufferedInputStream in = new BufferedInputStream(uc.getInputStream());
int ch;
while ((ch = in.read()) != -1) {
parsedContentFromUrl.append((char) ch);
}
Source source = new Source( parsedContentFromUrl );
result = source.toString();
this.output.setText( result );
matcher = this.pattern.matcher(result);
boolean matches = matcher.matches();
if( matches ){
this.output.setText( "found" );
}
// BufferedReader inl=null;
// inl = new BufferedReader( new InputStreamReader(uc.getInputStream()) );
// StringBuffer sb = new StringBuffer("");
// String line = "";
// //String NL = System.getProperty("line.separator");
// while ((line = inl.readLine()) != null) {
// sb.append(line);
// }
// inl.close();
// result = sb.toString();
// BufferedInputStream in1 = new BufferedInputStream(uc.getInputStream());
// ByteArrayOutputStream baos = new ByteArrayOutputStream();
// int ch1;
// while ((ch1 = in1.read()) != -1) {
// baos.write((byte)ch1);
// }
// baos.close();
// String st = new String(baos.toByteArray(), "UTF-8");
//result = source.getRenderer().toString();
}
catch (Exception e) {
result = e.toString();
}
//
return result;
}else{
return "No keyword given";
}
}
private GridBagConstraints makeGBC(int gx, int gy, int gw, int gh)
{
GridBagConstraints gbc=new GridBagConstraints();
gbc.gridx=gx;
gbc.gridy=gy;
gbc.gridwidth=gw;
gbc.gridheight=gh;
gbc.fill=GridBagConstraints.NONE;
gbc.weightx=0;
gbc.weighty=0;
gbc.anchor=GridBagConstraints.CENTER;
gbc.insets=new Insets(2,2,2,2);
return gbc;
}
public static void main(String[] args) {
try {
for (LookAndFeelInfo laf : UIManager.getInstalledLookAndFeels()) {
if ("Nimbus".equals(laf.getName())) {
UIManager.setLookAndFeel(laf.getClassName());
break;
}
}
} catch (Exception e) {
// If Nimbus is not available, you can set the GUI to another look and feel.
}
new init();
}
}
Thanks in advance for any help. :)