I'm trying to create a program that will look for any type of URL in a text file, for example hxxp://www.testsite.com/images/logo.png. The following code is my attempt from augmenting online tutorials,(the main one is referenced on top of my code.) but I haven't been able to catch any/all URL's including those embedded in within html tags. I'd appreciate any help or suggestions on what I could try. Thanks.
/* Reference: http://www.vogella.com/tutorials/JavaRegularExpressions/article.html
*/
package de.vogella.regex.weblinks;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class LinkGetter {
//variables
private Pattern htmltag;
private Pattern link;
public LinkGetter() {
//Patterns.
htmltag = Pattern.compile("<a\\b[^>]*href=\"[^>]*>(.*?)</a>");
link = Pattern.compile(
"((https?|ftp|gopher|telnet|file):((//)|(\\\\))+[\\w\\d:#@%/;$()~_?\\+-=\\\\\\.&]*)");
}
public static void main(String[] args){
String filepath ="TestFile.rtf";
System.out.println(new LinkGetter().getLinks(filepath));
}
public List<String> getLinks(String filepath) {
List<String> links = new ArrayList<String>();
try {
FileReader FR = new FileReader("TestFile.rtf");
BufferedReader bufferedReader = new BufferedReader(FR);
String A;
StringBuilder builder = new StringBuilder();
while ((A = bufferedReader.readLine()) != null) {
builder.append(A);
}
Matcher tagmatch = htmltag.matcher(builder.toString());
while (tagmatch.find()) {
Matcher matcher = link.matcher(tagmatch.group());
matcher.find();
String link = matcher.group().replaceFirst("href=\"", "")
.replaceFirst("\">", "")
.replaceFirst("\"[\\s]?target=\"[a-zA-Z_0-9]*", "");
if (valid(link)) {
links.add(makeAbsolute(filepath, link));
}
bufferedReader.close();
}
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return links;
}
private boolean valid(String A) {
if (A.matches("javascript:.*|mailto:.*")) {
return false;
}
return true;
}
private String makeAbsolute(String filepath, String link) {
if (link.matches("http://.*")) {
return link;
}
else if (link.matches("/.*") || link.matches(".*$[^/]")) {
return "/" + link;
throw new RuntimeException("Cannot make absolute. File: " + filepath
+ " Link " + link);
}
}