0

Sorry to bother you guys again, but here's my dilemma.

There must be a "better" regular expression to identify HTML links in a paragraph of text (there can be more than one link in the text). How do I extract all the links and anchor them in JavaScript?

My attempt (in javascript) is like this:

// Pattern source for http/https/ftp URLs: an optional "www." prefix, then a
// host name, "localhost" or a dotted IPv4 address, followed by an optional
// port, path, query string and fragment.
var urlPattern = "(https?|ftp)://(www\\.)?(((([a-zA-Z0-9.-]+\\.){1,}[a-zA-Z]{2,4}|localhost))|((\\d{1,3}\\.){3}(\\d{1,3})))(:(\\d+))?(/([a-zA-Z0-9-._~!$&'()*+,;=:@/]|%[0-9A-F]{2})*)?(\\?([a-zA-Z0-9-._~!$&'()*+,;=:/?@]|%[0-9A-F]{2})*)?(#([a-zA-Z0-9._-]|%[0-9A-F]{2})*)?";

/**
 * Runs urlPattern against the given text.
 *
 * The RegExp is built without flags, so String.prototype.match stops at the
 * first URL and returns that match together with its capture groups.
 *
 * @param {string} s - Text to search.
 * @returns {Array|null} The match array, or null when nothing matches.
 */
function extractURLs(s) {
    var matcher = new RegExp(urlPattern);
    return s.match(matcher);
}

//s is of type String

//For testing...
// Sample paragraph with two links (the first one is split by a space in this post).
var text = "Check this video out http://ww w.youtube.com/watch?v=y3U3R3b1dOg or http://www.youtube.com/watch?v=sX6Vm0MoPCY";
// Show whatever extractURLs returns for the sample text.
alert(extractURLs(text));

(Spaces have been deliberately added to the hyperlink here to allow posting this question on SO.) Result: I only get the first hyperlink and not the second one. Has anybody done something similar or better that I can use?

Thanks in advance.

Buhake Sindi
  • 87,898
  • 29
  • 167
  • 228
  • http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags This should answer all your questions. – Oscar Kilhed Jan 20 '10 at 08:31
  • Don't worry...I've read that sweet poetry before but Alsciende provided a correct answer for me. – Buhake Sindi Jan 20 '10 at 09:04

3 Answers

2

Use the "g" modifier:

/**
 * Collects every URL in the given text.
 *
 * The "g" flag makes String.prototype.match return all matches of
 * urlPattern rather than only the first one.
 *
 * @param {string} s - Text to search.
 * @returns {string[]|null} All matched URLs, or null when none are found.
 */
function extractURLs(s) {
    var globalMatcher = new RegExp(urlPattern, "g");
    return s.match(globalMatcher);
}
Alsciende
  • 26,583
  • 9
  • 51
  • 67
0
// Pattern source for http/https/ftp URLs: an optional "www." prefix, then a
// host name, "localhost" or a dotted IPv4 address, followed by an optional
// port, path, query string and fragment.
var urlPattern = "(https?|ftp)://(www\\.)?(((([a-zA-Z0-9.-]+\\.){1,}[a-zA-Z]{2,4}|localhost))|((\\d{1,3}\\.){3}(\\d{1,3})))(:(\\d+))?(/([a-zA-Z0-9-._~!$&'()*+,;=:@/]|%[0-9A-F]{2})*)?(\\?([a-zA-Z0-9-._~!$&'()*+,;=:/?@]|%[0-9A-F]{2})*)?(#([a-zA-Z0-9._-]|%[0-9A-F]{2})*)?";


/**
 * Extracts every URL found in the given text.
 *
 * @param {string} s - Text to search.
 * @returns {string[]|null} All matched URLs, or null when none are found.
 */
function extractURLs(s) {
    // The "g" flag is required: without it match() returns only the first
    // URL followed by its capture groups, so index 1 would be "http".
    return s.match(new RegExp(urlPattern, "g"));
}

var text = "Check this video out http://www.youtube.com/watch?v=y3U3R3b1dOg or http://www.youtube.com/watch?v=sX6Vm0MoPCY";
var results = extractURLs(text);

// The results are already the extracted links; display them directly
// instead of running them through extractURLs a second time.
alert(results[0] + ", " + results[1]);
stepanian
  • 11,373
  • 8
  • 43
  • 63
0

It is better to write it as:

// Pre-compiled URL matcher, built once and shared by every call.
// In a regex literal each "/" must be escaped as "\/", and escapes use a
// single backslash (the doubled "\\" form is only needed inside a string
// passed to new RegExp).
var urlPattern = /(https?|ftp):\/\/(www\.)?(((([a-zA-Z0-9.-]+\.){1,}[a-zA-Z]{2,4}|localhost))|((\d{1,3}\.){3}(\d{1,3})))(:(\d+))?(\/([a-zA-Z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)?(\?([a-zA-Z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*)?(#([a-zA-Z0-9._-]|%[0-9A-F]{2})*)?/g;

/**
 * Extracts every URL found in the given text.
 *
 * @param {string} s - Text to search.
 * @returns {string[]|null} All matched URLs, or null when none are found.
 */
function extractURLs(s) {
    // String.prototype.match restarts a global regex from index 0 on every
    // call, so sharing the /g instance here is safe. Using the same instance
    // with test()/exec() elsewhere would carry lastIndex between calls.
    return s.match(urlPattern);
}

Here urlPattern is pre-compiled, rather than compiling the RegEx every time the function is called, which results in better performance.

Livingston Samuel
  • 2,422
  • 2
  • 20
  • 35
  • True, but extractURLs(...) isn't the only function available; there are functions like isValidURL(url) that use urlPattern, and some where it is prepended or appended with other expressions. – Buhake Sindi Jan 20 '10 at 13:13
  • actually your urlPattern will fail to compile.... Solution: var urlPattern = /(https?|ftp)://(www\.)?(((([a-zA-Z0-9.-]+\.){1,}[a-zA-Z]{2,4}|localhost))|((\d{1,3}\.){3}(\d{1,3})))(:(\d+))?(/([a-zA-Z0-9-._~!$&'()*+,;=:@/]|%[0-9A-F]{2})*)?(\?([a-zA-Z0-9-._~!$&'()*+,;=:/?@]|%[0-9A-F]{2})*)?(#([a-zA-Z0-9._-]|%[0-9A-F]{2})*)?; You should have removed the /g and replaced the \\ to \ – Buhake Sindi Jan 20 '10 at 13:21