Extracting 1 or more hyperlinks from paragraph text in Javascript using Regular Expression

Question

Sorry to bother you guys again, but here's my dilemma.

There must be a "better" regular expression to identify HTML links in paragraph text (there can be more than one link in the text). How do I extract all the links and anchor them in JavaScript?

My attempt (in javascript) is like this:

// URL pattern source, kept as a string and compiled inside extractURLs.
// It is assembled piece by piece for readability; the concatenated value is
// one single pattern.
var urlPattern =
    // scheme: http, https or ftp
    "(https?|ftp)://" +
    // optional "www." prefix
    "(www\\.)?" +
    // host: dotted name ending in 2-4 letters, "localhost",
    // or four dot-separated groups of 1-3 digits
    "(((([a-zA-Z0-9.-]+\\.){1,}[a-zA-Z]{2,4}|localhost))|((\\d{1,3}\\.){3}(\\d{1,3})))" +
    // optional :port
    "(:(\\d+))?" +
    // optional /path (allowed characters or %XX escapes)
    "(/([a-zA-Z0-9-._~!$&'()*+,;=:@/]|%[0-9A-F]{2})*)?" +
    // optional ?query
    "(\\?([a-zA-Z0-9-._~!$&'()*+,;=:/?@]|%[0-9A-F]{2})*)?" +
    // optional #fragment
    "(#([a-zA-Z0-9._-]|%[0-9A-F]{2})*)?";

/**
 * Matches urlPattern against the given text, with no regex flags.
 *
 * @param {string} s - text to search
 * @returns {?Array} the result of String.prototype.match: the first match
 *     followed by its capture groups, or null when nothing matches
 */
function extractURLs(s) {
    var compiled = new RegExp(urlPattern);
    return s.match(compiled);
}

// Note: the parameter s of extractURLs is of type String.

// For testing: a sample paragraph containing two links.
// The first URL contains a deliberate space ("ww w"); see the note that
// follows this snippet.
var text = "Check this video out http://ww w.youtube.com/watch?v=y3U3R3b1dOg or http://www.youtube.com/watch?v=sX6Vm0MoPCY";
// Show whatever extractURLs returns for the sample text.
alert(extractURLs(text));

(A space has been deliberately added to the first hyperlink here to allow posting the question on SO.) Result: I only get the first hyperlink and not the second one. Has anybody done something similar or better that I can use?

Thanks in advance.

http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags This should answer all your questions. — Oscar Kilhed, Jan 20 '10 at 08:31
Don't worry...I've read that sweet poetry before but Alsciende provided a correct answer for me. — Buhake Sindi, Jan 20 '10 at 09:04

score 2 · Accepted Answer · answered Jan 20 '10 at 08:45

2

Use the "g" modifier:

/**
 * Collects every substring of s that matches urlPattern.
 *
 * @param {string} s - text to search
 * @returns {?Array<string>} all full matches, or null when there are none
 */
function extractURLs(s) {
    // With the "g" flag, match() returns every full match instead of the
    // first match followed by its capture groups.
    var globalMatcher = new RegExp(urlPattern, "g");
    return s.match(globalMatcher);
}

answered Jan 20 '10 at 08:45

Alsciende

26,583
9
51
67

stepanian · Answer 2 · 2010-01-20T08:46:28.020

0

// URL pattern source as a string; the pieces below are joined with no
// separator into one single pattern.
var urlPattern = [
    "(https?|ftp)://",                                        // scheme
    "(www\\.)?",                                              // optional "www."
    "(((([a-zA-Z0-9.-]+\\.){1,}[a-zA-Z]{2,4}|localhost))|((\\d{1,3}\\.){3}(\\d{1,3})))", // host
    "(:(\\d+))?",                                             // optional :port
    "(/([a-zA-Z0-9-._~!$&'()*+,;=:@/]|%[0-9A-F]{2})*)?",      // optional /path
    "(\\?([a-zA-Z0-9-._~!$&'()*+,;=:/?@]|%[0-9A-F]{2})*)?",   // optional ?query
    "(#([a-zA-Z0-9._-]|%[0-9A-F]{2})*)?"                      // optional #fragment
].join("");

/**
 * Applies urlPattern to s with no regex flags.
 *
 * @param {string} s - text to search
 * @returns {?Array} the first match followed by its capture groups, or null
 *     when nothing matches
 */
function extractURLs(s) {
    var re = new RegExp(urlPattern);
    var outcome = s.match(re);
    return outcome;
}

// Sample paragraph containing two links.
var text = "Check this video out http://www.youtube.com/watch?v=y3U3R3b1dOg or http://www.youtube.com/watch?v=sX6Vm0MoPCY";
// extractURLs here compiles the pattern without the "g" flag, so match()
// returns the first match at index 0 followed by its capture groups;
// results[1] is therefore capture group 1 (the scheme), not a second URL.
var results = extractURLs(text);

// NOTE(review): this feeds the joined string back through extractURLs before
// alerting; presumably the intent was to alert the joined string itself.
alert(extractURLs(results[0]  + ", " + results[1]));

edited Jan 20 '10 at 08:46

answered Jan 20 '10 at 08:22

stepanian

11,373
8
43
63

results[1] gives me "http" so that doesn't retrieve the 2nd url from the text string. – Buhake Sindi Jan 20 '10 at 09:02

score 0 · Answer 3 · answered Jan 20 '10 at 11:23

0

It is better to write it as,

// Pre-compiled URL matcher written as a regex literal.
// Inside a literal every "/" must be escaped as "\/" (otherwise it ends the
// literal), and escapes use a single backslash ("\." and "\d"); doubled
// backslashes are only needed when the pattern is written as a string.
// The "g" flag makes match() return every URL rather than only the first.
var urlPattern = /(https?|ftp):\/\/(www\.)?(((([a-zA-Z0-9.-]+\.){1,}[a-zA-Z]{2,4}|localhost))|((\d{1,3}\.){3}(\d{1,3})))(:(\d+))?(\/([a-zA-Z0-9-._~!$&'()*+,;=:@\/]|%[0-9A-F]{2})*)?(\?([a-zA-Z0-9-._~!$&'()*+,;=:\/?@]|%[0-9A-F]{2})*)?(#([a-zA-Z0-9._-]|%[0-9A-F]{2})*)?/g;

/**
 * Extracts every URL found in the given text.
 *
 * @param {string} s - text to search
 * @returns {?Array<string>} all matched URLs in order of appearance, or null
 *     when the text contains none
 */
function extractURLs(s) {
    // String.prototype.match with a global regex always starts from index 0,
    // so sharing the one compiled pattern across calls is safe.
    return s.match(urlPattern);
}

Here urlPattern is pre-compiled once, rather than compiling the RegExp every time the function is called, which results in better performance.

answered Jan 20 '10 at 11:23

Livingston Samuel

2,422
2
20
35

True, but extractURLs(...) isn't the only function available; there are functions like isValidURL(url) that use urlPattern, and some where it is prepended or appended with other expressions. – Buhake Sindi Jan 20 '10 at 13:13
actually your urlPattern will fail to compile.... Solution: var urlPattern = /(https?|ftp)://(www\.)?(((([a-zA-Z0-9.-]+\.){1,}[a-zA-Z]{2,4}|localhost))|((\d{1,3}\.){3}(\d{1,3})))(:(\d+))?(/([a-zA-Z0-9-._~!$&'()*+,;=:@/]|%[0-9A-F]{2})*)?(\?([a-zA-Z0-9-._~!$&'()*+,;=:/?@]|%[0-9A-F]{2})*)?(#([a-zA-Z0-9._-]|%[0-9A-F]{2})*)?; You should have removed the /g and replaced the \\ to \ – Buhake Sindi Jan 20 '10 at 13:21

Extracting 1 or more hyperlinks from paragraph text in Javascript using Regular Expression

3 Answers3