-3

I need a regex pattern that finds links in a string of HTML code and returns only the links whose targets have file endings like .gif or .png.

Example String:

<a href="//site.com/folder/picture.png" target="_blank">picture.png</a>

For now I get everything between the " " and the text between the <a> and </a>.

I want to get this:

Href = //site.com/folder/picture.png String = picture.png

My code so far:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Diagnostics;
using System.Drawing;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace downloader
{
public partial class Form1 : Form
{
    /// <summary>
    /// Creates the form and runs InitializeComponent, which is defined outside this listing.
    /// </summary>
    public Form1()
    {
        this.InitializeComponent();
    }

    /// <summary>
    /// Downloads the page whose address is typed into textBox1, extracts its links
    /// and appends their string form to richTextBox1.
    /// </summary>
    /// <param name="sender">Event source; not used.</param>
    /// <param name="e">Event data; not used.</param>
    private void button1_Click(object sender, EventArgs e)
    {
        string url = textBox1.Text;
        string html = gethtmlcode(url);

        // Collect the output first: every "richTextBox1.Text += ..." reads and
        // rewrites the entire control content, which is quadratic in the number
        // of links and refreshes the control once per link.
        StringBuilder output = new StringBuilder();
        foreach (LinkItem link in LinkFinder.Find(html))
        {
            output.Append(link.ToString());
        }

        // Leave the control untouched when no link was found, as before.
        if (output.Length > 0)
        {
            richTextBox1.Text += output.ToString();
        }
    }


    /// <summary>
    /// Downloads the resource at <paramref name="url"/> and returns it as a string.
    /// </summary>
    /// <param name="url">Address handed to WebClient.DownloadString.</param>
    /// <returns>The downloaded content.</returns>
    static string gethtmlcode(string url)
    {
        using (WebClient downloader = new WebClient())
        {
            return downloader.DownloadString(url);
        }
    }

    /// <summary>
    /// One hyperlink: its target and its visible text.
    /// </summary>
    public struct LinkItem
    {
        /// <summary>Value of the link's href attribute.</summary>
        public string Href;

        /// <summary>Text of the link.</summary>
        public string Text;

        /// <summary>
        /// Formats the link as the href, a newline and a tab, the text, and
        /// another newline and tab. Null fields are rendered as empty strings.
        /// </summary>
        /// <returns>The formatted link.</returns>
        public override string ToString()
        {
            return string.Concat(Href, "\n\t", Text, "\n\t");
        }
    }
    /// <summary>
    /// Extracts anchor (&lt;a&gt;) elements from raw HTML using regular expressions.
    /// </summary>
    /// <remarks>
    /// Regex matching is best-effort only; markup such as a '&gt;' inside a quoted
    /// attribute value is not handled. A real HTML parser is more reliable.
    /// </remarks>
    static class LinkFinder
    {
        // A complete <a ...>...</a> element; group 1 is the attribute text of the
        // opening tag. The negative lookahead stops tags that merely start with
        // "a" (<abbr>, <article>, <address>, ...) from being taken for anchors.
        private static readonly Regex AnchorPattern = new Regex(
            @"<a(?![\w-])([^>]*)>.*?</a\s*>",
            RegexOptions.Singleline | RegexOptions.IgnoreCase);

        // href attribute with a double-quoted, single-quoted or unquoted value.
        // The lookbehind rejects attributes that only end in "href" (data-href).
        private static readonly Regex HrefPattern = new Regex(
            @"(?<![\w-])href\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s""'>]+))",
            RegexOptions.Singleline | RegexOptions.IgnoreCase);

        // Any tag together with the whitespace around it.
        private static readonly Regex TagPattern = new Regex(
            @"\s*<.*?>\s*",
            RegexOptions.Singleline);

        /// <summary>
        /// Finds every anchor element in <paramref name="file"/>.
        /// </summary>
        /// <param name="file">HTML text to scan; null or empty yields an empty list.</param>
        /// <returns>
        /// One <see cref="LinkItem"/> per anchor, in document order. Href is left
        /// null when the opening tag has no href attribute; Text is the element
        /// with all tags (and the whitespace around them) removed.
        /// </returns>
        public static List<LinkItem> Find(string file)
        {
            List<LinkItem> list = new List<LinkItem>();
            if (string.IsNullOrEmpty(file))
            {
                return list;
            }

            foreach (Match anchor in AnchorPattern.Matches(file))
            {
                LinkItem item = new LinkItem();

                // Look for href in the opening tag only, so an href that appears
                // inside the link's content is never mistaken for the target.
                Match href = HrefPattern.Match(anchor.Groups[1].Value);
                if (href.Success)
                {
                    item.Href = href.Groups["url"].Value;
                }

                // Strip the anchor's own tags and any nested tags from the text.
                item.Text = TagPattern.Replace(anchor.Value, "");

                list.Add(item);
            }

            return list;
        }
    }

}

}
Wiktor Stribiżew
  • 607,720
  • 39
  • 448
  • 563
algtr
  • 68
  • 9
  • 2
    I am sorry, but have you considered, say, HtmlAgilityPack? Or any other HTML parsing library? It will make the code much easier to read, use and maintain. – Wiktor Stribiżew Jun 03 '15 at 20:16
  • 4
    [Do not parse HTML with Regex.](http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags) – Der Kommissar Jun 03 '15 at 20:16
  • A couple of things here: 1. You should tell us what actually happens when you run that code. 2. Regular expressions aren't very good at parsing HTML. You should look for an HTML parser. – Sam I am says Reinstate Monica Jun 03 '15 at 20:19
  • I'm trying to make my first steps so I'm not very familiar and good with everything c# offers, sorry. I've got a form with a richtextbox, a textbox and a button. 1. I insert the link into the text box 2. Press the button and the html code is loaded into a string 3. it runs the shown method (I'll edit the OP so the whole code will be there) 4. Show the Links with Texts in the richtextbox – algtr Jun 03 '15 at 20:24

1 Answer

1

I suggest using HtmlAgilityPack for this task. Install it via the Manage NuGet Packages for Solution menu, and add the following method:

/// <summary>
/// Collects the href attribute value and the inner text of every anchor whose
/// (trimmed) inner text ends in .png or .jpg, ignoring case.
/// </summary>
/// <param name="html">An HTML string, or an absolute http/https URL to load the HTML from.</param>
/// <returns>
/// A list of key-value pairs: the key is the href value (null when the anchor
/// has no href attribute), the value is the anchor's untrimmed inner text.
/// </returns>
private List<KeyValuePair<string, string>> GetLinksWithHtmlAgilityPack(string html)
{
    var result = new List<KeyValuePair<string, string>>();
    HtmlAgilityPack.HtmlDocument hap;
    Uri uriResult;

    // Accept https as well as http; otherwise an https address would be parsed
    // as if it were the HTML itself and yield no links.
    bool isWebUrl = Uri.TryCreate(html, UriKind.Absolute, out uriResult)
        && (uriResult.Scheme == Uri.UriSchemeHttp || uriResult.Scheme == Uri.UriSchemeHttps);

    if (isWebUrl)
    { // html is a URL
        var web = new HtmlAgilityPack.HtmlWeb();
        hap = web.Load(uriResult.AbsoluteUri);
    }
    else
    { // html is a string
        hap = new HtmlAgilityPack.HtmlDocument();
        hap.LoadHtml(html);
    }

    var nodes = hap.DocumentNode.SelectNodes("//a");
    if (nodes == null)
    {
        // No anchors were selected.
        return result;
    }

    foreach (var node in nodes)
    {
        // Compare the suffix directly instead of calling Path.GetExtension:
        // link text is not a file path, and on .NET Framework GetExtension
        // throws ArgumentException for characters such as '|' or '"'.
        string text = node.InnerText.Trim();
        bool isImage = text.EndsWith(".png", StringComparison.OrdinalIgnoreCase)
            || text.EndsWith(".jpg", StringComparison.OrdinalIgnoreCase);

        if (isImage)
        {
            result.Add(new KeyValuePair<string, string>(node.GetAttributeValue("href", null), node.InnerText));
        }
    }

    return result;
}

Then use it like this (I am using a dummy string, just for the demo):

var result = GetLinksWithHtmlAgilityPack("<a href=\"//site.com/folder/picture.png\" target=\"_blank\">picture.png</a><a href=\"//site.com/folder/picture.bmp\" target=\"_blank\">picture.bmp</a>");

Output:

enter image description here

Or, with a URL, something like:

var result = GetLinksWithHtmlAgilityPack("http://www.google.com");
Wiktor Stribiżew
  • 607,720
  • 39
  • 448
  • 563