-6

Something dosent work good with the text() function why ? For example if im going to www.walla.co.il in internet explorer and view the source i see that the first url address there is http://www.w3.org

But in textBox2 i see the first url address as: http://www.yad1.yad2.co.il

And i want to parse all the url's from the site and show them in textBox2 In textBox1 i just show thew hole content.

This is the code:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;

namespace WebCrawler
{
    public partial class Form1 : Form
    {
        StreamWriter sw;
        string url = "http://www.walla.co.il";
        StringBuilder sb;
        HttpWebRequest req;
        HttpWebResponse res;


        public Form1()
        {
            InitializeComponent();
            sw = new StreamWriter(@"d:\text.txt");
            req = (HttpWebRequest)WebRequest.Create(url);
            sb = new StringBuilder();
            WebRequestGetExample("http://www.walla.co.il");
            text();
        }

        private void Form1_Load(object sender, EventArgs e)
        {

        }

        public string WebRequestGetExample(string url)
        {
            //validateUrl(url);              
            req.Method = "Get";
            res = (HttpWebResponse)req.GetResponse();
            // Display the status.  
            Console.WriteLine(res.StatusDescription);
            // Get the stream containing content returned by the server.  
            Stream dataStream = res.GetResponseStream();
            // Open the stream using a StreamReader for easy access.  
            StreamReader reader = new StreamReader(dataStream);
            // Read the content.  
            string t = reader.ReadToEnd();
            // Display the content.  
            textBox1.Text = t;
            // Cleanup the streams and the response.  
            reader.Close();
            dataStream.Close();
            res.Close();

            return t;
        }

       /* public void text()
        {
            string val;
            StringBuilder sb = new StringBuilder();
            Regex hrefs = new Regex("<a href.*?>");
            Regex http = new Regex("http:.*?>");
            foreach (Match m in hrefs.Matches(textBox1.Text))
            {

                //sb.Append(m.ToString());
                if (http.IsMatch(m.ToString()))
                {

                    val = http.Match(m.ToString()).ToString();
                    int end = val.IndexOf('"');
                    if (end > 0)
                        sb.Append(val.Substring(0, end));
                    //sb.Append("<br>");
                }
                else
                {
                    val = http.Match(m.ToString()).ToString();
                    sb.Append(val);
                    //sb.Append(m.ToString().Substring(1, m.ToString().Length - 1) + "<br>");

                }

                textBox2.Text = sb.ToString();
            }

        }*/

        public void text()
        {
            StringBuilder sb = new StringBuilder();
            Regex hrefs = new Regex("<a href.*?>");
            Regex http = new Regex("http:.*?>");
            foreach (Match m in hrefs.Matches(textBox1.Text))
            {
                //sb.Append(m.ToString());
                if (http.IsMatch(m.ToString()))
                {
                    string val = http.Match(m.ToString()).ToString();
                    int end = val.IndexOf('"');
                    sb.Append(end > 0 ? val.Substring(0, end) : val);
                    //if (end > 0)
                    //    sb.Append(val.Substring(0, end));
                    //else
                    //    sb.Append(val);
                    //sb.Append("<br>");
                }
                else
                {
                    //sb.Append(m.ToString().Substring(1, m.ToString().Length - 1) + "<br>");
                }
                textBox2.Text = sb.ToString();
                sw.WriteLine(sb.ToString());
            }
            sw.Close();
        }
    }
}
EdChum
  • 376,765
  • 198
  • 813
  • 562
Ben
  • 1
  • 2
  • 1
    First read [this](http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454) and [this](http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1758162#1758162) and then use [HtmlAgilityPack](http://htmlagilitypack.codeplex.com/) – L.B Mar 18 '12 at 17:38

2 Answers2

0

I think it's probably because your hrefs regex is only looking for anchor tags whereas the first urls in the page source are actually part of the DOCTYPE and html tags (they are the only www.w3.org urls I can see in the page source).

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html lang="he-IL" xml:lang="he-IL" xmlns="http://www.w3.org/1999/xhtml">

If you want to include these URLs you'll have to think about amending your hrefs Regex expression

KazR
  • 961
  • 7
  • 16
0

The match you are looking for is looking for an actual link <[ignore]a href....> but the first link you see in the source (the w3c link) is in the head. If you want to get that you would have to remove the hrefs match (eg just find links on the page) or modify it to match

<html lang="he-IL" xml:lang="he-IL" xmlns="http://www.w3.org/1999/xhtml">

However, I suspect you do only want to find links so your code is working correctly.

Sorry about the ignore, StackOverflow kept converting it into actual link :P

T. Kiley
  • 2,752
  • 21
  • 30