0

My program is a webcrawler. Im trying to download images from a website. In my webcrawler site i did:

try
{
    HtmlAgilityPack.HtmlDocument doc = TimeOut.getHtmlDocumentWebClient(mainUrl, false, "", 0, "", "");
    if (doc == null)
    {
        if (wccfg.downloadcontent == true)
        {
            retwebcontent.retrieveImages(mainUrl);
        }
        failed = true;
        wccfg.failedUrls++;
        failed = false;
    }

For example when doc is null the mainUrl contain:

http://members.tripod.com/~VanessaWest/bundybowman2.jpg

Now its jumping to the retrieveImages method in the other class:

namespace GatherLinks
{
    class RetrieveWebContent
    {
        HtmlAgilityPack.HtmlDocument doc;
        string imgg;
        int images;

        public RetrieveWebContent()
        {
            images = 0;
        }

        public List<string> retrieveImages(string address)
        {
            try
            {
                doc = new HtmlAgilityPack.HtmlDocument();
                System.Net.WebClient wc = new System.Net.WebClient();
                List<string> imgList = new List<string>();
                doc.Load(wc.OpenRead(address));
                HtmlNodeCollection imgs = doc.DocumentNode.SelectNodes("//img[@src]");
                if (imgs == null) return new List<string>();

                foreach (HtmlNode img in imgs)
                {
                    if (img.Attributes["src"] == null)
                        continue;
                    HtmlAttribute src = img.Attributes["src"];

                    imgList.Add(src.Value);
                    if (src.Value.StartsWith("http") || src.Value.StartsWith("https") || src.Value.StartsWith("www"))
                    {
                        images++;
                        string[] arr = src.Value.Split('/');
                        imgg = arr[arr.Length - 1];
                        //imgg = Path.GetFileName(new Uri(src.Value).LocalPath);
                        //wc.DownloadFile(src.Value, @"d:\MyImages\" + imgg);
                        wc.DownloadFile(src.Value, "d:\\MyImages\\" + Guid.NewGuid() + ".jpg");
                    }
                }

                return imgList;
            }
            catch
            {
                Logger.Write("There Was Problem Downloading The Image: " + imgg);
                return null;

            }
        }
    }
}

Now im using a breakpoint and step line by line and after doing this line:

HtmlNodeCollection imgs = doc.DocumentNode.SelectNodes("//img[@src]");

The variable imgs is null. Then on the next line that check if its null its jumping to the end and does nothing.

How can i solve it so it will be able to download the image from http://members.tripod.com/~VanessaWest/bundybowman2.jpg ?

EDIT**

public List<string> retrieveImages(string address)
        {
            try
            {
                doc = new HtmlAgilityPack.HtmlDocument();
                System.Net.WebClient wc = new System.Net.WebClient();
                List<string> imgList = new List<string>();
                doc.Load(wc.OpenRead(address));
                string t = doc.DocumentNode.InnerText;
                HtmlNodeCollection imgs = doc.DocumentNode.SelectNodes("//img//[@src]");
                if (imgs == null) return new List<string>();

                foreach (HtmlNode img in imgs)
                {
                    if (img.Attributes["src"] == null)
                        continue;
                    HtmlAttribute src = img.Attributes["src"];
                    wc.DownloadFile(src.Value, "d:\\MyImages\\" + Guid.NewGuid() + ".jpg");
                    imgList.Add(src.Value);
                    if (src.Value.StartsWith("http") || src.Value.StartsWith("https") || src.Value.StartsWith("www"))
                    {
                        images++;
                        string[] arr = src.Value.Split('/');
                        imgg = arr[arr.Length - 1];
                        //imgg = Path.GetFileName(new Uri(src.Value).LocalPath);
                        //wc.DownloadFile(src.Value, @"d:\MyImages\" + imgg);
                        wc.DownloadFile(src.Value, "d:\\MyImages\\" + Guid.NewGuid() + ".jpg");
                    }
                }

                return imgList;
            }
            catch
            {
                Logger.Write("There Was Problem Downloading The Image: " + imgg);
                return null;

            }
        }
DanielVest
  • 823
  • 4
  • 20
  • 39
  • have you try with mainUrl as `http://vanessawest.tripod.com/bundybowman2.jpg`? – Damith Aug 28 '13 at 07:56
  • If you checked `doc` is correct (line before `imgs = doc....`) then you probably has wrong selector (`"//img[@src]"`). Google how to repair selector. If selector is correct, you have problem somewhere else and you should debug to find first line with wrong behaviour – Ari Aug 28 '13 at 07:57
  • 1
    Just a tip. By the collections code conventions defined by MS, you should return an empty list instead of null. There is also a question regarding this problem on SO: http://stackoverflow.com/q/1969993/809009. – Ondrej Janacek Aug 28 '13 at 08:00
  • HtmlAgilityPack can download HTML pages, not images (but note it will not complain it's not HTML...). If SelectNodes returns null, it just means it didn't found any IMG element with an SRC attribute defined. – Simon Mourier Aug 28 '13 at 08:56

1 Answers1

1

If you look inside data which WebClient returns to you, you see that there is not a Html page, but bit-data of the image.

doc.Load(wc.OpenRead(address));
Console.WriteLine(doc.DocumentNode.InnerText);
MikkaRin
  • 3,026
  • 19
  • 34
  • MikkaRin yes it the different. Then what should i do in this case ? – DanielVest Aug 28 '13 at 08:48
  • In this case `src.Value` - it's your URL. Just use `wc.DownloadFile(**URL**, "d:\\MyImages\\" + Guid.NewGuid() + ".jpg");` – MikkaRin Aug 28 '13 at 08:59
  • MikkraRin can you show me the full method how to do it ? I just updated my question added the method with the line you show but it never get there. – DanielVest Aug 28 '13 at 09:06
  • Mikka i had this line already moved it up to try but it never get it since the variable imgs is null. – DanielVest Aug 28 '13 at 09:07
  • Daniel, just check your URL. If address ends with ".jpg", ".png" and etc. than this is link to file - use `DownloadFile` method. If URL ends with ".html", ".htm", ".aspx" etc. - than it's an HTml page, and you need to parse it as you do in previous. – MikkaRin Aug 28 '13 at 09:15
  • Mikka but i havw this two line: HtmlAttribute src = img.Attributes["src"]; wc.DownloadFile(src.Value, "d:\\MyImages\\" + Guid.NewGuid() + ".jpg"); and the variable src = img but if img is null then what src = to ? – DanielVest Aug 28 '13 at 09:52