I'm trying to scrape a website - ive accomplished this on other projects but i cant seem to get this right. It could be that ive been up for over 2 days working and maybe i am missing something. Please could someone look over my code? Here it is :
using System;
using System.Collections.Generic;
using HtmlAgilityPack;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Linq;
using System.Xml.Linq;
using System.IO;
public partial class _Default : System.Web.UI.Page
{
List<string> names = new List<string>();
List<string> address = new List<string>();
List<string> number = new List<string>();
protected void Page_Load(object sender, EventArgs e)
{
string url = "http://www.scoot.co.uk/find/" + "cafe" + " " + "-in-uk?page=" + "4";
var Webget = new HtmlWeb();
var doc = Webget.Load(url);
List<List<string>> mainList = new List<List<string>>();
foreach (HtmlNode node in doc.DocumentNode.SelectNodes("//h2//a"))
{
names.Add(Regex.Replace(node.ChildNodes[0].InnerHtml, @"\s{2,}", " "));
}
foreach (HtmlNode node in doc.DocumentNode.SelectNodes("//p[@class='result-address']"))
{
address.Add(Regex.Replace(node.ChildNodes[0].InnerHtml, @"\s{2,}", " "));
}
foreach (HtmlNode node in doc.DocumentNode.SelectNodes("//p[@class='result-number']"))
{
number.Add(Regex.Replace(node.ChildNodes[0].InnerHtml, @"\s{2,}", " "));
}
XDocument doccy = new XDocument(
new XDeclaration("1.0", "utf-8", "yes"),
new XComment("Business For Sale"),
new XElement("Data",
from data in mainList
select new XElement("data", new XAttribute("data", "data"),
new XElement("Name : ", names[0]),
new XElement("Add : ", address[0]),
new XElement("Number : ", number[0])
)
)
);
var xml = doccy.ToString();
Response.ContentType = "text/xml"; //Must be 'text/xml'
Response.ContentEncoding = System.Text.Encoding.UTF8; //We'd like UTF-8
doccy.Save(Response.Output); //Save to the text-writer
}
}
The website lists business name, phone number and address and they are all defined by a class name (result-address, result-number etc). I am trying to get XML output so i can get the business name, address and phone number from each listing on page 4 for a presentation tomorrow but i cant get it to work at all!
The results are right in all 3 of the for each loops but they wont output in the xml i get an out of range error.