I was scraping data from a webpage using HtmlAgilityPack with this code:
// Downloads the tracking page for one container and copies values from the
// matched table cells into grid row i. (i, emporevmatokibotiaGridView and the
// col* columns are declared outside this snippet.)
string Name = "ARKU2215462";
string containerInfo = LoadContent(Name);
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.LoadHtml(containerInfo);
// With OptionEmptyCollection enabled, SelectNodes yields an empty collection
// instead of null when nothing matches, so no null check is needed afterwards.
doc.OptionEmptyCollection = true;
HtmlNode[] nodes = doc.DocumentNode
    .SelectNodes("//td[@style='padding:7px']")
    .ToArray();

// Dump every matched cell for inspection.
foreach (HtmlNode item in nodes)
{
    Console.WriteLine(item.InnerHtml);
}

// The grid values come from a fixed cell (index 1), not from the loop variable,
// so they are written once instead of once per matched cell.
if (nodes.Length > 1)
{
    // The cell text is split on '-': the first part goes to colArithmosAKLA,
    // the second (when present) to colArithmosEidous.
    string[] akla = nodes[1].InnerHtml.Split('-');
    emporevmatokibotiaGridView.SetRowCellValue(i, colArithmosAKLA, akla[0]);
    if (akla.Length > 1)
    {
        emporevmatokibotiaGridView.SetRowCellValue(i, colArithmosEidous, akla[1]);
    }
    emporevmatokibotiaGridView.SetRowCellValue(i, colEtosAKLA, DateTime.Now.Year);
    if (akla[0].IsNullOrEmpty())
    {
        emporevmatokibotiaGridView.SetRowCellValue(i, colArithmosAKLA, "ΔΕΝ ΕΚΦΟΡΤΩΘΗΚΕ");
    }
}

// Fallback marker when nothing was written to the cell (for example, no cells matched).
if (emporevmatokibotiaGridView.GetRowCellValue(i, colArithmosAKLA).IsNull())
{
    emporevmatokibotiaGridView.SetRowCellValue(i, colArithmosAKLA, "ΔΕΝ ΕΚΦΟΡΤΩΘΗΚΕ");
}
}
/// <summary>
/// Posts a container code to the tracking page at portal.thpa.gr and returns
/// the markup of the response.
/// </summary>
/// <param name="reference">Container code sent as the <c>containerCode</c> form field; null is sent as an empty value.</param>
/// <returns>The InnerHtml of the response after it has been parsed by HtmlAgilityPack.</returns>
private static string LoadContent(string reference)
{
    const string url = "https://portal.thpa.gr/fnet5/track/index.php";
    // Escape the code so characters such as '&' or '=' cannot corrupt the form body.
    string body = $"d=1&containerCode={Uri.EscapeDataString(reference ?? string.Empty)}&go=1";

    // Every disposable is released even when the request or the parsing throws.
    // .Result is kept because this method's signature is synchronous.
    using (var hc = new HttpClient())
    using (var content = new StringContent(body, Encoding.UTF8, "application/x-www-form-urlencoded"))
    using (var reqUrlContent = hc.PostAsync(url, content).Result)
    using (Stream stream = reqUrlContent.Content.ReadAsStreamAsync().Result)
    {
        HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
        doc.Load(stream);
        return doc.DocumentNode.InnerHtml;
    }
}
Now the webpage has changed completely and I cannot retrieve the data as before.
I found a solution thanks to Victor (see "Scrape data from web page with HtmlAgilityPack c#"), but my problem is that, although I have a string containing the webpage, I cannot retrieve the data I want.
The code I am using is
// Address of the new tracking page. Not used below: Download posts to the same address on its own.
string url = "https://webportal.thpa.gr/ctreport/container/track";
var html = Download("ARKU2215462");
// Fully qualified so the type cannot be confused with System.Windows.Forms.HtmlDocument.
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
// html is already a string, so LoadHtml (not Load, which reads a stream or a file) is the right call.
doc.LoadHtml(html);
// Print the markup of each top-level node of the parsed document.
foreach (HtmlNode item in doc.DocumentNode.ChildNodes)
{
    Console.WriteLine(item.InnerHtml);
}
/// <summary>
/// Posts a container number to the tracking form at webportal.thpa.gr and
/// returns the body of the response as text.
/// </summary>
/// <param name="search">Container number sent as the <c>report_container[containerno]</c> form field; null is sent as an empty value.</param>
/// <returns>The complete response body.</returns>
public static string Download(string search)
{
    var request = (HttpWebRequest)WebRequest.Create("https://webportal.thpa.gr/ctreport/container/track");

    // Escape the value so reserved or non-ASCII characters survive the form encoding.
    string encodedSearch = Uri.EscapeDataString(search ?? string.Empty);
    string postData = $"report_container%5Bcontainerno%5D={encodedSearch}&report_container%5Bsearch%5D=";
    // After escaping, the body is pure ASCII, so this encoding is lossless.
    byte[] data = Encoding.ASCII.GetBytes(postData);

    request.Method = "POST";
    request.ContentType = "application/x-www-form-urlencoded";
    request.ContentLength = data.Length;

    using (var requestStream = request.GetRequestStream())
    {
        requestStream.Write(data, 0, data.Length);
    }

    using (var response = (HttpWebResponse)request.GetResponse())
    using (var reader = new StreamReader(response.GetResponseStream()))
    {
        return reader.ReadToEnd();
    }
}
The information is inside the InnerHtml, but I cannot extract it into the variables that I want.
> <tbody> <tr> <td>ARKU2215462</td> <td>2022000648-358</td>
> <td>DISCHARGE</td> <td>2022-05-26 04:42:20</td> <td> </td> </tr>
> <tr><td>ARKU2215462</td> <td>2022000648-358</td> <td>COLLECT</td>
> <td>2022-05-27 20:38:23</td> <td></td> </tr> </tbody>
In the old code I used
// Select every <td> whose style attribute is exactly 'padding:7px', then copy the matches into an array.
var matches = doc.DocumentNode.SelectNodes("//td[@style='padding:7px']");
HtmlNode[] nodes = matches.ToArray();
and the webpage returned only the last state. Now it returns all states, and I want only the last one. I also think that HtmlAgilityPack doesn't treat the string as HTML in a way that lets me use the DocumentNode.