This is the code:
using System;
using System.Collections.Generic;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
namespace ScrollLabelTest
{
    /// <summary>
    /// Reads a saved HTML page, collects the forum thread links it contains,
    /// downloads every thread and stores the text fragments found between the
    /// responder markers (<see cref="FirstTag"/> ... <see cref="LastTag"/>).
    /// </summary>
    class ExtractLinks
    {
        // Only anchors whose href starts with this prefix are followed.
        private const string ThreadUrlPrefix = "http://rotter.net/forum/scoops1";

        // Markers that delimit one extracted fragment inside a downloaded page.
        // NOTE(review): "&n" is kept from the original code (presumably the start of
        // "&nbsp;"); fragments that are closed by "</font>" instead will run on until
        // the next "&n" - confirm the intended terminator against the real pages.
        private const string FirstTag = "<FONT CLASS='text16b'>";
        private const string LastTag = "&n";

        // Windows-1255 code page used to decode the downloaded pages.
        private const int PageCodePage = 1255;

        // Raw hrefs that matched ThreadUrlPrefix (fragment still attached).
        List<string> links = new List<string>();

        // The same hrefs with the "#fragment" part removed; these are downloaded.
        List<string> FilteredLinks = new List<string>();

        // Output: a "Thread #n" header per downloaded page followed by its fragments.
        List<string> Respones = new List<string>();

        /// <summary>
        /// Loads the HTML file, gathers the matching thread links, downloads each
        /// thread and extracts its fragments into the internal result list.
        /// </summary>
        /// <param name="FileName">Path of the HTML file to scan for links.</param>
        /// <exception cref="ArgumentNullException"><paramref name="FileName"/> is null.</exception>
        public void Links(string FileName)
        {
            if (FileName == null)
            {
                throw new ArgumentNullException("FileName");
            }

            // Start from a clean slate so a second call does not re-download
            // the links collected by a previous call.
            links.Clear();
            FilteredLinks.Clear();

            HtmlDocument doc = new HtmlDocument();
            doc.Load(FileName);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            IEnumerable<HtmlNode> anchors = doc.DocumentNode.SelectNodes("//a[@href]");
            if (anchors == null)
            {
                return;
            }

            foreach (HtmlNode link in anchors)
            {
                string href = link.Attributes["href"].Value;
                if (!href.StartsWith(ThreadUrlPrefix, StringComparison.Ordinal))
                {
                    continue;
                }

                links.Add(href);

                // Drop the "#fragment" part; a link without '#' is kept whole
                // instead of failing on Substring(0, -1).
                int hashIndex = href.IndexOf('#');
                FilteredLinks.Add(hashIndex >= 0 ? href.Substring(0, hashIndex) : href);
            }

            // One client for all downloads, disposed when done; the encoding is
            // loop-invariant so it is set once.
            using (WebClient client = new WebClient())
            {
                client.Encoding = Encoding.GetEncoding(PageCodePage);

                for (int i = 0; i < FilteredLinks.Count; i++)
                {
                    string page = client.DownloadString(FilteredLinks[i]);

                    // Number the header per thread (the original always wrote "#1").
                    Respones.Add("Thread #" + (i + 1));
                    GetResponsers(page);
                }
            }
        }

        /// <summary>
        /// Appends to the result list every fragment of <paramref name="contents"/>
        /// that lies between <see cref="FirstTag"/> and the next <see cref="LastTag"/>.
        /// </summary>
        /// <param name="contents">Full text of one downloaded page.</param>
        private void GetResponsers(string contents)
        {
            if (string.IsNullOrEmpty(contents))
            {
                return;
            }

            int searchFrom = 0;
            while (searchFrom < contents.Length)
            {
                int open = contents.IndexOf(FirstTag, searchFrom, StringComparison.Ordinal);
                if (open < 0)
                {
                    break; // no further opening marker
                }

                int textStart = open + FirstTag.Length;
                int close = contents.IndexOf(LastTag, textStart, StringComparison.Ordinal);
                if (close < 0)
                {
                    break; // opening marker without a terminator: nothing to extract
                }

                Respones.Add(contents.Substring(textStart, close - textStart));

                // Continue after this terminator so the next pass finds the next fragment.
                searchFrom = close + LastTag.Length;
            }
        }
    }
}
The problem is in the last method, GetResponsers. Here is one of the many lines in the contents text that I want to extract specific text from:
<font size="2" face="Arial" color="#000099"> <FONT CLASS='text16b'>43.יהי זכרו ברוך, שימליץ עלינו מלמעלה. </font><br>
What I want to extract is only this part:
43.יהי זכרו ברוך, שימליץ עלינו מלמעלה.
That is, including the dots and the number (43 in this case). But what I get in the variable responser is:
אזכרה במלאת שנתיים לפטירתו של אבי מורי ז''ל הרב ישעיהו רוטר.
How can I also extract the number and the dot next to it (43.), so that what I get in responser is:
43.יהי זכרו ברוך, שימליץ עלינו מלמעלה.
How can I do it using the code I already have in GetResponsers?
I tried to use a loop:
// First attempt at collecting every fragment between firstTag and lastTag.
// NOTE(review): this loop never terminates - there is no break/return inside
// while(true), and both searches restart from the same position on every pass,
// so the same substring is appended to the list over and over.
// NOTE(review): UsersRespones is not declared in the visible class (which declares
// Respones); presumably a List<string> field - confirm.
private void GetResponsers(string contents)
{
while(true)
{
string firstTag = "<FONT CLASS='text16b'>";
string lastTag = "&n";
// No start index is passed, so the search always begins at 0 and f is
// identical on every iteration. f is -1 when firstTag is absent.
int f = contents.IndexOf(firstTag);
// Looks for the terminator from the opening tag onwards; a start index of -1
// (firstTag not found) makes this call throw ArgumentOutOfRangeException.
int g = contents.IndexOf(lastTag, f);
// Text between the end of firstTag and the start of lastTag.
string responser = contents.Substring(f + firstTag.Length, g - f - firstTag.Length);
UsersRespones.Add(responser);
}
}
But the list UsersRespones ends up with more than 1000 entries, all of them the same string. It keeps extracting from the same indexes in the text.
How can I make the loop extract the next string each time, at the next place where the tags appear?
OK, this is the block of text I should extract from on each iteration of the loop inside the GetResponsers method:
SIZE="2" FACE="Arial" color="#000099"><a href="#19"><font color=''>שנתיים?</font></a></font></td>
<td align="center" nowrap><font SIZE="1"
FACE="Arial" color="#000099">אפריאט</font></td>
<td align="center" nowrap><font SIZE="1"
FACE="Arial" color="#000099">16.06.14 <font SIZE="1"
FACE="Arial" color="red">18:30</font></td>
<td align="center" nowrap><font SIZE="1"
FACE="Arial" color="#000099">19</font></td>
</tr>
From this block I need to get several things:
The number, in this case #19, and add it to a List
The text, in this case: שנתיים? and also add it to a List
The date, in this case 16.06.14, and add it to a List
The time, in this case 18:30, and also add it to a List
Then, on the next loop iteration, the same from the next block in the whole contents variable, and so on until the end. Only when it finishes should it go back to the Links method, download the next content and loop over it again, and so on for all the links in the Links method.
I would prefer to do it all using my code in GetResponsers, with IndexOf and Substring.
EDIT
I tried this:
// Second attempt: walks through contents by resuming each search after the
// previous terminator, adding every fragment between firstTag and lastTag
// to UsersRespones.
// NOTE(review): the loop still has no exit condition and neither IndexOf result
// is checked for -1, so it ends with an exception instead of stopping cleanly
// once no further firstTag exists.
private void GetResponsers(string contents)
{
// Position in contents from which the next opening tag is searched.
int startPos = 0;
while(true)
{
string firstTag = "<FONT CLASS='text16b'>";
string lastTag = "&n";
// Returns -1 once there is no further firstTag at or after startPos.
int f = contents.IndexOf(firstTag, startPos);
// With f == -1 the start index is invalid and this call throws
// ArgumentOutOfRangeException.
int g = contents.IndexOf(lastTag, f);
// Resume after this terminator on the next pass. If lastTag was not found
// (g == -1) this becomes lastTag.Length - 1 and the Substring below receives
// a negative length.
startPos = g + lastTag.Length;
// Text between the end of firstTag and the start of lastTag.
string responser = contents.Substring(f + firstTag.Length, g - f - firstTag.Length);
UsersRespones.Add(responser);
}
}
But I am getting an exception on the line:
int g = contents.IndexOf(lastTag, f);
Index was out of range. Must be non-negative and less than the size of the collection
This started happening after I added startPos.