0

This is the code:

using System;
using System.Collections.Generic;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using HtmlAgilityPack;

namespace ScrollLabelTest
{
    /// <summary>
    /// Reads a saved HTML page, collects the forum-thread links it contains,
    /// downloads every thread and extracts the response titles found in it.
    /// </summary>
    class ExtractLinks
    {
        WebClient contents = new WebClient();
        string cont;
        List<string> links = new List<string>();
        List<string> FilteredLinks = new List<string>();
        List<string> Respones = new List<string>();

        /// <summary>
        /// Loads the HTML file <paramref name="FileName"/>, keeps every anchor whose
        /// href starts with the rotter.net "scoops1" forum prefix, strips the
        /// "#fragment" part of each link, downloads each resulting page and appends
        /// a "Thread #n" header followed by all extracted responses to the
        /// response list.
        /// </summary>
        /// <param name="FileName">Path of the HTML file to scan for thread links.</param>
        public void Links(string FileName)
        {
            // Start from a clean slate so that a second call does not re-download
            // the links collected by a previous call.
            links.Clear();
            FilteredLinks.Clear();

            HtmlDocument doc = new HtmlDocument();
            doc.Load(FileName);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            var anchors = doc.DocumentNode.SelectNodes("//a[@href]");
            if (anchors != null)
            {
                foreach (HtmlNode link in anchors)
                {
                    HtmlAttribute att = link.Attributes["href"];
                    if (att.Value.StartsWith("http://rotter.net/forum/scoops1", StringComparison.Ordinal))
                    {
                        links.Add(att.Value);
                    }
                }
            }

            foreach (string link in links)
            {
                // Drop the "#anchor" fragment; a link without one is kept unchanged
                // (Substring(0, -1) would throw).
                int fragmentStart = link.IndexOf('#');
                FilteredLinks.Add(fragmentStart >= 0 ? link.Substring(0, fragmentStart) : link);
            }

            // The pages are decoded with code page 1255; setting it once is enough.
            contents.Encoding = System.Text.Encoding.GetEncoding(1255);

            for (int i = 0; i < FilteredLinks.Count; i++)
            {
                cont = contents.DownloadString(FilteredLinks[i]);
                // Number the threads 1..n instead of labelling every one "Thread #1".
                Respones.Add("Thread #" + (i + 1));
                GetResponsers(cont);
            }
        }

        /// <summary>
        /// Scans <paramref name="contents"/> for every piece of text that sits between
        /// the opening <c>&lt;FONT CLASS='text16b'&gt;</c> tag and the next "&amp;n"
        /// (the start of an "&amp;nbsp;" entity) and appends each one to the response
        /// list, in document order.
        /// </summary>
        /// <param name="contents">The downloaded HTML of one thread page.</param>
        private void GetResponsers(string contents)
        {
            const string firstTag = "<FONT CLASS='text16b'>";
            const string lastTag = "&n";

            if (string.IsNullOrEmpty(contents))
            {
                return;
            }

            int startPos = 0;
            while (startPos < contents.Length)
            {
                int f = contents.IndexOf(firstTag, startPos, StringComparison.Ordinal);
                if (f < 0)
                {
                    // No further opening tag: everything has been extracted.
                    break;
                }

                int textStart = f + firstTag.Length;
                int g = contents.IndexOf(lastTag, textStart, StringComparison.Ordinal);
                if (g < 0)
                {
                    // Opening tag without a terminator: nothing reliable left to read.
                    break;
                }

                Respones.Add(contents.Substring(textStart, g - textStart));

                // Continue after the terminator so the next pass finds the next response.
                startPos = g + lastTag.Length;
            }
        }
    }
}

The problem is in the last method, GetResponsers. This is one line of many in the contents text, from which I want to extract specific text:

<font size="2" face="Arial" color="#000099">&nbsp;&nbsp;<FONT CLASS='text16b'>43.יהי זכרו ברוך, שימליץ עלינו מלמעלה.&nbsp;&nbsp;</font><br>

What i want to extract is only this part:

43.יהי זכרו ברוך, שימליץ עלינו מלמעלה.

Including the points (dots) and including the number, in this case 43. But what I get in the variable responser is:

אזכרה במלאת שנתיים לפטירתו של אבי מורי ז''ל הרב ישעיהו רוטר.

How can i extract also the number and the dot/point near it: 43. ? So what i will get in responser will be:

43.יהי זכרו ברוך, שימליץ עלינו מלמעלה.

How can i do it using the code i already have in GetResponsers ?

I tried to use a loop:

// First attempt at extracting every response: the single-shot extraction wrapped in a loop.
// As written it cannot work: every search starts at the beginning of the text, so each
// pass finds the same first match, and the loop has no exit condition.
private void GetResponsers(string contents)
        {
            // There is no break/return in the body, so this loop never ends on its own.
            while(true)
            {
            string firstTag = "<FONT CLASS='text16b'>";
            string lastTag = "&n";
            // No start index is passed, so f is the position of the FIRST occurrence on every pass.
            int f = contents.IndexOf(firstTag);
            // If firstTag is absent, f is -1 and this call throws ArgumentOutOfRangeException.
            int g = contents.IndexOf(lastTag, f);
            // Text between the end of firstTag and the start of lastTag.
            string responser = contents.Substring(f + firstTag.Length, g - f - firstTag.Length);
            // NOTE(review): UsersRespones is not declared in this snippet — presumably a List<string> field; confirm.
            UsersRespones.Add(responser);
            }

        }

But the List UsersRespones then contains more than 1000 items, all of them the same string. It keeps extracting from the same position in the text.

How can I make the loop extract the next string each time, at the next place where the tags appear?

OK, this is the block of text I should extract from on each iteration of the loop inside the GetResponsers method:

SIZE="2" FACE="Arial" color="#000099"><a href="#19"><font color=''>שנתיים?</font></a></font></td>
            <td align="center" nowrap><font SIZE="1" 
               FACE="Arial" color="#000099">אפריאט</font></td>
            <td align="center" nowrap><font SIZE="1" 
                FACE="Arial" color="#000099">16.06.14 <font SIZE="1" 
                FACE="Arial" color="red">18:30</font></td>
            <td align="center" nowrap><font SIZE="1" 
               FACE="Arial" color="#000099">19</font></td>
            </tr>

From this block i should get some things:

  1. The number in this case #19 and add it to a List

  2. The text in this case: שנתיים? and also add it to a List

  3. The date in this case: 16.06.14 and add to a List

  4. The time in this case: 18:30 and also to a List

And then, in the next loop iteration, the next block in the whole contents variable, and so on until the end. Only when it finishes should it go back to the Links method, download the next content and loop over it again, and so on for all the links in the Links method.

And i prefer to do it all using my code in GetResponsers using IndexOf and Substring.

EDIT

Tried this:

// Second attempt: remembers where the previous match ended (startPos) so that each pass
// looks for the next occurrence. It still never checks whether a search failed, which is
// what produces the exception once the last response has been consumed.
private void GetResponsers(string contents)
        {
            // Position from which the next search for firstTag begins.
            int startPos = 0;
            // No break/return in the body: the loop only ends when an exception is thrown.
            while(true)
            {
            string firstTag = "<FONT CLASS='text16b'>";  
            string lastTag = "&n";
            // Returns -1 when there is no further firstTag at or after startPos.
            int f = contents.IndexOf(firstTag, startPos);
            // With f == -1 the start index is negative and IndexOf throws ArgumentOutOfRangeException.
            int g = contents.IndexOf(lastTag, f);
            // Resume after the terminator on the next pass (g is not checked for -1 either).
             startPos = g + lastTag.Length;
            // Text between the end of firstTag and the start of lastTag.
            string responser = contents.Substring(f + firstTag.Length, g - f - firstTag.Length);
            // NOTE(review): UsersRespones is not declared in this snippet — presumably a List<string> field; confirm.
            UsersRespones.Add(responser);
            }

        }

But I am getting an exception on the line:

int g = contents.IndexOf(lastTag, f);

Index was out of range. Must be non-negative and less than the size of the collection

This started happening after adding startPos.

  • The call of *contents.Substring(...)* should provide you with the string including the "43.". Either the *GetResponsers* method in your question is not the same as in your actual code (and thus does not illustrate the problem), or the *contents* string passed to *GetResponsers* does not contain "43."... –  Jun 17 '14 at 12:32
  • I know the problem now. The variable cont download the link cont = contents.DownloadString(FilteredLinks[i]); and then in the GetResponsers im getting only the first response then it's just continue to the next link and make another download and each time getting only the first response. FilteredLinks contain 50 links. – user3747465 Jun 17 '14 at 12:42
  • What i need is somehow to make loop maybe with while and extract in GetResponsers method all the responses and only when finish get all of them to contine and download the next link. How can i use a loop like while inside the GetResponsers method ? – user3747465 Jun 17 '14 at 12:43
  • Regarding the edit in your question: Use something like contents.IndexOf(firstTag, startPos) in the while loop. In the beginning, startPos has to be 0. After the *lastTag* position (g) has been found, set startPos to `g + lastTag.Length` as the starting position for the next iteration. If either the firstTag or the lastTag have not been found by the string.IndexOf methods, leave the method with a simple "return" statement. –  Jun 17 '14 at 12:52
  • elgonzo sorry, edited my question just now again sorry. Does you suggestion also will be good to my edited question ? I mean the same idea ? – user3747465 Jun 17 '14 at 13:01
  • elgonzo please look at my question i used now startPos but getting exception. For now im using the original first tag and end tag but once i added the startPos im getting exception. – user3747465 Jun 17 '14 at 13:06
  • 1
    I would suggest you rethink your approach of how you want to process responses. Your GetResponsers method looks for a string like `"<FONT CLASS='text16b'>"`, but the HTML snippet at the end of your question does not even contain such a string. You probably need to contemplate about actually parsing the HTML instead of doing some hard-wired string.IndexOf calls. ([check here about HTML parsing](http://stackoverflow.com/questions/56107/what-is-the-best-way-to-parse-html-in-c)). –  Jun 17 '14 at 13:08
  • The exception you get has **nothing** to do with startPos. Read my last two comments again, carefully. Use the debugger to figure out what is going on there regarding the exception. Examine variable values and return values of method calls. (This kind of discussion can be avoided and such problems much quicker resolved by yourself if you just use the debugger properly...) –  Jun 17 '14 at 13:08
  • elgonzo it's working now. I used break; instead return; checking if the variable f is -1 and now it's working no problems. – user3747465 Jun 17 '14 at 13:18

1 Answer

0

Try this algorithm:

    // Sample input: one table row holding, in order, an anchor with the response number
    // and title, the author, the date with the time nested inside it, and a counter.
    string contents = "<tr><font SIZE=\"2\" FACE=\"Arial\" color=\"#000099\">" +
                    "<a href=\"#19\"><font color=''>שנתיים?</font></a></font></td>" +
                    "<td align=\"center\" nowrap>" +
                    "<font SIZE=\"1\" FACE=\"Arial\" color=\"#000099\">אפריאט</font></td>" +
                    "<td align=\"center\" nowrap>" +
                    "<font SIZE=\"1\" FACE=\"Arial\" color=\"#000099\">16.06.14 " +
                    "<font SIZE=\"1\" FACE=\"Arial\" color=\"red\">18:30</font></td>" +
                    "<td align=\"center\" nowrap>" +
                    "<font SIZE=\"1\" FACE=\"Arial\" color=\"#000099\">19</font></td></tr>";

    // Flat result list. Every successfully parsed row appends exactly six entries:
    // href value, title, author, date, time, counter.
    List<string> myList = new List<string>();

    const string rowFontToken = "<font SIZE=\"2\" FACE=\"Arial\" color=\"#000099\">";
    const string hrefToken = "href=\"";
    const int fontItemCount = 5;
    // Index of the date cell: its text ends at the nested <font> that wraps the time,
    // not at a closing </font>.
    const int dateItemIndex = 2;

    // Ordinal comparison throughout: the tokens are markup, not linguistic text.
    int tableRowIndex = contents.IndexOf("<tr>", StringComparison.Ordinal);

    while (tableRowIndex > -1)
    {
        int tableRowEndIndex = contents.IndexOf("</tr>", tableRowIndex, StringComparison.Ordinal);
        if (tableRowEndIndex < 0)
        {
            // Unterminated row: stop instead of passing -1 to the next IndexOf.
            break;
        }

        // Work on the row only, so a malformed row can never pick up tags from later rows.
        string row = contents.Substring(tableRowIndex, tableRowEndIndex - tableRowIndex);

        string anchorHrefNumber = null;
        string[] fontItems = new string[fontItemCount];
        int parsedItems = 0;

        // Each lookup is attempted only when the previous one succeeded.
        int rowFontIndex = row.IndexOf(rowFontToken, StringComparison.Ordinal);
        int anchorIndex = (rowFontIndex < 0) ? -1 : row.IndexOf("<a", rowFontIndex, StringComparison.Ordinal);
        int anchorHrefIndex = (anchorIndex < 0) ? -1 : row.IndexOf(hrefToken, anchorIndex, StringComparison.Ordinal);
        int anchorHrefNumIndex = (anchorHrefIndex < 0) ? -1 : anchorHrefIndex + hrefToken.Length;
        int hrefCloseIndex = (anchorHrefNumIndex < 0) ? -1 : row.IndexOf("\"", anchorHrefNumIndex, StringComparison.Ordinal);

        if (hrefCloseIndex > -1)
        {
            anchorHrefNumber = row.Substring(anchorHrefNumIndex, hrefCloseIndex - anchorHrefNumIndex);

            // Position from which the next <font ...> tag is searched.
            int cursor = anchorHrefIndex;

            for (int i = 0; i < fontItemCount; i++)
            {
                int fontIndex = row.IndexOf("<font", cursor, StringComparison.Ordinal);
                if (fontIndex < 0)
                {
                    break;
                }

                int tagCloseIndex = row.IndexOf(">", fontIndex, StringComparison.Ordinal);
                if (tagCloseIndex < 0)
                {
                    break;
                }

                int textStart = tagCloseIndex + 1;
                string terminator = (i == dateItemIndex) ? "<font" : "</font>";
                int textEnd = row.IndexOf(terminator, textStart, StringComparison.Ordinal);
                if (textEnd < 0)
                {
                    break;
                }

                // Trim so the date does not keep the blank that precedes the nested time tag.
                fontItems[i] = row.Substring(textStart, textEnd - textStart).Trim();

                // For the date cell textEnd points at the nested <font>, so the next
                // pass picks that tag up and reads the time from it.
                cursor = textEnd;
                parsedItems++;
            }
        }

        // Only complete rows are stored, which keeps the six-entries-per-row layout intact.
        if (parsedItems == fontItemCount)
        {
            myList.Add(anchorHrefNumber);
            myList.AddRange(fontItems);
        }

        tableRowIndex = contents.IndexOf("<tr>", tableRowEndIndex, StringComparison.Ordinal);
    }