2

A direct copy-paste of it is

    public static IEnumerable<LRThread> GetPageThreads(HtmlDocument doc)
    {
        var threadNodes =
              doc.DocumentNode
              .SelectNodes("//ul[@class='thread_list']/child::li[@class='row']");

        foreach(var node in threadNodes)
        {
            HtmlNode titleLink = GetTitleLink(node);

            int id;
            try
            {
                string str =
                    ThreadIdUrlPart
                    .Matches(titleLink.GetAttributeValue("href", null))[0]
                    .Groups[1]
                    .Value;
                id = Int32.Parse(str);
            }
            catch(NullReferenceException) { continue; }
            catch(FormatException) { continue; }   

            var thread = new LRThread()
            {
                Id = id,
                Title = titleLink.InnerText,
                Creator = GetCreatorFromRow(node),
                Created = GetDateTimeFromRow(node),
                Deleted = false
            };
            yield return thread;
        }

    }

and what I'm finding is that this returns only the very first item in threadNodes and returns as many copies of it rather than returning the rest of the items. Am I doing something wrong?

EDIT: I'm using the method like

    [TestMethod]
    [IntegrationTest]
    public void FirstPageScanAndSaveTest()
    {
        HtmlDocument doc = BoardScanner.GetBoardPage(0);
        Assert.IsNotNull(doc, "Couldn't get HTML document for first page.");
        var threads = BoardScanner.GetPageThreads(doc);
        Assert.IsTrue(threads.Any(), "Couldn't get any threads");

EDIT: Full code dump. This is absolutely insane what is happening.

    private static LRThread ParseLRThread(HtmlNode node)
    {
        // We expect to at least get the id of the thread. It is in the href of a 
        // <span> element class name 'post title'. Skip to next thread if we 
        // don't find it.
        HtmlNode titleLink = GetTitleLink(node);

        int id;
        try
        {
            string str =
                ThreadIdUrlPart
                .Matches(titleLink.GetAttributeValue("href", null))[0]
                .Groups[1]
                .Value;
            id = Int32.Parse(str);
        }
        catch (NullReferenceException) { return null; }
        catch (FormatException) { return null; }

        // Now that we've found the id, try to get all the other properties
        // of the thread besides Posts, but don't break if we can't find one of them.
        var thread = new LRThread()
        {
            Id = id,
            Title = titleLink.InnerText,
            Creator = GetCreatorFromRow(node),
            Created = GetDateTimeFromRow(node),
            Deleted = false
        };

        return thread;
    }

    /// <summary>
    /// Iterates through the threads on a give page. This will likely need to be updated.
    /// IMPORTANT: The one field of each thread that is not set is Posts because we want
    /// the consumer of this class to handle the way that posts are retrieved.
    /// </summary>
    /// <param name="doc">page html document</param>
    public static IEnumerable<LRThread> GetPageThreads(HtmlDocument doc)
    {
        return
            doc.DocumentNode
            .SelectNodes("//ul[@class='thread_list']/child::li[@class='row']")
            .Select(node => ParseLRThread(node));
    }

and my test is

    [TestMethod]
    [IntegrationTest]
    public void FirstPageScanAndSaveTest()
    {
        HtmlDocument doc = BoardScanner.GetBoardPage(0);
        Assert.IsNotNull(doc, "Couldn't get HTML document for first page.");
        var threads = BoardScanner.GetPageThreads(doc);
        Assert.IsTrue(threads.Any(), "Couldn't get any threads");
        CollectionAssert.AllItemsAreNotNull(threads.Select(t => t.Title).ToList(), "Couldn't parse at least one title");
        CollectionAssert.AllItemsAreNotNull(threads.Select(t => t.Creator).ToList(), "Couldn't parse at least one Creator");
        CollectionAssert.AllItemsAreNotNull(threads.Select(t => t.Created).ToList(), "Couldn't parse at least one date/time");
        CollectionAssert.AllItemsAreUnique(threads.Select(t => t.Id).ToList());
        var thread = threads.First();
        thread.Posts = BoardScanner.GetPostsFromThreadPage(thread, 0).ToList();
        Assert.IsTrue(thread.Posts.Any(), "Couldn't any posts from first page of thread");
        CollectionAssert.AllItemsAreNotNull(thread.Posts.Select(p => p.Poster).ToList(), "Couldn't get the poster for a post");
        CollectionAssert.AllItemsAreNotNull(thread.Posts.Select(p => p.BodyHTML).ToList(), "Couldn't get the html for the body of a post.");
        Repo.AddOrUpdateThreads(threads);
    }
user7127000
  • 3,143
  • 6
  • 24
  • 41
  • Nothing seems wrong with your code see what is inside `threadNodes` – Emad Nov 17 '16 at 06:06
  • Can we see how you're using the method? – Rob Nov 17 '16 at 06:14
  • Side note: you really should not post code that `catch(NullReferenceException) { continue; }` to public sites... people may think you have no idea what you are doing... – Alexei Levenkov Nov 17 '16 at 06:16
  • What's inside `threadNodes` is perfectly legit. It's that somehow when I enumerate it I only get the first element. Makes no sense. – user7127000 Nov 17 '16 at 06:18
  • @AlexeiLevenkov What do you mean by that? – user7127000 Nov 17 '16 at 06:19
  • @Rob sure, I'll post it – user7127000 Nov 17 '16 at 06:20
  • NRE is not an exception that happens in code that correctly handles null values (i.e. with `if` conditions). Ignoring it just hides run-time errors and almost as pad as [pokemon exception handling](http://www.dodgycoder.net/2011/11/yoda-conditions-pokemon-exception.html). You may want to read canonical [What is NRE and how to fix it](http://stackoverflow.com/questions/4660142/what-is-a-nullreferenceexception-and-how-do-i-fix-it) question on the topic. – Alexei Levenkov Nov 17 '16 at 06:43
  • It looks like you may be reusing `threads` which means you are iterating it many times, rather than just once. How ever, you only showed one usage if it (where is the usage demonstrating you are getting the first element repeatedly?) – Rob Nov 17 '16 at 07:06
  • @Rob I'm going to dump a bunch of code because this problem is killing me. I've been at it for 5 hours. Stay tuned for an edit. – user7127000 Nov 17 '16 at 07:23
  • it could also help to see the document you are trying to parse – slawekwin Nov 17 '16 at 07:30
  • @slawekwin here view-source:http://www.letsrun.com/forum/forum.php?board=1 – user7127000 Nov 17 '16 at 07:39
  • @user7127000 which part is returning the duplicates? What happens if you write `threads = BoardScanner.GetPageThreads().ToList()`? – Rob Nov 17 '16 at 08:16

1 Answers1

0

you can try like this. but I did not execute it myself.

{
    var thread = new LRThread()
    Id = id,
    Title = titleLink.InnerText,
    Creator = GetCreatorFromRow(node),
    Created = GetDateTimeFromRow(node),
    Deleted = false
}
slawekwin
  • 6,270
  • 1
  • 44
  • 57