A direct copy-paste of it is
/// <summary>
/// Lazily yields one <see cref="LRThread"/> for every thread row found in
/// <paramref name="doc"/>. Rows whose thread id cannot be extracted from the
/// title link's href are skipped. <c>Posts</c> is not populated here.
/// </summary>
/// <param name="doc">Page HTML document to scan for thread rows.</param>
/// <returns>
/// A deferred sequence: the rows are re-parsed, and new <see cref="LRThread"/>
/// instances created, every time the result is enumerated.
/// </returns>
public static IEnumerable<LRThread> GetPageThreads(HtmlDocument doc)
{
    var threadNodes =
        doc.DocumentNode
           .SelectNodes("//ul[@class='thread_list']/child::li[@class='row']");

    // HtmlAgilityPack's SelectNodes returns null (not an empty collection)
    // when the XPath matches nothing.
    if (threadNodes == null)
    {
        yield break;
    }

    foreach (var node in threadNodes)
    {
        // NOTE(review): if GetTitleLink / GetCreatorFromRow / GetDateTimeFromRow
        // use an XPath beginning with "//", the search starts at the document
        // root instead of at this row, so every row would resolve to the first
        // match in the document. Confirm they use a relative path (".//...").
        HtmlNode titleLink = GetTitleLink(node);
        if (titleLink == null)
        {
            continue;
        }

        string href = titleLink.GetAttributeValue("href", null);
        if (href == null)
        {
            continue;
        }

        var idMatches = ThreadIdUrlPart.Matches(href);
        if (idMatches.Count == 0)
        {
            continue;
        }

        // TryParse also covers values too large for an Int32.
        int id;
        if (!Int32.TryParse(idMatches[0].Groups[1].Value, out id))
        {
            continue;
        }

        var thread = new LRThread()
        {
            Id = id,
            Title = titleLink.InnerText,
            Creator = GetCreatorFromRow(node),
            Created = GetDateTimeFromRow(node),
            Deleted = false
        };
        yield return thread;
    }
}
and what I'm finding is that this returns only the very first item in threadNodes,
repeated once for each node, rather than returning the rest of the items. Am I doing something wrong?
EDIT: I'm using the method like this:
// Integration test (snippet is truncated in the post): fetches board page 0
// and checks that at least one thread can be read from it.
[TestMethod]
[IntegrationTest]
public void FirstPageScanAndSaveTest()
{
HtmlDocument doc = BoardScanner.GetBoardPage(0);
Assert.IsNotNull(doc, "Couldn't get HTML document for first page.");
// GetPageThreads returns an IEnumerable; Any() below enumerates it.
var threads = BoardScanner.GetPageThreads(doc);
Assert.IsTrue(threads.Any(), "Couldn't get any threads");
EDIT: Full code dump. This is absolutely insane what is happening.
/// <summary>
/// Builds an <see cref="LRThread"/> from a single thread row.
/// <c>Posts</c> is not populated here.
/// </summary>
/// <param name="node">The row node describing one thread.</param>
/// <returns>
/// The parsed thread, or <c>null</c> when the thread id cannot be extracted
/// from the title link's href.
/// </returns>
private static LRThread ParseLRThread(HtmlNode node)
{
    // We expect to at least get the id of the thread, which is taken from the
    // href of the title link. Give up on this row if it cannot be found.
    HtmlNode titleLink = GetTitleLink(node);
    if (titleLink == null)
    {
        return null;
    }

    string href = titleLink.GetAttributeValue("href", null);
    if (href == null)
    {
        return null;
    }

    var idMatches = ThreadIdUrlPart.Matches(href);
    if (idMatches.Count == 0)
    {
        return null;
    }

    // TryParse also covers values too large for an Int32.
    int id;
    if (!Int32.TryParse(idMatches[0].Groups[1].Value, out id))
    {
        return null;
    }

    // Now that we've found the id, fill in all the other properties of the
    // thread besides Posts.
    // NOTE(review): the original comment says a missing property must not
    // break parsing; that depends on GetCreatorFromRow / GetDateTimeFromRow
    // tolerating missing data - confirm.
    var thread = new LRThread()
    {
        Id = id,
        Title = titleLink.InnerText,
        Creator = GetCreatorFromRow(node),
        Created = GetDateTimeFromRow(node),
        Deleted = false
    };
    return thread;
}
/// <summary>
/// Iterates through the threads on a given page. This will likely need to be updated.
/// IMPORTANT: The one field of each thread that is not set is Posts because we want
/// the consumer of this class to handle the way that posts are retrieved.
/// </summary>
/// <param name="doc">page html document</param>
/// <returns>
/// A deferred sequence of the threads that could be parsed; rows for which
/// ParseLRThread returns null are omitted. The rows are re-parsed, producing
/// new LRThread instances, on every enumeration, so callers that modify the
/// returned threads or enumerate more than once should materialize the
/// result first (e.g. with ToList()).
/// </returns>
public static IEnumerable<LRThread> GetPageThreads(HtmlDocument doc)
{
    var threadNodes =
        doc.DocumentNode
           .SelectNodes("//ul[@class='thread_list']/child::li[@class='row']");

    // HtmlAgilityPack's SelectNodes returns null (not an empty collection)
    // when the XPath matches nothing.
    if (threadNodes == null)
    {
        return Enumerable.Empty<LRThread>();
    }

    return threadNodes
        .Select(node => ParseLRThread(node))
        // Drop rows whose id could not be parsed so consumers never see null.
        .Where(thread => thread != null);
}
and my test is
// Integration test: scans the first board page, validates the parsed threads,
// loads the first page of posts for the first thread, and saves everything.
[TestMethod]
[IntegrationTest]
public void FirstPageScanAndSaveTest()
{
    HtmlDocument doc = BoardScanner.GetBoardPage(0);
    Assert.IsNotNull(doc, "Couldn't get HTML document for first page.");

    // Materialize once. GetPageThreads returns a deferred sequence, so every
    // separate enumeration would re-parse the page and create new LRThread
    // objects; the Posts assigned to the first thread below would then be
    // missing from the objects handed to Repo.AddOrUpdateThreads.
    var threads = BoardScanner.GetPageThreads(doc).ToList();
    Assert.IsTrue(threads.Any(), "Couldn't get any threads");
    CollectionAssert.AllItemsAreNotNull(threads.Select(t => t.Title).ToList(), "Couldn't parse at least one title");
    CollectionAssert.AllItemsAreNotNull(threads.Select(t => t.Creator).ToList(), "Couldn't parse at least one Creator");
    CollectionAssert.AllItemsAreNotNull(threads.Select(t => t.Created).ToList(), "Couldn't parse at least one date/time");
    CollectionAssert.AllItemsAreUnique(threads.Select(t => t.Id).ToList());

    var thread = threads.First();
    thread.Posts = BoardScanner.GetPostsFromThreadPage(thread, 0).ToList();
    Assert.IsTrue(thread.Posts.Any(), "Couldn't any posts from first page of thread");
    CollectionAssert.AllItemsAreNotNull(thread.Posts.Select(p => p.Poster).ToList(), "Couldn't get the poster for a post");
    CollectionAssert.AllItemsAreNotNull(thread.Posts.Select(p => p.BodyHTML).ToList(), "Couldn't get the html for the body of a post.");

    Repo.AddOrUpdateThreads(threads);
}