
I am trying to build a web scraper to scrape jobs from internshala.com. I am using Go Colly to build the scraper. I visit every listing page and then visit the link of each job to scrape its data. Doing this sequentially scrapes almost all the links, but if I try Colly's parallel scraping, the number of links scraped decreases. I write all the data to a CSV file.

EDIT: My question is why this happens when scraping in parallel and how I can solve it (how can I scrape all the data even when scraping in parallel?). Or is there something else I am doing wrong that is causing the problem? A code review would be really helpful. Thanks :)

package main

import (
    "encoding/csv"
    "log"
    "os"
    "strconv"
    "sync"
    "time"

    "github.com/gocolly/colly"
)


func main() {
    parallel(10)
    seq(10)
}

I comment out one of the two functions before running for obvious reasons.

The parallel function:


func parallel(n int){
    start := time.Now()
    c := colly.NewCollector(
        colly.AllowedDomains("internshala.com", "https://internshala.com/internship/detail", 
        "https://internshala.com/internship/", "internshala.com/", "www.intershala.com"),
        colly.Async(true),
    )

    d := colly.NewCollector(
        colly.AllowedDomains("internshala.com", "https://internshala.com/internship/detail", 
        "https://internshala.com/internship/", "internshala.com/", "www.intershala.com"),
        colly.Async(true),
    ) 

    
    c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 4})
    d.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 4})

    fileName := "data.csv"
    file, err := os.Create(fileName)

    cnt := 0

    if err != nil{
        log.Fatalf("Could not create file, err: %q", err)
        return
    }

    defer file.Close() // close the file after the main routine exits

    writer := csv.NewWriter(file)
    defer writer.Flush()
    var wg sync.WaitGroup
    c.OnHTML("a[href]", func(e *colly.HTMLElement){

        if e.Attr("class") != "view_detail_button"{
            return
        }

        detailsLink := e.Attr("href")

        d.Visit(e.Request.AbsoluteURL(detailsLink))
        
    })
    
    d.OnHTML(".detail_view", func(e *colly.HTMLElement) {
        wg.Add(1)       

        go func(wg *sync.WaitGroup)  {
            writer.Write([]string{
                e.ChildText("span.profile_on_detail_page"),
                e.ChildText(".company_name a"),
                e.ChildText("#location_names a"),
                e.ChildText(".internship_other_details_container > div:first-of-type > div:last-of-type .item_body"),
                e.ChildText("span.stipend"),
                e.ChildText(".applications_message"),
                e.ChildText(".internship_details > div:nth-last-of-type(3)"),
                e.Request.URL.String(), 
            })
            wg.Done()
        }(&wg)
        

    })

    c.OnRequest(func(r *colly.Request) {
        
        log.Println("visiting", r.URL.String())
    })

    d.OnRequest(func(r *colly.Request) {
        
        log.Println("visiting", r.URL.String())
        cnt++
    })

    for i := 1; i < n; i++ {
        c.Visit("https://internshala.com/internships/page-"+strconv.Itoa(i))
    }

    c.Wait()
    d.Wait()
    wg.Wait()

    t := time.Since(start)

    log.Printf("time %v \n", t)
    log.Printf("amount %v \n", cnt)
    log.Printf("Scrapping complete")
    log.Println(c)

}

The seq function:

func seq(n int){
    start := time.Now()
    c := colly.NewCollector(
        colly.AllowedDomains("internshala.com", "https://internshala.com/internship/detail", 
        "https://internshala.com/internship/", "internshala.com/", "www.intershala.com"),
    )

    d := colly.NewCollector(
        colly.AllowedDomains("internshala.com", "https://internshala.com/internship/detail", 
        "https://internshala.com/internship/", "internshala.com/", "www.intershala.com"),
    ) 



    fileName := "data.csv"
    file, err := os.Create(fileName)

    cnt := 0

    if err != nil{
        log.Fatalf("Could not create file, err: %q", err)
        return
    }

    defer file.Close() // close the file after the main routine exits

    writer := csv.NewWriter(file)
    defer writer.Flush()

    c.OnHTML("a[href]", func(e *colly.HTMLElement){

        if e.Attr("class") != "view_detail_button"{
            return
        }

        detailsLink := e.Attr("href")

        d.Visit(e.Request.AbsoluteURL(detailsLink))
        
    })
    
    d.OnHTML(".detail_view", func(e *colly.HTMLElement) {
        
        
        writer.Write([]string{
            e.ChildText("span.profile_on_detail_page"),
            e.ChildText(".company_name a"),
            e.ChildText("#location_names a"),
            e.ChildText(".internship_other_details_container > div:first-of-type > div:last-of-type .item_body"),
            e.ChildText("span.stipend"),
            e.ChildText(".applications_message"),
            e.ChildText(".internship_details > div:nth-last-of-type(3)"),
            e.Request.URL.String(), 
        })
        

    })

    c.OnRequest(func(r *colly.Request) {
        
        log.Println("visiting", r.URL.String())
    })

    d.OnRequest(func(r *colly.Request) {
        
        log.Println("visiting", r.URL.String())
        cnt++
    })

    for i := 1; i < n; i++ {
        // Add URLs to the queue
        c.Visit("https://internshala.com/internships/page-"+strconv.Itoa(i))
    }

    t := time.Since(start)

    log.Printf("time %v \n", t)
    log.Printf("amount %v \n", cnt)
    log.Printf("Scrapping complete")
    log.Println(c)

}

Any help will be much appreciated. :)


1 Answer


Sorry for being late to the party, but I came up with a working solution to your problem. Let me show it:

package main

import (
    "encoding/csv"
    "fmt"
    "log"
    "os"
    "strconv"
    "strings"
    "time"

    "github.com/gocolly/colly/v2"
    "github.com/gocolly/colly/v2/queue"
)

func parallel(n int) {
    start := time.Now()
    cnt := 0
    q, err := queue.New(8, &queue.InMemoryQueueStorage{MaxSize: 1000}) // tried with up to 8 consumer threads
    if err != nil {
        log.Fatalf("Could not create queue, err: %q", err)
    }

    fileName := "data_par.csv"
    file, err := os.Create(fileName)
    if err != nil {
        log.Fatalf("Could not create file, err: %q", err)
        return
    }

    defer file.Close() // close the file after the main routine exits

    writer := csv.NewWriter(file)
    defer func() {
        writer.Flush()
        if err := writer.Error(); err != nil {
            panic(err)
        }
    }()

    c := colly.NewCollector(
        colly.AllowedDomains("internshala.com", "https://internshala.com/internship/detail",
            "https://internshala.com/internship/", "internshala.com/", "www.intershala.com"),
    )

    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
        if e.Attr("class") != "view_detail_button" {
            return
        }

        detailsLink := e.Attr("href")
        e.Request.Visit(detailsLink)
    })

    var mu sync.Mutex // the queue drives the collector from several goroutines, so guard the writer and the counter
    c.OnRequest(func(r *colly.Request) {
        mu.Lock()
        defer mu.Unlock()
        writer.Write([]string{r.URL.String()})
        cnt++
    })

    for i := 1; i < n; i++ {
        q.AddURL("https://internshala.com/internships/page-" + strconv.Itoa(i))
    }

    q.Run(c)

    t := time.Since(start)
    log.Printf("time: %v\tamount: %d\n", t, cnt)
}

func seq(n int) {
    start := time.Now()
    c := colly.NewCollector(
        colly.AllowedDomains("internshala.com", "https://internshala.com/internship/detail",
            "https://internshala.com/internship/", "internshala.com/", "www.intershala.com"),
    )

    fileName := "data_seq.csv"
    file, err := os.Create(fileName)

    cnt := 0

    if err != nil {
        log.Fatalf("Could not create file, err: %q", err)
        return
    }

    defer file.Close() // close the file after the main routine exits

    writer := csv.NewWriter(file)
    defer func() {
        writer.Flush()
        if err := writer.Error(); err != nil {
            panic(err)
        }
    }()

    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
        if e.Attr("class") != "view_detail_button" {
            return
        }

        detailsLink := e.Attr("href")
        e.Request.Visit(detailsLink)
    })

    c.OnRequest(func(r *colly.Request) {
        writer.Write([]string{r.URL.String()})
        cnt++
    })

    for i := 1; i < n; i++ {
        c.Visit("https://internshala.com/internships/page-" + strconv.Itoa(i))
    }

    t := time.Since(start)
    log.Printf("time: %v\tamount: %d\n", t, cnt)
}

func main() {
    fmt.Println("sequential")
    seq(6)
    fmt.Println(strings.Repeat("#", 50))
    fmt.Println("parallel")
    parallel(6)
}

The problem

After looking at your code, I think everything is implemented correctly. Sure, some things could be done in a better way, but at least as far as the concurrency is concerned everything is set up properly. Some aspects you could improve are listed below:

  1. Check the error returned by the CSV writer when flushing to the underlying file
  2. Use only one collector instead of two

Again, as I already said, these are only small refinements.

The actual problem

The actual problem is that when you fire off requests concurrently (and potentially in parallel), the colly collectors cannot keep up and some responses get lost. The loss gets worse as the number of requests grows.
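
If you want to confirm that responses are actually being dropped, one quick diagnostic is to register an OnError callback on both collectors and count the failures. This is only a sketch, meant to be pasted into the parallel function from your question (which already imports "log" and "sync"); c and d are the two async collectors defined there:

    // Diagnostic only: count responses that never reach OnHTML.
    var failMu sync.Mutex
    failed := 0

    onErr := func(r *colly.Response, err error) {
        failMu.Lock()
        failed++
        failMu.Unlock()
        log.Printf("request to %s failed with status %d: %v", r.Request.URL, r.StatusCode, err)
    }
    c.OnError(onErr)
    d.OnError(onErr)

    // After c.Wait() and d.Wait():
    log.Printf("failed requests: %d", failed)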

The easiest solution (IMO)

gocolly provides a Queue type that fits this use case very well. With it, you can be sure that every request is processed while the work is still done concurrently. The steps can be summarized as follows (a stripped-down skeleton follows the list):

  1. Instantiate a new queue with the New function provided by the queue sub-package. You have to specify the number of consumer threads and the storage backend (in our case an in-memory implementation is fine).
  2. Instantiate a default collector with all of the callbacks it needs.
  3. Invoke the AddURL method on the queue for each URL you want to scrape.
  4. Invoke the Run method, which sends the actual requests to the target URLs and waits for the responses.
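
Boiled down to just those four steps, the queue usage looks like the skeleton below (same imports as the full program above; the page range is only an example):

    q, err := queue.New(2, &queue.InMemoryQueueStorage{MaxSize: 1000}) // step 1: 2 consumer threads, in-memory storage
    if err != nil {
        log.Fatal(err)
    }

    c := colly.NewCollector() // step 2: register your OnHTML/OnRequest callbacks on c as usual

    for i := 1; i <= 5; i++ { // step 3: enqueue the listing pages
        q.AddURL("https://internshala.com/internships/page-" + strconv.Itoa(i))
    }

    if err := q.Run(c); err != nil { // step 4: blocks until every queued request has been handled
        log.Fatal(err)
    }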

Note that I simplified the solution you shared so as to focus only on the number of requests made by the two approaches. I didn't check the logic you wrote in the OnHTML callback, but I assumed it works.
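
If you also want the CSV columns from your original callback rather than just the visited URLs, here is a rough sketch (not something I ran) of how that extraction could be plugged into the queue-based collector. The selectors are copied verbatim from your question, and the mutex is there because the queue drives the collector from several goroutines while csv.Writer is not safe for concurrent use:

    var mu sync.Mutex // csv.Writer is not safe for concurrent use

    c.OnHTML(".detail_view", func(e *colly.HTMLElement) {
        mu.Lock()
        defer mu.Unlock()
        writer.Write([]string{
            e.ChildText("span.profile_on_detail_page"),
            e.ChildText(".company_name a"),
            e.ChildText("#location_names a"),
            e.ChildText(".internship_other_details_container > div:first-of-type > div:last-of-type .item_body"),
            e.ChildText("span.stipend"),
            e.ChildText(".applications_message"),
            e.ChildText(".internship_details > div:nth-last-of-type(3)"),
            e.Request.URL.String(),
        })
    })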

Let me know if this solves your issue or share how you were able to solve this problem, thanks!
