1

I am scraping google scholar author profile data, there are more than 100k profile which i have to scrape but I am facing a 302 redirect status issue, it stops my crawler after 200-300 profile, I tried every solution from internet but I couldn't get success. i am using scrapy 1.7.3. Attached is the console log. output log

class GoogleauthorSpider(scrapy.Spider):
name = 'googleauthor'

allowed_domains = ['scholar.google.com']
start_urls = ['https://scholar.google.com/citations?view_op=search_authors&mauthors=machine+learning']
handle_httpstatus_list = [302]


def __init__(self):
    self.timepointer = int(time.clock())
    self.cnt = 0
    self.years = []


# parse is function to extract and parsed data
def parse(self,response):
    cookieJar = response.meta.setdefault('cookie_jar',CookieJar())
    cookieJar.extract_cookies(response, response.request)
    header = response.headers
    for author_sel in response.xpath('//div[@class="gsc_1usr"]'):

        link = author_sel.xpath(".//h3[@class='gs_ai_name']/a/@href").extract_first()
        url = response.urljoin(link)
        yield scrapy.Request(url,callback=self.parse_url_to_crawl,meta={'dont_redirect':True,'handle_httpstatus_list': [302]})

    if response.xpath("//button[@class='gs_btnPR gs_in_ib gs_btn_half gs_btn_lsb gs_btn_srt gsc_pgn_pnx']/@onclick").extract_first() != "":
        next_page_url= response.xpath("//button[@class='gs_btnPR gs_in_ib gs_btn_half gs_btn_lsb gs_btn_srt gsc_pgn_pnx']/@onclick").extract_first().replace("\\x3d","=").replace("\\x26","?")

    count=next_page_url.count('?')
    after_author = next_page_url.split("?")[count-1]
    start = next_page_url.split("?")[count]
    join_url = "&"+after_author+"&"+start
    url = "https://scholar.google.com/citations?view_op=search_authors&mauthors=machine+learning"+join_url
    logging.info("chala e bawa jee %s",url)
    if start is not None:
        yield scrapy.Request(url,meta={'dont_redirect':True,'handle_httpstatus_list': [302]})


def parse_url_to_crawl(self,response):
    url = response.url
    idx = url.find("user")
    user = url[idx+5:idx+17]
    yield scrapy.Request(url+'&cstart=0&pagesize=100',callback=self.parse_profile_content,meta={'offset':0,'user':user,'dont_redirect':True,'handle_httpstatus_list': [302]})


# get_user_profile_data
def parse_profile_content(self,response):
    items=[]
    url = response.url
    idx = url.find("user")
    userId = url[idx+5:idx+17]


    total_articles = response.meta.get("total_articles", 0)
    tyear = response.meta.get("years", 0)
    self.years = response.meta.get("allyears",[])


    offset = response.meta['offset']
    user = response.meta['user']

    if response.xpath("//div[@id='gsc_prf_in']/text()").extract_first() !="":
        name = response.xpath("//div[@id='gsc_prf_in']/text()").extract_first()

    if response.xpath("//div[@id='gsc_prf_ivh']/text()").extract_first() != "":
        email = response.xpath("//div[@id='gsc_prf_ivh']/text()").extract_first()

    if response.xpath("//div[@class='gsc_prf_il']/text()[1]").extract_first() != "":
        position1 = response.xpath("//div[@class='gsc_prf_il']/text()[1]").extract_first()
        position2 = response.xpath("//div[@class='gsc_prf_il']/a/text()").extract_first()
        position3 = response.xpath("//div[@class='gsc_prf_il']/text()[2]").extract_first()
        positions = str(position1)+' '+str(position2)+' '+str(position3)

    if response.xpath("//div[@id='gsc_prf_int']/a/text()").extract() !="":
        tags= response.xpath("//div[@id='gsc_prf_int']/a/text()").extract()

    if response.xpath("//tr/td[@class='gsc_rsb_std']/text()").extract() != "":
        publication_data = response.xpath("//tr/td[@class='gsc_rsb_std']/text()").extract()
        if len(publication_data[0]) !=0:
            logging.info('publication data %s',publication_data[0])
            citation= publication_data[0]

        if publication_data[1]:
            citation_2014= publication_data[1]

        if publication_data[2]:
            hindex= publication_data[2]

        if publication_data[3]:
            hindex_2014= publication_data[3]

        if publication_data[4]:
            iindex= publication_data[4]

        if publication_data[5]:
            iindex_2014= publication_data[5]



    tmp = response.xpath('//tbody[@id="gsc_a_b"]/tr[@class="gsc_a_tr"]/td[@class="gsc_a_t"]/a/text()').extract()
    year =  response.xpath("//span[@class='gsc_a_h gsc_a_hc gs_ibl']/text()").extract()

    total_articles += len(tmp)
    tyear += len(year)
    self.years.extend(year)

    item = GooglescholarItem()  


    if tmp:
        offset += 100
        yield scrapy.Request("https://scholar.google.com/citations?hl=en&user={user}&cstart={offset}&pagesize=100".format(offset=offset, user=user),callback=self.parse_profile_content, meta={'offset': offset, 'user': user,'total_articles': total_articles,'years':tyear,'allyears':self.years,'dont_redirect':True,'handle_httpstatus_list': [302]})

    else:
        item['name'] = name
        item['email'] = email
        item['position']=positions
        item['tags'] = tags
        item['citation']= citation
        item['citation_2014']= citation_2014
        item['hindex']= hindex
        item['hindex_2014']= hindex_2014
        item['iindex']= iindex
        item['iindex_2014']= iindex_2014
        item['totaltitle'] = total_articles
        item['maxyear'] = max(self.years)
        item['minyear'] = min(self.years)
        items.append(item)
        self.years = []
        yield item
0xwaleed
  • 124
  • 6

0 Answers0