
:)

I'm crawling with Selenium WebDriver. When I don't use yield it works fine, but when I use yield, the following error occurs. What should I do?

Traceback (most recent call last):
  File "c:\programdata\anaconda2\lib\site-packages\scrapy\utils\defer.py", line 45, in mustbe_deferred
    result = f(*args, **kw)
  File "c:\programdata\anaconda2\lib\site-packages\scrapy\core\spidermw.py", line 49, in process_spider_input
    return scrape_func(response, request, spider)
  File "c:\programdata\anaconda2\lib\site-packages\scrapy\core\scraper.py", line 146, in call_spider
    dfd.addCallbacks(request.callback or spider.parse, request.errback)
  File "c:\programdata\anaconda2\lib\site-packages\twisted\internet\defer.py", line 303, in addCallbacks
    assert callable(callback)

Here is the code:

# -*- coding: utf-8 -*-
import csv
import glob
import os
from time import sleep

import xlsxwriter
from openpyxl import Workbook
from scrapy import Spider, Request
from scrapy.selector import Selector
from selenium import webdriver

workbook = xlsxwriter.Workbook('arrays.xlsx')
worksheet = workbook.add_worksheet()

class LgSpider(Spider):
    name = 'lg'
    allowed_domains = ['naturecollection.co.kr/product/list.jsp?cate_seq=4']


    def start_requests(self):
        reader = csv.reader(open('urls1.csv'))

        for row in reader:
            url = row[0]
            # self.parse_detail(url)
            yield Request(url=url, callback=self.parse_detail(url))


    def parse_detail(self, url):

        self.driver = webdriver.Chrome('/webdrivers/chromedriver')
        self.driver.get(url)
        sleep(10)

        sel = Selector(text=self.driver.page_source)
        # sleep(2)
        # self.logger.info('Sleeping for 2 seconds.')
        response = url
        sub_kor = sel.xpath('//meta[@property="og:title"]/@content').extract()
        sub_en = sel.xpath('//*[@class="section fr"]//*[@class="subTit"]/text()').extract()
        highlight_1 = sel.xpath('//meta[@property="og:description"]/@content').extract()
        main = sel.xpath('//meta[@property="og:image"]/@content').extract()
        category_1 = sel.xpath('//*[@id="locationArea"]/div/a/text()').extract()
        category_2 = sel.xpath('//*[@id="locationArea"]/strong/text()').extract()
        # table = sel.xpath('//*[@id="specInfoLayer"]//td').extract()
        noop_originpirce = sel.xpath('//*[@class="section fr"]//*[@class="realCost"]/text()').extract()
        noop_real_price = sel.xpath('//*[@class="section fr"]//span[@class="cost"]/text()').extract()
        real_price = sel.xpath('//*[@class="colorChip optionList"]//input[@name="cost"]/@value').extract()
        stock_no = sel.xpath('//*[@class="colorChip optionList"]//*[contains(@id, "stock")]/@value').extract()
        options = sel.xpath('//*[@class="colorChip optionList"]//@title').extract()
        brand = sel.xpath('//span[@class="brand"]/text()').extract_first()
        rating = sel.xpath('//*[@class="starArea"]/span/text()').extract()
        description = sel.xpath('//*[@id="proExplain"]//p').extract()
        image_urls = sel.xpath('//*[@class="thumList"]/li/a/img/@src').extract()
        volume = sel.xpath('//*[@id="specInfoLayer"]//tbody/tr[1]/td/text()').extract()
        skin_type = sel.xpath('//*[@id="specInfoLayer"]//tbody/tr[2]').extract()
        expire_date = sel.xpath('//*[@id="specInfoLayer"]//tbody/tr[3]').extract()
        method = sel.xpath('//*[@id="specInfoLayer"]//tbody/tr[4]').extract()
        manufature = sel.xpath('//*[@id="specInfoLayer"]//tbody/tr[5]').extract()
        ingridient = sel.xpath('//*[@id="specInfoLayer"]//tbody/tr[6]').extract()



        for idx, option in enumerate(options):

            yield  {'Option': options[idx],
                   # 'A': a,
                   'Volume': volume,
                   'Skin_type': skin_type,
                   'Expire_date': expire_date,
                   'Method': method,
                   'Manufature': manufature,
                   'Url': url,
                   'Sub_kor': sub_kor,
                   'Sub_en': sub_en,
                   'Highlight': highlight_1,
                   'Noop_Origin_price': noop_originpirce,
                   'Noop_real_price': noop_real_price,
                   'Real_price': real_price[idx],
                   'Category_1': category_1,
                   'Category_2': category_2,
                   # 'Category_3': category_3,
                   # 'Category_4': category_4,
                   'Stock_no': stock_no,
                   'Description': description,
                   'Rating': rating,
                   'Ingridient': ingridient,
                   'Brand': brand,
                   # 'Ingridient_text': ingridient_text,
                   'Image_urls': image_urls,
                   # 'Table_dts': table_dts,
                   # 'Table_dds': table_dds,
                   # 'Options': options[idx],
                   # 'Brand': brand,
                   # 'Table' : table,
                   # 'Buyer_no': buyer_no,
                   # 'Repurchase' : repurchase,
                   'Main': main
                   }




    def close(self, reason):
        # pass
        csv_file = max(glob.iglob('*.csv'), key=os.path.getctime)

        wb = Workbook()
        ws = wb.active

        with open(csv_file, 'r') as f:
            for row in csv.reader(f):
                # row = row.encode('utf-8')
                try:
                    ws.append(row)
                except:
                    continue

        wb.save(csv_file.replace('.csv', '') + '.xlsx')

Thank you!

susim

2 Answers


I'm pretty sure that when you're using yield you need to iterate through the generator that you've created. Your generator here would be start_requests, which yields a request for each row from your reader.

So you will need to store your generator and then iterate over it. See the description of Python's yield in What does the "yield" keyword do?
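
To illustrate the point about generators (a minimal sketch, not part of the original answer): the body of a generator function does not run when you call it; it only runs as the generator is iterated.

def numbers():
    # Nothing in this body runs until the generator is iterated.
    for i in range(3):
        yield i

gen = numbers()      # creates a generator object; no body code has run yet
for value in gen:    # iteration drives execution up to each yield
    print(value)     # prints 0, 1, 2

Note that in Scrapy specifically, the framework itself iterates over what start_requests yields, so the spider author does not have to iterate it manually.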

Jim Factor

The error is telling you that something is not right with your callback.

You should delete the second url call in your callback; that is, pass the method itself instead of calling it. Your code will look like this:

 yield Request(url=url, callback=self.parse_detail)
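
If the original url is still needed inside the callback, it is already available as response.url, and extra per-request data can travel in the request's meta dict. A minimal sketch (the 'source_row' key is a hypothetical name, not from the original code):

def start_requests(self):
    with open('urls1.csv') as f:
        for row in csv.reader(f):
            # Pass the callback uncalled; attach extra data via meta.
            yield Request(url=row[0], callback=self.parse_detail,
                          meta={'source_row': row})

def parse_detail(self, response):
    url = response.url                        # the URL that was requested
    source_row = response.meta['source_row']  # data attached in start_requests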

Additionally, I would use a response object instead of a Selector.

Your code would look like this:

def parse_detail(self, response):
    sub_kor = response.xpath('//meta....')
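
Putting both fixes together, a minimal sketch of the reworked spider. This version assumes the page can be fetched by Scrapy's own downloader; if the site genuinely requires Selenium to render JavaScript, the driver logic would need to move into a downloader middleware, which this answer does not cover.

import csv
from scrapy import Spider, Request

class LgSpider(Spider):
    name = 'lg'

    def start_requests(self):
        with open('urls1.csv') as f:
            for row in csv.reader(f):
                # Pass the method itself; Scrapy calls it later with the response.
                yield Request(url=row[0], callback=self.parse_detail)

    def parse_detail(self, response):
        # A Scrapy response supports xpath() directly; no separate Selector needed.
        sub_kor = response.xpath('//meta[@property="og:title"]/@content').extract()
        yield {'Url': response.url, 'Sub_kor': sub_kor}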
V-cash