I'm setting up a new scrapy spider and developed
I am using windows 10 and it's running. My problem is extracting text from different element. This elements sometime on (strong tag, p,) sometime have class , sometime have id but i need to implement to one element to extracting a row text.
Please checkout the link of site
https://exhibits.otcnet.org/otc2019/Public/eBooth.aspx?IndexInList=404&FromPage=Exhibitors.aspx&ParentBoothID=&ListByBooth=true&BoothID=193193&fromFeatured=1
https://exhibits.otcnet.org/otc2019/Public/eBooth.aspx?IndexInList=0&FromPage=Exhibitors.aspx&ParentBoothID=&ListByBooth=true&BoothID=202434
https://exhibits.otcnet.org/otc2019/Public/eBooth.aspx?IndexInList=1218&FromPage=Exhibitors.aspx&ParentBoothID=&ListByBooth=true&BoothID=193194&fromFeatured=1
https://prnt.sc/nkl1vc,
https://prnt.sc/nkl1zy,
https://prnt.sc/nkl247,
# -*- coding: utf-8 -*-
import scrapy
class OtcnetSpider(scrapy.Spider):
name = 'otcnet'
# allowed_domains = ['otcnet.org']
start_urls = ['https://exhibits.otcnet.org/otc2019/Public/Exhibitors.aspx?Index=All&ID=26006&sortMenu=107000']
def parse(self, response):
links = response.css('a.exhibitorName::attr(href)').extract()
for link in links:
ab_link = response.urljoin(link)
yield scrapy.Request(ab_link, callback=self.parse_p)
def parse_p(self, response):
url = response.url
Company = response.xpath('//h1/text()').extract_first()
if Company:
Company = Company.strip()
Country = response.xpath('//*[@class="BoothContactCountry"]/text()').extract_first()
State = response.xpath('//*[@class="BoothContactState"]/text()').extract_first()
if State:
State = State.strip()
Address1 = response.xpath('//*[@class="BoothContactAdd1"]/text()').extract_first()
City = response.xpath('//*[@class="BoothContactCity"]/text()').extract_first()
if City:
City = City.strip()
zip_c = response.xpath('//*[@class="BoothContactZip"]/text()').extract_first()
Address = str(Address1)+' '+str(City)+' '+str(State)+' '+str(zip_c)
Website = response.xpath('//*[@id="BoothContactUrl"]/text()').extract_first()
Booth = response.css('.eBoothControls li:nth-of-type(1)::text').extract_first().replace('Booth: ','')
Description = ''
Products = response.css('.caption b::text').extract()
Products= ', '.join(Products)
vid_bulien = response.css('.aa-videos span.hidden-md::text').extract_first()
if vid_bulien=="Videos":
vid_bulien = "Yes"
else:
vid_bulien = "No"
Video_present = vid_bulien
Conference_link = url
Categories = response.css('.ProductCategoryLi a::text').extract()
Categories = ', '.join(Categories)
Address = Address.replace('None','')
yield {
'Company':Company,
'Country':Country,
'State':State,
'Address':Address,
'Website':Website,
'Booth':Booth,
'Description':Description,
'Products':Products,
'Video_present':Video_present,
'Conference_link':Conference_link,
'Categories':Categories
}
I expect the output would be a row description from different element