
I am new to web scraping. I am scraping data from a website: I first scrape the hrefs from each listing page, then I go to each href and find the <p> tags inside the class 'address-data'. I want to store the first URL's <p> data in one row, the second URL's <p> data in the second row, and so on. My data is appended to 'myUrl'. I want to save the data to a CSV file, e.g. address, latitude, longitude, phone, email, and then start a new line for each listing.

Here is my code:

from bs4 import BeautifulSoup
import requests
import csv

myUrl=[]
urls = ["http://www.shaditayari.pk/s&category=326&location=266&a=true&paged{}".format(i) for i in range(1, 10)]  # make a url list and iterate over it
for url in urls:
    r = requests.get(url)
    print('idr1')
    soup = BeautifulSoup(r.text, "html.parser")
    for link in soup.find_all('a', {'main-link'}):
            iurl=link.get('href')  
            r = requests.get(iurl)
            print(iurl)
            soup = BeautifulSoup(r.content, "lxml")
            with open ('lhr.cv','wb') as file:
                divs = soup.find_all('div',attrs={"class":"address-data"})
                for div in divs:
                    myUrl.append(div.find('p').text)
                    #print(myUrl)
                    with open ('lhr.cv','w') as file:
                        writer=csv.writer(file)
                        for row in myUrl:
                                writer.writerow(row)                         

expected output:

9 Fane Road، Lahore 54000, Pakistan|1.561381309140028|74.31484723624567|042-37363901-9|gm@bestwesternlahore.com/sales@bestwesternlahore.com/  reservations@bestwesternlahore.com
1/E-3, Main Boulevard Gulberg III, Lahore|31.525700029363|74.34930089283|0305-2960614|https://www.facebook.com/pages/Zauk-Banquet-Hall/204612846290857
H. tech
  • Open the CSV **before** you `soup.find_all`. You want to open a file, and **then** write the data. – OneCricketeer Nov 03 '17 at 15:54
  • @cricket_007 I have opened it. – H. tech Nov 03 '17 at 15:57
  • Please could you give an example of a URL that you'd like to scrape without any Python code/formatting applied? – tktk234 Nov 03 '17 at 15:58
  • @cstaff91 it is http://www.shaditayari.pk/s&category=326&location=266&a=true&paged=1, http://www.shaditayari.pk/s&category=326&location=266&a=true&paged=2 and so on – H. tech Nov 03 '17 at 16:00
  • You did, but you opened the **same file** for **each** `div`. (There will only be the last div data in the file). If you want **all** the divs to be in a single file, you need to open the file before the loop – OneCricketeer Nov 03 '17 at 16:00
  • @cstaff91 It is http://www.shaditayari.pk/s&category=326&location=266&a=true&paged=1, http://www.shaditayari.pk/s&category=326&location=266&a=true&paged=2, and so on. When paged=1 there are hrefs like http://www.shaditayari.pk/businesses/best-western-hotel-2/, http://www.shaditayari.pk/businesses/zaibis-marriage-garden/, and http://www.shaditayari.pk/businesses/zauk-banquet-hall/, etc. There is actually pagination. – H. tech Nov 03 '17 at 16:07
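
Putting the comments together, a minimal sketch of the restructured question code: the CSV is opened once, before any scraping, so every listing lands in the same file, and each listing's <p> texts are written as one row. The main-link/address-data selectors are taken from the question and not verified against the live site, and the file name is corrected to lhr.csv:

from bs4 import BeautifulSoup
import requests
import csv

urls = ["http://www.shaditayari.pk/s&category=326&location=266&a=true&paged={}".format(i) for i in range(1, 10)]

# Open the file once, before the loops, so rows accumulate instead of being overwritten
with open('lhr.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for url in urls:
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        for link in soup.find_all('a', class_='main-link'):
            detail = BeautifulSoup(requests.get(link.get('href')).content, "lxml")
            # All <p> texts from this listing's address-data divs become ONE row
            row = [div.find('p').text for div in detail.find_all('div', class_='address-data')]
            writer.writerow(row)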

1 Answer


I've written this in Python 2, using XPaths (because I think they're cleaner and simpler to use for web scraping), but this code will get you your list of links:

#Load required libraries
import requests
from lxml import html
import pandas as pd

#Create base URL
url = "http://www.shaditayari.pk/?s&post_type=ait-item&a=true&paged="

#First, we want to work out the number of pages to scrape. We load any page and get the largest page number
page = requests.get(url+str(1))
tree = html.fromstring(page.content)
no_pages = tree.xpath("//nav/a[last()]/text()")[0] #This comes out as a list of two - we only want the first one

#Next, we want to scrape the links to each page with the address

links = []
names = []

for i in range(1,int(no_pages)+1):
    page = requests.get(url+str(i))
    tree = html.fromstring(page.content)
    page_links = tree.xpath("//div[@class = 'item-title']/a/@href")
    page_names = tree.xpath("//a/h3/text()")
    links = links + page_links
    names = names + page_names
    print i

address_links = {"Name": names,
                 "URL": links}

# Build the DataFrame from the dict, then write it out
pd.DataFrame(address_links).to_csv(u"address_links.csv", index=False)
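
At this point address_links.csv should hold one Name/URL pair per listing, and the address_links dict is what the next block iterates over.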

This code needs completing, with the remaining appends, the dictionary, and a line to create the CSV (a sketch of that completion follows the block), but it will get you your details:

address_list = []
latitude_list = []
longitude_list = []
telephone_list = []
email_list = []
webpage_list = []

counter = 0

for url in address_links["URL"]:
    page = requests.get("http://www.shaditayari.pk/businesses/rizwan-beyg/")
    tree = html.fromstring(page.content)
    address = tree.xpath("//div[@itemprop = 'streetAddress']/p/text()")
    if len(address) == 0:
        address = ""
    else:
        address = address[0]

    latitude = tree.xpath("//p/meta[@itemprop = 'latitude']/@content")
    if len(latitude) == 0:
        latitude = ""
    else:
        latitude = latitude[0]

    longitude = tree.xpath("//p/meta[@itemprop = 'longitude']/@content")
    if len(longitude) == 0:
        longitude = ""
    else:
        longitude = longitude[0]

    telephone = tree.xpath("//a[@class = 'phone']/text()")
    if len(telephone) == 0:
        telephone = ""
    else:
        telephone = telephone[0]

    email = tree.xpath("//a[@itemprop = 'email']/text()")
    if len(email) == 0:
        email = ""
    else:
        email = email[0]

    webpage = tree.xpath("//a[@itemprop = 'url']/@href")
    if len(webpage) == 0:
        webpage = ""
    else:
        webpage = webpage[0]

    address_list.append(address)
    #continue for others

    counter+=1
    print counter

address_details = {"Name": names,
                  "URL": links,
                  "Address": address_list,
                   #continue for others
                  }
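
For completeness, a sketch of the pieces the block above leaves out: the remaining appends (inside the loop, after address_list.append(address)), the finished dictionary, and the CSV write. The column names here are my own choice:

    latitude_list.append(latitude)
    longitude_list.append(longitude)
    telephone_list.append(telephone)
    email_list.append(email)
    webpage_list.append(webpage)

address_details = {"Name": names,
                   "URL": links,
                   "Address": address_list,
                   "Latitude": latitude_list,
                   "Longitude": longitude_list,
                   "Telephone": telephone_list,
                   "Email": email_list,
                   "Webpage": webpage_list}

pd.DataFrame(address_details).to_csv(u"address_details.csv", index=False)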

You might need to add in some unicode encoding before you turn it into a CSV. That's answered here.
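
In Python 2 that can be as simple as asking pandas to encode on the way out, i.e. adding encoding='utf-8' to the to_csv call above:

pd.DataFrame(address_details).to_csv(u"address_details.csv", index=False, encoding="utf-8")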

tktk234