
I am trying to scrape data from a webpage that has drop-down values. Both url_list and all_urls produce exactly the URLs I want, but the code fails with a connection error every time.

Below is the code I have used:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
import pandas as pd
import json

all_urls = []
Data = []
url_list = []
url = 'https://www.sfma.org.sg/member/category/'
page = 'https://www.sfma.org.sg/member/info'

# The category list is embedded in the page as a JavaScript object
# ("var cObject = {...};"), so extract it with a regex, quote the bare
# keys, and parse the result as JSON.
text = requests.get(url).text
d = re.findall(r'var\s*cObject\s*=\s*(.*)\s*;', text)[0]
d = re.sub(r'(\w+)(?=:)', r'"\1"', d)
d = json.loads(d.replace("'", '"'))

# Build one URL per category permalink.
for c in d['category']:
    AC = [c['permalinks']]
    urls = url + AC[0]
    all_urls.append(urls)


for info in all_urls:
    pages = requests.get(info)
    soup = BeautifulSoup(pages.content, 'html.parser')
    script_sections = soup.find_all('script')
    for i in range(len(script_sections)):
        if len(script_sections[i].contents) >= 1:
            txt = script_sections[i].contents[0]
            # Member permalinks live inside inline JavaScript on the category page.
            pattern = re.compile(r'permalink:\'(.*?)\'')
            permlinks = re.findall(pattern, txt)
            for i in permlinks:
                href = "../info/{{permalink}}"
                href = href.split('{')[0] + i
                full_url = urljoin(page, href)
                url_list.append(full_url)
            # Visit every member URL collected so far and pull out the details.
            for m in url_list:
                entry = []
                Newpages = requests.get(m)
                soup_2 = BeautifulSoup(Newpages.content, 'html.parser')
                Member = soup_2.find_all('h5', attrs={'class': 'w3-text-sfma'})
                Member_Name = [Member_Name.text.strip() for Member_Name in Member]
                Details = soup_2.find_all('p')
                other_details = [other_details.text.strip() for other_details in Details]
                Details_final = other_details[1:9]
                Address = Details_final[0:4]
                Tel = [y for y in Details_final if y.startswith('Tel')]
                Fax = [m for m in Details_final if m.startswith('Fax')]
                Email = [n for n in Details_final if n.startswith('Email')]
                Website = [s for s in Details_final if s.startswith('Website')]
                entry.append(Member_Name)
                entry.append(Address)
                entry.append(Tel)
                entry.append(Fax)
                entry.append(Email)
                entry.append(Website)
                Data.append(entry)

The error I am getting is:

SysCallError: (10054, 'WSAECONNRESET')
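
WSAECONNRESET means the remote host dropped the connection mid-request, which often happens when a site receives many rapid, bare requests. A common mitigation is to route every request through a single requests.Session with a retry policy, a User-Agent header, and a short delay between calls. A minimal sketch (the retry count, backoff, delay, and header value here are arbitrary choices, not something taken from the code above):

import time
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry transient failures with exponential backoff (values are illustrative).
retries = Retry(total=5, backoff_factor=1,
                status_forcelist=[429, 500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))
session.headers.update({'User-Agent': 'Mozilla/5.0'})

def fetch(u):
    time.sleep(0.5)              # throttle so the server is not flooded
    return session.get(u, timeout=30)

Each requests.get(...) call in the script can then be replaced with fetch(...).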

1 Answer


Thanks everyone for all the suggestions and answers! I finally got almost everything in place. Below is the code I have used:

from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
import pandas as pd
import json
import requests


Data = []
url_list = []
url = "https://www.sfma.org.sg/member/category/"

# Extract the embedded "var cObject = {...};" JavaScript object and parse it as JSON.
text = requests.get(url).text
d = re.findall(r'var\s*cObject\s*=\s*(.*)\s*;', text)[0]
d = re.sub(r'(\w+)(?=:)', r'"\1"', d)
d = json.loads(d.replace("'", '"'))

# One URL per category, plus the list of category permalinks so they can be
# filtered out of the member permalinks later.
category_permalinks = [c['permalink'] for c in d['category'] if 'permalink' in c]
for c in d['category']:
    url_list.append(url + c['permalink'])

for m in url_list:
    Category_Name = m.rstrip('/').split('/')[-1]   # category permalink taken from its URL
    urls_list = []                                 # member links found on this category page
    pages = requests.get(m)
    soup = BeautifulSoup(pages.content, 'html.parser')
    script_sections = soup.find_all('script')
    for section in script_sections:
        if len(section.contents) >= 1:
            txt = section.contents[0]
            permlinks = re.findall(r'permalink:\'(.*?)\'', txt)
            for link in permlinks:
                # Skip permalinks that are categories rather than members.
                if link not in category_permalinks:
                    full_url = urljoin(url, "../info/" + link)
                    urls_list.append(full_url)

    # Visit each member page found on this category page and pull out its details.
    for n in urls_list:
        entry = []
        pages = requests.get(n)
        soup_2 = BeautifulSoup(pages.content, 'html.parser')
        Member = soup_2.find_all('h5', attrs={'class': 'w3-text-sfma'})
        Member_Name = [h.text.strip() for h in Member]
        Details = soup_2.find_all('p')
        other_details = [p.text.strip() for p in Details]
        Details_final = other_details[1:9]
        Address = Details_final[0:4]
        Tel = [t for t in Details_final if t.startswith('Tel')]
        Fax = [f for f in Details_final if f.startswith('Fax')]
        Email = [e for e in Details_final if e.startswith('Email')]
        Website = [w for w in Details_final if w.startswith('Website')]
        entry.append(Member_Name)
        entry.append(Address)
        entry.append(Tel)
        entry.append(Fax)
        entry.append(Email)
        entry.append(Website)
        entry.append(Category_Name)
        Data.append(entry)
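
pandas is imported above but never used; if the goal is a table, the collected Data list can be turned into a DataFrame in one step. A small sketch (the column names and output filename are just my own labels):

df = pd.DataFrame(Data, columns=['Member', 'Address', 'Tel', 'Fax',
                                 'Email', 'Website', 'Category'])
df.to_csv('sfma_members.csv', index=False)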

Thanks to All!!!
