
I'm trying to create a web-scraping tool that updates a DataFrame with data from multiple tables. The page I'm working on has a base table in which every row contains a link that directs you to a new URL holding a secondary table with the data I'm looking for. My objective is to build a single DataFrame covering all the data present in all the secondary tables on the site. The problem is that each secondary table can have a different set of columns from the previous one, depending on whether it has a value for a given column, and I cannot know all the possible columns in advance.

I've tried multiple solutions. What I'm working on at the moment is a for loop that repeatedly creates a new DataFrame from each new table and merges it with the previous one. But I'm stuck on merging the two DataFrames on all the columns they have in common. Please forgive any amateur mistakes; I've only been using Python for a week.


import requests
import pandas as pd
from bs4 import BeautifulSoup

#create the main DataFrame
link1 = links[0]
url_linked = url_l + link1
page_linked = requests.get(url_linked)
soup_linked = BeautifulSoup(page_linked.text, 'lxml')
table_linked = soup_linked.find('table', class_="XXXXX")

headers_link=[]
headers_unique=[]
for i in table_linked.find_all('th'):
    title_link = i.text.strip()
    headers_link.append(title_link)

headers_unique=headers_link
mydata_link = pd.DataFrame(columns=headers_link)


count = 1

for link in links:
    url_linked = url_l + link
    page_linked = requests.get (url_linked)
    soup_linked = BeautifulSoup(page_linked.text,'lxml')
    table_linked= soup_linked.find('table', class_="table table-directory-responsive")

    row2=[]
    n_columns =len(table_linked.find_all('th'))

    #populating the main dataframe

    if count == 1:
        for j in table_linked.find_all('tr'):
            row_data=j.find_all('td')
            row=[i.text for i in row_data]
            row2.append(row)
        
        length_link = len(mydata_link)
        row2.remove(['']) #To get rid of empty rows that have no th
        mydata_link.loc[length_link] = row2
        print(mydata_link)

        print('Completed link ' + str(count))
        count = count + 1


    #creating the secondary Dataframe
    else:      
        headers_test=[]
        for i in table_linked.find_all('th'):
            title_test = i.text.strip()
            headers_test.append(title_test)

        mydata_temp=pd.DataFrame(columns=headers_test)

        for j in table_linked.find_all('tr'):
            row_data=j.find_all('td')
            row=[i.text for i in row_data]
            row2.append(row)
        
        length_link = len(mydata_link)
        row2.remove(['']) #To get rid of empty rows that have no th
        mydata_temp.loc[length_link] = row2
        print(mydata_temp)
        

        #merge the two DataFrames based on the unique set of columns they both have

        headers_unique= set(headers_unique).intersection(headers_test)
        mydata_link=mydata_link.merge(mydata_temp, on=[headers_unique], how='outer')
        print(mydata_link)
        print('Completed link '+ str(count))
        count= count+1

What I need is basically a function that, given these two sample DataFrames:

A B C
1 2 3

C A D E
4 5 6 7

will return the following DataFrame:

A  B    C  D    E
1  2    3  NaN  NaN
5  NaN  4  6    7
  • could you include some of the tables for example and a sample of how you would like the merged table to look based on those? – Driftr95 Jan 08 '23 at 01:47
  • https://stackoverflow.com/questions/20109391/how-to-make-good-reproducible-pandas-examples – gregV Jan 08 '23 at 04:27

1 Answer


Just concatenating all the secondary tables should do it - build a list of all the secondary DataFrames and then call pd.concat(dfList).
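
For instance, with the two sample frames from the question, here's a minimal, self-contained sketch of that approach:

import pandas as pd

df1 = pd.DataFrame([[1, 2, 3]], columns=['A', 'B', 'C'])
df2 = pd.DataFrame([[4, 5, 6, 7]], columns=['C', 'A', 'D', 'E'])

## concat aligns on column names; cells missing from either frame become NaN
combined = pd.concat([df1, df2], ignore_index=True)
print(combined)
##    A    B  C    D    E
## 0  1  2.0  3  NaN  NaN
## 1  5  NaN  4  6.0  7.0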

Btw, have you considered just using .read_html instead of looping through the cells?
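
For example, a rough sketch (assuming page_linked holds the response for one of the secondary pages, as in your loop):

all_tables = pd.read_html(page_linked.text)  ## parses every <table> on the page into a list of DataFrames
tempDf = all_tables[0]  ## assumes the table you want is the first one; adjust the index as needed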

#create the main DataFrame
link1 = links[0]
url_linked = url_l + link1
page_linked = requests.get (url_linked)
soup_linked = BeautifulSoup(page_linked.text, 'lxml')
table_linked = soup_linked.find('table', class_="XXXXX")

if table_linked:
    primaryDf = pd.read_html(table_linked.prettify())[0]
    headers_link = [h.get_text(' ').strip() for h in table_linked.find_all('th')]
    dfList = [pd.DataFrame(columns=headers_link if headers_link else primaryDf.columns)]
else: primaryDf, dfList = None, []  

count = 0
for link in links:
    count += 1
    url_linked = url_l + link
    page_linked = requests.get (url_linked)
    soup_linked = BeautifulSoup(page_linked.text, 'lxml')
    table_linked = soup_linked.find('table', class_="table table-directory-responsive")

    if not table_linked:
        ## to see if any response errors or redirects
        print(f'[{page_linked.status_code} {page_linked.reason} from {page_linked.url}]')   

        ## print error message and move to next link
        print(f'Found no tables with required class at link#{count}', url_linked)
        continue
    tempDf = pd.read_html(table_linked.prettify())[0] ## read table as df [if found]

    ## get rid of empty rows and empty columns
    tempDf = tempDf.dropna(axis='rows', how='all').dropna(axis='columns', how='all')
    dfList.append(tempDf.loc[:]) ## .loc[:] to append a copy, not original (just in case)
    print(f'Completed link#{count} with {len(tempDf)} rows from {url_linked}')
combinedDF = pd.concat(dfList)
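
If the row labels from the individual tables aren't meaningful, passing ignore_index=True to that last pd.concat call will give combinedDF a clean 0..n-1 index instead of repeating each table's own row numbers.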