The table extraction is mostly answered here, although not the column names.
I have used the same approach, but since you are using some older libraries (e.g. `urllib`), this is a more modern way to do it.
I have also used `pandas` to parse the table and then export it to JSON easily.
# These libraries are easiest
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd
# Download page.
# A browser-like User-Agent header is sent with the request. The timeout keeps
# the script from hanging forever on an unresponsive server, and
# raise_for_status() turns an HTTP error response into an exception instead of
# letting an error page be parsed as if it were the data.
req = requests.get(
    'https://www.worldometers.info/gdp/albania-gdp/',
    headers={'User-Agent': 'Mozilla/5.0'},
    timeout=30,
)
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')

# Extract table.
# The table is located by its full class string; fail with a clear message if
# it (or its <tbody>) is missing rather than with an AttributeError on None.
gdp_historic = soup.find(
    'table',
    class_='table table-striped table-bordered table-hover table-condensed table-list')
if gdp_historic is None:
    raise RuntimeError('GDP table not found in the downloaded page')
table_body = gdp_historic.find('tbody')
if table_body is None:
    raise RuntimeError('GDP table has no <tbody> element')

# Build one list of stripped cell texts per table row.
data = []
rows = table_body.find_all('tr')
for row in rows:
    cols = [ele.text.strip() for ele in row.find_all('td')]
    data.append([ele for ele in cols if ele])  # Get rid of empty values

# Extract column names from the header cells (text is used as-is, so any
# surrounding whitespace in the page's headings is kept in the JSON keys).
colnames = [heading.text for heading in gdp_historic.find_all('th')]

# Convert to json.
# The result is bound to a name and printed so it is also visible when this is
# run as a script, not only when pasted into an interactive session.
gdp_json = pd.DataFrame(data, columns=colnames).to_json()
print(gdp_json)
Output:
'{"Year":{"0":"2017","1":"2016","2":"2015","3":"2014","4":"2013","5":"2012","6":"2011","7":"2010","8":"2009","9":"2008","10":"2007","11":"2006","12":"2005","13":"2004","14":"2003","15":"2002","16":"2001","17":"2000","18":"1999","19":"1998","20":"1997","21":"1996","22":"1995","23":"1994"},"GDP Nominal (Current USD) ":{"0":"$13,038,538,300","1":"$11,883,682,171","2":"$11,386,931,490","3":"$13,228,247,844","4":"$12,776,280,961","5":"$12,319,784,886","6":"$12,890,866,743","7":"$11,926,957,255","8":"$12,044,208,086","9":"$12,881,353,508","10":"$10,677,324,144","11":"$8,896,072,919","12":"$8,052,073,539","13":"$7,184,685,782","14":"$5,611,496,257","15":"$4,348,068,242","16":"$3,922,100,794","17":"$3,480,355,258","18":"$3,212,121,651","19":"$2,545,964,541","20":"$2,258,513,974","21":"$3,199,641,336","22":"$2,392,764,853","23":"$1,880,951,520"},"GDP Real (Inflation adj.) ":
<truncated>