I am trying to build a web scraper in Python with the BeautifulSoup library. I want to get information from all pages of a Bitcoin forum topic. I am using the following code to get the username, status, date and time of post, post text, activity, and merit from the forum topic https://bitcointalk.org/index.php?topic=2056041.0
# Forum topic page that the script below downloads and parses.
url='https://bitcointalk.org/index.php?topic=2056041.0'
from bs4 import BeautifulSoup
import requests
import re
def get_html(url, timeout=30):
    """Download *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            an error page is never parsed as if it were the real content.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text
# Fetch the page, parse it with the lxml backend, and keep every <td> whose
# valign attribute is "top"; the extraction loops below walk these cells.
html = get_html(url)
soup = BeautifulSoup(html, 'lxml')
results = soup.find_all("td", valign="top")
# Collect a user name from every cell whose first <b> element contains the
# text 'View the profile of'. The name is whatever sits between 'of ' and the
# closing '">' in that element's markup; purely numeric matches are skipped.
usernames = []
for cell in results:
    bold_tags = cell.find_all('b')
    if not bold_tags:
        # No <b> element in this cell, so there is nothing to inspect.
        continue
    markup = str(bold_tags[0])
    if 'View the profile of' not in markup:
        continue
    match = re.search('of (.+?)">', markup)
    if match is None:
        # Report the unexpected markup instead of hiding it behind a
        # blanket exception handler.
        print('could not extract a username from:', markup)
        continue
    found = match.group(1)
    if not found.isdigit():
        usernames.append(found)
print(len(usernames))
# Collect the status text from each cell: the first run of characters that
# follows a space and ends at '<br/>' in the serialized "smalltext" divs.
# Matches of 25 characters or more are discarded.
status = []
for cell in results:
    info_divs = cell.find_all("div", {"class": "smalltext"})
    match = re.search(' (.+?)<br/>', str(info_divs))
    if match is None:
        # Nothing matched in this cell; an explicit check replaces the old
        # bare ``except`` that also swallowed KeyboardInterrupt/SystemExit.
        continue
    found = match.group(1)
    if len(found) < 25:
        status.append(found)
print(len(status))
# Collect the text that follows the first 'Activity: ' marker (up to the next
# '<br/>') in the serialized "smalltext" divs of each cell.
activity = []
for cell in results:
    info_divs = cell.find_all("div", {"class": "smalltext"})
    pieces = str(info_divs).split('Activity: ')
    if len(pieces) < 2:
        # Marker absent in this cell; skip it explicitly rather than relying
        # on a silently swallowed IndexError.
        continue
    activity.append(pieces[1].split('<br/>')[0])
print(activity)
print(len(activity))
# Collect the markup that follows the first '="post">' marker in each cell's
# serialized "post" divs, cut at the list-closing '</div>]'. Purely numeric
# results are skipped.
posts = []
for cell in results:
    post_divs = cell.find_all("div", {"class": "post"})
    pieces = str(post_divs).split('="post">')
    if len(pieces) < 2:
        # No post div in this cell; skip it explicitly rather than relying
        # on a silently swallowed IndexError.
        continue
    body = pieces[1].split('</div>]')[0]
    if not body.isdigit():
        posts.append(body)
print(len(posts))
I feel that this is a very ugly and incorrect solution, using re, all these try/except blocks, and so on. Is there a more straightforward and elegant solution for this task?