I am trying to perform BFS on Wikipedia pages. I believe I am implementing this correctly and in the best possible way run-time wise (keeping it to one thread), but it is taking quite a long time to find a connection between two articles. Here is my implementation:
marked = set()
queue = deque()
count = 0
def get_wikipedia_page(wiki_link):
url = BASE_URL + wiki_link
time.sleep(0.1)
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
return soup
def backtrace(parent, start, end):
boolean = False
ret = []
while boolean == False:
if end in parent:
ret.append(parent[end])
end = parent[end]
if end == start:
break;
ret.reverse()
return ret
def bfs(first_wiki_link, second_wiki_link):
print('Searching starting from ' + first_wiki_link + ' looking for ' + second_wiki_link + '...')
parent = {}
queue.append(first_wiki_link)
start_time = time.time()
#current_parent = first_wiki_link
while queue:
link = queue.pop()
current_parent = link
link_list = list(filter(lambda c: c.get('href') != None and c.get('href').startswith('/wiki/') and c.get('href') != '/wiki/Main_Page' and ':' not in c.get('href'), get_wikipedia_page(link).findAll('a')))
for link in link_list:
href = link.get('href')
if not marked.__contains__(href):
parent[href] = current_parent
marked.add(href)
queue.append(href)
if href == second_wiki_link:
ret = backtrace(parent, first_wiki_link, second_wiki_link)
print("connection found")
end_time = time.time()
print("found a connection in: " + str(end_time - start_time) + " seconds.")
print("the path is " + str(len(ret)) + " pages long")
print(ret)
return True
It takes, sometimes, a few minutes before finding a match. Is this expected due to how big wikipedia is? Or am I messing something up here and am performing it in a non optimal way? Or, could it be beautiful soup is running too slow?