I was trying to make a retriever.
I used wget to get the website, and call out all the text.
I want to make a dict like
{'Activity':'index2.html','and':'index2.html','within':'index2.html',...}
{'Rutgers':'index.html','Central':'index.html','Service':'index,html',...}
but I got the output is
{'Activity':'i','and':'n','within':'d',...}
{'Rutgers':'i','Central':'n','Service':'d',...}
It split my filename.
import string
import os
from bs4 import BeautifulSoup as bs
from os import listdir
from os.path import isfile, join
#from os.path import isdir
mypath = "/Users/Tsu-AngChou/MasterProject/Practice/try_test/"
files = listdir(mypath)
translator = str.maketrans("","",string.punctuation)
storage = []
for f in files:
fullpath = join(mypath, f)
if f == '.DS_Store':
os.remove(f)
elif isfile(fullpath):
print(f)
for html_cont in range(1):
response = open(f,'r',encoding='utf-8')
html_cont = response.read()
soup = bs(html_cont, 'html.parser',from_encoding ='utf-8')
regular_string = soup.get_text()
new_string = regular_string.translate(translator).split()
new_list = [item[:14] for item in new_string]
a = dict(zip(new_list,f))
print(a)