I have a folder with around 50,000 HTML files. I'm trying to write a script that opens each file and, if its title contains a certain string, deletes the file.
This is my attempt so far:
import aiofiles
import glob
from natsort import natsorted
import asyncio
from bs4 import BeautifulSoup
import os
# First attempt: scan the HTML files under CarsPages and delete those whose
# <title> text contains "Best portal", counting the deletions in `i`.
# NOTE(review): indentation was lost in this paste; the nesting is presumably
# for -> async with -> if, with the final summary print after the loop.
async def main():
i=0
htmls = glob.glob("CarsPages" + "//*.html")
for html in natsorted(htmls):
async with aiofiles.open(html, mode='r', encoding='UTF-8', errors='strict', buffering=1) as f:
contents = await f.read()
soup = BeautifulSoup(contents, features="lxml")
if "Best portal" in soup.title.get_text():
i+=1
# `html` is the path string returned by glob, while os.close() takes an
# integer file descriptor -- this is the call that raises the reported
# "TypeError: an integer is required (got type str)".
# NOTE(review): the `async with` block presumably closes `f` on exit, so no
# explicit close should be needed -- confirm against the aiofiles docs.
os.close(html)
os.remove(html)
print("removing: ", html)
print("Removed: ", i, " pages")
if __name__ == "__main__":
    # asyncio.run() creates a fresh event loop, runs main() to completion and
    # closes the loop afterwards; calling asyncio.get_event_loop() with no
    # running loop is deprecated in current Python versions.
    asyncio.run(main())
But I'm getting:
os.close(html) TypeError: an integer is required (got type str)
Which functions should I use to close and remove a file once it has been opened with aiofiles?
EDIT - WORKING CODE BASED ON @joao's answer
import aiofiles
import glob
from natsort import natsorted
import asyncio
from bs4 import BeautifulSoup
import os
async def main():
    """Delete every HTML page under ``CarsPages`` whose title contains "Best portal".

    Files are visited in natural sort order. Each one is read through
    aiofiles and parsed with BeautifulSoup; if the text of its ``<title>``
    contains "Best portal" the file is removed from disk. Pages without a
    ``<title>`` element are left untouched. A line is printed for every
    removed file, followed by a final count.

    Raises:
        UnicodeDecodeError: if a file is not valid UTF-8 (``errors='strict'``).
        OSError: if a file cannot be opened or removed.
    """
    i = 0
    htmls = glob.glob(os.path.join("CarsPages", "*.html"))
    for html in natsorted(htmls):
        # Only the read happens inside the context manager: once the block
        # exits the handle is closed, so the file can be removed safely
        # (deleting a file that is still open fails on Windows).
        async with aiofiles.open(html, mode='r', encoding='UTF-8', errors='strict', buffering=1) as f:
            contents = await f.read()

        soup = BeautifulSoup(contents, features="lxml")
        title = soup.title
        # soup.title is None when the page has no <title>; skip such pages
        # instead of crashing on None.get_text().
        if title is None or "Best portal" not in title.get_text():
            continue

        os.remove(html)
        i += 1
        print("removed: ", html)
    print("Removed: ", i, " pages")
if __name__ == "__main__":
    # asyncio.run() creates a fresh event loop, runs main() to completion and
    # closes the loop afterwards; calling asyncio.get_event_loop() with no
    # running loop is deprecated in current Python versions.
    asyncio.run(main())