I have about 200 GB of image folders, and some of the images can't be opened. I want to find these broken images and delete them from their folders.
I tried Python code like this:
for image in all_image:
try: # open image
except: # delete image
But it's too slow. How can I make it faster?
How can I parallelize this code?
import PIL
import os
import cv2
from PIL import ImageFile
from tqdm import tqdm
from pathlib import Path
import pandas as pd
def create_df(data_path):
    """Build a table of the ``*.jpg`` files found one level below *data_path*.

    Each immediate sub-directory of *data_path* is treated as a label, and
    every ``*.jpg`` file directly inside it becomes one row. Files that sit
    directly in *data_path*, and files in deeper nesting levels, are ignored.

    Args:
        data_path: Dataset root directory (``str`` or ``pathlib.Path``).

    Returns:
        A ``pandas.DataFrame`` with two columns: ``path`` (the file path as a
        string) and ``label`` (the name of the directory containing the file).
        The frame is empty when no matching files exist.
    """
    # Use the argument rather than a module-level global so the function
    # works for any dataset root and does not depend on the caller's names.
    class_dirs = [entry for entry in Path(data_path).iterdir() if entry.is_dir()]
    files = [jpg for class_dir in class_dirs for jpg in class_dir.glob('*.jpg')]
    return pd.DataFrame({
        'path': [str(jpg) for jpg in files],
        'label': [jpg.parent.name for jpg in files],
    })
def _is_broken_image(path):
    """Return True when Pillow fails to open or verify the file at *path*."""
    # Imported here so the Image submodule is loaded explicitly; a bare
    # ``import PIL`` does not guarantee that ``PIL.Image`` is available.
    from PIL import Image

    try:
        # Image.open() is lazy and only identifies the file from its header,
        # so verify() is called as an additional integrity check. The
        # with-block closes the file handle, which matters when scanning a
        # very large number of files and before deleting a file on Windows.
        # NOTE(review): verify() does not decode pixel data; if truncated
        # files still slip through, a fresh open followed by load() is the
        # stricter (and slower) check.
        with Image.open(path) as img:
            img.verify()
    except Exception:
        # Deliberately broad: any failure to open/verify marks the file bad.
        return True
    return False


if __name__ == "__main__":
    from concurrent.futures import ThreadPoolExecutor

    root = Path('some_path')
    data_root = root / 'dataset'
    df = create_df(data_root)
    paths = df['path'].tolist()

    # Check files concurrently; results come back in input order, so each
    # verdict can be paired with its path. Deletion stays in the main thread.
    with ThreadPoolExecutor() as pool:
        verdicts = pool.map(_is_broken_image, paths)
        # total= gives tqdm a real progress bar instead of a bare counter.
        for path, broken in tqdm(zip(paths, verdicts), total=len(paths)):
            if not broken:
                continue
            print(path)
            try:
                os.remove(path)
            except FileNotFoundError:
                # Already gone; nothing left to delete.
                pass