
I have a folder of images and I am trying to build clusters of near-similar images in a Python dictionary, with an image hash as the key and a list of similar images as the value. How can I prevent an image from generating a new key if it is already present in the list of another key? Here is the code I have so far (a rough sketch of the grouping I am aiming for follows the code):


from PIL import Image
import imagehash
import cv2
import numpy as np
import dhash
import distance

norm_cache: dict = dict()


def _get_image(image_path: str) -> Image.Image:
    try:
        img_arr = cv2.imread(image_path)
        img_arr = cv2.resize(img_arr, (512, 512), interpolation=cv2.INTER_AREA)

        # Convert the image into 3 channels if it contains 4
        if len(img_arr.shape) > 2 and img_arr.shape[2] == 4:
            img_arr = cv2.cvtColor(img_arr, cv2.COLOR_BGRA2BGR)

        # Convert to gray-scale with the W3C luminance weights
        # (OpenCV loads images in B, G, R channel order, hence the weight order)
        data = np.inner(img_arr, [114, 587, 299]) / 1000.0

        return Image.fromarray(np.uint8(data), "L")
    except (cv2.error, OSError):
        # cv2.imread returns None for unreadable files, which makes
        # cv2.resize raise cv2.error; treat such files as missing.
        return None


def find_similar_images(userpath):
    import os
    global norm_cache

    def is_image(filename):
        f = filename.lower()
        return f.endswith(".png") or f.endswith(".jpg") or \
            f.endswith(".jpeg") or f.endswith(".bmp") or f.endswith(".gif")

    image_filenames = [os.path.join(userpath, path) for path in os.listdir(userpath) if is_image(path)]
    images = {}
    buffer = []
    for img in image_filenames:
        if len(buffer) == 0:
            print("Buffer is empty, appending first image to buffer.")
            buffer.append(img)
            continue
        gray1 = _get_image(img)
        h1r, h1c = dhash.dhash_row_col(gray1)
        hash1 = dhash.format_hex(h1r, h1c)
        images[hash1] = images.get(hash1, []) + [img]
        for each in buffer:
            if each in norm_cache:
                print(f"Cached value found for {each}")
                gray2 = norm_cache[each]
                h2r, h2c = dhash.dhash_row_col(gray2)
                hash2 = dhash.format_hex(h2r, h2c)
            else:
                print("No cached value found, computing and storing in norm_cache")
                gray2 = _get_image(each)
                h2r, h2c = dhash.dhash_row_col(gray2)
                hash2 = dhash.format_hex(h2r, h2c)
                norm_cache[each] = gray2  # Update cache...
            print(f"Comparing ---> {img}:{hash1} with {each}:{hash2}")
            if distance.hamming(hash1, hash2) <= 22:
                pass  # what should I put in here?

   
    unique = 0
    for k, img_list in images.items():
        if len(img_list) >= 1:
            print(', '.join(img_list))
            unique += 1
    print(unique)
   


if __name__ == '__main__':
    import sys, os
    userpath = "<Image folder>"  # placeholder: path to the image folder
    find_similar_images(userpath=userpath)
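
For reference, this is a rough sketch of the grouping behaviour I am after, written with the imagehash package that I already import. The helper name cluster_images and the 10-bit threshold are only assumptions for illustration, not a finished solution:

from PIL import Image
import imagehash

def cluster_images(image_paths, max_distance=10):
    # clusters: representative ImageHash -> list of paths grouped under it
    clusters = {}
    for path in image_paths:
        h = imagehash.dhash(Image.open(path))
        # Look for an existing key that is near this hash; if one is found,
        # the image joins that cluster instead of creating a new key.
        for key in clusters:
            if h - key <= max_distance:  # ImageHash subtraction = hamming distance in bits
                clusters[key].append(path)
                break
        else:
            clusters[h] = [path]
    return clusters

Keeping the first hash seen as the key for a cluster means later images are only compared against one representative per cluster, which may or may not be acceptable for my use case.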
   
  • One characteristic of hashes is that they differ wildly if even a small part of the original differs. Using hashes as "grouping" to gauge the similarity of images seems flawed. Do I understand you wrong? Please elaborate and [edit] your Q. – Patrick Artner Apr 17 '21 at 08:36
  • I am not using md5 or sha hashes, I am using this: https://pypi.org/project/dhash/. Also, I think you didn't understand the question; please take a look at the code and it will be clearer. Thank you – pakulopakito Apr 17 '21 at 08:41
  • You do not need to check other dictionary keys when inserting an image into the list of some key. Identical images have the same dhash value. Therefore, you only have to check the list of the key into which you are adding an image. An even better approach would be to place ALL images in a dictionary, using a cryptographic hash function, which (almost) guarantees that different images produce different hash values. After you have a dictionary of guaranteed-different images, construct your dictionary of lists of images sharing the same dhash. – Amitai Irron Apr 17 '21 at 09:42
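
If I read Amitai Irron's suggestion correctly, the two-stage approach would look roughly like this. sha256 over the raw file bytes is my choice of cryptographic hash, and dedupe_then_group is a hypothetical helper, not part of the original code:

import hashlib
from PIL import Image
import imagehash

def dedupe_then_group(image_paths):
    # Stage 1: drop byte-identical files using a cryptographic hash.
    unique_files = {}  # sha256 digest -> one representative file path
    for path in image_paths:
        with open(path, "rb") as f:
            digest = hashlib.sha256(f.read()).hexdigest()
        unique_files.setdefault(digest, path)

    # Stage 2: group the remaining unique images by their dhash value.
    groups = {}  # dhash hex string -> list of file paths sharing that dhash
    for path in unique_files.values():
        h = str(imagehash.dhash(Image.open(path)))
        groups.setdefault(h, []).append(path)
    return groups

Stage 2 only groups images whose dhash values are exactly equal; near-duplicates with slightly different dhashes would still need the distance check from the sketch above.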

0 Answers