Skip to content

Commit 3c7135d

Browse files
committed
Duplicate Image Search and Delete
1 parent 5ef5444 commit 3c7135d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+212
-0
lines changed

Python/Duplicate_Image/README.md

Lines changed: 92 additions & 0 deletions
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
# Imports
2+
import hashlib
3+
import os
4+
import sys
5+
import keyboard
6+
7+
8+
def image_finder(parent_folder):
    """Group every file found under ``parent_folder`` by its content hash.

    The folder tree is walked with ``os.walk``; each directory is announced
    on stdout as it is scanned and every file in it is hashed with
    ``hash_file``.

    Returns a dictionary shaped like ``{hash: [paths]}``. A hash that maps
    to more than one path identifies a set of duplicate files.
    """
    hash_to_paths = {}
    for current_dir, _subdirs, file_names in os.walk(parent_folder):
        print('Scanning %s...' % current_dir)
        for file_name in file_names:
            full_path = os.path.join(current_dir, file_name)
            digest = hash_file(full_path)
            # setdefault creates the list the first time a hash is seen,
            # then the path is appended either way.
            hash_to_paths.setdefault(digest, []).append(full_path)
    return hash_to_paths
29+
30+
31+
def delete_duplicate(duplicate_img):
    """Delete all but the first file of every duplicate group.

    ``duplicate_img`` maps a hash to a list of file paths. For each list the
    first path is kept and the remaining files are removed from disk. The
    lists are trimmed in place, so after the call every list holds at most
    one path.

    The same file can be listed more than once in a group (for example when
    overlapping folders were scanned). Paths are therefore resolved before
    deleting, so that the kept file is never removed and no file is removed
    twice.

    Raises:
        OSError: propagated from ``os.remove`` if a file cannot be deleted.
    """
    for file_list in duplicate_img.values():
        if len(file_list) < 2:
            continue
        # Resolved locations that must not be passed to os.remove again:
        # the file we keep, plus everything already deleted in this group.
        handled = {os.path.normcase(os.path.realpath(file_list[0]))}
        while len(file_list) > 1:
            item = file_list.pop()
            resolved = os.path.normcase(os.path.realpath(item))
            if resolved in handled:
                # Another spelling of the kept file or of one already removed.
                continue
            os.remove(item)
            handled.add(resolved)
38+
39+
40+
# Joins two dictionaries
41+
def join_dicts(dict1, dict2):
    """Merge ``dict2`` into ``dict1`` in place.

    For a key present in both, ``dict1`` receives a new list made of its own
    entries followed by those of ``dict2``. For a key only in ``dict2``,
    ``dict1`` receives the very same list object. Nothing is returned.
    """
    for key, incoming in dict2.items():
        dict1[key] = dict1[key] + incoming if key in dict1 else incoming
47+
48+
49+
# For finding Hash of various Files
50+
# If 2 files have the same md5checksum,they most likely have the same content
51+
def hash_file(path, blocksize=65536):
    """Return the hexadecimal MD5 digest of the file at ``path``.

    The file is opened in binary mode and read ``blocksize`` bytes at a
    time, so large files are never loaded into memory at once.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    hasher = hashlib.md5()
    # The context manager closes the handle even if a read fails part-way.
    with open(path, 'rb') as img_file:
        # iter() with a sentinel keeps reading until read() returns b''.
        for buf in iter(lambda: img_file.read(blocksize), b''):
            hasher.update(buf)
    return hasher.hexdigest()
61+
62+
63+
def print_results(dict1):
    """Print every group of duplicate paths found in ``dict1``.

    ``dict1`` maps a hash to a list of paths. Only lists with more than one
    path are reported, each followed by a separator line. When no such list
    exists a single "nothing found" message is printed instead.
    """
    separator = '<--------------------->'
    duplicate_groups = [paths for paths in dict1.values() if len(paths) > 1]

    if not duplicate_groups:
        print('Unable to identify Similar Images')
        return

    print('Found Duplicated Images - ')
    print('Details -')
    print(separator)
    for group in duplicate_groups:
        # One tab-indented line per file in the group
        for file_path in group:
            print('\t%s' % file_path)
        print(separator)
77+
78+
79+
if __name__ == '__main__':
    # Command-line entry point: scan every folder given as an argument,
    # report the duplicates, then optionally delete them.
    if len(sys.argv) > 1:
        duplicate = {}
        folders = sys.argv[1:]
        for folder in folders:
            if not os.path.exists(folder):
                print('%s is not a valid path, please verify' % folder)
                # Non-zero status so calling shells/scripts see the failure.
                sys.exit(1)
            # Find the duplicated files and merge them into the dictionary
            join_dicts(duplicate, image_finder(folder))
        print_results(duplicate)
        # Delete Duplicate Images
        # Comment if not required
        # Only ask when there is something to delete; otherwise the script
        # would block waiting for a key press for no reason.
        if any(len(paths) > 1 for paths in duplicate.values()):
            print("Do you want to delete the Duplicate Images (If Any)? Press [y] for Yes.")
            # A single key press decides; any key other than "y" declines.
            if keyboard.read_key() == "y":
                print("Deleting Duplicate Files\n")
                delete_duplicate(duplicate)
                print("Thank You\n")
            else:
                print("Nothing Deleted!!! Thank You\n")
    else:
        print("Use Command Line Interface")
        print("Hint: python image_finder.py <path of folders>")
        print("Please Read comments for greater detailing")
108+
'''
109+
Suggestions :------
110+
Usage - python image_finder.py <path of folder1, path of folder2, .....>
111+
folder1 - Parent Folder
112+
folder2, folder3 .... - Subsequent Folders
113+
Comparisons are done within each folder, and from the Parent Folder to the Subsequent Folders.
114+
115+
No Files are deleted from Parent Folder but the files which are Duplicate to the files in Subsequent Folders are
116+
deleted. Make sure that the paths are correct
117+
118+
Be careful during Keyboard Input.
119+
'''
521 KB
161 KB
109 KB
142 KB
635 KB
256 KB
Binary file not shown.
257 KB
Binary file not shown.
257 KB
Binary file not shown.

0 commit comments

Comments
 (0)