Skip to content

Commit 7d919bf

Browse files
committed
updating code
1 parent bf302fa commit 7d919bf

11 files changed

+1404
-340
lines changed

fastdup/__init__.py

Lines changed: 205 additions & 95 deletions
Large diffs are not rendered by default.

fastdup/definitions.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,11 @@
4545
DEFAULT_MODEL_FEATURE_WIDTH = 576
4646
HIGH_ACCURACY_MODEL_FEATURE_WIDTH = 960
4747

48+
PRINTOUT_BAR_WIDTH = 88
49+
4850
DEFUALT_METRIC_ZERO = 0
4951
DEFAULT_METRIC_MINUS_ONE = -1
50-
VERSION__ = "0.927"
52+
VERSION__ = "1.38"
5153

5254
GITHUB_URL = "https://github.com/visual-layer/fastdup/issues"
5355

@@ -56,7 +58,7 @@
5658
"and special care needs to select the right backend for your OS/Hardware combination. You can install matplot lib using "
5759
"python3.8 -m pip install matplotlib matplotlib-inline. (change the python3.8 to your python version). "
5860

59-
SUPPORTED_IMG_FORMATS = [".png", ".jpg", ".jpeg", ".giff", ".jpeg", ".tif", ".heic", ".heif"]
61+
SUPPORTED_IMG_FORMATS = [".png", ".jpg", ".jpeg", ".giff", ".jpeg", ".tif", ".tiff", ".heic", ".heif", ".bmp", ".webp"]
6062
SUPPORTED_VID_FORMATS = ["mp4", ".avi"]
6163

6264
RUN_ALL = 0
@@ -76,6 +78,17 @@
7678
DINOV2S_MODEL_DIM = 384
7779
DINOV2B_MODEL = "https://vl-company-website.s3.us-east-2.amazonaws.com/model_artifacts/dinov2/dinov2_vitb14.onnx"
7880
DINOV2B_MODEL_DIM = 768
81+
CLIP_MODEL = "https://clip-as-service.s3.us-east-2.amazonaws.com/models-436c69702d61732d53657276696365/onnx/ViT-B-32/visual.onnx"
82+
CLIP_MODEL_DIM = 512
83+
CLIP_MODEL2 = "https://clip-as-service.s3.us-east-2.amazonaws.com/models-436c69702d61732d53657276696365/onnx/ViT-L-14@336px/visual.onnx"
84+
CLIP_MODEL2_DIM = 768
85+
CLIP_MODEL14 = "https://clip-as-service.s3.us-east-2.amazonaws.com/models-436c69702d61732d53657276696365/onnx/ViT-L-14/visual.onnx"
86+
CLIP_MODEL14_DIM = 768
87+
88+
EFFICIENTNET_MODEL = "https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4-11.onnx"
89+
EFFICIENTNET_MODEL_DIM = 1000
90+
RESNET50_MODEL = "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v1-12.onnx"
91+
RESNET50_MODEL_DIM = 1000
7992

8093
CAPTION_MODEL1_NAME = 'automatic'
8194
CAPTION_MODEL2_NAME = 'blip'

fastdup/engine.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ class Fastdup(FastdupController):
2929
df_cc, cc_info = fd.connected_components()
3030
"""
3131

32-
def __init__(self, work_dir: Union[str, Path], input_dir: Union[str, Path] = None):
33-
super().__init__(work_dir, input_dir=input_dir)
32+
def __init__(self, work_dir: Union[str, Path]=None, input_dir: Union[str, Path] = None):
33+
super().__init__(work_dir=work_dir, input_dir=input_dir)
3434
self.vis = FastdupVisualizer(self)
3535

3636
def run(self,
@@ -149,10 +149,10 @@ def run(self,
149149
license='' if license is None else license,
150150
high_accuracy=high_accuracy)
151151
if (model_path is not None):
152-
if 'dinov2s' not in model_path and 'dinov2b' not in model_path:
152+
if 'dinov2s' not in model_path and 'dinov2b' not in model_path and 'resnet50' not in model_path and 'efficientnet' not in model_path and 'clip' not in model_path and 'clip336' not in model_path and 'clip14' not in model_path:
153153
assert 'd' in kwargs, 'Please provide d parameter to indicate the model output dimension'
154154
fastdup_func_params['model_path'] = model_path
155155
fastdup_func_params.update(kwargs)
156156

157-
super().run(annotations=annotations, input_dir=input_dir, subset=subset, data_type=data_type,
157+
return super().run(annotations=annotations, input_dir=input_dir, subset=subset, data_type=data_type,
158158
overwrite=overwrite, embeddings=embeddings, **fastdup_func_params)

fastdup/fastdup_controller.py

Lines changed: 334 additions & 88 deletions
Large diffs are not rendered by default.

fastdup/fastdup_create.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77

88
@v1_sentry_handler
9-
def create(work_dir: Union[str, Path], input_dir: Union[str, Path] = None) -> Fastdup:
9+
def create(work_dir: Union[str, Path] = None, input_dir: Union[str, Path, list] = None) -> Fastdup:
1010
"""
1111
Create fastdup analyzer instance.
1212
Usage example

fastdup/fastdup_visualizer.py

Lines changed: 111 additions & 16 deletions
Large diffs are not rendered by default.

fastdup/galleries.py

Lines changed: 424 additions & 67 deletions
Large diffs are not rendered by default.

fastdup/image.py

Lines changed: 108 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,23 @@
55

66
import os
77
import cv2
8+
import fastdup.definitions
89
import numpy as np
910
import base64
1011
import io
12+
13+
import pandas as pd
1114
from fastdup.definitions import *
1215
from fastdup.sentry import fastdup_capture_exception
1316
import tarfile
1417
import platform
18+
import pathlib
19+
from PIL import Image
20+
from pillow_heif import register_heif_opener
21+
22+
register_heif_opener()
23+
24+
1525

1626
def safe_replace(path):
    """
    Flatten a path into a single token that is safe to embed in a file name.

    Every forward slash, backslash and colon in `path` is replaced by an underscore;
    all other characters are returned unchanged.

    Args:
        path (str): the path (or any string) to sanitise

    Returns:
        (str): `path` with '/', '\\' and ':' each replaced by '_'
    """
    separators_to_underscore = str.maketrans({'/': '_', '\\': '_', ':': '_'})
    return path.translate(separators_to_underscore)
@@ -98,6 +108,32 @@ def truncate_folder_name(path):
98108
return None
99109

100110

111+
112+
def inner_read(img1_path):
    """
    Read one image file from local disk into an OpenCV-style array.

    HEIC/HEIF files are decoded with Pillow and converted from Pillow's RGB(A)
    channel order to OpenCV's BGR order. Every other format is decoded with
    cv2.imread(..., cv2.IMREAD_UNCHANGED); 16-bit images are min-max normalised
    to 8 bit. In both paths a single-channel image is expanded to three channels
    and the alpha channel of a four-channel image is dropped.

    Args:
        img1_path (str): local path of the image file

    Returns:
        img (np.array): the decoded image

    Raises:
        AssertionError: when OpenCV cannot decode the file (cv2.imread returned None)
    """
    if img1_path.lower().endswith(('.heic', '.heif')):
        # Use a context manager so the underlying file handle is released as soon
        # as the pixels have been copied into the numpy array.
        with Image.open(img1_path) as pil_img:
            img = np.array(pil_img)
        channels = img.shape[-1] if img.ndim == 3 else 1
        if channels == 1:
            # Gray has no channel order, so no RGB -> BGR swap is needed.
            return cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        if channels == 4:
            img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
        # Pillow decodes to RGB while the rest of the pipeline works in BGR.
        return cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

    img = cv2.imread(img1_path, cv2.IMREAD_UNCHANGED)
    assert img is not None, f"Failed to open image from {img1_path}"
    if img.dtype == np.uint16:
        img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX, cv2.CV_8U)
    channels = img.shape[-1] if img.ndim == 3 else 1
    if channels == 1:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    elif channels == 4:
        # cv2.imread returns BGRA, so drop alpha with the BGR-named code.
        img = cv2.cvtColor(img, cv2.COLOR_BGRA2BGR)
    return img
136+
101137
def fastdup_imread(img1_path, input_dir, kwargs):
102138
"""
103139
Read an image from local file, or from a tar file, or from s3/minio path using minio client mc
@@ -108,22 +144,22 @@ def fastdup_imread(img1_path, input_dir, kwargs):
108144
Returns:
109145
img1 (np.array): the image
110146
"""
111-
assert img1_path is not None, f"img1_path should not be None {input_dir}, {kwargs}"
112-
147+
assert not pd.isnull(img1_path), f"img1_path should not be None {img1_path} {input_dir}, {kwargs}"
113148
is_minio_or_s3 = False
114-
if input_dir is not None:
149+
if input_dir is not None and (isinstance(input_dir, str) or isinstance(input_dir, pathlib.Path)):
150+
if input_dir.startswith('~/'):
151+
input_dir = os.path.expanduser(input_dir)
115152
if not input_dir.startswith("s3://") and not input_dir.startswith("minio://"):
116153
assert os.path.exists(input_dir), "Failed to find input_dir: " + input_dir
117154
else:
118155
is_minio_or_s3 = True
119156

120-
157+
if img1_path.startswith('~/'):
158+
img1_path = os.path.expanduser(img1_path)
121159
if os.path.exists(img1_path):
122-
img = cv2.imread(img1_path, cv2.IMREAD_UNCHANGED)
123-
if img is not None:
124-
if img.dtype == 'uint16':
125-
img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX, cv2.CV_8U)
126-
img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
160+
img = inner_read(img1_path)
161+
162+
127163
return img
128164
elif ('/' +S3_TEMP_FOLDER + '/' in img1_path or '/' + S3_TEST_TEMP_FOLDER + '/' in img1_path) and \
129165
'.tar/' in img1_path:
@@ -150,38 +186,68 @@ def fastdup_imread(img1_path, input_dir, kwargs):
150186
minio_prefix = "/".join(input_dir.replace("minio://", "").split('/')[:2])
151187
#print('minio_prefix', minio_prefix)
152188
download_minio(minio_prefix + '/' + local_dir_no_temp + '/' + os.path.basename(img1_path), S3_TEMP_FOLDER)
153-
ret = cv2.imread(os.path.join(S3_TEMP_FOLDER, os.path.basename(img1_path)))
189+
ret = inner_read(os.path.join(S3_TEMP_FOLDER, os.path.basename(img1_path)))
154190
assert ret is not None, f"Failed to read image {os.path.join(S3_TEMP_FOLDER, os.path.basename(img1_path))}"
155191
return ret
156192
elif input_dir.startswith("s3://"):
157193
local_dir_no_temp = truncate_folder_name(os.path.dirname(img1_path))
158194
s3_prefix = 's3://' + "/".join(input_dir.replace("s3://", "").split('/')[:1])
159195
#print('s3_prefix', s3_prefix)
160196
download_s3(s3_prefix + '/' + local_dir_no_temp + '/' + os.path.basename(img1_path), S3_TEMP_FOLDER)
161-
ret = cv2.imread(os.path.join(S3_TEMP_FOLDER, os.path.basename(img1_path)))
162-
assert ret is not None, f"Failed to read image {os.path.join(S3_TEMP_FOLDER, os.path.basename(img1_path))}"
197+
ret = inner_read(os.path.join(S3_TEMP_FOLDER, os.path.basename(img1_path)))
163198
return ret
164199
#Failed to read image1 ..\milvus_vector_db\data\images\..\milvus_vector_db\data\images\Egyptian_Mau_210.jpg
165200
elif img1_path.startswith(input_dir) and len(img1_path) >= len(input_dir) +2:
166201
suffix = img1_path[len(input_dir):]
167202
if input_dir in suffix and os.path.exists(suffix):
168-
img = cv2.imread(suffix, cv2.IMREAD_UNCHANGED)
169-
if img is not None:
170-
if img.dtype == 'uint16':
171-
img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX, cv2.CV_8U)
172-
img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
203+
img = inner_read(suffix)
173204
return img
174205
elif "''" in img1_path: # try to handle french and other languages where c side doubles the '' otherwise pandas can't read it
175206
new_img1_path = img1_path.replace("''","")
176207
if os.path.exists(new_img1_path):
177-
img = cv2.imread(new_img1_path, cv2.IMREAD_UNCHANGED)
208+
img = inner_read(new_img1_path)
178209
return img
179210

180211

181212
print('Failed to read image from img_path', img1_path)
182213
return None
183214

184215

216+
def check_valid_image_extension(filename):
    """
    Tell whether a file name ends with one of the extensions in SUPPORTED_IMG_FORMATS.

    The comparison is case-insensitive. OpenCV's imwrite requires an image
    extension on the target file name.

    Args:
        filename (str): file name or path to inspect

    Returns:
        (bool): True when the lower-cased name ends with a supported extension
    """
    lowered_name = filename.lower()
    return lowered_name.endswith(tuple(SUPPORTED_IMG_FORMATS))
220+
221+
222+
def fastdup_imwrite(local_file, im):
    """
    Write an image to `local_file` with OpenCV, working around two imwrite limitations.

    * cv2.imwrite needs an image extension on the target name. When `local_file`
      has none, the image is written to `local_file + '.jpg'` and then moved
      onto `local_file`.
    * When the write fails and the path is at least 254 characters long, the image
      is written under a short temporary name (in the current working directory)
      and then moved onto `local_file`.

    Args:
        local_file (str): destination path
        im (np.array): image to save

    Raises:
        AssertionError: when the image could not be written or the destination
            file does not exist afterwards
    """
    import shutil
    import uuid

    if check_valid_image_extension(local_file):
        ret = cv2.imwrite(local_file, im)
    else:
        staged_file = local_file + '.jpg'
        ret = cv2.imwrite(staged_file, im)
        assert ret, f"Failed to save img to {local_file} most likely filename is too long for the OS"

        # Move back to the requested name. os.replace overwrites an existing
        # destination on every platform, whereas os.rename fails on Windows.
        os.replace(staged_file, local_file)
        assert os.path.isfile(local_file), "Failed to save img to " + local_file

    if not ret and len(local_file) >= 254:
        _, ext = os.path.splitext(local_file)
        tmp_filename = str(uuid.uuid4()) + ext
        try:
            ret = cv2.imwrite(tmp_filename, im)
            # Only move when the temporary write succeeded; otherwise there is
            # no file to move and the assert below reports the failure.
            if ret:
                if os.path.exists(local_file):
                    os.unlink(local_file)
                shutil.move(tmp_filename, local_file)
        finally:
            # Do not leave the temporary file behind when the move failed.
            if os.path.exists(tmp_filename):
                os.unlink(tmp_filename)
        assert ret, f"Failed to save img to {local_file} most likely filename is too long for the OS"
    elif not ret:
        assert ret, f"Failed to save img to {local_file}"
    assert os.path.isfile(local_file), "Failed to save img to " + local_file
250+
185251
def get_type(str):
186252
if 'train' in str:
187253
return 'train'
@@ -282,17 +348,7 @@ def draw_text(img, text,
282348

283349
return text_size, img
284350

285-
def create_triplet_img(row, work_dir, save_path, extract_filenames, get_bounding_box_func=None, input_dir=None, kwargs=None):
286-
#v1 = 'id_to_filename_func' in kwargs
287-
id_from, id_to = row['from'], row['to']
288-
#if v1:
289-
# assert not isinstance(id_from, str), f"Wrong type {row}"
290-
291-
#suffix_from, suffix_to = (f'_{id_from}', f'_{id_to}') if v1 else ('', '')
292-
#if v1:
293-
# id_to_filename_func = kwargs['id_to_filename_func']
294-
# row[['from','to']] = [id_to_filename_func(row['from']), id_to_filename_func(row['to'])]
295-
351+
def create_triplet_img(index, row, work_dir, save_path, extract_filenames, get_bounding_box_func=None, input_dir=None, kwargs=None):
296352
img1_path, img2_path, distance, ptype = extract_filenames(row, work_dir, save_path, kwargs)
297353

298354
img1 = fastdup_imread(img1_path, input_dir, kwargs)
@@ -301,6 +357,10 @@ def create_triplet_img(row, work_dir, save_path, extract_filenames, get_bounding
301357
assert img1 is not None, f"Failed to read image1 {img1_path} {str(input_dir)}"
302358
assert img2 is not None, f"Failed to read image2 {img2_path} {str(input_dir)}"
303359

360+
if 'crop_filename_from' in row and 'crop_filename_to' in row:
361+
id_from, id_to = row['crop_filename_from'], row['crop_filename_to']
362+
else:
363+
id_from, id_to = row['from'], row['to']
304364
img1 = plot_bounding_box(img1, get_bounding_box_func, id_from)
305365
img2 = plot_bounding_box(img2, get_bounding_box_func, id_to)
306366

@@ -317,9 +377,20 @@ def create_triplet_img(row, work_dir, save_path, extract_filenames, get_bounding
317377
if rimg1.shape != rimg2.shape: # combination of grayscale and color
318378
if len(rimg1.shape) == 2:
319379
rimg1 = cv2.cvtColor(rimg1, cv2.COLOR_GRAY2RGB)
380+
elif len(rimg1.shape) ==3 and rimg1.shape[2] == 4:
381+
rimg1 = cv2.cvtColor(rimg1, cv2.COLOR_RGBA2RGB)
320382
if len(rimg2.shape) == 2:
321383
rimg2 = cv2.cvtColor(rimg2, cv2.COLOR_GRAY2RGB)
322-
cimage = cv2.addWeighted(rimg1,alpha,rimg2,1-alpha,0)
384+
elif len(rimg1.shape) ==3 and rimg2.shape[2] == 4:
385+
rimg2 = cv2.cvtColor(rimg2, cv2.COLOR_RGBA2RGB)
386+
387+
error_weighted = False
388+
try:
389+
cimage = cv2.addWeighted(rimg1,alpha,rimg2,1-alpha,0)
390+
except Exception as ex:
391+
error_weighted = True
392+
fastdup_capture_exception("create_triplet_image", ex, True, f"Dimes are {rimg1.shape} {rimg2.shape}")
393+
323394

324395
hierarchical_run = kwargs is not None and 'hierarchical_run' in kwargs and kwargs['hierarchical_run']
325396
text1 = os.path.splitext(os.path.basename(img1_path))[0]
@@ -330,11 +401,11 @@ def create_triplet_img(row, work_dir, save_path, extract_filenames, get_bounding
330401

331402
(w, h),nimg1 = draw_text(rimg1, text1, font_scale=1, pos=(10, 10))
332403
(w, h),nimg2 = draw_text(rimg2, text2, font_scale=1, pos=(10, 10))
333-
(w, h),cimage = draw_text(cimage, 'blended image', font_scale=1, pos=(10, 10))
404+
if not error_weighted:
405+
(w, h),cimage = draw_text(cimage, 'blended image', font_scale=1, pos=(10, 10))
406+
assert cimage.shape[0] > 0 and cimage.shape[1] > 0
334407

335-
assert cimage.shape[0] > 0 and cimage.shape[1] > 0
336-
337-
if hierarchical_run:
408+
if hierarchical_run or error_weighted:
338409
hcon_img = hconcat_resize_min([nimg1, nimg2])
339410
else:
340411
hcon_img = hconcat_resize_min([nimg1, nimg2, cimage])
@@ -355,11 +426,9 @@ def create_triplet_img(row, work_dir, save_path, extract_filenames, get_bounding
355426
lazy_load = 'lazy_load' in kwargs and kwargs['lazy_load']
356427
if lazy_load:
357428
os.makedirs(os.path.join(save_path, 'images'), exist_ok=True)
358-
hcon_img_path = f'{save_path}/images/{pid}.jpg'
429+
hcon_img_path = f'{save_path}/images/{pid}_{index}.jpg'
359430
else:
360-
hcon_img_path = f'{save_path}/{pid}.jpg'
361-
cv2.imwrite(hcon_img_path, hcon_img)
362-
assert os.path.exists(hcon_img_path), f"Failed to write image to {hcon_img_path}"
363-
431+
hcon_img_path = f'{save_path}/{pid}_{index}.jpg'
432+
fastdup_imwrite(hcon_img_path, hcon_img)
364433
return hcon_img, hcon_img_path
365434

fastdup/sentry.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def init_sentry():
7474
except:
7575
pass
7676

77-
def fastdup_capture_exception(section, e, warn_only=False):
77+
def fastdup_capture_exception(section, e, warn_only=False, extra=""):
7878
if not warn_only:
7979
traceback.print_exc()
8080
if 'SENTRY_OPT_OUT' not in os.environ:
@@ -84,7 +84,10 @@ def fastdup_capture_exception(section, e, warn_only=False):
8484
scope.set_tag("token", token)
8585
scope.set_tag("platform", platform.platform())
8686
scope.set_tag("platform.version", platform.version())
87-
scope.set_tag("python", sys.version)
87+
scope.set_tag("python", sys.version.strip().replace("\n", " "))
88+
scope.set_tag("production", "FASTDUP_PRODUCTION" in os.environ)
89+
if extra != "":
90+
scope.set_tag("extra", extra)
8891
capture_exception(e, scope=scope)
8992

9093

@@ -106,6 +109,7 @@ def fastdup_performance_capture(section, start_time):
106109
scope.set_tag("platform", platform.platform())
107110
scope.set_tag("platform.version", platform.version())
108111
scope.set_tag("python", sys.version.strip().replace("\n", " "))
112+
scope.set_tag("production", "FASTDUP_PRODUCTION" in os.environ)
109113
sentry_sdk.capture_message("Performance", scope=scope)
110114
finally:
111115
sentry_sdk.flush(timeout=5)

fastdup/tensorboard_projector.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def save_labels_tsv(labels, filepath, log_dir):
3030
for label in labels:
3131
f.write('{}\n'.format(label))
3232

33-
def generate_sprite_image(img_path, sample_size, log_dir, get_label_func = None, h = 0, w = 0, alternative_filename = None, alternative_width=None, max_width=None):
33+
def generate_sprite_image(img_path, sample_size, log_dir, get_label_func = None, h = 0, w = 0, alternative_filename = None, alternative_width=None, max_width=None, kwargs={}):
3434
# Generate sprite image
3535
images_pil = []
3636

@@ -54,6 +54,13 @@ def generate_sprite_image(img_path, sample_size, log_dir, get_label_func = None,
5454
if (alternative_width < sample_size):
5555
sample_size = alternative_width
5656
height = 1
57+
elif kwargs and 'force_width' in kwargs and 'force_height' in kwargs:
58+
assert isinstance(kwargs['force_width'], int), "force_width must be an integer"
59+
assert isinstance(kwargs['force_height'], int), "force_height must be an integer"
60+
if kwargs['force_width'] * kwargs['force_height'] > len(img_path):
61+
print(f"Warning: missing images for a full grid, requested {kwargs['force_width'] * kwargs['force_height']} got {len(img_path)}")
62+
NUM_IMAGES_WIDTH = kwargs['force_width']
63+
height = kwargs['force_width']
5764
else:
5865
NUM_IMAGES_WIDTH = int(1.4*np.ceil(np.sqrt(min(sample_size, len(img_path)))))
5966
divs = int(np.ceil(min(sample_size,len(img_path)) / NUM_IMAGES_WIDTH))

0 commit comments

Comments
 (0)