5
5
6
6
import os
7
7
import cv2
8
+ import fastdup .definitions
8
9
import numpy as np
9
10
import base64
10
11
import io
12
+
13
+ import pandas as pd
11
14
from fastdup .definitions import *
12
15
from fastdup .sentry import fastdup_capture_exception
13
16
import tarfile
14
17
import platform
18
+ import pathlib
19
+ from PIL import Image
20
+ from pillow_heif import register_heif_opener
21
+
22
+ register_heif_opener ()
23
+
24
+
15
25
16
26
def safe_replace(path):
    """
    Turn a path into a flat, filesystem-safe token.

    Every forward slash, backslash and colon in ``path`` is replaced by an
    underscore, so the result can be used as a single file-name component.

    Args:
        path (str): the path (POSIX or Windows style) to flatten.

    Returns:
        str: ``path`` with '/', '\\' and ':' each replaced by '_'.
    """
    # A single translate pass handles all three separators. Matching the bare
    # backslash (rather than backslash + space) is what makes Windows paths
    # such as C:\dir\file.jpg flatten correctly.
    return path.translate(str.maketrans('/\\:', '___'))
@@ -98,6 +108,32 @@ def truncate_folder_name(path):
98
108
return None
99
109
100
110
111
+
112
def inner_read(img1_path):
    """
    Read an image from a local path into a 3-channel numpy array.

    HEIC/HEIF files are decoded through PIL (the HEIF opener is registered at
    module import) and converted from RGB channel order to BGR; every other
    file goes through cv2.imread with IMREAD_UNCHANGED. Afterwards uint16
    data is min-max normalized to 8 bit, single-channel images are expanded
    to three channels and the alpha channel of 4-channel images is dropped.

    Args:
        img1_path (str): local path of the image file.

    Returns:
        img (np.array): the decoded image.

    Raises:
        AssertionError: if cv2 could not decode the file.
        Exception: whatever PIL raises when a HEIC/HEIF file cannot be opened.
    """
    if img1_path.lower().endswith(('.heic', '.heif')):
        # Image.open is lazy and keeps the file handle open; materialize the
        # pixels inside the context manager so the handle is always released.
        with Image.open(img1_path) as pil_img:
            img = np.array(pil_img)
        channels = img.shape[-1] if img.ndim == 3 else 1
        if channels == 1:
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
        elif channels == 4:
            img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        else:
            img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    else:
        img = cv2.imread(img1_path, cv2.IMREAD_UNCHANGED)
    # cv2.imread signals failure by returning None instead of raising.
    assert img is not None, f"Failed to open image from {img1_path}"

    # NOTE(review): the normalization below is applied to both branches; for
    # HEIC input that is already 8-bit, 3-channel it is a no-op.
    if img.dtype == 'uint16':
        img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX, cv2.CV_8U)
    channels = img.shape[-1] if img.ndim == 3 else 1
    if channels == 1:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    elif channels == 4:
        # Only drops alpha; the order of the colour channels is unchanged.
        img = cv2.cvtColor(img, cv2.COLOR_RGBA2RGB)
    return img
136
+
101
137
def fastdup_imread (img1_path , input_dir , kwargs ):
102
138
"""
103
139
Read an image from local file, or from a tar file, or from s3/minio path using minio client mc
@@ -108,22 +144,22 @@ def fastdup_imread(img1_path, input_dir, kwargs):
108
144
Returns:
109
145
img1 (np.array): the image
110
146
"""
111
- assert img1_path is not None , f"img1_path should not be None { input_dir } , { kwargs } "
112
-
147
+ assert not pd .isnull (img1_path ), f"img1_path should not be None { img1_path } { input_dir } , { kwargs } "
113
148
is_minio_or_s3 = False
114
- if input_dir is not None :
149
+ if input_dir is not None and (isinstance (input_dir , str ) or isinstance (input_dir , pathlib .Path )):
150
+ if input_dir .startswith ('~/' ):
151
+ input_dir = os .path .expanduser (input_dir )
115
152
if not input_dir .startswith ("s3://" ) and not input_dir .startswith ("minio://" ):
116
153
assert os .path .exists (input_dir ), "Failed to find input_dir: " + input_dir
117
154
else :
118
155
is_minio_or_s3 = True
119
156
120
-
157
+ if img1_path .startswith ('~/' ):
158
+ img1_path = os .path .expanduser (img1_path )
121
159
if os .path .exists (img1_path ):
122
- img = cv2 .imread (img1_path , cv2 .IMREAD_UNCHANGED )
123
- if img is not None :
124
- if img .dtype == 'uint16' :
125
- img = cv2 .normalize (img , None , 0 , 255 , cv2 .NORM_MINMAX , cv2 .CV_8U )
126
- img = cv2 .cvtColor (img , cv2 .COLOR_GRAY2RGB )
160
+ img = inner_read (img1_path )
161
+
162
+
127
163
return img
128
164
elif ('/' + S3_TEMP_FOLDER + '/' in img1_path or '/' + S3_TEST_TEMP_FOLDER + '/' in img1_path ) and \
129
165
'.tar/' in img1_path :
@@ -150,38 +186,68 @@ def fastdup_imread(img1_path, input_dir, kwargs):
150
186
minio_prefix = "/" .join (input_dir .replace ("minio://" , "" ).split ('/' )[:2 ])
151
187
#print('minio_prefix', minio_prefix)
152
188
download_minio (minio_prefix + '/' + local_dir_no_temp + '/' + os .path .basename (img1_path ), S3_TEMP_FOLDER )
153
- ret = cv2 . imread (os .path .join (S3_TEMP_FOLDER , os .path .basename (img1_path )))
189
+ ret = inner_read (os .path .join (S3_TEMP_FOLDER , os .path .basename (img1_path )))
154
190
assert ret is not None , f"Failed to read image { os .path .join (S3_TEMP_FOLDER , os .path .basename (img1_path ))} "
155
191
return ret
156
192
elif input_dir .startswith ("s3://" ):
157
193
local_dir_no_temp = truncate_folder_name (os .path .dirname (img1_path ))
158
194
s3_prefix = 's3://' + "/" .join (input_dir .replace ("s3://" , "" ).split ('/' )[:1 ])
159
195
#print('s3_prefix', s3_prefix)
160
196
download_s3 (s3_prefix + '/' + local_dir_no_temp + '/' + os .path .basename (img1_path ), S3_TEMP_FOLDER )
161
- ret = cv2 .imread (os .path .join (S3_TEMP_FOLDER , os .path .basename (img1_path )))
162
- assert ret is not None , f"Failed to read image { os .path .join (S3_TEMP_FOLDER , os .path .basename (img1_path ))} "
197
+ ret = inner_read (os .path .join (S3_TEMP_FOLDER , os .path .basename (img1_path )))
163
198
return ret
164
199
#Failed to read image1 ..\milvus_vector_db\data\images\..\milvus_vector_db\data\images\Egyptian_Mau_210.jpg
165
200
elif img1_path .startswith (input_dir ) and len (img1_path ) >= len (input_dir ) + 2 :
166
201
suffix = img1_path [len (input_dir ):]
167
202
if input_dir in suffix and os .path .exists (suffix ):
168
- img = cv2 .imread (suffix , cv2 .IMREAD_UNCHANGED )
169
- if img is not None :
170
- if img .dtype == 'uint16' :
171
- img = cv2 .normalize (img , None , 0 , 255 , cv2 .NORM_MINMAX , cv2 .CV_8U )
172
- img = cv2 .cvtColor (img , cv2 .COLOR_GRAY2RGB )
203
+ img = inner_read (suffix )
173
204
return img
174
205
elif "''" in img1_path : # try to handle french and other languages where c side doubles the '' otherwise pandas can't read it
175
206
new_img1_path = img1_path .replace ("''" ,"" )
176
207
if os .path .exists (new_img1_path ):
177
- img = cv2 . imread (new_img1_path , cv2 . IMREAD_UNCHANGED )
208
+ img = inner_read (new_img1_path )
178
209
return img
179
210
180
211
181
212
print ('Failed to read image from img_path' , img1_path )
182
213
return None
183
214
184
215
216
def check_valid_image_extension(filename):
    """
    Tell whether ``filename`` ends with one of SUPPORTED_IMG_FORMATS.

    The comparison is done on the lower-cased file name. OpenCV's imwrite
    needs such an extension to pick an encoder.

    Args:
        filename (str): file name or path to inspect.

    Returns:
        bool: True when the lower-cased name ends with a supported extension.
    """
    lowered = filename.lower()
    for ext in SUPPORTED_IMG_FORMATS:
        if lowered.endswith(ext):
            return True
    return False
220
+
221
+
222
def fastdup_imwrite(local_file, im):
    """
    Write an image to ``local_file`` with OpenCV, working around two limits.

    * cv2.imwrite needs a known image extension: when ``local_file`` has
      none, the image is written to ``local_file + '.jpg'`` and then renamed
      back to ``local_file``.
    * When the write fails and the path is 254 characters or longer, the
      image is written to a short temporary name in the current directory
      and moved into place.

    Args:
        local_file (str): destination path.
        im (np.array): image to save.

    Raises:
        AssertionError: if the image could not be written or the destination
            file does not exist afterwards.
        OSError: if renaming/moving the written file into place fails.
    """
    import shutil
    import uuid

    if check_valid_image_extension(local_file):
        ret = cv2.imwrite(local_file, im)
    else:
        local_file_wext = local_file + '.jpg'
        ret = cv2.imwrite(local_file_wext, im)
        assert ret, f"Failed to save img to {local_file} most likely filename is too long for the OS"

        # Rename back if extension was added. os.replace overwrites an
        # existing destination on every platform, os.rename does not on Windows.
        os.replace(local_file_wext, local_file)
        assert os.path.isfile(local_file), "Failed to save img to " + local_file

    if not ret and len(local_file) >= 254:
        _, ext = os.path.splitext(local_file)
        tmp_filename = str(uuid.uuid4()) + ext
        ret = cv2.imwrite(tmp_filename, im)
        # Check the temporary write before touching the destination, so a
        # failed retry never deletes a file that was already there.
        assert ret, f"Failed to save img to {local_file} most likely filename is too long for the OS"
        try:
            if os.path.exists(local_file):
                os.unlink(local_file)
            shutil.move(tmp_filename, local_file)
        finally:
            # Do not leave the temporary file behind when the move fails.
            if os.path.exists(tmp_filename):
                os.unlink(tmp_filename)
    elif not ret:
        assert ret, f"Failed to save img to {local_file}"
    assert os.path.isfile(local_file), "Failed to save img to " + local_file
250
+
185
251
def get_type (str ):
186
252
if 'train' in str :
187
253
return 'train'
@@ -282,17 +348,7 @@ def draw_text(img, text,
282
348
283
349
return text_size , img
284
350
285
- def create_triplet_img (row , work_dir , save_path , extract_filenames , get_bounding_box_func = None , input_dir = None , kwargs = None ):
286
- #v1 = 'id_to_filename_func' in kwargs
287
- id_from , id_to = row ['from' ], row ['to' ]
288
- #if v1:
289
- # assert not isinstance(id_from, str), f"Wrong type {row}"
290
-
291
- #suffix_from, suffix_to = (f'_{id_from}', f'_{id_to}') if v1 else ('', '')
292
- #if v1:
293
- # id_to_filename_func = kwargs['id_to_filename_func']
294
- # row[['from','to']] = [id_to_filename_func(row['from']), id_to_filename_func(row['to'])]
295
-
351
+ def create_triplet_img (index , row , work_dir , save_path , extract_filenames , get_bounding_box_func = None , input_dir = None , kwargs = None ):
296
352
img1_path , img2_path , distance , ptype = extract_filenames (row , work_dir , save_path , kwargs )
297
353
298
354
img1 = fastdup_imread (img1_path , input_dir , kwargs )
@@ -301,6 +357,10 @@ def create_triplet_img(row, work_dir, save_path, extract_filenames, get_bounding
301
357
assert img1 is not None , f"Failed to read image1 { img1_path } { str (input_dir )} "
302
358
assert img2 is not None , f"Failed to read image2 { img2_path } { str (input_dir )} "
303
359
360
+ if 'crop_filename_from' in row and 'crop_filename_to' in row :
361
+ id_from , id_to = row ['crop_filename_from' ], row ['crop_filename_to' ]
362
+ else :
363
+ id_from , id_to = row ['from' ], row ['to' ]
304
364
img1 = plot_bounding_box (img1 , get_bounding_box_func , id_from )
305
365
img2 = plot_bounding_box (img2 , get_bounding_box_func , id_to )
306
366
@@ -317,9 +377,20 @@ def create_triplet_img(row, work_dir, save_path, extract_filenames, get_bounding
317
377
if rimg1 .shape != rimg2 .shape : # combination of grayscale and color
318
378
if len (rimg1 .shape ) == 2 :
319
379
rimg1 = cv2 .cvtColor (rimg1 , cv2 .COLOR_GRAY2RGB )
380
+ elif len (rimg1 .shape ) == 3 and rimg1 .shape [2 ] == 4 :
381
+ rimg1 = cv2 .cvtColor (rimg1 , cv2 .COLOR_RGBA2RGB )
320
382
if len (rimg2 .shape ) == 2 :
321
383
rimg2 = cv2 .cvtColor (rimg2 , cv2 .COLOR_GRAY2RGB )
322
- cimage = cv2 .addWeighted (rimg1 ,alpha ,rimg2 ,1 - alpha ,0 )
384
+ elif len (rimg1 .shape ) == 3 and rimg2 .shape [2 ] == 4 :
385
+ rimg2 = cv2 .cvtColor (rimg2 , cv2 .COLOR_RGBA2RGB )
386
+
387
+ error_weighted = False
388
+ try :
389
+ cimage = cv2 .addWeighted (rimg1 ,alpha ,rimg2 ,1 - alpha ,0 )
390
+ except Exception as ex :
391
+ error_weighted = True
392
+ fastdup_capture_exception ("create_triplet_image" , ex , True , f"Dimes are { rimg1 .shape } { rimg2 .shape } " )
393
+
323
394
324
395
hierarchical_run = kwargs is not None and 'hierarchical_run' in kwargs and kwargs ['hierarchical_run' ]
325
396
text1 = os .path .splitext (os .path .basename (img1_path ))[0 ]
@@ -330,11 +401,11 @@ def create_triplet_img(row, work_dir, save_path, extract_filenames, get_bounding
330
401
331
402
(w , h ),nimg1 = draw_text (rimg1 , text1 , font_scale = 1 , pos = (10 , 10 ))
332
403
(w , h ),nimg2 = draw_text (rimg2 , text2 , font_scale = 1 , pos = (10 , 10 ))
333
- (w , h ),cimage = draw_text (cimage , 'blended image' , font_scale = 1 , pos = (10 , 10 ))
404
+ if not error_weighted :
405
+ (w , h ),cimage = draw_text (cimage , 'blended image' , font_scale = 1 , pos = (10 , 10 ))
406
+ assert cimage .shape [0 ] > 0 and cimage .shape [1 ] > 0
334
407
335
- assert cimage .shape [0 ] > 0 and cimage .shape [1 ] > 0
336
-
337
- if hierarchical_run :
408
+ if hierarchical_run or error_weighted :
338
409
hcon_img = hconcat_resize_min ([nimg1 , nimg2 ])
339
410
else :
340
411
hcon_img = hconcat_resize_min ([nimg1 , nimg2 , cimage ])
@@ -355,11 +426,9 @@ def create_triplet_img(row, work_dir, save_path, extract_filenames, get_bounding
355
426
lazy_load = 'lazy_load' in kwargs and kwargs ['lazy_load' ]
356
427
if lazy_load :
357
428
os .makedirs (os .path .join (save_path , 'images' ), exist_ok = True )
358
- hcon_img_path = f'{ save_path } /images/{ pid } .jpg'
429
+ hcon_img_path = f'{ save_path } /images/{ pid } _ { index } .jpg'
359
430
else :
360
- hcon_img_path = f'{ save_path } /{ pid } .jpg'
361
- cv2 .imwrite (hcon_img_path , hcon_img )
362
- assert os .path .exists (hcon_img_path ), f"Failed to write image to { hcon_img_path } "
363
-
431
+ hcon_img_path = f'{ save_path } /{ pid } _{ index } .jpg'
432
+ fastdup_imwrite (hcon_img_path , hcon_img )
364
433
return hcon_img , hcon_img_path
365
434
0 commit comments