Commit 790decc

Committed Dec 27, 2024

Add more pali(2) weights. Switch the rest of the models adapting open_clip weights over to their own weight instances.

Tags: v1.0.15, v1.0.13
1 parent 01cf0f7 · commit 790decc

File tree: 4 files changed (+178, -107 lines)
 

timm/models/byobnet.py

Lines changed: 8 additions & 24 deletions
@@ -2282,107 +2282,91 @@ def _cfgr(url='', **kwargs):
     # original attention pool head variants
     'resnet50_clip.openai': _cfgr(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
         classifier='head.proj',
     ),
     'resnet101_clip.openai': _cfgr(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=512, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
         classifier='head.proj',
     ),
     'resnet50x4_clip.openai': _cfgr(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=640, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         fixed_input_size=True, input_size=(3, 288, 288), pool_size=(9, 9),
         classifier='head.proj',
     ),
     'resnet50x16_clip.openai': _cfgr(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=768, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         fixed_input_size=True, input_size=(3, 384, 384), pool_size=(12, 12),
         classifier='head.proj',
     ),
     'resnet50x64_clip.openai': _cfgr(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         fixed_input_size=True, input_size=(3, 448, 448), pool_size=(14, 14),
         classifier='head.proj',
     ),
     'resnet50_clip.cc12m': _cfgr(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
         classifier='head.proj',
     ),
     'resnet50_clip.yfcc15m': _cfgr(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
         classifier='head.proj',
     ),
     'resnet101_clip.yfcc15m': _cfgr(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         num_classes=512, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
         classifier='head.proj',
     ),
 
     # avg-pool w/ optional standard classifier head variants
     'resnet50_clip_gap.openai': _cfgr(
-        hf_hub_id='timm/resnet50_clip.openai',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 224, 224), pool_size=(7, 7),
     ),
     'resnet101_clip_gap.openai': _cfgr(
-        hf_hub_id='timm/resnet101_clip.openai',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 224, 224), pool_size=(7, 7),
     ),
     'resnet50x4_clip_gap.openai': _cfgr(
-        hf_hub_id='timm/resnet50x4_clip.openai',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 288, 288), pool_size=(9, 9),
     ),
     'resnet50x16_clip_gap.openai': _cfgr(
-        hf_hub_id='timm/resnet50x16_clip.openai',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 384, 384), pool_size=(12, 12),
     ),
     'resnet50x64_clip_gap.openai': _cfgr(
-        hf_hub_id='timm/resnet50x64_clip.openai',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 448, 448), pool_size=(14, 14),
     ),
     'resnet50_clip_gap.cc12m': _cfgr(
-        hf_hub_id='timm/resnet50_clip.cc12m',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 224, 224), pool_size=(7, 7),
     ),
     'resnet50_clip_gap.yfcc15m': _cfgr(
-        hf_hub_id='timm/resnet50_clip.yfcc15m',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 224, 224), pool_size=(7, 7),
     ),
     'resnet101_clip_gap.yfcc15m': _cfgr(
-        hf_hub_id='timm/resnet101_clip.yfcc15m',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 224, 224), pool_size=(7, 7),
     ),
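The practical effect of the byobnet change: each adapted open_clip checkpoint now lives in its own timm/<model>.<tag> weight instance on the Hugging Face Hub instead of a shared open_clip_pytorch_model.bin, so it loads like any other pretrained timm model. A minimal sketch using the configs above (standard timm API; the printed shapes are expectations read off the configs, not verified output):

import timm
import torch

# Attention-pool head variant: forward output is the CLIP projection
# (num_classes=1024 above, served through the 'head.proj' classifier).
model = timm.create_model('resnet50_clip.openai', pretrained=True).eval()

# Gap variant: same trunk with average pooling and no classifier
# (num_classes=0 above), so forward yields pooled trunk features.
gap_model = timm.create_model('resnet50_clip_gap.openai', pretrained=True).eval()

x = torch.randn(1, 3, 224, 224)  # fixed_input_size=True for the attn-pool variant
with torch.no_grad():
    print(model(x).shape)      # expected: torch.Size([1, 1024])
    print(gap_model(x).shape)  # expected: torch.Size([1, 2048]), the trunk width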

timm/models/eva.py

Lines changed: 21 additions & 14 deletions
@@ -912,45 +912,52 @@ def _cfg(url='', **kwargs):
     # EVA01 and EVA02 CLIP image towers
     'eva_giant_patch14_clip_224.laion400m': _cfg(
         # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA01_CLIP_g_14_plus_psz14_s11B.pt',
-        hf_hub_id='timm/eva_giant_patch14_clip_224.laion400m_s11b_b41k',  # float16 weights
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        # hf_hub_id='timm/eva_giant_patch14_clip_224.laion400m_s11b_b41k',  # float16 weights
+        # hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=1024,
     ),
     'eva_giant_patch14_clip_224.merged2b': _cfg(
         # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA01_CLIP_g_14_plus_psz14_s11B.pt',
-        hf_hub_id='timm/eva_giant_patch14_plus_clip_224.merged2b_s11b_b114k',  # float16 weights
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        # hf_hub_id='timm/eva_giant_patch14_plus_clip_224.merged2b_s11b_b114k',  # float16 weights
+        # hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=1024,
     ),
     'eva02_base_patch16_clip_224.merged2b': _cfg(
         # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA02_CLIP_L_psz14_s4B.pt',
-        hf_hub_id='timm/eva02_base_patch16_clip_224.merged2b_s8b_b131k',  # float16 weights
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        # hf_hub_id='timm/eva02_base_patch16_clip_224.merged2b_s8b_b131k',  # float16 weights
+        # hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=512,
     ),
     'eva02_large_patch14_clip_224.merged2b': _cfg(
         # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA02_CLIP_L_psz14_s4B.pt',
-        hf_hub_id='timm/eva02_large_patch14_clip_224.merged2b_s4b_b131k',  # float16 weights
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        # hf_hub_id='timm/eva02_large_patch14_clip_224.merged2b_s4b_b131k',  # float16 weights
+        # hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=768,
     ),
     'eva02_large_patch14_clip_336.merged2b': _cfg(
         # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA02_CLIP_L_psz14_s4B.pt',
-        hf_hub_id='timm/eva02_large_patch14_clip_336.merged2b_s6b_b61k',  # float16 weights
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        # hf_hub_id='timm/eva02_large_patch14_clip_336.merged2b_s6b_b61k',  # float16 weights
+        # hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         input_size=(3, 336, 336), crop_pct=1.0,
         num_classes=768,
     ),
     'eva02_enormous_patch14_clip_224.laion2b': _cfg(
         # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA02_CLIP_E_psz14_plus_s9B.pt',
-        hf_hub_id='timm/eva02_enormous_patch14_clip_224.laion2b_s4b_b115k',  # float16 weights
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        # hf_hub_id='timm/eva02_enormous_patch14_clip_224.laion2b_s4b_b115k',  # float16 weights
+        # hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=1024,
     ),
     'eva02_enormous_patch14_clip_224.laion2b_plus': _cfg(
         # hf_hub_id='QuanSun/EVA-CLIP', hf_hub_filename='EVA02_CLIP_E_psz14_plus_s9B.pt',
-        hf_hub_id='timm/eva02_enormous_patch14_plus_clip_224.laion2b_s9b_b144k',  # bfloat16 weights
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        # hf_hub_id='timm/eva02_enormous_patch14_plus_clip_224.laion2b_s9b_b144k',  # bfloat16 weights
+        # hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         num_classes=1024,
     ),
     'eva02_enormous_patch14_clip_224.pretrain': _cfg(
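A note on how the bare hf_hub_id='timm/' resolves: timm's Hub helpers treat a trailing-slash id as "derive the repo name from the model name plus pretrained tag", which is why the explicit repo ids and open_clip filenames above could be dropped. A sketch of that reading (the resolution rule is my understanding of timm's hub convention, not something this diff states):

import timm

# With hf_hub_id='timm/', the checkpoint for tag 'merged2b' should resolve to
# the Hub repo 'timm/eva02_base_patch16_clip_224.merged2b'.
model = timm.create_model('eva02_base_patch16_clip_224.merged2b', pretrained=True)
print(model.num_classes)  # 512 per the config above: the CLIP projection dim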

timm/models/hieradet_sam2.py

Lines changed: 43 additions & 22 deletions
@@ -530,26 +530,47 @@ def _cfg(url='', **kwargs):
 
 
 default_cfgs = generate_default_cfgs({
-    "sam2_hiera_tiny.r224": _cfg(
-        hf_hub_id='facebook/sam2-hiera-tiny',
-        hf_hub_filename='sam2_hiera_tiny.pt',
-        input_size=(3, 224, 224), pool_size=(7, 7),
-    ),  # FIXME reduced res for testing
-    "sam2_hiera_tiny.r896": _cfg(
-        hf_hub_id='facebook/sam2-hiera-tiny',
-        hf_hub_filename='sam2_hiera_tiny.pt',
+    "sam2_hiera_tiny.fb_r896": _cfg(
+        # hf_hub_id='facebook/sam2-hiera-tiny',
+        # hf_hub_filename='sam2_hiera_tiny.pt',
+        hf_hub_id='timm/',
     ),
-    "sam2_hiera_small": _cfg(
-        hf_hub_id='facebook/sam2-hiera-small',
-        hf_hub_filename='sam2_hiera_small.pt',
+    "sam2_hiera_tiny.fb_r896_2pt1": _cfg(
+        # hf_hub_id='facebook/sam2.1-hiera-tiny',
+        # hf_hub_filename='sam2.1_hiera_tiny.pt',
+        hf_hub_id='timm/',
     ),
-    "sam2_hiera_base_plus": _cfg(
-        hf_hub_id='facebook/sam2-hiera-base-plus',
-        hf_hub_filename='sam2_hiera_base_plus.pt',
+    "sam2_hiera_small.fb_r896": _cfg(
+        # hf_hub_id='facebook/sam2-hiera-small',
+        # hf_hub_filename='sam2_hiera_small.pt',
+        hf_hub_id='timm/',
     ),
-    "sam2_hiera_large": _cfg(
-        hf_hub_id='facebook/sam2-hiera-large',
-        hf_hub_filename='sam2_hiera_large.pt',
+    "sam2_hiera_small.fb_r896_2pt1": _cfg(
+        # hf_hub_id='facebook/sam2.1-hiera-small',
+        # hf_hub_filename='sam2.1_hiera_small.pt',
+        hf_hub_id='timm/',
+    ),
+    "sam2_hiera_base_plus.fb_r896": _cfg(
+        # hf_hub_id='facebook/sam2-hiera-base-plus',
+        # hf_hub_filename='sam2_hiera_base_plus.pt',
+        hf_hub_id='timm/',
+    ),
+    "sam2_hiera_base_plus.fb_r896_2pt1": _cfg(
+        # hf_hub_id='facebook/sam2.1-hiera-base-plus',
+        # hf_hub_filename='sam2.1_hiera_base_plus.pt',
+        hf_hub_id='timm/',
+    ),
+    "sam2_hiera_large.fb_r1024": _cfg(
+        # hf_hub_id='facebook/sam2-hiera-large',
+        # hf_hub_filename='sam2_hiera_large.pt',
+        hf_hub_id='timm/',
+        min_input_size=(3, 256, 256),
+        input_size=(3, 1024, 1024), pool_size=(32, 32),
+    ),
+    "sam2_hiera_large.fb_r1024_2pt1": _cfg(
+        # hf_hub_id='facebook/sam2.1-hiera-large',
+        # hf_hub_filename='sam2.1_hiera_large.pt',
+        hf_hub_id='timm/',
         min_input_size=(3, 256, 256),
         input_size=(3, 1024, 1024), pool_size=(32, 32),
     ),
@@ -578,11 +599,11 @@ def checkpoint_filter_fn(state_dict, model=None, prefix=''):
 def _create_hiera_det(variant: str, pretrained: bool = False, **kwargs) -> HieraDet:
     out_indices = kwargs.pop('out_indices', 4)
     checkpoint_prefix = ''
-    if 'sam2' in variant:
-        # SAM2 pretrained weights have no classifier or final norm-layer (`head.norm`)
-        # This is workaround loading with num_classes=0 w/o removing norm-layer.
-        kwargs.setdefault('pretrained_strict', False)
-        checkpoint_prefix = 'image_encoder.trunk.'
+    # if 'sam2' in variant:
+    #     # SAM2 pretrained weights have no classifier or final norm-layer (`head.norm`)
+    #     # This is workaround loading with num_classes=0 w/o removing norm-layer.
+    #     kwargs.setdefault('pretrained_strict', False)
+    #     checkpoint_prefix = 'image_encoder.trunk.'
     return build_model_with_cfg(
         HieraDet,
         variant,
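With the image_encoder.trunk. prefix remap and the non-strict loading fallback commented out, the new timm/ weight instances were presumably re-published in a form that loads without those workarounds. A hedged usage sketch with one of the renamed variants (standard timm calls; only the names come from the configs above):

import timm
import torch

# 'sam2_hiera_tiny.fb_r896' replaces the old 'sam2_hiera_tiny.r896'; the SAM2
# trunks ship without a classifier, so num_classes=0 requests pooled features.
model = timm.create_model('sam2_hiera_tiny.fb_r896', pretrained=True, num_classes=0)
model.eval()
with torch.no_grad():
    feats = model(torch.randn(1, 3, 896, 896))  # pooled trunk features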

timm/models/vision_transformer.py

Lines changed: 106 additions & 47 deletions
@@ -912,26 +912,40 @@ def resize_pos_embed(
 
 
 @torch.no_grad()
-def _load_weights(model: VisionTransformer, checkpoint_path: str, prefix: str = '') -> None:
+def _load_weights(model: VisionTransformer, checkpoint_path: str, prefix: str = '', load_bfloat16: bool = False) -> None:
     """ Load weights from .npz checkpoints for official Google Brain Flax implementation
     """
     import numpy as np
+    if load_bfloat16:
+        import jax.numpy as jnp
+        import ml_dtypes
 
-    def _n2p(w, t=True, idx=None):
+    def _n2p(_w, t=True, idx=None):
         if idx is not None:
-            w = w[idx]
-        if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
-            w = w.flatten()
+            _w = _w[idx]
+
+        if load_bfloat16:
+            _w = _w.view(ml_dtypes.bfloat16).astype(jnp.float32)
+            _w = np.array(_w)
+
+        if _w.ndim == 4 and _w.shape[0] == _w.shape[1] == _w.shape[2] == 1:
+            _w = _w.flatten()
         if t:
-            if w.ndim == 4:
-                w = w.transpose([3, 2, 0, 1])
-            elif w.ndim == 3:
-                w = w.transpose([2, 0, 1])
-            elif w.ndim == 2:
-                w = w.transpose([1, 0])
-        return torch.from_numpy(w)
-
-    w = np.load(checkpoint_path)
+            if _w.ndim == 4:
+                _w = _w.transpose([3, 2, 0, 1])
+            elif _w.ndim == 3:
+                _w = _w.transpose([2, 0, 1])
+            elif _w.ndim == 2:
+                _w = _w.transpose([1, 0])
+
+        _w = torch.from_numpy(_w)
+        return _w
+
+    if load_bfloat16:
+        w = jnp.load(checkpoint_path)
+    else:
+        w = np.load(checkpoint_path)
+
     interpolation = 'bilinear'
     antialias = False
     big_vision = False
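The load_bfloat16 path exists because numpy has no native bfloat16: .npz checkpoints saved from JAX in bfloat16 come back as raw 2-byte payloads, so each array is reinterpreted through ml_dtypes.bfloat16, upcast to float32, and only then converted for torch. A standalone sketch of that decode step, mirroring _n2p above (file and array names are hypothetical):

import jax.numpy as jnp
import ml_dtypes
import numpy as np
import torch

ckpt = jnp.load('paligemma2_tower.npz')  # hypothetical bfloat16 .npz checkpoint
raw = ckpt['img/embedding/kernel']       # hypothetical array name

# Reinterpret the raw buffer as bfloat16, upcast to float32, then hand the
# result to torch via numpy.
decoded = np.array(raw.view(ml_dtypes.bfloat16).astype(jnp.float32))
tensor = torch.from_numpy(decoded)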
@@ -1593,18 +1607,18 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
 
     'vit_base_patch32_clip_224.laion400m_e32': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
     'vit_base_patch16_clip_224.laion400m_e32': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_base_patch16_plus_clip_240.laion400m_e32': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        input_size=(3, 240, 240), crop_pct=1.0, num_classes=512),
+        input_size=(3, 240, 240), crop_pct=1.0, num_classes=640),
     'vit_large_patch14_clip_224.laion400m_e32': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
 
     'vit_base_patch32_clip_224.datacompxl': _cfg(
@@ -1622,22 +1636,18 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
 
     'vit_base_patch16_clip_224.dfn2b': _cfg(
-        hf_hub_id='apple/DFN2B-CLIP-ViT-B-16',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_large_patch14_clip_224.dfn2b': _cfg(
-        hf_hub_id='apple/DFN2B-CLIP-ViT-L-14',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
     'vit_huge_patch14_clip_224.dfn5b': _cfg(
-        hf_hub_id='apple/DFN5B-CLIP-ViT-H-14',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
     'vit_huge_patch14_clip_378.dfn5b': _cfg(
-        hf_hub_id='apple/DFN5B-CLIP-ViT-H-14-378',
-        hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         crop_pct=1.0, input_size=(3, 378, 378), num_classes=1024),
@@ -1700,7 +1710,7 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
     'vit_large_patch14_clip_336.openai': _cfg(
-        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         crop_pct=1.0, input_size=(3, 336, 336), num_classes=768),
@@ -1907,15 +1917,22 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_id='timm/',
         num_classes=0),
     'vit_so400m_patch14_siglip_gap_224.pali_mix': _cfg(
-        hf_hub_id='google/paligemma-3b-mix-224-jax',
-        hf_hub_filename='paligemma-3b-mix-224.npz',
-        custom_load='hf',
+        hf_hub_id='timm/',
         num_classes=0),
     'vit_so400m_patch14_siglip_gap_224.pali_pt': _cfg(
-        hf_hub_id='google/paligemma-3b-pt-224-jax',
-        hf_hub_filename='paligemma-3b-pt-224.npz',
-        custom_load='hf',
+        hf_hub_id='timm/',
+        num_classes=0),
+    'vit_so400m_patch14_siglip_gap_224.pali2_3b_pt': _cfg(
+        hf_hub_id='timm/',
         num_classes=0),
+    'vit_so400m_patch14_siglip_gap_224.pali2_10b_pt': _cfg(
+        hf_hub_id='timm/',
+        num_classes=0),
+    # 'vit_so400m_patch14_siglip_gap_224.pali2_28b_pt': _cfg(
+    #     hf_hub_id='google/paligemma2-28b-pt-224-jax',
+    #     hf_hub_filename='pt_27b_224.npz',
+    #     custom_load='hf',
+    #     num_classes=0),
     'vit_so400m_patch16_siglip_gap_256.webli_i18n': _cfg(
         hf_hub_id='timm/',
         input_size=(3, 256, 256),
@@ -1929,23 +1946,69 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         input_size=(3, 384, 384), crop_pct=1.0,
         num_classes=0),
     'vit_so400m_patch14_siglip_gap_448.pali_mix': _cfg(
-        hf_hub_id='google/paligemma-3b-mix-448-jax',
-        hf_hub_filename='paligemma-3b-mix-448.npz',
-        custom_load='hf',
+        hf_hub_id='timm/',
         input_size=(3, 448, 448), crop_pct=1.0,
         num_classes=0),
     'vit_so400m_patch14_siglip_gap_448.pali_pt': _cfg(
-        hf_hub_id='google/paligemma-3b-pt-448-jax',
-        hf_hub_filename='paligemma-3b-pt-448.npz',
-        custom_load='hf',
+        hf_hub_id='timm/',
+        input_size=(3, 448, 448), crop_pct=1.0,
+        num_classes=0),
+    'vit_so400m_patch14_siglip_gap_448.pali_refcoco_seg': _cfg(
+        hf_hub_id='timm/',
+        input_size=(3, 448, 448), crop_pct=1.0,
+        num_classes=0),
+    'vit_so400m_patch14_siglip_gap_448.pali_ocrvqa': _cfg(
+        hf_hub_id='timm/',
+        input_size=(3, 448, 448), crop_pct=1.0,
+        num_classes=0),
+    'vit_so400m_patch14_siglip_gap_448.pali2_3b_pt': _cfg(
+        hf_hub_id='timm/',
+        input_size=(3, 448, 448), crop_pct=1.0,
+        num_classes=0),
+    'vit_so400m_patch14_siglip_gap_448.pali2_10b_pt': _cfg(
+        hf_hub_id='timm/',
+        input_size=(3, 448, 448), crop_pct=1.0,
+        num_classes=0),
+    # 'vit_so400m_patch14_siglip_gap_448.pali2_28b_pt': _cfg(
+    #     hf_hub_id='google/paligemma2-28b-pt-448-jax',
+    #     hf_hub_filename='pt_27b_448.npz',
+    #     custom_load='hf',
+    #     input_size=(3, 448, 448), crop_pct=1.0,
+    #     num_classes=0),
+    'vit_so400m_patch14_siglip_gap_448.pali2_3b_docci': _cfg(
+        hf_hub_id='timm/',
+        input_size=(3, 448, 448), crop_pct=1.0,
+        num_classes=0),
+    'vit_so400m_patch14_siglip_gap_448.pali2_10b_docci': _cfg(
+        hf_hub_id='timm/',
         input_size=(3, 448, 448), crop_pct=1.0,
         num_classes=0),
     'vit_so400m_patch14_siglip_gap_896.pali_pt': _cfg(
-        hf_hub_id='google/paligemma-3b-pt-896-jax',
-        hf_hub_filename='paligemma-3b-pt-896.npz',
-        custom_load='hf',
+        hf_hub_id='timm/',
+        input_size=(3, 896, 896), crop_pct=1.0,
+        num_classes=0),
+    'vit_so400m_patch14_siglip_gap_896.pali_refcoco_seg': _cfg(
+        hf_hub_id='timm/',
+        input_size=(3, 896, 896), crop_pct=1.0,
+        num_classes=0),
+    'vit_so400m_patch14_siglip_gap_896.pali_ocrvqa': _cfg(
+        hf_hub_id='timm/',
+        input_size=(3, 896, 896), crop_pct=1.0,
+        num_classes=0),
+    'vit_so400m_patch14_siglip_gap_896.pali2_3b_pt': _cfg(
+        hf_hub_id='timm/',
+        input_size=(3, 896, 896), crop_pct=1.0,
+        num_classes=0),
+    'vit_so400m_patch14_siglip_gap_896.pali2_10b_pt': _cfg(
+        hf_hub_id='timm/',
         input_size=(3, 896, 896), crop_pct=1.0,
         num_classes=0),
+    # 'vit_so400m_patch14_siglip_gap_896.pali2_28b_pt': _cfg(
+    #     hf_hub_id='google/paligemma2-28b-pt-896-jax',
+    #     hf_hub_filename='pt_27b_896.npz',
+    #     custom_load='hf',
+    #     input_size=(3, 896, 896), crop_pct=1.0,
+    #     num_classes=0),
 
     'vit_so400m_patch14_siglip_378.webli_ft_in1k': _cfg(
         hf_hub_id='timm/',
@@ -1958,22 +2021,18 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
 
     'vit_xsmall_patch16_clip_224.tinyclip_yfcc15m': _cfg(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         license='mit',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
     'vit_medium_patch32_clip_224.tinyclip_laion400m': _cfg(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         license='mit',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
     'vit_medium_patch16_clip_224.tinyclip_yfcc15m': _cfg(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         license='mit',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
     'vit_betwixt_patch32_clip_224.tinyclip_laion400m': _cfg(
         hf_hub_id='timm/',
-        hf_hub_filename='open_clip_pytorch_model.bin',
         license='mit',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
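The headline of the commit is the block of new PaliGemma and PaliGemma 2 SigLIP image-tower weights above: pali2_3b and pali2_10b at 224, 448, and 896 px, plus the pali_refcoco_seg, pali_ocrvqa, and pali2_*_docci fine-tunes, all served from timm/ weight instances; only the 28B checkpoints stay commented out. A usage sketch for one new tag (standard timm calls; the output width is inferred from the SigLIP so400m config, stated as an expectation):

import timm
import torch

# New in this commit: the PaliGemma 2 3B image tower at 448 px; num_classes=0
# in the config, so the model returns pooled (gap) SigLIP features.
model = timm.create_model('vit_so400m_patch14_siglip_gap_448.pali2_3b_pt', pretrained=True)
model.eval()
with torch.no_grad():
    feats = model(torch.randn(1, 3, 448, 448))
print(feats.shape)  # expected: torch.Size([1, 1152]), the so400m embed width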
