@@ -1015,6 +1015,8 @@ def checkpoint_filter_fn(
         return _convert_openai_clip(state_dict, model)
     elif 'module.visual.class_embedding' in state_dict:
         return _convert_openai_clip(state_dict, model, prefix='module.visual.')
+    elif '_image_encoder.module.visual.class_embedding' in state_dict:
+        return _convert_openai_clip(state_dict, model, prefix='_image_encoder.module.visual.')
 
     if "mask_token" in state_dict:
         state_dict = _convert_dinov2(state_dict, model)
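
Note (not part of the patch): TinyCLIP checkpoints store the image tower under an '_image_encoder.module.visual.' prefix, which is why checkpoint_filter_fn gains a third prefix case before handing off to _convert_openai_clip. As a rough illustration of the prefix idea only (the actual converter also remaps the individual OpenAI-CLIP parameter names to timm ViT names), a minimal key-filtering helper could look like the sketch below; filter_by_prefix is a hypothetical name, not a timm function.

from typing import Dict
import torch

def filter_by_prefix(state_dict: Dict[str, torch.Tensor], prefix: str) -> Dict[str, torch.Tensor]:
    # Keep only keys under `prefix` and strip it, e.g.
    # '_image_encoder.module.visual.class_embedding' -> 'class_embedding'.
    return {k[len(prefix):]: v for k, v in state_dict.items() if k.startswith(prefix)}
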
@@ -1735,6 +1737,10 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         input_size=(3, 384, 384),
         num_classes=0),
 
+    'vit_8m_patch16_tinyclip_224.yfcc15m': _cfg(
+        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-8M-16-Text-3M-YFCC15M.pt',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+
     'vit_medium_patch16_reg4_256': _cfg(
         input_size=(3, 256, 256)),
     'vit_medium_patch16_reg4_gap_256': _cfg(
@@ -2621,6 +2627,14 @@ def vit_so400m_patch14_siglip_384(pretrained: bool = False, **kwargs) -> VisionT
     return model
 
 
+@register_model
+def vit_8m_patch16_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    model_args = dict(embed_dim=256, depth=10, num_heads=4, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_8m_patch16_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_medium_patch16_reg4_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
     model_args = dict(
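
Usage sketch (assumption, not part of the diff): with the pretrained cfg and register_model entry above, the new image encoder should be reachable through the standard timm factory. The tag 'vit_8m_patch16_tinyclip_224.yfcc15m' and the 512-dim output follow the cfg added in this patch; treat the snippet as a sketch rather than a tested example.

import timm
import torch

# Pretrained tag registered above; num_classes=512 in the cfg corresponds to the
# CLIP joint-embedding width, so the forward pass yields a 512-dim image embedding.
model = timm.create_model('vit_8m_patch16_tinyclip_224.yfcc15m', pretrained=True)
model.eval()

with torch.no_grad():
    x = torch.randn(1, 3, 224, 224)  # real inputs should be normalized with OPENAI_CLIP_MEAN/STD
    feats = model(x)
    print(feats.shape)  # expected: torch.Size([1, 512])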