@@ -1739,6 +1739,19 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:

    'vit_8m_patch16_tinyclip_224.yfcc15m': _cfg(
        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-8M-16-Text-3M-YFCC15M.pt',
+        license='mit',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_39m_patch16_tinyclip_224.yfcc15m': _cfg(
+        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-39M-16-Text-19M-YFCC15M.pt',
+        license='mit',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_40m_patch32_tinyclip_224.laion400m': _cfg(
+        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-40M-32-Text-19M-LAION400M.pt',
+        license='mit',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_61m_patch32_tinyclip_224.laion400m': _cfg(
+        url='https://github.com/wkcn/TinyCLIP-model-zoo/releases/download/checkpoints/TinyCLIP-ViT-61M-32-Text-29M-LAION400M.pt',
+        license='mit',
        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),

    'vit_medium_patch16_reg4_256': _cfg(
@@ -2635,6 +2648,32 @@ def vit_8m_patch16_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
    return model


+@register_model
+def vit_39m_patch16_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    model_args = dict(embed_dim=512, depth=12, num_heads=8, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_39m_patch16_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_40m_patch32_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    model_args = dict(
+        patch_size=32, embed_dim=512, depth=12, num_heads=8, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_40m_patch32_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_61m_patch32_tinyclip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    model_args = dict(
+        patch_size=32, embed_dim=640, depth=12, num_heads=10, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_61m_patch32_tinyclip_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
@register_model
def vit_medium_patch16_reg4_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
    model_args = dict(
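Usage sketch (not part of the diff): once these entrypoints are registered, the new TinyCLIP image towers can be built through the standard timm factory. This is a minimal illustration only; loading the pretrained .pt files with pretrained=True additionally assumes the PR's checkpoint-filtering changes, which are not shown in these hunks.

import timm
import torch

# Build the architecture only (no weight download). The resolved default cfg tag
# carries num_classes=512, so the classifier head should project to the 512-d
# CLIP embedding width rather than 1000 ImageNet classes.
model = timm.create_model('vit_40m_patch32_tinyclip_224.laion400m', pretrained=False)
model.eval()

with torch.no_grad():
    feats = model(torch.randn(1, 3, 224, 224))
print(feats.shape)  # expected: torch.Size([1, 512])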