Update tasks with new models and apps (#1229)

merveenoyan · pcuenca · Vaibhavs10 · web-flow · commit 256e1eaf8231 · 2025-02-28T13:45:44.000+01:00
---------

Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
Co-authored-by: vb <vaibhavs10@gmail.com>
diff --git a/packages/tasks/src/tasks/depth-estimation/data.ts b/packages/tasks/src/tasks/depth-estimation/data.ts
@@ -41,7 +41,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A robust depth estimation model.",
-			id: "apple/DepthPro",
+			id: "apple/DepthPro-hf",
 		},
 	],
 	spaces: [
diff --git a/packages/tasks/src/tasks/image-text-to-text/data.ts b/packages/tasks/src/tasks/image-text-to-text/data.ts
@@ -48,7 +48,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A screenshot understanding model used to control computers.",
-			id: "showlab/ShowUI-2B",
+			id: "microsoft/OmniParser-v2.0",
 		},
 		{
 			description: "Cutting-edge vision language model.",
@@ -63,12 +63,16 @@ const taskData: TaskDataCustom = {
 			id: "Qwen/Qwen2.5-VL-7B-Instruct",
 		},
 		{
-			description: "Image-text-to-text model with reasoning capabilities.",
-			id: "Qwen/QVQ-72B-Preview",
+			description: "Image-text-to-text model with agentic capabilities.",
+			id: "microsoft/Magma-8B",
 		},
 		{
 			description: "Strong image-text-to-text model focused on documents.",
-			id: "stepfun-ai/GOT-OCR2_0",
+			id: "allenai/olmOCR-7B-0225-preview",
+		},
+		{
+			description: "Small yet strong image-text-to-text model.",
+			id: "ibm-granite/granite-vision-3.2-2b",
 		},
 	],
 	spaces: [
@@ -85,8 +89,8 @@ const taskData: TaskDataCustom = {
 			id: "akhaliq/Molmo-7B-D-0924",
 		},
 		{
-			description: "An image-text-to-text application focused on documents.",
-			id: "stepfun-ai/GOT_official_online_demo",
+			description: "Powerful vision language assistant that can understand multiple images.",
+			id: "HuggingFaceTB/SmolVLM2",
 		},
 		{
 			description: "An application for chatting with an image-text-to-text model.",
diff --git a/packages/tasks/src/tasks/keypoint-detection/data.ts b/packages/tasks/src/tasks/keypoint-detection/data.ts
@@ -27,6 +27,10 @@ const taskData: TaskDataCustom = {
 			description: "A robust keypoint detection model.",
 			id: "magic-leap-community/superpoint",
 		},
+		{
+			description: "A robust keypoint matching model.",
+			id: "magic-leap-community/superglue_outdoor",
+		},
 		{
 			description: "Strong keypoint detection model used to detect human pose.",
 			id: "facebook/sapiens-pose-1b",
diff --git a/packages/tasks/src/tasks/object-detection/data.ts b/packages/tasks/src/tasks/object-detection/data.ts
@@ -47,12 +47,12 @@ const taskData: TaskDataCustom = {
 			id: "facebook/detr-resnet-50",
 		},
 		{
-			description: "Real-time and accurate object detection model.",
-			id: "jameslahm/yolov10x",
+			description: "Accurate object detection model.",
+			id: "IDEA-Research/dab-detr-resnet-50",
 		},
 		{
-			description: "Fast and accurate object detection model trained on COCO and Object365 datasets.",
-			id: "PekingU/rtdetr_r18vd_coco_o365",
+			description: "Fast and accurate object detection model.",
+			id: "PekingU/rtdetr_v2_r50vd",
 		},
 		{
 			description: "Object detection model for low-lying objects.",
@@ -70,7 +70,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A cutting-edge object detection application.",
-			id: "Ultralytics/YOLO11",
+			id: "sunsmarterjieleaf/yolov12",
 		},
 		{
 			description: "An object tracking, segmentation and inpainting application.",
diff --git a/packages/tasks/src/tasks/text-generation/data.ts b/packages/tasks/src/tasks/text-generation/data.ts
@@ -76,7 +76,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A very powerful model with reasoning capabilities.",
-			id: "PowerInfer/SmallThinker-3B-Preview",
+			id: "simplescaling/s1.1-32B",
 		},
 		{
 			description: "Strong conversational model that supports very long instructions.",
diff --git a/packages/tasks/src/tasks/text-to-speech/data.ts b/packages/tasks/src/tasks/text-to-speech/data.ts
@@ -76,6 +76,10 @@ const taskData: TaskDataCustom = {
 			description: "An application that synthesizes emotional speech for diverse speaker prompts.",
 			id: "parler-tts/parler-tts-expresso",
 		},
+		{
+			description: "An application that generates podcast episodes.",
+			id: "ngxson/kokoro-podcast-generator",
+		},
 	],
 	summary:
 		"Text-to-Speech (TTS) is the task of generating natural sounding speech given text input. TTS models can be extended to have a single model that generates speech for multiple speakers and multiple languages.",
diff --git a/packages/tasks/src/tasks/text-to-video/data.ts b/packages/tasks/src/tasks/text-to-video/data.ts
@@ -78,6 +78,10 @@ const taskData: TaskDataCustom = {
 			description: "A text-to-video model focusing on physics-aware applications like robotics.",
 			id: "nvidia/Cosmos-1.0-Diffusion-7B-Text2World",
 		},
+		{
+			description: "A robust model for video generation.",
+			id: "Wan-AI/Wan2.1-T2V-1.3B",
+		},
 	],
 	spaces: [
 		{
@@ -86,7 +90,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "Consistent video generation application.",
-			id: "TIGER-Lab/T2V-Turbo-V2",
+			id: "Wan-AI/Wan2.1",
 		},
 		{
 			description: "A cutting edge video generation application.",
diff --git a/packages/tasks/src/tasks/video-text-to-text/data.ts b/packages/tasks/src/tasks/video-text-to-text/data.ts
@@ -46,6 +46,10 @@ const taskData: TaskDataCustom = {
 			description: "Strong video-text-to-text model with reasoning capabilities.",
 			id: "GoodiesHere/Apollo-LMMs-Apollo-7B-t32",
 		},
+		{
+			description: "Strong video-text-to-text model.",
+			id: "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+		},
 	],
 	spaces: [
 		{
@@ -56,6 +60,10 @@ const taskData: TaskDataCustom = {
 			description: "A leaderboard for various video-text-to-text models.",
 			id: "opencompass/openvlm_video_leaderboard",
 		},
+		{
+			description: "An application to generate highlights from a video.",
+			id: "HuggingFaceTB/SmolVLM2-HighlightGenerator",
+		},
 	],
 	summary:
 		"Video-text-to-text models take in a video and a text prompt and output text. These models are also called video-language models.",
diff --git a/packages/tasks/src/tasks/zero-shot-classification/data.ts b/packages/tasks/src/tasks/zero-shot-classification/data.ts
@@ -60,6 +60,10 @@ const taskData: TaskDataCustom = {
 			description: "Cutting-edge zero-shot multilingual text classification model.",
 			id: "MoritzLaurer/ModernBERT-large-zeroshot-v2.0",
 		},
+		{
+			description: "Zero-shot text classification model that can be used for topic and sentiment classification.",
+			id: "knowledgator/gliclass-modern-base-v2.0-init",
+		},
 	],
 	spaces: [],
 	summary:
diff --git a/packages/tasks/src/tasks/zero-shot-image-classification/data.ts b/packages/tasks/src/tasks/zero-shot-image-classification/data.ts
@@ -53,11 +53,11 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "Strong zero-shot image classification model.",
-			id: "google/siglip-so400m-patch14-224",
+			id: "google/siglip2-base-patch16-224",
 		},
 		{
 			description: "Robust zero-shot image classification model.",
-			id: "microsoft/LLM2CLIP-EVA02-L-14-336",
+			id: "intfloat/mmE5-mllama-11b-instruct",
 		},
 		{
 			description: "Powerful zero-shot image classification model supporting 94 languages.",

Original file line number	Diff line number	Diff line change
`@@ -41,7 +41,7 @@ const taskData: TaskDataCustom = {`
`41`	`41`	`},`
`42`	`42`	`{`
`43`	`43`	`description: "A robust depth estimation model.",`
`44`		`- id: "apple/DepthPro",`
	`44`	`+ id: "apple/DepthPro-hf",`
`45`	`45`	`},`
`46`	`46`	`],`
`47`	`47`	`spaces: [`
Original file line number	Diff line number	Diff line change
`@@ -47,12 +47,12 @@ const taskData: TaskDataCustom = {`
`47`	`47`	`id: "facebook/detr-resnet-50",`
`48`	`48`	`},`
`49`	`49`	`{`
`50`		`- description: "Real-time and accurate object detection model.",`
`51`		`- id: "jameslahm/yolov10x",`
	`50`	`+ description: "Accurate object detection model.",`
	`51`	`+ id: "IDEA-Research/dab-detr-resnet-50",`
`52`	`52`	`},`
`53`	`53`	`{`
`54`		`- description: "Fast and accurate object detection model trained on COCO and Object365 datasets.",`
`55`		`- id: "PekingU/rtdetr_r18vd_coco_o365",`
	`54`	`+ description: "Fast and accurate object detection model.",`
	`55`	`+ id: "PekingU/rtdetr_v2_r50vd",`
`56`	`56`	`},`
`57`	`57`	`{`
`58`	`58`	`description: "Object detection model for low-lying objects.",`
`@@ -70,7 +70,7 @@ const taskData: TaskDataCustom = {`
`70`	`70`	`},`
`71`	`71`	`{`
`72`	`72`	`description: "A cutting-edge object detection application.",`
`73`		`- id: "Ultralytics/YOLO11",`
	`73`	`+ id: "sunsmarterjieleaf/yolov12",`
`74`	`74`	`},`
`75`	`75`	`{`
`76`	`76`	`description: "An object tracking, segmentation and inpainting application.",`
Original file line number	Diff line number	Diff line change
`@@ -76,7 +76,7 @@ const taskData: TaskDataCustom = {`
`76`	`76`	`},`
`77`	`77`	`{`
`78`	`78`	`description: "A very powerful model with reasoning capabilities.",`
`79`		`- id: "PowerInfer/SmallThinker-3B-Preview",`
	`79`	`+ id: "simplescaling/s1.1-32B",`
`80`	`80`	`},`
`81`	`81`	`{`
`82`	`82`	`description: "Strong conversational model that supports very long instructions.",`
Original file line number	Diff line number	Diff line change
`@@ -78,6 +78,10 @@ const taskData: TaskDataCustom = {`
`78`	`78`	`description: "A text-to-video model focusing on physics-aware applications like robotics.",`
`79`	`79`	`id: "nvidia/Cosmos-1.0-Diffusion-7B-Text2World",`
`80`	`80`	`},`
	`81`	`+ {`
	`82`	`+ description: "A robust model for video generation.",`
	`83`	`+ id: "Wan-AI/Wan2.1-T2V-1.3B",`
	`84`	`+ },`
`81`	`85`	`],`
`82`	`86`	`spaces: [`
`83`	`87`	`{`
`@@ -86,7 +90,7 @@ const taskData: TaskDataCustom = {`
`86`	`90`	`},`
`87`	`91`	`{`
`88`	`92`	`description: "Consistent video generation application.",`
`89`		`- id: "TIGER-Lab/T2V-Turbo-V2",`
	`93`	`+ id: "Wan-AI/Wan2.1",`
`90`	`94`	`},`
`91`	`95`	`{`
`92`	`96`	`description: "A cutting edge video generation application.",`
Original file line number	Diff line number	Diff line change
`@@ -53,11 +53,11 @@ const taskData: TaskDataCustom = {`
`53`	`53`	`},`
`54`	`54`	`{`
`55`	`55`	`description: "Strong zero-shot image classification model.",`
`56`		`- id: "google/siglip-so400m-patch14-224",`
	`56`	`+ id: "google/siglip2-base-patch16-224",`
`57`	`57`	`},`
`58`	`58`	`{`
`59`	`59`	`description: "Robust zero-shot image classification model.",`
`60`		`- id: "microsoft/LLM2CLIP-EVA02-L-14-336",`
	`60`	`+ id: "intfloat/mmE5-mllama-11b-instruct",`
`61`	`61`	`},`
`62`	`62`	`{`
`63`	`63`	`description: "Powerful zero-shot image classification model supporting 94 languages.",`