Commit 047ce04

gguf: add memory calculator to CLI (#1225)
The logic to calculate memory usage is taken from various cpp files in the project. The estimate is typically accurate to within roughly 100 MB.

Example:

```
pnpm run build && npx . ~/work/models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -c 8000

* Memory usage estimation (with 8000 tokens in context)

  Item     | Memory usage
  ---------|-------------
  K cache  |      0.52 GB
  V cache  |      0.52 GB
  Weight   |      4.91 GB
  Overhead |      0.28 GB
           |          ---
  TOTAL    |      6.24 GB
```

TODO:
- [x] Fix multi-shard gguf
- [x] Do not calculate for recurrent models (mamba, rwkv) for now
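As a sanity check of the K cache row in the example above, here is a minimal sketch, assuming the usual Meta-Llama-3.1-8B shape (32 layers, 8 KV heads, head dimension 128) and an F16 KV cache; these constants are assumptions for illustration, not read from the GGUF file:

```ts
// Back-of-the-envelope check of the "K cache" row in the example above.
// Model shape constants are assumed (typical Llama-3.1-8B values), not parsed from the file.
const nLayer = 32; // block_count
const nHeadKv = 8; // attention.head_count_kv
const headDim = 128; // embedding_length (4096) / attention.head_count (32)
const nCtx = 8000; // tokens in context (-c 8000)
const bytesPerElem = 2; // F16 KV cache

const kCacheBytes = nLayer * nHeadKv * headDim * nCtx * bytesPerElem;
console.log((kCacheBytes / 1e9).toFixed(2) + " GB"); // ≈ 0.52 GB, matching the table
// The V cache is the same size here because key_length equals value_length for this model.
```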
1 parent b32dcb6 commit 047ce04

File tree

3 files changed (+183, -26 lines)

packages/gguf/src/cli.ts

Lines changed: 141 additions & 26 deletions
```diff
@@ -1,6 +1,7 @@
 #!/usr/bin/env node
 
-import { GGMLQuantizationType, gguf } from ".";
+import { GGMLQuantizationType, gguf, ggufAllShards, GGUFParseOutput } from ".";
+import { GGML_QUANT_SIZES } from "./quant-descriptions";
 
 interface PrintColumnHeader {
     name: string;
@@ -10,11 +11,44 @@ interface PrintColumnHeader {
 
 const mapDtypeToName = Object.fromEntries(Object.entries(GGMLQuantizationType).map(([name, value]) => [value, name]));
 
+function showHelp(exitCode: number) {
+    console.error("Usage: gguf-view [--help|-h] [--show-tensor] [--context|-c N] <path/to/gguf>");
+    console.error("  --help, -h        Show this help message");
+    console.error("  --show-tensor     Show tensor information");
+    console.error("  --context, -c N   Number of tokens in context (default: 4096)");
+    process.exit(exitCode);
+}
+
 async function main() {
-    const ggufPath = process.argv[2];
-    const { metadata, tensorInfos } = await gguf(ggufPath, {
+    let ggufPath = "";
+    let showTensors = false;
+    let nCtx = 4096;
+    for (let i = 2; i < process.argv.length; i++) {
+        if (process.argv[i] === "--help" || process.argv[i] === "-h") {
+            showHelp(0);
+        } else if (process.argv[i] === "--show-tensor") {
+            showTensors = true;
+        } else if (process.argv[i] === "--context" || process.argv[i] === "-c") {
+            nCtx = Number(process.argv[++i]);
+        } else {
+            ggufPath = process.argv[i];
+        }
+    }
+
+    if (!ggufPath.length) {
+        console.error("Error: Missing path to gguf file");
+        showHelp(1);
+    }
+
+    const { shards } = await ggufAllShards(ggufPath, {
         allowLocalFile: true,
     });
+    const { metadata, tensorInfos } = shards[0];
+
+    // merge all metadata
+    for (let i = 1; i < shards.length; i++) {
+        tensorInfos.push(...shards[i].tensorInfos);
+    }
 
     // TODO: print info about endianess
     console.log(`* Dumping ${Object.keys(metadata).length} key/value pair(s)`);
@@ -43,29 +77,110 @@ async function main() {
     );
 
     console.log();
-    console.log(`* Dumping ${tensorInfos.length} tensor(s)`);
-    printTable(
-        [
-            { name: "Idx", alignRight: true },
-            { name: "Num Elements", alignRight: true },
-            { name: "Shape" },
-            { name: "Data Type" },
-            { name: "Name" },
-        ],
-        tensorInfos.map((tensorInfo, i) => {
-            const shape = [1n, 1n, 1n, 1n];
-            tensorInfo.shape.forEach((dim, i) => {
-                shape[i] = dim;
-            });
-            return [
-                (i + 1).toString(),
-                shape.reduce((acc, n) => acc * n, 1n).toString(),
-                shape.map((n) => n.toString().padStart(6)).join(", "),
-                mapDtypeToName[tensorInfo.dtype],
-                tensorInfo.name,
-            ];
-        })
-    );
+    console.log(`* Memory usage estimation (with context length of ${nCtx} tokens)`);
+    try {
+        const kvUsage = calcMemoryUsage(metadata as GGUFParseOutput<{ strict: false }>["metadata"], nCtx);
+        let modelWeightInBytes = 0;
+        for (const tensorInfo of tensorInfos) {
+            const nElem = Number(tensorInfo.shape.reduce((a, b) => a * b, 1n));
+            const tensorSizeInBytes = nElem * (GGML_QUANT_SIZES[tensorInfo.dtype] / 8);
+            modelWeightInBytes += tensorSizeInBytes;
+        }
+        const overhead =
+            calcMemoryUsage(metadata as GGUFParseOutput<{ strict: false }>["metadata"], 256).totalBytes +
+            modelWeightInBytes * 0.05;
+        const totalMemoryUsage = kvUsage.totalBytes + overhead + modelWeightInBytes;
+        printTable(
+            [{ name: "Item" }, { name: "Memory usage", alignRight: true }],
+            [
+                ["K cache", (kvUsage.totalBytesK / 1e9).toFixed(2) + " GB"],
+                ["V cache", (kvUsage.totalBytesV / 1e9).toFixed(2) + " GB"],
+                ["Weight", (modelWeightInBytes / 1e9).toFixed(2) + " GB"],
+                ["Overhead", (overhead / 1e9).toFixed(2) + " GB"],
+                ["", "---"],
+                ["TOTAL", (totalMemoryUsage / 1e9).toFixed(2) + " GB"],
+            ]
+        );
+    } catch (e) {
+        console.error(`Error: ${(e as Error).message}`);
+    }
+
+    if (showTensors) {
+        console.log();
+        console.log(`* Dumping ${tensorInfos.length} tensor(s)`);
+        printTable(
+            [
+                { name: "Idx", alignRight: true },
+                { name: "Num Elements", alignRight: true },
+                { name: "Shape" },
+                { name: "Data Type" },
+                { name: "Name" },
+            ],
+            tensorInfos.map((tensorInfo, i) => {
+                const shape = [1n, 1n, 1n, 1n];
+                tensorInfo.shape.forEach((dim, i) => {
+                    shape[i] = dim;
+                });
+                return [
+                    (i + 1).toString(),
+                    shape.reduce((acc, n) => acc * n, 1n).toString(),
+                    shape.map((n) => n.toString().padStart(6)).join(", "),
+                    mapDtypeToName[tensorInfo.dtype],
+                    tensorInfo.name,
+                ];
+            })
+        );
+    } else {
+        console.log();
+        console.log(`* Use --show-tensor to display tensor information`);
+    }
+}
+
+function calcMemoryUsage(
+    metadata: GGUFParseOutput<{ strict: false }>["metadata"],
+    kvSize: number,
+    kvTypeK: GGMLQuantizationType = GGMLQuantizationType.F16,
+    kvTypeV: GGMLQuantizationType = GGMLQuantizationType.F16
+) {
+    const arch = metadata["general.architecture"] ?? "unknown";
+    const n_embd = (metadata[`${arch}.embedding_length`] as number) ?? 0;
+    const n_head = (metadata[`${arch}.attention.head_count`] as number) ?? 0;
+    const n_embd_head_k = (metadata[`${arch}.attention.key_length`] as number) ?? n_embd / n_head;
+    const n_embd_head_v = (metadata[`${arch}.attention.value_length`] as number) ?? n_embd / n_head;
+    const n_head_kv = (metadata[`${arch}.attention.head_count_kv`] as number[] | number) ?? [];
+    const n_layer = (metadata[`${arch}.block_count`] as number) ?? 0;
+
+    if (arch.startsWith("mamba") || arch.startsWith("rwkv")) {
+        throw new Error(`Memory usage estimation for arch "${arch}" is not supported`);
+    }
+
+    const n_head_kv_arr = Array(n_layer).fill(n_head);
+    if (Array.isArray(n_head_kv)) {
+        for (let i = 0; i < n_layer; i++) {
+            if (n_head_kv[i]) {
+                n_head_kv_arr[i] = n_head_kv[i];
+            }
+        }
+    } else {
+        for (let i = 0; i < n_layer; i++) {
+            n_head_kv_arr[i] = n_head_kv;
+        }
+    }
+
+    let totalElemsK = 0;
+    let totalElemsV = 0;
+    for (let i = 0; i < n_layer; i++) {
+        const n_embd_k_gqa = n_embd_head_k * n_head_kv_arr[i];
+        const n_embd_v_gqa = n_embd_head_v * n_head_kv_arr[i];
+        totalElemsK += n_embd_k_gqa * kvSize;
+        totalElemsV += n_embd_v_gqa * kvSize;
+    }
+
+    return {
+        totalBytesK: totalElemsK * (GGML_QUANT_SIZES[kvTypeK] / 8),
+        totalBytesV: totalElemsV * (GGML_QUANT_SIZES[kvTypeV] / 8),
+        totalBytes: (totalElemsK + totalElemsV) * (GGML_QUANT_SIZES[kvTypeV] / 8),
+    };
 }
 
 function printTable(header: PrintColumnHeader[], rows: string[][], leftPad = 2) {
```
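The `Overhead` row is a heuristic rather than a measurement: the code above charges the KV-cache cost of a 256-token context plus 5% of the weight size. A rough check against the commit-message example, using the rounded GB figures from that table (illustrative only):

```ts
// Rough check of the "Overhead" row using the heuristic applied in main() above:
// overhead = KV-cache cost at 256 tokens + 5% of the weight size.
const kvBytesAt8000 = 1.04e9; // K cache + V cache from the example table (0.52 + 0.52 GB)
const kvBytesAt256 = kvBytesAt8000 * (256 / 8000); // KV cost scales linearly with context length
const weightBytes = 4.91e9; // "Weight" row from the example table
const overheadBytes = kvBytesAt256 + weightBytes * 0.05;
console.log((overheadBytes / 1e9).toFixed(2) + " GB"); // ≈ 0.28 GB, matching the table
```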

packages/gguf/src/gguf.ts

Lines changed: 1 addition & 0 deletions
```diff
@@ -410,6 +410,7 @@ export async function ggufAllShards(
         fetch?: typeof fetch;
         additionalFetchHeaders?: Record<string, string>;
         parallelDownloads?: number;
+        allowLocalFile?: boolean;
     }
 ): Promise<{ shards: GGUFParseOutput[]; parameterCount: number }> {
     const parallelDownloads = params?.parallelDownloads ?? PARALLEL_DOWNLOADS;
```
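For context, `ggufAllShards` already returns every shard plus a total parameter count; this commit only adds the `allowLocalFile` passthrough. A minimal usage sketch (the model path is a placeholder and the package import is assumed to be `@huggingface/gguf`):

```ts
import { ggufAllShards } from "@huggingface/gguf";

async function inspectLocalModel() {
	// allowLocalFile (added in this commit) lets the shard loader read from local disk.
	const { shards, parameterCount } = await ggufAllShards("/path/to/model-00001-of-00002.gguf", {
		allowLocalFile: true,
	});
	const totalTensors = shards.reduce((n, shard) => n + shard.tensorInfos.length, 0);
	console.log(`${shards.length} shard(s), ${totalTensors} tensor(s), ${parameterCount} parameters`);
}

inspectLocalModel();
```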

packages/gguf/src/quant-descriptions.ts

Lines changed: 41 additions & 0 deletions
```diff
@@ -125,3 +125,44 @@ export const GGUF_QUANT_DESCRIPTIONS: Record<GGMLQuantizationType, { txt: string
         src_url: "https://en.wikipedia.org/wiki/Bfloat16_floating-point_format",
     },
 };
+
+const QK_K = 256;
+const calcBPW = (blockSize: number, typeSize: number) => {
+    return (typeSize * 8) / blockSize;
+};
+
+// copied from https://github.com/ggml-org/llama.cpp/tree/master/gguf-py/gguf/constants.py
+// map quantization type to element size in bits per weight (example: Q4_K -> 4.5 bpw)
+export const GGML_QUANT_SIZES = {
+    [GGMLQuantizationType.F32]: calcBPW(1, 4),
+    [GGMLQuantizationType.F16]: calcBPW(1, 2),
+    [GGMLQuantizationType.Q4_0]: calcBPW(32, 2 + 16),
+    [GGMLQuantizationType.Q4_1]: calcBPW(32, 2 + 2 + 16),
+    [GGMLQuantizationType.Q5_0]: calcBPW(32, 2 + 4 + 16),
+    [GGMLQuantizationType.Q5_1]: calcBPW(32, 2 + 2 + 4 + 16),
+    [GGMLQuantizationType.Q8_0]: calcBPW(32, 2 + 32),
+    [GGMLQuantizationType.Q8_1]: calcBPW(32, 4 + 4 + 32),
+    [GGMLQuantizationType.Q2_K]: calcBPW(256, 2 + 2 + QK_K / 16 + QK_K / 4),
+    [GGMLQuantizationType.Q3_K]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8 + 12),
+    [GGMLQuantizationType.Q4_K]: calcBPW(256, 2 + 2 + QK_K / 2 + 12),
+    [GGMLQuantizationType.Q5_K]: calcBPW(256, 2 + 2 + QK_K / 2 + QK_K / 8 + 12),
+    [GGMLQuantizationType.Q6_K]: calcBPW(256, 2 + QK_K / 2 + QK_K / 4 + QK_K / 16),
+    [GGMLQuantizationType.Q8_K]: calcBPW(256, 4 + QK_K + QK_K / 8),
+    [GGMLQuantizationType.IQ2_XXS]: calcBPW(256, 2 + QK_K / 4),
+    [GGMLQuantizationType.IQ2_XS]: calcBPW(256, 2 + QK_K / 4 + QK_K / 32),
+    [GGMLQuantizationType.IQ3_XXS]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8),
+    [GGMLQuantizationType.IQ1_S]: calcBPW(256, 2 + QK_K / 8 + QK_K / 16),
+    [GGMLQuantizationType.IQ4_NL]: calcBPW(32, 2 + 16),
+    [GGMLQuantizationType.IQ3_S]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8 + QK_K / 32 + 4),
+    [GGMLQuantizationType.IQ2_S]: calcBPW(256, 2 + QK_K / 4 + QK_K / 16),
+    [GGMLQuantizationType.IQ4_XS]: calcBPW(256, 2 + 2 + QK_K / 2 + QK_K / 64),
+    [GGMLQuantizationType.I8]: calcBPW(1, 1),
+    [GGMLQuantizationType.I16]: calcBPW(1, 2),
+    [GGMLQuantizationType.I32]: calcBPW(1, 4),
+    [GGMLQuantizationType.I64]: calcBPW(1, 8),
+    [GGMLQuantizationType.F64]: calcBPW(1, 8),
+    [GGMLQuantizationType.IQ1_M]: calcBPW(256, QK_K / 8 + QK_K / 16 + QK_K / 32),
+    [GGMLQuantizationType.BF16]: calcBPW(1, 2),
+    // [GGMLQuantizationType.TQ1_0]: calcBPW(256, 2 + 4 * 13),
+    // [GGMLQuantizationType.TQ2_0]: calcBPW(256, 2 + 64),
+};
```
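To make the bits-per-weight convention concrete: per the formula above, a Q4_K super-block covers 256 weights and stores 2 + 2 bytes of scales, 128 bytes of 4-bit quants, and 12 bytes of sub-block metadata, so 144 bytes per block. A worked check of the `Q4_K -> 4.5 bpw` comment, with a hypothetical tensor size for illustration:

```ts
// Worked check of the "Q4_K -> 4.5 bpw" example from the comment above.
const QK_K = 256; // weights per super-block
const q4kBlockBytes = 2 + 2 + QK_K / 2 + 12; // 144 bytes per 256-weight block
const q4kBitsPerWeight = (q4kBlockBytes * 8) / QK_K; // 4.5

// A tensor's weight size then follows directly, as the CLI computes it:
const numElements = 4096 * 4096; // e.g. a hypothetical 4096x4096 projection matrix
const tensorBytes = numElements * (q4kBitsPerWeight / 8);
console.log(q4kBitsPerWeight, (tensorBytes / 1e6).toFixed(1) + " MB"); // 4.5, "9.4 MB"
```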
