Commit 047ce04

gguf: add memory calculator to CLI (#1225)
The logic to calculate memory usage is taken from various cpp files in the project. The estimate is typically accurate to within roughly 100 MB.

Example:

```
pnpm run build && npx . ~/work/models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -c 8000

* Memory usage estimation (with 8000 tokens in context)

  Item     | Memory usage
  ---------|-------------
  K cache  |      0.52 GB
  V cache  |      0.52 GB
  Weight   |      4.91 GB
  Overhead |      0.28 GB
           |          ---
  TOTAL    |      6.24 GB
```

TODO:
- [x] Fix multi-shard gguf
- [x] Do not calculate for recurrent models (mamba, rwkv) for now
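As a sanity check of the K cache row in the example above, here is a minimal sketch, assuming the usual Meta-Llama-3.1-8B shape (32 layers, 8 KV heads, head dimension 128) and an F16 KV cache; these constants are assumptions for illustration, not read from the GGUF file:

```ts
// Back-of-the-envelope check of the "K cache" row in the example above.
// Model shape constants are assumed (typical Llama-3.1-8B values), not parsed from the file.
const nLayer = 32; // block_count
const nHeadKv = 8; // attention.head_count_kv
const headDim = 128; // embedding_length (4096) / attention.head_count (32)
const nCtx = 8000; // tokens in context (-c 8000)
const bytesPerElem = 2; // F16 KV cache

const kCacheBytes = nLayer * nHeadKv * headDim * nCtx * bytesPerElem;
console.log((kCacheBytes / 1e9).toFixed(2) + " GB"); // ≈ 0.52 GB, matching the table
// The V cache is the same size here because key_length equals value_length for this model.
```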
1 parent b32dcb6 commit 047ce04

File tree

3 files changed (+183, -26 lines)

packages/gguf/src/cli.ts

Lines changed: 141 additions & 26 deletions
```diff
@@ -1,6 +1,7 @@
 #!/usr/bin/env node
 
-import { GGMLQuantizationType, gguf } from ".";
+import { GGMLQuantizationType, gguf, ggufAllShards, GGUFParseOutput } from ".";
+import { GGML_QUANT_SIZES } from "./quant-descriptions";
 
 interface PrintColumnHeader {
     name: string;
@@ -10,11 +11,44 @@ interface PrintColumnHeader {
 
 const mapDtypeToName = Object.fromEntries(Object.entries(GGMLQuantizationType).map(([name, value]) => [value, name]));
 
+function showHelp(exitCode: number) {
+    console.error("Usage: gguf-view [--help|-h] [--show-tensor] [--context|-c N] <path/to/gguf>");
+    console.error("  --help, -h        Show this help message");
+    console.error("  --show-tensor     Show tensor information");
+    console.error("  --context, -c N   Number of tokens in context (default: 4096)");
+    process.exit(exitCode);
+}
+
 async function main() {
-    const ggufPath = process.argv[2];
-    const { metadata, tensorInfos } = await gguf(ggufPath, {
+    let ggufPath = "";
+    let showTensors = false;
+    let nCtx = 4096;
+    for (let i = 2; i < process.argv.length; i++) {
+        if (process.argv[i] === "--help" || process.argv[i] === "-h") {
+            showHelp(0);
+        } else if (process.argv[i] === "--show-tensor") {
+            showTensors = true;
+        } else if (process.argv[i] === "--context" || process.argv[i] === "-c") {
+            nCtx = Number(process.argv[++i]);
+        } else {
+            ggufPath = process.argv[i];
+        }
+    }
+
+    if (!ggufPath.length) {
+        console.error("Error: Missing path to gguf file");
+        showHelp(1);
+    }
+
+    const { shards } = await ggufAllShards(ggufPath, {
         allowLocalFile: true,
     });
+    const { metadata, tensorInfos } = shards[0];
+
+    // merge all metadata
+    for (let i = 1; i < shards.length; i++) {
+        tensorInfos.push(...shards[i].tensorInfos);
+    }
 
     // TODO: print info about endianess
     console.log(`* Dumping ${Object.keys(metadata).length} key/value pair(s)`);
@@ -43,29 +77,110 @@ async function main() {
     );
 
     console.log();
-    console.log(`* Dumping ${tensorInfos.length} tensor(s)`);
-    printTable(
-        [
-            { name: "Idx", alignRight: true },
-            { name: "Num Elements", alignRight: true },
-            { name: "Shape" },
-            { name: "Data Type" },
-            { name: "Name" },
-        ],
-        tensorInfos.map((tensorInfo, i) => {
-            const shape = [1n, 1n, 1n, 1n];
-            tensorInfo.shape.forEach((dim, i) => {
-                shape[i] = dim;
-            });
-            return [
-                (i + 1).toString(),
-                shape.reduce((acc, n) => acc * n, 1n).toString(),
-                shape.map((n) => n.toString().padStart(6)).join(", "),
-                mapDtypeToName[tensorInfo.dtype],
-                tensorInfo.name,
-            ];
-        })
-    );
+    console.log(`* Memory usage estimation (with context length of ${nCtx} tokens)`);
+    try {
+        const kvUsage = calcMemoryUsage(metadata as GGUFParseOutput<{ strict: false }>["metadata"], nCtx);
+        let modelWeightInBytes = 0;
+        for (const tensorInfo of tensorInfos) {
+            const nElem = Number(tensorInfo.shape.reduce((a, b) => a * b, 1n));
+            const tensorSizeInBytes = nElem * (GGML_QUANT_SIZES[tensorInfo.dtype] / 8);
+            modelWeightInBytes += tensorSizeInBytes;
+        }
+        const overhead =
+            calcMemoryUsage(metadata as GGUFParseOutput<{ strict: false }>["metadata"], 256).totalBytes +
+            modelWeightInBytes * 0.05;
+        const totalMemoryUsage = kvUsage.totalBytes + overhead + modelWeightInBytes;
+        printTable(
+            [{ name: "Item" }, { name: "Memory usage", alignRight: true }],
+            [
+                ["K cache", (kvUsage.totalBytesK / 1e9).toFixed(2) + " GB"],
+                ["V cache", (kvUsage.totalBytesV / 1e9).toFixed(2) + " GB"],
+                ["Weight", (modelWeightInBytes / 1e9).toFixed(2) + " GB"],
+                ["Overhead", (overhead / 1e9).toFixed(2) + " GB"],
+                ["", "---"],
+                ["TOTAL", (totalMemoryUsage / 1e9).toFixed(2) + " GB"],
+            ]
+        );
+    } catch (e) {
+        console.error(`Error: ${(e as Error).message}`);
+    }
+
+    if (showTensors) {
+        console.log();
+        console.log(`* Dumping ${tensorInfos.length} tensor(s)`);
+        printTable(
+            [
+                { name: "Idx", alignRight: true },
+                { name: "Num Elements", alignRight: true },
+                { name: "Shape" },
+                { name: "Data Type" },
+                { name: "Name" },
+            ],
+            tensorInfos.map((tensorInfo, i) => {
+                const shape = [1n, 1n, 1n, 1n];
+                tensorInfo.shape.forEach((dim, i) => {
+                    shape[i] = dim;
+                });
+                return [
+                    (i + 1).toString(),
+                    shape.reduce((acc, n) => acc * n, 1n).toString(),
+                    shape.map((n) => n.toString().padStart(6)).join(", "),
+                    mapDtypeToName[tensorInfo.dtype],
+                    tensorInfo.name,
+                ];
+            })
+        );
+    } else {
+        console.log();
+        console.log(`* Use --show-tensor to display tensor information`);
+    }
+}
+
+function calcMemoryUsage(
+    metadata: GGUFParseOutput<{ strict: false }>["metadata"],
+    kvSize: number,
+    kvTypeK: GGMLQuantizationType = GGMLQuantizationType.F16,
+    kvTypeV: GGMLQuantizationType = GGMLQuantizationType.F16
+) {
+    const arch = metadata["general.architecture"] ?? "unknown";
+    const n_embd = (metadata[`${arch}.embedding_length`] as number) ?? 0;
+    const n_head = (metadata[`${arch}.attention.head_count`] as number) ?? 0;
+    const n_embd_head_k = (metadata[`${arch}.attention.key_length`] as number) ?? n_embd / n_head;
+    const n_embd_head_v = (metadata[`${arch}.attention.value_length`] as number) ?? n_embd / n_head;
+    const n_head_kv = (metadata[`${arch}.attention.head_count_kv`] as number[] | number) ?? [];
+    const n_layer = (metadata[`${arch}.block_count`] as number) ?? 0;
+
+    if (arch.startsWith("mamba") || arch.startsWith("rwkv")) {
+        throw new Error(`Memory usage estimation for arch "${arch}" is not supported`);
+    }
+
+    const n_head_kv_arr = Array(n_layer).fill(n_head);
+    if (Array.isArray(n_head_kv)) {
+        for (let i = 0; i < n_layer; i++) {
+            if (n_head_kv[i]) {
+                n_head_kv_arr[i] = n_head_kv[i];
+            }
+        }
+    } else {
+        for (let i = 0; i < n_layer; i++) {
+            n_head_kv_arr[i] = n_head_kv;
+        }
+    }
+
+    let totalElemsK = 0;
+    let totalElemsV = 0;
+    for (let i = 0; i < n_layer; i++) {
+        const n_embd_k_gqa = n_embd_head_k * n_head_kv_arr[i];
+        const n_embd_v_gqa = n_embd_head_v * n_head_kv_arr[i];
+        totalElemsK += n_embd_k_gqa * kvSize;
+        totalElemsV += n_embd_v_gqa * kvSize;
+    }
+
+    return {
+        totalBytesK: totalElemsK * (GGML_QUANT_SIZES[kvTypeK] / 8),
+        totalBytesV: totalElemsV * (GGML_QUANT_SIZES[kvTypeV] / 8),
+        totalBytes: (totalElemsK + totalElemsV) * (GGML_QUANT_SIZES[kvTypeV] / 8),
+    };
 }
 
 function printTable(header: PrintColumnHeader[], rows: string[][], leftPad = 2) {
```
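The `Overhead` row is a heuristic rather than a measurement: the code above charges the KV-cache cost of a 256-token context plus 5% of the weight size. A rough check against the commit-message example, using the rounded GB figures from that table (illustrative only):

```ts
// Rough check of the "Overhead" row using the heuristic applied in main() above:
// overhead = KV-cache cost at 256 tokens + 5% of the weight size.
const kvBytesAt8000 = 1.04e9; // K cache + V cache from the example table (0.52 + 0.52 GB)
const kvBytesAt256 = kvBytesAt8000 * (256 / 8000); // KV cost scales linearly with context length
const weightBytes = 4.91e9; // "Weight" row from the example table
const overheadBytes = kvBytesAt256 + weightBytes * 0.05;
console.log((overheadBytes / 1e9).toFixed(2) + " GB"); // ≈ 0.28 GB, matching the table
```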

packages/gguf/src/gguf.ts

Lines changed: 1 addition & 0 deletions
```diff
@@ -410,6 +410,7 @@ export async function ggufAllShards(
         fetch?: typeof fetch;
         additionalFetchHeaders?: Record<string, string>;
         parallelDownloads?: number;
+        allowLocalFile?: boolean;
     }
 ): Promise<{ shards: GGUFParseOutput[]; parameterCount: number }> {
     const parallelDownloads = params?.parallelDownloads ?? PARALLEL_DOWNLOADS;
```
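For context, `ggufAllShards` already returns every shard plus a total parameter count; this commit only adds the `allowLocalFile` passthrough. A minimal usage sketch (the model path is a placeholder and the package import is assumed to be `@huggingface/gguf`):

```ts
import { ggufAllShards } from "@huggingface/gguf";

async function inspectLocalModel() {
	// allowLocalFile (added in this commit) lets the shard loader read from local disk.
	const { shards, parameterCount } = await ggufAllShards("/path/to/model-00001-of-00002.gguf", {
		allowLocalFile: true,
	});
	const totalTensors = shards.reduce((n, shard) => n + shard.tensorInfos.length, 0);
	console.log(`${shards.length} shard(s), ${totalTensors} tensor(s), ${parameterCount} parameters`);
}

inspectLocalModel();
```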

packages/gguf/src/quant-descriptions.ts

Lines changed: 41 additions & 0 deletions
```diff
@@ -125,3 +125,44 @@ export const GGUF_QUANT_DESCRIPTIONS: Record<GGMLQuantizationType, { txt: string
         src_url: "https://en.wikipedia.org/wiki/Bfloat16_floating-point_format",
     },
 };
+
+const QK_K = 256;
+const calcBPW = (blockSize: number, typeSize: number) => {
+    return (typeSize * 8) / blockSize;
+};
+
+// copied from https://github.com/ggml-org/llama.cpp/tree/master/gguf-py/gguf/constants.py
+// map quantization type to element size in bits per weight (example: Q4_K -> 4.5 bpw)
+export const GGML_QUANT_SIZES = {
+    [GGMLQuantizationType.F32]: calcBPW(1, 4),
+    [GGMLQuantizationType.F16]: calcBPW(1, 2),
+    [GGMLQuantizationType.Q4_0]: calcBPW(32, 2 + 16),
+    [GGMLQuantizationType.Q4_1]: calcBPW(32, 2 + 2 + 16),
+    [GGMLQuantizationType.Q5_0]: calcBPW(32, 2 + 4 + 16),
+    [GGMLQuantizationType.Q5_1]: calcBPW(32, 2 + 2 + 4 + 16),
+    [GGMLQuantizationType.Q8_0]: calcBPW(32, 2 + 32),
+    [GGMLQuantizationType.Q8_1]: calcBPW(32, 4 + 4 + 32),
+    [GGMLQuantizationType.Q2_K]: calcBPW(256, 2 + 2 + QK_K / 16 + QK_K / 4),
+    [GGMLQuantizationType.Q3_K]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8 + 12),
+    [GGMLQuantizationType.Q4_K]: calcBPW(256, 2 + 2 + QK_K / 2 + 12),
+    [GGMLQuantizationType.Q5_K]: calcBPW(256, 2 + 2 + QK_K / 2 + QK_K / 8 + 12),
+    [GGMLQuantizationType.Q6_K]: calcBPW(256, 2 + QK_K / 2 + QK_K / 4 + QK_K / 16),
+    [GGMLQuantizationType.Q8_K]: calcBPW(256, 4 + QK_K + QK_K / 8),
+    [GGMLQuantizationType.IQ2_XXS]: calcBPW(256, 2 + QK_K / 4),
+    [GGMLQuantizationType.IQ2_XS]: calcBPW(256, 2 + QK_K / 4 + QK_K / 32),
+    [GGMLQuantizationType.IQ3_XXS]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8),
+    [GGMLQuantizationType.IQ1_S]: calcBPW(256, 2 + QK_K / 8 + QK_K / 16),
+    [GGMLQuantizationType.IQ4_NL]: calcBPW(32, 2 + 16),
+    [GGMLQuantizationType.IQ3_S]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8 + QK_K / 32 + 4),
+    [GGMLQuantizationType.IQ2_S]: calcBPW(256, 2 + QK_K / 4 + QK_K / 16),
+    [GGMLQuantizationType.IQ4_XS]: calcBPW(256, 2 + 2 + QK_K / 2 + QK_K / 64),
+    [GGMLQuantizationType.I8]: calcBPW(1, 1),
+    [GGMLQuantizationType.I16]: calcBPW(1, 2),
+    [GGMLQuantizationType.I32]: calcBPW(1, 4),
+    [GGMLQuantizationType.I64]: calcBPW(1, 8),
+    [GGMLQuantizationType.F64]: calcBPW(1, 8),
+    [GGMLQuantizationType.IQ1_M]: calcBPW(256, QK_K / 8 + QK_K / 16 + QK_K / 32),
+    [GGMLQuantizationType.BF16]: calcBPW(1, 2),
+    // [GGMLQuantizationType.TQ1_0]: calcBPW(256, 2 + 4 * 13),
+    // [GGMLQuantizationType.TQ2_0]: calcBPW(256, 2 + 64),
+};
```
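To make the bits-per-weight convention concrete: per the formula above, a Q4_K super-block covers 256 weights and stores 2 + 2 bytes of scales, 128 bytes of 4-bit quants, and 12 bytes of sub-block metadata, so 144 bytes per block. A worked check of the `Q4_K -> 4.5 bpw` comment, with a hypothetical tensor size for illustration:

```ts
// Worked check of the "Q4_K -> 4.5 bpw" example from the comment above.
const QK_K = 256; // weights per super-block
const q4kBlockBytes = 2 + 2 + QK_K / 2 + 12; // 144 bytes per 256-weight block
const q4kBitsPerWeight = (q4kBlockBytes * 8) / QK_K; // 4.5

// A tensor's weight size then follows directly, as the CLI computes it:
const numElements = 4096 * 4096; // e.g. a hypothetical 4096x4096 projection matrix
const tensorBytes = numElements * (q4kBitsPerWeight / 8);
console.log(q4kBitsPerWeight, (tensorBytes / 1e6).toFixed(1) + " MB"); // 4.5, "9.4 MB"
```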
