Skip to content

Commit 1bcab14

Browse files
authored
refine example for SQ and WOQ (intel#209)
* refine example for SQ and WOQ * add gptq args * use woq as args name * change readme --------- Signed-off-by: Xin He <[email protected]>
1 parent 2651bd8 commit 1bcab14

File tree

5 files changed

+213
-112
lines changed

5 files changed

+213
-112
lines changed

examples/.config/pytorch_optimize.json

Lines changed: 90 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1465,12 +1465,12 @@
14651465
}
14661466
}
14671467
},
1468-
"gpt_j_6b_clm_weight_only": {
1468+
"gpt_j_6b_clm_woq": {
14691469
"working_dir": "huggingface/pytorch/language-modeling/quantization",
14701470
"tune": {
14711471
"cmd": "bash run_tuning.sh",
14721472
"params": {
1473-
"topology": "gpt_j_weight_only",
1473+
"topology": "gpt_j_woq",
14741474
"task": "clm",
14751475
"approach": "weight_only",
14761476
"output_model": "saved_results"
@@ -1479,7 +1479,7 @@
14791479
"benchmark": {
14801480
"cmd": "bash run_benchmark.sh",
14811481
"params": {
1482-
"topology": "gpt_j_weight_only",
1482+
"topology": "gpt_j_woq",
14831483
"task": "clm",
14841484
"mode": "accuracy",
14851485
"batch_size": "112",
@@ -1489,12 +1489,12 @@
14891489
}
14901490
}
14911491
},
1492-
"gpt_j_6b_clm_weight_only_awq": {
1492+
"gpt_j_6b_clm_woq_awq": {
14931493
"working_dir": "huggingface/pytorch/language-modeling/quantization",
14941494
"tune": {
14951495
"cmd": "bash run_tuning.sh",
14961496
"params": {
1497-
"topology": "gpt_j_weight_only_awq",
1497+
"topology": "gpt_j_woq_awq",
14981498
"task": "clm",
14991499
"approach": "weight_only",
15001500
"output_model": "saved_results"
@@ -1503,7 +1503,7 @@
15031503
"benchmark": {
15041504
"cmd": "bash run_benchmark.sh",
15051505
"params": {
1506-
"topology": "gpt_j_weight_only_awq",
1506+
"topology": "gpt_j_woq_awq",
15071507
"task": "clm",
15081508
"mode": "accuracy",
15091509
"batch_size": "112",
@@ -1592,12 +1592,12 @@
15921592
}
15931593
}
15941594
},
1595-
"opt_125m_clm_weight_only": {
1595+
"chatglm_clm_woq": {
15961596
"working_dir": "huggingface/pytorch/language-modeling/quantization",
15971597
"tune": {
15981598
"cmd": "bash run_tuning.sh",
15991599
"params": {
1600-
"topology": "opt_125m_weight_only",
1600+
"topology": "chatglm_woq",
16011601
"task": "clm",
16021602
"approach": "weight_only",
16031603
"output_model": "saved_results"
@@ -1606,7 +1606,7 @@
16061606
"benchmark": {
16071607
"cmd": "bash run_benchmark.sh",
16081608
"params": {
1609-
"topology": "opt_125m_weight_only",
1609+
"topology": "chatglm_woq",
16101610
"task": "clm",
16111611
"mode": "accuracy",
16121612
"batch_size": "112",
@@ -1616,12 +1616,12 @@
16161616
}
16171617
}
16181618
},
1619-
"opt_125m_clm_weight_only_awq": {
1619+
"opt_125m_clm_woq": {
16201620
"working_dir": "huggingface/pytorch/language-modeling/quantization",
16211621
"tune": {
16221622
"cmd": "bash run_tuning.sh",
16231623
"params": {
1624-
"topology": "opt_125m_weight_only_awq",
1624+
"topology": "opt_125m_woq",
16251625
"task": "clm",
16261626
"approach": "weight_only",
16271627
"output_model": "saved_results"
@@ -1630,7 +1630,7 @@
16301630
"benchmark": {
16311631
"cmd": "bash run_benchmark.sh",
16321632
"params": {
1633-
"topology": "opt_125m_weight_only_awq",
1633+
"topology": "opt_125m_woq",
16341634
"task": "clm",
16351635
"mode": "accuracy",
16361636
"batch_size": "112",
@@ -1640,12 +1640,12 @@
16401640
}
16411641
}
16421642
},
1643-
"chatglm_clm_weight_only": {
1643+
"opt_125m_clm_woq_awq": {
16441644
"working_dir": "huggingface/pytorch/language-modeling/quantization",
16451645
"tune": {
16461646
"cmd": "bash run_tuning.sh",
16471647
"params": {
1648-
"topology": "chatglm_weight_only",
1648+
"topology": "opt_125m_woq_awq",
16491649
"task": "clm",
16501650
"approach": "weight_only",
16511651
"output_model": "saved_results"
@@ -1654,7 +1654,7 @@
16541654
"benchmark": {
16551655
"cmd": "bash run_benchmark.sh",
16561656
"params": {
1657-
"topology": "chatglm_weight_only",
1657+
"topology": "opt_125m_woq_awq",
16581658
"task": "clm",
16591659
"mode": "accuracy",
16601660
"batch_size": "112",
@@ -1664,6 +1664,81 @@
16641664
}
16651665
}
16661666
},
1667+
"opt_125m_clm_woq_gptq": {
1668+
"working_dir": "huggingface/pytorch/language-modeling/quantization",
1669+
"tune": {
1670+
"cmd": "bash run_tuning.sh",
1671+
"params": {
1672+
"topology": "opt_125m_woq_gptq",
1673+
"task": "clm",
1674+
"approach": "weight_only",
1675+
"output_model": "saved_results"
1676+
}
1677+
},
1678+
"benchmark": {
1679+
"cmd": "bash run_benchmark.sh",
1680+
"params": {
1681+
"topology": "opt_125m_woq_gptq",
1682+
"task": "clm",
1683+
"mode": "accuracy",
1684+
"batch_size": "112",
1685+
"config": "saved_results",
1686+
"iters": "100",
1687+
"int8": "false"
1688+
}
1689+
}
1690+
},
1691+
"opt_125m_clm_woq_teq": {
1692+
"working_dir": "huggingface/pytorch/language-modeling/quantization",
1693+
"tune": {
1694+
"cmd": "bash run_tuning.sh",
1695+
"params": {
1696+
"topology": "opt_125m_woq_teq",
1697+
"task": "clm",
1698+
"approach": "weight_only",
1699+
"output_model": "saved_results"
1700+
}
1701+
},
1702+
"benchmark": {
1703+
"cmd": "bash run_benchmark.sh",
1704+
"params": {
1705+
"topology": "opt_125m_woq_teq",
1706+
"task": "clm",
1707+
"mode": "accuracy",
1708+
"batch_size": "112",
1709+
"config": "saved_results",
1710+
"iters": "100",
1711+
"int8": "false"
1712+
}
1713+
}
1714+
},
1715+
"opt_125m_clm_ipex": {
1716+
"working_dir": "huggingface/pytorch/language-modeling/quantization",
1717+
"tune": {
1718+
"cmd": "bash run_tuning.sh",
1719+
"params": {
1720+
"topology": "opt_125m",
1721+
"task": "clm",
1722+
"approach": "static",
1723+
"backend": "ipex",
1724+
"output_model": "saved_results"
1725+
}
1726+
},
1727+
"benchmark": {
1728+
"cmd": "bash run_benchmark.sh",
1729+
"params": {
1730+
"topology": "opt_125m",
1731+
"task": "clm",
1732+
"approach": "static",
1733+
"backend": "ipex",
1734+
"mode": "accuracy",
1735+
"batch_size": "112",
1736+
"iters": "100",
1737+
"int8": "false",
1738+
"config": "saved_results"
1739+
}
1740+
}
1741+
},
16671742
"opt_1.3b_clm_ipex": {
16681743
"working_dir": "huggingface/pytorch/language-modeling/quantization",
16691744
"tune": {

examples/huggingface/pytorch/language-modeling/quantization/README.md

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,21 @@ python run_clm_no_trainer.py \
4848
--model EleutherAI/gpt-j-6B \
4949
--quantize \
5050
--approach weight_only \
51-
--output_dir "saved_results" \
51+
--woq_bits 4 \
52+
--woq_group_size 128 \
53+
--woq_scheme asym \
54+
--woq_algo RTN \
55+
--woq_mse_range \
56+
--output_dir "saved_results"
5257
```
53-
**Notes**: Weight-only quantization based on fake quantization is previewly supported and supports RTN/AWQ[1]/GPTQ[2] algorithms. You can try it with `--approach weight_only`. `--awq` will trigger AWQ algorithm. `--gptq` will trigger GPTQ algorithm. For example, to run a GPTQ example, try the following command.
58+
**Notes**: Weight-only quantization based on fake quantization is supported as a preview feature and covers the RTN, GPTQ[1], AWQ[2], and TEQ algorithms. For more details, please refer to the [weight-only quantization documentation](https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md).
59+
60+
5461
```bash
5562
python run_clm_no_trainer.py \
5663
--model EleutherAI/gpt-j-6B \
57-
--weight_only_algo GPTQ \
58-
--weight_only_bits 4 \
64+
--woq_algo GPTQ \
65+
--woq_bits 4 \
5966
--quantize \
6067
--pad_max_length 2048 \
6168
--gptq_pad_max_length 2048 \
@@ -242,5 +249,5 @@ python run_mlm.py \
242249
--overwrite_output_dir
243250
```
244251

245-
[1]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023).
246-
[2]. Elias, Frantar, et al. "GPTQ: Accurate Post-training Compression for Generative Pretrained Transformers." arXiv preprint arXiv:2210.17323 (2023).
252+
[1]. Frantar, Elias, et al. "GPTQ: Accurate Post-training Compression for Generative Pretrained Transformers." arXiv preprint arXiv:2210.17323 (2022).
253+
[2]. Lin, Ji, et al. "AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration." arXiv preprint arXiv:2306.00978 (2023).

examples/huggingface/pytorch/language-modeling/quantization/run_benchmark.sh

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -83,17 +83,17 @@ function run_benchmark {
8383
model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B"
8484
extra_cmd=$extra_cmd" --ipex"
8585
fi
86-
elif [ "${topology}" = "gpt_j_weight_only" ]; then
86+
elif [ "${topology}" = "gpt_j_woq" ]; then
8787
script="run_clm_no_trainer.py"
8888
model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B"
8989
lm_eval_tasks="lambada_openai"
9090
extra_cmd=$extra_cmd" --approach weight_only"
91-
elif [ "${topology}" = "chatglm_weight_only" ]; then
91+
elif [ "${topology}" = "chatglm_woq" ]; then
9292
script="run_clm_no_trainer.py"
9393
model_name_or_path="THUDM/chatglm-6b"
9494
lm_eval_tasks="lambada_openai"
9595
extra_cmd=$extra_cmd" --approach weight_only"
96-
elif [ "${topology}" = "gpt_j_weight_only_awq" ]; then
96+
elif [ "${topology}" = "gpt_j_woq_awq" ]; then
9797
script="run_clm_no_trainer.py"
9898
model_name_or_path="/tf_dataset2/models/pytorch/gpt-j-6B"
9999
lm_eval_tasks="lambada_openai"
@@ -107,16 +107,20 @@ function run_benchmark {
107107
elif [ "${topology}" = "falcon_7b_instruct" ]; then
108108
script="run_clm_no_trainer.py"
109109
model_name_or_path="tiiuae/falcon-7b-instruct"
110-
elif [ "${topology}" = "opt_125m_weight_only" ]; then
110+
elif [ "${topology}" = "opt_125m_woq" -o \
111+
"${topology}" = "opt_125m_woq_awq" -o \
112+
"${topology}" = "opt_125m_woq_gptq" -o \
113+
"${topology}" = "opt_125m_woq_teq" ]; then
111114
script="run_clm_no_trainer.py"
112115
model_name_or_path="facebook/opt-125m"
113116
lm_eval_tasks="lambada_openai"
114117
extra_cmd=$extra_cmd" --approach weight_only"
115-
elif [ "${topology}" = "opt_125m_weight_only_awq" ]; then
118+
elif [ "${topology}" = "opt_125m" ]; then
116119
script="run_clm_no_trainer.py"
117120
model_name_or_path="facebook/opt-125m"
118-
lm_eval_tasks="lambada_openai"
119-
extra_cmd=$extra_cmd" --approach weight_only"
121+
if [ "${backend}" = "ipex" ]; then
122+
extra_cmd=$extra_cmd" --ipex"
123+
fi
120124
elif [ "${topology}" = "opt_1.3b" ]; then
121125
script="run_clm_no_trainer.py"
122126
model_name_or_path="facebook/opt-1.3b"

0 commit comments

Comments
 (0)