Commit 0410a33

Refine Script and args for Cpp Graph (intel#320)
1 parent 67d67ff commit 0410a33

15 files changed
+87 −81 lines changed

.github/workflows/script/models/cpp_graph_inference.sh

Lines changed: 10 additions & 10 deletions
@@ -110,25 +110,25 @@ function main() {
 quantized_model="${model}-${precision}.bin"
 if [[ ! -e ${quantized_model} ]]; then
 if [[ ${precision} == "q4_j_vnni_b128" ]]; then
-${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 128 --scale_dtype fp32 --compute_type int8 --alg sym
+${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --weight_dtype int4 --group_size 128 --scale_dtype fp32 --compute_type int8 --alg sym
 elif [[ ${precision} == "q4_j_vnni_bf16_b32" ]]; then
-${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 32 --scale_dtype bf16 --compute_type int8 --alg sym
+${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --weight_dtype int4 --group_size 32 --scale_dtype bf16 --compute_type int8 --alg sym
 elif [[ ${precision} == "q4_j_vnni_b32" ]]; then
-${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 32 --scale_dtype fp32 --compute_type int8 --alg sym
+${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --weight_dtype int4 --group_size 32 --scale_dtype fp32 --compute_type int8 --alg sym
 elif [[ ${precision} == "q4_j_b32" ]]; then
-${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 32 --scale_dtype fp32 --compute_type fp32 --alg sym
+${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --weight_dtype int4 --group_size 32 --scale_dtype fp32 --compute_type fp32 --alg sym
 elif [[ ${precision} == "q4_j_b128" ]]; then
-${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 128 --scale_dtype fp32 --compute_type fp32 --alg sym
+${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --weight_dtype int4 --group_size 128 --scale_dtype fp32 --compute_type fp32 --alg sym
 elif [[ ${precision} == "q4_j_b128_asym" ]]; then
-${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 128 --scale_dtype fp32 --compute_type fp32 --alg asym
+${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --weight_dtype int4 --group_size 128 --scale_dtype fp32 --compute_type fp32 --alg asym
 elif [[ ${precision} == "q4_0" ]]; then
-${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 32 --compute_type ggml --alg sym
+${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --weight_dtype int4 --group_size 32 --compute_type ggml --alg sym
 elif [[ ${precision} == "q4_1" ]]; then
-${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 32 --compute_type ggml --alg asym
+${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --weight_dtype int4 --group_size 32 --compute_type ggml --alg asym
 elif [[ ${precision} == "q8_0" ]]; then
-${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 8 --block_size 32 --compute_type ggml --alg sym
+${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --weight_dtype int8 --group_size 32 --compute_type ggml --alg sym
 else
-${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4
+${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --weight_dtype int4
 fi
 fi
 ## run inference
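
The flag rename above follows a fixed per-precision convention. As a quick illustration (not part of the commit), the Python sketch below rebuilds the same argument lists with the new `--weight_dtype`/`--group_size` flags; the quantize binary path, model file names, and the helper itself are hypothetical.

```python
# Hypothetical helper mirroring the shell branches above: each precision tag maps to
# the renamed flags (--weight_dtype/--group_size replace --bits/--block_size).
import subprocess

PRECISION_ARGS = {
    "q4_j_vnni_b128": ["--weight_dtype", "int4", "--group_size", "128",
                       "--scale_dtype", "fp32", "--compute_type", "int8", "--alg", "sym"],
    "q4_j_b32":       ["--weight_dtype", "int4", "--group_size", "32",
                       "--scale_dtype", "fp32", "--compute_type", "fp32", "--alg", "sym"],
    "q4_0":           ["--weight_dtype", "int4", "--group_size", "32",
                       "--compute_type", "ggml", "--alg", "sym"],
    "q8_0":           ["--weight_dtype", "int8", "--group_size", "32",
                       "--compute_type", "ggml", "--alg", "sym"],
}

def quantize_cmd(quant_script, model_file, out_file, precision):
    """Build the argv list for one precision; unknown tags fall back to plain int4."""
    extra = PRECISION_ARGS.get(precision, ["--weight_dtype", "int4"])
    return [quant_script, "--model_file", model_file, "--out_file", out_file, *extra]

# e.g. subprocess.run(quantize_cmd("./quant_llama", "llama2-fp32.bin",
#                                  "llama2-q4_0.bin", "q4_0"), check=True)
```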

intel_extension_for_transformers/llm/quantization/utils.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,6 @@
 from accelerate import init_empty_weights
 from neural_compressor import quantization
 from neural_compressor.config import PostTrainingQuantConfig
-from .nn import QuantizedLinearQBits # TODO: QuantizedLinearINT4, QuantizedLinearINT8


 logger = logging.getLogger(__name__)
@@ -108,6 +107,7 @@ def _replace_linear(

 if isinstance(module, torch.nn.Linear) and name not in modules_to_not_convert:
 # Check if the current key is not in the `modules_to_not_convert`
+from .nn import QuantizedLinearQBits # TODO: QuantizedLinearINT4, QuantizedLinearINT8
 if not any(key in ".".join(current_key_name) for key in modules_to_not_convert):
 with init_empty_weights():
 in_features = module.in_features

intel_extension_for_transformers/llm/runtime/graph/README.md

Lines changed: 3 additions & 4 deletions
@@ -82,7 +82,7 @@ LLM one-click running script args explanations:
 | -p / --prompt | prompt to start generation with (default: empty) |
 | -n / --n_predict | number of tokens to predict (default: -1, -1 = infinity) |
 | -t / --threads | number of threads to use during computation (default: 56) |
-| -b / --batch_size | batch size for prompt processing (default: 512) |
+| -b / --batch_size_truncate | batch size for prompt processing (default: 512) |
 | -c / --ctx_size | size of the prompt context (default: 512, can not be larger than specific model's context window length) |
 | -s / --seed | NG seed (default: -1, use random seed for < 0) |
 | --repeat_penalty | penalize repeat sequence of tokens (default: 1.1, 1.0 = disabled) |
@@ -106,12 +106,12 @@ python scripts/convert.py --outtype f32 --outfile ne-f32.bin model_path

 # quantize weights of fp32 ggml bin
 # model_name: llama, llama2, mpt, falcon, gptj, starcoder, dolly
-# to neuarl engine graph optimized q4_j with 128 block_size format (recommended)
+# optimized INT4 model with group size 128 (recommended)
 python scripts/quantize.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_j.bin --weight_dtype int4 --block_size 128 --compute_type int8

 # Alternativly you could run ggml q4_0 format like following
 python scripts/quantize.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_0.bin --weight_dtype int4
-# or ues neuarl engine graph optimized q4_j with 32 block_size format
+# optimized INT4 model with group size 32
 python scripts/quantize.py --model_name llama2 --model_file ne-f32.bin --out_file ne-q4_j.bin --weight_dtype int4 --block_size 32 --compute_type int8

 ```
@@ -164,4 +164,3 @@ LLM running script args explanations:
 ### 3. Tensor Parallelism cross nodes/sockets

 We support tensor parallelism strategy for distributed inference/training on multi-node and multi-socket. You can refer to [tensor_parallelism.md](./tensor_parallelism.md) to enable this feature.
-
intel_extension_for_transformers/llm/runtime/graph/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 # limitations under the License.
 import os
 from transformers import AutoConfig
-from intel_extension_for_transformers.llm.runtime.graph.scripts.convert_model import convert_model
+from intel_extension_for_transformers.llm.runtime.graph.scripts.convert import convert_model

 model_maps = {"gpt_neox": "gptneox", "RefinedWebModel": "falcon"}

intel_extension_for_transformers/llm/runtime/graph/application/common.cpp

Lines changed: 12 additions & 12 deletions
@@ -677,9 +677,9 @@ void quant_print_usage(int argc, char** argv, const quant_params& params) {
 " --config path to the configuration file (default: "
 ")\n");
 fprintf(stderr, " --nthread N number of threads to use (default: 1)\n");
-fprintf(stderr, " --bits N number of bits to use for quantization (default: 4)\n");
+fprintf(stderr, " --weight_dtype N number of bits to use for quantization (default: 4)\n");
 fprintf(stderr, " --alg qquantization algorithm to use: sym/asym (default: sym)\n");
-fprintf(stderr, " --block_size N block size (default: 32)\n");
+fprintf(stderr, " --group_size N group size (default: 32)\n");
 fprintf(stderr, " --scale_dtype dtype fp32/bf16 type for scales (default: fp32)\n");
 fprintf(stderr,
 " --compute_type Gemm computation data type: int8/fp32/ggml (default: "
@@ -701,12 +701,12 @@ bool quant_params_parse(int argc, char** argv, quant_params& params) {
 params.config = argv[++i];
 } else if (arg == "--nthread") {
 params.nthread = std::stoi(argv[++i]);
-} else if (arg == "--bits") {
-params.bits = std::stoi(argv[++i]);
+} else if (arg == "--weight_dtype") {
+params.weight_dtype = argv[++i];
 } else if (arg == "--alg") {
 params.alg = argv[++i];
-} else if (arg == "--block_size") {
-params.block_size = std::stoi(argv[++i]);
+} else if (arg == "--group_size") {
+params.group_size = std::stoi(argv[++i]);
 } else if (arg == "--scale_dtype") {
 params.scale_dtype = argv[++i];
 } else if (arg == "--compute_type") {
@@ -734,19 +734,19 @@ bool quant_params_parse(int argc, char** argv, quant_params& params) {

 ne_ftype quant_params_to_ftype(const quant_params& params) {
 if (params.compute_type == "ggml") {
-if (params.bits == 4) {
+if (params.weight_dtype == "int4") {
 if (params.alg == "sym") {
 return NE_FTYPE_MOSTLY_Q4_0;
 } else {
 return NE_FTYPE_MOSTLY_Q4_1;
 }
-} else if (params.bits == 5) {
+} else if (params.weight_dtype == "int5") {
 if (params.alg == "sym") {
 return NE_FTYPE_MOSTLY_Q5_0;
 } else {
 return NE_FTYPE_MOSTLY_Q5_1;
 }
-} else if (params.bits == 8) {
+} else if (params.weight_dtype == "int8") {
 return NE_FTYPE_MOSTLY_Q8_0;
 }
 } else {
@@ -757,19 +757,19 @@ ne_ftype quant_params_to_ftype(const quant_params& params) {

 ne_type quant_params_to_type(const quant_params& params) {
 if (params.compute_type == "ggml") {
-if (params.bits == 4) {
+if (params.weight_dtype == "int4") {
 if (params.alg == "sym") {
 return NE_TYPE_Q4_0;
 } else {
 return NE_TYPE_Q4_1;
 }
-} else if (params.bits == 5) {
+} else if (params.weight_dtype == "int5") {
 if (params.alg == "sym") {
 return NE_TYPE_Q5_0;
 } else {
 return NE_TYPE_Q5_1;
 }
-} else if (params.bits == 8) {
+} else if (params.weight_dtype == "int8") {
 return NE_TYPE_Q8_0;
 }
 } else {
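
For readers tracking the ggml path, the (weight_dtype, alg) to ftype mapping implemented above reduces to a small table. A minimal sketch of that mapping, with the constant names quoted only as labels; anything not listed falls through to the non-ggml path in the C++ code above.

```python
# Sketch of quant_params_to_ftype's ggml branch with the new string-valued weight_dtype.
GGML_FTYPE = {
    ("int4", "sym"):  "NE_FTYPE_MOSTLY_Q4_0",
    ("int4", "asym"): "NE_FTYPE_MOSTLY_Q4_1",
    ("int5", "sym"):  "NE_FTYPE_MOSTLY_Q5_0",
    ("int5", "asym"): "NE_FTYPE_MOSTLY_Q5_1",
    ("int8", "sym"):  "NE_FTYPE_MOSTLY_Q8_0",
    ("int8", "asym"): "NE_FTYPE_MOSTLY_Q8_0",  # alg is not checked for int8 in the code above
}
```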

intel_extension_for_transformers/llm/runtime/graph/application/common.h

Lines changed: 2 additions & 2 deletions
@@ -148,9 +148,9 @@ struct quant_params {
 std::string config = "";
 int nthread = 1;

-int32_t bits = 4;
+std::string weight_dtype = "int4";
 std::string alg = "sym";
-int32_t block_size = 32;
+int32_t group_size = 32;
 std::string scale_dtype = "fp32";
 std::string compute_type = "ggml";
 std::string model_name = "unknown";

intel_extension_for_transformers/llm/runtime/graph/application/main_pybind.cpp

Lines changed: 9 additions & 7 deletions
@@ -59,8 +59,9 @@ class Model {
 void reinit();
 std::string generate(const std::string& prompt, bool sentence_mode = true);
 bool is_token_end() { return token_eos; }
-static int quant_model(const std::string& model_path, const std::string& out_path, int bits, const std::string& alg,
-int block_size, const std::string& scale_dtype, const std::string& compute_type);
+static int quant_model(const std::string& model_path, const std::string& out_path, const std::string& weight_dtype,
+const std::string& alg, int group_size, const std::string& scale_dtype,
+const std::string& compute_type);

 private:
 model_context* ctx = nullptr;
@@ -212,8 +213,9 @@ int Model::post_process(float* logits) {
 return id;
 }

-int Model::quant_model(const std::string& model_path, const std::string& out_path, int bits, const std::string& alg,
-int block_size, const std::string& scale_dtype, const std::string& compute_type) {
+int Model::quant_model(const std::string& model_path, const std::string& out_path, const std::string& weight_dtype,
+const std::string& alg, int group_size, const std::string& scale_dtype,
+const std::string& compute_type) {
 quant_params q_params;
 #ifdef MODEL_NAME
 q_params.model_name = MODEL_NAME;
@@ -226,9 +228,9 @@ int Model::quant_model(const std::string& model_path, const std::string& out_pat
 q_params.model_arch = mt;
 q_params.model_file = model_path;
 q_params.out_file = out_path;
-q_params.bits = bits;
+q_params.weight_dtype = weight_dtype;
 q_params.alg = alg;
-q_params.block_size = block_size;
+q_params.group_size = group_size;
 q_params.scale_dtype = scale_dtype;
 q_params.compute_type = compute_type;

@@ -300,7 +302,7 @@ PYBIND11_MODULE(chatglm_cpp, m)
 .def("generate", &Model::generate, "Generate tokens with prompt", py::arg("prompt"),
 py::arg("sentence_mode") = true)
 .def_static("quant_model", &Model::quant_model, "Quantize model", py::arg("model_path"), py::arg("out_path"),
-py::arg("bits") = 4, py::arg("alg") = "sym", py::arg("block_size") = 32,
+py::arg("weight_dtype") = "int4", py::arg("alg") = "sym", py::arg("group_size") = 32,
 py::arg("scale_dtype") = "fp32", py::arg("compute_type") = "ggml")
 .def("is_token_end", &Model::is_token_end)
 .def("reinit", &Model::reinit);
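
With the binding updated as above, a Python caller passes the renamed keyword arguments. A minimal sketch, assuming the built extension exposes the module name used in the PYBIND11_MODULE macro shown here (actual module names vary per model/build) and using placeholder file paths.

```python
# Hedged example: quantize an fp32 ne graph model through the updated pybind API.
# Keyword names follow the py::arg(...) list above; paths are placeholders.
import chatglm_cpp  # module name per PYBIND11_MODULE(chatglm_cpp, m); may differ per build

chatglm_cpp.Model.quant_model(
    model_path="ne-f32.bin",   # placeholder input
    out_path="ne-q4_j.bin",    # placeholder output
    weight_dtype="int4",       # was: bits=4
    alg="sym",
    group_size=32,             # was: block_size=32
    scale_dtype="fp32",
    compute_type="int8",
)
```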

intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp

Lines changed: 9 additions & 9 deletions
@@ -782,7 +782,7 @@ model_token model_sample_token(struct model_context* ctx, model_token_data_array
 // quantization
 //
 quant_params_internal quant_params_to_internal(const quant_params& params) {
-return quant_params_internal{parse_bits(params.bits), parse_alg(params.alg), params.block_size,
+return quant_params_internal{parse_bits(params.weight_dtype), parse_alg(params.alg), params.group_size,
 parse_scale_dtype(params.scale_dtype), parse_compute_type(params.compute_type)};
 }

@@ -799,7 +799,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
 if (params.alg != quant_alg::sym) {
 printf("Current not support asymmetric int8 computation, reset to symmetric\n");
 }
-if (params.block_size == -1) {
+if (params.group_size == -1) {
 using Kernel = WeiS4ClipFp32PerN<GcCompInt8, JblasAVX512F>;
 using KernelRef = WeiS4ClipFp32PerN<GcCompInt8, JblasNoSIMD>;
 static Kernel kernel;
@@ -815,7 +815,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
 using KernelRef = WeiS4ClipFp32<GcCompInt8KBlock, JblasNoSIMD>;
 static Kernel kernel;
 static KernelRef kernelref;
-packedw = kernel.createStorage(n, k, params.block_size);
+packedw = kernel.createStorage(n, k, params.group_size);
 if (cd->AVX512F()) {
 kernel.packTransposeWeight(n, k, f32ptr, k, packedw);
 } else {
@@ -827,7 +827,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
 using KernelRef = WeiS4ClipFp32<GcCompFp32, JblasNoSIMD>;
 static Kernel kernel;
 static Kernel kernelref;
-packedw = kernel.createStorage(n, k, params.block_size, params.alg == quant_alg::sym);
+packedw = kernel.createStorage(n, k, params.group_size, params.alg == quant_alg::sym);
 if (cd->AVX512_FP16()) {
 kernel.packTransposeWeight(n, k, f32ptr, k, packedw);
 } else {
@@ -838,7 +838,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
 using KernelRef = WeiS4ClipFp32<GcCompBf16, JblasNoSIMD>;
 static Kernel kernel;
 static Kernel kernelref;
-packedw = kernel.createStorage(n, k, params.block_size, params.alg == quant_alg::sym);
+packedw = kernel.createStorage(n, k, params.group_size, params.alg == quant_alg::sym);
 if (cd->AMX_BF16()) {
 kernel.packTransposeWeight(n, k, f32ptr, k, packedw);
 } else {
@@ -854,7 +854,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
 if (params.alg != quant_alg::sym) {
 printf("Current not support asymmetric int8 computation, reset to symmetric\n");
 }
-if (params.block_size == -1) {
+if (params.group_size == -1) {
 using Kernel = WeiS8Fp32PerN<GcCompInt8, JblasAVX512F>;
 using KernelRef = WeiS8Fp32PerN<GcCompInt8, JblasNoSIMD>;
 static Kernel kernel;
@@ -870,7 +870,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
 using KernelRef = WeiS8Fp32<GcCompInt8KBlock, JblasNoSIMD>;
 static Kernel kernel;
 static Kernel kernelref;
-packedw = kernel.createStorage(n, k, params.block_size);
+packedw = kernel.createStorage(n, k, params.group_size);
 if (cd->AVX512F()) {
 kernel.packTransposeWeight(n, k, f32ptr, k, packedw);
 } else {
@@ -882,7 +882,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
 using KernelRef = WeiS8Fp32<GcCompFp32, JblasNoSIMD>;
 static Kernel kernel;
 static Kernel kernelref;
-packedw = kernel.createStorage(n, k, params.block_size, params.alg == quant_alg::sym);
+packedw = kernel.createStorage(n, k, params.group_size, params.alg == quant_alg::sym);
 if (cd->AVX512_FP16()) {
 kernel.packTransposeWeight(n, k, f32ptr, k, packedw);
 } else {
@@ -893,7 +893,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
 using KernelRef = WeiS8Fp32<GcCompBf16, JblasNoSIMD>;
 static Kernel kernel;
 static Kernel kernelref;
-packedw = kernel.createStorage(n, k, params.block_size, params.alg == quant_alg::sym);
+packedw = kernel.createStorage(n, k, params.group_size, params.alg == quant_alg::sym);
 if (cd->AMX_BF16()) {
 kernel.packTransposeWeight(n, k, f32ptr, k, packedw);
 } else {
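
In jblas_quantize above, the renamed group_size keeps its sentinel semantics: within the int8 compute-type branches, -1 selects the PerN kernel variants, while any positive value selects the KBlock variants built with that group size. A compact restatement of that dispatch follows; the kernel names are quoted only as labels and the helper itself is illustrative, not part of the commit.

```python
def pick_int8_compute_kernel(weight_dtype: str, group_size: int):
    """Mirror of the int8-compute kernel choice in jblas_quantize (labels only)."""
    per_n = group_size == -1
    if weight_dtype == "int4":
        return "WeiS4ClipFp32PerN<GcCompInt8>" if per_n else "WeiS4ClipFp32<GcCompInt8KBlock>"
    if weight_dtype == "int8":
        return "WeiS8Fp32PerN<GcCompInt8>" if per_n else "WeiS8Fp32<GcCompInt8KBlock>"
    return None  # other dtypes are handled by the fp32/bf16 compute branches above
```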

intel_extension_for_transformers/llm/runtime/graph/models/model_utils/quant_config.h

Lines changed: 5 additions & 5 deletions
@@ -18,11 +18,11 @@
 #include "core/data_types.h"

 enum class quant_bits : int { q4 = 0, q8, count };
-static inline quant_bits parse_bits(int bits) {
-if (bits == 4) {
+static inline quant_bits parse_bits(const std::string& bits) {
+if (bits == "int4") {
 return quant_bits::q4;
 }
-if (bits == 8) {
+if (bits == "int8") {
 return quant_bits::q8;
 }
 return quant_bits::count;
@@ -88,15 +88,15 @@ static inline quant_comp parse_compute_type(std::string arg) {
 struct quant_params_internal {
 quant_bits bits = quant_bits::q4;
 quant_alg alg = quant_alg::sym;
-int32_t block_size = 32;
+int32_t group_size = 32;
 quant_sdtype scale_dtype = quant_sdtype::fp16;
 quant_comp compute_type = quant_comp::ggml;
 bool valid() const {
 return bits != quant_bits::count && alg != quant_alg::count && scale_dtype != quant_sdtype::count &&
 compute_type != quant_comp::count;
 }
 std::string getstr() {
-return std::to_string(int(bits)) + "_" + std::to_string(int(alg)) + "_" + std::to_string(block_size) + "_" +
+return std::to_string(int(bits)) + "_" + std::to_string(int(alg)) + "_" + std::to_string(group_size) + "_" +
 std::to_string(int(scale_dtype)) + "_" + std::to_string(int(compute_type));
 }
 };
