Commit b689cbc

[ci] add pytorch kvint testcase into function regression (#2584)
* update * update * update * update * update * update * update * update * update * update * update * update * update * updata * update * update * update * update * update * update * update * update * update * update * update * update * update * update * update * Update config.yaml * update * update kvint testcase for vl model * update * update * update * updaste * update * update * update
1 parent ba3701b commit b689cbc

11 files changed (+560, -175 lines)
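The diffs below wire kv-cache quantization ("kvint") coverage into the PyTorch-backend regression suite: new no_kvint4 / no_kvint8 exclusion lists in autotest/config.yaml, base-model CLI chat tests, and kvint4/kvint8 pipeline tests. For context, a minimal sketch of what a kvint run exercises, assuming a recent lmdeploy release where PytorchEngineConfig exposes quant_policy (4 = kv int4, 8 = kv int8); this snippet is illustrative only and is not part of the commit:

# Illustrative only (not part of this commit). Assumes lmdeploy >= 0.6.x,
# where the PyTorch backend config accepts quant_policy for the kv cache.
from lmdeploy import GenerationConfig, PytorchEngineConfig, pipeline

pipe = pipeline('internlm/internlm2_5-7b-chat',
                backend_config=PytorchEngineConfig(tp=1, quant_policy=8))
print(pipe(['Hi, please introduce yourself'],
           gen_config=GenerationConfig(max_new_tokens=64)))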

autotest/config.yaml

Lines changed: 21 additions & 62 deletions
@@ -57,10 +57,13 @@ turbomind_chat_model:
   - deepseek-ai/deepseek-coder-1.3b-instruct
   - codellama/CodeLlama-7b-Instruct-hf
   - THUDM/glm-4-9b-chat
+  - openbmb/MiniCPM-Llama3-V-2_5
+  - openbmb/MiniCPM-V-2_6

 pytorch_chat_model:
   - meta-llama/Meta-Llama-3-8B-Instruct
   - meta-llama/Meta-Llama-3-1-8B-Instruct
+  - meta-llama/Llama-3.2-1B-Instruct
   - meta-llama/Llama-2-7b-chat-hf
   - internlm/internlm2_5-7b-chat
   - internlm/internlm2_5-20b-chat
@@ -71,6 +74,7 @@ pytorch_chat_model:
   - OpenGVLab/InternVL2-8B
   - OpenGVLab/InternVL2-26B
   - OpenGVLab/InternVL2-40B
+  - OpenGVLab/InternVL-Chat-V1-5
   - baichuan-inc/Baichuan2-7B-Chat
   - baichuan-inc/Baichuan2-13B-Chat
   - 01-ai/Yi-6B-Chat
@@ -94,9 +98,9 @@ pytorch_chat_model:
   - THUDM/cogvlm2-llama3-chinese-chat-19B
   - THUDM/glm-4v-9b
   - THUDM/glm-4-9b-chat
+  - THUDM/cogvlm-chat-hf
   - microsoft/Phi-3-mini-4k-instruct
   - microsoft/Phi-3-vision-128k-instruct
-  - bigcode/starcoder2-7b

 turbomind_base_model:
   - internlm/internlm2_5-7b
@@ -109,6 +113,7 @@ pytorch_base_model:
   - internlm/internlm2_5-7b
   - internlm/internlm2_5-1_8b
   - internlm/internlm2_5-20b
+  - bigcode/starcoder2-7b

 vl_model:
   - Qwen/Qwen-VL-Chat
@@ -125,81 +130,27 @@ vl_model:
   - OpenGVLab/InternVL2-40B
   - Qwen/Qwen2-VL-2B-Instruct
   - Qwen/Qwen2-VL-7B-Instruct
-  - internlm/internlm-xcomposer2-vl-7b
   - internlm/internlm-xcomposer2d5-7b
   - internlm/internlm-xcomposer2-4khd-7b
   - THUDM/cogvlm-chat-hf
   - THUDM/cogvlm2-llama3-chinese-chat-19B
   - THUDM/glm-4v-9b
+  - microsoft/Phi-3-mini-4k-instruct
   - microsoft/Phi-3-vision-128k-instruct
   - openbmb/MiniCPM-Llama3-V-2_5
   - openbmb/MiniCPM-V-2_6

 turbomind_quatization:
-  awq:
-    - meta-llama/Meta-Llama-3-1-8B-Instruct
-    - meta-llama/Meta-Llama-3-8B-Instruct
-    - meta-llama/Llama-2-7b-chat-hf
-    - internlm/internlm2_5-7b-chat
-    - internlm/internlm2_5-7b
-    - internlm/internlm2_5-20b-chat
-    - internlm/internlm2-chat-20b
-    - internlm/internlm2_5-20b
-    - internlm/internlm-chat-20b
-    - internlm/internlm-xcomposer2-4khd-7b
-    - internlm/internlm-xcomposer2d5-7b
-    - OpenGVLab/InternVL-Chat-V1-5
-    - OpenGVLab/Mini-InternVL-Chat-2B-V1-5
-    - OpenGVLab/InternVL2-2B
-    - OpenGVLab/InternVL2-8B
-    - OpenGVLab/InternVL2-26B
-    - OpenGVLab/InternVL2-40B
-    - Qwen/Qwen1.5-7B-Chat
-    - Qwen/Qwen2-7B-Instruct
-    - Qwen/Qwen2-1.5B-Instruct
-    - Qwen/Qwen2.5-7B-Instruct
-    - Qwen/Qwen-VL-Chat
-    - liuhaotian/llava-v1.5-13b
-    - liuhaotian/llava-v1.6-vicuna-7b
-    - 01-ai/Yi-VL-6B
-    - 01-ai/Yi-6B-Chat
-    - deepseek-ai/deepseek-vl-1.3b-chat
-    - baichuan-inc/Baichuan2-7B-Chat
-    - codellama/CodeLlama-7b-hf
-    - openbmb/MiniCPM-Llama3-V-2_5
-    - THUDM/glm-4-9b-chat
-  gptq:
-    - internlm/internlm2_5-7b-chat
-  kvint:
-    - meta-llama/Meta-Llama-3-1-8B-Instruct
-    - meta-llama/Meta-Llama-3-8B-Instruct
-    - meta-llama/Llama-2-7b-chat-hf
-    - internlm/internlm2_5-7b-chat
-    - internlm/internlm2_5-20b-chat
-    - internlm/internlm2-chat-20b
-    - internlm/internlm2-chat-20b-4bits
-    - internlm/internlm-chat-20b
-    - internlm/internlm-xcomposer2-4khd-7b
-    - internlm/internlm-xcomposer2d5-7b
-    - OpenGVLab/InternVL-Chat-V1-5
-    - Qwen/Qwen2-7B-Instruct
-    - Qwen/Qwen2-7B-Instruct-AWQ
-    - Qwen/Qwen2-1.5B-Instruct
-    - Qwen/Qwen1.5-7B-Chat
-    - Qwen/Qwen1.5-4B-Chat-AWQ
-    - Qwen/Qwen-VL-Chat
+  no_awq:
+    - Qwen/Qwen2-VL-2B-Instruct
+    - Qwen/Qwen2-VL-7B-Instruct
+    - mistralai/Mistral-7B-Instruct-v0.1
     - mistralai/Mistral-7B-Instruct-v0.2
     - mistralai/Mistral-7B-Instruct-v0.3
-    - lmdeploy/llama2-chat-7b-w4
-    - baichuan-inc/Baichuan2-7B-Chat
-    - 01-ai/Yi-6B-Chat
-    - 01-ai/Yi-VL-6B
-    - liuhaotian/llava-v1.5-13b
-    - liuhaotian/llava-v1.6-vicuna-7b
-    - deepseek-ai/deepseek-vl-1.3b-chat
     - deepseek-ai/deepseek-coder-1.3b-instruct
     - codellama/CodeLlama-7b-Instruct-hf
-    - THUDM/glm-4-9b-chat
+  gptq:
+    - internlm/internlm2_5-7b-chat

 pytorch_quatization:
   awq:
@@ -211,6 +162,7 @@ pytorch_quatization:
     - internlm/internlm2-chat-20b
     - OpenGVLab/InternVL-Chat-V1-5
     - 01-ai/Yi-6B-Chat
+    - Qwen/Qwen1.5-7B-Chat
     - Qwen/Qwen2-7B-Instruct
     - Qwen/Qwen2-1.5B-Instruct
     - microsoft/Phi-3-mini-4k-instruct
@@ -223,6 +175,13 @@ pytorch_quatization:
     - 01-ai/Yi-6B-Chat
     - internlm/internlm2_5-20b
     - internlm/internlm2_5-7b
+  no_kvint4:
+    - OpenGVLab/InternVL2-4B
+    - deepseek-ai/DeepSeek-V2-Lite-Chat
+    - microsoft/Phi-3-mini-4k-instruct
+    - microsoft/Phi-3-vision-128k-instruct
+  no_kvint8:
+    - deepseek-ai/DeepSeek-V2-Lite-Chat


 longtext_model:
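The new no_kvint4 / no_kvint8 keys mirror the no_awq list: they name models to skip for a given kv-cache quant_policy. A hedged sketch of how such an exclusion list could be applied when building the parametrized model list; the real filtering is done by get_torch_model_list in the autotest utils and may differ:

# Hypothetical sketch only; the actual helper lives in the autotest utils
# and may read additional config sections.
import yaml

def pytorch_models_for_quant_policy(config_path, quant_policy):
    with open(config_path) as f:
        cfg = yaml.safe_load(f)
    models = list(cfg.get('pytorch_chat_model', []))
    skip_key = {4: 'no_kvint4', 8: 'no_kvint8'}.get(quant_policy)
    skipped = set(cfg.get('pytorch_quatization', {}).get(skip_key, []))
    return [m for m in models if m not in skipped]

# e.g. drops DeepSeek-V2-Lite-Chat and the Phi-3 models from kvint4 runs
kvint4_models = pytorch_models_for_quant_policy('autotest/config.yaml', 4)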

autotest/tools/chat/test_command_chat_hf_pytorch.py

Lines changed: 48 additions & 0 deletions
@@ -51,6 +51,54 @@ def test_hf_pytorch_chat_tp2(config, model, cli_case_config, worker_id):
     assert result, msg


+@pytest.mark.order(10)
+@pytest.mark.usefixtures('cli_case_config')
+@pytest.mark.hf_turbomind_chat
+@pytest.mark.gpu_num_1
+@pytest.mark.parametrize('model',
+                         get_torch_model_list(tp_num=1,
+                                              model_type='base_model'))
+def test_hf_pytorch_base_tp1(config, model, cli_case_config, worker_id):
+    usercase = 'base_testcase'
+    result, chat_log, msg = hf_command_line_test(
+        config,
+        usercase,
+        cli_case_config.get(usercase),
+        model,
+        'pytorch',
+        cuda_prefix=get_cuda_prefix_by_workerid(worker_id))
+
+    if chat_log is not None:
+        allure.attach.file(chat_log,
+                           attachment_type=allure.attachment_type.TEXT)
+
+    assert result, msg
+
+
+@pytest.mark.order(10)
+@pytest.mark.usefixtures('cli_case_config')
+@pytest.mark.hf_turbomind_chat
+@pytest.mark.gpu_num_2
+@pytest.mark.parametrize('model',
+                         get_torch_model_list(tp_num=2,
+                                              model_type='base_model'))
+def test_hf_pytorch_base_tp2(config, model, cli_case_config, worker_id):
+    usercase = 'base_testcase'
+    result, chat_log, msg = hf_command_line_test(
+        config,
+        usercase,
+        cli_case_config.get(usercase),
+        model,
+        'pytorch',
+        cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2))
+
+    if chat_log is not None:
+        allure.attach.file(chat_log,
+                           attachment_type=allure.attachment_type.TEXT)
+
+    assert result, msg
+
+
 @pytest.mark.order(10)
 @pytest.mark.usefixtures('cli_case_config')
 @pytest.mark.hf_pytorch_chat
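The new base-model tests reuse get_cuda_prefix_by_workerid so that pytest-xdist workers (gw0, gw1, ...) land on disjoint GPUs. A purely hypothetical illustration of such a mapping, assuming one contiguous block of devices per worker; the actual helper is imported from the autotest utils and may allocate devices differently:

# Hypothetical mapping only; not the real get_cuda_prefix_by_workerid.
def cuda_prefix_for_worker(worker_id: str, tp_num: int = 1) -> str:
    if 'gw' not in worker_id:  # master / non-xdist run: no prefix needed
        return ''
    first = int(worker_id.replace('gw', '')) * tp_num
    devices = ','.join(str(first + i) for i in range(tp_num))
    return 'CUDA_VISIBLE_DEVICES=' + devices

assert cuda_prefix_for_worker('gw1', tp_num=2) == 'CUDA_VISIBLE_DEVICES=2,3'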

autotest/tools/pipeline/test_pipeline_chat_pytorch.py

Lines changed: 104 additions & 9 deletions
@@ -7,19 +7,13 @@
                                      run_pipeline_chat_test)


-def getModelList(tp_num):
-    return [
-        item for item in get_torch_model_list(tp_num)
-        if 'falcon' not in item.lower() and 'chatglm2' not in item.lower()
-    ]
-
-
 @pytest.mark.order(6)
 @pytest.mark.usefixtures('common_case_config')
 @pytest.mark.pipeline_chat_pytorch
 @pytest.mark.gpu_num_1
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('model', getModelList(tp_num=1))
+@pytest.mark.parametrize('model',
+                         get_torch_model_list(tp_num=1, exclude_dup=True))
 def test_pipeline_chat_pytorch_tp1(config, common_case_config, model,
                                    worker_id):
     if 'gw' in worker_id:
@@ -39,7 +33,8 @@ def test_pipeline_chat_pytorch_tp1(config, common_case_config, model,
 @pytest.mark.pipeline_chat_pytorch
 @pytest.mark.gpu_num_2
 @pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('model', getModelList(tp_num=2))
+@pytest.mark.parametrize('model',
+                         get_torch_model_list(tp_num=2, exclude_dup=True))
 def test_pipeline_chat_pytorch_tp2(config, common_case_config, model,
                                    worker_id):
     if 'gw' in worker_id:
@@ -57,6 +52,106 @@ def test_pipeline_chat_pytorch_tp2(config, common_case_config, model,
                              worker_id)


+@pytest.mark.order(6)
+@pytest.mark.usefixtures('common_case_config')
+@pytest.mark.pipeline_chat
+@pytest.mark.gpu_num_1
+@pytest.mark.flaky(reruns=0)
+@pytest.mark.parametrize('model',
+                         get_torch_model_list(tp_num=1,
+                                              quant_policy=4,
+                                              exclude_dup=True))
+def test_pipeline_chat_kvint4_tp1(config, common_case_config, model,
+                                  worker_id):
+    if 'Qwen2' in model:
+        return  # kvint4 for qwen2 is not support
+    if 'gw' in worker_id:
+        os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id)
+    p = Process(target=run_pipeline_chat_test,
+                args=(config, common_case_config, model, 'pytorch-kvint',
+                      worker_id, {
+                          'quant_policy': 4
+                      }))
+    p.start()
+    p.join()
+    assert_pipeline_chat_log(config, common_case_config, model,
+                             'pytorch-kvint', worker_id)
+
+
+@pytest.mark.order(6)
+@pytest.mark.usefixtures('common_case_config')
+@pytest.mark.pipeline_chat
+@pytest.mark.gpu_num_2
+@pytest.mark.flaky(reruns=0)
+@pytest.mark.parametrize('model',
+                         get_torch_model_list(tp_num=2,
+                                              quant_policy=4,
+                                              exclude_dup=True))
+def test_pipeline_chat_kvint4_tp2(config, common_case_config, model,
+                                  worker_id):
+    if 'gw' in worker_id:
+        os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id,
+                                                                     tp_num=2)
+    p = Process(target=run_pipeline_chat_test,
+                args=(config, common_case_config, model, 'pytorch-kvint',
+                      worker_id, {
+                          'quant_policy': 4
+                      }))
+    p.start()
+    p.join()
+    assert_pipeline_chat_log(config, common_case_config, model,
+                             'pytorch-kvint', worker_id)
+
+
+@pytest.mark.order(6)
+@pytest.mark.usefixtures('common_case_config')
+@pytest.mark.pipeline_chat
+@pytest.mark.gpu_num_1
+@pytest.mark.flaky(reruns=0)
+@pytest.mark.parametrize('model',
+                         get_torch_model_list(tp_num=1,
+                                              quant_policy=8,
+                                              exclude_dup=True))
+def test_pipeline_chat_kvint8_tp1(config, common_case_config, model,
+                                  worker_id):
+    if 'gw' in worker_id:
+        os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id)
+    p = Process(target=run_pipeline_chat_test,
+                args=(config, common_case_config, model, 'pytorch-kvint',
+                      worker_id, {
+                          'quant_policy': 8
+                      }))
+    p.start()
+    p.join()
+    assert_pipeline_chat_log(config, common_case_config, model,
+                             'pytorch-kvint', worker_id)
+
+
+@pytest.mark.order(6)
+@pytest.mark.usefixtures('common_case_config')
+@pytest.mark.pipeline_chat
+@pytest.mark.gpu_num_2
+@pytest.mark.flaky(reruns=0)
+@pytest.mark.parametrize('model',
+                         get_torch_model_list(tp_num=2,
+                                              quant_policy=8,
+                                              exclude_dup=True))
+def test_pipeline_chat_kvint8_tp2(config, common_case_config, model,
+                                  worker_id):
+    if 'gw' in worker_id:
+        os.environ['CUDA_VISIBLE_DEVICES'] = get_cuda_id_by_workerid(worker_id,
+                                                                     tp_num=2)
+    p = Process(target=run_pipeline_chat_test,
+                args=(config, common_case_config, model, 'pytorch-kvint',
+                      worker_id, {
+                          'quant_policy': 8
+                      }))
+    p.start()
+    p.join()
+    assert_pipeline_chat_log(config, common_case_config, model,
+                             'pytorch-kvint', worker_id)
+
+
 @pytest.mark.order(6)
 @pytest.mark.usefixtures('common_case_config')
 @pytest.mark.pipeline_chat_pytorch
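Each kvint test forwards an extra dict ({'quant_policy': 4} or {'quant_policy': 8}) to run_pipeline_chat_test inside a subprocess, so every parametrized case gets a fresh CUDA context. A hedged sketch of the receiving side, assuming the extra kwargs are copied onto the PyTorch backend config; the real implementation lives in the autotest utils and may differ:

# Hypothetical receiver-side sketch; not the actual run_pipeline_chat_test.
from lmdeploy import PytorchEngineConfig, pipeline

def build_pytorch_pipeline(model_path, tp_num=1, extra=None):
    backend_config = PytorchEngineConfig(tp=tp_num)
    for key, value in (extra or {}).items():
        # e.g. quant_policy=4 -> kv int4 cache, quant_policy=8 -> kv int8
        setattr(backend_config, key, value)
    return pipeline(model_path, backend_config=backend_config)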
