diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 00f7e64b61..0848efe1f1 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -17,7 +17,7 @@ on:
         required: true
         description: 'Set benchmark type. Default is "["generation", "throughput", "api_server"]"'
         type: string
-        default: "['apiserver', 'generation', 'throughput']"
+        default: "['apiserver', 'throughput']"
       offline_mode:
         required: true
         description: 'Whether start a offline mode, if true, you should prepare code and whl package by yourself'
diff --git a/autotest/benchmark/test_generation_performance.py b/autotest/benchmark/test_generation_performance.py
deleted file mode 100644
index 24ed978d25..0000000000
--- a/autotest/benchmark/test_generation_performance.py
+++ /dev/null
@@ -1,121 +0,0 @@
-import pytest
-from utils.benchmark_utils import generation_test
-from utils.config_utils import get_benchmark_model_list, get_cuda_prefix_by_workerid
-
-
-@pytest.mark.gpu_num_1
-@pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=1))
-def test_generation_tp1(config, run_id, run_config, worker_id):
-    result, msg = generation_test(config,
-                                  run_id,
-                                  run_config,
-                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=1),
-                                  worker_id=worker_id)
-
-    assert result, msg
-
-
-@pytest.mark.gpu_num_1
-@pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=1, is_longtext=True))
-def test_generation_longtext_tp1(config, run_id, run_config, worker_id):
-    result, msg = generation_test(config,
-                                  run_id,
-                                  run_config,
-                                  is_longtext=True,
-                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=1),
-                                  worker_id=worker_id)
-
-    assert result, msg
-
-
-@pytest.mark.gpu_num_2
-@pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=2))
-def test_generation_tp2(config, run_id, run_config, worker_id):
-    result, msg = generation_test(config,
-                                  run_id,
-                                  run_config,
-                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2),
-                                  worker_id=worker_id)
-
-    assert result, msg
-
-
-@pytest.mark.gpu_num_2
-@pytest.mark.longtext
-@pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=2, is_longtext=True))
-def test_generation_longtext_tp2(config, run_id, run_config, worker_id):
-    result, msg = generation_test(config,
-                                  run_id,
-                                  run_config,
-                                  is_longtext=True,
-                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2),
-                                  worker_id=worker_id)
-
-    assert result, msg
-
-
-@pytest.mark.gpu_num_4
-@pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=4))
-def test_generation_tp4(config, run_id, run_config, worker_id):
-    result, msg = generation_test(config,
-                                  run_id,
-                                  run_config,
-                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=4),
-                                  worker_id=worker_id)
-
-    assert result, msg
-
-
-@pytest.mark.gpu_num_4
-@pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=4, is_longtext=True))
-def test_generation_longtext_tp4(config, run_id, run_config, worker_id):
-    result, msg = generation_test(config,
-                                  run_id,
-                                  run_config,
-                                  is_longtext=True,
-                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=4),
-                                  worker_id=worker_id)
-
-    assert result, msg
-
-
-@pytest.mark.gpu_num_8
-@pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', get_benchmark_model_list(tp_num=8))
-def test_generation_tp8(config, run_id, run_config, worker_id):
-    result, msg = generation_test(config,
-                                  run_id,
-                                  run_config,
-                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=8),
-                                  worker_id=worker_id)
-
-    assert result, msg
-
-
-@pytest.mark.function
-@pytest.mark.flaky(reruns=0)
-@pytest.mark.parametrize('run_config', [{
-    'model': 'internlm/internlm2_5-20b-chat',
-    'backend': 'pytorch',
-    'tp_num': 2
-}, {
-    'model': 'internlm/internlm2_5-20b-chat-inner-4bits',
-    'backend': 'turbomind',
-    'quant_policy': 0,
-    'tp_num': 2
-}])
-def test_generation_fun_tp2(config, run_id, run_config, worker_id):
-    result, msg = generation_test(config,
-                                  run_id,
-                                  run_config,
-                                  cuda_prefix=get_cuda_prefix_by_workerid(worker_id, tp_num=2),
-                                  worker_id=worker_id,
-                                  is_smoke=True)
-
-    assert result, msg
diff --git a/autotest/utils/benchmark_utils.py b/autotest/utils/benchmark_utils.py
index 852c21c047..3a57aefcb4 100644
--- a/autotest/utils/benchmark_utils.py
+++ b/autotest/utils/benchmark_utils.py
@@ -12,54 +12,6 @@
 GENERATION_LONGTEXT_CONFIG = ' -c 1 --session-len 200000 -ct 1024 -pt 198000'
 
 
-def generation_test(config,
-                    run_id,
-                    run_config,
-                    is_longtext: bool = False,
-                    cuda_prefix: str = None,
-                    worker_id: str = '',
-                    is_smoke: bool = False):
-    model = run_config['model']
-    backend = run_config['backend']
-    tp_num = run_config['tp_num']
-    model_path = '/'.join([config.get('model_path'), model])
-    log_path = config.get('log_path')
-    benchmark_log = os.path.join(log_path, 'benchmark_generation_' + model.split('/')[1] + worker_id + '.log')
-    benchmark_path = '/'.join([config.get('benchmark_path'), run_id, model, f'benchmark-generation-{backend}'])
-
-    create_multi_level_directory(benchmark_path)
-
-    print(cuda_prefix)
-    command = f'python3 benchmark/profile_generation.py {model_path} '
-    command = get_command_with_extra(command, cuda_prefix)
-
-    run_config = ''
-    if backend == 'pytorch':
-        command += ' --backend pytorch'
-        if not _is_bf16_supported_by_device():
-            command += ' --dtype float16'
-    else:
-        if '4bit' in model:
-            command += ' --model-format awq'
-
-    if is_longtext:
-        run_config = run_config + GENERATION_LONGTEXT_CONFIG
-        csv_path = f'{benchmark_path}/generation_longtext.csv'
-    else:
-        run_config = run_config + GENERATION_CONFIG
-        csv_path = f'{benchmark_path}/generation.csv'
-    if is_smoke:
-        run_config = ' -c 1 -ct 128 -pt 128'
-
-    cmd = ' '.join([command, run_config, '--tp', str(tp_num), get_max_cache_entry(model, backend), '--csv', csv_path])
-
-    returncode, stderr = run_testcase(cmd, benchmark_log)
-    allure.attach.file(benchmark_log, attachment_type=allure.attachment_type.TEXT)
-    if returncode == 0 and not os.path.isfile(csv_path):
-        return False, 'result is empty'
-    return returncode == 0, stderr
-
-
 def throughput_test(config, run_id, run_config, cuda_prefix: str = None, worker_id: str = '', is_smoke: bool = False):
     model = run_config['model']
     backend = run_config['backend']
diff --git a/benchmark/README.md b/benchmark/README.md
index 9e56768640..c9ded496f0 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -19,20 +19,6 @@ python profile_throughput.py \
     --concurrency 64
 ```
 
-## profile without dataset
-
-`profile_generation.py` perform benchmark with dummy data.
-
-```shell
-pip install nvidia-ml-py
-```
-
-```bash
-python profile_generation.py \
-    /path/to/your/model \
-    --concurrency 1 8 --prompt-tokens 1 512 --completion-tokens 2048 512
-```
-
 ## profile restful api
 
 `profile_restful_api.py` is used to do benchmark on api server.
diff --git a/benchmark/profile_generation.py b/benchmark/profile_generation.py
deleted file mode 100644
index 5e5fa62d15..0000000000
--- a/benchmark/profile_generation.py
+++ /dev/null
@@ -1,466 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import argparse
-import asyncio
-import csv
-import os
-import time
-from dataclasses import dataclass
-from itertools import count
-from typing import List, Union
-
-import numpy as np
-from pynvml import (NVMLError, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
-                    nvmlDeviceGetName, nvmlDeviceGetPowerState, nvmlDeviceGetTemperature, nvmlInit, nvmlShutdown,
-                    nvmlSystemGetDriverVersion)
-from tqdm import tqdm
-
-from lmdeploy.cli.utils import ArgumentHelper, DefaultsAndTypesHelpFormatter
-from lmdeploy.messages import GenerationConfig, PytorchEngineConfig, TurbomindEngineConfig
-from lmdeploy.utils import get_logger
-
-get_logger('lmdeploy').setLevel('WARNING')
-os.environ['TM_LOG_LEVEL'] = 'ERROR'
-global_session_id = count(0)
-
-
-async def infer(model, worker_id: int, input_ids: List, gen_config: GenerationConfig, test_round: int,
-                que: asyncio.Queue):
-    if worker_id == 1:
-        pbar = tqdm(total=test_round)
-    chatbot = model.create_instance()
-    output_seqlen = gen_config.max_new_tokens
-    stats = []
-    for _ in range(test_round):
-        token_latency_stats = [0] * (output_seqlen + 1)
-        prev = time.perf_counter()
-        n_prev_token = 0
-        """The iterator provided by `stream_infer` denotes the number of
-        generated tokens so far, which is represented by the variable
-        `n_token`.
-
-        Please note that `n_token` is not a continuous value. In other words, during the iteration,
-        its value might be 5, 7, 8, 16, and so on, rather than 1, 2, 3, 4, etc.
-        So, it is quite difficult to get the latency of each generated token.
-        As a work-around, we set the latency `now-prev` of each iteration to the first token of
-        the new generated tokens, and leave the latency of the rest tokens being 0.
-        For example, in the first iteration, 5 tokens are generated.
-        The time elapsing in this iteration `now-prev` is set to the latency of first token of
-        the 5 tokens, i.e. `token_latency_stats[0]`, and `token_latency_stats[1:4]` is set 0`
-        """  # noqa: E501
-        session_id = next(global_session_id)
-        async for outputs in chatbot.async_stream_infer(session_id,
-                                                        input_ids,
-                                                        gen_config=gen_config,
-                                                        sequence_start=True,
-                                                        sequence_end=True,
-                                                        stream_output=True):
-            n_token = outputs.num_token
-            now = time.perf_counter()
-            if n_prev_token != n_token:
-                token_latency_stats[n_prev_token] = np.round(now - prev, 3)
-                n_prev_token = n_token
-            prev = now
-        # for pytorch engine to restart a session
-        if hasattr(chatbot, 'end'):
-            await chatbot.async_end(session_id)
-        if worker_id == 1:
-            pbar.update(1)
-
-        assert output_seqlen <= n_token <= output_seqlen + 1, \
-            f'Error. session_id({session_id}) request {output_seqlen} ' \
-            f'tokens, but generate {n_token} tokens'
-        stats.append(token_latency_stats[:output_seqlen])
-    await que.put((worker_id, stats))
-
-
-def warmup(model, concurrency: int, input_ids: List[int], warmup_round: int, gen_config: GenerationConfig,
-           event_loop: asyncio.BaseEventLoop):
-    if not warmup_round:
-        return
-
-    print('start to warmup ...')
-
-    async def _infer(model):
-        chatbot = model.create_instance()
-        for _ in range(warmup_round):
-            session_id = next(global_session_id)
-            async for _ in chatbot.async_stream_infer(session_id,
-                                                      input_ids=input_ids,
-                                                      sequence_start=True,
-                                                      sequence_end=True,
-                                                      ignore_eos=True,
-                                                      gen_config=gen_config):
-                continue
-            # for pytorch engine to restart a session
-            if hasattr(chatbot, 'end'):
-                await chatbot.async_end(session_id)
-
-    _start = time.perf_counter()
-
-    # start threads
-    tasks = []
-    for i in range(concurrency):
-        task = _infer(model)
-        tasks.append(task)
-
-    async def _gather_tasks(tasks):
-        return await asyncio.gather(*tasks)
-
-    event_loop.run_until_complete(_gather_tasks(tasks))
-
-    _end = time.perf_counter()
-    print(f'end warmup, elapsed time: {round(_end - _start, 2)}s')
-
-
-def profile_throughput(model_path: str, concurrency: int, input_seqlen: int,
-                       engine_config: Union[PytorchEngineConfig, TurbomindEngineConfig], gen_config: GenerationConfig,
-                       test_round: int, warmup_round: int):
-    output_seqlen = gen_config.max_new_tokens
-    print(f'profiling ... concurrency: {concurrency}, '
-          f'n_prompt_token: {input_seqlen}, '
-          f'n_completion_token: {output_seqlen}, '
-          f'test_round: {test_round}, warmup_round: {warmup_round}')
-    if isinstance(engine_config, TurbomindEngineConfig):
-        from lmdeploy.turbomind import TurboMind
-        tm_model = TurboMind.from_pretrained(model_path, engine_config=engine_config)
-    elif isinstance(engine_config, PytorchEngineConfig):
-        from lmdeploy.pytorch.engine import Engine
-        tm_model = Engine(model_path, engine_config=engine_config)
-
-    event_loop = asyncio.new_event_loop()
-    asyncio.set_event_loop(event_loop)
-
-    # make up a dummy `input_ids` with the length of `input_seqlen` exactly
-    assert input_seqlen > 0, 'input_seqlen should > 0'
-    input_ids = np.random.randint(low=0, high=101, size=input_seqlen).tolist()
-    warmup(tm_model, concurrency, input_ids, warmup_round, gen_config, event_loop)
-
-    que = asyncio.Queue()
-    _start = time.perf_counter()
-
-    tasks = []
-    for i in range(concurrency):
-        task = infer(tm_model, i + 1, input_ids, gen_config, test_round, que)
-        tasks.append(task)
-
-    async def _gather_tasks(tasks):
-        return await asyncio.gather(*tasks)
-
-    event_loop.run_until_complete(_gather_tasks(tasks))
-
-    _end = time.perf_counter()
-    elapsed_time = _end - _start
-
-    tm_model.close()
-
-    token_latency_stats = []
-    while not que.empty():
-        _, _stats = que.get_nowait()
-        token_latency_stats += _stats
-
-    # The shape is [concurrency*test_round, output_seqlen]
-    token_latency_stats = np.stack(token_latency_stats, axis=0)
-
-    first_token_latency_min = np.round(np.min(token_latency_stats[:, 0], axis=0), 3)
-    first_token_latency_max = np.round(np.max(token_latency_stats[:, 0], axis=0), 3)
-    first_token_latency_ave = np.round(np.mean(token_latency_stats[:, 0], axis=0), 3)
-    token_latency_max = np.round(np.max(np.sum(token_latency_stats, axis=1)), 3)
-    token_latency_min = np.round(np.min(np.sum(token_latency_stats, axis=1)), 3)
-    token_latency_ave = np.round(np.mean(np.sum(token_latency_stats, axis=1)), 3)
-    if output_seqlen > 1:
-        # sort token_latency without the first token's latency
-        sorted_token_latency = np.sort(token_latency_stats[:, 1:].flatten())
-        percentiles = [
-            np.round(sorted_token_latency[int(percent * len(sorted_token_latency))], 3)
-            for percent in [0.5, 0.75, 0.95, 0.99]
-        ]
-    else:
-        percentiles = [
-            first_token_latency_ave,
-        ] * 4
-
-    out_token_throughput = np.round(token_latency_stats.size / elapsed_time, 2)
-    total_token_throughput = np.round(concurrency * test_round * (input_seqlen + output_seqlen) / elapsed_time, 2)
-    print(f'\n{" - " * 50}\ntotal time: {elapsed_time:.2f}s\n'
-          f'concurrency: {concurrency}, test_round: {test_round}\n'
-          f'input_tokens: {input_seqlen}, output_tokens: {output_seqlen}\n'
-          f'first_token latency(min, max, ave): '
-          f'{first_token_latency_min}s, {first_token_latency_max}s, '
-          f'{first_token_latency_ave}s\ntotal_token latency(min, max, ave): '
-          f'{token_latency_min}s, {token_latency_max}s, '
-          f'{token_latency_ave}s\n'
-          f'token_latency percentiles(50%,75%,95%,99%)(s): {percentiles}\n'
-          f'throughput(output): {out_token_throughput} token/s\n'
-          f'throughput(total): {total_token_throughput} token/s\n{" - " * 50}')
-    return model_path, \
-        [first_token_latency_min, first_token_latency_max,
-         first_token_latency_ave], \
-        percentiles, out_token_throughput, total_token_throughput, \
-        tm_model.gpu_count
-
-
-class MemoryMonitor:
-
-    @classmethod
-    def init(cls):
-        from multiprocessing import Manager
-        cls.max_mem = Manager().Value('f', 0)  # GB
-        cls.device_count = Manager().Value('f', 0)
-
-    @staticmethod
-    def nvidia_info():
-        # pip install nvidia-ml-py
-        nvidia_dict = {'state': True, 'nvidia_version': '', 'nvidia_count': 0, 'gpus': []}
-        try:
-            nvmlInit()
-            nvidia_dict['nvidia_version'] = nvmlSystemGetDriverVersion()
-            nvidia_dict['nvidia_count'] = nvmlDeviceGetCount()
-            for i in range(nvidia_dict['nvidia_count']):
-                handle = nvmlDeviceGetHandleByIndex(i)
-                memory_info = nvmlDeviceGetMemoryInfo(handle)
-                gpu = {
-                    'gpu_name': nvmlDeviceGetName(handle),
-                    'total': memory_info.total,
-                    'free': memory_info.free,
-                    'used': memory_info.used,
-                    'temperature': f'{nvmlDeviceGetTemperature(handle, 0)}℃',
-                    'powerStatus': nvmlDeviceGetPowerState(handle)
-                }
-                nvidia_dict['gpus'].append(gpu)
-        except NVMLError as _:  # noqa
-            nvidia_dict['state'] = False
-        except Exception as _:  # noqa
-            nvidia_dict['state'] = False
-        finally:
-            try:
-                nvmlShutdown()
-            except:  # noqa
-                pass
-        return nvidia_dict
-
-    @classmethod
-    def mem_monitor(cls):
-        info = cls.nvidia_info()
-        max_mem = 0
-        mem_start = 0
-        cls.device_count.value = len(info['gpus'])
-        for used_total in info['gpus']:
-            mem_start += used_total['used']
-        while True:
-            info = cls.nvidia_info()
-            used = 0
-            for used_total in info['gpus']:
-                used += used_total['used']
-            if used > max_mem:
-                max_mem = used
-                cls.max_mem.value = (max_mem - mem_start) / (1 << 30)
-
-    @classmethod
-    def start(cls):
-        cls._running = True
-        from multiprocessing import Process
-        cls.proc = Process(target=cls.mem_monitor, daemon=True)
-        cls.proc.start()
-
-    @classmethod
-    def terminate(cls) -> float:
-        """Terminate the subprocess and return maximum memory."""
-        cls.proc.kill()
-        return cls.max_mem.value
-
-
-@dataclass
-class ProfileResult:
-    model_name: str
-    batch: int
-    prompt_tokens: int
-    completion_tokens: int
-    first_token_latency: List
-    percentiles: List
-    output_throughput: float
-    total_throughput: float
-    mem_per_gpu: float
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Profile the token generation performance with'
-                                     ' pytorch or turbomind engine',
-                                     formatter_class=DefaultsAndTypesHelpFormatter)
-    parser.add_argument('model_path',
-                        type=str,
-                        help='the path of the model in localhost or '
-                        'the repo_id of the model in huggingface.co')
-    parser.add_argument('-c',
-                        '--concurrency',
-                        nargs='+',
-                        type=int,
-                        help='how many requests launched concurrently',
-                        default=[1, 16, 32, 64])
-    parser.add_argument('-pt',
-                        '--prompt-tokens',
-                        nargs='+',
-                        type=int,
-                        help='how many tokens in the prompt. One-to-one '
-                        'correspondence with completion-tokens',
-                        default=[1, 128, 128, 2048, 2048])
-    parser.add_argument('-ct',
-                        '--completion-tokens',
-                        nargs='+',
-                        type=int,
-                        help='how many tokens to be generated. One-to-one '
-                        'correspondence with prompt-tokens',
-                        default=[128, 128, 2048, 128, 2048])
-    parser.add_argument('--csv', type=str, help='Where to save the result.', default='profile_generation.csv')
-    parser.add_argument('-tr', '--test-round', type=int, help='number of test rounds', default=3)
-    parser.add_argument('-w', '--warmup-round', type=int, help='number of warmup rounds', default=1)
-
-    # other args
-    ArgumentHelper.top_p(parser)
-    ArgumentHelper.temperature(parser)
-    ArgumentHelper.top_k(parser)
-    ArgumentHelper.backend(parser)
-    # pytorch engine args
-    pt_group = parser.add_argument_group('PyTorch engine arguments')
-    ArgumentHelper.eager_mode(pt_group)
-
-    tp_act = ArgumentHelper.tp(pt_group)
-    cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group)
-    cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group)
-    session_len_act = ArgumentHelper.session_len(pt_group, default=2048)
-    prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group)
-    rope_scaling_factor_act = ArgumentHelper.rope_scaling_factor(pt_group)
-    dtype_act = ArgumentHelper.dtype(pt_group)
-
-    # turbomind engine args
-    tb_group = parser.add_argument_group('TurboMind engine argument')
-    tb_group._group_actions.append(tp_act)
-    tb_group._group_actions.append(session_len_act)
-    tb_group._group_actions.append(cache_count_act)
-    tb_group._group_actions.append(cache_block_seq_len_act)
-    tb_group._group_actions.append(prefix_caching_act)
-    tb_group._group_actions.append(rope_scaling_factor_act)
-    tb_group._group_actions.append(dtype_act)
-    ArgumentHelper.model_format(tb_group, default='hf')
-    args = parser.parse_args()
-    return args
-
-
-def __proc_cb(*args, ret_pipe, target):
-    try:
-        ret = target(*args)
-        ret_pipe[1].send(ret)
-    except Exception as e:
-        ret_pipe[1].send(e)
-
-
-def _process_map(target, iterable):
-    from multiprocessing import Pipe, get_context
-
-    pipe = Pipe(False)
-    spawn_context = get_context('spawn')
-    proc = spawn_context.Process(target=__proc_cb, args=iterable, kwargs=dict(ret_pipe=pipe, target=target))
-    proc.start()
-    proc.join()
-
-    ret = pipe[0].recv()
-    if isinstance(ret, Exception):
-        raise ret
-
-    return ret
-
-
-def main():
-    args = parse_args()
-    assert len(args.prompt_tokens) == len(args.completion_tokens), \
-        f'mismatched size between `prompt-tokens` and `completion-tokenes`' \
-        f', {len(args.prompt_tokens)} vs {len(args.completion_tokens)}'
-
-    results: List[ProfileResult] = []
-
-    MemoryMonitor.init()
-    for batch in args.concurrency:
-        for prompt_tokens, completion_tokens in zip(args.prompt_tokens, args.completion_tokens):
-            MemoryMonitor.start()
-            from functools import partial
-
-            # make sure session_len >= prompt_tokens + completion_tokens
-            session_len = max(args.session_len, prompt_tokens + completion_tokens)
-            if args.backend == 'turbomind':
-                engine_config = TurbomindEngineConfig(
-                    cache_max_entry_count=args.cache_max_entry_count,
-                    cache_block_seq_len=args.cache_block_seq_len,
-                    model_format=args.model_format,
-                    session_len=session_len,
-                    rope_scaling_factor=args.rope_scaling_factor,
-                    tp=args.tp,
-                    enable_prefix_caching=args.enable_prefix_caching,
-                    dtype=args.dtype,
-                )
-            elif args.backend == 'pytorch':
-                engine_config = PytorchEngineConfig(
-                    cache_max_entry_count=args.cache_max_entry_count,
-                    block_size=args.cache_block_seq_len,
-                    session_len=session_len,
-                    tp=args.tp,
-                    eager_mode=args.eager_mode,
-                    enable_prefix_caching=args.enable_prefix_caching,
-                    dtype=args.dtype,
-                )
-            gen_config = GenerationConfig(top_k=args.top_k,
-                                          top_p=args.top_p,
-                                          temperature=args.temperature,
-                                          max_new_tokens=completion_tokens,
-                                          ignore_eos=True)
-            profile_target = partial(
-                profile_throughput,
-                concurrency=batch,
-                input_seqlen=prompt_tokens,
-                engine_config=engine_config,
-                gen_config=gen_config,
-                test_round=args.test_round,
-                warmup_round=args.warmup_round,
-            )
-            output = _process_map(profile_target, (args.model_path, ))
-            model_name, first_token_latency, percentiles, \
-                output_throughput, total_throughput, tp = output
-            time.sleep(5)  # wait a while for releasing GPU mem
-            memory = MemoryMonitor.terminate()
-            results.append(
-                ProfileResult(model_name=model_name,
-                              batch=batch,
-                              prompt_tokens=prompt_tokens,
-                              completion_tokens=completion_tokens,
-                              first_token_latency=first_token_latency,
-                              percentiles=percentiles,
-                              output_throughput=output_throughput,
-                              total_throughput=total_throughput,
-                              mem_per_gpu=memory / tp))
-    if args.csv:
-        with open(args.csv, 'w') as csvfile:
-            writer = csv.writer(csvfile)
-            writer.writerow([
-                'batch',
-                'prompt_tokens',
-                'completion_tokens',
-                'throughput(total tok/s)',
-                'throughput(out tok/s)',
-                'mem(GB)',
-                'FTL(ave)(s)',
-                'FTL(min)(s)',
-                'FTL(max)(s)',
-                '50%(s)',
-                '75%(s)',
-                '95%(s)',
-                '99%(s)',
-            ])
-            for re in results:
-                writer.writerow([
-                    re.batch, re.prompt_tokens, re.completion_tokens, f'{re.total_throughput:.2f}',
-                    f'{re.output_throughput:.2f}', f'{re.mem_per_gpu:.2f}', re.first_token_latency[2],
-                    re.first_token_latency[0], re.first_token_latency[1], re.percentiles[0], re.percentiles[1],
-                    re.percentiles[2], re.percentiles[3]
-                ])
-
-
-if __name__ == '__main__':
-    main()
diff --git a/docs/en/quantization/w4a16.md b/docs/en/quantization/w4a16.md
index b6ca2bbf92..4288480b7c 100644
--- a/docs/en/quantization/w4a16.md
+++ b/docs/en/quantization/w4a16.md
@@ -116,7 +116,7 @@ You can overview and try out `api_server` APIs online by swagger UI at `http://0
 
 ## Performance
 
-We benchmarked the Llama-2-7B-chat and Llama-2-13B-chat models with 4-bit quantization on NVIDIA GeForce RTX 4090 using [profile_generation.py](https://github.com/InternLM/lmdeploy/blob/main/benchmark/profile_generation.py). And we measure the token generation throughput (tokens/s) by setting a single prompt token and generating 512 tokens. All the results are measured for single batch inference.
+We benchmarked the Llama-2-7B-chat and Llama-2-13B-chat models with 4-bit quantization on NVIDIA GeForce RTX 4090. And we measure the token generation throughput (tokens/s) by setting a single prompt token and generating 512 tokens. All the results are measured for single batch inference.
 
 | model            | llm-awq | mlc-llm | turbomind |
 | ---------------- | ------- | ------- | --------- |
diff --git a/docs/zh_cn/quantization/w4a16.md b/docs/zh_cn/quantization/w4a16.md
index 73edaea937..4aa6b07193 100644
--- a/docs/zh_cn/quantization/w4a16.md
+++ b/docs/zh_cn/quantization/w4a16.md
@@ -117,7 +117,7 @@ lmdeploy serve api_client http://0.0.0.0:23333
 
 ## 推理性能
 
-我们在 NVIDIA GeForce RTX 4090 上使用 [profile_generation.py](https://github.com/InternLM/lmdeploy/blob/main/benchmark/profile_generation.py),分别测试了 4-bit Llama-2-7B-chat 和 Llama-2-13B-chat 模型的 token 生成速度。测试配置为 batch size = 1,(prompt_tokens, completion_tokens) = (1, 512)
+我们在 NVIDIA GeForce RTX 4090 上分别测试了 4-bit Llama-2-7B-chat 和 Llama-2-13B-chat 模型的 token 生成速度。测试配置为 batch size = 1,(prompt_tokens, completion_tokens) = (1, 512)
 
 | model            | llm-awq | mlc-llm | turbomind |
 | ---------------- | ------- | ------- | --------- |