From f57a621ec193e06df622dddeda46e30ed1a53d56 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 10:06:05 +0800
Subject: [PATCH 01/15] [benchmark] support https scheme server url

---
 benchmark/profile_restful_api.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 1cf5ea267..610229ab3 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -5,6 +5,7 @@
 from queue import Queue
 from threading import Thread
 from typing import List, Optional, Tuple
+from urllib.parse import urlparse
 
 import fire
 import numpy as np
@@ -241,10 +242,12 @@ def main(server_addr: str,
         csv (str, optional): The path to save the result.
         seed (int, optional): Seed used in sampling prompts from dataset. Defaults to 0.
     """  # noqa
-    if not server_addr.startswith('http://'):
+    addr_schem = urlparse(server_addr).scheme
+    if addr_schem not in ["http", "https"]:
         print(f'[WARNING] server_addr of the api_server should '
-              f'start with "http://", but got "{server_addr}"')
+              f'start with "http://" or "https://", but got "{server_addr}"')
         server_addr = 'http://' + server_addr.strip()
+    print(f'[INFO] using server_addr: {server_addr}')
 
     random.seed(seed)

From 8d6b95bbef1ee9c69a42021dc93f5204e3221734 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 11:28:54 +0800
Subject: [PATCH 02/15] [benchmark] calculate the real output tokens

---
 benchmark/profile_restful_api.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 610229ab3..ab009b189 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -94,6 +94,7 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
                 req_queue.get, [None, None, None]):
             timestamps = []
             timestamps.append(time.perf_counter())
+            full_output = ""
             for output in client.chat_completions_v1(
                     model=self.model_name,
                     messages=prompt,
@@ -104,6 +105,13 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
                     stream=stream_output,
                     session_id=session_id,
                     ignore_eos=True):
+                # Here we ignore the index of the multiple outputs and
+                # just put all of them together to compute tokens.
+                for choice in output.get("choices", []):
+                    if stream_output:
+                        full_output += choice["delta"]["content"]
+                    else:
+                        full_output += choice["message"]["content"]
                 timestamps.append(time.perf_counter())
 
             first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
@@ -111,9 +119,10 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
             # assert output.pop('finish_reason') == 'length', \
             #     f'Error. session_id({session_id}) request {output_seqlen} ' \
             #     f'tokens, but `finish_reason` is not `length`'
-            total_tokens = input_seqlen + output_seqlen
+            real_output_seqlen = len(self.tokenizer(full_output).input_ids)
+            total_tokens = input_seqlen + real_output_seqlen
             stats.append([
-                first_token_latency, output_seqlen, output_seqlen,
+                first_token_latency, real_output_seqlen, output_seqlen,
                 total_tokens, token_latency
             ])
             self.pbar.update(1)

From 1740b5ab063b8d523615f28e30881cad0ed69827 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 11:59:06 +0800
Subject: [PATCH 03/15] [benchmark] calculate local tokenlizer time

---
 benchmark/profile_restful_api.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index ab009b189..ad954a88f 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -119,11 +119,14 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
             # assert output.pop('finish_reason') == 'length', \
             #     f'Error. session_id({session_id}) request {output_seqlen} ' \
             #     f'tokens, but `finish_reason` is not `length`'
+            tokenlizer_start = time.perf_counter()
             real_output_seqlen = len(self.tokenizer(full_output).input_ids)
+            tokenlizer_finish = time.perf_counter()
+            tokenlizer_time = tokenlizer_finish - tokenlizer_start
             total_tokens = input_seqlen + real_output_seqlen
             stats.append([
                 first_token_latency, real_output_seqlen, output_seqlen,
-                total_tokens, token_latency
+                total_tokens, token_latency, tokenlizer_time
             ])
             self.pbar.update(1)
 
@@ -167,7 +170,10 @@ def process_request(self,
             #     f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
             stats.append(np.array(_stats))
 
-        stats = np.concatenate(stats).reshape(-1, 5)
+        stats = np.concatenate(stats).reshape(-1, 6)
+
+        tokenlizer_time = np.sum(stats[:, 5], axis=0) / concurrency
+        elapsed_time -= tokenlizer_time
 
         first_token_latency_min = np.min(stats[:, 0], axis=0)
         first_token_latency_max = np.max(stats[:, 0], axis=0)
@@ -176,6 +182,7 @@ def process_request(self,
         request_output_tokens = np.sum(stats[:, 2], axis=0)
         total_tokens = np.sum(stats[:, 3], axis=0)
         prompt_tokens = total_tokens - completion_tokens
+        local_tokenlizer_throughput = completion_tokens / tokenlizer_time
         completion_token_throughput = completion_tokens / elapsed_time
         total_token_throughput = total_tokens / elapsed_time
         rps = len(requests) / elapsed_time
@@ -196,6 +203,7 @@ def process_request(self,
         print(
             f'number of prompt tokens: {prompt_tokens:.0f}\n'
             f'number of completion tokens: {completion_tokens:.0f}\n'
+            f'local tokenlizer throughput (completion token): {local_tokenlizer_throughput:.3f} token/s\n'  # noqa
             f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n'  # noqa
             f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n'  # noqa
             f'RPS (request per second): {rps:.3f} req/s\n'

From 3acbf636f76be324d366e45895ecc1bc5912acb9 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 12:09:27 +0800
Subject: [PATCH 04/15] [benchmark] fix linting

---
 benchmark/profile_restful_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index ad954a88f..7f3aba4ea 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -105,7 +105,7 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
                     stream=stream_output,
                     session_id=session_id,
                     ignore_eos=True):
-                # Here we ignore the index of the multiple outputs and 
+                # Here we ignore the index of the multiple outputs and
                 # just put all of them together to compute tokens.
                 for choice in output.get("choices", []):
                     if stream_output:

From b3ee42917944ab52d47b0e9a9ef657157ea21e45 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 12:16:20 +0800
Subject: [PATCH 05/15] [benchmark] fix linting

---
 benchmark/profile_restful_api.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 7f3aba4ea..731f02310 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -94,7 +94,7 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
                 req_queue.get, [None, None, None]):
             timestamps = []
             timestamps.append(time.perf_counter())
-            full_output = ""
+            full_output = ''
             for output in client.chat_completions_v1(
                     model=self.model_name,
                     messages=prompt,
@@ -107,11 +107,11 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
                     ignore_eos=True):
                 # Here we ignore the index of the multiple outputs and
                 # just put all of them together to compute tokens.
-                for choice in output.get("choices", []):
+                for choice in output.get('choices', []):
                     if stream_output:
-                        full_output += choice["delta"]["content"]
+                        full_output += choice['delta']['content']
                     else:
-                        full_output += choice["message"]["content"]
+                        full_output += choice['message']['content']
                 timestamps.append(time.perf_counter())
 
             first_token_latency = np.round(timestamps[1] - timestamps[0], 3)

From 7d719f165cb22302c5e06ab80a24cb224f27466a Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 12:19:26 +0800
Subject: [PATCH 06/15] [benchmark] fix linting

---
 benchmark/profile_restful_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 731f02310..d418b8c6a 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -260,7 +260,7 @@ def main(server_addr: str,
         seed (int, optional): Seed used in sampling prompts from dataset. Defaults to 0.
""" # noqa addr_schem = urlparse(server_addr).scheme - if addr_schem not in ["http", "https"]: + if addr_schem not in ['http', 'https']: print(f'[WARNING] server_addr of the api_server should ' f'start with "http://" or "https://", but got "{server_addr}"') server_addr = 'http://' + server_addr.strip() From 0cf60fc9a969b07760d31bdf96339aaed6f74d87 Mon Sep 17 00:00:00 2001 From: rentianyue-jk Date: Fri, 17 May 2024 15:56:39 +0800 Subject: [PATCH 07/15] [benchmark] counting error requests --- benchmark/profile_restful_api.py | 49 +++++++++++++++++++------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py index d418b8c6a..8224d73e0 100644 --- a/benchmark/profile_restful_api.py +++ b/benchmark/profile_restful_api.py @@ -95,23 +95,29 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int, timestamps = [] timestamps.append(time.perf_counter()) full_output = '' - for output in client.chat_completions_v1( - model=self.model_name, - messages=prompt, - temperature=self.temperature, - top_p=self.top_p, - n=1, - max_tokens=output_seqlen, - stream=stream_output, - session_id=session_id, - ignore_eos=True): - # Here we ignore the index of the multiple outputs and - # just put all of them together to compute tokens. - for choice in output.get('choices', []): - if stream_output: - full_output += choice['delta']['content'] - else: - full_output += choice['message']['content'] + failed = 0 + try: + for output in client.chat_completions_v1( + model=self.model_name, + messages=prompt, + temperature=self.temperature, + top_p=self.top_p, + n=1, + max_tokens=output_seqlen, + stream=stream_output, + session_id=session_id, + ignore_eos=True): + # Here we ignore the index of the multiple outputs and + # just put all of them together to compute tokens. 
+                    for choice in output.get('choices', []):
+                        if stream_output:
+                            full_output += choice['delta']['content']
+                        else:
+                            full_output += choice['message']['content']
+                    timestamps.append(time.perf_counter())
+            except Exception as e:
+                print(f'inference failed: {e}')
+                failed = 1
                 timestamps.append(time.perf_counter())
 
             first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
@@ -126,7 +132,7 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
             total_tokens = input_seqlen + real_output_seqlen
             stats.append([
                 first_token_latency, real_output_seqlen, output_seqlen,
-                total_tokens, token_latency, tokenlizer_time
+                total_tokens, token_latency, tokenlizer_time, failed
             ])
             self.pbar.update(1)
 
@@ -170,7 +176,7 @@ def process_request(self,
             #     f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
             stats.append(np.array(_stats))
 
-        stats = np.concatenate(stats).reshape(-1, 6)
+        stats = np.concatenate(stats).reshape(-1, 7)
 
         tokenlizer_time = np.sum(stats[:, 5], axis=0) / concurrency
         elapsed_time -= tokenlizer_time
@@ -178,6 +184,7 @@ def process_request(self,
         first_token_latency_min = np.min(stats[:, 0], axis=0)
         first_token_latency_max = np.max(stats[:, 0], axis=0)
         first_token_latency_ave = np.mean(stats[:, 0], axis=0)
+        failed_requests = np.sum(stats[:, 6], axis=0)
         completion_tokens = np.sum(stats[:, 1], axis=0)
         request_output_tokens = np.sum(stats[:, 2], axis=0)
         total_tokens = np.sum(stats[:, 3], axis=0)
@@ -200,6 +207,10 @@ def process_request(self,
               f'{first_token_latency_min:.3f}s, '
               f'{first_token_latency_max:.3f}s, '
               f'{first_token_latency_ave:.3f}s\n')
+
+        if failed_requests > 0:
+            print(f'number of failed requests: {failed_requests:.0f}\n')
+
         print(
             f'number of prompt tokens: {prompt_tokens:.0f}\n'
             f'number of completion tokens: {completion_tokens:.0f}\n'

From 32ba3baa101199c4b015458d18a3b912a681e28c Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 17:45:19 +0800
Subject: [PATCH 08/15] [benchmark] support role in prompt

---
 benchmark/profile_restful_api.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 8224d73e0..998172bd5 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -19,6 +19,7 @@ def sample_requests(
     dataset_path: str,
     num_requests: int,
     tokenizer: Tokenizer,
+    role: str,
 ) -> List[Tuple[str, int, int]]:
     # Load the dataset.
     with open(dataset_path) as f:
@@ -40,7 +41,9 @@ def sample_requests(
     tokenized_dataset = []
     for i in range(len(dataset)):
         output_len = len(completion_token_ids[i])
-        tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
+        tokenized_dataset.append(
+            ([{"role": role, "content": prompts[i]}], prompt_token_ids[i], output_len)
+        )
 
     # Filter out too long sequences.
     filtered_dataset: List[Tuple[str, int, int]] = []
@@ -247,7 +250,9 @@ def main(server_addr: str,
          temperature: float = 1.0,
          stream_output: bool = False,
          csv: str = './profile_api_server.csv',
-         seed: int = 0):
+         seed: int = 0,
+         role: str = 'user',
+         ):
     """Benchmark the request througput of api server.
 
     Args:
@@ -269,6 +274,7 @@ def main(server_addr: str,
         stream_output (bool, optional): Indicator for streaming output. Defaults to False.
         csv (str, optional): The path to save the result.
         seed (int, optional): Seed used in sampling prompts from dataset. Defaults to 0.
+        role (str, optional): The role of the messages author in prompts. Defaults to 'user'
     """  # noqa
     addr_schem = urlparse(server_addr).scheme
     if addr_schem not in ['http', 'https']:
@@ -287,7 +293,7 @@ def main(server_addr: str,
                     api_key=api_key,
                     model_name=model_name)
 
-    requests = sample_requests(dataset, num_prompts, engine.tokenizer)
+    requests = sample_requests(dataset, num_prompts, engine.tokenizer, role)
 
     engine.process_request(requests, concurrency, stream_output)

From 6bc0dcf167d00cefaec7c73539951bb8098c9f06 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 17:46:55 +0800
Subject: [PATCH 09/15] [benchmark] deal with last empty delta in stream mode

---
 benchmark/profile_restful_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 998172bd5..3f2c7dc71 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -114,7 +114,7 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
                     # just put all of them together to compute tokens.
                     for choice in output.get('choices', []):
                         if stream_output:
-                            full_output += choice['delta']['content']
+                            full_output += choice['delta'].get('content', '')
                         else:
                             full_output += choice['message']['content']
                     timestamps.append(time.perf_counter())

From ef5988cfc3765a855631e19a92e4f729f1400f3e Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 17:54:21 +0800
Subject: [PATCH 10/15] [benchmark] fix linting

---
 benchmark/profile_restful_api.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 3f2c7dc71..b8cfeb80d 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -42,7 +42,11 @@ def sample_requests(
     for i in range(len(dataset)):
         output_len = len(completion_token_ids[i])
         tokenized_dataset.append(
-            ([{"role": role, "content": prompts[i]}], prompt_token_ids[i], output_len)
+            (
+                [{'role': role, 'content': prompts[i]}],
+                prompt_token_ids[i],
+                output_len
+            )
         )
 
     # Filter out too long sequences.

From 52a2e66f3bc9de955c1a5fb1b2288e4f3ae707a2 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 18:01:02 +0800
Subject: [PATCH 11/15] [benchmark] fix linting

---
 benchmark/profile_restful_api.py | 40 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index b8cfeb80d..dc06c6310 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -41,13 +41,10 @@ def sample_requests(
     tokenized_dataset = []
     for i in range(len(dataset)):
         output_len = len(completion_token_ids[i])
-        tokenized_dataset.append(
-            (
-                [{'role': role, 'content': prompts[i]}],
-                prompt_token_ids[i],
-                output_len
-            )
-        )
+        tokenized_dataset.append(([{
+            'role': role,
+            'content': prompts[i]
+        }], prompt_token_ids[i], output_len))
 
     # Filter out too long sequences.
     filtered_dataset: List[Tuple[str, int, int]] = []
@@ -243,20 +240,21 @@ def process_request(self,
                 f'{first_token_latency_ave:.3f}',
             ])
 
-def main(server_addr: str,
-         tokenizer_path: str,
-         dataset: str,
-         api_key: Optional[str] = None,
-         model_name: Optional[str] = None,
-         concurrency: int = 128,
-         num_prompts: int = 5000,
-         top_p: float = 1.0,
-         temperature: float = 1.0,
-         stream_output: bool = False,
-         csv: str = './profile_api_server.csv',
-         seed: int = 0,
-         role: str = 'user',
-         ):
+def main(
+    server_addr: str,
+    tokenizer_path: str,
+    dataset: str,
+    api_key: Optional[str] = None,
+    model_name: Optional[str] = None,
+    concurrency: int = 128,
+    num_prompts: int = 5000,
+    top_p: float = 1.0,
+    temperature: float = 1.0,
+    stream_output: bool = False,
+    csv: str = './profile_api_server.csv',
+    seed: int = 0,
+    role: str = 'user',
+):
     """Benchmark the request througput of api server.
 
     Args:

From 080cb2328e3dbe4a7c56757113d80f39d207a982 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 18:06:34 +0800
Subject: [PATCH 12/15] [benchmark] fix linting

---
 benchmark/profile_restful_api.py | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index dc06c6310..9ae628ff8 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -240,21 +240,19 @@ def process_request(self,
                 f'{first_token_latency_ave:.3f}',
             ])
 
-def main(
-    server_addr: str,
-    tokenizer_path: str,
-    dataset: str,
-    api_key: Optional[str] = None,
-    model_name: Optional[str] = None,
-    concurrency: int = 128,
-    num_prompts: int = 5000,
-    top_p: float = 1.0,
-    temperature: float = 1.0,
-    stream_output: bool = False,
-    csv: str = './profile_api_server.csv',
-    seed: int = 0,
-    role: str = 'user',
-):
+def main(server_addr: str,
+         tokenizer_path: str,
+         dataset: str,
+         api_key: Optional[str] = None,
+         model_name: Optional[str] = None,
+         concurrency: int = 128,
+         num_prompts: int = 5000,
+         top_p: float = 1.0,
+         temperature: float = 1.0,
+         stream_output: bool = False,
+         csv: str = './profile_api_server.csv',
+         seed: int = 0,
+         role: str = 'user'):
     """Benchmark the request througput of api server.
 
     Args:

From fcec4601add0b71f981cef96f6e92df8d1b3f223 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 18:44:39 +0800
Subject: [PATCH 13/15] [benchmark] support openai API

---
 benchmark/profile_restful_api.py    | 6 ++++--
 lmdeploy/serve/openai/api_client.py | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 9ae628ff8..38bc2e9a0 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -109,8 +109,10 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
                         n=1,
                         max_tokens=output_seqlen,
                         stream=stream_output,
-                        session_id=session_id,
-                        ignore_eos=True):
+                        session_id=None,
+                        repetition_penalty=None,
+                        ignore_eos=None,
+                        skip_special_tokens=None):
                     # Here we ignore the index of the multiple outputs and
                     # just put all of them together to compute tokens.
                     for choice in output.get('choices', []):

diff --git a/lmdeploy/serve/openai/api_client.py b/lmdeploy/serve/openai/api_client.py
index 212b9999d..42990f77a 100644
--- a/lmdeploy/serve/openai/api_client.py
+++ b/lmdeploy/serve/openai/api_client.py
@@ -137,7 +137,7 @@ def chat_completions_v1(self,
         pload = {
             k: v
             for k, v in locals().copy().items()
-            if k[:2] != '__' and k not in ['self']
+            if k[:2] != '__' and k not in ['self'] and v != None and v != {}
         }
         response = requests.post(self.chat_completions_v1_url,
                                  headers=self.headers,

From 9becb97c5981d9acce0db7e6623cf1c04a14a1d6 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 18:52:53 +0800
Subject: [PATCH 14/15] [benchmark] fix linting

---
 lmdeploy/serve/openai/api_client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lmdeploy/serve/openai/api_client.py b/lmdeploy/serve/openai/api_client.py
index 42990f77a..88d510086 100644
--- a/lmdeploy/serve/openai/api_client.py
+++ b/lmdeploy/serve/openai/api_client.py
@@ -137,7 +137,7 @@ def chat_completions_v1(self,
         pload = {
             k: v
             for k, v in locals().copy().items()
-            if k[:2] != '__' and k not in ['self'] and v != None and v != {}
+            if k[:2] != '__' and k not in ['self'] and v is not None and v != {}
         }
         response = requests.post(self.chat_completions_v1_url,
                                  headers=self.headers,

From 2768d8277a8b9291ce30b60adc7ab920670d7517 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 18:58:51 +0800
Subject: [PATCH 15/15] [benchmark] fix linting

---
 lmdeploy/serve/openai/api_client.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lmdeploy/serve/openai/api_client.py b/lmdeploy/serve/openai/api_client.py
index 88d510086..6a964ec14 100644
--- a/lmdeploy/serve/openai/api_client.py
+++ b/lmdeploy/serve/openai/api_client.py
@@ -136,8 +136,8 @@ def chat_completions_v1(self,
         """
         pload = {
             k: v
-            for k, v in locals().copy().items()
-            if k[:2] != '__' and k not in ['self'] and v is not None and v != {}
+            for k, v in locals().copy().items() if k[:2] != '__'
+            and k not in ['self'] and v is not None and v != {}
         }
         response = requests.post(self.chat_completions_v1_url,
                                  headers=self.headers,
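Note (not part of the patch series): the sketch below shows one hypothetical way the patched benchmark entry point could be exercised after applying this series. The server URL, tokenizer path and dataset path are placeholders, and the direct import assumes the working directory is the repository's benchmark/ folder; the script is normally launched from the command line via fire.

```python
# Hypothetical usage sketch for the patched benchmark/profile_restful_api.py.
# All paths and the server URL are placeholders; adjust them to your setup.
from profile_restful_api import main  # assumes CWD is the benchmark/ directory

main(
    server_addr='https://127.0.0.1:23333',  # https scheme accepted since PATCH 01
    tokenizer_path='/path/to/tokenizer',
    dataset='/path/to/ShareGPT_V3_unfiltered_cleaned_split.json',
    concurrency=64,
    num_prompts=1000,
    stream_output=True,
    role='user',  # role parameter introduced in PATCH 08
)
```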