From f57a621ec193e06df622dddeda46e30ed1a53d56 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 10:06:05 +0800
Subject: [PATCH 01/15] [benchmark] support https scheme server url

---
 benchmark/profile_restful_api.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 1cf5ea267..610229ab3 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -5,6 +5,7 @@
 from queue import Queue
 from threading import Thread
 from typing import List, Optional, Tuple
+from urllib.parse import urlparse
 
 import fire
 import numpy as np
@@ -241,10 +242,12 @@ def main(server_addr: str,
         csv (str, optional): The path to save the result.
         seed (int, optional): Seed used in sampling prompts from dataset. Defaults to 0.
     """  # noqa
-    if not server_addr.startswith('http://'):
+    addr_schem = urlparse(server_addr).scheme
+    if addr_schem not in ["http", "https"]:
         print(f'[WARNING] server_addr of the api_server should '
-              f'start with "http://", but got "{server_addr}"')
+              f'start with "http://" or "https://", but got "{server_addr}"')
         server_addr = 'http://' + server_addr.strip()
+    print(f'[INFO] using server_addr: {server_addr}')
 
     random.seed(seed)

From 8d6b95bbef1ee9c69a42021dc93f5204e3221734 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 11:28:54 +0800
Subject: [PATCH 02/15] [benchmark] calculate the real output tokens

---
 benchmark/profile_restful_api.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 610229ab3..ab009b189 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -94,6 +94,7 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
                 req_queue.get, [None, None, None]):
             timestamps = []
             timestamps.append(time.perf_counter())
+            full_output = ""
             for output in client.chat_completions_v1(
                     model=self.model_name,
                     messages=prompt,
@@ -104,6 +105,13 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
                     stream=stream_output,
                     session_id=session_id,
                     ignore_eos=True):
+                # Here we ignore the index of the multiple outputs and
+                # just put all of them together to compute tokens.
+                for choice in output.get("choices", []):
+                    if stream_output:
+                        full_output += choice["delta"]["content"]
+                    else:
+                        full_output += choice["message"]["content"]
                 timestamps.append(time.perf_counter())
 
             first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
@@ -111,9 +119,10 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
             # assert output.pop('finish_reason') == 'length', \
             #     f'Error. session_id({session_id}) request {output_seqlen} ' \
             #     f'tokens, but `finish_reason` is not `length`'
-            total_tokens = input_seqlen + output_seqlen
+            real_output_seqlen = len(self.tokenizer(full_output).input_ids)
+            total_tokens = input_seqlen + real_output_seqlen
             stats.append([
-                first_token_latency, output_seqlen, output_seqlen,
+                first_token_latency, real_output_seqlen, output_seqlen,
                 total_tokens, token_latency
             ])
             self.pbar.update(1)

From 1740b5ab063b8d523615f28e30881cad0ed69827 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 11:59:06 +0800
Subject: [PATCH 03/15] [benchmark] calculate local tokenlizer time

---
 benchmark/profile_restful_api.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index ab009b189..ad954a88f 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -119,11 +119,14 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
             # assert output.pop('finish_reason') == 'length', \
             #     f'Error. session_id({session_id}) request {output_seqlen} ' \
             #     f'tokens, but `finish_reason` is not `length`'
+            tokenlizer_start = time.perf_counter()
             real_output_seqlen = len(self.tokenizer(full_output).input_ids)
+            tokenlizer_finish = time.perf_counter()
+            tokenlizer_time = tokenlizer_finish - tokenlizer_start
             total_tokens = input_seqlen + real_output_seqlen
             stats.append([
                 first_token_latency, real_output_seqlen, output_seqlen,
-                total_tokens, token_latency
+                total_tokens, token_latency, tokenlizer_time
             ])
             self.pbar.update(1)
 
@@ -167,7 +170,10 @@ def process_request(self,
             #     f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
             stats.append(np.array(_stats))
 
-        stats = np.concatenate(stats).reshape(-1, 5)
+        stats = np.concatenate(stats).reshape(-1, 6)
+
+        tokenlizer_time = np.sum(stats[:, 5], axis=0) / concurrency
+        elapsed_time -= tokenlizer_time
 
         first_token_latency_min = np.min(stats[:, 0], axis=0)
         first_token_latency_max = np.max(stats[:, 0], axis=0)
@@ -176,6 +182,7 @@ def process_request(self,
         request_output_tokens = np.sum(stats[:, 2], axis=0)
         total_tokens = np.sum(stats[:, 3], axis=0)
         prompt_tokens = total_tokens - completion_tokens
+        local_tokenlizer_throughput = completion_tokens / tokenlizer_time
         completion_token_throughput = completion_tokens / elapsed_time
         total_token_throughput = total_tokens / elapsed_time
         rps = len(requests) / elapsed_time
@@ -196,6 +203,7 @@ def process_request(self,
         print(
             f'number of prompt tokens: {prompt_tokens:.0f}\n'
             f'number of completion tokens: {completion_tokens:.0f}\n'
+            f'local tokenlizer throughput (completion token): {local_tokenlizer_throughput:.3f} token/s\n'  # noqa
             f'token throughput (completion token): {completion_token_throughput:.3f} token/s\n'  # noqa
             f'token throughput (prompt + completion token): {total_token_throughput:.3f} token/s\n'  # noqa
             f'RPS (request per second): {rps:.3f} req/s\n'

From 3acbf636f76be324d366e45895ecc1bc5912acb9 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 12:09:27 +0800
Subject: [PATCH 04/15] [benchmark] fix linting

---
 benchmark/profile_restful_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index ad954a88f..7f3aba4ea 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -105,7 +105,7 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
                     stream=stream_output,
                     session_id=session_id,
                     ignore_eos=True):
-                # Here we ignore the index of the multiple outputs and 
+                # Here we ignore the index of the multiple outputs and
                 # just put all of them together to compute tokens.
                 for choice in output.get("choices", []):
                     if stream_output:

From b3ee42917944ab52d47b0e9a9ef657157ea21e45 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 12:16:20 +0800
Subject: [PATCH 05/15] [benchmark] fix linting

---
 benchmark/profile_restful_api.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 7f3aba4ea..731f02310 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -94,7 +94,7 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
                 req_queue.get, [None, None, None]):
             timestamps = []
             timestamps.append(time.perf_counter())
-            full_output = ""
+            full_output = ''
             for output in client.chat_completions_v1(
                     model=self.model_name,
                     messages=prompt,
@@ -107,11 +107,11 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
                     ignore_eos=True):
                 # Here we ignore the index of the multiple outputs and
                 # just put all of them together to compute tokens.
-                for choice in output.get("choices", []):
+                for choice in output.get('choices', []):
                     if stream_output:
-                        full_output += choice["delta"]["content"]
+                        full_output += choice['delta']['content']
                     else:
-                        full_output += choice["message"]["content"]
+                        full_output += choice['message']['content']
                 timestamps.append(time.perf_counter())
 
             first_token_latency = np.round(timestamps[1] - timestamps[0], 3)

From 7d719f165cb22302c5e06ab80a24cb224f27466a Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 12:19:26 +0800
Subject: [PATCH 06/15] [benchmark] fix linting

---
 benchmark/profile_restful_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 731f02310..d418b8c6a 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -260,7 +260,7 @@ def main(server_addr: str,
         seed (int, optional): Seed used in sampling prompts from dataset. Defaults to 0.
""" # noqa addr_schem = urlparse(server_addr).scheme - if addr_schem not in ["http", "https"]: + if addr_schem not in ['http', 'https']: print(f'[WARNING] server_addr of the api_server should ' f'start with "http://" or "https://", but got "{server_addr}"') server_addr = 'http://' + server_addr.strip() From 0cf60fc9a969b07760d31bdf96339aaed6f74d87 Mon Sep 17 00:00:00 2001 From: rentianyue-jk Date: Fri, 17 May 2024 15:56:39 +0800 Subject: [PATCH 07/15] [benchmark] counting error requests --- benchmark/profile_restful_api.py | 49 +++++++++++++++++++------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py index d418b8c6a..8224d73e0 100644 --- a/benchmark/profile_restful_api.py +++ b/benchmark/profile_restful_api.py @@ -95,23 +95,29 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int, timestamps = [] timestamps.append(time.perf_counter()) full_output = '' - for output in client.chat_completions_v1( - model=self.model_name, - messages=prompt, - temperature=self.temperature, - top_p=self.top_p, - n=1, - max_tokens=output_seqlen, - stream=stream_output, - session_id=session_id, - ignore_eos=True): - # Here we ignore the index of the multiple outputs and - # just put all of them together to compute tokens. - for choice in output.get('choices', []): - if stream_output: - full_output += choice['delta']['content'] - else: - full_output += choice['message']['content'] + failed = 0 + try: + for output in client.chat_completions_v1( + model=self.model_name, + messages=prompt, + temperature=self.temperature, + top_p=self.top_p, + n=1, + max_tokens=output_seqlen, + stream=stream_output, + session_id=session_id, + ignore_eos=True): + # Here we ignore the index of the multiple outputs and + # just put all of them together to compute tokens. 
+                    for choice in output.get('choices', []):
+                        if stream_output:
+                            full_output += choice['delta']['content']
+                        else:
+                            full_output += choice['message']['content']
+                    timestamps.append(time.perf_counter())
+            except Exception as e:
+                print(f'inference failed: {e}')
+                failed = 1
                 timestamps.append(time.perf_counter())
 
             first_token_latency = np.round(timestamps[1] - timestamps[0], 3)
@@ -126,7 +132,7 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
             total_tokens = input_seqlen + real_output_seqlen
             stats.append([
                 first_token_latency, real_output_seqlen, output_seqlen,
-                total_tokens, token_latency, tokenlizer_time
+                total_tokens, token_latency, tokenlizer_time, failed
             ])
             self.pbar.update(1)
 
@@ -170,7 +176,7 @@ def process_request(self,
             #     f'session {session_id} stats: \n{_stats}\n{"-" * 50}\n')
             stats.append(np.array(_stats))
 
-        stats = np.concatenate(stats).reshape(-1, 6)
+        stats = np.concatenate(stats).reshape(-1, 7)
 
         tokenlizer_time = np.sum(stats[:, 5], axis=0) / concurrency
         elapsed_time -= tokenlizer_time
@@ -178,6 +184,7 @@ def process_request(self,
         first_token_latency_min = np.min(stats[:, 0], axis=0)
         first_token_latency_max = np.max(stats[:, 0], axis=0)
         first_token_latency_ave = np.mean(stats[:, 0], axis=0)
+        failed_requests = np.sum(stats[:, 6], axis=0)
         completion_tokens = np.sum(stats[:, 1], axis=0)
         request_output_tokens = np.sum(stats[:, 2], axis=0)
         total_tokens = np.sum(stats[:, 3], axis=0)
@@ -200,6 +207,10 @@ def process_request(self,
               f'{first_token_latency_min:.3f}s, '
               f'{first_token_latency_max:.3f}s, '
               f'{first_token_latency_ave:.3f}s\n')
+
+        if failed_requests > 0:
+            print(f'number of failed requests: {failed_requests:.0f}\n')
+
         print(
             f'number of prompt tokens: {prompt_tokens:.0f}\n'
             f'number of completion tokens: {completion_tokens:.0f}\n'

From 32ba3baa101199c4b015458d18a3b912a681e28c Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 17:45:19 +0800
Subject: [PATCH 08/15] [benchmark] support role in prompt

---
 benchmark/profile_restful_api.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 8224d73e0..998172bd5 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -19,6 +19,7 @@ def sample_requests(
     dataset_path: str,
     num_requests: int,
     tokenizer: Tokenizer,
+    role: str,
 ) -> List[Tuple[str, int, int]]:
     # Load the dataset.
     with open(dataset_path) as f:
@@ -40,7 +41,9 @@ def sample_requests(
     tokenized_dataset = []
     for i in range(len(dataset)):
         output_len = len(completion_token_ids[i])
-        tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
+        tokenized_dataset.append(
+            ([{"role": role, "content": prompts[i]}], prompt_token_ids[i], output_len)
+        )
 
     # Filter out too long sequences.
     filtered_dataset: List[Tuple[str, int, int]] = []
@@ -247,7 +250,9 @@ def main(server_addr: str,
          temperature: float = 1.0,
          stream_output: bool = False,
          csv: str = './profile_api_server.csv',
-         seed: int = 0):
+         seed: int = 0,
+         role: str = 'user',
+         ):
     """Benchmark the request througput of api server.
 
     Args:
@@ -269,6 +274,7 @@ def main(server_addr: str,
         stream_output (bool, optional): Indicator for streaming output. Defaults to False.
         csv (str, optional): The path to save the result.
         seed (int, optional): Seed used in sampling prompts from dataset. Defaults to 0.
+        role (str, optional): The role of the messages author in prompts. Defaults to 'user'
     """  # noqa
     addr_schem = urlparse(server_addr).scheme
     if addr_schem not in ['http', 'https']:
@@ -287,7 +293,7 @@ def main(server_addr: str,
                     api_key=api_key,
                     model_name=model_name)
 
-    requests = sample_requests(dataset, num_prompts, engine.tokenizer)
+    requests = sample_requests(dataset, num_prompts, engine.tokenizer, role)
 
     engine.process_request(requests, concurrency, stream_output)

From 6bc0dcf167d00cefaec7c73539951bb8098c9f06 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 17:46:55 +0800
Subject: [PATCH 09/15] [benchmark] deal with last empty delta in stream mode

---
 benchmark/profile_restful_api.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 998172bd5..3f2c7dc71 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -114,7 +114,7 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
                     # just put all of them together to compute tokens.
                     for choice in output.get('choices', []):
                         if stream_output:
-                            full_output += choice['delta']['content']
+                            full_output += choice['delta'].get('content', '')
                         else:
                             full_output += choice['message']['content']
                     timestamps.append(time.perf_counter())

From ef5988cfc3765a855631e19a92e4f729f1400f3e Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 17:54:21 +0800
Subject: [PATCH 10/15] [benchmark] fix linting

---
 benchmark/profile_restful_api.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 3f2c7dc71..b8cfeb80d 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -42,7 +42,11 @@ def sample_requests(
     for i in range(len(dataset)):
         output_len = len(completion_token_ids[i])
         tokenized_dataset.append(
-            ([{"role": role, "content": prompts[i]}], prompt_token_ids[i], output_len)
+            (
+                [{'role': role, 'content': prompts[i]}],
+                prompt_token_ids[i],
+                output_len
+            )
         )
 
     # Filter out too long sequences.

From 52a2e66f3bc9de955c1a5fb1b2288e4f3ae707a2 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 18:01:02 +0800
Subject: [PATCH 11/15] [benchmark] fix linting

---
 benchmark/profile_restful_api.py | 40 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index b8cfeb80d..dc06c6310 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -41,13 +41,10 @@ def sample_requests(
     tokenized_dataset = []
     for i in range(len(dataset)):
         output_len = len(completion_token_ids[i])
-        tokenized_dataset.append(
-            (
-                [{'role': role, 'content': prompts[i]}],
-                prompt_token_ids[i],
-                output_len
-            )
-        )
+        tokenized_dataset.append(([{
+            'role': role,
+            'content': prompts[i]
+        }], prompt_token_ids[i], output_len))
 
     # Filter out too long sequences.
     filtered_dataset: List[Tuple[str, int, int]] = []
@@ -243,20 +240,21 @@ def process_request(self,
                 f'{first_token_latency_ave:.3f}',
             ])
 
-def main(server_addr: str,
-         tokenizer_path: str,
-         dataset: str,
-         api_key: Optional[str] = None,
-         model_name: Optional[str] = None,
-         concurrency: int = 128,
-         num_prompts: int = 5000,
-         top_p: float = 1.0,
-         temperature: float = 1.0,
-         stream_output: bool = False,
-         csv: str = './profile_api_server.csv',
-         seed: int = 0,
-         role: str = 'user',
-         ):
+def main(
+    server_addr: str,
+    tokenizer_path: str,
+    dataset: str,
+    api_key: Optional[str] = None,
+    model_name: Optional[str] = None,
+    concurrency: int = 128,
+    num_prompts: int = 5000,
+    top_p: float = 1.0,
+    temperature: float = 1.0,
+    stream_output: bool = False,
+    csv: str = './profile_api_server.csv',
+    seed: int = 0,
+    role: str = 'user',
+):
     """Benchmark the request througput of api server.
 
     Args:

From 080cb2328e3dbe4a7c56757113d80f39d207a982 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 18:06:34 +0800
Subject: [PATCH 12/15] [benchmark] fix linting

---
 benchmark/profile_restful_api.py | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index dc06c6310..9ae628ff8 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -240,21 +240,19 @@ def process_request(self,
                 f'{first_token_latency_ave:.3f}',
             ])
 
-def main(
-    server_addr: str,
-    tokenizer_path: str,
-    dataset: str,
-    api_key: Optional[str] = None,
-    model_name: Optional[str] = None,
-    concurrency: int = 128,
-    num_prompts: int = 5000,
-    top_p: float = 1.0,
-    temperature: float = 1.0,
-    stream_output: bool = False,
-    csv: str = './profile_api_server.csv',
-    seed: int = 0,
-    role: str = 'user',
-):
+def main(server_addr: str,
+         tokenizer_path: str,
+         dataset: str,
+         api_key: Optional[str] = None,
+         model_name: Optional[str] = None,
+         concurrency: int = 128,
+         num_prompts: int = 5000,
+         top_p: float = 1.0,
+         temperature: float = 1.0,
+         stream_output: bool = False,
+         csv: str = './profile_api_server.csv',
+         seed: int = 0,
+         role: str = 'user'):
     """Benchmark the request througput of api server.
 
     Args:

From fcec4601add0b71f981cef96f6e92df8d1b3f223 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 18:44:39 +0800
Subject: [PATCH 13/15] [benchmark] support openai API

---
 benchmark/profile_restful_api.py    | 6 ++++--
 lmdeploy/serve/openai/api_client.py | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/benchmark/profile_restful_api.py b/benchmark/profile_restful_api.py
index 9ae628ff8..38bc2e9a0 100644
--- a/benchmark/profile_restful_api.py
+++ b/benchmark/profile_restful_api.py
@@ -109,8 +109,10 @@ def _inference(self, req_queue: Queue, res_queue: Queue, session_id: int,
                         n=1,
                         max_tokens=output_seqlen,
                         stream=stream_output,
-                        session_id=session_id,
-                        ignore_eos=True):
+                        session_id=None,
+                        repetition_penalty=None,
+                        ignore_eos=None,
+                        skip_special_tokens=None):
                     # Here we ignore the index of the multiple outputs and
                     # just put all of them together to compute tokens.
                     for choice in output.get('choices', []):

diff --git a/lmdeploy/serve/openai/api_client.py b/lmdeploy/serve/openai/api_client.py
index 212b9999d..42990f77a 100644
--- a/lmdeploy/serve/openai/api_client.py
+++ b/lmdeploy/serve/openai/api_client.py
@@ -137,7 +137,7 @@ def chat_completions_v1(self,
         pload = {
             k: v
             for k, v in locals().copy().items()
-            if k[:2] != '__' and k not in ['self']
+            if k[:2] != '__' and k not in ['self'] and v != None and v != {}
         }
         response = requests.post(self.chat_completions_v1_url,
                                  headers=self.headers,

From 9becb97c5981d9acce0db7e6623cf1c04a14a1d6 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 18:52:53 +0800
Subject: [PATCH 14/15] [benchmark] fix linting

---
 lmdeploy/serve/openai/api_client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lmdeploy/serve/openai/api_client.py b/lmdeploy/serve/openai/api_client.py
index 42990f77a..88d510086 100644
--- a/lmdeploy/serve/openai/api_client.py
+++ b/lmdeploy/serve/openai/api_client.py
@@ -137,7 +137,7 @@ def chat_completions_v1(self,
         pload = {
             k: v
             for k, v in locals().copy().items()
-            if k[:2] != '__' and k not in ['self'] and v != None and v != {}
+            if k[:2] != '__' and k not in ['self'] and v is not None and v != {}
         }
         response = requests.post(self.chat_completions_v1_url,
                                  headers=self.headers,

From 2768d8277a8b9291ce30b60adc7ab920670d7517 Mon Sep 17 00:00:00 2001
From: rentianyue-jk
Date: Fri, 17 May 2024 18:58:51 +0800
Subject: [PATCH 15/15] [benchmark] fix linting

---
 lmdeploy/serve/openai/api_client.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lmdeploy/serve/openai/api_client.py b/lmdeploy/serve/openai/api_client.py
index 88d510086..6a964ec14 100644
--- a/lmdeploy/serve/openai/api_client.py
+++ b/lmdeploy/serve/openai/api_client.py
@@ -136,8 +136,8 @@ def chat_completions_v1(self,
         """
         pload = {
             k: v
-            for k, v in locals().copy().items()
-            if k[:2] != '__' and k not in ['self'] and v is not None and v != {}
+            for k, v in locals().copy().items() if k[:2] != '__'
+            and k not in ['self'] and v is not None and v != {}
         }
         response = requests.post(self.chat_completions_v1_url,
                                  headers=self.headers,
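Note (not part of the patch series): the sketch below shows one hypothetical way the patched benchmark entry point could be exercised after applying this series. The server URL, tokenizer path and dataset path are placeholders, and the direct import assumes the working directory is the repository's benchmark/ folder; the script is normally launched from the command line via fire.

```python
# Hypothetical usage sketch for the patched benchmark/profile_restful_api.py.
# All paths and the server URL are placeholders; adjust them to your setup.
from profile_restful_api import main  # assumes CWD is the benchmark/ directory

main(
    server_addr='https://127.0.0.1:23333',  # https scheme accepted since PATCH 01
    tokenizer_path='/path/to/tokenizer',
    dataset='/path/to/ShareGPT_V3_unfiltered_cleaned_split.json',
    concurrency=64,
    num_prompts=1000,
    stream_output=True,
    role='user',  # role parameter introduced in PATCH 08
)
```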