support qwen2-vl with turbomind backend #2720

Open · wants to merge 1 commit into base: main
2 changes: 1 addition & 1 deletion docs/en/multi_modal/qwen2_vl.md
@@ -5,7 +5,7 @@ LMDeploy supports the following Qwen-VL series of models, which are detailed in
| Model | Size | Supported Inference Engine |
| :----------: | :----: | :------------------------: |
| Qwen-VL-Chat | - | TurboMind, PyTorch |
| Qwen2-VL | 2B, 7B | PyTorch |
| Qwen2-VL | 2B-72B | TurboMind, PyTorch |

The next chapter demonstrates how to deploy a Qwen-VL model using LMDeploy, with [Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) as an example.

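With the doc table above now listing TurboMind for Qwen2-VL, a minimal usage sketch of how the new backend would be exercised may be useful. It assumes the public lmdeploy pipeline API (`pipeline`, `TurbomindEngineConfig`, `lmdeploy.vl.load_image`) and an illustrative image URL; none of it is part of this diff.

```python
# Sketch only: running Qwen2-VL on the TurboMind backend via the lmdeploy
# pipeline API; the model id and image URL are illustrative.
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image

pipe = pipeline('Qwen/Qwen2-VL-7B-Instruct',
                backend_config=TurbomindEngineConfig(session_len=8192))
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
response = pipe(('describe this image', image))
print(response.text)
```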
1 change: 1 addition & 0 deletions docs/en/supported_models/supported_models.md
@@ -19,6 +19,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes |
| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes |
| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes |
| Qwen2-VL | 2B - 72B | MLLM | Yes | Yes | Yes | Yes |
| Mistral | 7B | LLM | Yes | Yes | Yes | Yes |
| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes |
| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
2 changes: 1 addition & 1 deletion docs/zh_cn/multi_modal/qwen2_vl.md
@@ -5,7 +5,7 @@ LMDeploy supports the Qwen-VL series of models, as listed below:
| Model | Size | Supported Inference Engine |
| :----------: | :----: | :------------------------: |
| Qwen-VL-Chat | - | TurboMind, PyTorch |
| Qwen2-VL | 2B, 7B | PyTorch |
| Qwen2-VL | 2B-72B | TurboMind, PyTorch |

This article takes [Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) as an example to demonstrate how to deploy Qwen2-VL series models with LMDeploy.

1 change: 1 addition & 0 deletions docs/zh_cn/supported_models/supported_models.md
@@ -19,6 +19,7 @@
| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes |
| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes |
| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes |
| Qwen2-VL | 2B - 72B | MLLM | Yes | Yes | Yes | Yes |
| Mistral | 7B | LLM | Yes | Yes | Yes | Yes |
| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes |
| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
2 changes: 2 additions & 0 deletions lmdeploy/turbomind/deploy/config.py
@@ -2,6 +2,7 @@
import inspect
import json
from dataclasses import asdict, fields
from typing import List

# use pydantic.dataclasses.dataclass to check data type
from pydantic.dataclasses import dataclass
@@ -73,6 +74,7 @@ class AttentionConfig:
    high_freq_factor: float = 1.0
    beta_fast: float = 32.0
    beta_slow: float = 1.0
    mrope_section: List[int] = None
    use_logn_attn: int = 0
    cache_block_seq_len: int = 64

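For context on the new `mrope_section` field: Qwen2-VL's multimodal RoPE splits the rotary frequency pairs into three bands that take their position from the temporal, height and width indices respectively. A rough sketch of that selection, assuming the [16, 24, 24] split Qwen2-VL checkpoints are expected to ship (the values are an assumption, not read from this diff):

```python
# Illustrative only: which position component drives each rotary frequency pair
# for a given mrope_section = (t_dims, h_dims, w_dims). Not the kernel code.
def mrope_position_for_freq(freq_idx, pos_thw, mrope_section=(16, 24, 24)):
    """pos_thw is the (temporal, height, width) position of one token."""
    t_dims, h_dims, w_dims = mrope_section
    if freq_idx < t_dims:             # first band rotates with the temporal index
        return pos_thw[0]
    if freq_idx < t_dims + h_dims:    # next band rotates with the image row
        return pos_thw[1]
    return pos_thw[2]                 # last band rotates with the image column
```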
9 changes: 9 additions & 0 deletions lmdeploy/turbomind/deploy/source_model/qwen.py
@@ -119,4 +119,13 @@ def tokenizer_info(self):
    def model_info(self):
        cfg = super().model_info()
        cfg['attn_bias'] = 1
        params_path = osp.join(self.model_path, 'config.json')
        with open(params_path) as f:
            config = json.load(f)
        # config.json of plain Qwen2 checkpoints may omit rope_scaling entirely
        rope_scaling = config.get('rope_scaling')
        if rope_scaling is not None:
            if rope_scaling.get('type', '') == 'mrope':
                selection = rope_scaling['mrope_section']
                cfg['rope_scaling_type'] = 'mrope'
                cfg['mrope_section'] = selection
        return cfg
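To illustrate what the parsing above picks up, the relevant slice of a Qwen2-VL `config.json`, written as a Python dict and run through the same logic, would look roughly like this (the concrete values are assumed, check the actual checkpoint):

```python
# Hypothetical excerpt of a Qwen2-VL config.json (values assumed), run through
# the same branch model_info() takes above.
config = {
    'rope_theta': 1000000.0,
    'rope_scaling': {'type': 'mrope', 'mrope_section': [16, 24, 24]},
}

cfg = {}
rope_scaling = config.get('rope_scaling')
if rope_scaling is not None and rope_scaling.get('type', '') == 'mrope':
    cfg['rope_scaling_type'] = 'mrope'
    cfg['mrope_section'] = rope_scaling['mrope_section']
print(cfg)  # {'rope_scaling_type': 'mrope', 'mrope_section': [16, 24, 24]}
```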
2 changes: 2 additions & 0 deletions lmdeploy/turbomind/supported_models.py
@@ -20,6 +20,8 @@
    QWenLMHeadModel='qwen',
    # Qwen2
    Qwen2ForCausalLM='qwen2',
    # Qwen2-VL
    Qwen2VLForConditionalGeneration='qwen2',
    # mistral
    MistralForCausalLM='llama',
    # llava
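Mapping `Qwen2VLForConditionalGeneration` to the existing 'qwen2' reader means the language-model weights are converted like a plain Qwen2 checkpoint. Assuming the module keeps exposing an `is_supported` helper (not shown in this hunk), a quick check could look like:

```python
# Assumption: lmdeploy.turbomind.supported_models exposes is_supported();
# with this change it should report a local Qwen2-VL checkpoint as supported.
from lmdeploy.turbomind.supported_models import is_supported

print(is_supported('/path/to/Qwen2-VL-7B-Instruct'))  # expected: True
```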
22 changes: 22 additions & 0 deletions lmdeploy/turbomind/turbomind.py
@@ -515,6 +515,8 @@ def prepare_inputs(self,
gen_config: GenerationConfig,
input_embeddings=None,
input_embedding_ranges=None,
mrope_position_ids=None,
mrope_position_delta=None,
sequence_start: bool = True,
sequence_end: bool = False,
step=0,
@@ -572,6 +574,18 @@ def _broadcast_np(data, dtype, shape=(batch_size, )):
        inputs['input_embeddings'] = input_embeddings
        inputs['input_embedding_ranges'] = input_embedding_ranges

        if mrope_position_ids is not None:
            assert isinstance(mrope_position_ids, torch.Tensor)
            assert isinstance(mrope_position_delta, torch.Tensor)
            assert input_lengths.size(0) == 1
            assert mrope_position_ids.size(-1) == input_ids.size(-1)
            mrope_position_ids = pad_sequence([mrope_position_ids],
                                              batch_first=True,
                                              padding_value=-1).transpose(
                                                  1, 2).int().reshape(1, -1)
            inputs['mrope_position_ids'] = mrope_position_ids
            inputs['mrope_position_delta'] = mrope_position_delta

        if gen_config.min_new_tokens is not None:
            inputs['min_length'] = _broadcast_np(gen_config.min_new_tokens,
                                                 np.int32)
@@ -611,6 +625,8 @@ async def async_stream_infer(self,
input_ids,
input_embeddings=None,
input_embedding_ranges=None,
mrope_position_ids=None,
mrope_position_delta=None,
sequence_start: bool = True,
sequence_end: bool = False,
step=0,
@@ -648,6 +664,8 @@ async def async_stream_infer(self,
input_ids=input_ids,
input_embeddings=input_embeddings,
input_embedding_ranges=input_embedding_ranges,
mrope_position_ids=mrope_position_ids,
mrope_position_delta=mrope_position_delta,
sequence_start=sequence_start,
sequence_end=sequence_end,
step=step,
@@ -734,6 +752,8 @@ def stream_infer(self,
input_ids,
input_embeddings=None,
input_embedding_ranges=None,
mrope_position_ids=None,
mrope_position_delta=None,
sequence_start: bool = True,
sequence_end: bool = False,
step=0,
@@ -766,6 +786,8 @@ def stream_infer(self,
input_ids=input_ids,
input_embeddings=input_embeddings,
input_embedding_ranges=input_embedding_ranges,
mrope_position_ids=mrope_position_ids,
mrope_position_delta=mrope_position_delta,
sequence_start=sequence_start,
sequence_end=sequence_end,
step=step,
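The `mrope_position_ids` / `mrope_position_delta` arguments threaded through `prepare_inputs`, `async_stream_infer` and `stream_infer` are expected from the vision-language preprocessing as a `(3, seq_len)` tensor of temporal/height/width positions plus a scalar delta. A simplified sketch of how such ids could be built for one text-then-image prompt, as an illustration of the M-RoPE convention rather than the exact HF `get_rope_index` logic:

```python
import torch


def build_mrope_position_ids(num_text_tokens: int, grid_h: int, grid_w: int):
    """Simplified M-RoPE positions for [text tokens][image tokens]: text tokens
    share one index across (t, h, w); image tokens keep a constant temporal
    index while enumerating grid rows and columns."""
    t_text = torch.arange(num_text_tokens)
    text_pos = torch.stack([t_text, t_text, t_text])          # (3, T)

    t_img = torch.full((grid_h * grid_w,), num_text_tokens)   # constant time step
    h_img = torch.arange(grid_h).repeat_interleave(grid_w)    # row index per patch
    w_img = torch.arange(grid_w).repeat(grid_h)                # column index per patch
    img_pos = torch.stack([t_img, h_img, w_img])               # (3, H*W)

    pos = torch.cat([text_pos, img_pos], dim=-1)               # (3, seq_len)
    # delta lets decoding continue from max(pos) + 1 instead of seq_len
    delta = (pos.max() + 1 - pos.size(-1)).reshape(1, 1)
    return pos, delta
```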
8 changes: 7 additions & 1 deletion src/turbomind/kernels/attention/attention_params.h
@@ -67,10 +67,16 @@ struct AttentionParams {
float llama3_inv_scaling_factor;
float llama3_alpha;
float llama3_beta;
// the following are use by yarn
// the following are used by yarn
float yarn_ramp_inv_factor_div_2;
float yarn_ramp_inv_factor_mul_min;
float yarn_inv_scaling_factor;
// the following are used by qwen2-vl
int3 mrope_section;
int* mrope_position_ids; // 3 x session_len_
int mrope_offset; // session_len_
int* mrope_position_delta;
int* mrope_position_length;

// log(n) attention
bool use_logn_attn;
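Given the `3 x session_len_` comment on `mrope_position_ids` and the `pad_sequence(...).transpose(1, 2).reshape(1, -1)` packing in `turbomind.py`, the three components appear to be interleaved per token, with a per-sequence stride of `3 * mrope_offset`. A small sketch of that packing and the lookup it implies; the per-token layout on the kernel side is inferred from the Python packing, not shown in this diff:

```python
import torch

pos = torch.tensor([[0, 1, 2, 3],    # temporal
                    [0, 0, 1, 1],    # height
                    [0, 1, 0, 1]])   # width -> shape (3, seq_len)

# same packing as prepare_inputs (single sequence): (3, N) -> (1, N, 3) -> flat
flat = pos.unsqueeze(0).transpose(1, 2).reshape(1, -1)  # [t0, h0, w0, t1, h1, w1, ...]


def component(flat_ids, mrope_offset, batch_idx, token_idx, comp):
    # mirrors `mrope_position_ids + batch_idx * 3 * mrope_offset` in the kernels,
    # assuming the interleaved per-token layout produced above
    base = batch_idx * 3 * mrope_offset
    return flat_ids.view(-1)[base + 3 * token_idx + comp]


assert component(flat, mrope_offset=4, batch_idx=0, token_idx=2, comp=2) == 0
```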
12 changes: 12 additions & 0 deletions src/turbomind/kernels/attention/attention_universal.h
@@ -223,6 +223,14 @@ struct AttentionUniversal {

ApplyBias(vec_Q, vec_K, vec_V, params, head_idx, kv_head_idx, offset);

int* mrope_ids = nullptr;
int mrope_length = 0;
int mrope_delta = 0;
if (params.mrope_position_ids != nullptr) {
mrope_ids = params.mrope_position_ids + batch_idx * 3 * params.mrope_offset;
mrope_length = params.mrope_position_length[batch_idx];
mrope_delta = params.mrope_position_delta[batch_idx];
}
const float rope_base = params.rope_theta ? params.rope_theta[batch_idx] : params.rotary_embedding_base;
PRAGMA_UNROLL
for (int c = 0; c < ITER_C; ++c) {
@@ -239,6 +247,10 @@ struct AttentionUniversal {
params.yarn_ramp_inv_factor_mul_min,
params.yarn_inv_scaling_factor,
params.attention_scaling,
params.mrope_section,
mrope_ids,
mrope_length,
mrope_delta,
std::integral_constant<int, kVecSize>{});
PRAGMA_UNROLL
for (int s = 0; s < ITER_S; ++s) {
64 changes: 64 additions & 0 deletions src/turbomind/kernels/attention/kv_cache_utils_v2.cu
@@ -31,6 +31,11 @@ __global__ void __launch_bounds__(128) ProcessKV_v2(char** blocks,
float yarn_ramp_inv_factor_mul_min,
float yarn_inv_scaling_factor,
float attention_scaling,
int3 mrope_section,
int* mrope_position_ids,
int mrope_offset,
int* mrope_position_delta,
int* mrope_position_length,
int64_t stride_b,
int64_t stride_c,
int64_t stride_h,
@@ -125,6 +130,14 @@ __global__ void __launch_bounds__(128) ProcessKV_v2(char** blocks,
}

if (rope_base) {
int* mrope_ids = nullptr;
int mrope_length = 0;
int mrope_delta = 0;
if (mrope_position_ids != nullptr) {
mrope_ids = mrope_position_ids + batch_idx * 3 * mrope_offset;
mrope_length = mrope_position_length[batch_idx];
mrope_delta = mrope_position_delta[batch_idx];
}
float base = rope_base[batch_idx];
PRAGMA_UNROLL
for (int c = 0; c < ITER_C; ++c) {
Expand All @@ -141,6 +154,10 @@ __global__ void __launch_bounds__(128) ProcessKV_v2(char** blocks,
yarn_ramp_inv_factor_mul_min,
yarn_inv_scaling_factor,
attention_scaling,
mrope_section,
mrope_ids,
mrope_length,
mrope_delta,
std::integral_constant<int, kVecSize>{});
PRAGMA_UNROLL
for (int s = 0; s < ITER_S; ++s) {
@@ -222,6 +239,11 @@ void invokeProcessKV_v2(char** blocks,
float yarn_ramp_inv_factor_mul_min,
float yarn_inv_scaling_factor,
float attention_scaling,
int3 mrope_section,
int* mrope_position_ids,
int mrope_offset,
int* mrope_position_delta,
int* mrope_position_length,
int64_t stride_b,
int64_t stride_c,
int64_t stride_h,
@@ -268,6 +290,11 @@ void invokeProcessKV_v2(char** blocks,
yarn_ramp_inv_factor_mul_min,
yarn_inv_scaling_factor,
attention_scaling,
mrope_section,
mrope_position_ids,
mrope_offset,
mrope_position_delta,
mrope_position_length,
stride_b,
stride_c,
stride_h,
@@ -307,6 +334,11 @@ void invokeProcessKV_v2(char** blocks,
float yarn_ramp_inv_factor_mul_min, \
float yarn_inv_scaling_factor, \
float attention_scaling, \
int3 mrope_section, \
int* mrope_position_ids, \
int mrope_offset, \
int* mrope_position_delta, \
int* mrope_position_length, \
int64_t stride_b, \
int64_t stride_c, \
int64_t stride_h, \
@@ -342,6 +374,11 @@ __global__ void __launch_bounds__(128) flattenKV_v2(T* k,
float yarn_ramp_inv_factor_mul_min,
float yarn_inv_scaling_factor,
float attention_scaling,
int3 mrope_section,
int* mrope_position_ids,
int mrope_offset,
int* mrope_position_delta,
int* mrope_position_length,
int64_t stride_b,
int64_t stride_c,
int64_t stride_h,
@@ -419,6 +456,14 @@ __global__ void __launch_bounds__(128) flattenKV_v2(T* k,
}

if (rope_base) {
int* mrope_ids = nullptr;
int mrope_length = 0;
int mrope_delta = 0;
if (mrope_position_ids != nullptr) {
mrope_ids = mrope_position_ids + batch_idx * 3 * mrope_offset;
mrope_length = mrope_position_length[batch_idx];
mrope_delta = mrope_position_delta[batch_idx];
}
float base = rope_base[batch_idx];
PRAGMA_UNROLL
for (int c = 0; c < ITER_C; ++c) {
@@ -435,6 +480,10 @@ __global__ void __launch_bounds__(128) flattenKV_v2(T* k,
yarn_ramp_inv_factor_mul_min,
yarn_inv_scaling_factor,
attention_scaling,
mrope_section,
mrope_ids,
mrope_length,
mrope_delta,
std::integral_constant<int, kVecSize>{});
PRAGMA_UNROLL
for (int s = 0; s < ITER_S; ++s) {
@@ -477,6 +526,11 @@ void invokeFlattenKV_v2(T* k,
float yarn_ramp_inv_factor_mul_min,
float yarn_inv_scaling_factor,
float attention_scaling,
int3 mrope_section,
int* mrope_position_ids,
int mrope_offset,
int* mrope_position_delta,
int* mrope_position_length,
int64_t stride_b,
int64_t stride_c,
int64_t stride_h,
@@ -520,6 +574,11 @@ void invokeFlattenKV_v2(T* k,
yarn_ramp_inv_factor_mul_min,
yarn_inv_scaling_factor,
attention_scaling,
mrope_section,
mrope_position_ids,
mrope_offset,
mrope_position_delta,
mrope_position_length,
stride_b,
stride_c,
stride_h,
@@ -556,6 +615,11 @@ void invokeFlattenKV_v2(T* k,
float yarn_ramp_inv_factor_mul_min, \
float yarn_inv_scaling_factor, \
float attention_scaling, \
int3 mrope_section, \
int* mrope_position_ids, \
int mrope_offset, \
int* mrope_position_delta, \
int* mrope_position_length, \
int64_t stride_b, \
int64_t stride_c, \
int64_t stride_h, \