support qwen2-vl with turbomind backend #2720

Open · wants to merge 1 commit into base: main
2 changes: 1 addition & 1 deletion docs/en/multi_modal/qwen2_vl.md
@@ -5,7 +5,7 @@ LMDeploy supports the following Qwen-VL series of models, which are detailed in
| Model | Size | Supported Inference Engine |
| :----------: | :----: | :------------------------: |
| Qwen-VL-Chat | - | TurboMind, PyTorch |
| Qwen2-VL | 2B, 7B | PyTorch |
| Qwen2-VL | 2B-72B | TurboMind, PyTorch |

The next chapter demonstrates how to deploy a Qwen-VL model using LMDeploy, with [Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) as an example.

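With the doc table above now listing TurboMind for Qwen2-VL, a minimal usage sketch of how the new backend would be exercised may be useful. It assumes the public lmdeploy pipeline API (`pipeline`, `TurbomindEngineConfig`, `lmdeploy.vl.load_image`) and an illustrative image URL; none of it is part of this diff.

```python
# Sketch only: running Qwen2-VL on the TurboMind backend via the lmdeploy
# pipeline API; the model id and image URL are illustrative.
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image

pipe = pipeline('Qwen/Qwen2-VL-7B-Instruct',
                backend_config=TurbomindEngineConfig(session_len=8192))
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
response = pipe(('describe this image', image))
print(response.text)
```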
1 change: 1 addition & 0 deletions docs/en/supported_models/supported_models.md
@@ -19,6 +19,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes |
| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes |
| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes |
| Qwen2-VL | 2B - 72B | MLLM | Yes | Yes | Yes | Yes |
| Mistral | 7B | LLM | Yes | Yes | Yes | Yes |
| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes |
| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
2 changes: 1 addition & 1 deletion docs/zh_cn/multi_modal/qwen2_vl.md
@@ -5,7 +5,7 @@ LMDeploy supports the Qwen-VL series of models, as listed below:
| Model | Size | Supported Inference Engine |
| :----------: | :----: | :------------------------: |
| Qwen-VL-Chat | - | TurboMind, PyTorch |
| Qwen2-VL | 2B, 7B | PyTorch |
| Qwen2-VL | 2B-72B | TurboMind, PyTorch |

This article takes [Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) as an example to demonstrate how to deploy Qwen2-VL series models with LMDeploy.

1 change: 1 addition & 0 deletions docs/zh_cn/supported_models/supported_models.md
@@ -19,6 +19,7 @@
| Qwen | 1.8B - 72B | LLM | Yes | Yes | Yes | Yes |
| Qwen1.5 | 1.8B - 110B | LLM | Yes | Yes | Yes | Yes |
| Qwen2 | 1.5B - 72B | LLM | Yes | Yes | Yes | Yes |
| Qwen2-VL | 2B - 72B | MLLM | Yes | Yes | Yes | Yes |
| Mistral | 7B | LLM | Yes | Yes | Yes | Yes |
| Mixtral | 8x7B, 8x22B | LLM | Yes | Yes | Yes | Yes |
| Qwen-VL | 7B | MLLM | Yes | Yes | Yes | Yes |
2 changes: 2 additions & 0 deletions lmdeploy/turbomind/deploy/config.py
@@ -2,6 +2,7 @@
import inspect
import json
from dataclasses import asdict, fields
from typing import List

# use pydantic.dataclasses.dataclass to check data type
from pydantic.dataclasses import dataclass
@@ -73,6 +74,7 @@ class AttentionConfig:
    high_freq_factor: float = 1.0
    beta_fast: float = 32.0
    beta_slow: float = 1.0
    mrope_section: List[int] = None
    use_logn_attn: int = 0
    cache_block_seq_len: int = 64

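For context on the new `mrope_section` field: Qwen2-VL's multimodal RoPE splits the rotary frequency pairs into three bands that take their position from the temporal, height and width indices respectively. A rough sketch of that selection, assuming the [16, 24, 24] split Qwen2-VL checkpoints are expected to ship (the values are an assumption, not read from this diff):

```python
# Illustrative only: which position component drives each rotary frequency pair
# for a given mrope_section = (t_dims, h_dims, w_dims). Not the kernel code.
def mrope_position_for_freq(freq_idx, pos_thw, mrope_section=(16, 24, 24)):
    """pos_thw is the (temporal, height, width) position of one token."""
    t_dims, h_dims, w_dims = mrope_section
    if freq_idx < t_dims:             # first band rotates with the temporal index
        return pos_thw[0]
    if freq_idx < t_dims + h_dims:    # next band rotates with the image row
        return pos_thw[1]
    return pos_thw[2]                 # last band rotates with the image column
```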
9 changes: 9 additions & 0 deletions lmdeploy/turbomind/deploy/source_model/qwen.py
@@ -119,4 +119,13 @@ def tokenizer_info(self):
    def model_info(self):
        cfg = super().model_info()
        cfg['attn_bias'] = 1
        params_path = osp.join(self.model_path, 'config.json')
        with open(params_path) as f:
            config = json.load(f)
        # config.json of plain Qwen2 checkpoints may omit rope_scaling entirely
        rope_scaling = config.get('rope_scaling')
        if rope_scaling is not None:
            if rope_scaling.get('type', '') == 'mrope':
                selection = rope_scaling['mrope_section']
                cfg['rope_scaling_type'] = 'mrope'
                cfg['mrope_section'] = selection
        return cfg
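To illustrate what the parsing above picks up, the relevant slice of a Qwen2-VL `config.json`, written as a Python dict and run through the same logic, would look roughly like this (the concrete values are assumed, check the actual checkpoint):

```python
# Hypothetical excerpt of a Qwen2-VL config.json (values assumed), run through
# the same branch model_info() takes above.
config = {
    'rope_theta': 1000000.0,
    'rope_scaling': {'type': 'mrope', 'mrope_section': [16, 24, 24]},
}

cfg = {}
rope_scaling = config.get('rope_scaling')
if rope_scaling is not None and rope_scaling.get('type', '') == 'mrope':
    cfg['rope_scaling_type'] = 'mrope'
    cfg['mrope_section'] = rope_scaling['mrope_section']
print(cfg)  # {'rope_scaling_type': 'mrope', 'mrope_section': [16, 24, 24]}
```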
2 changes: 2 additions & 0 deletions lmdeploy/turbomind/supported_models.py
@@ -20,6 +20,8 @@
    QWenLMHeadModel='qwen',
    # Qwen2
    Qwen2ForCausalLM='qwen2',
    # Qwen2-VL
    Qwen2VLForConditionalGeneration='qwen2',
    # mistral
    MistralForCausalLM='llama',
    # llava
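Mapping `Qwen2VLForConditionalGeneration` to the existing 'qwen2' reader means the language-model weights are converted like a plain Qwen2 checkpoint. Assuming the module keeps exposing an `is_supported` helper (not shown in this hunk), a quick check could look like:

```python
# Assumption: lmdeploy.turbomind.supported_models exposes is_supported();
# with this change it should report a local Qwen2-VL checkpoint as supported.
from lmdeploy.turbomind.supported_models import is_supported

print(is_supported('/path/to/Qwen2-VL-7B-Instruct'))  # expected: True
```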
22 changes: 22 additions & 0 deletions lmdeploy/turbomind/turbomind.py
@@ -515,6 +515,8 @@ def prepare_inputs(self,
gen_config: GenerationConfig,
input_embeddings=None,
input_embedding_ranges=None,
mrope_position_ids=None,
mrope_position_delta=None,
sequence_start: bool = True,
sequence_end: bool = False,
step=0,
@@ -572,6 +574,18 @@ def _broadcast_np(data, dtype, shape=(batch_size, )):
        inputs['input_embeddings'] = input_embeddings
        inputs['input_embedding_ranges'] = input_embedding_ranges

        if mrope_position_ids is not None:
            assert isinstance(mrope_position_ids, torch.Tensor)
            assert isinstance(mrope_position_delta, torch.Tensor)
            assert input_lengths.size(0) == 1
            assert mrope_position_ids.size(-1) == input_ids.size(-1)
            mrope_position_ids = pad_sequence([mrope_position_ids],
                                              batch_first=True,
                                              padding_value=-1).transpose(
                                                  1, 2).int().reshape(1, -1)
            inputs['mrope_position_ids'] = mrope_position_ids
            inputs['mrope_position_delta'] = mrope_position_delta

        if gen_config.min_new_tokens is not None:
            inputs['min_length'] = _broadcast_np(gen_config.min_new_tokens,
                                                 np.int32)
@@ -611,6 +625,8 @@ async def async_stream_infer(self,
input_ids,
input_embeddings=None,
input_embedding_ranges=None,
mrope_position_ids=None,
mrope_position_delta=None,
sequence_start: bool = True,
sequence_end: bool = False,
step=0,
@@ -648,6 +664,8 @@ async def async_stream_infer(self,
input_ids=input_ids,
input_embeddings=input_embeddings,
input_embedding_ranges=input_embedding_ranges,
mrope_position_ids=mrope_position_ids,
mrope_position_delta=mrope_position_delta,
sequence_start=sequence_start,
sequence_end=sequence_end,
step=step,
@@ -734,6 +752,8 @@ def stream_infer(self,
input_ids,
input_embeddings=None,
input_embedding_ranges=None,
mrope_position_ids=None,
mrope_position_delta=None,
sequence_start: bool = True,
sequence_end: bool = False,
step=0,
@@ -766,6 +786,8 @@ def stream_infer(self,
input_ids=input_ids,
input_embeddings=input_embeddings,
input_embedding_ranges=input_embedding_ranges,
mrope_position_ids=mrope_position_ids,
mrope_position_delta=mrope_position_delta,
sequence_start=sequence_start,
sequence_end=sequence_end,
step=step,
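The `mrope_position_ids` / `mrope_position_delta` arguments threaded through `prepare_inputs`, `async_stream_infer` and `stream_infer` are expected from the vision-language preprocessing as a `(3, seq_len)` tensor of temporal/height/width positions plus a scalar delta. A simplified sketch of how such ids could be built for one text-then-image prompt, as an illustration of the M-RoPE convention rather than the exact HF `get_rope_index` logic:

```python
import torch


def build_mrope_position_ids(num_text_tokens: int, grid_h: int, grid_w: int):
    """Simplified M-RoPE positions for [text tokens][image tokens]: text tokens
    share one index across (t, h, w); image tokens keep a constant temporal
    index while enumerating grid rows and columns."""
    t_text = torch.arange(num_text_tokens)
    text_pos = torch.stack([t_text, t_text, t_text])          # (3, T)

    t_img = torch.full((grid_h * grid_w,), num_text_tokens)   # constant time step
    h_img = torch.arange(grid_h).repeat_interleave(grid_w)    # row index per patch
    w_img = torch.arange(grid_w).repeat(grid_h)                # column index per patch
    img_pos = torch.stack([t_img, h_img, w_img])               # (3, H*W)

    pos = torch.cat([text_pos, img_pos], dim=-1)               # (3, seq_len)
    # delta lets decoding continue from max(pos) + 1 instead of seq_len
    delta = (pos.max() + 1 - pos.size(-1)).reshape(1, 1)
    return pos, delta
```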
8 changes: 7 additions & 1 deletion src/turbomind/kernels/attention/attention_params.h
@@ -67,10 +67,16 @@ struct AttentionParams {
float llama3_inv_scaling_factor;
float llama3_alpha;
float llama3_beta;
// the following are use by yarn
// the following are used by yarn
float yarn_ramp_inv_factor_div_2;
float yarn_ramp_inv_factor_mul_min;
float yarn_inv_scaling_factor;
// the following are used by qwen2-vl
int3 mrope_section;
int* mrope_position_ids; // 3 x session_len_
int mrope_offset; // session_len_
int* mrope_position_delta;
int* mrope_position_length;

// log(n) attention
bool use_logn_attn;
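Given the `3 x session_len_` comment on `mrope_position_ids` and the `pad_sequence(...).transpose(1, 2).reshape(1, -1)` packing in `turbomind.py`, the three components appear to be interleaved per token, with a per-sequence stride of `3 * mrope_offset`. A small sketch of that packing and the lookup it implies; the per-token layout on the kernel side is inferred from the Python packing, not shown in this diff:

```python
import torch

pos = torch.tensor([[0, 1, 2, 3],    # temporal
                    [0, 0, 1, 1],    # height
                    [0, 1, 0, 1]])   # width -> shape (3, seq_len)

# same packing as prepare_inputs (single sequence): (3, N) -> (1, N, 3) -> flat
flat = pos.unsqueeze(0).transpose(1, 2).reshape(1, -1)  # [t0, h0, w0, t1, h1, w1, ...]


def component(flat_ids, mrope_offset, batch_idx, token_idx, comp):
    # mirrors `mrope_position_ids + batch_idx * 3 * mrope_offset` in the kernels,
    # assuming the interleaved per-token layout produced above
    base = batch_idx * 3 * mrope_offset
    return flat_ids.view(-1)[base + 3 * token_idx + comp]


assert component(flat, mrope_offset=4, batch_idx=0, token_idx=2, comp=2) == 0
```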
12 changes: 12 additions & 0 deletions src/turbomind/kernels/attention/attention_universal.h
@@ -223,6 +223,14 @@ struct AttentionUniversal {

ApplyBias(vec_Q, vec_K, vec_V, params, head_idx, kv_head_idx, offset);

int* mrope_ids = nullptr;
int mrope_length = 0;
int mrope_delta = 0;
if (params.mrope_position_ids != nullptr) {
mrope_ids = params.mrope_position_ids + batch_idx * 3 * params.mrope_offset;
mrope_length = params.mrope_position_length[batch_idx];
mrope_delta = params.mrope_position_delta[batch_idx];
}
const float rope_base = params.rope_theta ? params.rope_theta[batch_idx] : params.rotary_embedding_base;
PRAGMA_UNROLL
for (int c = 0; c < ITER_C; ++c) {
@@ -239,6 +247,10 @@ struct AttentionUniversal {
params.yarn_ramp_inv_factor_mul_min,
params.yarn_inv_scaling_factor,
params.attention_scaling,
params.mrope_section,
mrope_ids,
mrope_length,
mrope_delta,
std::integral_constant<int, kVecSize>{});
PRAGMA_UNROLL
for (int s = 0; s < ITER_S; ++s) {
64 changes: 64 additions & 0 deletions src/turbomind/kernels/attention/kv_cache_utils_v2.cu
@@ -31,6 +31,11 @@ __global__ void __launch_bounds__(128) ProcessKV_v2(char** blocks,
float yarn_ramp_inv_factor_mul_min,
float yarn_inv_scaling_factor,
float attention_scaling,
int3 mrope_section,
int* mrope_position_ids,
int mrope_offset,
int* mrope_position_delta,
int* mrope_position_length,
int64_t stride_b,
int64_t stride_c,
int64_t stride_h,
@@ -125,6 +130,14 @@ __global__ void __launch_bounds__(128) ProcessKV_v2(char** blocks,
}

if (rope_base) {
int* mrope_ids = nullptr;
int mrope_length = 0;
int mrope_delta = 0;
if (mrope_position_ids != nullptr) {
mrope_ids = mrope_position_ids + batch_idx * 3 * mrope_offset;
mrope_length = mrope_position_length[batch_idx];
mrope_delta = mrope_position_delta[batch_idx];
}
float base = rope_base[batch_idx];
PRAGMA_UNROLL
for (int c = 0; c < ITER_C; ++c) {
Expand All @@ -141,6 +154,10 @@ __global__ void __launch_bounds__(128) ProcessKV_v2(char** blocks,
yarn_ramp_inv_factor_mul_min,
yarn_inv_scaling_factor,
attention_scaling,
mrope_section,
mrope_ids,
mrope_length,
mrope_delta,
std::integral_constant<int, kVecSize>{});
PRAGMA_UNROLL
for (int s = 0; s < ITER_S; ++s) {
@@ -222,6 +239,11 @@ void invokeProcessKV_v2(char** blocks,
float yarn_ramp_inv_factor_mul_min,
float yarn_inv_scaling_factor,
float attention_scaling,
int3 mrope_section,
int* mrope_position_ids,
int mrope_offset,
int* mrope_position_delta,
int* mrope_position_length,
int64_t stride_b,
int64_t stride_c,
int64_t stride_h,
@@ -268,6 +290,11 @@ void invokeProcessKV_v2(char** blocks,
yarn_ramp_inv_factor_mul_min,
yarn_inv_scaling_factor,
attention_scaling,
mrope_section,
mrope_position_ids,
mrope_offset,
mrope_position_delta,
mrope_position_length,
stride_b,
stride_c,
stride_h,
@@ -307,6 +334,11 @@ void invokeProcessKV_v2(char** blocks,
float yarn_ramp_inv_factor_mul_min, \
float yarn_inv_scaling_factor, \
float attention_scaling, \
int3 mrope_section, \
int* mrope_position_ids, \
int mrope_offset, \
int* mrope_position_delta, \
int* mrope_position_length, \
int64_t stride_b, \
int64_t stride_c, \
int64_t stride_h, \
@@ -342,6 +374,11 @@ __global__ void __launch_bounds__(128) flattenKV_v2(T* k,
float yarn_ramp_inv_factor_mul_min,
float yarn_inv_scaling_factor,
float attention_scaling,
int3 mrope_section,
int* mrope_position_ids,
int mrope_offset,
int* mrope_position_delta,
int* mrope_position_length,
int64_t stride_b,
int64_t stride_c,
int64_t stride_h,
@@ -419,6 +456,14 @@ __global__ void __launch_bounds__(128) flattenKV_v2(T* k,
}

if (rope_base) {
int* mrope_ids = nullptr;
int mrope_length = 0;
int mrope_delta = 0;
if (mrope_position_ids != nullptr) {
mrope_ids = mrope_position_ids + batch_idx * 3 * mrope_offset;
mrope_length = mrope_position_length[batch_idx];
mrope_delta = mrope_position_delta[batch_idx];
}
float base = rope_base[batch_idx];
PRAGMA_UNROLL
for (int c = 0; c < ITER_C; ++c) {
@@ -435,6 +480,10 @@ __global__ void __launch_bounds__(128) flattenKV_v2(T* k,
yarn_ramp_inv_factor_mul_min,
yarn_inv_scaling_factor,
attention_scaling,
mrope_section,
mrope_ids,
mrope_length,
mrope_delta,
std::integral_constant<int, kVecSize>{});
PRAGMA_UNROLL
for (int s = 0; s < ITER_S; ++s) {
@@ -477,6 +526,11 @@ void invokeFlattenKV_v2(T* k,
float yarn_ramp_inv_factor_mul_min,
float yarn_inv_scaling_factor,
float attention_scaling,
int3 mrope_section,
int* mrope_position_ids,
int mrope_offset,
int* mrope_position_delta,
int* mrope_position_length,
int64_t stride_b,
int64_t stride_c,
int64_t stride_h,
@@ -520,6 +574,11 @@ void invokeFlattenKV_v2(T* k,
yarn_ramp_inv_factor_mul_min,
yarn_inv_scaling_factor,
attention_scaling,
mrope_section,
mrope_position_ids,
mrope_offset,
mrope_position_delta,
mrope_position_length,
stride_b,
stride_c,
stride_h,
@@ -556,6 +615,11 @@ void invokeFlattenKV_v2(T* k,
float yarn_ramp_inv_factor_mul_min, \
float yarn_inv_scaling_factor, \
float attention_scaling, \
int3 mrope_section, \
int* mrope_position_ids, \
int mrope_offset, \
int* mrope_position_delta, \
int* mrope_position_length, \
int64_t stride_b, \
int64_t stride_c, \
int64_t stride_h, \