Commit 0a76b89

fix: ci test
1 parent 337c97a commit 0a76b89

9 files changed: +364 additions, -207 deletions

rtp_llm/cpp/devices/DeviceBase.h
Lines changed: 4 additions & 0 deletions

@@ -167,6 +167,9 @@ class DeviceBase: public DeviceOps {
     torch::Tensor ropeCache() const {
         return rope_cache_;
     }
+    int ropeCacheDim() const {
+        return rope_cache_dim_;
+    }

 public:
     // device-independence op implementations
@@ -228,6 +231,7 @@ class DeviceBase: public DeviceOps {
     std::once_flag rope_cache_flag_;
     bool use_rope_cache_ = false;
     torch::Tensor rope_cache_;
+    int rope_cache_dim_;

 protected:
     std::unique_ptr<BufferManager> buffer_manager_;
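
The new accessor exposes the rotary dimension the cached cos/sin table was built for, and every call site touched below now checks it before handing the cache to a kernel. A minimal sketch of that guard, using simplified stand-in types rather than the real torch::Tensor-based members:

// Simplified stand-ins for the device-side rope-cache state (the real members
// are use_rope_cache_, rope_cache_, and the new rope_cache_dim_).
struct RopeCacheState {
    bool         enabled = false;    // use_rope_cache_
    const float* table   = nullptr;  // rope_cache_.data_ptr<float>() when defined
    int          dim     = 0;        // rope_cache_dim_, recorded when the cache is built
};

// The guard every call site in this commit applies: only pass a cached table
// that exists and was built for the requested rotary dimension.
inline const float* ropeCacheOrNull(const RopeCacheState& cache, int requested_dim) {
    return (cache.enabled && cache.table != nullptr && cache.dim == requested_dim)
               ? cache.table
               : nullptr;  // nullptr makes the kernel recompute cos/sin on the fly
}

A mismatched dimension therefore degrades to the non-cached rope path instead of indexing a table of the wrong shape.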

rtp_llm/cpp/devices/base_tests/AttentionOpTest.hpp

Lines changed: 123 additions & 84 deletions
Large diffs are not rendered by default.

rtp_llm/cpp/devices/base_tests/UnfusedAttentionTest.hpp
Lines changed: 40 additions & 36 deletions

@@ -291,7 +291,8 @@ void UnfusedAttentionTest::addFusedQKVBiasTransposeTest(size_t batch_size,
         params.common.cu_seqlens->data<int>(),
         params.common.cu_seqlens_without_prefix->data<int>(),
         device->use_rope_cache_,
-        device->use_rope_cache_ && device->rope_cache_.defined() ?
+        device->use_rope_cache_ && device->rope_cache_.defined()
+            && device->rope_cache_dim_ == params.configs.rope_config.dim ?
             device->rope_cache_.data_ptr<float>() :
             nullptr,
         batch_size,
@@ -335,7 +336,8 @@ void UnfusedAttentionTest::addFusedQKVBiasTransposeTest(size_t batch_size,
         params.common.cu_seqlens->data<int>(),
         params.common.cu_seqlens_without_prefix->data<int>(),
         device->use_rope_cache_,
-        device->use_rope_cache_ && device->rope_cache_.defined() ?
+        device->use_rope_cache_ && device->rope_cache_.defined()
+            && device->rope_cache_dim_ == params.configs.rope_config.dim ?
             device->rope_cache_.data_ptr<float>() :
             nullptr,
         batch_size,
@@ -374,40 +376,42 @@ void UnfusedAttentionTest::addFusedQKVBiasTransposeTest(size_t batch_size,
     bool store_kv = true;
     bool store_cache = false;

-    DISPATCH_CUDA_FUNCTION_DATA_TYPE(
-        params.input.type(),
-        invokeAddFusedQKVBiasTranspose,
-        q_no_transpose_output->data(),
-        q_output->data(),
-        k_output->data(),
-        v_output->data(),
-        &prefix_prompt_param,
-        params.input.data(),
-        qkv_buf_fp8 ? qkv_buf_fp8->data() : nullptr,
-        params.common.position_ids->data<int>(),
-        params.weights.qkv_weight->bias->data(),
-        params.common.padding_offset->data<int>(),
-        params.common.cu_seqlens->data<int>(),
-        params.common.cu_seqlens_without_prefix->data<int>(),
-        device->use_rope_cache_,
-        device->use_rope_cache_ && device->rope_cache_.defined() ? device->rope_cache_.data_ptr<float>() : nullptr,
-        batch_size,
-        seq_len,
-        token_num,
-        num_heads,
-        num_key_value_heads,
-        head_dim,
-        params.configs.rope_config,
-        params.configs.use_logn_attn,
-        scale_out_ptr,
-        int8_mode,
-        use_paged_fmha,
-        store_qkv,
-        store_q_no_transpose,
-        store_q,
-        store_kv,
-        store_cache,
-        device->getStream());
+    DISPATCH_CUDA_FUNCTION_DATA_TYPE(params.input.type(),
+                                     invokeAddFusedQKVBiasTranspose,
+                                     q_no_transpose_output->data(),
+                                     q_output->data(),
+                                     k_output->data(),
+                                     v_output->data(),
+                                     &prefix_prompt_param,
+                                     params.input.data(),
+                                     qkv_buf_fp8 ? qkv_buf_fp8->data() : nullptr,
+                                     params.common.position_ids->data<int>(),
+                                     params.weights.qkv_weight->bias->data(),
+                                     params.common.padding_offset->data<int>(),
+                                     params.common.cu_seqlens->data<int>(),
+                                     params.common.cu_seqlens_without_prefix->data<int>(),
+                                     device->use_rope_cache_,
+                                     device->use_rope_cache_ && device->rope_cache_.defined()
+                                             && device->rope_cache_dim_ == params.configs.rope_config.dim ?
+                                         device->rope_cache_.data_ptr<float>() :
+                                         nullptr,
+                                     batch_size,
+                                     seq_len,
+                                     token_num,
+                                     num_heads,
+                                     num_key_value_heads,
+                                     head_dim,
+                                     params.configs.rope_config,
+                                     params.configs.use_logn_attn,
+                                     scale_out_ptr,
+                                     int8_mode,
+                                     use_paged_fmha,
+                                     store_qkv,
+                                     store_q_no_transpose,
+                                     store_q,
+                                     store_kv,
+                                     store_cache,
+                                     device->getStream());

     device->syncAndCheck();

rtp_llm/cpp/devices/cuda_impl/CudaAttentionOp.cc
Lines changed: 9 additions & 4 deletions

@@ -99,7 +99,8 @@ void CudaDevice::getRopeCacheOnce(const RopeConfig& rope_config, int max_positio
     std::call_once(rope_cache_flag_, [&]() {
         use_rope_cache_ = rope_config.style == RopeStyle::Base || rope_config.style == RopeStyle::Yarn;
         if (use_rope_cache_) {
-            rope_cache_ = getRopeCache(rope_config, max_position_embeddings);
+            rope_cache_     = getRopeCache(rope_config, max_position_embeddings);
+            rope_cache_dim_ = rope_config.dim;
         }
     });
 }
@@ -219,7 +220,9 @@ AttentionModuleOutput CudaDevice::contextAttention(const AttentionModuleParams&
         params.common.cu_seqlens->data<int>(),
         params.common.cu_seqlens_without_prefix->data<int>(),
         use_rope_cache_,
-        use_rope_cache_ && rope_cache_.defined() ? rope_cache_.data_ptr<float>() : nullptr,
+        use_rope_cache_ && rope_cache_.defined() && rope_cache_dim_ == params.configs.rope_config.dim ?
+            rope_cache_.data_ptr<float>() :
+            nullptr,
         batch_size,
         seq_len,
         token_num,
@@ -410,8 +413,10 @@ AttentionModuleOutput CudaDevice::decoderSelfAttention(const AttentionModulePara
            params.weights.qkv_weight->bias->data() :
            nullptr,
        use_rope_cache_,
-       use_rope_cache_ && rope_cache_.defined() ? rope_cache_.data_ptr<float>() :
-                                                  nullptr,
+       use_rope_cache_ && rope_cache_.defined()
+           && rope_cache_dim_ == params.configs.rope_config.dim ?
+           rope_cache_.data_ptr<float>() :
+           nullptr,
        batch_size,
        local_head_num,
        local_kv_head_num,
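
getRopeCacheOnce builds the table at most once per device, so the cache stays tied to whichever rope config reaches it first; recording rope_cache_dim_ inside the std::call_once body is what makes the per-call dimension checks above meaningful. A compilable sketch of the same pattern, with a plain std::vector standing in for the tensor returned by getRopeCache() and the usual base-10000 rotary frequencies assumed only for illustration:

#include <cmath>
#include <mutex>
#include <vector>

class DeviceSketch {
public:
    // Build the cos/sin table exactly once; later calls (even with another
    // rope dim) are no-ops, so callers must compare ropeCacheDim() before use.
    void getRopeCacheOnce(int rope_dim, int max_positions) {
        std::call_once(rope_cache_flag_, [&] {
            rope_cache_.assign(static_cast<size_t>(max_positions) * rope_dim, 0.0f);
            for (int pos = 0; pos < max_positions; ++pos) {
                for (int i = 0; i < rope_dim / 2; ++i) {
                    const float inv_freq = std::pow(10000.0f, -2.0f * i / rope_dim);
                    rope_cache_[pos * rope_dim + 2 * i]     = std::cos(pos * inv_freq);
                    rope_cache_[pos * rope_dim + 2 * i + 1] = std::sin(pos * inv_freq);
                }
            }
            rope_cache_dim_ = rope_dim;  // mirrors the new rope_cache_dim_ member
        });
    }

    int ropeCacheDim() const { return rope_cache_dim_; }

private:
    std::once_flag     rope_cache_flag_;
    std::vector<float> rope_cache_;
    int                rope_cache_dim_ = 0;
};

The ROCm implementation below has the same shape; it differs only in enabling the cache for RopeStyle::Base alone.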

rtp_llm/cpp/devices/rocm_impl/ROCmAttentionOp.cc
Lines changed: 41 additions & 36 deletions

@@ -571,7 +571,8 @@ void ROCmDevice::getRopeCacheOnce(const RopeConfig& rope_config, int max_positio
     std::call_once(rope_cache_flag_, [&]() {
         use_rope_cache_ = rope_config.style == RopeStyle::Base;
         if (use_rope_cache_) {
-            rope_cache_ = getRopeCache(rope_config, max_position_embeddings);
+            rope_cache_     = getRopeCache(rope_config, max_position_embeddings);
+            rope_cache_dim_ = rope_config.dim;
         }
     });
 }
@@ -778,42 +779,46 @@ AttentionModuleOutput ROCmDevice::contextAttention(const AttentionModuleParams&
        }
        check_cuda_error();
    } else {
-       DISPATCH_CUDA_FUNCTION_DATA_TYPE(datatype,
-                                        invokeAddFusedQKVBiasTranspose,
-                                        nullptr,
-                                        q_output->data(),
-                                        k_output->data(),
-                                        v_output->data(),
-                                        &prefix_prompt_param,
-                                        params.input.data(),
+       DISPATCH_CUDA_FUNCTION_DATA_TYPE(
+           datatype,
+           invokeAddFusedQKVBiasTranspose,
+           nullptr,
+           q_output->data(),
+           k_output->data(),
+           v_output->data(),
+           &prefix_prompt_param,
+           params.input.data(),
+           nullptr,
+           params.common.position_ids ? params.common.position_ids->dataWithOffset<int>(
+                                            decoder_batch_size * params.configs.rope_config.index_factor) :
                                         nullptr,
-                                        params.common.position_ids ?
-                                            params.common.position_ids->dataWithOffset<int>(
-                                                decoder_batch_size * params.configs.rope_config.index_factor) :
-                                            nullptr,
-                                        params.configs.fuse_qkv_add_bias && params.weights.qkv_weight->bias ?
-                                            params.weights.qkv_weight->bias->data() :
-                                            nullptr,
-                                        params.common.padding_offset->data<int>(),
-                                        params.common.cu_seqlens->data<int>(),
-                                        params.common.cu_seqlens_without_prefix->data<int>(),
-                                        batch_size,
-                                        seq_len,
-                                        token_num,
-                                        head_num,
-                                        kv_head_num,
-                                        size_per_head,
-                                        params.configs.rope_config,
-                                        params.configs.use_logn_attn,
-                                        scale_out_ptr,
-                                        int8_mode,
-                                        false,
-                                        store_qkv,
-                                        false,
-                                        store_q,
-                                        store_kv,
-                                        store_cache,
-                                        stream_);
+           params.configs.fuse_qkv_add_bias && params.weights.qkv_weight->bias ?
+               params.weights.qkv_weight->bias->data() :
+               nullptr,
+           params.common.padding_offset->data<int>(),
+           params.common.cu_seqlens->data<int>(),
+           params.common.cu_seqlens_without_prefix->data<int>(),
+           use_rope_cache_,
+           use_rope_cache_ && rope_cache_.defined() && rope_cache_dim_ == params.configs.rope_config.dim ?
+               rope_cache_.data_ptr<float>() :
+               nullptr,
+           batch_size,
+           seq_len,
+           token_num,
+           head_num,
+           kv_head_num,
+           size_per_head,
+           params.configs.rope_config,
+           params.configs.use_logn_attn,
+           scale_out_ptr,
+           int8_mode,
+           false,
+           store_qkv,
+           false,
+           store_q,
+           store_kv,
+           store_cache,
+           stream_);
        check_cuda_error();
    }
    writeCacheStore(params);

rtp_llm/cpp/kernels/unfused_attention_kernels.cu
Lines changed: 6 additions & 0 deletions

@@ -931,6 +931,7 @@ INSTANTIATEDEBUGKERNEL2(__nv_bfloat16);

 // Bandwidth-bound kernel by reading cos/sin coefficients from global memory (pre-computed and saved as weights).

+#if USING_CUDA
 template<typename T,
          typename Tcache,
          bool PREFIX_PROMPT,
@@ -1572,6 +1573,7 @@ __global__ void add_fusedQKV_bias_transpose_non_int8_with_rope_cache_kernel(T* q
         }
     }
 }
+#endif

 template<typename T, typename Tcache, bool PREFIX_PROMPT, bool USE_PAGED_FMHA, RopeStyle ROPE_STYLE>
 __global__ void add_fusedQKV_bias_transpose_with_rope_cache_kernel(T* q_no_transpose_buf,
@@ -2189,10 +2191,12 @@ void invokeAddFusedQKVBiasTranspose(T* q_no_transpos
                                     const bool store_cache,
                                     cudaStream_t stream) {
     if (use_rope_cache && rope_cache) {
+#if USING_CUDA
         if (head_num % 8 == 0 && head_num_kv % 4 == 0 && param_ptr->kv_block_array.cache_type != KvCacheDataType::INT8
             && size_per_head == rope_config.dim) {
             ADD_FUSEDQKV_BIAS_TRANSPOSE_NON_INT8_WITH_ROPE_CACHE(8, 4, 4);
         } else {
+#endif
             dim3 block((size_per_head / Vec_t<T>::size + 31) / 32 * 32);
             dim3 grid(token_num, head_num + head_num_kv * 2);
             const size_t smem_size = rope_config.style == RopeStyle::No ? 0 : 2 * rope_config.dim * sizeof(T);
@@ -2234,7 +2238,9 @@ void invokeAddFusedQKVBiasTranspose(T* q_no_transpos
                 });
             });
         });
+#if USING_CUDA
         }
+#endif
     } else {
         dim3 block((size_per_head / Vec_t<T>::size + 31) / 32 * 32);
         dim3 grid(token_num, head_num);
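
The new #if USING_CUDA guards compile the vectorized non-int8 rope-cache kernel, and the branch that dispatches to it, only into CUDA builds; non-CUDA (ROCm) builds always take the generic launch. A small, self-contained sketch of that conditional-compilation shape, with hypothetical launch functions standing in for the real kernel launches:

#include <cstdio>

// Hypothetical stand-ins for the two launch paths inside invokeAddFusedQKVBiasTranspose.
static void launchGenericRopeCachePath() { std::puts("generic rope-cache launch"); }
#if USING_CUDA
static void launchVectorizedNonInt8Path() { std::puts("CUDA-only vectorized launch"); }
#endif

// Same shape as the kernel file: the fast path and the if/else that selects it
// vanish entirely when USING_CUDA is not set.
static void dispatchRopeCachePath([[maybe_unused]] int  head_num,
                                  [[maybe_unused]] int  head_num_kv,
                                  [[maybe_unused]] bool int8_kv_cache,
                                  [[maybe_unused]] int  size_per_head,
                                  [[maybe_unused]] int  rope_dim) {
#if USING_CUDA
    if (head_num % 8 == 0 && head_num_kv % 4 == 0 && !int8_kv_cache && size_per_head == rope_dim) {
        launchVectorizedNonInt8Path();  // matches the guarded branch in the diff
        return;
    }
#endif
    launchGenericRopeCachePath();
}

int main() {
    dispatchRopeCachePath(/*head_num=*/32, /*head_num_kv=*/8,
                          /*int8_kv_cache=*/false, /*size_per_head=*/128, /*rope_dim=*/128);
}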

rtp_llm/models_py/bindings/cuda/FusedRopeKVCacheOp.cc
Lines changed: 29 additions & 24 deletions

@@ -115,7 +115,10 @@ torch::Tensor FusedRopeKVCachePrefillOp::forward(const torch::Tensor&
        params->cu_seqlens.data_ptr<int>(),
        params->cu_seqlens_without_prefix.data_ptr<int>(),
        device_->useRopeCache(),
-       device_->useRopeCache() && device_->ropeCache().defined() ? device_->ropeCache().data_ptr<float>() : nullptr,
+       device_->useRopeCache() && device_->ropeCache().defined()
+               && device_->ropeCacheDim() == attn_configs_.rope_config.dim ?
+           device_->ropeCache().data_ptr<float>() :
+           nullptr,
        batch_size,
        params->max_seq_len,  // seq_len
        token_num,
@@ -195,29 +198,31 @@ torch::Tensor FusedRopeKVCacheDecodeOp::forward(const torch::Tensor&
    device_->getRopeCacheOnce(attn_configs_.rope_config, device_->initParams().max_seq_len);

    RTP_LLM_CHECK_WITH_INFO(params->sequence_lengths.is_pinned(), "sequence_lengths is not pinned memory");
-   DISPATCH_CUDA_FUNCTION_DATA_TYPE(
-       torchDTypeToDataType(qkv.dtype()),
-       invokeDecodeAddFusedQKVBiasTranspose,
-       q_output.data_ptr(),
-       nullptr,  // k_buf
-       nullptr,  // v_buf
-       kv_block_array,
-       qkv.data_ptr(),
-       params->sequence_lengths.data_ptr<int>(),
-       nullptr,  // params.configs.fuse_qkv_add_bias && params.weights.qkv_weight->bias ?
-                 // params.weights.qkv_weight->bias->data() : nullptr,
-       device_->useRopeCache(),
-       device_->useRopeCache() && device_->ropeCache().defined() ? device_->ropeCache().data_ptr<float>() : nullptr,
-       batch_size,
-       local_head_num,
-       local_head_num_kv,
-       size_per_head,
-       attn_configs_.rope_config,
-       attn_configs_.use_logn_attn,
-       true,   // store_q,
-       false,  // store_kv,
-       true,   // store_cache,
-       device_->getStream());
+   DISPATCH_CUDA_FUNCTION_DATA_TYPE(torchDTypeToDataType(qkv.dtype()),
+                                    invokeDecodeAddFusedQKVBiasTranspose,
+                                    q_output.data_ptr(),
+                                    nullptr,  // k_buf
+                                    nullptr,  // v_buf
+                                    kv_block_array,
+                                    qkv.data_ptr(),
+                                    params->sequence_lengths.data_ptr<int>(),
+                                    nullptr,  // params.configs.fuse_qkv_add_bias && params.weights.qkv_weight->bias ?
+                                              // params.weights.qkv_weight->bias->data() : nullptr,
+                                    device_->useRopeCache(),
+                                    device_->useRopeCache() && device_->ropeCache().defined()
+                                            && device_->ropeCacheDim() == attn_configs_.rope_config.dim ?
+                                        device_->ropeCache().data_ptr<float>() :
+                                        nullptr,
+                                    batch_size,
+                                    local_head_num,
+                                    local_head_num_kv,
+                                    size_per_head,
+                                    attn_configs_.rope_config,
+                                    attn_configs_.use_logn_attn,
+                                    true,   // store_q,
+                                    false,  // store_kv,
+                                    true,   // store_cache,
+                                    device_->getStream());
    return q_output;
 }

rtp_llm/models_py/standalone/rtp_auto_model.py
Lines changed: 17 additions & 15 deletions

@@ -1,3 +1,17 @@
+from rtp_llm.utils.model_weight import W
+from rtp_llm.utils.base_model_datatypes import ModelConfig
+from rtp_llm.ops.compute_ops import (
+    KVCache,
+    PyAttentionInputs,
+    PyModelInputs,
+    PyModelOutputs,
+    get_device,
+    get_typemeta,
+    init_device,
+)
+from rtp_llm.model_factory import ModelFactory
+from rtp_llm.config.py_config_modules import StaticConfig
+import rtp_llm.models
 import os
 import sys
 from pathlib import Path
@@ -9,21 +23,6 @@
 rtp_opensouce_path = Path(__file__).resolve().parent.parent.parent.parent
 sys.path.append(str(rtp_opensouce_path))

-import rtp_llm.models
-from rtp_llm.config.py_config_modules import StaticConfig
-from rtp_llm.model_factory import ModelFactory
-from rtp_llm.ops.compute_ops import (
-    KVCache,
-    PyAttentionInputs,
-    PyModelInputs,
-    PyModelOutputs,
-    get_device,
-    get_typemeta,
-    init_device,
-)
-from rtp_llm.utils.base_model_datatypes import ModelConfig
-from rtp_llm.utils.model_weight import W
-

 class AutoModel:
     def __init__(
@@ -154,6 +153,9 @@ def _prepare_prefill_attention_inputs(self, input_length: int) -> PyAttentionInp
        attention_inputs.cu_seqlens = torch.tensor(
            [0, input_length], dtype=torch.int32, device=self.device
        )
+       attention_inputs.cu_seqlens_without_prefix = torch.tensor(
+           [0, input_length], dtype=torch.int32, device=self.device
+       )
        attention_inputs.prefix_lengths = torch.tensor([0], dtype=torch.int32)
        attention_inputs.padding_offset = torch.tensor(
            [0 for _ in range(input_length)], dtype=torch.int32, device=self.device
