Unverified commit f1fab902, authored by LiYu Lu, committed by GitHub

fix spelling error (#3482)

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Parent: 87edbc8d
@@ -143,7 +143,7 @@ at::Tensor einsum_sec_sm_ecm(at::Tensor& Q, at::Tensor& W)
     /*
         // Reallocate memory if we received a new prompt
         if (!workspace || input.size(1) != 1) {
-            allocate_workspace<T>(W.size(1), InferenceContext::Instance().GetMaxTokenLenght(),
+            allocate_workspace<T>(W.size(1), InferenceContext::Instance().GetMaxTokenLength(),
                 Q.size(0), 1, head_size); workspace = (T*)InferenceContext::Instance().GetWorkSpace();
         }
     */
@@ -384,7 +384,7 @@ void attention_unfused(T* prev_key_cont,
                        workspace,
                        CUBLAS_OP_T,
                        CUBLAS_OP_N,
-                       InferenceContext::Instance().GetMaxTokenLenght() * k,
+                       InferenceContext::Instance().GetMaxTokenLength() * k,
                        seq_len * k,
                        seq_len * soft_len,
                        bsz * heads,
@@ -417,7 +417,7 @@ void attention_unfused(T* prev_key_cont,
                        (T*)output,
                        CUBLAS_OP_N,
                        CUBLAS_OP_N,
-                       InferenceContext::Instance().GetMaxTokenLenght() * k,
+                       InferenceContext::Instance().GetMaxTokenLength() * k,
                        seq_len * soft_len,
                        seq_len * k,
                        bsz * heads,
@@ -468,11 +468,11 @@ std::vector<at::Tensor> ds_softmax_context(at::Tensor& query_key_value,
     auto query_cont = workspace + 5 * buf_size;
     size_t offset =
-        10 * (hidden_dim * bsz * InferenceContext::Instance().GetMaxTokenLenght()) +
-        layer_id * 2 * bsz * InferenceContext::Instance().GetMaxTokenLenght() * hidden_dim;
+        10 * (hidden_dim * bsz * InferenceContext::Instance().GetMaxTokenLength()) +
+        layer_id * 2 * bsz * InferenceContext::Instance().GetMaxTokenLength() * hidden_dim;
     unsigned all_tokens = soft_len;
     auto kv_cache = workspace + offset + (hidden_dim / heads) * (is_prompt ? 0 : soft_len - 1);
-    size_t value_offset = bsz * InferenceContext::Instance().GetMaxTokenLenght() * hidden_dim;
+    size_t value_offset = bsz * InferenceContext::Instance().GetMaxTokenLength() * hidden_dim;
     T* temp_buf = (T*)output.data_ptr() + at::numel(output);
     launch_bias_add_transform_0213<T>((T*)query_cont,
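A quick restatement of the pointer arithmetic in the hunk above may help: the offsets are element counts into the shared inference workspace, sized by the renamed GetMaxTokenLength(). The sketch below is illustrative only; the reading of the constant 10 as a scratch region and the helper names are assumptions, not DeepSpeed code.

    #include <cstddef>

    // Illustrative sketch of the offset computation in the hunk above: the
    // workspace is assumed to hold a number of activation-sized scratch
    // buffers up front, followed by one key block and one value block
    // (each bsz * max_token_length * hidden_dim elements) per layer.
    size_t kv_cache_offset(size_t hidden_dim,
                           size_t bsz,
                           size_t layer_id,
                           size_t max_token_length)  // value of GetMaxTokenLength()
    {
        const size_t block = hidden_dim * bsz * max_token_length;
        return 10 * block               // scratch region ahead of the cache
               + layer_id * 2 * block;  // two blocks (K and V) per earlier layer
    }

    // The value half of a layer's cache starts one block after its key half,
    // matching value_offset in the hunk.
    size_t value_block_offset(size_t hidden_dim, size_t bsz, size_t max_token_length)
    {
        return bsz * max_token_length * hidden_dim;
    }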
@@ -491,7 +491,7 @@ std::vector<at::Tensor> ds_softmax_context(at::Tensor& query_key_value,
                                       rotate_every_two,
                                       InferenceContext::Instance().GetCurrentStream(),
                                       3,
-                                      InferenceContext::Instance().GetMaxTokenLenght());
+                                      InferenceContext::Instance().GetMaxTokenLength());
     if (rotary_dim > 0 && rotate_half)
         launch_apply_rotary_pos_emb(query_cont,
                                     kv_cache,
@@ -502,7 +502,7 @@ std::vector<at::Tensor> ds_softmax_context(at::Tensor& query_key_value,
                                     heads,
                                     bsz,
                                     InferenceContext::Instance().GetCurrentStream(),
-                                    InferenceContext::Instance().GetMaxTokenLenght());
+                                    InferenceContext::Instance().GetMaxTokenLength());
     attention_unfused<T>(workspace + offset,
                          (T*)query_cont,
@@ -533,8 +533,8 @@ std::vector<at::Tensor> ds_softmax_context(at::Tensor& query_key_value,
     if (layer_id == num_layers - 1) InferenceContext::Instance().advance_tokens();
     auto prev_key = torch::from_blob(workspace + offset,
                                      {bsz, heads, all_tokens, k},
-                                     {hidden_dim * InferenceContext::Instance().GetMaxTokenLenght(),
-                                      k * InferenceContext::Instance().GetMaxTokenLenght(),
+                                     {hidden_dim * InferenceContext::Instance().GetMaxTokenLength(),
+                                      k * InferenceContext::Instance().GetMaxTokenLength(),
                                       k,
                                       1},
                                      options);
@@ -542,8 +542,8 @@ std::vector<at::Tensor> ds_softmax_context(at::Tensor& query_key_value,
     auto prev_value =
         torch::from_blob(workspace + offset + value_offset,
                          {bsz, heads, all_tokens, k},
-                         {hidden_dim * InferenceContext::Instance().GetMaxTokenLenght(),
-                          k * InferenceContext::Instance().GetMaxTokenLenght(),
+                         {hidden_dim * InferenceContext::Instance().GetMaxTokenLength(),
+                          k * InferenceContext::Instance().GetMaxTokenLength(),
                           k,
                           1},
                          options);
@@ -1861,7 +1861,7 @@ std::vector<at::Tensor> apply_rotary_pos_emb(at::Tensor& mixed_query,
                                             num_heads,
                                             bsz,
                                             InferenceContext::Instance().GetCurrentStream(),
-                                            InferenceContext::Instance().GetMaxTokenLenght());
+                                            InferenceContext::Instance().GetMaxTokenLength());
     else
         launch_apply_rotary_pos_emb<__half>((__half*)query_cont.data_ptr(),
                                             (__half*)key_cont.data_ptr(),
@@ -1872,7 +1872,7 @@ std::vector<at::Tensor> apply_rotary_pos_emb(at::Tensor& mixed_query,
                                             num_heads,
                                             bsz,
                                             InferenceContext::Instance().GetCurrentStream(),
-                                            InferenceContext::Instance().GetMaxTokenLenght());
+                                            InferenceContext::Instance().GetMaxTokenLength());
     return {query_cont, key_cont};
 }
...
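The hunks at -533 and -542 above wrap the cached keys and values in strided tensor views rather than copying them out of the workspace. Below is a minimal sketch of that idiom under assumed types and sizes: only torch::from_blob and the stride layout come from the diff; the float dtype, CUDA device and function name are illustrative.

    #include <torch/torch.h>

    // Illustrative only: view a flat, pre-allocated KV-cache region as a
    // [bsz, heads, all_tokens, k] tensor without copying. Strides are in
    // elements, as in the hunks above: one batch entry spans
    // hidden_dim * max_token_length elements, one head spans
    // k * max_token_length, one token row spans k. Requires
    // all_tokens <= max_token_length.
    at::Tensor cache_view(float* cache_ptr,
                          int64_t bsz, int64_t heads, int64_t all_tokens, int64_t k,
                          int64_t hidden_dim, int64_t max_token_length)
    {
        auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA);
        return torch::from_blob(cache_ptr,
                                {bsz, heads, all_tokens, k},
                                {hidden_dim * max_token_length,
                                 k * max_token_length,
                                 k,
                                 1},
                                options);
    }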
@@ -175,7 +175,7 @@ public:
         _workSpaceSize = workSpaceSize;
         _attention_unfused_workspace_offset = workSpaceSize - temp_size;
     }
-    inline size_t GetMaxTokenLenght() const { return _max_seq_len; }
+    inline size_t GetMaxTokenLength() const { return _max_seq_len; }
     cudaEvent_t GetCompEvent(int id) { return id == 1 ? _comp1_event : _comp2_event; }
...
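For context, the header change above is where the misspelled accessor lived: GetMaxTokenLenght() simply returned _max_seq_len, and every call site in the other file reaches it through the InferenceContext singleton. The stripped-down sketch below assumes only what the hunk shows; the singleton body and constructor are illustrative, not the real class.

    #include <cstddef>

    // Reduced illustration; the real InferenceContext also manages CUDA
    // streams, events and the shared inference workspace.
    class InferenceContext {
    public:
        static InferenceContext& Instance()
        {
            static InferenceContext ctx;  // one shared context per process
            return ctx;
        }

        // The renamed accessor: maximum number of tokens the pre-allocated
        // KV cache can hold per sequence.
        inline size_t GetMaxTokenLength() const { return _max_seq_len; }

    private:
        InferenceContext() : _max_seq_len(0) {}
        size_t _max_seq_len;
    };

    // Call sites in the diff read the limit like this:
    //   size_t max_len = InferenceContext::Instance().GetMaxTokenLength();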