Unverified commit f1fab902, authored by LiYu Lu, committed by GitHub

fix spelling error (#3482)

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Parent 87edbc8d
@@ -143,7 +143,7 @@ at::Tensor einsum_sec_sm_ecm(at::Tensor& Q, at::Tensor& W)
/*
// Reallocate memory if we received a new prompt
if (!workspace || input.size(1) != 1) {
-allocate_workspace<T>(W.size(1), InferenceContext::Instance().GetMaxTokenLenght(),
+allocate_workspace<T>(W.size(1), InferenceContext::Instance().GetMaxTokenLength(),
Q.size(0), 1, head_size); workspace = (T*)InferenceContext::Instance().GetWorkSpace();
}
*/
@@ -384,7 +384,7 @@ void attention_unfused(T* prev_key_cont,
workspace,
CUBLAS_OP_T,
CUBLAS_OP_N,
-InferenceContext::Instance().GetMaxTokenLenght() * k,
+InferenceContext::Instance().GetMaxTokenLength() * k,
seq_len * k,
seq_len * soft_len,
bsz * heads,
@@ -417,7 +417,7 @@ void attention_unfused(T* prev_key_cont,
(T*)output,
CUBLAS_OP_N,
CUBLAS_OP_N,
-InferenceContext::Instance().GetMaxTokenLenght() * k,
+InferenceContext::Instance().GetMaxTokenLength() * k,
seq_len * soft_len,
seq_len * k,
bsz * heads,
@@ -468,11 +468,11 @@ std::vector<at::Tensor> ds_softmax_context(at::Tensor& query_key_value,
auto query_cont = workspace + 5 * buf_size;
size_t offset =
-10 * (hidden_dim * bsz * InferenceContext::Instance().GetMaxTokenLenght()) +
-layer_id * 2 * bsz * InferenceContext::Instance().GetMaxTokenLenght() * hidden_dim;
+10 * (hidden_dim * bsz * InferenceContext::Instance().GetMaxTokenLength()) +
+layer_id * 2 * bsz * InferenceContext::Instance().GetMaxTokenLength() * hidden_dim;
unsigned all_tokens = soft_len;
auto kv_cache = workspace + offset + (hidden_dim / heads) * (is_prompt ? 0 : soft_len - 1);
-size_t value_offset = bsz * InferenceContext::Instance().GetMaxTokenLenght() * hidden_dim;
+size_t value_offset = bsz * InferenceContext::Instance().GetMaxTokenLength() * hidden_dim;
T* temp_buf = (T*)output.data_ptr() + at::numel(output);
launch_bias_add_transform_0213<T>((T*)query_cont,
@@ -491,7 +491,7 @@ std::vector<at::Tensor> ds_softmax_context(at::Tensor& query_key_value,
rotate_every_two,
InferenceContext::Instance().GetCurrentStream(),
3,
-InferenceContext::Instance().GetMaxTokenLenght());
+InferenceContext::Instance().GetMaxTokenLength());
if (rotary_dim > 0 && rotate_half)
launch_apply_rotary_pos_emb(query_cont,
kv_cache,
@@ -502,7 +502,7 @@ std::vector<at::Tensor> ds_softmax_context(at::Tensor& query_key_value,
heads,
bsz,
InferenceContext::Instance().GetCurrentStream(),
-InferenceContext::Instance().GetMaxTokenLenght());
+InferenceContext::Instance().GetMaxTokenLength());
attention_unfused<T>(workspace + offset,
(T*)query_cont,
@@ -533,8 +533,8 @@ std::vector<at::Tensor> ds_softmax_context(at::Tensor& query_key_value,
if (layer_id == num_layers - 1) InferenceContext::Instance().advance_tokens();
auto prev_key = torch::from_blob(workspace + offset,
{bsz, heads, all_tokens, k},
-{hidden_dim * InferenceContext::Instance().GetMaxTokenLenght(),
-k * InferenceContext::Instance().GetMaxTokenLenght(),
+{hidden_dim * InferenceContext::Instance().GetMaxTokenLength(),
+k * InferenceContext::Instance().GetMaxTokenLength(),
k,
1},
options);
@@ -542,8 +542,8 @@ std::vector<at::Tensor> ds_softmax_context(at::Tensor& query_key_value,
auto prev_value =
torch::from_blob(workspace + offset + value_offset,
{bsz, heads, all_tokens, k},
-{hidden_dim * InferenceContext::Instance().GetMaxTokenLenght(),
-k * InferenceContext::Instance().GetMaxTokenLenght(),
+{hidden_dim * InferenceContext::Instance().GetMaxTokenLength(),
+k * InferenceContext::Instance().GetMaxTokenLength(),
k,
1},
options);
@@ -1861,7 +1861,7 @@ std::vector<at::Tensor> apply_rotary_pos_emb(at::Tensor& mixed_query,
num_heads,
bsz,
InferenceContext::Instance().GetCurrentStream(),
-InferenceContext::Instance().GetMaxTokenLenght());
+InferenceContext::Instance().GetMaxTokenLength());
else
launch_apply_rotary_pos_emb<__half>((__half*)query_cont.data_ptr(),
(__half*)key_cont.data_ptr(),
@@ -1872,7 +1872,7 @@ std::vector<at::Tensor> apply_rotary_pos_emb(at::Tensor& mixed_query,
num_heads,
bsz,
InferenceContext::Instance().GetCurrentStream(),
-InferenceContext::Instance().GetMaxTokenLenght());
+InferenceContext::Instance().GetMaxTokenLength());
return {query_cont, key_cont};
}
@@ -175,7 +175,7 @@ public:
_workSpaceSize = workSpaceSize;
_attention_unfused_workspace_offset = workSpaceSize - temp_size;
}
-inline size_t GetMaxTokenLenght() const { return _max_seq_len; }
+inline size_t GetMaxTokenLength() const { return _max_seq_len; }
cudaEvent_t GetCompEvent(int id) { return id == 1 ? _comp1_event : _comp2_event; }
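Not part of the commit: a minimal, self-contained C++ sketch of how the renamed accessor feeds the KV-cache offset arithmetic seen in the ds_softmax_context hunks above. The `InferenceContext` below is a hypothetical stand-in with made-up example values, not the real DeepSpeed class.

```cpp
#include <cstddef>
#include <iostream>

// Hypothetical stand-in for DeepSpeed's InferenceContext, used only to
// illustrate the corrected GetMaxTokenLength() name (was GetMaxTokenLenght).
struct InferenceContext {
    static InferenceContext& Instance() {
        static InferenceContext ctx;
        return ctx;
    }
    size_t GetMaxTokenLength() const { return _max_seq_len; }
    size_t _max_seq_len = 2048;  // assumed example value
};

int main() {
    // Assumed example dimensions, not taken from the commit.
    const size_t hidden_dim = 4096, bsz = 1, layer_id = 3;
    const size_t max_tokens = InferenceContext::Instance().GetMaxTokenLength();

    // Offset of this layer's KV cache inside the shared workspace,
    // mirroring the expression in the diff above.
    const size_t offset = 10 * (hidden_dim * bsz * max_tokens) +
                          layer_id * 2 * bsz * max_tokens * hidden_dim;

    // Values are stored max_tokens * hidden_dim elements after the keys.
    const size_t value_offset = bsz * max_tokens * hidden_dim;

    std::cout << "kv offset: " << offset
              << ", value offset: " << value_offset << "\n";
    return 0;
}
```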