Unverified commit 0331cfda, authored by H hong, committed by GitHub

Move embedding to phi (#39901)

* move embedding to phi;

* update sig; test=develop

* move reset impl to phi; test=develop

* remove old register; test=develop

* fix cpu bf16 bug; test=develop

* fix lookup speed error

* polish code

* fix paddle throw type
Parent 64c268b2
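In short, this change deletes the fluid-style REGISTER_OP_CPU_KERNEL / REGISTER_OP_CUDA_KERNEL registrations for lookup_table_v2, adds CPU and GPU phi kernels under the name embedding (plus SelectedRows "sparse weight" variants), and wires the old operator to the new kernels through an argument-mapping file. A condensed before/after of the registration change, taken from the hunks below and shown here only for orientation:

    // before (fluid, removed):
    REGISTER_OP_CPU_KERNEL(lookup_table_v2, ops::LookupTableV2Kernel<float>,
                           ops::LookupTableV2Kernel<double>,
                           ops::LookupTableV2Kernel<paddle::platform::bfloat16>);
    // after (phi, added):
    PD_REGISTER_KERNEL(embedding, CPU, ALL_LAYOUT, phi::EmbeddingKernel,
                       float, double, phi::dtype::bfloat16) {}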
@@ -203,14 +203,6 @@ REGISTER_OPERATOR(lookup_table_v2_grad, ops::LookupTableV2OpGrad,
ops::LookupTableV2GradOpNoBufferVarsInferer,
ops::LookupTableV2OpGradVarTypeInference);
REGISTER_OP_CPU_KERNEL(lookup_table_v2, ops::LookupTableV2Kernel<float>,
ops::LookupTableV2Kernel<double>,
ops::LookupTableV2Kernel<paddle::platform::bfloat16>);
REGISTER_OP_CPU_KERNEL(
lookup_table_v2_grad, ops::LookupTableV2GradKernel<float>,
ops::LookupTableV2GradKernel<double>,
ops::LookupTableV2GradKernel<paddle::platform::bfloat16>);
/* ========================== register checkpoint ===========================*/
REGISTER_OP_VERSION(lookup_table_v2)
.AddCheckpoint(
@@ -235,13 +235,3 @@ class LookupTableV2GradCUDAKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(lookup_table_v2, ops::LookupTableV2CUDAKernel<float>,
ops::LookupTableV2CUDAKernel<double>,
ops::LookupTableV2CUDAKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(lookup_table_v2_grad,
ops::LookupTableV2GradCUDAKernel<float>,
ops::LookupTableV2GradCUDAKernel<double>,
ops::LookupTableV2GradCUDAKernel<plat::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/embedding_grad_kernel.h"
#include "paddle/phi/kernels/funcs/embedding_util.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
struct EmbeddingGradCPUFunctor {
EmbeddingGradCPUFunctor(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& weight,
const DenseTensor& out_grad,
int64_t padding_idx,
DenseTensor* weight_grad)
: dev_ctx_(dev_ctx),
input_(input),
weight_(weight),
out_grad_(out_grad),
weight_grad_(weight_grad),
padding_idx_(padding_idx) {}
template <typename IdT>
void apply() {
DDim table_dim = weight_.dims();
auto ids = CopyIdsToVector<IdT, int64_t>(input_);
auto ids_num = static_cast<int64_t>(ids.size());
// Since paddings are not trainable and fixed in forward, the gradient of
// paddings makes no sense and we don't deal with it in backward.
{
auto* d_output = &out_grad_;
auto* ids_data = ids.data();
int64_t N = table_dim[0];
int64_t D = table_dim[1];
auto* d_output_data = d_output->template data<T>();
dev_ctx_.template Alloc<T>(weight_grad_);
auto* d_table_data = weight_grad_->data<T>();
memset(d_table_data, 0, weight_grad_->numel() * sizeof(T));
for (int64_t i = 0; i < ids_num; ++i) {
if (padding_idx_ != kNoPadding && ids_data[i] == padding_idx_) {
// the gradient of padding_idx should be 0, already done by memset, so
// do nothing.
} else {
PADDLE_ENFORCE_LT(
ids_data[i],
N,
phi::errors::InvalidArgument(
"Variable value (input) of "
"OP(paddle.nn.functional.embedding) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
N,
ids_data[i]));
PADDLE_ENFORCE_GE(
ids_data[i],
0,
phi::errors::InvalidArgument(
"Variable value (input) of "
"OP(paddle.nn.functional.embedding) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
N,
ids_data[i]));
for (int j = 0; j < D; ++j) {
d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j];
}
}
}
}
}
private:
const Context& dev_ctx_;
const DenseTensor& input_;
const DenseTensor& weight_;
const DenseTensor& out_grad_;
DenseTensor* weight_grad_;
int64_t padding_idx_;
};
template <typename T, typename Context>
void EmbeddingGradKernel(const Context& ctx,
const DenseTensor& input,
const DenseTensor& weight,
const DenseTensor& out_grad,
int64_t padding_idx,
DenseTensor* weight_grad) {
EmbeddingGradCPUFunctor<T, Context> functor(
ctx, input, weight, out_grad, padding_idx, weight_grad);
if (input.dtype() == phi::DataType::INT32) {
functor.template apply<int>();
} else if (input.dtype() == phi::DataType::INT64) {
functor.template apply<int64_t>();
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"emebdding input only support int32 and int64"));
}
}
template <typename T, typename Context>
struct EmbeddingSparseGradCPUFunctor {
EmbeddingSparseGradCPUFunctor(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& weight,
const DenseTensor& out_grad,
int64_t padding_idx,
SelectedRows* weight_grad)
: dev_ctx_(dev_ctx),
input_(input),
weight_(weight),
out_grad_(out_grad),
weight_grad_(weight_grad),
padding_idx_(padding_idx) {}
template <typename IdT>
void apply() {
DDim table_dim = weight_.dims();
auto ids = CopyIdsToVector<IdT, int64_t>(input_);
auto ids_num = static_cast<int64_t>(ids.size());
// Since paddings are not trainable and fixed in forward, the gradient of
// paddings makes no sense and we don't deal with it in backward.
auto* d_table = weight_grad_;
auto* d_output = &out_grad_;
d_table->set_rows(ids);
auto* d_table_value = d_table->mutable_value();
d_table_value->Resize({ids_num, table_dim[1]});
dev_ctx_.template Alloc<T>(d_table_value);
d_table->set_height(table_dim[0]);
auto* d_output_data = d_output->template data<T>();
auto* d_table_data = d_table_value->template data<T>();
auto d_output_dims = d_output->dims();
auto d_output_dims_2d =
flatten_to_2d(d_output_dims, d_output_dims.size() - 1);
PADDLE_ENFORCE_EQ(d_table_value->dims(),
d_output_dims_2d,
phi::errors::InvalidArgument(
"ShapeError: The shape of lookup_table@Grad and "
"output@Grad should be same. "
"But received lookup_table@Grad's shape = [%s], "
"output@Grad's shape = [%s].",
d_table_value->dims(),
d_output_dims_2d));
memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
}
private:
const Context& dev_ctx_;
const DenseTensor& input_;
const DenseTensor& weight_;
const DenseTensor& out_grad_;
SelectedRows* weight_grad_;
int64_t padding_idx_;
};
template <typename T, typename Context>
void EmbeddingSparseGradKernel(const Context& ctx,
const DenseTensor& input,
const DenseTensor& weight,
const DenseTensor& out_grad,
int64_t padding_idx,
SelectedRows* weight_grad) {
EmbeddingSparseGradCPUFunctor<T, Context> functor(
ctx, input, weight, out_grad, padding_idx, weight_grad);
if (input.dtype() == phi::DataType::INT32) {
functor.template apply<int>();
} else if (input.dtype() == phi::DataType::INT64) {
functor.template apply<int64_t>();
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"emebdding input only support int32 and int64"));
}
}
} // namespace phi
PD_REGISTER_KERNEL(embedding_grad,
CPU,
ALL_LAYOUT,
phi::EmbeddingGradKernel,
float,
double,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(embedding_sparse_grad,
CPU,
ALL_LAYOUT,
phi::EmbeddingSparseGradKernel,
float,
double,
phi::dtype::bfloat16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/embedding_kernel.h"
#include "paddle/phi/kernels/funcs/embedding_util.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/data_type.h"
namespace phi {
template <typename T, typename Context>
struct EmbeddingCPUFunctor {
EmbeddingCPUFunctor(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& weight,
int64_t padding_idx,
DenseTensor* out)
: dev_ctx_(dev_ctx),
input_(input),
weight_(weight),
out_(out),
padding_idx_(padding_idx) {}
template <typename IdT>
void apply() {
auto ids = CopyIdsToVector<IdT, int64_t>(input_);
auto ids_numel = static_cast<int64_t>(ids.size());
int64_t row_number = weight_.dims()[0];
int64_t row_width = weight_.dims()[1];
auto* table = weight_.data<T>();
dev_ctx_.template Alloc<T>(out_);
auto* output = out_->data<T>();
for (int64_t i = 0; i < ids_numel; ++i) {
if (padding_idx_ != kNoPadding && ids[i] == padding_idx_) {
memset(output + i * row_width, 0, row_width * sizeof(T));
} else {
PADDLE_ENFORCE_LT(
ids[i],
row_number,
phi::errors::InvalidArgument(
"Variable value (input) of OP(fluid.layers.embedding) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
row_number,
ids[i]));
PADDLE_ENFORCE_GE(
ids[i],
0,
phi::errors::InvalidArgument(
"Variable value (input) of OP(fluid.layers.embedding) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
row_number,
ids[i]));
memcpy(output + i * row_width,
table + ids[i] * row_width,
row_width * sizeof(T));
}
}
}
private:
const Context& dev_ctx_;
const DenseTensor& input_;
const DenseTensor& weight_;
DenseTensor* out_;
int64_t padding_idx_;
};
template <typename T, typename Context>
void EmbeddingKernel(const Context& ctx,
const DenseTensor& input,
const DenseTensor& weight,
int64_t padding_idx,
DenseTensor* out) {
EmbeddingCPUFunctor<T, Context> functor(ctx, input, weight, padding_idx, out);
if (input.dtype() == phi::DataType::INT32) {
functor.template apply<int>();
} else if (input.dtype() == phi::DataType::INT64) {
functor.template apply<int64_t>();
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"emebdding input only support int32 and int64"));
}
}
} // namespace phi
PD_REGISTER_KERNEL(embedding,
CPU,
ALL_LAYOUT,
phi::EmbeddingKernel,
float,
double,
phi::dtype::bfloat16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/sparse_weight_embedding_grad_kernel.h"
#include "paddle/phi/kernels/funcs/embedding_util.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/data_type.h"
namespace phi {
template <typename T, typename Context>
struct SparseWeightEmbeddingGradCPUFunctor {
SparseWeightEmbeddingGradCPUFunctor(const Context& dev_ctx,
const DenseTensor& input,
const SelectedRows& weight,
const DenseTensor& out_grad,
int64_t padding_idx,
DenseTensor* weight_grad)
: dev_ctx_(dev_ctx),
input_(input),
weight_(weight),
out_grad_(out_grad),
weight_grad_(weight_grad),
padding_idx_(padding_idx) {}
template <typename IdT>
void apply() {
DDim table_dim = weight_.dims();
auto ids = CopyIdsToVector<IdT, int64_t>(input_);
auto ids_num = static_cast<int64_t>(ids.size());
// Since paddings are not trainable and fixed in forward, the gradient of
// paddings makes no sense and we don't deal with it in backward.
{
auto* d_output = &out_grad_;
// auto d_table = weight_grad_;
auto* ids_data = ids.data();
int64_t N = table_dim[0];
int64_t D = table_dim[1];
auto* d_output_data = d_output->template data<T>();
dev_ctx_.template Alloc<T>(weight_grad_);
auto* d_table_data = weight_grad_->data<T>();
memset(d_table_data, 0, weight_grad_->numel() * sizeof(T));
for (int64_t i = 0; i < ids_num; ++i) {
if (padding_idx_ != kNoPadding && ids_data[i] == padding_idx_) {
// the gradient of padding_idx should be 0, already done by memset, so
// do nothing.
} else {
PADDLE_ENFORCE_LT(
ids_data[i],
N,
phi::errors::InvalidArgument(
"Variable value (input) of "
"OP(paddle.nn.functional.embedding) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
N,
ids_data[i]));
PADDLE_ENFORCE_GE(
ids_data[i],
0,
phi::errors::InvalidArgument(
"Variable value (input) of "
"OP(paddle.nn.functional.embedding) "
"expected >= 0 and < %ld, but got %ld. Please check input "
"value.",
N,
ids_data[i]));
for (int j = 0; j < D; ++j) {
d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j];
}
}
}
}
}
private:
const Context& dev_ctx_;
const DenseTensor& input_;
const SelectedRows& weight_;
const DenseTensor& out_grad_;
DenseTensor* weight_grad_;
int64_t padding_idx_;
};
template <typename T, typename Context>
struct SparseWeightEmbeddingSparseGradCPUFunctor {
SparseWeightEmbeddingSparseGradCPUFunctor(const Context& dev_ctx,
const DenseTensor& input,
const SelectedRows& weight,
const DenseTensor& out_grad,
int64_t padding_idx,
SelectedRows* weight_grad)
: dev_ctx_(dev_ctx),
input_(input),
weight_(weight),
out_grad_(out_grad),
weight_grad_(weight_grad),
padding_idx_(padding_idx) {}
template <typename IdT>
void apply() {
DDim table_dim = weight_.dims();
auto ids = CopyIdsToVector<IdT, int64_t>(input_);
auto ids_num = static_cast<int64_t>(ids.size());
// Since paddings are not trainable and fixed in forward, the gradient of
// paddings makes no sense and we don't deal with it in backward.
auto* d_table = weight_grad_;
auto* d_output = &out_grad_;
d_table->set_rows(ids);
auto* d_table_value = d_table->mutable_value();
d_table_value->Resize({ids_num, table_dim[1]});
dev_ctx_.template Alloc<T>(d_table_value);
d_table->set_height(table_dim[0]);
auto* d_output_data = d_output->template data<T>();
auto* d_table_data = d_table_value->template data<T>();
auto d_output_dims = d_output->dims();
auto d_output_dims_2d =
phi::flatten_to_2d(d_output_dims, d_output_dims.size() - 1);
PADDLE_ENFORCE_EQ(d_table_value->dims(),
d_output_dims_2d,
phi::errors::InvalidArgument(
"ShapeError: The shape of lookup_table@Grad and "
"output@Grad should be same. "
"But received lookup_table@Grad's shape = [%s], "
"output@Grad's shape = [%s].",
d_table_value->dims(),
d_output_dims_2d));
memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
}
private:
const Context& dev_ctx_;
const DenseTensor& input_;
const SelectedRows& weight_;
const DenseTensor& out_grad_;
SelectedRows* weight_grad_;
int64_t padding_idx_;
};
template <typename T, typename Context>
void SparseWeightEmbeddingGradKernel(const Context& ctx,
const DenseTensor& input,
const SelectedRows& weight,
const DenseTensor& out_grad,
int64_t padding_idx,
DenseTensor* weight_grad) {
SparseWeightEmbeddingGradCPUFunctor<T, Context> functor(
ctx, input, weight, out_grad, padding_idx, weight_grad);
if (input.dtype() == phi::DataType::INT32) {
functor.template apply<int>();
} else if (input.dtype() == phi::DataType::INT64) {
functor.template apply<int64_t>();
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"emebdding input only support int32 and int64"));
}
}
template <typename T, typename Context>
void SparseWeightEmbeddingSparseGradKernel(const Context& ctx,
const DenseTensor& input,
const SelectedRows& weight,
const DenseTensor& out_grad,
int64_t padding_idx,
SelectedRows* weight_grad) {
SparseWeightEmbeddingSparseGradCPUFunctor<T, Context> functor(
ctx, input, weight, out_grad, padding_idx, weight_grad);
if (input.dtype() == phi::DataType::INT32) {
functor.template apply<int>();
} else if (input.dtype() == phi::DataType::INT64) {
functor.template apply<int64_t>();
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"emebdding input only support int32 and int64"));
}
}
} // namespace phi
PD_REGISTER_KERNEL(sparse_weight_embedding_grad,
CPU,
ALL_LAYOUT,
phi::SparseWeightEmbeddingGradKernel,
float,
double,
phi::dtype::bfloat16) {}
PD_REGISTER_KERNEL(sparse_weight_embedding_sparse_grad,
CPU,
ALL_LAYOUT,
phi::SparseWeightEmbeddingSparseGradKernel,
float,
double,
phi::dtype::bfloat16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/embedding_kernel.h"
#include "paddle/phi/kernels/funcs/embedding_util.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/data_type.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
namespace phi {
template <typename T, typename Context>
struct EmbeddingCPUSparseFunctor {
EmbeddingCPUSparseFunctor(const Context& dev_ctx,
const DenseTensor& input,
const SelectedRows& weight,
int64_t padding_idx,
DenseTensor* out)
: dev_ctx_(dev_ctx),
input_(input),
weight_(weight),
out_(out),
padding_idx_(padding_idx) {}
template <typename IdT>
void apply() {
auto ids = CopyIdsToVector<IdT, int64_t>(input_);
auto ids_numel = static_cast<int64_t>(ids.size());
const auto& table_t = weight_;
auto output_t = out_;
int64_t row_width = table_t.value().dims()[1];
const auto* table = table_t.value().template data<T>();
auto* output = dev_ctx_.template Alloc<T>(output_t);
auto input_data_type =
paddle::framework::TransToProtoVarType(table_t.value().dtype());
for (int64_t i = 0; i < ids_numel; ++i) {
if (padding_idx_ != kNoPadding && ids[i] == padding_idx_) {
memset(output + i * row_width, 0, row_width * sizeof(T));
} else {
PADDLE_ENFORCE_GE(
ids[i],
0,
phi::errors::InvalidArgument(
"Variable value (input) of OP(fluid.layers.embedding) "
"expected >= 0. But received %ld",
ids[i]));
auto id_index = table_t.Index(ids[i]);
PADDLE_ENFORCE_GE(
id_index,
0,
phi::errors::InvalidArgument(
"the input key should be exists. But received %d.", id_index));
if (input_data_type == paddle::framework::proto::VarType::BF16) {
memcpy(output + i * row_width,
table + id_index * row_width,
row_width * sizeof(T));
} else {
auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(dev_ctx_);
blas.VCOPY(
row_width, table + id_index * row_width, output + i * row_width);
}
}
}
}
private:
const Context& dev_ctx_;
const DenseTensor& input_;
const SelectedRows& weight_;
DenseTensor* out_;
int64_t padding_idx_;
};
template <typename T, typename Context>
void SparseWeightEmbeddingKernel(const Context& ctx,
const DenseTensor& input,
const SelectedRows& weight,
int64_t padding_idx,
DenseTensor* out) {
EmbeddingCPUSparseFunctor<T, Context> functor(
ctx, input, weight, padding_idx, out);
if (input.dtype() == phi::DataType::INT32) {
functor.template apply<int>();
} else if (input.dtype() == phi::DataType::INT64) {
functor.template apply<int64_t>();
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"emebdding input only support int32 and int64"));
}
}
} // namespace phi
PD_REGISTER_KERNEL(sparse_weight_embedding,
CPU,
ALL_LAYOUT,
phi::SparseWeightEmbeddingKernel,
float,
double,
phi::dtype::bfloat16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/selected_rows.h"
namespace phi {
template <typename T, typename Context>
void EmbeddingGradKernel(const Context& ctx,
const DenseTensor& input,
const DenseTensor& weight,
const DenseTensor& out_grad,
int64_t padding_idx,
DenseTensor* weight_grad);
template <typename T, typename Context>
void EmbeddingSparseGradKernel(const Context& ctx,
const DenseTensor& input,
const DenseTensor& weight,
const DenseTensor& out_grad,
int64_t padding_idx,
SelectedRows* weight_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void EmbeddingKernel(const Context& ctx,
                     const DenseTensor& input,
const DenseTensor& weight,
int64_t padding_idx,
DenseTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
constexpr int64_t kNoPadding = -1;
template <typename InT, typename OutT>
static std::vector<OutT> CopyIdsToVector(const DenseTensor &ids) {
auto numel = ids.numel();
const auto *src = ids.data<InT>();
std::vector<OutT> ret(numel);
if (std::is_same<InT, OutT>::value) {
std::memcpy(ret.data(), src, numel * sizeof(InT));
} else {
for (decltype(numel) i = 0; i < numel; ++i) {
ret[i] = src[i];
}
}
return ret;
}
} // namespace phi
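For reference, a minimal standalone sketch of the pattern the kernels above pair with this helper: dispatch on the ids dtype, then normalize the ids to int64_t before indexing the table. The CopyIds function and main below are illustrative only (no Paddle headers, plain arrays instead of DenseTensor) and are not part of the change:

    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <type_traits>
    #include <vector>

    // Mirrors phi::CopyIdsToVector: raw memcpy when the id types match,
    // element-wise cast otherwise.
    template <typename InT, typename OutT>
    std::vector<OutT> CopyIds(const InT* src, int64_t numel) {
      std::vector<OutT> ret(numel);
      if (std::is_same<InT, OutT>::value) {
        std::memcpy(ret.data(), src, numel * sizeof(InT));
      } else {
        for (int64_t i = 0; i < numel; ++i) ret[i] = static_cast<OutT>(src[i]);
      }
      return ret;
    }

    int main() {
      const int32_t ids[] = {3, 0, 7};                 // e.g. an INT32 ids tensor
      auto ids64 = CopyIds<int32_t, int64_t>(ids, 3);  // normalized to int64_t
      for (auto v : ids64) std::cout << v << ' ';      // prints: 3 0 7
      return 0;
    }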
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/embedding_grad_kernel.h"
#include "paddle/phi/kernels/funcs/embedding_util.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
namespace phi {
template <typename InT, typename OutT>
__global__ void InputTypeConvert(const InT* in_ids,
const int64_t K,
OutT* out_ids) {
for (int i = 0; i < K; i++) {
out_ids[i] = static_cast<OutT>(in_ids[i]);
}
}
template <typename T, typename IdT>
__global__ void EmbeddingGrad(T* table,
const T* output,
const IdT* ids,
const int64_t N,
const int64_t K,
const int64_t D) {
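  // table: [N, D] weight-gradient buffer (zeroed by the caller before launch);
  // output: [K, D] upstream gradient; ids: K lookup indices. Rows of output are
  // distributed over (blockIdx.x, threadIdx.y); threads along x accumulate the
  // D columns into the matching table row with atomic adds.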
int idx = threadIdx.x;
int idy = blockIdx.x + threadIdx.y * gridDim.x;
while (idy < K) {
auto id = static_cast<int64_t>(ids[idy]);
const T* out = output + idy * D;
T* tab = table + id * D;
#ifdef PADDLE_WITH_CUDA
paddle::platform::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab);
#else
for (int i = idx; i < D; i += blockDim.x) {
paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
}
#endif
idy += blockDim.y * gridDim.x;
}
}
template <typename T, typename Context>
struct EmbeddingGradCUDAFunctor {
EmbeddingGradCUDAFunctor(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& weight,
const DenseTensor& out_grad,
int64_t padding_idx,
DenseTensor* weight_grad)
: dev_ctx_(dev_ctx),
input_(input),
weight_(weight),
out_grad_(out_grad),
padding_idx_(padding_idx),
weight_grad_(weight_grad) {}
template <typename IdT>
void apply() {
// Since paddings are not trainable and fixed in forward, the gradient of
// paddings makes no sense and we don't deal with it in backward.
{
auto d_output_t = out_grad_;
auto d_table_t = weight_grad_;
int N = weight_grad_->dims()[0];
int D = weight_grad_->dims()[1];
int K = input_.numel();
const T* d_output = d_output_t.template data<T>();
const auto* ids = input_.template data<IdT>();
T* d_table = dev_ctx_.template Alloc<T>(d_table_t);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx_.stream()));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx_.stream()));
#endif
const int gridx = 2 * dev_ctx_.GetSMCount();
dim3 threads(128, 8);
dim3 grids(gridx, 1);
EmbeddingGrad<T, IdT><<<grids, threads, 0, dev_ctx_.stream()>>>(
d_table, d_output, ids, N, K, D);
}
}
private:
const phi::GPUContext& dev_ctx_;
const DenseTensor& input_;
const DenseTensor& weight_;
const DenseTensor& out_grad_;
int64_t padding_idx_;
DenseTensor* weight_grad_;
};
template <typename T, typename Context>
void EmbeddingGradKernel(const Context& ctx,
const DenseTensor& input,
const DenseTensor& weight,
const DenseTensor& out_grad,
int64_t padding_idx,
DenseTensor* weight_grad) {
EmbeddingGradCUDAFunctor<T, Context> functor(
ctx, input, weight, out_grad, padding_idx, weight_grad);
if (input.dtype() == phi::DataType::INT32) {
functor.template apply<int>();
} else if (input.dtype() == phi::DataType::INT64) {
functor.template apply<int64_t>();
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"emebdding input only support int32 and int64"));
}
}
template <typename T, typename Context>
struct EmbeddingSparseGradCUDAFunctor {
EmbeddingSparseGradCUDAFunctor(const Context& dev_ctx,
const DenseTensor& input,
const DenseTensor& weight,
const DenseTensor& out_grad,
int64_t padding_idx,
SelectedRows* weight_grad)
: dev_ctx_(dev_ctx),
input_(input),
weight_(weight),
out_grad_(out_grad),
padding_idx_(padding_idx),
weight_grad_(weight_grad) {}
template <typename IdT>
void apply() {
// Since paddings are not trainable and fixed in forward, the gradient of
// paddings makes no sense and we don't deal with it in backward.
const auto* ids_data = input_.template data<IdT>();
auto* d_table = weight_grad_;
auto* table = &weight_;
auto* d_output = &out_grad_;
int64_t ids_num = input_.numel();
dim3 threads(128, 8);
dim3 grids(8, 1);
auto stream = dev_ctx_.stream();
paddle::framework::Vector<int64_t> new_rows;
new_rows.resize(ids_num);
auto gpu_place = dev_ctx_.GetPlace();
paddle::framework::MixVector<int64_t> mixv_new_rows(&new_rows);
if (!std::is_same<IdT, int64_t>::value) {
InputTypeConvert<<<grids, threads, 0, stream>>>(
ids_data, ids_num, mixv_new_rows.MutableData(gpu_place));
} else {
paddle::memory::Copy(gpu_place,
mixv_new_rows.CUDAMutableData(gpu_place),
gpu_place,
ids_data,
ids_num * sizeof(int64_t),
stream);
}
mixv_new_rows.CopyToCPU();
d_table->set_rows(new_rows);
auto* d_table_value = d_table->mutable_value();
d_table_value->Resize({ids_num, table->dims()[1]});
dev_ctx_.template Alloc<T>(d_table_value);
auto* d_table_data = d_table_value->template data<T>();
auto* d_output_data = d_output->template data<T>();
auto d_output_dims = d_output->dims();
auto d_output_dims_2d =
phi::flatten_to_2d(d_output_dims, d_output_dims.size() - 1);
PADDLE_ENFORCE_EQ(d_table_value->dims(),
d_output_dims_2d,
phi::errors::InvalidArgument(
"ShapeError: The shape of lookup_table@Grad and "
"output@Grad should be same. "
"But received lookup_table@Grad's shape = [%s], "
"output@Grad's shape = [%s].",
d_table_value->dims(),
d_output_dims_2d));
paddle::memory::Copy(gpu_place,
d_table_data,
gpu_place,
d_output_data,
d_output->numel() * sizeof(T),
stream);
}
private:
const phi::GPUContext& dev_ctx_;
const DenseTensor& input_;
const DenseTensor& weight_;
const DenseTensor& out_grad_;
int64_t padding_idx_;
SelectedRows* weight_grad_;
};
template <typename T, typename Context>
void EmbeddingSparseGradKernel(const Context& ctx,
const DenseTensor& input,
const DenseTensor& weight,
const DenseTensor& out_grad,
int64_t padding_idx,
SelectedRows* weight_grad) {
EmbeddingSparseGradCUDAFunctor<T, Context> functor(
ctx, input, weight, out_grad, padding_idx, weight_grad);
if (input.dtype() == phi::DataType::INT32) {
functor.template apply<int>();
} else if (input.dtype() == phi::DataType::INT64) {
functor.template apply<int64_t>();
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"emebdding input only support int32 and int64"));
}
}
} // namespace phi
PD_REGISTER_KERNEL(embedding_grad,
GPU,
ALL_LAYOUT,
phi::EmbeddingGradKernel,
float,
double,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(embedding_sparse_grad,
GPU,
ALL_LAYOUT,
phi::EmbeddingSparseGradKernel,
float,
double,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/embedding_kernel.h"
#include "paddle/phi/kernels/funcs/embedding_util.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
namespace phi {
template <typename T, typename IdT, bool PaddingFlag>
__global__ void EmbeddingFW(T *output,
const T *table,
const IdT *ids,
const int64_t N,
const int64_t K,
const int64_t D,
const int64_t padding_idx) {
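  // output: [K, D] destination; table: [N, D] embedding weights; ids: K lookup
  // indices. Rows are distributed over (blockIdx.x, threadIdx.y); threads along
  // x copy the D columns, writing zeros instead when PaddingFlag is set and the
  // id equals padding_idx.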
int idx = threadIdx.x;
int idy = blockIdx.x + threadIdx.y * gridDim.x;
while (idy < K) {
auto id = static_cast<int64_t>(ids[idy]);
T *out = output + idy * D;
const T *tab = table + id * D;
for (int i = idx; i < D; i += blockDim.x) {
if (PaddingFlag) {
if (id == padding_idx)
out[i] = static_cast<T>(0);
else
out[i] = tab[i];
} else {
out[i] = tab[i];
}
}
idy += blockDim.y * gridDim.x;
}
}
template <typename T, typename Context>
struct EmbeddingCUDAFunctor {
EmbeddingCUDAFunctor(const Context &dev_ctx,
const DenseTensor &input,
const DenseTensor &weight,
int64_t padding_idx,
DenseTensor *out)
: dev_ctx_(dev_ctx),
input_(input),
weight_(weight),
out_(out),
padding_idx_(padding_idx) {}
template <typename IdT>
void apply() {
size_t N = weight_.dims()[0];
size_t D = weight_.dims()[1];
size_t K = input_.numel();
const int gridx = 2 * dev_ctx_.GetSMCount();
dim3 threads(256, 4);
dim3 grids(gridx, 1);
const T *table = weight_.template data<T>();
const IdT *ids = input_.template data<IdT>();
auto *output = dev_ctx_.template Alloc<T>(out_);
auto stream = dev_ctx_.stream();
if (padding_idx_ == -1) {
EmbeddingFW<T, IdT, false><<<grids, threads, 0, stream>>>(
output, table, ids, N, K, D, padding_idx_);
} else {
EmbeddingFW<T, IdT, true><<<grids, threads, 0, stream>>>(
output, table, ids, N, K, D, padding_idx_);
}
}
private:
const phi::GPUContext &dev_ctx_;
const DenseTensor &input_;
const DenseTensor &weight_;
DenseTensor *out_;
int64_t padding_idx_;
};
template <typename T, typename Context>
void EmbeddingKernel(const Context &ctx,
const DenseTensor &input,
const DenseTensor &weight,
int64_t padding_idx,
DenseTensor *out) {
EmbeddingCUDAFunctor<T, Context> functor(
ctx, input, weight, padding_idx, out);
if (input.dtype() == phi::DataType::INT32) {
functor.template apply<int32_t>();
} else if (input.dtype() == phi::DataType::INT64) {
functor.template apply<int64_t>();
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"emebdding input only support int32 and int64"));
}
}
} // namespace phi
PD_REGISTER_KERNEL(embedding,
GPU,
ALL_LAYOUT,
phi::EmbeddingKernel,
float,
double,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/selected_rows.h"
namespace phi {
template <typename T, typename Context>
void SparseWeightEmbeddingGradKernel(const Context& ctx,
const DenseTensor& input,
const SelectedRows& weight,
const DenseTensor& out_grad,
int64_t padding_idx,
DenseTensor* weight_grad);
template <typename T, typename Context>
void SparseWeightEmbeddingSparseGradKernel(const Context& ctx,
const DenseTensor& input,
const SelectedRows& weight,
const DenseTensor& out_grad,
int64_t padding_idx,
SelectedRows* weight_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/selected_rows.h"
namespace phi {
template <typename T, typename Context>
void SparseWeightEmbeddingKernel(const Context& ctx,
                                 const DenseTensor& input,
const SelectedRows& weight,
int64_t padding_idx,
DenseTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature EmbeddingOpArgumentMapping(const ArgumentMappingContext& ctx) {
if (ctx.IsDenseTensorInput("W")) {
return KernelSignature("embedding", {"Ids", "W"}, {"padding_idx"}, {"Out"});
} else {
return KernelSignature(
"sparse_weight_embedding", {"Ids", "W"}, {"padding_idx"}, {"Out"});
}
}
KernelSignature EmbeddingGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
if (ctx.IsDenseTensorInput("W")) {
if ((paddle::any_cast<bool>(ctx.Attr("is_sparse"))) == true) {
return KernelSignature("embedding_sparse_grad",
{"Ids", "W", GradVarName("Out")},
{"padding_idx"},
{GradVarName("W")});
} else {
return KernelSignature("embedding_grad",
{"Ids", "W", GradVarName("Out")},
{"padding_idx"},
{GradVarName("W")});
}
} else {
if ((paddle::any_cast<bool>(ctx.Attr("is_sparse"))) == true) {
return KernelSignature("sparse_weight_embedding_sparse_grad",
{"Ids", "W", GradVarName("Out")},
{"padding_idx"},
{GradVarName("W")});
} else {
return KernelSignature("sparse_weight_embedding_grad",
{"Ids", "W", GradVarName("Out")},
{"padding_idx"},
{GradVarName("W")});
}
}
}
} // namespace phi
PD_REGISTER_BASE_KERNEL_NAME(lookup_table_v2, embedding);
PD_REGISTER_BASE_KERNEL_NAME(lookup_table_v2_grad, embedding_grad);
PD_REGISTER_ARG_MAPPING_FN(lookup_table_v2, phi::EmbeddingOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(lookup_table_v2_grad,
phi::EmbeddingGradOpArgumentMapping);
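For readability, the grad mapping above resolves to one of four phi kernels (this is a restatement of EmbeddingGradOpArgumentMapping, not new behavior):

    W is DenseTensor,   is_sparse == false  ->  embedding_grad
    W is DenseTensor,   is_sparse == true   ->  embedding_sparse_grad
    W is SelectedRows,  is_sparse == false  ->  sparse_weight_embedding_grad
    W is SelectedRows,  is_sparse == true   ->  sparse_weight_embedding_sparse_grad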