diff --git a/paddle/fluid/operators/collective/c_embedding_op.cc b/paddle/fluid/operators/collective/c_embedding_op.cc
index 3055e2ceb23dd239cf98188aa81a0d783b4f9e96..7c5ec23ca68ba94137528f6902e66989fcac8d78 100644
--- a/paddle/fluid/operators/collective/c_embedding_op.cc
+++ b/paddle/fluid/operators/collective/c_embedding_op.cc
@@ -46,6 +46,17 @@ class CEmbeddingOp : public framework::OperatorWithKernel {
         framework::proto::VarType::LOD_TENSOR) {
       ctx->ShareLoD("Ids", /*->*/ "Out");
     }
+
+    // check valid
+    const int64_t height = table_dims[0];
+    const int64_t width = table_dims[1];
+    const int64_t start_idx = ctx->Attrs().Get<int64_t>("start_index");
+
+    PADDLE_ENFORCE_EQ(
+        (height > 0 && width > 0 && start_idx >= 0), true,
+        platform::errors::InvalidArgument(
+            "height:%ld width:%ld start_idx:%ld must not have negative values",
+            height, width, start_idx));
   }
 
  protected:
@@ -63,7 +74,7 @@ class CEmbeddingOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor) The input represents embedding tensors, "
              "which is a learnable parameter.");
     AddInput("Ids",
-             "An input with type int64 "
+             "An input with type int32 or int64 on CPU and GPU, int32 on NPU, which "
              "contains the ids to be looked up in W.");
     AddOutput("Out", "The lookup results, which have the same type as W.");
@@ -111,6 +122,21 @@ class CEmbeddingOpGrad : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     auto table_dims = ctx->GetInputDim("W");
     ctx->SetOutputDim(framework::GradVarName("W"), table_dims);
+
+    // check valid
+    PADDLE_ENFORCE_EQ(table_dims.size(), 2,
+                      platform::errors::InvalidArgument(
+                          "The dims of table_t must be 2"));
+
+    const int64_t start_idx = ctx->Attrs().Get<int64_t>("start_index");
+    const int64_t height = table_dims[0];
+    const int64_t width = table_dims[1];
+
+    PADDLE_ENFORCE_EQ(
+        (height > 0 && width > 0 && start_idx >= 0), true,
+        platform::errors::InvalidArgument(
+            "height:%ld width:%ld start_idx:%ld must not have negative values",
+            height, width, start_idx));
   }
 
  protected:
@@ -137,6 +163,7 @@ class CEmbeddingOpGradVarTypeInference : public framework::VarTypeInference {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OPERATOR(c_embedding, ops::CEmbeddingOp, ops::CEmbeddingOpMaker,
                   ops::CEmbeddingGradOpMaker<paddle::framework::OpDesc>,
                   ops::CEmbeddingGradOpMaker<paddle::imperative::OpBase>);
@@ -146,4 +173,9 @@ REGISTER_OPERATOR(c_embedding_grad, ops::CEmbeddingOpGrad,
                   ops::CEmbeddingOpGradVarTypeInference);
 
 REGISTER_OP_CPU_KERNEL(c_embedding, ops::CEmbeddingOpCPUKernel<float>,
-                       ops::CEmbeddingOpCPUKernel<double>);
+                       ops::CEmbeddingOpCPUKernel<double>,
+                       ops::CEmbeddingOpCPUKernel<plat::float16>);
+
+REGISTER_OP_CPU_KERNEL(c_embedding_grad, ops::CEmbeddingGradOpCPUKernel<float>,
+                       ops::CEmbeddingGradOpCPUKernel<double>,
+                       ops::CEmbeddingGradOpCPUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_embedding_op.cu b/paddle/fluid/operators/collective/c_embedding_op.cu
index ecf3887eef4ac6a8af7538789ec5fc56691b83bb..858ca79f85b0e66b830e4ca78098a84e15d25df9 100644
--- a/paddle/fluid/operators/collective/c_embedding_op.cu
+++ b/paddle/fluid/operators/collective/c_embedding_op.cu
@@ -105,6 +105,9 @@ class CEmbeddingCUDAKernel : public framework::OpKernel<T> {
       CEmbedding<T, int64_t><<<blocks, threads, 0, stream>>>(
           output, table, ids_t->data<int64_t>(), K, D, N, start_idx, end_idx,
          limit);
+    } else {
+      PADDLE_THROW(platform::errors::Unavailable(
+          "GPU c_embedding ids only support int32 or int64."));
     }
   }
 };
diff --git a/paddle/fluid/operators/collective/c_embedding_op.h b/paddle/fluid/operators/collective/c_embedding_op.h
index 3cab6d7184441df4c87382904e7a1d35caddfbca..1d410847dd798b6c442cf2008f0ccd53c8e3a319 100644
--- a/paddle/fluid/operators/collective/c_embedding_op.h
+++ b/paddle/fluid/operators/collective/c_embedding_op.h
@@ -27,12 +27,115 @@ namespace operators {
 
 using LoDTensor = framework::LoDTensor;
 
+inline void CheckTableValid() {}
+
+template <typename TIds, typename TData>
+void GetIdsEmbedding(const TIds* ids, size_t ids_len, int64_t start_idx,
+                     const TData* table, int64_t height, int64_t width,
+                     TData* out) {
+  for (size_t i = 0; i < ids_len; i++) {
+    TIds id = ids[i];
+    int64_t local = id - start_idx;
+
+    if (local >= 0 && local < height) {
+      // for (int64_t w = 0; w < width; w++) {
+      //   out[i * width + w] = table[local * width + w];
+      // }
+
+      memcpy(out + i * width, table + local * width, width * sizeof(TData));
+    } else {
+      memset(out + i * width, 0, width * sizeof(TData));
+    }
+  }
+}
+
 template <typename T>
 class CEmbeddingOpCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Do not support c_embedding for cpu kernel now."));
+    auto* table_t = ctx.Input<LoDTensor>("W");
+    auto* ids_t = ctx.Input<LoDTensor>("Ids");
+    auto* output_t = ctx.Output<LoDTensor>("Out");
+    const int64_t start_idx = ctx.Attr<int64_t>("start_index");
+
+    VLOG(10) << "table_dims:" << table_t->dims();
+
+    const T* table_data = table_t->data<T>();
+    T* output_data = output_t->mutable_data<T>(ctx.GetPlace());
+
+    const int64_t height = table_t->dims()[0];
+    const int64_t width = table_t->dims()[1];
+
+    const auto& index_type = ids_t->type();
+    if (index_type == framework::proto::VarType::INT32) {
+      GetIdsEmbedding(ids_t->data<int32_t>(), ids_t->numel(), start_idx,
+                      table_data, height, width, output_data);
+    } else if (index_type == framework::proto::VarType::INT64) {
+      GetIdsEmbedding(ids_t->data<int64_t>(), ids_t->numel(), start_idx,
+                      table_data, height, width, output_data);
+    } else {
+      PADDLE_THROW(platform::errors::Unavailable(
+          "CPU c_embedding ids only support int32 or int64."));
+    }
+  }
+};
+
+template <typename TIds, typename TData>
+void UpdateEmbedding(const TIds* ids, size_t ids_len, int64_t start_idx,
+                     TData* table, int64_t height, int64_t width,
+                     const TData* out) {
+  for (size_t i = 0; i < ids_len; i++) {
+    TIds id = ids[i];
+    int64_t local = id - start_idx;
+
+    if (local >= 0 && local < height) {
+      for (int64_t w = 0; w < width; w++) {
+        table[local * width + w] += out[i * width + w];
+      }
+    }
+  }
+}
+
+template <typename T>
+class CEmbeddingGradOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const int64_t start_idx = context.Attr<int64_t>("start_index");
+    auto ids_t = context.Input<LoDTensor>("Ids");
+    auto d_output_t = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto table_t = context.Input<LoDTensor>("W");
+    auto table_grad_t = context.Output<LoDTensor>(framework::GradVarName("W"));
+
+    T* table_grad_data =
+        table_grad_t->mutable_data<T>(table_t->dims(), context.GetPlace());
+
+    size_t table_t_mem_size =
+        table_t->numel() * framework::SizeOfType(table_grad_t->type());
+    size_t table_grad_t_mem_size =
+        table_grad_t->numel() * framework::SizeOfType(table_grad_t->type());
+
+    VLOG(10) << "table_dims:" << table_t->dims()
+             << ", table_t memory_size:" << table_t_mem_size
+             << ", table_grad_t memory_size:" << table_grad_t_mem_size
+             << ", start_index:" << start_idx;
+
+    memset(table_grad_data, 0, table_grad_t_mem_size);
+    const T* d_output_data = d_output_t->data<T>();
+
+    const int64_t height = table_t->dims()[0];
+    const int64_t width = table_t->dims()[1];
+
+    const auto& index_type = ids_t->type();
+    if (index_type == framework::proto::VarType::INT32) {
+      UpdateEmbedding(ids_t->data<int32_t>(), ids_t->numel(), start_idx,
+                      table_grad_data, height, width, d_output_data);
+    } else if (index_type == framework::proto::VarType::INT64) {
+      UpdateEmbedding(ids_t->data<int64_t>(), ids_t->numel(), start_idx,
+                      table_grad_data, height, width, d_output_data);
+    } else {
+      PADDLE_THROW(platform::errors::Unavailable(
+          "CPU c_embedding ids only support int32 or int64."));
+    }
   }
 };
diff --git a/paddle/fluid/operators/collective/c_embedding_op_npu.cc b/paddle/fluid/operators/collective/c_embedding_op_npu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c2d607223868a22a28bffea30637c74146a54b2b
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_embedding_op_npu.cc
@@ -0,0 +1,244 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <memory>
+#include <string>
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/collective/c_embedding_op.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
+#include "paddle/fluid/platform/npu_info.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+inline void FillNPU(Tensor *dst, T val,
+                    const framework::ExecutionContext &context) {
+  Tensor value(dst->type());
+  value.mutable_data<T>({1}, context.GetPlace());
+  FillNpuTensorWithConstant<T>(&value, static_cast<T>(val));
+
+  auto stream =
+      context.template device_context<paddle::platform::NPUDeviceContext>()
+          .stream();
+
+  const auto &runner = NpuOpRunner(
+      "FillD", {value}, {*dst}, {{"dims", framework::vectorize(dst->dims())}});
+  runner.Run(stream);
+}
+
+template <typename T>
+void shard_index(const Tensor &table_t, const Tensor &ids_t, int64_t start_idx,
+                 const Tensor &id_t,
+                 const framework::ExecutionContext &context) {
+  const int height = table_t.dims()[0];
+
+  auto stream =
+      context.template device_context<paddle::platform::NPUDeviceContext>()
+          .stream();
+  framework::Tensor id_t_d;
+  id_t_d.mutable_data<T>(ids_t.dims(), context.GetPlace());
+  FillNPU(&id_t_d, static_cast<T>(0.0), context);
+  id_t_d.Resize(ids_t.dims());
+
+  framework::Tensor id_t_u;
+  id_t_u.mutable_data<T>(ids_t.dims(), context.GetPlace());
+  FillNPU(&id_t_u, static_cast<T>(height - 1), context);
+  id_t_u.Resize(ids_t.dims());
+
+  framework::Tensor id_matched_d;
+  id_matched_d.mutable_data<bool>(ids_t.dims(), context.GetPlace());
+  framework::Tensor id_matched_u;
+  id_matched_u.mutable_data<bool>(ids_t.dims(), context.GetPlace());
+  framework::Tensor ignore_tensor;
+  ignore_tensor.mutable_data<T>(ids_t.dims(), context.GetPlace());
+  FillNPU(&ignore_tensor, static_cast<T>(height), context);
+  ignore_tensor.Resize(ids_t.dims());
+
+  NpuOpRunner sub_runner;
+  sub_runner.SetType("Sub")
+      .AddInput(ids_t)
+      .AddInput(std::vector<T>{static_cast<T>(start_idx)})
+      .AddOutput(id_t);
+  sub_runner.Run();
+
+  NpuOpRunner lessequal1_runner;
+  lessequal1_runner.SetType("LessEqual")
+      .AddInput(id_t)
+      .AddInput(id_t_u)
+      .AddOutput(id_matched_u);
+  lessequal1_runner.Run();
+
+  NpuOpRunner lessequal2_runner;
+  lessequal2_runner.SetType("LessEqual")
+      .AddInput(id_t_d)
+      .AddInput(id_t)
+      .AddOutput(id_matched_d);
+  lessequal2_runner.Run();
+
+  NpuOpRunner("Equal", {id_matched_u, id_matched_d}, {id_matched_d}, {})
+      .Run(stream);
+  NpuOpRunner("Select", {id_matched_d, id_t, ignore_tensor}, {id_t}, {})
+      .Run(stream);
+}
+
+template <typename TIds, typename T>
+void NPUGetIdsEmbedding(const framework::ExecutionContext &context) {
+  auto *table_t = context.Input<LoDTensor>("W");
+  auto *ids_t = context.Input<LoDTensor>("Ids");
+  auto *output_t = context.Output<LoDTensor>("Out");
+  const int64_t start_idx = context.Attr<int64_t>("start_index");
+
+  auto stream =
+      context.template device_context<paddle::platform::NPUDeviceContext>()
+          .stream();
+
+  framework::Tensor ids_t_local;
+  ids_t_local.mutable_data<TIds>(ids_t->dims(), context.GetPlace());
+  shard_index<TIds>(*table_t, *ids_t, start_idx, ids_t_local, context);
+
+  auto pad_shape =
+      framework::make_ddim({table_t->dims()[0] + 1, table_t->dims()[1]});
+  framework::LoDTensor table_t_pad;
+
+  size_t mem_size = table_t->numel() * framework::SizeOfType(table_t->type());
+  size_t line_mem_size =
+      table_t->dims()[1] * framework::SizeOfType(table_t->type());
+  PADDLE_ENFORCE_EQ(line_mem_size % 64, 0,
+                    platform::errors::InvalidArgument(
+                        "The second dim of the table must be 64-byte "
+                        "aligned on NPU"));
+
+  VLOG(10) << "mem_size:" << mem_size << ", line_mem_size:" << line_mem_size
+           << ", pad_shape:" << pad_shape
+           << ", table_dims:" << table_t->dims();
+
+  uint8_t *pad_data = reinterpret_cast<uint8_t *>(
+      table_t_pad.mutable_data<T>(pad_shape, context.GetPlace()));
+  PADDLE_ENFORCE_NPU_SUCCESS(
+      aclrtMemcpyAsync(pad_data, mem_size, table_t->data<T>(), mem_size,
+                       ACL_MEMCPY_DEVICE_TO_DEVICE, stream));
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemsetAsync(
+      pad_data + mem_size, line_mem_size, 0, line_mem_size, stream));
+
+  output_t->mutable_data<T>(context.GetPlace());
+  NpuOpRunner runner;
+  runner.SetType("GatherV2")
+      .AddInput(table_t_pad)
+      .AddInput(ids_t_local)
+      .AddInput(std::vector<int32_t>{0})
+      .AddOutput(*output_t);
+  runner.Run();
+}
+
+template <typename T>
+class CEmbeddingNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *ids_t = context.Input<LoDTensor>("Ids");
+
+    const auto &index_type = ids_t->type();
+    if (index_type == framework::proto::VarType::INT32) {
+      NPUGetIdsEmbedding<int32_t, T>(context);
+    } else {
+      PADDLE_THROW(platform::errors::Unavailable(
+          "NPU c_embedding ids only support int32."));
+    }
+  }
+};
+
+template <typename TIds, typename T>
+void NPUUpdateEmbedding(const framework::ExecutionContext &context) {
+  // get inputs
+  const int64_t start_idx = context.Attr<int64_t>("start_index");
+  auto ids_t = context.Input<LoDTensor>("Ids");
+  auto d_output_t = context.Input<LoDTensor>(framework::GradVarName("Out"));
+  auto table_t = context.Input<LoDTensor>("W");
+  auto table_grad_t = context.Output<LoDTensor>(framework::GradVarName("W"));
+
+  VLOG(10) << "ids_t:" << ids_t << ", d_output_t:" << d_output_t
+           << ", table_t:" << table_t << ", table_grad_t:" << table_grad_t;
+
+  auto stream =
+      context.template device_context<paddle::platform::NPUDeviceContext>()
+          .stream();
+
+  // convert ids_t to local valid ids
+  framework::Tensor ids_t_local;
+  ids_t_local.mutable_data<TIds>(ids_t->dims(), context.GetPlace());
+  shard_index<TIds>(*table_t, *ids_t, start_idx, ids_t_local, context);
+
+  // padding table_t -> table_t_pad
+  auto pad_shape =
+      framework::make_ddim({table_t->dims()[0] + 1, table_t->dims()[1]});
+  framework::LoDTensor table_t_pad;
+
+  // set table_t_pad to zero
+  uint8_t *pad_data = reinterpret_cast<uint8_t *>(
+      table_t_pad.mutable_data<T>(pad_shape, context.GetPlace()));
+  size_t table_t_pad_mem_size =
+      table_t_pad.numel() * framework::SizeOfType(table_t_pad.type());
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemsetAsync(
+      pad_data, table_t_pad_mem_size, 0, table_t_pad_mem_size, stream));
+
+  // NOTE(zhiqiu): It seems that in CANN 20.1 the first input and the output
+  // can be different tensors, but in CANN 20.2+ ScatterAdd runs in place.
+  // Thus, the first input and the output should be the same tensor.
+  const auto &runner_scatter =
+      NpuOpRunner("ScatterAdd", {table_t_pad, ids_t_local, *d_output_t},
+                  {table_t_pad}, {{"use_locking", true}});
+  runner_scatter.Run(stream);
+
+  // copy table_t_pad to table_grad_t
+  T *dst = table_grad_t->mutable_data<T>(table_t->dims(), context.GetPlace());
+  const size_t mem_size =
+      table_grad_t->numel() * framework::SizeOfType(table_grad_t->type());
+
+  // check alignment
+  size_t line_mem_size =
+      table_grad_t->dims()[1] * framework::SizeOfType(table_grad_t->type());
+  PADDLE_ENFORCE_EQ(line_mem_size % 64, 0,
+                    platform::errors::InvalidArgument(
+                        "The second dim of the table must be 64-byte "
+                        "aligned on NPU"));
+
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpyAsync(
+      dst, mem_size, pad_data, mem_size, ACL_MEMCPY_DEVICE_TO_DEVICE, stream));
+}
+
+template <typename T>
+class CEmbeddingGradNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *ids_t = context.Input<LoDTensor>("Ids");
+
+    const auto &index_type = ids_t->type();
+    if (index_type == framework::proto::VarType::INT32) {
+      NPUUpdateEmbedding<int32_t, T>(context);
+    } else {
+      PADDLE_THROW(platform::errors::Unavailable(
+          "NPU c_embedding ids only support int32."));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_NPU_KERNEL(c_embedding, ops::CEmbeddingNPUKernel<float>,
+                       ops::CEmbeddingNPUKernel<double>,
+                       ops::CEmbeddingNPUKernel<plat::float16>);
+REGISTER_OP_NPU_KERNEL(c_embedding_grad, ops::CEmbeddingGradNPUKernel<float>,
+                       ops::CEmbeddingGradNPUKernel<double>,
+                       ops::CEmbeddingGradNPUKernel<plat::float16>);
diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
index e70fe1e280fa4c0904791924e1ba55858aff4616..2a251460e47bf6e0a012f7d03e5b891e48371db8 100644
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -1230,65 +1230,6 @@ def _parallel_embedding(x,
     return out
 
 
-def _parallel_embedding_npu(x,
-                            per_part_embeddings,
-                            origin_size,
-                            param_attr,
-                            inner_rank,
-                            num_partitions,
-                            name,
-                            group=None):
-    """
-    NPU Parallel Embedding
-    """
-    if group is not None and not group.is_member():
-        return
-    ring_id = 0 if group is None else group.id
-
-    origin_num_embeddings = origin_size[0]
-    embedding = paddle.nn.Embedding(
-        per_part_embeddings,
-        origin_size[1],
-        padding_idx=per_part_embeddings - 1,
-        sparse=False,
-        weight_attr=param_attr,
-        name=name)
-
-    origin_input_shape = x.shape
-    if len(origin_input_shape) == 2:
-        x = paddle.unsqueeze(x, axis=-1)
-    else:
-        assert origin_input_shape[-1] == 1, (
-            "The last dimension size of x must be 1.")
-    x_shard = paddle.shard_index(x, origin_num_embeddings, num_partitions,
-                                 inner_rank, per_part_embeddings - 1)
-    if len(origin_input_shape) == 2:
-        x_shard = paddle.squeeze(x_shard, axis=-1)
-    emb_out = embedding(x_shard)
-    startup_block = paddle.static.default_startup_program().global_block()
-    main_block = paddle.static.default_main_program().global_block()
-    startup_block.vars[embedding.weight.name].is_distributed = True
-    main_block.vars[embedding.weight.name].is_distributed = True
-    out = main_block.create_var(
-        shape=emb_out.shape,
-        dtype=emb_out.dtype,
-        type=emb_out.type,
-        lod_level=emb_out.lod_level,
-        persistable=False,
-        is_data=False,
-        need_check_feed=emb_out.desc.need_check_feed())
-    main_block.append_op(
-        type='c_allreduce_sum',
-        inputs={'X': emb_out},
-        outputs={'Out': out},
-        attrs={
-            'ring_id': ring_id,
-            'use_calc_stream': True,
-            'use_model_parallel': True
-        })
-    return out
-
-
 def split(x,
           size,
           operation,
@@ -1403,28 +1344,16 @@ def split(x,
             "but received vocabulary={} num_partitions={}".format(
                 size[0], num_partitions)
         per_part_size = size[0] // num_partitions
-        if core.is_compiled_with_npu():
-            emb_out = _parallel_embedding_npu(
-                x,
-                per_part_size,
-                size,
-                weight_attr,
-                inner_rank,
-                num_partitions,
-                name,
-                group=None)
-            return emb_out
-        else:
-            emb_out = _parallel_embedding(
-                x,
-                per_part_size,
-                size,
-                weight_attr,
-                inner_rank,
-                num_partitions,
-                name,
-                group=None)
-            return emb_out
+        emb_out = _parallel_embedding(
+            x,
+            per_part_size,
+            size,
+            weight_attr,
+            inner_rank,
+            num_partitions,
+            name,
+            group=None)
+        return emb_out
     else:
         should_split = False
         if axis == 0:
diff --git a/python/paddle/fluid/tests/unittests/c_embedding_op_base.py b/python/paddle/fluid/tests/unittests/c_embedding_op_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bdeecae4569ea53ce0692a605e36a66415efebe
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/c_embedding_op_base.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.framework import core
+
+SEED = 2021
+np.random.seed(SEED)
+
+
+def get_c_embedding(start, end, table, ids):
+    index = ids.flatten()
+    input_mask = (index < start) | (index >= end)
+    masked_input = index - start
+    masked_input[input_mask] = 0
+    output = table[masked_input]
+    output[input_mask] = 0.0
+    return output
+
+
+class TestCEmbeddingCPU(OpTest):
+    def setUp(self):
+        self.init_dtype()
+        self.initcase()
+        if core.is_compiled_with_npu():
+            self.__class__.use_npu = True
+        elif core.is_compiled_with_cuda():
+            self.__class__.exist_fp64_check_grad = True
+
+    def initcase(self):
+        self.op_type = "c_embedding"
+        table = np.random.random((17, 64)).astype(self.dtype)
+        ids = np.random.randint(
+            low=0, high=17 * 2, size=(2, 4)).astype(self.ids_dtype)
+        self.start_index = 10
+        self.end_index = self.start_index + 17
+
+        self.inputs = {'W': table, 'Ids': ids}
+        np_out = get_c_embedding(self.start_index, self.end_index, table, ids)
+        self.outputs = {'Out': np_out.reshape((2, 4, 64))}
+        self.attrs = {'start_index': self.start_index}
+        if core.is_compiled_with_npu():
+            self.__class__.use_npu = True
+
+    def test_check_cpu(self):
+        self.check_output_with_place(core.CPUPlace())
+
+    def test_check_cpu_grad(self):
+        self.check_grad_with_place(core.CPUPlace(), ['W'], 'Out')
+
+    def init_dtype(self):
+        self.dtype = "float32"
+        self.ids_dtype = "int64"
+
+
+class TestCEmbeddingOpBase(TestCEmbeddingCPU):
+    def setUp(self):
+        self.init_dtype()
+        self.initcase()
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            self.check_output_with_place(core.CUDAPlace(0))
+        elif core.is_compiled_with_npu():
+            self.check_output_with_place(core.NPUPlace(0))
+
+    def test_check_grad(self):
+        if core.is_compiled_with_cuda():
+            self.check_grad_with_place(core.CUDAPlace(0), ['W'], 'Out')
+        elif core.is_compiled_with_npu():
+            self.check_grad_with_place(core.NPUPlace(0), ['W'], 'Out')
+
+    def init_dtype(self):
+        if core.is_compiled_with_cuda():
+            self.dtype = "float64"
+            self.ids_dtype = "int64"
+        elif core.is_compiled_with_npu():
+            self.dtype = "float32"
+            self.ids_dtype = "int32"
+
+
+class TestCEmbeddingOpFP32(TestCEmbeddingOpBase):
+    def setUp(self):
+        self.init_dtype()
+        self.initcase()
+
+    def initcase(self):
+        self.op_type = "c_embedding"
+        table = np.random.random((17, 64)).astype(self.dtype)
+        ids = np.random.randint(
+            low=0, high=17 * 2, size=(2, 4)).astype(self.ids_dtype)
+        self.start_index = 10
+        ids[0][1] = 12
+        ids[0][2] = 12
+        ids[1][2] = 12
+        ids[1][3] = 12
+        self.end_index = self.start_index + 17
+
+        self.inputs = {'W': table, 'Ids': ids}
+        np_out = get_c_embedding(self.start_index, self.end_index, table, ids)
+        self.outputs = {'Out': np_out.reshape((2, 4, 64))}
+        self.attrs = {'start_index': self.start_index}
+
+        if core.is_compiled_with_npu():
+            self.__class__.use_npu = True
+        elif core.is_compiled_with_cuda():
+            self.__class__.exist_fp64_check_grad = True
+
+    def init_dtype(self):
+        self.dtype = "float32"
+        self.ids_dtype = "int32"
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/npu/test_c_embedding_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_c_embedding_op_npu.py
new file mode 100644
index 0000000000000000000000000000000000000000..533a3fd12fd5275c8f304a52bb84ade23eda08a1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_c_embedding_op_npu.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.tests.unittests.c_embedding_op_base import TestCEmbeddingCPU, TestCEmbeddingOpBase, TestCEmbeddingOpFP32
+
+paddle.enable_static()
+
+TestCEmbeddingCPU()
+
+TestCEmbeddingOpBase()
+
+TestCEmbeddingOpFP32()
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_c_embedding_op.py b/python/paddle/fluid/tests/unittests/test_c_embedding_op.py
index c0cae78ed2953837e98a8f69f2d5eaf20b4769aa..adf215a649ad99e823c644b1bd91e2566bc6fba3 100644
--- a/python/paddle/fluid/tests/unittests/test_c_embedding_op.py
+++ b/python/paddle/fluid/tests/unittests/test_c_embedding_op.py
@@ -20,40 +20,13 @@ from op_test import OpTest
 import paddle
 import paddle.fluid as fluid
 from paddle.framework import core
+from paddle.fluid.tests.unittests.c_embedding_op_base import TestCEmbeddingCPU, TestCEmbeddingOpBase, TestCEmbeddingOpFP32
 
+TestCEmbeddingCPU()
 
-def get_c_embedding(start, end, table, ids):
-    index = ids.flatten()
-    input_mask = (index < start) | (index >= end)
-    masked_input = index - start
-    masked_input[input_mask] = 0
-    output = table[masked_input]
-    output[input_mask] = 0.0
-    return output
-
-
-class TestCEmbeddingOp(OpTest):
-    def setUp(self):
-        self.op_type = "c_embedding"
-        table = np.random.random((17, 31)).astype("float64")
-        ids = np.random.randint(
-            low=0, high=17 * 2, size=(2, 4, 5)).astype("int32")
-        self.start_index = 10
-        self.end_index = self.start_index + 17
-
-        self.inputs = {'W': table, 'Ids': ids}
-        np_out = get_c_embedding(self.start_index, self.end_index, table, ids)
-        self.outputs = {'Out': np_out.reshape((2, 4, 5, 31))}
-        self.attrs = {'start_index': self.start_index}
-
-    def test_check_output_gpu(self):
-        if core.is_compiled_with_cuda():
-            self.check_output_with_place(core.CUDAPlace(0))
-
-    def test_check_grad_gpu(self):
-        if core.is_compiled_with_cuda():
-            self.check_grad_with_place(core.CUDAPlace(0), ['W'], 'Out')
+TestCEmbeddingOpBase()
 
+TestCEmbeddingOpFP32()
 
 if __name__ == "__main__":
     unittest.main()
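
For reference, below is a minimal NumPy sketch of the lookup semantics the c_embedding kernels in this patch implement; it mirrors get_c_embedding in c_embedding_op_base.py above. Each rank holds rows [start_index, start_index + height) of the global embedding table; ids outside that window produce zero rows, and the c_allreduce_sum that follows in the model-parallel pipeline recovers the full lookup, since exactly one rank owns each id. The function name local_embedding_lookup is illustrative only and not part of the patch.

import numpy as np

def local_embedding_lookup(table, ids, start_index):
    # table: this rank's shard, i.e. rows [start_index, start_index + height)
    # of the global embedding table; ids: global indices of any shape.
    height, width = table.shape
    local = ids.astype(np.int64) - start_index
    mask = (local < 0) | (local >= height)      # ids owned by other ranks
    out = table[np.clip(local, 0, height - 1)]  # gather rows (clip avoids OOB)
    out[mask] = 0.0                             # zero rows for non-owned ids
    return out                                  # shape: ids.shape + (width,)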