From 5471d16278b85757d37c1566c83919440e7189be Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Mon, 28 Feb 2022 19:24:23 +0800 Subject: [PATCH 001/272] fix where api doc (#39980) --- python/paddle/tensor/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index ecf70ffe4a1..0ba47d79050 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -542,7 +542,7 @@ def where(condition, x=None, y=None, name=None): Args: - condition(Tensor): The condition to choose x or y. + condition(Tensor): The condition to choose x or y. When True(nonzero), yield x, otherwise yield y. x(Tensor or Scalar, optional): x is a Tensor or Scalar with data type float32, float64, int32, int64. Either both or neither of x and y should be given. y(Tensor or Scalar, optional): y is a Tensor or Scalar with data type float32, float64, int32, int64. Either both or neither of x and y should be given. -- GitLab From 496776367781aba1e4eea190da75a8c339aec43d Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Mon, 28 Feb 2022 22:38:07 +0800 Subject: [PATCH 002/272] [custom kernel] change kernel name judgement and remove macro control for selected_row (#39977) --- paddle/phi/core/custom_kernel.cc | 8 ++++---- paddle/phi/core/kernel_registry.h | 4 ---- paddle/phi/core/kernel_utils.h | 6 ------ paddle/phi/core/tensor_meta.h | 7 ------- paddle/phi/tests/core/test_custom_kernel.cc | 8 +++----- 5 files changed, 7 insertions(+), 26 deletions(-) diff --git a/paddle/phi/core/custom_kernel.cc b/paddle/phi/core/custom_kernel.cc index f84a2bd8d9c..58f9e1c623e 100644 --- a/paddle/phi/core/custom_kernel.cc +++ b/paddle/phi/core/custom_kernel.cc @@ -20,16 +20,16 @@ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) { auto& kernel_info_map = custom_kernel_map.GetMap(); VLOG(3) << "Size of custom_kernel_map: " << kernel_info_map.size(); + auto& kernels = KernelFactory::Instance().kernels(); for (auto& pair : kernel_info_map) { - PADDLE_ENFORCE_EQ( - KernelFactory::Instance().HasCompatiblePhiKernel(pair.first), - true, + PADDLE_ENFORCE_NE( + kernels.find(pair.first), + kernels.end(), phi::errors::InvalidArgument( "The kernel %s is not ready for custom kernel registering.", pair.first)); for (auto& info_pair : pair.second) { - auto& kernels = KernelFactory::Instance().kernels(); PADDLE_ENFORCE_EQ( kernels[pair.first].find(info_pair.first), kernels[pair.first].end(), diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 6a1688947b9..7a05452cbeb 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -87,13 +87,11 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); -#ifndef PADDLE_WITH_CUSTOM_KERNEL } else if (arg_type == std::type_index(typeid(const SelectedRows&))) { args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); -#endif } else if (arg_type == std::type_index(typeid(DenseTensor*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, @@ -105,13 +103,11 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); -#ifndef PADDLE_WITH_CUSTOM_KERNEL } else if (arg_type == std::type_index(typeid(SelectedRows*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); -#endif } else { // Attribute deal with // TODO(chenweihang): now here allow any types of 
attribute, maybe diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 2fda3cb6db4..e5de5e2b49e 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -23,9 +23,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_context.h" -#ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/phi/core/selected_rows.h" -#endif #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/type_defs.h" @@ -222,9 +220,7 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor); -#ifndef PADDLE_WITH_CUSTOM_KERNEL PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows); -#endif PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor); @@ -259,9 +255,7 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor); -#ifndef PADDLE_WITH_CUSTOM_KERNEL PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows); -#endif PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor); diff --git a/paddle/phi/core/tensor_meta.h b/paddle/phi/core/tensor_meta.h index 3d2da542c74..f4bd0be0b45 100644 --- a/paddle/phi/core/tensor_meta.h +++ b/paddle/phi/core/tensor_meta.h @@ -23,13 +23,6 @@ limitations under the License. */ #include "paddle/utils/any.h" #include "paddle/utils/optional.h" -// Note: mixed_vector include many header now, LoD will be -// used on CUDA device? Can we use small_vector here? 
-// @zhanlve: Rollback to original LoD for now -#ifndef PADDLE_WITH_CUSTOM_KERNEL -#include "paddle/fluid/framework/mixed_vector.h" -#endif - namespace phi { using DDim = phi::DDim; diff --git a/paddle/phi/tests/core/test_custom_kernel.cc b/paddle/phi/tests/core/test_custom_kernel.cc index d8e42c9d0d8..69922c055cb 100644 --- a/paddle/phi/tests/core/test_custom_kernel.cc +++ b/paddle/phi/tests/core/test_custom_kernel.cc @@ -146,12 +146,10 @@ TEST(CustomKernel, custom_kernel_dot) { custom_fake_dot_kernels.end()); // 3.before register - auto& kernel_factory_instance = phi::KernelFactory::Instance(); auto& kernels = phi::KernelFactory::Instance().kernels(); - EXPECT_TRUE(!kernel_factory_instance.HasCompatiblePhiKernel(op_name)); + EXPECT_TRUE(kernels.find(op_name) == kernels.end()); - // mock fake_dot is supported by phi for HasCompatiblePhiKernel check while - // registering + // mock fake_dot is supported by phi for check while registering auto& fake_dot_kernels = kernels[op_name]; EXPECT_TRUE(fake_dot_kernels.find( @@ -196,7 +194,7 @@ TEST(CustomKernel, custom_kernel_dot) { fake_dot_kernels.end()); // 4.kernel select - auto kernel = kernel_factory_instance.SelectKernelOrThrowError( + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( op_name, phi::KernelKey(backend, layout, phi::DataType::UINT8)); // 5.prepare parameters for kernel -- GitLab From 1b585b2896c05f08a69c6513ba16fd6817739118 Mon Sep 17 00:00:00 2001 From: seemingwang Date: Mon, 28 Feb 2022 22:50:21 +0800 Subject: [PATCH 003/272] Move index sample (#39905) * graph engine demo * upload unsaved changes * fix dependency error * fix shard_num problem * py client * remove lock and graph-type * add load direct graph * add load direct graph * add load direct graph * batch random_sample * batch_sample_k * fix num_nodes size * batch brpc * batch brpc * add test * add test * add load_nodes; change add_node function * change sample return type to pair * resolve conflict * resolved conflict * resolved conflict * separate server and client * merge pair type * fix * resolved conflict * fixed segment fault; high-level VLOG for load edges and load nodes * random_sample return 0 * rm useless loop * test:load edge * fix ret -1 * test: rm sample * rm sample * random_sample return future * random_sample return int * test fake node * fixed here * memory leak * remove test code * fix return problem * add common_graph_table * random sample node &test & change data-structure from linkedList to vector * add common_graph_table * sample with srand * add node_types * optimize nodes sample * recover test * random sample * destruct weighted sampler * GraphEdgeBlob * WeightedGraphEdgeBlob to GraphEdgeBlob * WeightedGraphEdgeBlob to GraphEdgeBlob * pybind sample nodes api * pull nodes with step * fixed pull_graph_list bug; add test for pull_graph_list by step * add graph table;name * add graph table;name * add pybind * add pybind * add FeatureNode * add FeatureNode * add FeatureNode Serialize * add FeatureNode Serialize * get_feat_node * avoid local rpc * fix get_node_feat * fix get_node_feat * remove log * get_node_feat return py:bytes * merge develop with graph_engine * fix threadpool.h head * fix * fix typo * resolve conflict * fix conflict * recover lost content * fix pybind of FeatureNode * recover cmake * recover tools * resolve conflict * resolve linking problem * code style * change test_server port * fix code problems * remove shard_num config * remove redundent threads * optimize start server * remove logs * fix code problems by 
reviewers' suggestions * move graph files into a folder * code style change * remove graph operations from base table * optimize get_feat function of graph engine * fix long long count problem * remove redandunt graph files * remove unused shell * recover dropout_op_pass.h * fix potential stack overflow when request number is too large & node add & node clear & node remove * when sample k is larger than neigbor num, return directly * using random seed generator of paddle to speed up * fix bug of random sample k * fix code style * fix code style * add remove graph to fleet_py.cc * fix blocking_queue problem * fix style * fix * recover capacity check * add remove graph node; add set_feature * add remove graph node; add set_feature * add remove graph node; add set_feature * add remove graph node; add set_feature * fix distributed op combining problems * optimize * remove logs * fix MultiSlotDataGenerator error * cache for graph engine * fix type compare error * more test&fix thread terminating problem * remove header * change time interval of shrink * use cache when sample nodes * remove unused function * change unique_ptr to shared_ptr * simplify cache template * cache api on client * fix * reduce sample threads when cache is not used * reduce cache memory * cache optimization * remove test function * remove extra fetch function * graph-engine data transfer optimization * support graph_split load&query * remove logs * change shards to pointer vector * use inference * remove test code * renorm op * simplify renorm op * recover local changes * recover renorm op kernel * fix init * add blanklines in renorm doc * fix import * fix import * add renorm to init.py * merge * move index_sample op * Delete api.h * Delete api.cc * fix * remove logs * recover infer shape of grad * recover changes * change shape * fix label * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix Co-authored-by: Huang Zhengjie <270018958@qq.com> Co-authored-by: Weiyue Su Co-authored-by: suweiyue Co-authored-by: luobin06 Co-authored-by: liweibin02 Co-authored-by: tangwei12 --- paddle/fluid/operators/index_sample_op.cc | 61 +---- paddle/fluid/operators/index_sample_op.cu | 215 ------------------ paddle/fluid/operators/index_sample_op.h | 198 ---------------- paddle/fluid/operators/index_sample_op_npu.cc | 3 +- paddle/phi/infermeta/binary.cc | 35 +++ paddle/phi/infermeta/binary.h | 5 + .../kernels/cpu/index_sample_grad_kernel.cc | 106 +++++++++ paddle/phi/kernels/cpu/index_sample_kernel.cc | 118 ++++++++++ .../kernels/gpu/index_sample_grad_kernel.cu | 146 ++++++++++++ paddle/phi/kernels/gpu/index_sample_kernel.cu | 119 ++++++++++ paddle/phi/kernels/index_sample_grad_kernel.h | 28 +++ paddle/phi/kernels/index_sample_kernel.h | 27 +++ paddle/phi/ops/compat/index_sample_sig.cc | 30 +++ 13 files changed, 623 insertions(+), 468 deletions(-) delete mode 100644 paddle/fluid/operators/index_sample_op.cu delete mode 100644 paddle/fluid/operators/index_sample_op.h create mode 100644 paddle/phi/kernels/cpu/index_sample_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/index_sample_kernel.cc create mode 100644 paddle/phi/kernels/gpu/index_sample_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/index_sample_kernel.cu create mode 100644 paddle/phi/kernels/index_sample_grad_kernel.h create mode 100644 paddle/phi/kernels/index_sample_kernel.h create mode 100644 paddle/phi/ops/compat/index_sample_sig.cc diff --git a/paddle/fluid/operators/index_sample_op.cc b/paddle/fluid/operators/index_sample_op.cc index 
2d97797cfec..68d002fceea 100644 --- a/paddle/fluid/operators/index_sample_op.cc +++ b/paddle/fluid/operators/index_sample_op.cc @@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/index_sample_op.h" #include #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" -#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker { @@ -42,44 +44,6 @@ class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker { class IndexSampleOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Inputs(Input) of FindByIndex should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Inputs(Index) of FindByIndex should not be null.")); - - auto input_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - input_dims.size(), 2, - platform::errors::InvalidArgument( - "Inputs(X) shape of IndexSample op should be 2-D, but " - "got X's shape = [%s], please check X shape.", - input_dims)); - - auto index_dims = ctx->GetInputDim("Index"); - PADDLE_ENFORCE_EQ( - input_dims.size(), 2, - platform::errors::InvalidArgument( - "Inputs(Index) shape of IndexSample op should be 2-D, but " - "got Index's shape [%s] , please check index shape.", - input_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(input_dims[0], index_dims[0], - platform::errors::InvalidArgument( - "Inputs(X)'s value of dimension 0 must same with " - "Inputs(Index)'s value of dimension 0, but " - "got %d of Inputs(X), and got %d of Inputs(Index), " - "please check Inputs shape.", - input_dims[0], index_dims[0])); - } - ctx->SetOutputDim("Out", index_dims); - auto type = ctx->GetInputsVarType("Index")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("Index", /*->*/ "Out"); - } - } protected: framework::OpKernelType GetExpectedKernelType( @@ -136,20 +100,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSampleGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor, + PT_INFER_META(phi::IndexSampleInferMeta)); REGISTER_OPERATOR(index_sample, ops::IndexSampleOp, ops::IndexSampleOpMaker, ops::IndexSampleGradMaker, - ops::IndexSampleGradMaker); + ops::IndexSampleGradMaker, + IndexSampleInferShapeFunctor); REGISTER_OPERATOR(index_sample_grad, ops::IndexSampleGradOp, ops::IndexSampleGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - index_sample, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel); -REGISTER_OP_CPU_KERNEL( - index_sample_grad, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel); diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu deleted file mode 100644 index 
e8acbfb8be9..00000000000 --- a/paddle/fluid/operators/index_sample_op.cu +++ /dev/null @@ -1,215 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/index_sample_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#define PREDEFINED_BLOCK_SIZE_X 512 -#define PREDEFINED_BLOCK_SIZE 1024 -#define MIN(a, b) ((a) < (b) ? (a) : (b)) - -namespace paddle { -namespace operators { - -namespace { -void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) { - auto max_grid_dim = ctx.template device_context() - .GetCUDAMaxGridDimSize(); - grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; - grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; -} -} - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -__global__ void IndexSampleForward(const IndexT* index, const T* in_data, - T* out_data, size_t index_length, - size_t input_length, size_t batch_size) { - unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; - for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { - index_i = blockDim.x * blockIdx.x + threadIdx.x; - for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { - unsigned int index_idx = index_j * index_length + index_i; - unsigned int in_idx = index_j * input_length + index_i; - IndexT sample_idx = index[index_idx]; - out_data[index_idx] = in_data[in_idx - index_i + sample_idx]; - } - } -} - -template -__global__ void IndexSampleGrad(const IndexT* index, T* in_grad, - const T* out_grad, size_t index_length, - size_t input_length, size_t batch_size, - bool same_data_in_row = true) { - unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; - - for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { - index_i = blockDim.x * blockIdx.x + threadIdx.x; - for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { - unsigned int index_idx = index_j * index_length + index_i; - unsigned int in_idx = index_j * input_length + index_i; - IndexT sample_idx = index[index_idx]; - if (same_data_in_row) { - platform::CudaAtomicAdd(&(in_grad[in_idx - index_i + sample_idx]), - out_grad[sample_idx]); - } else { - in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx]; - } - } - } -} - -template -class IndexSampleKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* index = ctx.Input("Index"); - auto* output = ctx.Output("Out"); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool 
index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - const auto* in_data = input->data(); - auto* out_data = output->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context().stream(); - - auto input_dim = input->dims(); - auto index_dim = index->dims(); - size_t batch_size = input_dim[0]; - size_t input_length = input_dim[1]; - size_t index_length = index_dim[1]; - - auto block_width = platform::RoundToPowerOfTwo(index_length); - block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); - int block_height = - platform::RoundToPowerOfTwo(index_length * batch_size) / block_width; - block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); - dim3 block_dim(block_width, block_height); - dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, - (batch_size + block_dim.y - 1) / block_dim.y); - LimitGridDim(ctx, &grid_dim); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - IndexSampleForward<<>>( - index_data, in_data, out_data, index_length, input_length, - batch_size); - } else if (index_type == framework::proto::VarType::INT32) { - const int* index_data = index->data(); - IndexSampleForward<<>>( - index_data, in_data, out_data, index_length, input_length, - batch_size); - } - } -}; - -template -class IndexSampleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* index = ctx.Input("Index"); - - const auto* output_grad_data = output_grad->data(); - auto* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - auto stream = - ctx.template device_context().stream(); - auto input_num = input_grad->numel(); - auto input_dim = input_grad->dims(); - auto index_dim = index->dims(); - size_t batch_size = index_dim[0]; - size_t input_length = input_dim[1]; - size_t index_length = index_dim[1]; - bool same_data_in_index_row = index_length == 1 ? 
false : true; - - auto block_width = platform::RoundToPowerOfTwo(index_length); - block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); - auto block_height = - platform::RoundToPowerOfTwo(index_length * batch_size) / block_width; - block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); - dim3 block_dim(block_width, block_height); - dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, - (batch_size + block_dim.y - 1) / block_dim.y); - LimitGridDim(ctx, &grid_dim); - - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); - set_zero(dev_ctx, input_grad, static_cast(0)); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - IndexSampleGrad<<>>( - index_data, input_grad_data, output_grad_data, index_length, - input_length, batch_size, same_data_in_index_row); - } else if (index_type == framework::proto::VarType::INT32) { - const int* index_data = index->data(); - IndexSampleGrad<<>>( - index_data, input_grad_data, output_grad_data, index_length, - input_length, batch_size, same_data_in_index_row); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - index_sample, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel); -REGISTER_OP_CUDA_KERNEL( - index_sample_grad, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel); diff --git a/paddle/fluid/operators/index_sample_op.h b/paddle/fluid/operators/index_sample_op.h deleted file mode 100644 index 6cc8ff04c54..00000000000 --- a/paddle/fluid/operators/index_sample_op.h +++ /dev/null @@ -1,198 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DDim = framework::DDim; - -template -void IndexSampleInner(const framework::ExecutionContext &context, - const LoDTensor &input, const LoDTensor &index, - LoDTensor *output) { - auto input_dims = input.dims(); - auto index_dims = index.dims(); - - int batch_size = input_dims[0]; - auto value_length = input_dims[1]; - auto index_length = index_dims[1]; - int index_ids_num = index.numel(); - - std::vector input_vec; - std::vector index_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &input_vec); - paddle::framework::TensorToVector(index, context.device_context(), - &index_vec); - - std::vector res(index_ids_num); - for (int i = 0; i < index_ids_num; i++) { - int b = floor(i / index_length); - PADDLE_ENFORCE_GE( - index_vec[i], 0, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - PADDLE_ENFORCE_LT( - index_vec[i], value_length, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - - int v_i = b * value_length + static_cast(index_vec[i]); - T v = input_vec[v_i]; - VLOG(4) << "Index Sample: batch = " << b << " index = " << v_i - << " value = " << v; - res[i] = v; - } - - auto ddim = phi::make_ddim({batch_size, index_length}); - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(res, context.device_context(), output); - output->Resize(ddim); -} - -template -class IndexSampleKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *input_var = ctx.InputVar("X"); - auto *index_var = ctx.InputVar("Index"); - - auto &input_tensor = input_var->Get(); - auto &index_tensor = index_var->Get(); - - auto *out_var = ctx.OutputVar("Out"); - auto *out_tensor = out_var->GetMutable(); - - const auto &index_type = - framework::TransToProtoVarType(index_tensor.dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleInner(ctx, input_tensor, index_tensor, out_tensor); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSampleInner(ctx, input_tensor, index_tensor, out_tensor); - } - } -}; - -template -void IndexSampleGradInner(const framework::ExecutionContext &context, - const LoDTensor &out_grad, const LoDTensor &index, - LoDTensor *x_grad) { - std::vector out_grad_vec; - std::vector index_vec; - paddle::framework::TensorToVector(out_grad, context.device_context(), - &out_grad_vec); - paddle::framework::TensorToVector(index, context.device_context(), - 
&index_vec); - - auto index_dims = index.dims(); - auto x_grad_dims = x_grad->dims(); - - auto value_length = x_grad_dims[1]; - auto index_length = index_dims[1]; - int index_ids_num = index.numel(); - - std::vector x_grad_vec(x_grad->numel(), 0); - - for (int i = 0; i < index_ids_num; i++) { - int b = floor(i / index_length); - PADDLE_ENFORCE_GE( - index_vec[i], 0, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample_grad) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - PADDLE_ENFORCE_LT( - index_vec[i], value_length, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample_grad) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - int v_i = b * value_length + static_cast(index_vec[i]); - x_grad_vec[v_i] += out_grad_vec[i]; - } - x_grad->mutable_data(context.GetPlace()); - framework::TensorFromVector(x_grad_vec, context.device_context(), x_grad); - x_grad->Resize(x_grad_dims); -} - -template -class IndexSampleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *index_var = context.InputVar("Index"); - auto *x_grad_var = context.OutputVar(framework::GradVarName("X")); - auto *out_grad_var = context.InputVar(framework::GradVarName("Out")); - - auto &index_tensor = index_var->Get(); - auto &out_grad_tensor = out_grad_var->Get(); - auto *x_grad_tensor = x_grad_var->GetMutable(); - - const auto &index_type = - framework::TransToProtoVarType(index_tensor.dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleGradInner(context, out_grad_tensor, index_tensor, - x_grad_tensor); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSampleGradInner(context, out_grad_tensor, index_tensor, - x_grad_tensor); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/index_sample_op_npu.cc b/paddle/fluid/operators/index_sample_op_npu.cc index f460d0622bc..38eb5b45149 100644 --- a/paddle/fluid/operators/index_sample_op_npu.cc +++ b/paddle/fluid/operators/index_sample_op_npu.cc @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/index_sample_op.h" - +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index dfaabf7cae2..1905e33bd03 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -225,6 +225,41 @@ void HuberLossInferMeta(const MetaTensor& input, out->share_lod(input); } +void IndexSampleInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config) { + auto input_dims = x.dims(); + PADDLE_ENFORCE_EQ(input_dims.size(), + 2, + errors::InvalidArgument( + "Inputs(X) shape of IndexSample op should be 2-D, but " + "got X's shape = [%s], please check X shape.", + input_dims)); + + auto index_dims = y.dims(); + PADDLE_ENFORCE_EQ( + index_dims.size(), + 2, + errors::InvalidArgument( + "Inputs(Index) shape of IndexSample op should be 2-D, but " + "got Index's shape [%s] , please check index shape.", + input_dims)); + if (config.is_runtime) { + PADDLE_ENFORCE_EQ(input_dims[0], + index_dims[0], + errors::InvalidArgument( + "Inputs(X)'s value of dimension 0 must same with " + "Inputs(Index)'s value of dimension 0, but " + "got %d of Inputs(X), and got %d of Inputs(Index), " + "please check Inputs shape.", + input_dims[0], + index_dims[0])); + } + out->set_dtype(x.dtype()); + out->set_dims(index_dims); + out->share_lod(y); +} void CrossInferMeta(const MetaTensor& x, const MetaTensor& y, int axis, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 02750482dcc..a0140c9a579 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -53,6 +53,11 @@ void HuberLossInferMeta(const MetaTensor& input_meta, MetaTensor* residual, MetaConfig config = MetaConfig()); +void IndexSampleInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void CrossInferMeta(const MetaTensor& x, const MetaTensor& y, int axis, diff --git a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc new file mode 100644 index 00000000000..006711ceef7 --- /dev/null +++ b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/index_sample_grad_kernel.h" +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/kernel_registry.h" +namespace phi { +template +void IndexSampleGradInner(const Context& context, + const DenseTensor& out_grad, + const DenseTensor& index, + DenseTensor* x_grad) { + std::vector out_grad_vec; + std::vector index_vec; + paddle::framework::TensorToVector(out_grad, context, &out_grad_vec); + paddle::framework::TensorToVector(index, context, &index_vec); + + auto index_dims = index.dims(); + auto x_grad_dims = x_grad->dims(); + + auto value_length = x_grad_dims[1]; + auto index_length = index_dims[1]; + int index_ids_num = index.numel(); + + std::vector x_grad_vec(x_grad->numel(), 0); + + for (int i = 0; i < index_ids_num; i++) { + int b = floor(i / index_length); + PADDLE_ENFORCE_GE( + index_vec[i], + 0, + errors::InvalidArgument( + "Variable value (index) of OP(index_sample_grad) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + value_length, + index_vec[i])); + PADDLE_ENFORCE_LT( + index_vec[i], + value_length, + errors::InvalidArgument( + "Variable value (index) of OP(index_sample_grad) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + value_length, + index_vec[i])); + int v_i = b * value_length + static_cast(index_vec[i]); + x_grad_vec[v_i] += out_grad_vec[i]; + } + context.template Alloc(x_grad); + paddle::framework::TensorFromVector(x_grad_vec, context, x_grad); + x_grad->Resize(x_grad_dims); +} + +template +void IndexSampleGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* x_grad) { + auto index_type = index.dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType((DataType::INT64))))); + if (index_type == DataType::INT32) { + IndexSampleGradInner(ctx, out_grad, index, x_grad); + } else if (index_type == DataType::INT64) { + IndexSampleGradInner(ctx, out_grad, index, x_grad); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_sample_grad, + CPU, + ALL_LAYOUT, + phi::IndexSampleGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/index_sample_kernel.cc b/paddle/phi/kernels/cpu/index_sample_kernel.cc new file mode 100644 index 00000000000..21bf9faee13 --- /dev/null +++ b/paddle/phi/kernels/cpu/index_sample_kernel.cc @@ -0,0 +1,118 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_sample_kernel.h" +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/kernel_registry.h" +namespace phi { +template +void IndexSampleInner(const Context &context, + const DenseTensor &input, + const DenseTensor &index, + DenseTensor *output) { + auto input_dims = input.dims(); + auto index_dims = index.dims(); + + int batch_size = input_dims[0]; + auto value_length = input_dims[1]; + auto index_length = index_dims[1]; + int index_ids_num = index.numel(); + + std::vector input_vec; + std::vector index_vec; + paddle::framework::TensorToVector(input, context, &input_vec); + paddle::framework::TensorToVector(index, context, &index_vec); + + std::vector res(index_ids_num); + for (int i = 0; i < index_ids_num; i++) { + int b = floor(i / index_length); + PADDLE_ENFORCE_GE( + index_vec[i], + 0, + errors::InvalidArgument( + "Variable value (index) of OP(index_sample) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + value_length, + index_vec[i])); + PADDLE_ENFORCE_LT( + index_vec[i], + value_length, + errors::InvalidArgument( + "Variable value (index) of OP(index_sample) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + value_length, + index_vec[i])); + + int v_i = b * value_length + static_cast(index_vec[i]); + T v = input_vec[v_i]; + VLOG(4) << "Index Sample: batch = " << b << " index = " << v_i + << " value = " << v; + res[i] = v; + } + + auto ddim = phi::make_ddim({batch_size, index_length}); + context.template Alloc(output); + paddle::framework::TensorFromVector(res, context, output); + output->Resize(ddim); +} + +template +void IndexSampleKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &index, + DenseTensor *out) { + ctx.template Alloc(out); + auto index_type = index.dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType((DataType::INT64))))); + if (index_type == DataType::INT32) { + IndexSampleInner(ctx, x, index, out); + } else if (index_type == DataType::INT64) { + IndexSampleInner(ctx, x, index, out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_sample, + CPU, + ALL_LAYOUT, + phi::IndexSampleKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu new file mode 100644 index 00000000000..8b1ef964124 --- /dev/null +++ b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu @@ -0,0 +1,146 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_sample_grad_kernel.h" + +#include +#include +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +namespace { +template +void LimitGridDim(const Context& ctx, dim3* grid_dim) { + auto max_grid_dim = + reinterpret_cast(ctx).GetCUDAMaxGridDimSize(); + grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; + grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; +} +#define PREDEFINED_BLOCK_SIZE_X 512 +#define PREDEFINED_BLOCK_SIZE 1024 +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +}; + +template +__global__ void IndexSampleGrad(const IndexT* index, + T* in_grad, + const T* out_grad, + size_t index_length, + size_t input_length, + size_t batch_size, + bool same_data_in_row = true) { + unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; + + for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { + index_i = blockDim.x * blockIdx.x + threadIdx.x; + for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { + unsigned int index_idx = index_j * index_length + index_i; + unsigned int in_idx = index_j * input_length + index_i; + IndexT sample_idx = index[index_idx]; + if (same_data_in_row) { + paddle::platform::CudaAtomicAdd( + &(in_grad[in_idx - index_i + sample_idx]), out_grad[sample_idx]); + } else { + in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx]; + } + } + } +} + +template +void IndexSampleGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* x_grad) { + const T* output_grad_data = out_grad.data(); + T* input_grad_data = ctx.template Alloc(x_grad); + auto index_type = index.dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType((DataType::INT64))))); + + auto stream = reinterpret_cast(ctx).stream(); + auto input_num = x.numel(); + auto input_dim = x.dims(); + auto index_dim = index.dims(); + size_t batch_size = index_dim[0]; + size_t input_length = input_dim[1]; + size_t index_length = index_dim[1]; + bool same_data_in_index_row = index_length == 1 ? 
false : true; + + auto block_width = paddle::platform::RoundToPowerOfTwo(index_length); + block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); + auto block_height = + paddle::platform::RoundToPowerOfTwo(index_length * batch_size) / + block_width; + block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); + dim3 block_dim(block_width, block_height); + dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, + (batch_size + block_dim.y - 1) / block_dim.y); + LimitGridDim(ctx, &grid_dim); + + phi::funcs::SetConstant set_zero; + set_zero(ctx, x_grad, static_cast(0)); + + if (index_type == DataType::INT64) { + const int64_t* index_data = index.data(); + IndexSampleGrad<<>>( + index_data, + input_grad_data, + output_grad_data, + index_length, + input_length, + batch_size, + same_data_in_index_row); + } else if (index_type == DataType::INT32) { + const int* index_data = index.data(); + IndexSampleGrad<<>>( + index_data, + input_grad_data, + output_grad_data, + index_length, + input_length, + batch_size, + same_data_in_index_row); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(index_sample_grad, + GPU, + ALL_LAYOUT, + phi::IndexSampleGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/index_sample_kernel.cu b/paddle/phi/kernels/gpu/index_sample_kernel.cu new file mode 100644 index 00000000000..0e042089e1e --- /dev/null +++ b/paddle/phi/kernels/gpu/index_sample_kernel.cu @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_sample_kernel.h" + +#include +#include +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +namespace { +template +void LimitGridDim(const Context& ctx, dim3* grid_dim) { + auto max_grid_dim = + reinterpret_cast(ctx).GetCUDAMaxGridDimSize(); + grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; + grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; +} +#define PREDEFINED_BLOCK_SIZE_X 512 +#define PREDEFINED_BLOCK_SIZE 1024 +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) +} + +template +__global__ void IndexSampleForward(const IndexT* index, + const T* in_data, + T* out_data, + size_t index_length, + size_t input_length, + size_t batch_size) { + unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; + for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { + index_i = blockDim.x * blockIdx.x + threadIdx.x; + for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { + unsigned int index_idx = index_j * index_length + index_i; + unsigned int in_idx = index_j * input_length + index_i; + IndexT sample_idx = index[index_idx]; + out_data[index_idx] = in_data[in_idx - index_i + sample_idx]; + } + } +} + +template +void IndexSampleKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* out) { + auto index_type = index.dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType((DataType::INT64))))); + const T* in_data = x.data(); + T* out_data = ctx.template Alloc(out); + auto stream = reinterpret_cast(ctx).stream(); + auto input_dim = x.dims(); + auto index_dim = index.dims(); + size_t batch_size = input_dim[0]; + size_t input_length = input_dim[1]; + size_t index_length = index_dim[1]; + + auto block_width = paddle::platform::RoundToPowerOfTwo(index_length); + block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); + int block_height = + paddle::platform::RoundToPowerOfTwo(index_length * batch_size) / + block_width; + block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); + dim3 block_dim(block_width, block_height); + dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, + (batch_size + block_dim.y - 1) / block_dim.y); + LimitGridDim(ctx, &grid_dim); + + if (index_type == DataType::INT64) { + const int64_t* index_data = index.data(); + IndexSampleForward<<>>( + index_data, in_data, out_data, index_length, input_length, batch_size); + } else if (index_type == DataType::INT32) { + const int* index_data = index.data(); + IndexSampleForward<<>>( + index_data, in_data, out_data, index_length, input_length, batch_size); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(index_sample, + GPU, + ALL_LAYOUT, + phi::IndexSampleKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/index_sample_grad_kernel.h b/paddle/phi/kernels/index_sample_grad_kernel.h new file mode 100644 index 00000000000..5c6e101f1b4 --- /dev/null +++ b/paddle/phi/kernels/index_sample_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IndexSampleGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* in_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/index_sample_kernel.h b/paddle/phi/kernels/index_sample_kernel.h new file mode 100644 index 00000000000..fb43c0c6c5f --- /dev/null +++ b/paddle/phi/kernels/index_sample_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IndexSampleKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/index_sample_sig.cc b/paddle/phi/ops/compat/index_sample_sig.cc new file mode 100644 index 00000000000..0d2aed68a72 --- /dev/null +++ b/paddle/phi/ops/compat/index_sample_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature IndexSampleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("index_sample_grad", + {GradVarName("Out"), "X", "Index"}, + {}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(index_sample_grad, + phi::IndexSampleGradOpArgumentMapping); -- GitLab From d17961edc0f32f640861db93ed2e8660062ba2b7 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Tue, 1 Mar 2022 09:55:33 +0800 Subject: [PATCH 004/272] Optimize the CUDA kernel in DistributedFusedLamb optimizer (#39972) * vectorize lamb kernel * remove flags, add ut * remove useless codes * refine code, add param order --- .../distributed_fused_lamb_init_op.cc | 39 +- .../distributed_fused_lamb_init_op.cu | 162 ++--- .../optimizers/distributed_fused_lamb_op.cc | 34 +- .../optimizers/distributed_fused_lamb_op.cu | 682 ++++++++++-------- .../operators/optimizers/multi_tensor_apply.h | 61 +- .../distributed_fused_lamb_test_base.py | 5 + .../optimizer/distributed_fused_lamb.py | 21 +- 7 files changed, 546 insertions(+), 458 deletions(-) diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc index 28c6efef141..efec50efa92 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc @@ -61,30 +61,31 @@ class DistributedFusedLambInitOpMaker "The fp32 beta1 power accumulator tensor. Its shape is [1]."); AddOutput("Beta2Pow", "The fp32 beta2 power accumulator tensor. Its shape is [1]."); - AddOutput("FusedIndices", - "The param index of each element in FP32FusedParam. Its shape is " - "[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...]."); AddOutput( "FusedParamOffsets", "The numel offset of each parameter inside the FP32FusedParam. Its " "shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 " - "+ n_2, ...]."); - AddOutput("FP32ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp32_local_param_num + 1]."); - AddOutput("FP16ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp16_local_param_num + 1]."); + "+ n_2, ...]. It should be in CPUPlace."); AddOutput( - "WeightDecay", - "The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N]."); + "FP32ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp32_local_param_num + 1]. It should be in CPUPlace."); + AddOutput( + "FP16ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace."); AddOutput("ParamInfo", "The param info. It should be in CPUPlace, and its shape is [6]" - "CPUPlace, and its shape is [6]. It is " + "CPUPlace, and its shape is [8]. It is " "[fp32_shard_param_start_idx, fp32_local_param_num, " - "fp32_global_param_num, fp16_shard_param_start_idx, " - "fp16_local_param_num, fp16_global_param_num]."); - + "fp32_global_param_num, fp32_weight_decay_end_idx, " + "fp16_shard_param_start_idx, " + "fp16_local_param_num, fp16_global_param_num, " + "fp16_weight_decay_end_idx]."); + AddOutput("ParamOrder", + "The reordered parameter order. 
Inside this op, " + "the parameter would be reordered by data type and weight decay " + "value."); AddOutput("ParamOut", "The output parameter list.").AsDuplicable(); AddOutput("MasterParamOut", "The output master parameter list. It would share the memory of " @@ -96,10 +97,8 @@ class DistributedFusedLambInitOpMaker AddAttr("beta1", "The initial value of Beta1Pow."); AddAttr("beta2", "The initial value of Beta2Pow."); - AddAttr>( - "weight_decay", - "The weight decay for each parameter. Its " - "shape is equal to the global parameter number."); + AddAttr>("apply_weight_decay", + "Whether to apply weight decay."); AddAttr("alignment", "The alignment in bytes for the fused tensors."); AddAttr("rank", "The global rank of the current process."); AddAttr("nranks", "The global world size."); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu index 3445e9b658b..7d8a7186d58 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu @@ -258,32 +258,6 @@ static void ShareBufferForNonInitedTensor(framework::Tensor *origin, << ") , dtype = " << fused_out->dtype(); } -template -static __global__ void LambFillFusedIndicesCUDAKernel(const OffsetT *offsets, - IndexT *out, - int offset_num, - int out_num) { - CUDA_KERNEL_LOOP_TYPE(i, out_num, int) { - auto idx = phi::funcs::LowerBound(offsets, offset_num, i); - if (idx == offset_num || offsets[idx] != i) { - --idx; - } - out[i] = idx; - } -} - -template -static void CopyVectorToTensor(const std::vector &src, - framework::Tensor *dst, - const platform::Place &place, - gpuStream_t stream) { - dst->Resize({static_cast(src.size())}); - T *dst_ptr = dst->mutable_data(place); - const T *src_ptr = src.data(); - auto nbytes = src.size() * sizeof(T); - memory::Copy(place, dst_ptr, platform::CPUPlace(), src_ptr, nbytes, stream); -} - template static void CopyVectorToCPUTensor(const std::vector &src, framework::Tensor *dst) { @@ -294,6 +268,42 @@ static void CopyVectorToCPUTensor(const std::vector &src, std::memcpy(dst_ptr, src_ptr, nbytes); } +static size_t ReorderParamGradInfoList(const std::vector &flags, + std::vector *infos) { + size_t n = infos->size(); + std::vector cur_flags; + cur_flags.reserve(n); + for (size_t i = 0; i < n; ++i) { + auto idx = (*infos)[i].idx; + cur_flags.push_back(flags[idx]); + } + + auto origin_infos = *infos; + size_t j = 0; + for (size_t i = 0; i < n; ++i) { + if (cur_flags[i]) { + (*infos)[j] = origin_infos[i]; + ++j; + } + } + size_t ret_idx = j; + + for (size_t i = 0; i < n; ++i) { + if (!cur_flags[i]) { + (*infos)[j] = origin_infos[i]; + ++j; + } + } + return ret_idx; +} + +template +static T ClipByBound(T x, T low_value, T high_value) { + if (x < low_value) return low_value; + if (x > high_value) return high_value; + return x; +} + template class DistributedFusedLambInitOpKernel : public framework::OpKernel { @@ -404,6 +414,24 @@ class DistributedFusedLambInitOpKernel info->numel_offset = 0; // not determined yet } } + const auto &apply_weight_decay = + ctx.Attr>("apply_weight_decay"); + size_t fp32_wd_end_idx = + ReorderParamGradInfoList(apply_weight_decay, &fp32_infos); + size_t fp16_wd_end_idx = + ReorderParamGradInfoList(apply_weight_decay, &fp16_infos); + + auto *param_order_t = ctx.Output("ParamOrder"); + auto param_num = fp32_infos.size() + fp16_infos.size(); + param_order_t->Resize({static_cast(param_num)}); + auto *param_order = 
param_order_t->mutable_data(platform::CPUPlace()); + for (size_t i = 0; i < fp32_infos.size(); ++i) { + param_order[i] = static_cast(fp32_infos[i].idx); + } + for (size_t i = 0; i < fp16_infos.size(); ++i) { + param_order[i + fp32_infos.size()] = static_cast(fp16_infos[i].idx); + } + VLOG(10) << "Fill ParamGradInfo ends"; // Step 2: determine the numel_with_padding and numel_offset @@ -568,45 +596,29 @@ class DistributedFusedLambInitOpKernel VLOG(10) << "Found the sharding arguments"; auto *param_info_t = ctx.Output("ParamInfo"); - param_info_t->Resize({6}); + param_info_t->Resize({8}); auto *param_info = param_info_t->mutable_data(platform::CPUPlace()); param_info[0] = static_cast(fp32_start_idx); param_info[1] = static_cast(fp32_local_param_num); param_info[2] = static_cast(fp32_infos.size()); - param_info[3] = static_cast(fp16_start_idx + fp32_infos.size()); - param_info[4] = static_cast(fp16_local_param_num); - param_info[5] = static_cast(fp16_infos.size()); + param_info[3] = ClipByBound(fp32_wd_end_idx, fp32_start_idx, + fp32_start_idx + fp32_local_param_num) - + static_cast(fp32_start_idx); + param_info[4] = static_cast(fp16_start_idx + fp32_infos.size()); + param_info[5] = static_cast(fp16_local_param_num); + param_info[6] = static_cast(fp16_infos.size()); + param_info[7] = ClipByBound(fp16_wd_end_idx, fp16_start_idx, + fp16_start_idx + fp16_local_param_num) - + static_cast(fp16_start_idx); VLOG(10) << "Start FP32 idx: " << param_info[0]; VLOG(10) << "Local FP32 param num: " << param_info[1]; VLOG(10) << "Global FP32 param num: " << param_info[2]; - VLOG(10) << "Start FP16 idx: " << param_info[3]; - VLOG(10) << "Local FP16 param num: " << param_info[4]; - VLOG(10) << "Global FP16 param num: " << param_info[5]; + VLOG(10) << "Start FP16 idx: " << param_info[4]; + VLOG(10) << "Local FP16 param num: " << param_info[5]; + VLOG(10) << "Global FP16 param num: " << param_info[6]; - // For WeightDecay, shard and perform H2D copy - const auto &origin_weight_decay = - ctx.Attr>("weight_decay"); - PADDLE_ENFORCE_EQ(params.size(), origin_weight_decay.size(), - platform::errors::InvalidArgument( - "The attr(weight_decay) should have the " - "same length with Input(Param).")); - std::vector shard_weight_decay; - shard_weight_decay.reserve(total_local_param_num); - for (size_t i = 0; i < fp32_local_param_num; ++i) { - shard_weight_decay.push_back( - origin_weight_decay[fp32_infos[i + fp32_start_idx].idx]); - } - for (size_t i = 0; i < fp16_local_param_num; ++i) { - shard_weight_decay.push_back( - origin_weight_decay[fp16_infos[i + fp16_start_idx].idx]); - } - - // For FusedIndices, launch CUDA kernel to do binary search - auto *fused_indices_t = ctx.Output("FusedIndices"); - fused_indices_t->Resize({static_cast(total_numel)}); - auto *fused_indices = fused_indices_t->mutable_data(place); std::vector numel_offsets; numel_offsets.reserve(params.size() + 1); for (const auto &info : fp32_infos) { @@ -621,21 +633,6 @@ class DistributedFusedLambInitOpKernel "The numel_offsets number must be one larger than " "the parameter number.")); VLOG(10) << "Total numel offset: " << FlattenToString(numel_offsets); - auto *fused_param_offset_t = - ctx.Output("FusedParamOffsets"); - fused_param_offset_t->Resize({static_cast(numel_offsets.size())}); - auto *fused_param_offset = fused_param_offset_t->mutable_data(place); - memory::Copy(place, fused_param_offset, platform::CPUPlace(), - numel_offsets.data(), - numel_offsets.size() * sizeof(numel_offsets[0]), stream); - auto config = 
platform::GetGpuLaunchConfig1D(dev_ctx, total_numel); - LambFillFusedIndicesCUDAKernel<<>>( - fused_param_offset, fused_indices, numel_offsets.size() - 1, - total_numel); - - std::vector lengths; - lengths.reserve(fp32_local_param_num + fp16_local_param_num); std::vector fp32_partial_numel_offsets; fp32_partial_numel_offsets.reserve(fp32_local_param_num + 1); @@ -659,9 +656,9 @@ class DistributedFusedLambInitOpKernel VLOG(10) << "FP32 Partial numel = [" << valid_start_n + fp32_infos[i].numel << "," << end_n + fp32_infos[i].numel; - lengths.push_back(end_n - valid_start_n); + auto len = end_n - valid_start_n; fp32_partial_numel_offsets.push_back(fp32_partial_numel_offsets.back() + - lengths.back()); + len); } std::vector fp16_partial_numel_offsets; @@ -682,9 +679,9 @@ class DistributedFusedLambInitOpKernel PADDLE_ENFORCE_NE(valid_start_n, end_n, platform::errors::InvalidArgument( "Indices sharding error. This may be a bug.")); - lengths.push_back(end_n - valid_start_n); + auto len = end_n - valid_start_n; fp16_partial_numel_offsets.push_back(fp16_partial_numel_offsets.back() + - lengths.back()); + len); } CopyVectorToCPUTensor(numel_offsets, @@ -696,23 +693,6 @@ class DistributedFusedLambInitOpKernel fp16_partial_numel_offsets, ctx.Output("FP16ShardFusedParamOffsets")); - // Fill the weight decay tensor - PADDLE_ENFORCE_EQ(lengths.size(), shard_weight_decay.size(), - platform::errors::InvalidArgument( - "Invalid weight decay sharding. This may be a bug.")); - std::vector wd_cpu; - for (size_t i = 0; i < shard_weight_decay.size(); ++i) { - int len = lengths[i]; - for (int j = 0; j < len; ++j) { - wd_cpu.push_back(shard_weight_decay[i]); - } - } - PADDLE_ENFORCE_EQ(wd_cpu.size() * nranks, fp32_numel + fp16_numel, - platform::errors::InvalidArgument( - "Invalid weight decay sharding. This may be a bug.")); - CopyVectorToTensor(wd_cpu, ctx.Output("WeightDecay"), - place, stream); - auto *global_scale = ctx.Output("GlobalScale"); if (!global_scale->IsInitialized()) { TensorFillConstant(dev_ctx, global_scale, {1}, 1.0f); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc index e5b27446eb3..8f7c87912e9 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc @@ -66,28 +66,31 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { "The fp32 beta1 power accumulator tensor. Its shape is [1]."); AddInput("Beta2Pow", "The fp32 beta2 power accumulator tensor. Its shape is [1]."); - AddInput("FusedIndices", - "The param index of each element in FP32FusedParam. Its shape is " - "[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...]."); AddInput( "FusedParamOffsets", "The numel offset of each parameter inside the FP32FusedParam. Its " "shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 " - "+ n_2, ...]."); - AddInput("FP32ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp32_local_param_num + 1]."); - AddInput("FP16ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp16_local_param_num + 1]."); - AddInput("WeightDecay", - "The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N]."); + "+ n_2, ...]. It should be in CPUPlace."); + AddInput( + "FP32ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp32_local_param_num + 1]. 
It should be in CPUPlace."); + AddInput( + "FP16ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace."); AddInput("ParamInfo", "The param info. It should be in CPUPlace, and its shape is [6]" - "CPUPlace, and its shape is [6]. It is " + "CPUPlace, and its shape is [8]. It is " "[fp32_shard_param_start_idx, fp32_local_param_num, " - "fp32_global_param_num, fp16_shard_param_start_idx, " - "fp16_local_param_num, fp16_global_param_num]."); + "fp32_global_param_num, fp32_weight_decay_end_idx, " + "fp16_shard_param_start_idx, " + "fp16_local_param_num, fp16_global_param_num, " + "fp16_weight_decay_end_idx]."); + AddInput("ParamOrder", + "The reordered parameter order. Inside this op, " + "the parameter would be reordered by data type and weight decay " + "value."); AddInput("LearningRate", "The fp32 learning rate tensor. Its shape is [1]."); @@ -116,6 +119,7 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { "max_global_grad_norm", "The maximum global gradient l2-norm value for clipping. If " "max_global_grad_norm <= 0, no clipping would be performed."); + AddAttr("weight_decay", "The weight decay value."); AddAttr("clip_after_allreduce", "Whether to clip before allreduce, only valid when the " "world size is larger than 1."); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index 3f90140f772..ca0828a6f6a 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -87,7 +87,7 @@ struct L2NormFunctor { } }; -template +template static __global__ void MultiTensorL2NormReduceAgainCUDAKernel( const InT *x, OutT *y, int max_chunk_num) { int tensor_id = blockIdx.x; @@ -100,11 +100,7 @@ static __global__ void MultiTensorL2NormReduceAgainCUDAKernel( } sum = BlockReduce(storage).Reduce(sum, cub::Sum()); if (threadIdx.x == 0) { - if (NeedSqrt) { - y[blockIdx.x] = static_cast(sqrtf(sum)); - } else { - y[blockIdx.x] = static_cast(sum); - } + y[blockIdx.x] = static_cast(sum); } } @@ -118,6 +114,7 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) { constexpr int vec8 = alignof(platform::AlignedVector); constexpr int vec4 = alignof(platform::AlignedVector); constexpr int vec2 = alignof(platform::AlignedVector); + chunk_size *= sizeof(T); if (address % vec8 == 0 && chunk_size % vec8 == 0) { return std::min(8, valid_vec_size); } else if (address % vec4 == 0 && chunk_size % vec4 == 0) { @@ -129,27 +126,26 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) { } } -#define PD_VEC_MULTI_TENSOR_APPLY_CASE(__vec_size, ...) \ - case __vec_size: { \ - constexpr int kVecSize = __vec_size; \ - __VA_ARGS__; \ - break; \ +#define PD_VEC_LAUNCH_KERNEL_CASE(__vec_size, ...) \ + case __vec_size: { \ + constexpr int kVecSize = __vec_size; \ + __VA_ARGS__; \ + break; \ } -#define PD_VEC_MULTI_TENSOR_APPLY(__vec_size, ...) \ - do { \ - switch (__vec_size) { \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(8, __VA_ARGS__); \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(4, __VA_ARGS__); \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(2, __VA_ARGS__); \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(1, __VA_ARGS__); \ - } \ +#define PD_VEC_LAUNCH_KERNEL(__vec_size, ...) 
\ + do { \ + switch (__vec_size) { \ + PD_VEC_LAUNCH_KERNEL_CASE(8, __VA_ARGS__); \ + PD_VEC_LAUNCH_KERNEL_CASE(4, __VA_ARGS__); \ + PD_VEC_LAUNCH_KERNEL_CASE(2, __VA_ARGS__); \ + PD_VEC_LAUNCH_KERNEL_CASE(1, __VA_ARGS__); \ + } \ } while (0) // TODO(zengjinle): which chunk_size is better? -template +template static void MultiTensorL2Norm(const platform::CUDAPlace &place, gpuStream_t stream, const InT *x, const int *offsets, int n, OutT *y, @@ -158,7 +154,7 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place, constexpr int kNumTensor = MaxTensorNumPerLaunch; constexpr int kNumChunk = MaxChunkNumPerLaunch; - constexpr int kBlockDim = BlockDim; + constexpr int kBlockDim = 512; int max_chunk_num = -1; int vec_size = 8; @@ -181,22 +177,22 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place, auto *tmp_out_ptr = tmp_out.Alloc(n * max_chunk_num); FillZeroWithPtr(tmp_out_ptr, n * max_chunk_num, stream); -#define PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL \ - do { \ - using FunctorT = L2NormFunctor; \ - VLOG(10) << __func__ << " " << typeid(InT).name() \ - << " VecSize = " << kVecSize; \ - MultiTensorApply( \ - FunctorT(), stream, offsets, n, chunk_size, x, tmp_out_ptr, \ - max_chunk_num); \ +#define PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL \ + do { \ + using FunctorT = L2NormFunctor; \ + VLOG(10) << __func__ << " " << typeid(InT).name() \ + << " VecSize = " << kVecSize; \ + MultiTensorApply( \ + FunctorT(), stream, offsets, n, chunk_size, kBlockDim, x, tmp_out_ptr, \ + max_chunk_num); \ } while (0) - PD_VEC_MULTI_TENSOR_APPLY(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL); -#undef PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL); +#undef PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL - MultiTensorL2NormReduceAgainCUDAKernel<<>>( - tmp_out_ptr, y, max_chunk_num); + MultiTensorL2NormReduceAgainCUDAKernel< + MT, OutT, kBlockDim><<>>(tmp_out_ptr, y, + max_chunk_num); } template @@ -208,34 +204,17 @@ static void LogParamAndTrustRatioDivSquareNorm( auto tensors = ctx.MultiInput("Param"); if (tensors.empty()) return; + const auto *order = ctx.Input("ParamOrder")->data(); + size_t n = tensors.size(); auto place = tensors[0]->place(); auto pn_vec = ToVector(param_square_norm, n, place); auto tn_vec = ToVector(trust_ratio_div_square_norm, n, place); - std::vector fp32_indices, fp16_indices; - fp32_indices.reserve(n); - fp16_indices.reserve(n); - for (size_t i = 0; i < n; ++i) { - const auto *t = tensors[i]; - if (t->dtype() == phi::DataType::FLOAT32) { - fp32_indices.push_back(i); - } else if (t->dtype() == phi::DataType::FLOAT16) { - fp16_indices.push_back(i); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported data type %s.", t->dtype())); - } - } - - for (auto idx : fp16_indices) { - fp32_indices.push_back(idx); - } - const auto &names = ctx.GetOp().Inputs("Param"); - for (size_t i = 0; i < fp32_indices.size(); ++i) { - auto idx = fp32_indices[i]; + for (size_t i = 0; i < n; ++i) { + auto idx = order[i]; VLOG(LogLevel) << "Param " << tensors[idx]->dtype() << " " << names[idx] << " pn = " << pn_vec[i] << " , tn = " << tn_vec[i]; } @@ -353,7 +332,7 @@ static __global__ void CalcGradNormClipBeforeAllReduceScale( const T1 *__restrict__ global_scale, T1 max_global_grad_norm, const T1 *__restrict__ square_grad_norm, T1 *__restrict__ out1, T2 *__restrict__ out2, T1 clip_rescale_grad) { - T1 grad_norm = static_cast(sqrt(*square_grad_norm)) * clip_rescale_grad; + T1 grad_norm = 
static_cast(sqrtf(*square_grad_norm)) * clip_rescale_grad; T1 scale = global_scale[0] * max_global_grad_norm / (1e-6 + grad_norm); bool found_nan_inf = !isfinite(scale); if (scale >= 1 || found_nan_inf) { @@ -380,19 +359,24 @@ static __global__ void SetNanInfValueCUDAKernelTwoFlag(const bool *in_flag_p_1, ((*in_flag_p_1) || (*in_flag_p_2)) ? __int_as_float(0x7fffffffU) : 0.0f; } -// TODO(zengjinle): Vectorize this function -// NOTE: this method does not update Beta1Pow and Beta2Pow! -template -static __global__ void UpdateLambMoment( +template +static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( const T *__restrict__ param_p, const GradT *__restrict__ grad_p, const T *__restrict__ square_grad_norm_p, - const T *__restrict__ global_scale, const IndexT *__restrict__ indices, - const T *__restrict__ weight_decay_p, const T *__restrict__ beta1pow_p, + const T *__restrict__ global_scale, const T *__restrict__ beta1pow_p, const T *__restrict__ beta2pow_p, T *__restrict__ mom1_p, - T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, T beta1, T beta2, - T epsilon, T max_global_grad_norm, int num, T rescale_grad) { + T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, bool *found_inf, + T weight_decay, int weight_decay_end_numel, T beta1, T beta2, T epsilon, + T max_global_grad_norm, int num, T rescale_grad) { T square_grad_norm = *square_grad_norm_p; - if (!isfinite(square_grad_norm)) return; + bool need_update_found_inf = + (found_inf && threadIdx.x == 0 && blockIdx.x == 0); + if (!isfinite(square_grad_norm)) { + if (need_update_found_inf) *found_inf = true; + return; + } else if (need_update_found_inf) { + *found_inf = false; + } T scale = rescale_grad / global_scale[0]; if (max_global_grad_norm > 0) { @@ -406,27 +390,112 @@ static __global__ void UpdateLambMoment( T one_minus_beta1pow = 1 - beta1pow_p[0]; T one_minus_beta2pow = 1 - beta2pow_p[0]; - CUDA_KERNEL_LOOP(i, num) { - T p = param_p[i]; - T g = static_cast(grad_p[i]) * scale; - T weight_decay = weight_decay_p[i]; - T mom1 = mom1_p[i]; - T mom2 = mom2_p[i]; - - mom1 = beta1 * mom1 + (1 - beta1) * g; - mom2 = beta2 * mom2 + (1 - beta2) * g * g; - - T mom1_unbiased = mom1 / one_minus_beta1pow; - T mom2_unbiased = mom2 / one_minus_beta2pow; - T trust_ratio_div = - mom1_unbiased / (sqrtf(mom2_unbiased) + epsilon) + weight_decay * p; - - mom1_p[i] = mom1; - mom2_p[i] = mom2; - trust_ratio_div_p[i] = trust_ratio_div; + int i = (threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + int stride = blockDim.x * gridDim.x * VecSize; + + for (; i + VecSize <= num; i += stride) { + platform::AlignedVector param_vec; + platform::AlignedVector grad_vec; + platform::AlignedVector weight_decay_vec; + platform::AlignedVector mom1_vec; + platform::AlignedVector mom2_vec; + platform::AlignedVector trust_ratio_div_vec; + + T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay; + if (cur_weight_decay != static_cast(0.0)) { + platform::Load(param_p + i, ¶m_vec); + } else { +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + param_vec[j] = static_cast(0); + } + } + platform::Load(grad_p + i, &grad_vec); + platform::Load(mom1_p + i, &mom1_vec); + platform::Load(mom2_p + i, &mom2_vec); + +#define PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(__param, __grad, __mom1, __mom2, \ + __trust_ratio_div, __idx) \ + T p = __param[__idx]; \ + T g = static_cast(__grad[__idx]) * scale; \ + T mom1 = __mom1[__idx]; \ + T mom2 = __mom2[__idx]; \ + mom1 = beta1 * mom1 + (1 - beta1) * g; \ + mom2 = beta2 * mom2 + (1 - beta2) * g * g; \ + T 
mom1_unbiased = mom1 / one_minus_beta1pow; \ + T mom2_unbiased = mom2 / one_minus_beta2pow; \ + __trust_ratio_div[__idx] = \ + mom1_unbiased / (sqrtf(mom2_unbiased) + epsilon) + cur_weight_decay * p; \ + __mom1[__idx] = mom1; \ + __mom2[__idx] = mom2; + +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(param_vec, grad_vec, mom1_vec, + mom2_vec, trust_ratio_div_vec, j); + } + + platform::Store(mom1_vec, mom1_p + i); + platform::Store(mom2_vec, mom2_p + i); + platform::Store(trust_ratio_div_vec, trust_ratio_div_p + i); + } + + for (; i < num; ++i) { + T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay; + PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(param_p, grad_p, mom1_p, mom2_p, + trust_ratio_div_p, i); } } +template +static void MultiTensorUpdateLambMomentAndTrustRatioDiv( + const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n, + const T *param_p, const GradT *grad_p, const T *square_grad_norm_p, + const T *global_scale, const T *beta1pow_p, const T *beta2pow_p, T *mom1_p, + T *mom2_p, T *trust_ratio_div_p, bool *found_inf_p, T weight_decay, + int weight_decay_end_idx, T beta1, T beta2, T epsilon, + T max_global_grad_norm, T rescale_grad) { + if (n <= 0) return; + int numel = offsets[n] - offsets[0]; + PADDLE_ENFORCE_GE(weight_decay_end_idx, 0, + platform::errors::InvalidArgument( + "The weight decay end index should be >= 0.")); + PADDLE_ENFORCE_LE(weight_decay_end_idx, n, + platform::errors::InvalidArgument( + "The weight decay end index should be < %d.", n)); + auto weight_decay_end_numel = offsets[weight_decay_end_idx] - offsets[0]; + + int vec_size = GetChunkedVecSize(param_p, 0); + vec_size = std::min(vec_size, GetChunkedVecSize(grad_p, 0)); + vec_size = std::min(vec_size, GetChunkedVecSize(mom1_p, 0)); + vec_size = std::min(vec_size, GetChunkedVecSize(mom2_p, 0)); + vec_size = std::min(vec_size, GetChunkedVecSize(trust_ratio_div_p, 0)); + for (int i = 0; i < n; ++i) { + auto length = offsets[i + 1] - offsets[i]; + while (length % vec_size != 0) { + vec_size /= 2; + } + } + + VLOG(1) << __func__ << " VecSize = " << vec_size; + + auto stream = dev_ctx.stream(); + auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); + +#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL \ + do { \ + UpdateLambMomentAndTrustRatioDivCUDAKernel<<< \ + config.block_per_grid, config.thread_per_block, 0, stream>>>( \ + param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p, \ + beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p, \ + weight_decay, weight_decay_end_numel, beta1, beta2, epsilon, \ + max_global_grad_norm, numel, rescale_grad); \ + } while (0) + + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL); +#undef PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL +} + template struct LambBetaPowUpdateOnceHelper { LambBetaPowUpdateOnceHelper(T *beta1pow, T *beta2pow, T beta1, T beta2) { @@ -468,33 +537,6 @@ struct LambBetaPowUpdateOnceHelper { HOSTDEVICE void UpdateBetaPows() const {} }; -template -struct LambFoundInfHelper { - public: - explicit LambFoundInfHelper(bool *found_inf) : found_inf_(found_inf) { - PADDLE_ENFORCE_NOT_NULL(found_inf, - platform::errors::InvalidArgument( - "The found_inf should not be nullptr.")); - } - - HOSTDEVICE void UpdateFoundInf(bool value) { *found_inf_ = value; } - - private: - bool *__restrict__ found_inf_; -}; - -template <> -struct LambFoundInfHelper { - public: - explicit LambFoundInfHelper(bool *found_inf) { - PADDLE_ENFORCE_EQ( - found_inf, nullptr, - 
platform::errors::InvalidArgument("The found_inf should be nullptr.")); - } - - HOSTDEVICE void UpdateFoundInf(bool) {} -}; - template struct LambParamHelper { LambParamHelper(T *param, MasterT *master_param) { @@ -509,12 +551,9 @@ struct LambParamHelper { master_param_ = master_param; } - HOSTDEVICE void SetParam(int i, MasterT updated_p) { - param_[i] = static_cast(updated_p); - master_param_[i] = updated_p; - } + HOSTDEVICE T *__restrict__ ParamPtr() { return param_; } - HOSTDEVICE MasterT GetParam(int i) { return master_param_[i]; } + HOSTDEVICE MasterT *__restrict__ MasterParamPtr() { return master_param_; } private: T *__restrict__ param_; @@ -538,158 +577,169 @@ struct LambParamHelper { param_ = param; } - HOSTDEVICE void SetParam(int i, MasterT updated_p) { - param_[i] = static_cast(updated_p); - } + HOSTDEVICE T *__restrict__ ParamPtr() { return param_; } - HOSTDEVICE MasterT GetParam(int i) { - return static_cast>(param_[i]); - } + HOSTDEVICE constexpr MasterT *MasterParamPtr() { return nullptr; } private: T *__restrict__ param_; }; -template -struct LambParamAndBetaPowsUpdateHelper - : public LambParamHelper, - public LambBetaPowUpdateOnceHelper, NeedUpdateBetaPow>, - public LambFoundInfHelper { - LambParamAndBetaPowsUpdateHelper( - ParamT *param, MasterT *master_param, MasterT *beta1pow, - MasterT *beta2pow, MasterT beta1, MasterT beta2, - bool *found_inf, const MasterT *trust_ratio_div, - const MasterT *lr, const IndexT *index, +template +struct LambUpdateParamAndBetaPowsFunctor { + DEVICE void operator()( + int tensor_id, int chunk_id, int offset, int size, + LambParamHelper param_helper, + const MasterT *trust_ratio_div, const MasterT *lr, const MasterT *param_square_norm, - const MasterT *trust_ratio_div_square_norm, - const MasterT *update_flag) - : LambParamHelper(param, master_param), - LambBetaPowUpdateOnceHelper, NeedUpdateBetaPow>( - beta1pow, beta2pow, beta1, beta2), - LambFoundInfHelper(found_inf), - trust_ratio_div(trust_ratio_div), - lr(lr), - index(index), - param_square_norm(param_square_norm), - trust_ratio_div_square_norm(trust_ratio_div_square_norm), - update_flag(update_flag) {} - - const MasterT *__restrict__ trust_ratio_div; - const MasterT *__restrict__ lr; - const IndexT *__restrict__ index; - const MasterT *__restrict__ param_square_norm; - const MasterT *__restrict__ trust_ratio_div_square_norm; - const MasterT *__restrict__ update_flag; -}; + const MasterT *trust_ratio_div_square_norm, const bool *found_inf, + LambBetaPowUpdateOnceHelper, NeedUpdateBetaPow> + betapow_helper) const { + if (*found_inf) return; + + using MT = MasterT; -template -static __global__ void LambUpdateParamAndBetaPowsCUDAKernel( - LambParamAndBetaPowsUpdateHelper - args, - int num) { - auto should_update = *args.update_flag; - if (!isfinite(should_update)) { - if (HasFoundInf && threadIdx.x == 0 && blockIdx.x == 0) { - args.UpdateFoundInf(true); + MT p_square_norm = param_square_norm[tensor_id]; + MT t_square_norm = trust_ratio_div_square_norm[tensor_id]; + MT lr_value = *lr; + MT ratio = (p_square_norm != static_cast(0) && + t_square_norm != static_cast(0) + ? lr_value * sqrtf(p_square_norm / t_square_norm) + : lr_value); + + int i; + int stride = blockDim.x * VecSize; + + ParamT *param = param_helper.ParamPtr() + offset; + MT *master_param = HasMasterParam ? 
param_helper.MasterParamPtr() + offset + : param_helper.MasterParamPtr(); + trust_ratio_div += offset; + + for (i = threadIdx.x * VecSize; i + VecSize <= size; i += stride) { + platform::AlignedVector trust_ratio_div_vec; + platform::Load(trust_ratio_div + i, &trust_ratio_div_vec); + if (HasMasterParam) { + platform::AlignedVector master_param_vec; + platform::Load(master_param + i, &master_param_vec); + platform::AlignedVector param_vec; +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + MT p = master_param_vec[j] - ratio * trust_ratio_div_vec[j]; + master_param_vec[j] = p; + param_vec[j] = static_cast(p); + } + platform::Store(master_param_vec, master_param + i); + platform::Store(param_vec, param + i); + } else { + platform::AlignedVector param_vec; + platform::Load(param + i, ¶m_vec); +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + MT p = static_cast(param_vec[j]) - ratio * trust_ratio_div_vec[j]; + param_vec[j] = static_cast(p); + } + platform::Store(param_vec, param + i); + } + } + + for (; i < size; ++i) { + if (HasMasterParam) { + MT p = master_param[i] - ratio * trust_ratio_div[i]; + master_param[i] = p; + param[i] = static_cast(p); + } else { + MT p = static_cast(param[i]) - ratio * trust_ratio_div[i]; + param[i] = static_cast(p); + } + } + + if (NeedUpdateBetaPow && threadIdx.x == 0 && blockIdx.x == 0) { + betapow_helper.UpdateBetaPows(); } - return; - } else if (HasFoundInf && threadIdx.x == 0 && blockIdx.x == 0) { - args.UpdateFoundInf(false); } +}; - if (NeedUpdateBetaPow && threadIdx.x == 0 && blockIdx.x == 0) { - args.UpdateBetaPows(); +// TODO(zengjinle): which block_dim and chunk_size would be better? +template +static void MultiTensorUpdateLambParamAndBetaPows( + const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n, + const MasterT *trust_ratio_div, const MasterT *lr, + const MasterT *param_square_norm, + const MasterT *trust_ratio_div_square_norm, const bool *found_inf, + ParamT *param, MasterT *master_param, MasterT *beta1pow, + MasterT *beta2pow, MasterT beta1, MasterT beta2, + int chunk_size = 65536) { + constexpr bool kHasMasterParam = + !(std::is_same>::value); + + bool has_beta_pow = (beta1pow != nullptr); + if (has_beta_pow) { + PADDLE_ENFORCE_NOT_NULL(beta2pow, platform::errors::InvalidArgument( + "Beta2Pow should not be nullptr.")); + } else { + PADDLE_ENFORCE_EQ(beta2pow, nullptr, platform::errors::InvalidArgument( + "Beta2Pow should be nullptr.")); } - using MT = MasterT; + const int block_dim = 512; - MT lr_value = *args.lr; - CUDA_KERNEL_LOOP(i, num) { - MT p = args.GetParam(i); - MT t = args.trust_ratio_div[i]; - auto norm_idx = args.index[i]; - MT p_square_norm = args.param_square_norm[norm_idx]; - MT t_square_norm = args.trust_ratio_div_square_norm[norm_idx]; + int vec_size = 8; + for (int i = 0; i < n; ++i) { + int offset = offsets[i] - offsets[0]; + vec_size = + std::min(vec_size, GetChunkedVecSize(param + offset, chunk_size)); + if (kHasMasterParam) { + vec_size = std::min(vec_size, + GetChunkedVecSize(master_param + offset, chunk_size)); + } + vec_size = std::min( + vec_size, GetChunkedVecSize(trust_ratio_div + offset, chunk_size)); + } - MT p_norm = static_cast(sqrtf(p_square_norm)); - MT t_norm = static_cast(sqrtf(t_square_norm)); + VLOG(1) << __func__ << " VecSize = " << vec_size; - auto update = (p_norm != static_cast(0) && t_norm != static_cast(0)) - ? 
p_norm / t_norm - : static_cast(1); + constexpr auto kNumTensor = MaxTensorNumPerLaunch; + constexpr auto kNumChunk = MaxChunkNumPerLaunch; - MT updated_p = p - lr_value * update * t; - args.SetParam(i, updated_p); - } -} + auto stream = dev_ctx.stream(); +#define PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(__has_beta_pow) \ + do { \ + using FunctorT = \ + LambUpdateParamAndBetaPowsFunctor; \ + LambParamHelper param_helper(param, \ + master_param); \ + LambBetaPowUpdateOnceHelper, __has_beta_pow> \ + betapow_helper(beta1pow, beta2pow, beta1, beta2); \ + launcher.Launch(FunctorT(), param_helper, trust_ratio_div, lr, \ + param_square_norm, trust_ratio_div_square_norm, found_inf, \ + betapow_helper); \ + } while (0) -template -static void LambUpdateParamAndBetaPows( - const platform::CUDADeviceContext &dev_ctx, - const MasterT *trust_ratio_div, const MasterT *lr, - const IndexT *index, const MasterT *param_square_norm, - const MasterT *trust_ratio_div_square_norm, - const MasterT *update_flag, MasterT **beta1pow, - MasterT **beta2pow, bool **found_inf, MasterT beta1, - MasterT beta2, int num, ParamT *param, - MasterT *master_param, gpuStream_t stream) { - if (num == 0) return; - - bool has_master_param = !(std::is_same>::value); - auto has_beta_pow = (*beta1pow) != nullptr && (*beta2pow) != nullptr; - auto has_found_inf = (*found_inf) != nullptr; - -#define PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL( \ - __has_master_param, __has_beta_pow, __has_found_inf) \ - do { \ - LambParamAndBetaPowsUpdateHelper \ - helper(param, master_param, *beta1pow, *beta2pow, beta1, beta2, \ - *found_inf, trust_ratio_div, lr, index, param_square_norm, \ - trust_ratio_div_square_norm, update_flag); \ - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, num); \ - LambUpdateParamAndBetaPowsCUDAKernel<<< \ - config.block_per_grid, config.thread_per_block, 0, stream>>>(helper, \ - num); \ +#define PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE \ + do { \ + auto callback = [&]( \ + const MultiTensorLauncher &launcher, \ + int launch_n) { \ + if (has_beta_pow && launch_n == 0) { \ + PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(true); \ + beta1pow = nullptr; \ + beta2pow = nullptr; \ + } else { \ + PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(false); \ + } \ + }; \ + MultiTensorApplyWithCallback( \ + stream, offsets, n, chunk_size, block_dim, callback); \ } while (0) - if (has_master_param) { - if (has_beta_pow) { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, true, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, true, false); - } - } else { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, false, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, false, false); - } - } - } else { - if (has_beta_pow) { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, true, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, true, false); - } - } else { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, false, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, false, false); - } - } - } + PD_VEC_LAUNCH_KERNEL(vec_size, + PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE); - *beta1pow = nullptr; - *beta2pow = nullptr; - *found_inf = nullptr; -#undef PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL +#undef PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW +#undef PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE } #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -1005,15 +1055,16 @@ class 
DistributedFusedLambOpKernel "Too many parameter number. Only <= %d is supported.", std::numeric_limits::max())); - // Step 3: Get FusedIndices, ParamInfo - const auto *indices = GetInputTensorPtr(ctx, "FusedIndices"); + // Step 3: Get ParamInfo const auto *param_info_tensor = GetInputTensorPtr(ctx, "ParamInfo"); auto fp32_local_start_idx = param_info_tensor[0]; auto fp32_local_param_num = param_info_tensor[1]; auto fp32_global_param_num = param_info_tensor[2]; - auto fp16_local_start_idx = param_info_tensor[3]; - auto fp16_local_param_num = param_info_tensor[4]; - auto fp16_global_param_num = param_info_tensor[5]; + auto fp32_weight_decay_end_idx = param_info_tensor[3]; + auto fp16_local_start_idx = param_info_tensor[4]; + auto fp16_local_param_num = param_info_tensor[5]; + auto fp16_global_param_num = param_info_tensor[6]; + auto fp16_weight_decay_end_idx = param_info_tensor[7]; auto local_param_num = fp32_local_param_num + fp16_local_param_num; auto param_num = fp32_global_param_num + fp16_global_param_num; @@ -1031,7 +1082,7 @@ class DistributedFusedLambOpKernel << " , fp16_global_param_num = " << fp16_global_param_num; // Step 4: Get LearningRate, Moment1, Moment2, Beta1Pow, Beta2Pow, - // WeightDecay, GlobalScale, FoundInf + // GlobalScale, FoundInf const auto *global_scale = GetInputTensorPtr(ctx, "GlobalScale"); const auto *lr = GetInputTensorPtr(ctx, "LearningRate"); int64_t partial_numel = 0; @@ -1065,14 +1116,15 @@ class DistributedFusedLambOpKernel GetSameInOutTensorPtr(ctx, place, "Beta1Pow", "Beta1PowOut"); auto *beta2pow = GetSameInOutTensorPtr(ctx, place, "Beta2Pow", "Beta2PowOut"); - const float *weight_decay = GetInputTensorPtr(ctx, "WeightDecay"); auto *found_inf_t = ctx.Output("FoundInf"); found_inf_t->Resize({1}); auto *found_inf = found_inf_t->mutable_data(place); - // Step 5: Get attributes beta1, beta2, epsilon, max_grad_norm, ring_id, + // Step 5: Get attributes weight_decay, beta1, beta2, epsilon, + // max_grad_norm, ring_id, // use_master_param_norm, is_grad_scaled_by_nranks + auto weight_decay = ctx.Attr("weight_decay"); auto beta1 = ctx.Attr("beta1"); auto beta2 = ctx.Attr("beta2"); auto epsilon = ctx.Attr("epsilon"); @@ -1105,7 +1157,8 @@ class DistributedFusedLambOpKernel platform::float16 *fp16_sum_grad; auto fp32_numel_each_device = fp32_numel / num_devices; auto fp16_numel_each_device = fp16_numel / num_devices; - if (num_devices > 1) { + if (num_devices > 1 || + (max_global_grad_norm > 0 && !clip_after_allreduce)) { auto ptr = sum_grad_buffer.Alloc( fp32_numel_each_device * sizeof(float) + fp16_numel_each_device * sizeof(platform::float16)); @@ -1181,7 +1234,11 @@ class DistributedFusedLambOpKernel float, platform::float16><<<1, 1, 0, stream>>>( global_scale, max_global_grad_norm, fp32_square_grad_norm, fp32_scale, fp16_scale, clip_scale); - VLOG(1) << "Grad scale: " << FlattenToString(fp32_scale, 1, place); + if (fp32_scale) { + VLOG(1) << "Grad scale: " << FlattenToString(fp32_scale, 1, place); + } else { + VLOG(1) << "Grad scale: " << FlattenToString(fp16_scale, 1, place); + } if (num_devices > 1) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( fp32_square_grad_norm, fp32_square_grad_norm, 1, ncclFloat32, @@ -1218,36 +1275,56 @@ class DistributedFusedLambOpKernel VLOG(10) << "ReduceScatter done"; // Step 7: update the moment1, moment2. 
Calcuate the trust_ratio_div + auto *fused_offsets_t = ctx.Input("FusedParamOffsets"); + auto *fused_offsets = fused_offsets_t->data(); + auto *fp32_partial_fused_offsets_t = + ctx.Input("FP32ShardFusedParamOffsets"); + const auto *fp32_partial_fused_offsets = + fp32_partial_fused_offsets_t->data(); + auto *fp16_partial_fused_offsets_t = + ctx.Input("FP16ShardFusedParamOffsets"); + const auto *fp16_partial_fused_offsets = + fp16_partial_fused_offsets_t->data(); + + VLOG(1) << "FusedParamOffsets: " + << FlattenToString(fused_offsets, fused_offsets_t->numel(), + fused_offsets_t->place()); + VLOG(1) << "FP32ShardFusedParamOffsets: " + << FlattenToString(fp32_partial_fused_offsets, + fp32_partial_fused_offsets_t->numel(), + fp32_partial_fused_offsets_t->place()); + VLOG(1) << "FP16ShardFusedParamOffsets: " + << FlattenToString(fp16_partial_fused_offsets, + fp16_partial_fused_offsets_t->numel(), + fp16_partial_fused_offsets_t->place()); + memory::Buffer trust_ratio_div_buffer(place); auto *trust_ratio_div = trust_ratio_div_buffer.Alloc(partial_numel); auto fp32_offset = rank * fp32_numel_each_device; auto fp16_offset = rank * fp16_numel_each_device; if (has_fp32_param) { - auto config = - platform::GetGpuLaunchConfig1D(dev_ctx, fp32_numel_each_device); VLOG(10) << "Update FP32 Moment and TrustRatioDiv starts"; - UpdateLambMoment<<>>( + MultiTensorUpdateLambMomentAndTrustRatioDiv( + dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num, fp32_param + fp32_offset, fp32_sum_grad, fp32_square_grad_norm, - global_scale, indices + fp32_offset, weight_decay, beta1pow, beta2pow, - moment1, moment2, trust_ratio_div, beta1, beta2, epsilon, - max_global_grad_norm, fp32_numel_each_device, rescale_grad); + global_scale, beta1pow, beta2pow, moment1, moment2, trust_ratio_div, + found_inf, weight_decay, fp32_weight_decay_end_idx, beta1, beta2, + epsilon, max_global_grad_norm, rescale_grad); VLOG(10) << "Update FP32 Moment and TrustRatioDiv done"; } float *master_param = nullptr; if (has_fp16_param) { master_param = fp32_param + fp32_numel; - auto config = - platform::GetGpuLaunchConfig1D(dev_ctx, fp16_numel_each_device); VLOG(10) << "Update FP16 Moment and TrustRatioDiv starts"; - UpdateLambMoment<<>>( + auto tmp_found_inf = has_fp32_param ? 
nullptr : found_inf; + MultiTensorUpdateLambMomentAndTrustRatioDiv( + dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num, master_param + fp16_offset, fp16_sum_grad, fp32_square_grad_norm, - global_scale, indices + fp32_numel + fp16_offset, weight_decay, - beta1pow, beta2pow, moment1 + fp32_numel_each_device, + global_scale, beta1pow, beta2pow, moment1 + fp32_numel_each_device, moment2 + fp32_numel_each_device, - trust_ratio_div + fp32_numel_each_device, beta1, beta2, epsilon, - max_global_grad_norm, fp16_numel_each_device, rescale_grad); + trust_ratio_div + fp32_numel_each_device, tmp_found_inf, weight_decay, + fp16_weight_decay_end_idx, beta1, beta2, epsilon, + max_global_grad_norm, rescale_grad); VLOG(10) << "Update FP16 Moment and TrustRatioDiv done"; } @@ -1257,30 +1334,6 @@ class DistributedFusedLambOpKernel memory::Buffer square_norm_buffer(place); auto *param_square_norm = square_norm_buffer.Alloc(2 * param_num); auto *trust_ratio_div_square_norm = param_square_norm + param_num; - - auto *fused_offsets_t = ctx.Input("FusedParamOffsets"); - auto *fused_offsets = fused_offsets_t->data(); - auto *fp32_partial_fused_offsets_t = - ctx.Input("FP32ShardFusedParamOffsets"); - const auto *fp32_partial_fused_offsets = - fp32_partial_fused_offsets_t->data(); - auto *fp16_partial_fused_offsets_t = - ctx.Input("FP16ShardFusedParamOffsets"); - const auto *fp16_partial_fused_offsets = - fp16_partial_fused_offsets_t->data(); - - VLOG(1) << "FusedParamOffsets: " - << FlattenToString(fused_offsets, fused_offsets_t->numel(), - fused_offsets_t->place()); - VLOG(1) << "FP32ShardFusedParamOffsets: " - << FlattenToString(fp32_partial_fused_offsets, - fp32_partial_fused_offsets_t->numel(), - fp32_partial_fused_offsets_t->place()); - VLOG(1) << "FP16ShardFusedParamOffsets: " - << FlattenToString(fp16_partial_fused_offsets, - fp16_partial_fused_offsets_t->numel(), - fp16_partial_fused_offsets_t->place()); - if (num_devices > 1) { if (use_master_param_norm) { FillZeroWithPtr(param_square_norm + fp32_global_param_num, @@ -1296,11 +1349,11 @@ class DistributedFusedLambOpKernel fp16_partial_fused_offsets, fp16_local_param_num, param_square_norm + fp16_local_start_idx); } else { - // NOTE: extra computation is performed. We can improve this performance - // if needed in the future. MultiTensorL2Norm( - place, stream, fp16_param, fused_offsets + fp32_global_param_num, - fp16_global_param_num, param_square_norm + fp32_global_param_num); + place, stream, fp16_param + fused_offsets[fp16_local_start_idx] - + fused_offsets[fp32_global_param_num], + fused_offsets + fp16_local_start_idx, fp16_local_param_num, + param_square_norm + fp16_local_start_idx); } MultiTensorL2Norm(place, stream, trust_ratio_div, @@ -1333,26 +1386,29 @@ class DistributedFusedLambOpKernel // Step 9: update parameter, beta1pow, beta2pow. All gather parameters. 
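// Each rank applies the LAMB trust-ratio update computed by
// LambUpdateParamAndBetaPowsFunctor above to its own parameter shard:
//   ratio = lr * sqrt(param_square_norm / trust_ratio_div_square_norm)
//           (plain lr if either squared norm is zero)
//   param -= ratio * trust_ratio_div
// and, when num_devices > 1, the ncclAllGather calls below rebuild the
// full fp32/fp16 parameters from the per-rank shards.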
if (has_fp32_param) { - LambUpdateParamAndBetaPows( - dev_ctx, trust_ratio_div, lr, indices + fp32_offset, - param_square_norm, trust_ratio_div_square_norm, fp32_square_grad_norm, - &beta1pow, &beta2pow, &found_inf, beta1, beta2, - fp32_numel_each_device, fp32_param + fp32_offset, nullptr, stream); + MultiTensorUpdateLambParamAndBetaPows( + dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num, + trust_ratio_div, lr, param_square_norm + fp32_local_start_idx, + trust_ratio_div_square_norm + fp32_local_start_idx, found_inf, + fp32_param + fp32_offset, nullptr, beta1pow, beta2pow, beta1, beta2); if (num_devices > 1) { // ncclAllGather PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( fp32_param + fp32_offset, fp32_param, fp32_numel_each_device, ncclFloat32, comm, stream)); } + + beta1pow = nullptr; + beta2pow = nullptr; } if (has_fp16_param) { - LambUpdateParamAndBetaPows( - dev_ctx, trust_ratio_div + fp32_numel_each_device, lr, - indices + fp32_numel + fp16_offset, param_square_norm, - trust_ratio_div_square_norm, fp32_square_grad_norm, &beta1pow, - &beta2pow, &found_inf, beta1, beta2, fp16_numel_each_device, - fp16_param + fp16_offset, master_param + fp16_offset, stream); - + MultiTensorUpdateLambParamAndBetaPows( + dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num, + trust_ratio_div + fp32_numel_each_device, lr, + param_square_norm + fp16_local_start_idx, + trust_ratio_div_square_norm + fp16_local_start_idx, found_inf, + fp16_param + fp16_offset, master_param + fp16_offset, beta1pow, + beta2pow, beta1, beta2); if (num_devices > 1) { // ncclAllGather PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( diff --git a/paddle/fluid/operators/optimizers/multi_tensor_apply.h b/paddle/fluid/operators/optimizers/multi_tensor_apply.h index 5d8d03c733d..179e8f45254 100644 --- a/paddle/fluid/operators/optimizers/multi_tensor_apply.h +++ b/paddle/fluid/operators/optimizers/multi_tensor_apply.h @@ -94,11 +94,40 @@ static __global__ void MultiTensorApplyCUDAKernel( args...); } -template -static void MultiTensorApply(Functor functor, gpuStream_t stream, - const int *offsets, int n, int chunk_size, - Args... args) { +template +class MultiTensorLauncher { + public: + MultiTensorLauncher( + const TensorMetaList &meta, + const int &chunk_id, const int &chunk_size, const int &block_dim, + const gpuStream_t &stream) + : meta_(meta), + chunk_id_(chunk_id), + chunk_size_(chunk_size), + block_dim_(block_dim), + stream_(stream) {} + + template + void Launch(Functor &&functor, Args &&... 
args) const { + MultiTensorApplyCUDAKernel< + Functor, MaxTensorNumPerLaunch, + MaxChunkNumPerLaunch><<>>( + functor, meta_, chunk_size_, args...); + } + + private: + const TensorMetaList &meta_; + const int &chunk_id_; + const int &chunk_size_; + const int &block_dim_; + const gpuStream_t &stream_; +}; + +template +static void MultiTensorApplyWithCallback(gpuStream_t stream, const int *offsets, + int n, int chunk_size, int block_dim, + Callback &&callback) { if (n == 0) return; constexpr auto NumTensor = MaxTensorNumPerLaunch; @@ -110,6 +139,11 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, int numel_offset = 0; metas.start_tensor_id = 0; metas.start_chunk_id = 0; + int launch_num = 0; + + MultiTensorLauncher launcher( + metas, chunk_id, chunk_size, block_dim, stream); + for (int i = 0; i < n; ++i) { auto length = offsets[i + 1] - offsets[i]; if (tensor_id == 0) { @@ -132,9 +166,8 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, bool last_chunk = (i + 1 == n && j + 1 == chunk_num); if (tensor_full || block_full || last_chunk) { - MultiTensorApplyCUDAKernel<<>>( - functor, metas, chunk_size, args...); + callback(launcher, launch_num); + ++launch_num; chunk_id = 0; if (j + 1 == chunk_num) { // chunk for the current tensor is full metas.start_chunk_id = 0; @@ -152,5 +185,17 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, } } +template +static void MultiTensorApply(Functor functor, gpuStream_t stream, + const int *offsets, int n, int chunk_size, + int block_dim, Args &&... args) { + auto callback = [&](const MultiTensorLauncher &launcher, + int i) { launcher.Launch(functor, args...); }; + MultiTensorApplyWithCallback( + stream, offsets, n, chunk_size, block_dim, callback); +} + } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py index e0529c5d5f8..00d2a1f71d6 100644 --- a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py @@ -144,6 +144,11 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): grad_clip = kwargs.get('grad_clip', None) clip_after_allreduce = kwargs.get('clip_after_allreduce', True) + parameters = [p.name for p in main.all_parameters()] + exclude_fn = lambda var: var.name in parameters[::4] + kwargs['exclude_from_weight_decay_fn'] = exclude_fn + kwargs['lamb_weight_decay'] = 0.1 + if use_distributed_lamb: optimizer_class = DistributedFusedLamb kwargs = dict(kwargs) diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index e7c3cfbb7b9..cc33a909632 100644 --- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -171,10 +171,7 @@ class DistributedFusedLamb(Optimizer): moment2.is_distributed = True beta1pow = self._create_persistable_var('beta1pow') beta2pow = self._create_persistable_var('beta2pow') - fused_indices = self._create_persistable_var( - 'fused_indices', dtype='int32') - weight_decay = self._create_persistable_var('weight_decay') - weight_decay.is_distributed = True + param_info = self._create_persistable_var('param_info', dtype='int32') param_info.is_distributed = True @@ -189,17 +186,20 @@ class DistributedFusedLamb(Optimizer): 'fp16_partial_fused_offsets', dtype='int32') 
fp16_partial_fused_offsets.is_distributed = True + param_order = self._create_persistable_var('param_order', dtype='int32') + param_order.is_distributed = True + rank = get_rank() nranks = get_world_size() scale = self._get_or_create_scale() params = [p for p, _ in params_grads] grads = [g for _, g in params_grads] - weight_decay_values = [self._weight_decay] * len(params) + apply_weight_decay = [1] * len(params) if self._exclude_from_weight_decay_fn is not None: for i, p in enumerate(params): if self._exclude_from_weight_decay_fn(p): - weight_decay_values[i] = 0.0 + apply_weight_decay[i] = 0 startup_block = self.helper.startup_program.global_block() for g in grads: @@ -225,8 +225,6 @@ class DistributedFusedLamb(Optimizer): 'Moment2': [moment2], 'Beta1Pow': [beta1pow], 'Beta2Pow': [beta2pow], - 'FusedIndices': [fused_indices], - 'WeightDecay': [weight_decay], 'GlobalScale': [scale], 'ParamInfo': [param_info], 'ParamOut': params, @@ -235,12 +233,13 @@ class DistributedFusedLamb(Optimizer): 'FP32ShardFusedParamOffsets': [fp32_partial_fused_offsets], 'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets], 'FusedParamOffsets': [fused_offsets], + 'ParamOrder': [param_order], }, attrs={ 'alignment': self._alignment, 'rank': rank, 'nranks': nranks, - 'weight_decay': weight_decay_values, + 'apply_weight_decay': apply_weight_decay, 'moment1': 0.0, 'moment2': 0.0, 'beta1': self._beta1, @@ -272,8 +271,6 @@ class DistributedFusedLamb(Optimizer): 'Moment2': [moment2], 'Beta1Pow': [beta1pow], 'Beta2Pow': [beta2pow], - 'FusedIndices': [fused_indices], - 'WeightDecay': [weight_decay], 'GlobalScale': [scale], 'ParamInfo': [param_info], 'Param': params, @@ -281,6 +278,7 @@ class DistributedFusedLamb(Optimizer): 'FusedParamOffsets': [fused_offsets], 'FP32ShardFusedParamOffsets': [fp32_partial_fused_offsets], 'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets], + 'ParamOrder': [param_order], }, outputs={ 'FP32FusedParamOut': [fp32_fused_param], @@ -294,6 +292,7 @@ class DistributedFusedLamb(Optimizer): 'FoundInf': [self._found_inf], }, attrs={ + 'weight_decay': self._weight_decay, 'beta1': self._beta1, 'beta2': self._beta2, 'epsilon': self._epsilon, -- GitLab From 4149cabeec527fa171a45a10ab21ba7fd1374a3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Tue, 1 Mar 2022 10:00:01 +0800 Subject: [PATCH 005/272] add type constrait for DenseTensor (#39967) --- paddle/infrt/dialect/infrt/infrt_ops_base.td | 6 ++++++ paddle/infrt/dialect/init_infrt_dialects.cc | 4 ++-- paddle/infrt/dialect/phi/CMakeLists.txt | 11 +---------- paddle/infrt/dialect/phi/ir/CMakeLists.txt | 9 +++++++++ paddle/infrt/dialect/phi/{ => ir}/infrt_phi_base.td | 0 .../infrt/dialect/phi/{ => ir}/infrt_phi_kernel.td | 2 +- .../infrt/dialect/phi/{ => ir}/infrt_phi_tensor.cc | 10 +++++----- paddle/infrt/dialect/phi/{ => ir}/infrt_phi_tensor.h | 8 ++++---- .../infrt/dialect/phi/{ => ir}/infrt_phi_tensor.td | 2 +- paddle/infrt/dialect/phi/{ => ir}/phi_base.cc | 12 ++++++------ paddle/infrt/dialect/phi/{ => ir}/phi_base.h | 8 +++++--- 11 files changed, 40 insertions(+), 32 deletions(-) create mode 100644 paddle/infrt/dialect/phi/ir/CMakeLists.txt rename paddle/infrt/dialect/phi/{ => ir}/infrt_phi_base.td (100%) rename paddle/infrt/dialect/phi/{ => ir}/infrt_phi_kernel.td (92%) rename paddle/infrt/dialect/phi/{ => ir}/infrt_phi_tensor.cc (71%) rename paddle/infrt/dialect/phi/{ => ir}/infrt_phi_tensor.h (83%) rename paddle/infrt/dialect/phi/{ => ir}/infrt_phi_tensor.td 
(97%) rename paddle/infrt/dialect/phi/{ => ir}/phi_base.cc (84%) rename paddle/infrt/dialect/phi/{ => ir}/phi_base.h (84%) diff --git a/paddle/infrt/dialect/infrt/infrt_ops_base.td b/paddle/infrt/dialect/infrt/infrt_ops_base.td index 81d3d028a66..978b126d754 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops_base.td +++ b/paddle/infrt/dialect/infrt/infrt_ops_base.td @@ -40,6 +40,12 @@ def DenseTensor : Infrt_Type<"DenseTensor"> { ); } +// Type Constrait for concrete DenseTensor type. +class DenseTensor : + Type, + "!infrt.DenseTensor<"#target#","#precision#","#layout#">", + "::infrt::DenseTensorType">; + // Base class for infrt dialect attributes. class Infrt_Attr traits = [], string baseCppClass = "::mlir::Attribute"> diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc index b5b8de7a20d..c5c81b4b0f2 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -21,8 +21,8 @@ #include "paddle/infrt/dialect/infrt/infrt_dialect.h" #include "paddle/infrt/dialect/infrt_base.h" #include "paddle/infrt/dialect/pd_ops.h" -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h" -#include "paddle/infrt/dialect/phi/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/tensor_shape.h" namespace infrt { diff --git a/paddle/infrt/dialect/phi/CMakeLists.txt b/paddle/infrt/dialect/phi/CMakeLists.txt index 626b02c1f79..d477b6b9bdc 100644 --- a/paddle/infrt/dialect/phi/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/CMakeLists.txt @@ -2,16 +2,7 @@ if (NOT INFRT_WITH_PHI) return() endif() -#mlir_tablegen_on(infrt_phi_base DIALECT phi) -add_mlir_dialect(infrt_phi_base phi) -add_mlir_dialect(infrt_phi_tensor phi_dt) -add_mlir_dialect(infrt_phi_kernel phi_kernel) -#mlir_tablegen_on(infrt_phi_tensor) - -gather_srcs(infrt_src SRCS - phi_base.cc infrt_phi_tensor.cc - infrt_phi_tensor.cc) - +add_subdirectory(ir) add_subdirectory(pass) add_executable(phi-exec phi_exec.cc) diff --git a/paddle/infrt/dialect/phi/ir/CMakeLists.txt b/paddle/infrt/dialect/phi/ir/CMakeLists.txt new file mode 100644 index 00000000000..8c1d75629d0 --- /dev/null +++ b/paddle/infrt/dialect/phi/ir/CMakeLists.txt @@ -0,0 +1,9 @@ +#mlir_tablegen_on(infrt_phi_base DIALECT phi) +add_mlir_dialect(infrt_phi_base phi) +add_mlir_dialect(infrt_phi_tensor phi_dt) +add_mlir_dialect(infrt_phi_kernel phi_kernel) +#mlir_tablegen_on(infrt_phi_tensor) + +gather_srcs(infrt_src SRCS + phi_base.cc + infrt_phi_tensor.cc) diff --git a/paddle/infrt/dialect/phi/infrt_phi_base.td b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td similarity index 100% rename from paddle/infrt/dialect/phi/infrt_phi_base.td rename to paddle/infrt/dialect/phi/ir/infrt_phi_base.td diff --git a/paddle/infrt/dialect/phi/infrt_phi_kernel.td b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td similarity index 92% rename from paddle/infrt/dialect/phi/infrt_phi_kernel.td rename to paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td index 879994907cc..37bf0b5ef21 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_kernel.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td @@ -4,7 +4,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/infrt_base.td" -include "paddle/infrt/dialect/phi/infrt_phi_base.td" +include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" def PHI_KernelDialect : Dialect { let name = "phi_kernel"; diff --git 
a/paddle/infrt/dialect/phi/infrt_phi_tensor.cc b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cc similarity index 71% rename from paddle/infrt/dialect/phi/infrt_phi_tensor.cc rename to paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cc index 9df1a47031b..64780294be9 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_tensor.cc +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include -#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.cpp.inc" -#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorDialect.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorTypes.cpp.inc" namespace infrt { namespace phi { @@ -25,7 +25,7 @@ namespace phi { void PHIDenseTensorDialect::initialize() { #define GET_OP_LIST addOperations< -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cpp.inc" >(); } @@ -33,4 +33,4 @@ void PHIDenseTensorDialect::initialize() { } // namespace infrt #define GET_OP_CLASSES -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/phi/infrt_phi_tensor.h b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h similarity index 83% rename from paddle/infrt/dialect/phi/infrt_phi_tensor.h rename to paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h index 2780f975918..9a92558daab 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_tensor.h +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h @@ -29,11 +29,11 @@ #include #include -#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.h.inc" -#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorDialect.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorTypes.h.inc" #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/phi/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" // NOLINT #define GET_OP_CLASSES -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h.inc" diff --git a/paddle/infrt/dialect/phi/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td similarity index 97% rename from paddle/infrt/dialect/phi/infrt_phi_tensor.td rename to paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index b7b3b061fdb..dc3a4b340d7 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -2,7 +2,7 @@ #else #define PHI_TENSOR -include "paddle/infrt/dialect/phi/infrt_phi_base.td" +include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/infrt_base.td" diff --git a/paddle/infrt/dialect/phi/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc similarity index 84% rename from paddle/infrt/dialect/phi/phi_base.cc rename to paddle/infrt/dialect/phi/ir/phi_base.cc index a1caa40f638..7a6b3f3f0a4 100644 --- a/paddle/infrt/dialect/phi/phi_base.cc +++ b/paddle/infrt/dialect/phi/ir/phi_base.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/infrt/dialect/phi/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" #include #include @@ -21,8 +21,8 @@ #include #include #include "paddle/infrt/common/global.h" -#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc" -#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.cpp.inc" namespace infrt { namespace phi { @@ -51,11 +51,11 @@ void PHIDialect::printType(::mlir::Type type, void PHIDialect::initialize() { addOperations< #define GET_OP_LIST -#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc" // NOLINT >(); addTypes< #define GET_TYPEDEF_LIST -#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.cpp.inc" // NOLINT >(); } @@ -81,4 +81,4 @@ mlir::Type PHIDialect::parseType(mlir::DialectAsmParser& parser) const { } // namespace infrt #define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/phi/phi_base.h b/paddle/infrt/dialect/phi/ir/phi_base.h similarity index 84% rename from paddle/infrt/dialect/phi/phi_base.h rename to paddle/infrt/dialect/phi/ir/phi_base.h index 11174290f92..a08d8229fcc 100644 --- a/paddle/infrt/dialect/phi/phi_base.h +++ b/paddle/infrt/dialect/phi/ir/phi_base.h @@ -19,11 +19,13 @@ #include -#include "paddle/infrt/dialect/phi/infrt_phi_base.h.inc" -#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.h.inc" #define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.h.inc" + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.h.inc" namespace mlir { namespace OpTrait { -- GitLab From 75280d36afe1e5e4aab0df51a9d7ee0828ee12fa Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Tue, 1 Mar 2022 10:24:17 +0800 Subject: [PATCH 006/272] remove dot infershape (#39945) --- paddle/fluid/operators/dot_op.cc | 55 ++++++-------------------------- 1 file changed, 9 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc index ed2b09796ee..a86a3bb3592 100644 --- a/paddle/fluid/operators/dot_op.cc +++ b/paddle/fluid/operators/dot_op.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/operators/dot_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -21,51 +25,6 @@ class DotOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(true, ctx->HasInput("X"), - platform::errors::PreconditionNotMet( - "Input(X) of DotOp should not be null.")); - PADDLE_ENFORCE_EQ(true, ctx->HasInput("Y"), - platform::errors::PreconditionNotMet( - "Input(Y) of DotOp should not be null.")); - PADDLE_ENFORCE_EQ(true, ctx->HasOutput("Out"), - platform::errors::PreconditionNotMet( - "Output(Out) of DotOp should not be null.")); - - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = static_cast(x_dims.size()); - 
PADDLE_ENFORCE_EQ(true, 1 == x_rank || 2 == x_rank, - platform::errors::PreconditionNotMet( - "ShapeError: The dimensions of input tensor X (%s) " - "should be 1 or 2", - x_dims.to_str())); - - auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ( - true, x_rank == (size_t)y_dims.size(), - platform::errors::PreconditionNotMet( - "ShapeError: The shape of input tensor Y: %s should match with " - "input tenosr X: %s", - y_dims.to_str(), x_dims.to_str())); - bool shape_match = true; - for (size_t i = 0; i < x_rank; ++i) { - if (x_dims[i] != y_dims[i]) { - shape_match = false; - break; - } - } - - PADDLE_ENFORCE_EQ(true, shape_match, - platform::errors::PreconditionNotMet( - "ShapeError: The shape of input tensor X: %s should " - "be exactly the same " - "with input tensor Y: %s", - x_dims.to_str(), y_dims.to_str())); - auto dims = vectorize(x_dims); - dims[dims.size() - 1] = 1; - ctx->SetOutputDim("Out", phi::make_ddim(dims)); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( @@ -142,9 +101,13 @@ class DotOpGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor, + PT_INFER_META(phi::DotInferMeta)); + REGISTER_OPERATOR(dot, ops::DotOp, ops::DotOpMaker, ops::DotOpGradMaker, - ops::DotOpGradMaker); + ops::DotOpGradMaker, + DotInferShapeFunctor); REGISTER_OPERATOR(dot_grad, ops::DotGradOp); -- GitLab From 657dd5a97de6b54e59aa60a7d7afcab33bf36420 Mon Sep 17 00:00:00 2001 From: crystal <62974595+Zjq9409@users.noreply.github.com> Date: Tue, 1 Mar 2022 10:48:13 +0800 Subject: [PATCH 007/272] Optimize group_norm op forward (#39596) * optimize group norm forward * use vectorized optimization * add scalar calculation code * optimize code --- paddle/fluid/operators/group_norm_op.cu | 149 ++++++++++++++++++++---- 1 file changed, 129 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 72a90d17998..b376334f1e9 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -29,6 +29,7 @@ namespace operators { using DataLayout = framework::DataLayout; enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 }; +#define ALIGN_BYTES 16 #define CHECK_CASE(i, flags, kernel_name, ...) 
\ if (i == flags) { \ @@ -56,8 +57,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { template __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, int imsize, int groups, - int group_size, T* mean, T* var, - const DataLayout data_layout) { + int group_size, T* mean, T* var) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; @@ -68,13 +68,10 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, T x_mean = 0, x_var = 0; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T val; - if (data_layout == DataLayout::kNCHW) { - val = x[(bid * C + ccid) * imsize + imid]; - } else { - int hid = imid / W; - int wid = imid % W; - val = x[(bid * H + hid) * W * C + wid * C + ccid]; - } + int hid = imid / W; + int wid = imid % W; + val = x[(bid * H + hid) * W * C + wid * C + ccid]; + x_mean += val; x_var += val * val; } @@ -84,6 +81,85 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var); } +template +__device__ __forceinline__ void ThreadReduce(const T* input, int size, + const int offset, AccT* mean, + AccT* var) { + using VecT = kps::details::VectorType; + int tid = threadIdx.x; + if (offset > 0) { + input -= offset; + size += offset; + if (tid >= offset) { + AccT temp = input[tid]; + *mean += temp; + *var += temp * temp; + } + size -= blockDim.x; + input += blockDim.x; + } + int remain = size % (VecSize * blockDim.x); + + T ins[VecSize]; + VecT* ins_vec = reinterpret_cast(&ins); + + // vector part + for (; VecSize * tid < (size - remain); tid += blockDim.x) { + *ins_vec = reinterpret_cast(input)[tid]; + +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + AccT temp = ins[i]; + *mean += temp; + *var += temp * temp; + } + } + + // scalar part + tid = size - remain + threadIdx.x; + for (; tid < size; tid += blockDim.x) { + AccT temp = input[tid]; + *mean += temp; + *var += temp * temp; + } +} + +template +__global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) { + int i = blockIdx.x; + T x_mean = 0, x_var = 0; + for (int j = threadIdx.x; j < size; j += blockDim.x) { + T val; + val = x[i * size + j]; + x_mean += val; + x_var += val * val; + } + x_mean /= size; + x_var /= size; + CudaAtomicAddWithWarp(&mean[i], x_mean); + CudaAtomicAddWithWarp(&var[i], x_var); +} + +template +__global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var, + int size) { + int i = blockIdx.x; + AccT x_mean = static_cast(0); + AccT x_var = static_cast(0); + const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); + x += i * size; + ThreadReduce(x, size, input_offset, &x_mean, &x_var); + x_mean = kps::details::BlockXReduce>( + x_mean, kps::AddFunctor()); + x_var = kps::details::BlockXReduce>( + x_var, kps::AddFunctor()); + __syncthreads(); + if (threadIdx.x == 0) { + mean[i] = static_cast(x_mean / size); + var[i] = static_cast(x_var / size); + } +} + template __global__ void GroupNormForward(const T* x, const T* mean, const T* var, const T* scale, const T* bias, int N, int C, @@ -96,26 +172,34 @@ __global__ void GroupNormForward(const T* x, const T* mean, const T* var, int H = imsize / W; int ccid = gid * group_size + cid; if (ccid >= C) return; - T x_mean = mean[bid * groups + gid]; - T x_var = var[bid * groups + gid]; + auto ng = bid * groups + gid; + T x_mean = mean[ng]; + T x_var = var[ng]; x_var = x_var - x_mean * x_mean; - T var_inv = 1.0 / sqrt(x_var + epsilon); - if (cid == 0 && 
threadIdx.x == 0) real_var[bid * groups + gid] = x_var; + T var_inv = rsqrt(x_var + epsilon); + if (cid == 0 && threadIdx.x == 0) { + real_var[ng] = x_var; + } for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T val; int hid, wid; + int index = (bid * C + ccid) * imsize + imid; if (data_layout == DataLayout::kNCHW) { - val = x[(bid * C + ccid) * imsize + imid]; + val = x[index]; } else { hid = imid / W; wid = imid % W; val = x[(bid * H + hid) * W * C + wid * C + ccid]; } val = (val - x_mean) * var_inv; - if (flags & kHasScale) val *= scale[gid * group_size + cid]; - if (flags & kHasBias) val += bias[gid * group_size + cid]; + if (flags & kHasScale) { + val *= scale[ccid]; + } + if (flags & kHasBias) { + val += bias[ccid]; + } if (data_layout == DataLayout::kNCHW) { - y[(bid * C + ccid) * imsize + imid] = val; + y[index] = val; } else { y[(bid * H + hid) * W * C + wid * C + ccid] = val; } @@ -182,16 +266,41 @@ class GroupNormKernel imsize *= x_dims[i]; } } + #ifdef __HIPCC__ int block_size = std::max(std::min(256, imsize), 64); #else int block_size = std::min(1024, imsize); #endif + dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); - GroupNormForwardGetMeanAndVar<<>>( - x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data, - temp_var_data, data_layout); + if (data_layout == DataLayout::kNCHW) { + using AccT = typename details::MPTypeTrait::Type; + constexpr int vec_size = sizeof(float4) / sizeof(T); + int size = group_size * imsize; + const int max_num_threads = 1024; + int max_block_size = std::min(size / vec_size, max_num_threads); + int block_size_nchw = 1; + while (block_size_nchw < max_block_size) { + block_size_nchw *= 2; + } + block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); + dim3 grids(x_dims[0] * groups); + dim3 blocks(block_size_nchw); + if (size < vec_size) { + ScalarGetMeanAndVarNCHW<<>>( + x_data, mean_data, temp_var_data, size); + } else { + VectorizedGetMeanAndVarNCHW< + T, AccT, vec_size><<>>( + x_data, mean_data, temp_var_data, size); + } + } else { + GroupNormForwardGetMeanAndVar<<>>( + x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data, + temp_var_data); + } int flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; UNROLL_ALL_CASES(flags, GroupNormForward, x_data, mean_data, temp_var_data, -- GitLab From 4da841e0caeb36b758039b4afa8758dd91d6252c Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Tue, 1 Mar 2022 10:53:16 +0800 Subject: [PATCH 008/272] [DP] Construct reducer group (#39987) * add reducer --- .../distributed/collective/CMakeLists.txt | 1 + .../fluid/distributed/collective/reducer.cc | 131 ++++++++++++++ paddle/fluid/distributed/collective/reducer.h | 32 ++++ paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/fluid/pybind/distributed_py.cc | 14 ++ python/paddle/fluid/dygraph/parallel.py | 8 +- .../tests/unittests/test_imperative_group.py | 168 ++++++++---------- 7 files changed, 265 insertions(+), 91 deletions(-) create mode 100644 paddle/fluid/distributed/collective/reducer.cc create mode 100644 paddle/fluid/distributed/collective/reducer.h diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 41652f8b6ed..a5b40f8aa07 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -1,4 +1,5 @@ cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api) +cc_library(eager_reducer SRCS reducer.cc DEPS 
eager_api processgroup) if(WITH_NCCL) cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc new file mode 100644 index 00000000000..59f3ea3b0a7 --- /dev/null +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/collective/reducer.h" +#include "paddle/phi/common/data_type.h" + +namespace paddle { +namespace distributed { + +std::vector> Eager_AssignGroupBySize( + const std::vector tensors, + const std::vector &is_sparse_gradient, + const std::vector &group_size_limits, + const std::vector &tensor_indices) { + PADDLE_ENFORCE_EQ( + tensors.size(), is_sparse_gradient.size(), + platform::errors::PreconditionNotMet( + "tensors len must be equal to is_sparse_gradient len, but " + "[%lu] != [%lu]", + tensors.size(), is_sparse_gradient.size())); + auto check_perm = [](const std::vector &x) -> bool { + size_t len = x.size(); + std::vector cnt(len, 0); + for (size_t i = 0; i < len; ++i) { + if (x[i] >= static_cast(len) || x[i] < 0 || cnt[x[i]]) { + return false; + } + cnt[x[i]]++; + } + return true; + }; + + PADDLE_ENFORCE_EQ(true, check_perm(tensor_indices), + platform::errors::PreconditionNotMet( + "tensor_indices must be a permutation from 0 to %lu", + tensor_indices.size())); + // the return vector + std::vector> res; + + // Key: the var type + // Value: should use which index in group_size_limits for group size limit + std::map group_limit_index; + + // Key: the var type + // Value: + std::map, size_t>> + next_group; + + for (size_t i = 0; i < tensors.size(); ++i) { + const auto &var = tensors[i]; + + size_t tensor_real_index = i; + if (!tensor_indices.empty()) { + tensor_real_index = tensor_indices[i]; + } + + if (is_sparse_gradient[tensor_real_index]) { + // we keep sparse var a single group + res.push_back({tensor_real_index}); + continue; + } + + const auto &var_dtype = var.dtype(); + VLOG(3) << "var[" << var.name() << "] 's type is " << var_dtype; + auto &group_info = next_group[var_dtype]; + + int64_t var_size = -1; + + if (var.is_dense_tensor()) { + var_size = + std::dynamic_pointer_cast(var.impl())->numel(); + } else { + VLOG(3) << "var " << var.name() + << " is not tensor or selected_rows, so skip it"; + continue; + } + + group_info.first.push_back(tensor_real_index); + group_info.second += experimental::SizeOf(var_dtype) * var_size; + // group_info.second += framework::SizeOfType(var_dtype) * var_size; + + if (group_limit_index.find(var_dtype) == group_limit_index.end()) { + // means it is the first var of var_dtype + group_limit_index[var_dtype] = 0; + } + auto &cur_limit_index = group_limit_index[var_dtype]; + if (group_info.second >= group_size_limits[cur_limit_index]) { + // exceed group capacity and create a new 
group + res.emplace_back(std::move(group_info.first)); + group_info = std::pair, size_t>(); + cur_limit_index = + (std::min)(cur_limit_index + 1, group_size_limits.size() - 1); + } + } + + // add the final groups + for (auto &e : next_group) { + auto &group_info = e.second; + if (!group_info.first.empty()) { + res.emplace_back(std::move(group_info.first)); + } + } + + for (const auto &group_index : res) { + PADDLE_ENFORCE_NE( + group_index.empty(), true, + platform::errors::PreconditionNotMet( + "AssignGroupBySize construct empty group, please check.")); + } + if (tensor_indices.empty()) { + std::sort(res.begin(), res.end(), + [](const std::vector &x, const std::vector &y) { + return x.front() < y.front(); + }); + } + return res; +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h new file mode 100644 index 00000000000..f8c75385ef8 --- /dev/null +++ b/paddle/fluid/distributed/collective/reducer.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" + +namespace paddle { +namespace distributed { +using Tensor = paddle::experimental::Tensor; + +std::vector> Eager_AssignGroupBySize( + const std::vector, const std::vector& is_sparse_gradient, + const std::vector& group_size_limits, + const std::vector& tensor_indices = {}); + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 1f06eda8a2e..c61e8212b02 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -81,7 +81,7 @@ set(PYBIND_SRCS cuda_streams_py.cc) if(NOT ON_INFER) - set (PYBIND_DEPS ${PYBIND_DEPS} processgroup) + set (PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer) if (WITH_NCCL) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) endif() diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index e057fb53cce..7b59188a9f3 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -23,6 +23,7 @@ limitations under the License. 
*/ #include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/distributed/collective/Types.h" +#include "paddle/fluid/distributed/collective/reducer.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/imperative/layer.h" @@ -143,6 +144,19 @@ void BindDistributed(py::module *m) { [](distributed::ProcessGroupStrategy &self, int nrings) { self.nrings_ = nrings; }); + + m->def("eager_assign_group_by_size", + [](py::handle py_tensors, std::vector is_sparse_gradient, + std::vector group_size_limits, + std::vector tensor_indices) { + auto tensors = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); + return distributed::Eager_AssignGroupBySize( + tensors, is_sparse_gradient, group_size_limits, tensor_indices); + }, + py::arg("tensors"), py::arg("is_sparse_gradient"), + py::arg("group_size_limits") = std::vector{25 * 1024 * 1024}, + py::arg("tensor_indices") = std::vector{}, + py::call_guard()); } } // end namespace pybind diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index ddb86848f84..0049f387b70 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -560,13 +560,19 @@ class DataParallel(layers.Layer): strategy=None, comm_buffer_size=25, last_comm_buffer_size=1, - find_unused_parameters=False): + find_unused_parameters=False, + process_group=None, + gradient_as_buffer_view=False, + static_graph=False): super(DataParallel, self).__init__(layers.full_name() + "_data_parallel") self._layers = layers self.find_unused_parameters = find_unused_parameters self.grad_need_sync = True + self.process_group = process_group + self.gradient_as_buffer_view = gradient_as_buffer_view + self.static_graph = static_graph # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy. 
# It just stores some environment variables, which can be constructed by diff --git a/python/paddle/fluid/tests/unittests/test_imperative_group.py b/python/paddle/fluid/tests/unittests/test_imperative_group.py index f9635809651..89535797ed0 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_group.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_group.py @@ -26,159 +26,149 @@ import paddle.fluid.dygraph as dygraph from paddle.fluid.dygraph.nn import Linear import paddle.fluid.core as core from paddle.fluid.optimizer import SGDOptimizer - - -class MLP(fluid.Layer): - def __init__(self, param_attr=None, bias_attr=None): - super(MLP, self).__init__() - - self._linear1 = Linear(784, 10) - self._linear2 = Linear(10, 10) - - def forward(self, inputs): - y = self._linear1(inputs) - y = self._linear2(y) - return y +from paddle.fluid.framework import _test_eager_guard class TestDataParallelGroup(unittest.TestCase): - def create_varbase(self, dtype, shape, - type=core.VarDesc.VarType.LOD_TENSOR): - return core.VarBase(dtype, shape, "", type, True) + def create_varbase(self, dtype, shape): + return paddle.rand(shape=shape, dtype=dtype) + + def assign_group_by_size(self, *args): + return core.assign_group_by_size(*args) def test_construct_group0(self): # one dtype & one limit capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [2, 100])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - res = core.assign_group_by_size(var_list, [False, False, False, False], + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 100])) + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 25])) + res = self.assign_group_by_size(var_list, [False, False, False, False], [400]) self.assertEqual([[0], [1], [2], [3]], res) def test_construct_group1(self): # multi dtype & one limit capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [False, False, False, False, False, False], [400]) self.assertEqual([[0, 2], [1, 3], [4], [5]], res) def test_construct_group2(self): # one dtype & multi limit capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - res = 
core.assign_group_by_size(var_list, [False, False, False, False], + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 50])) + res = self.assign_group_by_size(var_list, [False, False, False, False], [400, 800]) self.assertEqual([[0], [1, 2], [3]], res) def test_construct_group3(self): # multi dtype & multi limit capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [False, False, False, False, False, False], [200, 400]) self.assertEqual([[0], [1], [2, 4], [3, 5]], res) def test_construct_group4(self): # multi dtype & zero limit capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [False, False, False, False, False, False], [0]) self.assertEqual([[0], [1], [2], [3], [4], [5]], res) def test_construct_group5(self): # multi dtype & infinite capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [False, 
False, False, False, False, False], [10000]) self.assertEqual([[0, 2, 4], [1, 3, 5]], res) def test_construct_group6(self): # multi dtype & limit capability & multi tensor type var_list = [] - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [1, 50], - core.VarDesc.VarType.SELECTED_ROWS)) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP64, [1, 25], - core.VarDesc.VarType.SELECTED_ROWS)) - res = core.assign_group_by_size( + var_list.append(self.create_varbase( + "float32", + [1, 50], )) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [True, False, False, False, False, True], [400]) self.assertEqual([[0], [1, 3], [2, 4], [5]], res) def test_construct_group7(self): # multi dtype & multi limit capability & multi tensor type var_list = [] - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [1, 50], - core.VarDesc.VarType.SELECTED_ROWS)) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP64, [1, 25], - core.VarDesc.VarType.SELECTED_ROWS)) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [True, False, False, False, False, True], [200, 400]) self.assertEqual([[0], [1], [2], [3], [4], [5]], res) def test_construct_group8(self): # one dtype & one limit capability & have tensor_indices var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [2, 100])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - res = core.assign_group_by_size(var_list, [False, False, False, False], + var_list.append(self.create_varbase("float32", [2, 25])) + var_list.append(self.create_varbase("float32", [2, 100])) + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 25])) + res = self.assign_group_by_size(var_list, [False, False, False, False], [400], [3, 0, 1, 2]) self.assertEqual([[3, 0], [1], [2]], res) def test_construct_group9(self): # one dtype & one limit capability & have tensor_indices var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - 
var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [2, 1000])) - res = core.assign_group_by_size(var_list, [False, False, False, True], + var_list.append(self.create_varbase("float32", [2, 25])) + var_list.append(self.create_varbase("float32", [2, 25])) + var_list.append(self.create_varbase("float32", [2, 25])) + var_list.append(self.create_varbase("float32", [2, 1000])) + res = self.assign_group_by_size(var_list, [False, False, False, True], [300], [1, 0, 2, 3]) self.assertEqual([[1, 0], [3], [2]], res) +class TestDataParallelGroupEager(TestDataParallelGroup): + def create_varbase(self, dtype, shape): + with _test_eager_guard(): + return paddle.rand(shape=shape, dtype=dtype) + + def assign_group_by_size(self, *args): + return core.eager_assign_group_by_size(*args) + + if __name__ == '__main__': unittest.main() -- GitLab From 8c2379732257f6d6bdf8fbe9157afea51a364942 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 1 Mar 2022 10:59:51 +0800 Subject: [PATCH 009/272] [Phi] Migrate logical_and/or/not/xor into Phi (#39942) * [Phi] Migrate logical_and/or/not/xor into Phi * fix unittest * fix function name --- .../operators/controlflow/CMakeLists.txt | 2 +- .../fluid/operators/controlflow/logical_op.cc | 10 +- .../fluid/operators/controlflow/logical_op.cu | 69 ----------- .../fluid/operators/controlflow/logical_op.h | 111 ------------------ .../operators/controlflow/logical_op_npu.cc | 2 +- paddle/phi/kernels/cpu/logical_kernel.cc | 72 ++++++++++++ paddle/phi/kernels/funcs/logical_functor.h | 41 +++++++ paddle/phi/kernels/gpu/logical_kernel.cu | 79 +++++++++++++ paddle/phi/kernels/logical_kernel.h | 38 ++++++ .../fluid/tests/unittests/test_diff_op.py | 2 +- 10 files changed, 234 insertions(+), 192 deletions(-) delete mode 100644 paddle/fluid/operators/controlflow/logical_op.cu delete mode 100644 paddle/fluid/operators/controlflow/logical_op.h create mode 100644 paddle/phi/kernels/cpu/logical_kernel.cc create mode 100644 paddle/phi/kernels/funcs/logical_functor.h create mode 100644 paddle/phi/kernels/gpu/logical_kernel.cu create mode 100644 paddle/phi/kernels/logical_kernel.h diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index 1a2df2a0c7b..a974f2ec335 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -20,5 +20,5 @@ else() endif() file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n") -file(APPEND ${pybind_file} "USE_OP(logical_and);\nUSE_OP(logical_or);\nUSE_OP(logical_xor);\nUSE_OP(logical_not);\n") +file(APPEND ${pybind_file} "USE_OP_ITSELF(logical_and);\nUSE_OP_ITSELF(logical_or);\nUSE_OP_ITSELF(logical_xor);\nUSE_OP_ITSELF(logical_not);\n") file(APPEND ${pybind_file} "USE_OP(bitwise_and);\nUSE_OP(bitwise_or);\nUSE_OP(bitwise_xor);\nUSE_OP(bitwise_not);\n") diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc index a4262d40543..4d11cb5ff74 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cc +++ b/paddle/fluid/operators/controlflow/logical_op.cc @@ -9,11 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/controlflow/logical_op.h" #include #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" namespace paddle { namespace operators { @@ -145,15 +145,7 @@ class BinaryLogicalOp : public LogicalOp { ::paddle::framework::EmptyGradOpMaker); REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$"); -REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU, - paddle::operators::LogicalAndFunctor); REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$"); -REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU, - paddle::operators::LogicalOrFunctor); REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); -REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, - paddle::operators::LogicalNotFunctor); REGISTER_BINARY_LOGICAL_OP(logical_xor, "$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$"); -REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, - paddle::operators::LogicalXorFunctor); diff --git a/paddle/fluid/operators/controlflow/logical_op.cu b/paddle/fluid/operators/controlflow/logical_op.cu deleted file mode 100644 index d88658607ed..00000000000 --- a/paddle/fluid/operators/controlflow/logical_op.cu +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/controlflow/logical_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" - -namespace paddle { -namespace operators { - -template -class BinaryLogicalOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using InT = typename Functor::ELEMENT_TYPE; - using OutT = bool; - - auto functor = Functor(); - std::vector ins; - std::vector outs; - const auto& cuda_ctx = - ctx.template device_context(); - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - - if (ins.size() == 1) { - paddle::operators::LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, functor); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, functor); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#define REGISTER_LOGICAL_CUDA_KERNEL(op_name, func) \ - REGISTER_OP_CUDA_KERNEL( \ - op_name, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>); - -REGISTER_LOGICAL_CUDA_KERNEL(logical_or, LogicalOrFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_and, LogicalAndFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, LogicalXorFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_not, LogicalNotFunctor) -#undef REGISTER_LOGICAL_CUDA_KERNEL diff --git a/paddle/fluid/operators/controlflow/logical_op.h b/paddle/fluid/operators/controlflow/logical_op.h deleted file mode 100644 index 15cd643a858..00000000000 --- a/paddle/fluid/operators/controlflow/logical_op.h +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ - template \ - struct func_name { \ - using ELEMENT_TYPE = T; \ - HOSTDEVICE bool operator()(const T a, const T b) const { \ - return static_cast(a) op static_cast(b); \ - } \ - }; - -LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||) -LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&) -LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^) -#undef LOGICAL_BINARY_FUNCTOR - -template -struct LogicalNotFunctor { - using ELEMENT_TYPE = T; - HOSTDEVICE bool operator()(const T a) const { return !a; } -}; - -template -class BinaryLogicalOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEMENT_TYPE; - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Output("Out"); - Functor binary_func; - ElementwiseComputeEx(context, x, y, -1, - binary_func, out); - } -}; - -template -class UnaryLogicalOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEMENT_TYPE; - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - Functor unary_func; - platform::Transform trans; - trans(context.template device_context(), x->data(), - x->data() + x->numel(), - out->mutable_data(context.GetPlace()), unary_func); - } -}; - -} // namespace operators -} // namespace paddle - -#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); - -#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc index 02f95254035..c3d7df8d027 100644 --- a/paddle/fluid/operators/controlflow/logical_op_npu.cc 
+++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/logical_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/phi/kernels/cpu/logical_kernel.cc b/paddle/phi/kernels/cpu/logical_kernel.cc new file mode 100644 index 00000000000..3d179e1e75f --- /dev/null +++ b/paddle/phi/kernels/cpu/logical_kernel.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/logical_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/funcs/logical_functor.h" + +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/platform/transform.h" + +namespace phi { + +#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ + template \ + void Logical##type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + funcs::Logical##type##Functor binary_func; \ + ElementwiseCompute, T, bool>( \ + dev_ctx, x, y, -1, binary_func, out); \ + } + +DEFINE_LOGICAL_BINARY_KERNEL(And) +DEFINE_LOGICAL_BINARY_KERNEL(Or) +DEFINE_LOGICAL_BINARY_KERNEL(Xor) +#undef DEFINE_LOGICAL_BINARY_KERNEL + +template +void LogicalNotKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + auto* out_ptr = dev_ctx.template Alloc(out); + funcs::LogicalNotFunctor unary_func; + + paddle::platform::Transform trans; + trans(dev_ctx, x.data(), x.data() + x.numel(), out_ptr, unary_func); +} + +} // namespace phi + +#define REGISTER_LOGICAL_CPU_KERNEL(logical_and, func_type) \ + PD_REGISTER_KERNEL(logical_and, \ + CPU, \ + ALL_LAYOUT, \ + phi::Logical##func_type##Kernel, \ + float, \ + double, \ + bool, \ + int64_t, \ + int, \ + int8_t, \ + int16_t) {} + +REGISTER_LOGICAL_CPU_KERNEL(logical_and, And) +REGISTER_LOGICAL_CPU_KERNEL(logical_or, Or) +REGISTER_LOGICAL_CPU_KERNEL(logical_not, Not) +REGISTER_LOGICAL_CPU_KERNEL(logical_xor, Xor) diff --git a/paddle/phi/kernels/funcs/logical_functor.h b/paddle/phi/kernels/funcs/logical_functor.h new file mode 100644 index 00000000000..1ea7fc43e6b --- /dev/null +++ b/paddle/phi/kernels/funcs/logical_functor.h @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { +namespace funcs { + +#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ + template \ + struct func_name { \ + using ELEMENT_TYPE = T; \ + HOSTDEVICE bool operator()(const T a, const T b) const { \ + return static_cast(a) op static_cast(b); \ + } \ + }; + +LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||) +LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&) +LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^) +#undef LOGICAL_BINARY_FUNCTOR + +template +struct LogicalNotFunctor { + using ELEMENT_TYPE = T; + HOSTDEVICE bool operator()(const T a) const { return !a; } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/logical_kernel.cu b/paddle/phi/kernels/gpu/logical_kernel.cu new file mode 100644 index 00000000000..f32d4c77d40 --- /dev/null +++ b/paddle/phi/kernels/gpu/logical_kernel.cu @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/logical_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/logical_functor.h" +#include "paddle/phi/kernels/gpu/elementwise.h" + +namespace phi { + +#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ + template \ + void Logical##type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + using InT = typename funcs::Logical##type##Functor::ELEMENT_TYPE; \ + using OutT = bool; \ + dev_ctx.template Alloc(out); \ + funcs::Logical##type##Functor binary_func; \ + std::vector ins = {&x, &y}; \ + std::vector outs = {out}; \ + funcs::BroadcastKernel( \ + dev_ctx, ins, &outs, -1, binary_func); \ + } + +DEFINE_LOGICAL_BINARY_KERNEL(And) +DEFINE_LOGICAL_BINARY_KERNEL(Or) +DEFINE_LOGICAL_BINARY_KERNEL(Xor) +#undef DEFINE_LOGICAL_BINARY_KERNEL + +template +void LogicalNotKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + using InT = typename funcs::LogicalNotFunctor::ELEMENT_TYPE; + using OutT = bool; + + dev_ctx.template Alloc(out); + funcs::LogicalNotFunctor unary_func; + std::vector ins = {&x}; + std::vector outs = {out}; + funcs::BroadcastKernel( + dev_ctx, ins, &outs, -1, unary_func); +} + +} // namespace phi + +#define REGISTER_LOGICAL_CUDA_KERNEL(logical_and, func_type) \ + PD_REGISTER_KERNEL(logical_and, \ + GPU, \ + ALL_LAYOUT, \ + phi::Logical##func_type##Kernel, \ + float, \ + double, \ + bool, \ + int64_t, \ + int, \ + int8_t, \ + int16_t) {} + +REGISTER_LOGICAL_CUDA_KERNEL(logical_and, And) +REGISTER_LOGICAL_CUDA_KERNEL(logical_or, Or) +REGISTER_LOGICAL_CUDA_KERNEL(logical_not, Not) +REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, Xor) diff --git a/paddle/phi/kernels/logical_kernel.h b/paddle/phi/kernels/logical_kernel.h new file mode 100644 index 00000000000..3ccc03a5b59 --- /dev/null +++ b/paddle/phi/kernels/logical_kernel.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +#define DECLEAR_LOGICAL_BINARY_KERNEL(type) \ + template \ + void Logical##type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out); + +DECLEAR_LOGICAL_BINARY_KERNEL(And) +DECLEAR_LOGICAL_BINARY_KERNEL(Or) +DECLEAR_LOGICAL_BINARY_KERNEL(Xor) +#undef DECLEAR_LOGICAL_BINARY_KERNEL + +template +void LogicalNotKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out); + +} // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_diff_op.py b/python/paddle/fluid/tests/unittests/test_diff_op.py index 345dad54132..1ae780f488d 100644 --- a/python/paddle/fluid/tests/unittests/test_diff_op.py +++ b/python/paddle/fluid/tests/unittests/test_diff_op.py @@ -55,7 +55,7 @@ class TestDiffOp(unittest.TestCase): def test_dygraph(self): for place in self.places: - paddle.disable_static(place) + paddle.disable_static() x = paddle.to_tensor(self.input, place=place) if self.prepend is not None: self.prepend = paddle.to_tensor(self.prepend, place=place) -- GitLab From e8d4558366d1dbf81f341eac5bbdb712eeb1ba0d Mon Sep 17 00:00:00 2001 From: zyfncg Date: Tue, 1 Mar 2022 11:13:32 +0800 Subject: [PATCH 010/272] [PHI] Support Multi Input and Output for InferShape (#39870) * add multi input for infer_shape * support multi output for infershape * fix split bug * fix bug of concat * support vector in infrt * fix bug --- paddle/fluid/framework/infershape_utils.cc | 69 +++++++++++------- paddle/fluid/operators/concat_op.cc | 44 +++--------- paddle/fluid/operators/split_op.cc | 55 +++------------ paddle/infrt/host_context/value.h | 2 +- paddle/phi/api/lib/api_custom_impl.cc | 6 +- paddle/phi/core/infermeta_utils.cc | 16 ++--- paddle/phi/core/infermeta_utils.h | 15 ++-- paddle/phi/infermeta/multiary.cc | 23 ++++-- paddle/phi/infermeta/multiary.h | 2 +- paddle/phi/infermeta/unary.cc | 82 ++++++++++++---------- paddle/phi/infermeta/unary.h | 2 +- paddle/phi/kernels/concat_kernel.h | 5 +- paddle/phi/kernels/cpu/concat_kernel.cc | 4 +- paddle/phi/kernels/cpu/split_kernel.cc | 14 ---- paddle/phi/kernels/gpu/split_kernel.cu | 14 ---- paddle/phi/kernels/split_kernel.h | 12 ++-- python/paddle/utils/code_gen/api_base.py | 19 ++++- 17 files changed, 175 insertions(+), 209 deletions(-) diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index e14b91d935d..d9287b9a624 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -308,22 +308,25 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, // TODO(chenweihang): support multiple inputs and outputs later phi::InferMetaContext infer_mete_context; for (auto& in_name : input_names) { - if (ctx->HasInput(in_name)) { - infer_meta_context.EmplaceBackInput(std::make_shared( - ctx->GetInputVarPtrs(in_name)[0], ctx->IsRuntime())); + if (ctx->HasInputs(in_name)) { + auto input_var = ctx->GetInputVarPtrs(in_name); + if (input_var.size() == 1) { + infer_meta_context.EmplaceBackInput( + std::make_shared(input_var[0], ctx->IsRuntime())); + } else { + paddle::SmallVector> inputs; + inputs.reserve(input_var.size()); + for (const auto& in : input_var) { + inputs.push_back( + std::make_shared(in, ctx->IsRuntime())); + } + infer_meta_context.EmplaceBackInputs(std::move(inputs)); + } } else { infer_meta_context.EmplaceBackInput({nullptr}); } } - for (auto& out_name : output_names) { - if (ctx->HasOutput(out_name)) { - 
infer_meta_context.EmplaceBackOutput(std::make_shared( - ctx->GetOutputVarPtrs(out_name)[0], ctx->IsRuntime())); - } else { - infer_meta_context.EmplaceBackOutput({nullptr}); - } - } auto attr_reader = ctx->Attrs(); for (size_t i = 0; i < attr_names.size(); ++i) { auto attr_name = attr_names[i]; @@ -348,13 +351,13 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } } else { // If is not in runtime, we will set default value(-1) for ScalarArray - int64_t num_ele = 0; std::vector vars; vars.reserve(infershape_inputs.size()); - for (size_t i = 0; i < infershape_inputs.size(); i++) { + for (size_t i = 0; i < infershape_inputs.size(); ++i) { vars.push_back(BOOST_GET_CONST(VarDesc*, infershape_inputs[i])); } + int64_t num_ele = 0; if (vars.size() == 1) { num_ele = 1; const auto& tensor_dims = vars[0]->GetShape(); @@ -362,16 +365,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, num_ele *= tensor_dims[i]; } } else { - for (auto& var : vars) { - const auto& tensor_dims = var->GetShape(); - PADDLE_ENFORCE_EQ(tensor_dims.size(), 1, - platform::errors::InvalidArgument( - "The shape is constructed by multi-tensor, " - "every tensor's dims should be 1. But your " - "shape has tensor that dims is %s.", - tensor_dims.size())); - num_ele += tensor_dims[0]; - } + num_ele = vars.size(); } phi::ScalarArray tensor_attr(std::vector(num_ele, -1)); tensor_attr.SetFromTensor(true); @@ -383,10 +377,14 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr(std::move( phi::ScalarArray(BOOST_GET_CONST(std::vector, attr)))); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(int))) { + infer_meta_context.EmplaceBackAttr( + phi::ScalarArray({BOOST_GET_CONST(int, attr)})); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` to ScalarArray when " - "construct KernelContext.", + "construct InferMetaContext.", attr_name)); } } @@ -414,7 +412,6 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } } else if (ctx->HasInput(attr_name)) { const auto& infershape_input = ctx->GetInputVarPtrs(attr_name); - if (infershape_input.size() == 1) { if (ctx->IsRuntime()) { Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]); @@ -490,6 +487,28 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, "Unsupported attribute type is received when call " "InferShapeFunctor.")); } + } else { + // do nothing + } + } + + for (auto& out_name : output_names) { + if (ctx->HasOutputs(out_name)) { + auto output_var = ctx->GetOutputVarPtrs(out_name); + if (output_var.size() == 1) { + infer_meta_context.EmplaceBackOutput(std::make_shared( + output_var[0], ctx->IsRuntime())); + } else { + paddle::SmallVector> outputs; + outputs.reserve(output_var.size()); + for (const auto& out : output_var) { + outputs.emplace_back( + std::make_shared(out, ctx->IsRuntime())); + } + infer_meta_context.EmplaceBackOutputs(std::move(outputs)); + } + } else { + infer_meta_context.EmplaceBackOutput({nullptr}); } } diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 55de4087f57..1da7798ea26 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -18,7 +18,9 @@ limitations under the License. 
*/ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" #ifdef PADDLE_WITH_MKLDNN @@ -33,41 +35,6 @@ class ConcatOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "Concat"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Concat"); - - auto inputs_dims = ctx->GetInputsDim("X"); - - const size_t inputs_num = inputs_dims.size(); - PADDLE_ENFORCE_GT( - inputs_num, static_cast(0), - platform::errors::InvalidArgument( - "The number of input tensors in concat op should > 0. But " - "received inputs' length is 0.")); - if (inputs_num == 1) { - VLOG(3) << "Warning: concat op have only one input, may waste memory"; - } - - if (ctx->HasInput("AxisTensor")) { - auto out_dims = - phi::make_ddim(std::vector(inputs_dims[0].size(), -1)); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } else { - size_t axis = - ComputeAxis(static_cast(ctx->Attrs().Get("axis")), - static_cast(inputs_dims[0].size())); - framework::DDim out_dims = - phi::funcs::ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims, axis); - if (out_dims[axis] < 0) { - out_dims[axis] = -1; - } - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -237,9 +204,14 @@ class ConcatDoubleGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DELCARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor, + PT_INFER_META(phi::ConcatInferMeta)); + REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker, ops::ConcatGradOpMaker, - ops::ConcatGradOpMaker); + ops::ConcatGradOpMaker, + ConcatInferShapeFunctor); REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad, ops::ConcatDoubleGradOpMaker, ops::ConcatDoubleGradOpMaker, diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index a8f05d94563..6678320f9ff 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -15,6 +15,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/split_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { using framework::Tensor; @@ -23,52 +26,6 @@ class SplitOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of SplitOp should not be null.")); - PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL, - platform::errors::InvalidArgument( - "Outputs(Out) of SplitOp should not be empty.")); - auto in_dims = ctx->GetInputDim("X"); - auto outs_names = ctx->Outputs("Out"); - size_t axis = static_cast(ctx->Attrs().Get("axis")); - size_t num = static_cast(ctx->Attrs().Get("num")); - std::vector sections = static_cast>( - ctx->Attrs().Get>("sections")); - const size_t outs_number = outs_names.size(); - - if (sections.size() > 0) { - PADDLE_ENFORCE_EQ( - sections.size(), outs_number, - platform::errors::InvalidArgument("tensor split sections size " - "should be equal to output size.")); - } - - if (ctx->HasInput("AxisTensor")) { - auto out_dims = phi::make_ddim(std::vector(in_dims.size(), -1)); - std::vector outs_dims(outs_number, out_dims); - ctx->SetOutputsDim("Out", outs_dims); - for (size_t i = 0; i < outs_number; ++i) { - ctx->ShareLoD("X", "Out", 0, i); - } - return; - } - - bool each_section_is_known = - (sections.size() > 0 && !ctx->HasInputs("SectionsTensorList")); - - auto outs_dims = UpdateOutsDims(ctx->IsRuntime(), each_section_is_known, - in_dims, num, sections, axis, outs_number); - ctx->SetOutputsDim("Out", outs_dims); - if (axis != 0) { - // Only pass LoD when not spliting along the first dim. 
- for (size_t i = 0; i < outs_number; ++i) { - ctx->ShareLoD("X", "Out", 0, i); - } - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -168,6 +125,10 @@ Example: namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(split, SplitInferShapeFunctor, + PT_INFER_META(phi::SplitInferMeta)); + REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker, - ops::SplitGradMaker); + ops::SplitGradMaker, + SplitInferShapeFunctor); diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index eb9a2092657..7e7d77d3af7 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -73,7 +73,7 @@ using ValueVariantType = std::vector, paddle::experimental::ScalarBase, paddle::experimental::ScalarArrayBase, - std::vector, + std::vector, phi::MetaConfig, paddle::experimental::Backend, paddle::experimental::DataLayout, diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index c7400b93fcd..19b113838ea 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -94,12 +94,16 @@ std::vector split_impl(const Tensor& x, std::vector out; auto dense_outs = SetKernelOutput(out_number, kernel_backend, &out); std::vector meta_outs; + meta_outs.reserve(out_number); + std::vector meta_out_ptrs; + meta_out_ptrs.reserve(out_number); for (size_t i = 0; i < out_number; ++i) { meta_outs.push_back(dense_outs[i]); + meta_out_ptrs.push_back(&meta_outs.back()); } phi::SplitInferMeta( - MakeMetaTensor(*dense_x), num_or_sections, axis, &meta_outs); + MakeMetaTensor(*dense_x), num_or_sections, axis, meta_out_ptrs); using kernel_signature = void (*)(const platform::DeviceContext&, const phi::DenseTensor&, diff --git a/paddle/phi/core/infermeta_utils.cc b/paddle/phi/core/infermeta_utils.cc index f3dd056911e..671ba2ec7dc 100644 --- a/paddle/phi/core/infermeta_utils.cc +++ b/paddle/phi/core/infermeta_utils.cc @@ -75,13 +75,13 @@ paddle::optional InferMetaContext::OptionalInputAt( : paddle::optional{paddle::none}; } -std::vector InferMetaContext::InputsBetween(size_t start, - size_t end) const { - std::vector result; +std::vector InferMetaContext::InputsBetween(size_t start, + size_t end) const { + std::vector result; result.reserve(end - start); for (size_t i = start; i < end; ++i) { - result.emplace_back(*inputs_.at(i)); + result.push_back(inputs_.at(i).get()); } return result; @@ -91,12 +91,12 @@ MetaTensor* InferMetaContext::MutableOutputAt(size_t idx) { return outputs_.at(idx).get(); } -std::vector InferMetaContext::MutableOutputBetween(size_t start, - size_t end) { - std::vector result; +std::vector InferMetaContext::MutableOutputBetween(size_t start, + size_t end) { + std::vector result; result.reserve(end - start); for (size_t i = start; i < end; ++i) { - result.emplace_back(*outputs_.at(i)); + result.emplace_back(outputs_.at(i).get()); } return result; } diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index 203dbb26984..a5775db7438 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -50,13 +50,13 @@ class InferMetaContext { const std::pair& OutputRangeAt(size_t idx) const; const MetaConfig& GetMetaConfig() const; - const MetaTensor& InputAt(size_t idx) const; + const MetaTensor& InputAt(size_t idx) const; paddle::optional OptionalInputAt(size_t idx) const; + std::vector InputsBetween(size_t start, size_t end) const; - 
std::vector InputsBetween(size_t start, size_t end) const; MetaTensor* MutableOutputAt(size_t idx); - std::vector MutableOutputBetween(size_t start, size_t end); + std::vector MutableOutputBetween(size_t start, size_t end); template AttrType AttrAt(size_t idx) { @@ -157,7 +157,7 @@ struct InferMetaFnImpl { }; template - struct InferMetaFnCallHelper&, Tail...> { + struct InferMetaFnCallHelper&, Tail...> { template static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) { static_assert(attr_idx == 0, @@ -165,7 +165,7 @@ struct InferMetaFnImpl { static_assert(out_idx == 0, "InferMeta's Input should appear before Outputs."); const std::pair range = ctx->InputRangeAt(in_idx); - std::vector arg = + std::vector arg = ctx->InputsBetween(range.first, range.second); InferMetaFnCallHelper< Tail...>::template Call(ctx, @@ -210,13 +210,12 @@ struct InferMetaFnImpl { }; template - struct InferMetaFnCallHelper*, Tail...> { + struct InferMetaFnCallHelper, Tail...> { template static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) { const std::pair range = ctx->OutputRangeAt(out_idx); - std::vector tmp = + std::vector arg = ctx->MutableOutputBetween(range.first, range.second); - std::vector* arg = &tmp; InferMetaFnCallHelper< Tail...>::template Call(ctx, pargs..., diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 7a0db3d5c17..8857c2cf424 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -84,7 +84,7 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } -void ConcatInferMeta(const std::vector& x, +void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, MetaConfig config) { @@ -93,10 +93,19 @@ void ConcatInferMeta(const std::vector& x, phi::errors::InvalidArgument( "The size of input meta vector should be greater" "than 0.")); + if (axis_scalar.FromTensor()) { + auto out_dims = + phi::make_ddim(std::vector(x.at(0)->dims().size(), -1)); + out->set_dims(out_dims); + out->set_dtype(x.at(0)->dtype()); + out->set_layout(x.at(0)->layout()); + out->share_lod(*x.at(0)); + return; + } int axis = axis_scalar.to(); // 1. calculate axis - int rank = x.at(0).dims().size(); + int rank = x.at(0)->dims().size(); PADDLE_ENFORCE_EQ( axis >= -rank && axis < rank, true, @@ -111,15 +120,17 @@ void ConcatInferMeta(const std::vector& x, // 2. 
calculate out dims std::vector x_dims; - for (auto& x_t : x) { - x_dims.push_back(x_t.dims()); + x_dims.reserve(x.size()); + for (const auto* x_t : x) { + x_dims.emplace_back(x_t->dims()); } phi::DDim out_dim = phi::funcs::ComputeAndCheckShape(config.is_runtime, x_dims, axis); out->set_dims(out_dim); - out->set_dtype(x.at(0).dtype()); - out->set_layout(x.at(0).layout()); + out->set_dtype(x.at(0)->dtype()); + out->set_layout(x.at(0)->layout()); + out->share_lod(*x.at(0)); } } // namespace phi diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index a5fb2a4cbdd..473845c6e40 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -25,7 +25,7 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); -void ConcatInferMeta(const std::vector& x, +void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, MetaConfig config = MetaConfig()); diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 49fd0a343a4..4696187bd23 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -459,8 +459,19 @@ void TransferLayoutInferMeta(const MetaTensor& x, void SplitInferMeta(const MetaTensor& x, const ScalarArray& num_or_sections, const Scalar& axis, - std::vector* out, + std::vector out, MetaConfig config) { + if (!config.is_runtime) { + if (axis.FromTensor() || num_or_sections.FromTensor()) { + auto out_dims = phi::make_ddim(std::vector(x.dims().size(), -1)); + for (auto* item : out) { + item->set_dims(out_dims); + item->share_lod(x); + } + return; + } + } + int axis_value = axis.to(); int rank = x.dims().size(); PADDLE_ENFORCE_EQ( @@ -475,27 +486,34 @@ void SplitInferMeta(const MetaTensor& x, axis_value = axis_value + rank; } + std::vector out_dims(out.size(), x.dims()); + auto input_axis_dim = x.dims().at(axis_value); auto num_or_sections_data = num_or_sections.GetData(); - // step1: get formated sections - std::vector sections; // num_or_sections is a number if (num_or_sections_data.size() == 1) { - int num = num_or_sections_data.at(0); + if (config.is_runtime || input_axis_dim > 0) { + int num = num_or_sections_data.at(0); + PADDLE_ENFORCE_EQ( + input_axis_dim % num, + 0, + phi::errors::InvalidArgument( + "The input's size along the split dimension " + "must be evenly divisible by Attr(num_or_sections). " + "But received Attr(num_or_sections) " + "= %d, input(X)'s shape = [%s], Attr(dim) = %d.", + num, + x.dims(), + axis_value)); - PADDLE_ENFORCE_EQ(input_axis_dim % num, - 0, - phi::errors::InvalidArgument( - "The input's size along the split dimension " - "must be evenly divisible by Attr(num_or_sections). 
" - "But received Attr(num_or_sections) " - "= %d, input(X)'s shape = [%s], Attr(dim) = %d.", - num, - x.dims(), - axis_value)); - - for (int i = 0; i < num; ++i) { - sections.push_back(input_axis_dim / num); + size_t out_axis_dim = input_axis_dim / num; + for (auto& out_dim : out_dims) { + out_dim[axis_value] = out_axis_dim; + } + } else { + for (auto& out_dim : out_dims) { + out_dim[axis_value] = -1; + } } } else { // num_or_sections is a sections @@ -503,10 +521,9 @@ void SplitInferMeta(const MetaTensor& x, int unknow_dim_idx = -1; int num_of_unknow = 0; int sum_of_section = 0; + std::vector sections = num_or_sections_data; for (size_t i = 0; i < num_or_sections_data.size(); ++i) { - sections.push_back(num_or_sections_data[i]); - if (num_or_sections_data[i] == unknow_dim_val) { num_of_unknow++; unknow_dim_idx = i; @@ -558,31 +575,22 @@ void SplitInferMeta(const MetaTensor& x, x.dims(), axis_value)); } - } - - // setp2: fill out dims - std::vector out_dims(sections.size(), x.dims()); - if (config.is_runtime || input_axis_dim > 0) { - for (size_t i = 0; i < sections.size(); ++i) { + for (size_t i = 0; i < out_dims.size(); ++i) { out_dims[i][axis_value] = sections[i]; } - } else { - for (size_t i = 0; i < sections.size(); ++i) { - out_dims[i][axis_value] = -1; - } } - for (size_t i = 0; i < sections.size(); ++i) { + for (size_t i = 0; i < out.size(); ++i) { if (axis_value != 0) { // Only pass LoD when not spliting along the first dim. - (*out)[i].set_dtype(x.dtype()); - (*out)[i].set_dims(out_dims[i]); - (*out)[i].set_layout(x.layout()); + out.at(i)->set_dtype(x.dtype()); + out.at(i)->set_dims(out_dims[i]); + out.at(i)->set_layout(x.layout()); } else { - (*out)[i].set_dtype(x.dtype()); - (*out)[i].set_dims(out_dims[i]); - (*out)[i].set_layout(x.layout()); - (*out)[i].share_lod(x); + out.at(i)->set_dtype(x.dtype()); + out.at(i)->set_dims(out_dims[i]); + out.at(i)->set_layout(x.layout()); + out.at(i)->share_lod(x); } } } diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 4fab1ec68ec..b3929b9d2b4 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -107,7 +107,7 @@ void TransferLayoutInferMeta(const MetaTensor& x, void SplitInferMeta(const MetaTensor& x_meta, const ScalarArray& num_or_sections, const Scalar& axis, - std::vector* out, + std::vector out, MetaConfig config = MetaConfig()); void UnbindInferMeta(const MetaTensor& x, diff --git a/paddle/phi/kernels/concat_kernel.h b/paddle/phi/kernels/concat_kernel.h index fbc4a86f5af..f1366788146 100644 --- a/paddle/phi/kernels/concat_kernel.h +++ b/paddle/phi/kernels/concat_kernel.h @@ -31,13 +31,16 @@ DenseTensor Concat(const Context& dev_ctx, const std::vector& x, const Scalar& axis) { std::vector meta_x; + meta_x.reserve(x.size()); + std::vector meta_x_ptr; for (const auto& t : x) { meta_x.emplace_back(t); + meta_x_ptr.push_back(&meta_x.back()); } auto dense_out = phi::Empty(dev_ctx); MetaTensor meta_out(&dense_out); - ConcatInferMeta(meta_x, axis.to(), &meta_out, /*is_runtime=*/true); + ConcatInferMeta(meta_x_ptr, axis.to(), &meta_out, /*is_runtime=*/true); ConcatKernel(dev_ctx, x, axis, &dense_out); return dense_out; } diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc index 18bb8837b10..5c4202837c4 100644 --- a/paddle/phi/kernels/cpu/concat_kernel.cc +++ b/paddle/phi/kernels/cpu/concat_kernel.cc @@ -37,6 +37,7 @@ void ConcatKernel(const Context& dev_ctx, axis = phi::funcs::ComputeAxis(axis, x[0].dims().size()); std::vector x_dims; + 
x_dims.reserve(x.size()); for (size_t i = 0; i < x.size(); ++i) { x_dims.push_back(x[i].dims()); } @@ -97,9 +98,10 @@ void ConcatKernel(const Context& dev_ctx, } } else { std::vector inputs; + inputs.reserve(x.size()); for (size_t j = 0; j < x.size(); ++j) { if (x[j].numel() > 0) { - inputs.push_back(x[j]); + inputs.emplace_back(x[j]); } else { continue; } diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc index 722681fb7bc..4acf9b02028 100644 --- a/paddle/phi/kernels/cpu/split_kernel.cc +++ b/paddle/phi/kernels/cpu/split_kernel.cc @@ -28,20 +28,6 @@ void SplitKernel(const Context& dev_ctx, const ScalarArray& num_or_sections, const Scalar& axis_scalar, std::vector outs) { - // need to infershape output - if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) { - std::vector out_metas; - for (size_t i = 0; i < outs.size(); ++i) { - out_metas.push_back(outs[i]); - } - - phi::SplitInferMeta(x, num_or_sections, axis_scalar, &out_metas, true); - - for (size_t i = 0; i < out_metas.size(); ++i) { - outs[i]->Resize(out_metas[i].dims()); - } - } - std::vector shape_refer; for (size_t j = 0; j < outs.size(); ++j) { dev_ctx.template Alloc(outs[j]); diff --git a/paddle/phi/kernels/gpu/split_kernel.cu b/paddle/phi/kernels/gpu/split_kernel.cu index a698b9e7161..d2473d5b0b1 100644 --- a/paddle/phi/kernels/gpu/split_kernel.cu +++ b/paddle/phi/kernels/gpu/split_kernel.cu @@ -27,20 +27,6 @@ void SplitKernel(const Context& dev_ctx, const ScalarArray& num_or_sections, const Scalar& axis_scalar, std::vector outs) { - // need to infershape output - if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) { - std::vector out_metas; - for (size_t i = 0; i < outs.size(); ++i) { - out_metas.push_back(outs[i]); - } - - phi::SplitInferMeta(x, num_or_sections, axis_scalar, &out_metas, true); - - for (size_t i = 0; i < out_metas.size(); ++i) { - outs[i]->Resize(out_metas[i].dims()); - } - } - std::vector shape_refer; for (size_t j = 0; j < outs.size(); ++j) { dev_ctx.template Alloc(outs[j]); diff --git a/paddle/phi/kernels/split_kernel.h b/paddle/phi/kernels/split_kernel.h index 1e730d809bc..840fe4366ce 100644 --- a/paddle/phi/kernels/split_kernel.h +++ b/paddle/phi/kernels/split_kernel.h @@ -43,18 +43,18 @@ std::vector Split(const Context& dev_ctx, } std::vector out_meta; + std::vector out_meta_ptr; out_meta.reserve(out_number); + out_meta_ptr.reserve(out_number); std::vector result; result.reserve(out_number); for (size_t i = 0; i < out_number; ++i) { - auto dense_out = phi::Empty(dev_ctx); - MetaTensor tmp_meta(&dense_out); - - result.push_back(dense_out); - out_meta.push_back(&result.back()); + result.emplace_back(phi::Empty(dev_ctx)); + out_meta.emplace_back(&result.back()); + out_meta_ptr.push_back(&out_meta.back()); } - SplitInferMeta(x, num_or_sections, axis, &out_meta); + SplitInferMeta(x, num_or_sections, axis, out_meta_ptr); std::vector outs; outs.reserve(out_meta.size()); diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 5fc9dfe3f64..cfd817c24c7 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -451,7 +451,20 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. 
param_code = "" for param in infer_meta_params: if param in input_names: - if param in self.optional_vars: + if self.inputs['input_info'][param] == "const Tensor&": + param_code = param_code + "MakeMetaTensor(*" + PREFIX_TENSOR_NAME + param + "), " + elif self.inputs['input_info'][ + param] == "const std::vector&": + meta_tensor_code = meta_tensor_code + f""" +{code_indent} auto {param}_meta_vec = MakeMetaTensor(*{PREFIX_TENSOR_NAME}{param}); +{code_indent} std::vector {param}_metas({param}_meta_vec.size()); +{code_indent} for (size_t i = 0; i < {param}_meta_vec.size(); ++i) {{ +{code_indent} {param}_metas[i] = &{param}_meta_vec[i]; +{code_indent} }} +""" + + param_code = param_code + param + "_metas, " + elif param in self.optional_vars: meta_tensor_code = meta_tensor_code + f""" {code_indent} paddle::optional {PREFIX_TENSOR_NAME}meta_ref_{param}(paddle::none); {code_indent} auto {PREFIX_TENSOR_NAME}meta_{param} = MakeMetaTensor({PREFIX_TENSOR_NAME}{param}); @@ -461,7 +474,9 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. param_code = param_code + f"{PREFIX_TENSOR_NAME}meta_ref_{param}, " else: - param_code = param_code + "MakeMetaTensor(*" + PREFIX_TENSOR_NAME + param + "), " + raise ValueError( + f"{self.api} : Param of infer_meta error : {self.inputs['input_info'][param]} type is not supported." + ) elif param in kernel_output_names: meta_tensor_code = meta_tensor_code + code_indent + " phi::MetaTensor " + param.replace( 'kernel_', PREFIX_META_TENSOR_NAME) + "(" + param + ");\n" -- GitLab From d49115946db8f9b0dc15986ee10b7209a702fa6e Mon Sep 17 00:00:00 2001 From: helen88 Date: Tue, 1 Mar 2022 11:21:22 +0800 Subject: [PATCH 011/272] optimize mergeadd for sparse_adam,*test=kunlun (#39966) * optimize mergeadd for sparse_adam,*test=kunlun * optimize mergeadd for sparse_adam,*test=kunlun * optimize mergeadd for sparse_adam, *test=kunlun --- cmake/external/xpu.cmake | 2 +- .../operators/math/selected_rows_functor.cc | 71 +++++++++---------- 2 files changed, 34 insertions(+), 39 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 415c0fe9bef..45a76fdc1f1 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220219") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220228") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index fcd5c06a6f3..5ac39953462 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/operators/mkldnn/axpy_handler.h" @@ -502,32 +503,29 @@ struct MergeAdd { out.mutable_value()->mutable_data( phi::make_ddim({static_cast(merge_rows.size()), input_width}), context.GetPlace()); - int r = - xpu::constant(context.x_context(), out.mutable_value()->data(), - merge_rows.size() * input_width, static_cast(0.f)); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU constant op return" - " wrong value[%d %s].", - r, XPUAPIErrorMsg[r])); std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { rows_to_id[merge_rows[i]] = i; } - auto* out_data = out.mutable_value()->data(); - auto* input_data = input.value().data(); + auto* y_data = out.mutable_value()->data(); + auto* x_data = input.value().data(); + int xm = input_rows.size(); + int ym = merge_rows.size(); int n = input_width; - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = rows_to_id[input_rows[i]]; - auto r = xpu::add(context.x_context(), &input_data[i * input_width], - &out_data[out_i * input_width], - &out_data[out_i * input_width], n); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API return wrong value[%d %s], ", r, - XPUAPIErrorMsg[r])); - } + + xpu::ctx_guard RAII_GUARD(context.x_context()); + int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm(xm); + int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm(ym); + memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(), + merge_rows.data(), ym * sizeof(int64_t)); + memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(), + input_rows.data(), xm * sizeof(int64_t)); + int r = + xpu::merge_dup_rows(context.x_context(), x_data, y_data, + x_rows_data, y_rows_data, xm, n, ym); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows"); } void operator()(const platform::XPUDeviceContext& context, @@ -582,15 +580,7 @@ struct MergeAdd { {static_cast(merged_row_set.size()), input_width}), context.GetPlace()); - int r = - xpu::constant(context.x_context(), out.mutable_value()->data(), - merge_rows.size() * input_width, static_cast(0.f)); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU constant op return" - " wrong value[%d %s].", - r, XPUAPIErrorMsg[r])); - - float* out_data = reinterpret_cast(out.mutable_value()->data()); + float* y_data = reinterpret_cast(out.mutable_value()->data()); std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { @@ -603,17 +593,22 @@ struct MergeAdd { } auto& input_rows = input->rows(); + auto* x_data = input->value().data(); + int xm = input_rows.size(); + int ym = merge_rows.size(); int n = input_width; - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = rows_to_id[input_rows[i]]; - auto r = xpu::add( - context.x_context(), input->value().data() + i * input_width, - &out_data[out_i * input_width], &out_data[out_i * input_width], n); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API return wrong value[%d %s], ", r, - XPUAPIErrorMsg[r])); - } + + xpu::ctx_guard RAII_GUARD(context.x_context()); + int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm(xm); + int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm(ym); + memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(), + merge_rows.data(), ym * sizeof(int64_t)); + memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(), + 
input_rows.data(), xm * sizeof(int64_t)); + int r = + xpu::merge_dup_rows(context.x_context(), x_data, y_data, + x_rows_data, y_rows_data, xm, n, ym); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows"); } } }; -- GitLab From 08b43cce6d2d5e2f57a4317461eb26f88af9bd3c Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 1 Mar 2022 11:24:52 +0800 Subject: [PATCH 012/272] [Phi] Support kps backend and kernel registry (#39941) * support kps backend and compile * resolve conflict * fix kps backend trans * test in xpu2 device * remove dummy kernel --- cmake/generic.cmake | 1 + cmake/phi.cmake | 60 +++++++++++++++++++++---- paddle/fluid/framework/phi_utils.cc | 4 ++ paddle/phi/backends/gpu/gpu_context.h | 8 ++++ paddle/phi/backends/xpu/xpu_context.h | 8 ++++ paddle/phi/common/backend.h | 8 ++++ paddle/phi/core/compat/convert_utils.cc | 8 ++++ paddle/phi/tests/common/test_backend.cc | 4 ++ 8 files changed, 93 insertions(+), 8 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index f7c17bd7cfe..51ed537ce5d 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -667,6 +667,7 @@ function(xpu_library TARGET_NAME) else() xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS}) find_fluid_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) endif() if (xpu_library_DEPS) add_dependencies(${TARGET_NAME} ${xpu_library_DEPS}) diff --git a/cmake/phi.cmake b/cmake/phi.cmake index d9132b84455..f6e15758379 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -83,6 +83,8 @@ function(kernel_declare TARGET_LIST) file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") elseif (${kernel_path} MATCHES "./gpudnn\/") file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n") + elseif (${kernel_path} MATCHES "./kps\/") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n") else () # deal with device independent kernel, now we use CPU temporaary file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") @@ -97,6 +99,7 @@ function(kernel_library TARGET) set(gpu_srcs) set(xpu_srcs) set(gpudnn_srcs) + set(kps_srcs) set(selected_rows_srcs) # parse and save the deps kerenl targets set(all_srcs) @@ -128,6 +131,9 @@ function(kernel_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + endif() if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) endif() @@ -137,6 +143,15 @@ function(kernel_library TARGET) list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc) endif() endif() + if (WITH_XPU_KP) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + # Change XPU2 file suffix + # NOTE(chenweihang): If we can be sure that the *.kps suffix is no longer used, it can be copied directly to *.xpu + file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps) + file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps) + list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps) + endif() + endif() else() # TODO(chenweihang): impl compile by source later endif() @@ -150,6 +165,7 @@ function(kernel_library 
TARGET) list(APPEND all_srcs ${gpu_srcs}) list(APPEND all_srcs ${xpu_srcs}) list(APPEND all_srcs ${gpudnn_srcs}) + list(APPEND all_srcs ${kps_srcs}) foreach(src ${all_srcs}) file(READ ${src} target_content) string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) @@ -159,11 +175,11 @@ function(kernel_library TARGET) string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) endif() foreach(include_kernel ${include_kernels}) - if ("${kernel_library_SUB_DIR}" STREQUAL "") - string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel}) - else() - string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel}) - endif() + if ("${kernel_library_SUB_DIR}" STREQUAL "") + string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel}) + else() + string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel}) + endif() string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name}) list(APPEND kernel_deps ${kernel_name}) endforeach() @@ -176,11 +192,20 @@ function(kernel_library TARGET) list(LENGTH gpu_srcs gpu_srcs_len) list(LENGTH xpu_srcs xpu_srcs_len) list(LENGTH gpudnn_srcs gpudnn_srcs_len) + list(LENGTH kps_srcs kps_srcs_len) list(LENGTH selected_rows_srcs selected_rows_srcs_len) + # kernel source file level + # level 1: base device kernel + # - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs + # level 2: device-independent kernel + # - common_srcs + # level 3: Kernel implemented by reusing device-independent kernel + # - selected_rows_srcs + # Build Target according different src organization if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR - ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) AND + ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) AND (${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)) # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule. if (WITH_GPU) @@ -193,6 +218,11 @@ function(kernel_library TARGET) hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) endif() + elseif (WITH_XPU_KP) + if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) + xpu_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) + endif() else() if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) @@ -200,7 +230,7 @@ function(kernel_library TARGET) endif() endif() # If there are only specific device srcs, build target using this rule. 
- elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) + elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) if (WITH_GPU) if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) @@ -209,6 +239,10 @@ function(kernel_library TARGET) if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() + elseif (WITH_XPU_KP) + if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) + xpu_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + endif() else() if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) @@ -222,6 +256,9 @@ function(kernel_library TARGET) elseif (WITH_ROCM) hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + elseif (WITH_XPU_KP) + xpu_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) else() cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) @@ -232,6 +269,8 @@ function(kernel_library TARGET) nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + elseif (WITH_XPU_KP) + xpu_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) else() cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() @@ -240,6 +279,8 @@ function(kernel_library TARGET) nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + elseif (WITH_XPU_KP) + xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) else() cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() @@ -249,7 +290,7 @@ function(kernel_library TARGET) if (${target_build_flag} EQUAL 1) if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR - ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR + ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0) # append target into PHI_KERNELS property get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) @@ -275,6 +316,9 @@ function(kernel_library TARGET) if (${gpudnn_srcs_len} GREATER 0) kernel_declare(${gpudnn_srcs}) endif() + if (${kps_srcs_len} GREATER 0) + kernel_declare(${kps_srcs}) + endif() if (${selected_rows_srcs_len} GREATER 0) kernel_declare(${selected_rows_srcs}) endif() diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 355291beb60..1a39a87fb99 100644 --- 
a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -68,6 +68,8 @@ OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) { library_type = LibraryType::kMKLDNN; } else if (kernel_key.backend() == phi::Backend::GPUDNN) { library_type = LibraryType::kCUDNN; + } else if (kernel_key.backend() == phi::Backend::KPS) { + library_type = LibraryType::kKP; } else { // do nothing } @@ -82,6 +84,8 @@ phi::KernelKey TransOpKernelTypeToPhiKernelKey( backend = phi::Backend::MKLDNN; } else if (kernel_type.library_type_ == LibraryType::kCUDNN) { backend = phi::Backend::GPUDNN; + } else if (kernel_type.library_type_ == LibraryType::kKP) { + backend = phi::Backend::KPS; } else { // do } diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 603ce0817c4..b9d843982dc 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -227,4 +227,12 @@ class GPUContext : public DeviceContext { // must use different function name for cudnn kernel using GPUDNNContext = GPUContext; +// KPS (Kernel PrimitiveS API) needs to exist as a kind of backend, +// because we want to implement a KPS-based kernel and make it run +// on GPU and XPU at the same time, so we need KPSContext when registering +// KPS Kernel. Note: XPU and GPU cannot be compiled at the same time! +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +using KPSContext = GPUContext; +#endif + } // namespace phi diff --git a/paddle/phi/backends/xpu/xpu_context.h b/paddle/phi/backends/xpu/xpu_context.h index 3005d1707e6..b87489c567c 100644 --- a/paddle/phi/backends/xpu/xpu_context.h +++ b/paddle/phi/backends/xpu/xpu_context.h @@ -66,4 +66,12 @@ class XPUContext : public DeviceContext { std::unique_ptr impl_; }; +// KPS (Kernel PrimitiveS API) needs to exist as a kind of backend, +// because we want to implement a KPS-based kernel and make it run +// on GPU and XPU at the same time, so we need KPSContext when registering +// KPS Kernel. Note: XPU and GPU cannot be compiled at the same time! +#if PADDLE_WITH_XPU_KP +using KPSContext = XPUContext; +#endif + } // namespace phi diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 4b7bf65be39..a9e12f5d81e 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -52,6 +52,9 @@ enum class Backend : uint8_t { MKLDNN, GPUDNN, // cuDNN and hipDNN + // paddle kernel primitives backend + KPS, + // end of backend types NUM_BACKENDS, @@ -115,6 +118,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { case Backend::GPUDNN: os << "GPUDNN"; break; + case Backend::KPS: + os << "KPS"; + break; default: { size_t device_type_id_ = static_cast(backend) - static_cast(Backend::NUM_BACKENDS); @@ -147,6 +153,8 @@ inline Backend StringToBackend(const char* backend_cstr) { return Backend::MKLDNN; } else if (s == std::string("GPUDNN")) { return Backend::GPUDNN; + } else if (s == std::string("KPS")) { + return Backend::KPS; } else { return static_cast(static_cast(Backend::NUM_BACKENDS) + phi::GetOrRegisterGlobalDeviceTypeId(s)); diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 3b7a733ede9..b85db07bd9d 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -66,6 +66,14 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { case phi::Backend::XPU: return phi::XPUPlace( set_device_id ? 
phi::backends::xpu::GetXPUCurrentDeviceId() : 0); +#endif + case phi::Backend::KPS: +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + return phi::GPUPlace( + set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); +#elif defined(PADDLE_WITH_XPU_KP) + return phi::XPUPlace( + set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0); #endif default: { #ifdef PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/phi/tests/common/test_backend.cc b/paddle/phi/tests/common/test_backend.cc index fa4ffc84bf5..5d6862c368c 100644 --- a/paddle/phi/tests/common/test_backend.cc +++ b/paddle/phi/tests/common/test_backend.cc @@ -44,6 +44,9 @@ TEST(Backend, OStream) { oss << phi::Backend::GPUDNN; EXPECT_EQ(oss.str(), "GPUDNN"); oss.str(""); + oss << phi::Backend::KPS; + EXPECT_EQ(oss.str(), "KPS"); + oss.str(""); try { oss << phi::Backend::NUM_BACKENDS; } catch (const std::exception& exception) { @@ -61,6 +64,7 @@ TEST(Backend, StringToBackend) { EXPECT_EQ(phi::Backend::NPU, pexp::StringToBackend("NPU")); EXPECT_EQ(phi::Backend::MKLDNN, pexp::StringToBackend("MKLDNN")); EXPECT_EQ(phi::Backend::GPUDNN, pexp::StringToBackend("GPUDNN")); + EXPECT_EQ(phi::Backend::KPS, pexp::StringToBackend("KPS")); EXPECT_EQ(static_cast( static_cast(phi::Backend::NUM_BACKENDS) + 1), pexp::StringToBackend("CustomBackend")); -- GitLab From b34663876056740261a9f58cf3e5d90e9e49788f Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 1 Mar 2022 11:25:24 +0800 Subject: [PATCH 013/272] [phi] move uniform_random to phi (#39937) * move uniform_random to phi * fit selected_rows * replace mutable_data --- paddle/fluid/framework/operator.cc | 3 + paddle/fluid/operators/uniform_random_op.cc | 4 - paddle/fluid/operators/uniform_random_op.cu | 3 - .../phi/kernels/cpu/uniform_random_kernel.cc | 115 ++++++++ paddle/phi/kernels/funcs/aligned_vector.h | 75 ++++++ .../phi/kernels/funcs/distribution_helper.h | 249 ++++++++++++++++++ paddle/phi/kernels/funcs/index_impl.cu.h | 93 +++++++ .../phi/kernels/gpu/uniform_random_kernel.cu | 163 ++++++++++++ .../selected_rows/uniform_random_kernel.cc | 88 +++++++ paddle/phi/kernels/uniform_random_kernel.h | 66 +++++ paddle/phi/ops/compat/uniform_random_sig.cc | 159 +++++++++++ 11 files changed, 1011 insertions(+), 7 deletions(-) create mode 100644 paddle/phi/kernels/cpu/uniform_random_kernel.cc create mode 100644 paddle/phi/kernels/funcs/aligned_vector.h create mode 100644 paddle/phi/kernels/funcs/distribution_helper.h create mode 100644 paddle/phi/kernels/funcs/index_impl.cu.h create mode 100644 paddle/phi/kernels/gpu/uniform_random_kernel.cu create mode 100644 paddle/phi/kernels/selected_rows/uniform_random_kernel.cc create mode 100644 paddle/phi/kernels/uniform_random_kernel.h create mode 100644 paddle/phi/ops/compat/uniform_random_sig.cc diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index d33791f70c4..36208c41ed5 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2074,6 +2074,7 @@ void OperatorWithKernel::BuildPhiKernelContext( } pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i); } + VLOG(4) << "Done inputs"; for (size_t i = 0; i < output_names.size(); ++i) { auto it = ctx.outputs.find(output_names[i]); @@ -2118,6 +2119,7 @@ void OperatorWithKernel::BuildPhiKernelContext( pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } + VLOG(4) << "Done outputs"; for (size_t i = 0; i < attr_names.size(); ++i) { if (attr_defs[i].type_index == 
std::type_index(typeid(phi::ScalarArray))) { @@ -2226,6 +2228,7 @@ void OperatorWithKernel::BuildPhiKernelContext( } } } + VLOG(4) << "Done attributes"; } } // namespace framework diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 353d653f481..1c22e60fa87 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -281,10 +281,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::operators::UniformRandomOpVarTypeInference); -REGISTER_OP_CPU_KERNEL( - uniform_random, paddle::operators::CPUUniformRandomKernel, - paddle::operators::CPUUniformRandomKernel, - paddle::operators::CPUUniformRandomKernel); REGISTER_OP_CPU_KERNEL( uniform_random_batch_size_like, paddle::operators::CPUUniformRandomKernel, diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index fb38a6aded4..2ceb8a68d86 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -58,9 +58,6 @@ class GPUUniformRandomKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(uniform_random, - paddle::operators::GPUUniformRandomKernel, - paddle::operators::GPUUniformRandomKernel); REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like, paddle::operators::GPUUniformRandomKernel, paddle::operators::GPUUniformRandomKernel); diff --git a/paddle/phi/kernels/cpu/uniform_random_kernel.cc b/paddle/phi/kernels/cpu/uniform_random_kernel.cc new file mode 100644 index 00000000000..8ec1d9683e1 --- /dev/null +++ b/paddle/phi/kernels/cpu/uniform_random_kernel.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/uniform_random_kernel.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +inline void UniformRealDistribution(T *data, + const int64_t &size, + const float &min, + const float &max, + std::shared_ptr engine) { + std::uniform_real_distribution dist(static_cast(min), + static_cast(max)); + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(*engine); + } +} + +template <> +inline void UniformRealDistribution(phi::dtype::bfloat16 *data, + const int64_t &size, + const float &min, + const float &max, + std::shared_ptr engine) { + std::uniform_real_distribution dist(min, max); + for (int64_t i = 0; i < size; ++i) { + data[i] = static_cast(dist(*engine)); + } +} + +template +void UniformRandomRawKernel(const Context &dev_ctx, + const ScalarArray &shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor *out) { + out->Resize(phi::make_ddim(shape.GetData())); + VLOG(4) << out->dims(); + T *data = dev_ctx.template Alloc(out); + auto size = out->numel(); + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = dev_ctx.GetGenerator()->GetCPUEngine(); + } + UniformRealDistribution(data, size, min, max, engine); + if (diag_num > 0) { + PADDLE_ENFORCE_GT( + size, + (diag_num - 1) * (diag_step + 1), + phi::errors::InvalidArgument( + "ShapeInvalid: the diagonal's elements is equal (num-1) " + "* (step-1) with num %d, step %d," + "It should be smaller than %d, but received %d", + diag_num, + diag_step, + (diag_num - 1) * (diag_step + 1), + size)); + for (int64_t i = 0; i < diag_num; ++i) { + int64_t pos = i * diag_step + i; + data[pos] = diag_val; + } + } +} + +template +void UniformRandomKernel(const Context &dev_ctx, + const ScalarArray &shape, + DataType dtype, + float min, + float max, + int seed, + DenseTensor *out) { + UniformRandomRawKernel( + dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(uniform_random_raw, + CPU, + ALL_LAYOUT, + phi::UniformRandomRawKernel, + float, + double, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(uniform_random, + CPU, + ALL_LAYOUT, + phi::UniformRandomKernel, + float, + double, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h new file mode 100644 index 00000000000..9382b03cf93 --- /dev/null +++ b/paddle/phi/kernels/funcs/aligned_vector.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.1 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.1 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/hostdevice.h" + +namespace phi { + +// Aligned vector generates vectorized load/store on CUDA. 
+template +struct alignas(sizeof(T) * Size) AlignedVector { + T val[Size]; + + HOSTDEVICE inline const T& operator[](int i) const { return val[i]; } + HOSTDEVICE inline T& operator[](int i) { return val[i]; } +}; + +template +HOSTDEVICE inline void Load(const T* addr, AlignedVector* vec) { + const AlignedVector* addr_vec = + reinterpret_cast*>(addr); + *vec = *addr_vec; +} + +template +HOSTDEVICE inline void Store(const AlignedVector& vec, T* addr) { + AlignedVector* addr_vec = + reinterpret_cast*>(addr); + *addr_vec = vec; +} + +/* +* Only the address of input data is the multiplier of 1,2,4, vectorized load +* with corresponding multiplier-value is possible. Moreover, the maximum length +* of vectorized load is 128 bits once. Hence, valid length of vectorized load +* shall be determined under both former constraints. +*/ +template +int GetVectorizedSize(const T* pointer) { + constexpr int max_load_bits = 128; + int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); + uint64_t address = reinterpret_cast(pointer); + constexpr int vec8 = std::alignment_of>::value; // NOLINT + constexpr int vec4 = std::alignment_of>::value; // NOLINT + constexpr int vec2 = std::alignment_of>::value; // NOLINT + if (address % vec8 == 0) { + /* + * Currently, decide to deal with no more than 4 data once while adopting + * vectorization load/store, if performance test shows that dealing with + * 8 data once in vectorization load/store does get optimized, return code + * below can be changed into " return std::min(8, valid_vec_size); " . + */ + return std::min(4, valid_vec_size); + } else if (address % vec4 == 0) { + return std::min(4, valid_vec_size); + } else if (address % vec2 == 0) { + return std::min(2, valid_vec_size); + } else { + return 1; + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h new file mode 100644 index 00000000000..49e1c82482c --- /dev/null +++ b/paddle/phi/kernels/funcs/distribution_helper.h @@ -0,0 +1,249 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef __NVCC__ +#include +#endif +#ifdef __HIPCC__ +#include +#endif + +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/generator.h" + +#include "paddle/phi/kernels/funcs/index_impl.cu.h" + +#if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/phi/kernels/primitive/kernel_primitives.h" +#endif + +#if !defined(_WIN32) +#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) +#else +// there is no equivalent intrinsics in msvc. 
+#define UNLIKELY(condition) (condition) +#endif + +namespace phi { +namespace distribution { + +/********************* Transformation Function **********************/ +template +struct exponential_transform { + explicit exponential_transform(T lambda) : lambda_(lambda) {} + + HOSTDEVICE inline T operator()(T val) const { +#if defined(__NVCC__) || defined(__HIPCC__) + if (std::is_same::value) { + return static_cast(-1.0) / lambda_ * log(val); + } else { + return static_cast(-1.0) / lambda_ * __logf(val); + } +#else + return static_cast(-1.0) / lambda_ * std::log(static_cast(1.0) - val); +#endif + } + + private: + T lambda_; +}; + +template +struct uniform_transform { + explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {} + + HOSTDEVICE inline T operator()(T val) const { + if (UNLIKELY(val == static_cast(1.0))) { + return min_; + } else { + return val * range_ + min_; + } + } + + private: + T range_; + T min_; +}; + +template +struct normal_transform { + explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {} + + HOSTDEVICE inline T operator()(T val) const { return val * std_ + mean_; } + + private: + T mean_; + T std_; +}; + +#if defined(__NVCC__) || defined(__HIPCC__) + +namespace kps = phi::kps; + +/*********************** Distribution Function *************************/ +template +struct uniform_distribution; + +template +struct normal_distribution; + +#if defined(__NVCC__) +template <> +struct uniform_distribution { + __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { + return curand_uniform4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline double2 operator()( + curandStatePhilox4_32_10_t *state) const { + return curand_uniform2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +template <> +struct normal_distribution { + __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { + return curand_normal4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct normal_distribution { + __device__ inline double2 operator()( + curandStatePhilox4_32_10_t *state) const { + return curand_normal2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +#else +template <> +struct uniform_distribution { + __device__ inline float4 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_uniform4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline double2 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_uniform2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +template <> +struct normal_distribution { + __device__ inline float4 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_normal4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct normal_distribution { + __device__ inline double2 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_normal2_double(state); + } + static constexpr int kReturnsCount = 2; +}; +#endif + +/******** Launch GPU function of distribution and transformation *********/ +template +__global__ void DistributionKernel(size_t size, + uint64_t seed, + uint64_t offset, + DistOp dist, + TransformOp trans, + T *out_data, + size_t stride) { + size_t idx = static_cast(BLOCK_ID_X * BLOCK_NUM_X); + static constexpr int kCount = DistOp::kReturnsCount; +#if defined(__NVCC__) 
+ curandStatePhilox4_32_10_t state; + curand_init(seed, idx + THREAD_ID_X, offset, &state); + using SType = curandStatePhilox4_32_10_t; +#else + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, idx + THREAD_ID_X, offset, &state); + using SType = hiprandStatePhilox4_32_10_t; +#endif + size_t total_thread = GRID_NUM_X * BLOCK_NUM_X; + T args[kCount]; + T result[kCount]; + for (size_t i = idx; i < size; i += total_thread * kCount) { + kps::ElementwiseRandom(&args[0], dist, &state); + kps::ElementwiseUnary( + &result[0], &args[0], trans); + kps::WriteData( + out_data + i, &result[0], size - i, 1, stride, 1); + __syncthreads(); + } +} + +template +void distribution_and_transform(const GPUContext &dev_ctx, + DenseTensor *out, + DistOp dist, + TransformOp trans) { + T *out_data = dev_ctx.template Alloc(out); + auto size = out->numel(); + + int64_t device_id = dev_ctx.GetPlace().GetDeviceId(); + auto gen_cuda = dev_ctx.GetGenerator(); + + size_t block_size = 256; + size_t expect_grid_size = (size + block_size - 1) / block_size; + const auto &prop = backends::gpu::GetDeviceProperties(device_id); + size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) * + prop.multiProcessorCount; + size_t grid_size = + expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size; + + size_t total_thread = block_size * grid_size; + size_t curand4_loop_times = + (size + 4 * total_thread - 1) / (4 * total_thread); + // 'increment' shoulde be multiple of 4 + uint64_t increment = curand4_loop_times * 4; + + auto seed_offset = gen_cuda->IncrementOffset(increment); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + + DistributionKernel< + T, + DistOp, + TransformOp><<>>( + size, seed, offset, dist, trans, out_data, total_thread); +} + +#endif +} // namespace distribution +} // namespace phi diff --git a/paddle/phi/kernels/funcs/index_impl.cu.h b/paddle/phi/kernels/funcs/index_impl.cu.h new file mode 100644 index 00000000000..ccb70fe25dd --- /dev/null +++ b/paddle/phi/kernels/funcs/index_impl.cu.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +namespace phi { + +template +__global__ void VectorizedIndexKernel(T *out, + size_t numel, + size_t main_offset, + Functor func) { + size_t data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + size_t stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + size_t args[VecSize]; + T result[VecSize]; + for (; data_offset < main_offset; data_offset += stride) { + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary( + &result[0], &args[0], func); + kps::WriteData( + out + data_offset, &result[0], BLOCK_NUM_X * VecSize); + } + size_t num = numel - data_offset; + if (num > 0) { + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary( + &result[0], &args[0], func); + kps::WriteData(out + data_offset, &result[0], num); + } +} + +template +void IndexKernel(const KPDevice &dev_ctx, DenseTensor *out, Functor func) { + int numel = out->numel(); + T *out_data = dev_ctx.template Alloc(out); + if (numel <= 0) return; + int vec_size = phi::GetVectorizedSize(out_data); +#ifdef PADDLE_WITH_XPU_KP + int block = 64; + int grid = 8; + auto stream = dev_ctx.x_context()->xpu_stream; +#else + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); + int grid = config.block_per_grid.x; + int block = config.thread_per_block.x; + auto stream = dev_ctx.stream(); +#endif + size_t main_offset = (numel / (vec_size * block)) * vec_size * block; + switch (vec_size) { + case 4: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + case 2: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + case 1: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + default: { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/uniform_random_kernel.cu b/paddle/phi/kernels/gpu/uniform_random_kernel.cu new file mode 100644 index 00000000000..7f24a6667e5 --- /dev/null +++ b/paddle/phi/kernels/gpu/uniform_random_kernel.cu @@ -0,0 +1,163 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/uniform_random_kernel.h" + +#include "gflags/gflags.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" + +DECLARE_bool(use_curand); + +namespace phi { + +template +struct UniformGenerator { + T min_, max_; + unsigned int seed_; + T diag_val_; + unsigned int diag_num_; + unsigned int diag_step_; + __host__ __device__ UniformGenerator( + T min, T max, int seed, int diag_num, int diag_step, T diag_val) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; + } +}; + +template +struct UniformGeneratorOffset { + T min_, max_; + unsigned int seed_; + T diag_val_; + unsigned int diag_num_; + unsigned int diag_step_; + int offset_; + __host__ __device__ UniformGeneratorOffset(T min, + T max, + int seed, + int diag_num, + int diag_step, + T diag_val, + int offset) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val), + offset_(offset) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n + offset_); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; + } +}; + +template +void UniformRandomRawKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor* out) { + out->Resize(phi::make_ddim(shape.GetData())); + T* data = dev_ctx.template Alloc(out); + auto size = out->numel(); + bool seed_flag = false; + if (seed == 0) { + std::random_device rd; + seed = rd(); + seed_flag = true; + } + + auto generator = dev_ctx.GetGenerator(); + if (generator->GetIsInitPy() && seed_flag) { + if (FLAGS_use_curand) { + using MT = typename kps::details::MPTypeTrait::Type; + distribution::uniform_distribution dist; + distribution::uniform_transform trans(min, max); + distribution::distribution_and_transform(dev_ctx, out, dist, trans); + } else { + auto seed_offset = generator->IncrementOffset(1); + int64_t gen_offset = size * seed_offset.second; + auto func = UniformGeneratorOffset(min, + max, + seed_offset.first, + diag_num, + diag_step, + diag_val, + gen_offset); + IndexKernel>(dev_ctx, out, func); + } + } else { + auto func = + UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); + IndexKernel>(dev_ctx, out, func); + } +} + +template +void UniformRandomKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + DenseTensor* out) { + UniformRandomRawKernel( + dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(uniform_random_raw, + GPU, + ALL_LAYOUT, + phi::UniformRandomRawKernel, + float, + double) {} + +PD_REGISTER_KERNEL( + uniform_random, GPU, ALL_LAYOUT, phi::UniformRandomKernel, float, double) {} diff --git 
a/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc b/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc new file mode 100644 index 00000000000..881180b71b1 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/uniform_random_kernel.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void UniformRandomRawSRKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + SelectedRows* out) { + phi::UniformRandomRawKernel(dev_ctx, + shape, + dtype, + min, + max, + seed, + diag_num, + diag_step, + diag_val, + out->mutable_value()); +} + +template +void UniformRandomSRKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + SelectedRows* out) { + phi::UniformRandomKernel( + dev_ctx, shape, dtype, min, max, seed, out->mutable_value()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(uniform_random_raw_sr, + CPU, + ALL_LAYOUT, + phi::UniformRandomRawSRKernel, + float, + double, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(uniform_random_sr, + CPU, + ALL_LAYOUT, + phi::UniformRandomSRKernel, + float, + double, + phi::dtype::bfloat16) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +PD_REGISTER_KERNEL(uniform_random_raw_sr, + GPU, + ALL_LAYOUT, + phi::UniformRandomRawSRKernel, + float, + double) {} + +PD_REGISTER_KERNEL(uniform_random_sr, + GPU, + ALL_LAYOUT, + phi::UniformRandomSRKernel, + float, + double) {} +#endif diff --git a/paddle/phi/kernels/uniform_random_kernel.h b/paddle/phi/kernels/uniform_random_kernel.h new file mode 100644 index 00000000000..5bba1272785 --- /dev/null +++ b/paddle/phi/kernels/uniform_random_kernel.h @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
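As a quick end-to-end check of the uniform_random kernels added above, the public paddle.uniform API can be exercised directly. A minimal dygraph sketch, assuming a Paddle build that contains this patch (the seed value and shape are arbitrary):

import paddle

paddle.seed(2022)  # fix the global generator so the draw is repeatable
x = paddle.uniform([2, 3], dtype='float32', min=-1.0, max=1.0)
print(x)  # a 2x3 tensor with values drawn uniformly from [-1.0, 1.0)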
+ +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { + +template +void UniformRandomRawKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor* out); + +template +void UniformRandomKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + DenseTensor* out); + +template +void UniformRandomRawSRKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + SelectedRows* out); + +template +void UniformRandomSRKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + SelectedRows* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/uniform_random_sig.cc b/paddle/phi/ops/compat/uniform_random_sig.cc new file mode 100644 index 00000000000..d06d4026f4f --- /dev/null +++ b/paddle/phi/ops/compat/uniform_random_sig.cc @@ -0,0 +1,159 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature UniformRandomOpArgumentMapping( + const ArgumentMappingContext& ctx) { + int diag_num = paddle::any_cast(ctx.Attr("diag_num")); + if (ctx.IsDenseTensorOutput("Out")) { + if (diag_num) { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature("uniform_random_raw", + {}, + {"ShapeTensorList", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("uniform_random_raw", + {}, + {"ShapeTensor", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } else { + return KernelSignature("uniform_random_raw", + {}, + {"shape", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } + } + } else { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature( + "uniform_random", + {}, + {"ShapeTensorList", "dtype", "min", "max", "seed"}, + {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("uniform_random", + {}, + {"ShapeTensor", "dtype", "min", "max", "seed"}, + {"Out"}); + } else { + return KernelSignature("uniform_random", + {}, + {"shape", "dtype", "min", "max", "seed"}, + {"Out"}); + } + } + } + } else if (ctx.IsSelectedRowsOutput("Out")) { + if (diag_num) { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature("uniform_random_raw_sr", + {}, + {"ShapeTensorList", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("uniform_random_raw_sr", + {}, + {"ShapeTensor", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } else { + return KernelSignature("uniform_random_raw_sr", + {}, + {"shape", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } + } + } else { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature( + "uniform_random_sr", + {}, + {"ShapeTensorList", "dtype", "min", "max", "seed"}, + {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("uniform_random_sr", + {}, + {"ShapeTensor", "dtype", "min", "max", "seed"}, + {"Out"}); + } else { + return KernelSignature("uniform_random_sr", + {}, + {"shape", "dtype", "min", "max", "seed"}, + {"Out"}); + } + } + } + } + return KernelSignature("unregistered", {}, {}, {}); +} +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(uniform_random, phi::UniformRandomOpArgumentMapping); -- GitLab From 4fbcf6f4c52adccbc6ea0786b302485f14e5a951 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Tue, 1 Mar 2022 11:51:22 +0800 Subject: [PATCH 014/272] [PHI] Remove reseting dtype, layout and allocation by arg_def for outputs in executor (#39781) * remove SetAllocationForOutputTenosr * add place param for copy kernel * recover SetAllocationForOutputTenosr * polish code * fix empty_dev api bug * remove reseting dtype and layout for output in executor * fix merge bug * [Phi] Add ClearHolder when re-alloc on new place in DeviceContext * fix hostAlloc * remove 
setting output allocation * remove full_kernel_impl.h * fix bug of xpu full_like Co-authored-by: Aurelius84 --- paddle/fluid/framework/operator.cc | 6 ------ paddle/fluid/framework/phi_utils.cc | 21 --------------------- paddle/fluid/framework/phi_utils.h | 3 --- paddle/fluid/imperative/prepared_operator.h | 6 ------ paddle/phi/api/lib/utils/tensor_utils.cc | 21 --------------------- paddle/phi/api/lib/utils/tensor_utils.h | 3 --- paddle/phi/core/CMakeLists.txt | 2 +- paddle/phi/core/dense_tensor_impl.cc | 5 ----- paddle/phi/kernels/xpu/full_kernel.cc | 3 ++- 9 files changed, 3 insertions(+), 67 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 36208c41ed5..b12ad552aba 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2108,12 +2108,6 @@ void OperatorWithKernel::BuildPhiKernelContext( "Unsupported output `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } - - experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out, - output_defs.at(i)); - SetAllocationForOutputTenosr( - tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend)); - pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 1a39a87fb99..93bc2c02d57 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -233,26 +233,5 @@ static void SetAllocationForUninitializedDenseTensor( dense_tensor->ResetHolder(shared_allocation); } -void SetAllocationForOutputTenosr(phi::TensorBase* tensor, - const platform::Place& place) { - if (phi::DenseTensor::classof(tensor)) { - auto* dense_tensor = static_cast(tensor); - if (!dense_tensor->IsInitialized() || !(dense_tensor->place() == place)) { - SetAllocationForUninitializedDenseTensor(dense_tensor, place); - } - } else if (phi::SelectedRows::classof(tensor)) { - auto* selected_rows = static_cast(tensor); - if (!selected_rows->value().IsInitialized() || - !(selected_rows->place() == place)) { - SetAllocationForUninitializedDenseTensor(selected_rows->mutable_value(), - place); - } - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported tensor type is received when setting allocation for " - "output tensor.")); - } -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index 1a1f79d8277..a1757881692 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -62,9 +62,6 @@ class KernelArgsNameMaker { void InitDefaultKernelSignatureMap(); -void SetAllocationForOutputTenosr(phi::TensorBase* tensor, - const platform::Place& place); - // TODO(Wilber): support others device context. 
template struct ConvertToPhiContext { diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 8e1e2fbe9a1..3b5762720e7 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -323,12 +323,6 @@ void BuildDygraphPhiKernelContext( "Unsupported output `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } - - experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out, - output_defs.at(i)); - framework::SetAllocationForOutputTenosr( - tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend)); - kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); diff --git a/paddle/phi/api/lib/utils/tensor_utils.cc b/paddle/phi/api/lib/utils/tensor_utils.cc index 31325e22afa..1c9f7c3a868 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.cc +++ b/paddle/phi/api/lib/utils/tensor_utils.cc @@ -136,26 +136,5 @@ phi::ScalarArray MakePhiScalarArrayFromVarList( return result; } -void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst, - const phi::TensorArgDef& arg_def) { - VLOG(5) << "ResetTensor by TensorArgDef."; - if (phi::DenseTensor::classof(dst)) { - auto* dense_t = static_cast(dst); - auto* meta = phi::DenseTensorUtils::GetMutableMeta(dense_t); - meta->dtype = arg_def.dtype; - meta->layout = arg_def.layout; - } else if (phi::SelectedRows::classof(dst)) { - auto* selected_rows = static_cast(dst); - auto* meta = - phi::DenseTensorUtils::GetMutableMeta(selected_rows->mutable_value()); - meta->dtype = arg_def.dtype; - meta->layout = arg_def.layout; - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "Unsupported tensor type is received when reseting tensor dtype and " - "layout by argument definition.")); - } -} - } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/utils/tensor_utils.h b/paddle/phi/api/lib/utils/tensor_utils.h index 8b30d5421ab..64df59c1a2a 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.h +++ b/paddle/phi/api/lib/utils/tensor_utils.h @@ -42,8 +42,5 @@ phi::ScalarArray MakePhiScalarArrayFromVar(const framework::Variable& variable); phi::ScalarArray MakePhiScalarArrayFromVarList( const std::vector& variable_list); -void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst, - const phi::TensorArgDef& arg_def); - } // namespace experimental } // namespace paddle diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index f4f57a0acbb..8ffacbb39bb 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -22,8 +22,8 @@ cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_ cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) -cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy) +cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils) diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index 29e7dc01f32..5ee83089589 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -73,11 +73,6 @@ void DenseTensor::set_layout(const paddle::framework::DataLayout layout) { // Note: 
When you reset holder, you need to ensure the offset is correct void DenseTensor::ResetHolder(const std::shared_ptr& holder) { if (holder_) { - // TODO(zyfncg): The change of static_cast<> in check will recover back - // when SetAllocationForOutputTenosr is deleted. - // Now the numel() may return -1, and will cast to a very large number when - // compare with a data with unsigned long type, this will make checking - // failed, so it's a temporary solution to deal with this problem. PADDLE_ENFORCE_LE( numel() * static_cast(SizeOf(dtype())) + static_cast(meta_.offset), diff --git a/paddle/phi/kernels/xpu/full_kernel.cc b/paddle/phi/kernels/xpu/full_kernel.cc index 574f4e991a2..d43126d56e8 100644 --- a/paddle/phi/kernels/xpu/full_kernel.cc +++ b/paddle/phi/kernels/xpu/full_kernel.cc @@ -59,7 +59,7 @@ void FullKernel(const Context& dev_ctx, const Scalar& val, DataType dtype, DenseTensor* out) { - out->ResizeAndAllocate(phi::make_ddim(shape.GetData())); + out->Resize(phi::make_ddim(shape.GetData())); FullValueXPU(dev_ctx, out, val.to()); } @@ -69,6 +69,7 @@ void FullLikeKernel(const Context& dev_ctx, const Scalar& val, DataType dtype, DenseTensor* out) { + dev_ctx.template Alloc(out); auto value = val.to(); using XPUInTDType = typename XPUTypeTrait::Type; using CommonType = typename std::common_type< -- GitLab From 468a2a17ce13a43452bbaf6888de4e18e15f063f Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Tue, 1 Mar 2022 13:11:37 +0800 Subject: [PATCH 015/272] [phi] migrate where kernel into phi (#39811) --- paddle/fluid/operators/where_op.cc | 46 ++----- paddle/fluid/operators/where_op.cu | 126 ------------------ paddle/fluid/operators/where_op.h | 73 ---------- paddle/fluid/operators/where_op_npu.cc | 2 +- paddle/fluid/operators/where_op_xpu.cc | 2 +- paddle/phi/infermeta/binary.cc | 3 +- paddle/phi/infermeta/multiary.cc | 25 ++++ paddle/phi/infermeta/multiary.h | 4 + paddle/phi/kernels/cpu/atan2_grad_kernel.cc | 5 +- paddle/phi/kernels/cpu/atan2_kernel.cc | 5 +- paddle/phi/kernels/cpu/where_grad_kernel.cc | 54 ++++++++ paddle/phi/kernels/cpu/where_kernel.cc | 40 ++++++ paddle/phi/kernels/gpu/atan2_grad_kernel.cu | 5 +- paddle/phi/kernels/gpu/atan2_kernel.cu | 5 +- paddle/phi/kernels/gpu/where_grad_kernel.cu | 64 +++++++++ paddle/phi/kernels/gpu/where_kernel.cu | 48 +++++++ .../phi/kernels/impl/atan2_grad_kernel_impl.h | 5 +- paddle/phi/kernels/impl/atan2_kernel_impl.h | 5 +- paddle/phi/kernels/where_grad_kernel.h | 33 +++++ paddle/phi/kernels/where_kernel.h | 31 +++++ paddle/phi/ops/compat/where_grad_sig.cc | 28 ++++ 21 files changed, 352 insertions(+), 257 deletions(-) delete mode 100644 paddle/fluid/operators/where_op.cu delete mode 100644 paddle/fluid/operators/where_op.h create mode 100644 paddle/phi/kernels/cpu/where_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/where_kernel.cc create mode 100644 paddle/phi/kernels/gpu/where_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/where_kernel.cu create mode 100644 paddle/phi/kernels/where_grad_kernel.h create mode 100644 paddle/phi/kernels/where_kernel.h create mode 100644 paddle/phi/ops/compat/where_grad_sig.cc diff --git a/paddle/fluid/operators/where_op.cc b/paddle/fluid/operators/where_op.cc index 92ed2bbdc33..0f10efefa13 100644 --- a/paddle/fluid/operators/where_op.cc +++ b/paddle/fluid/operators/where_op.cc @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/where_op.h" - +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -21,31 +23,6 @@ class WhereOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Condition"), "Input", "Condition", "Where"); - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Where"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "Where"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Where"); - - auto cond_dims = ctx->GetInputDim("Condition"); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ( - cond_dims, x_dims, - platform::errors::InvalidArgument( - "The dims of Inputs(Condition) and Inputs(X) should be same. " - "But received Condition's shape is [%s], X's shape is [%s]", - cond_dims, x_dims)); - PADDLE_ENFORCE_EQ(x_dims, y_dims, - platform::errors::InvalidArgument( - "The dims of Inputs(X) and Inputs(Y) should be same. " - "But received X's shape is [%s], Y's shape is [%s]", - x_dims, y_dims)); - - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -140,19 +117,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(WhereGradNoNeedBufferVarsInferer, "X", "Y"); } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(where, WhereInferShapeFunctor, + PT_INFER_META(phi::WhereInferMeta)); REGISTER_OPERATOR(where, ops::WhereOp, ops::WhereOpMaker, ops::WhereOpGradMaker, - ops::WhereOpGradMaker); + ops::WhereOpGradMaker, + WhereInferShapeFunctor); REGISTER_OPERATOR(where_grad, ops::WhereGradOp, ops::WhereGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - where, ops::WhereKernel, - ops::WhereKernel, - ops::WhereKernel, - ops::WhereKernel); -REGISTER_OP_CPU_KERNEL( - where_grad, ops::WhereGradKernel, - ops::WhereGradKernel, - ops::WhereGradKernel, - ops::WhereGradKernel); diff --git a/paddle/fluid/operators/where_op.cu b/paddle/fluid/operators/where_op.cu deleted file mode 100644 index 61a1691e4fe..00000000000 --- a/paddle/fluid/operators/where_op.cu +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/where_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" - -namespace platform = paddle::platform; - -namespace paddle { -namespace operators { - -template -struct CondFunctor { - HOSTDEVICE inline CondFunctor() {} - - HOSTDEVICE inline T operator()(const bool cond, const T x, const T y) const { - return cond ? x : y; - } -}; - -template -__global__ void WhereCUDAKernel(const int N, const bool* cond, const T* x, - const T* y, T* out) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < N; idx += blockDim.x * gridDim.x) { - out[idx] = cond[idx] ? x[idx] : y[idx]; - } -} - -template -__global__ void WhereGradCUDAKernel(const int N, const T* dout, - const bool* cond, T* dx, T* dy) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < N; idx += blockDim.x * gridDim.x) { - if (dx != nullptr) { - dx[idx] = cond[idx] ? dout[idx] : 0.; - } - if (dy != nullptr) { - dy[idx] = cond[idx] ? 0. : dout[idx]; - } - } -} - -template -class WhereKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - auto* X = context.Input("X"); - auto* Y = context.Input("Y"); - auto* out = context.Output("Out"); - auto numel = condition->numel(); - - // TODO(GaaoWei8): Input of where can be broadcast - const bool* cond_data = condition->data(); - const T* x_data = X->data(); - const T* y_data = Y->data(); - T* out_data = out->mutable_data(context.GetPlace()); - - auto stream = context.cuda_device_context().stream(); - auto& dev_ctx = - context.template device_context(); - auto functor = CondFunctor(); - std::vector ins = {condition, X, Y}; - std::vector outs = {out}; - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; - -template -class WhereGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - const bool* cond_data = condition->data(); - auto numel = condition->numel(); - - auto* dout_t = - context.Input(framework::GradVarName("Out")); - auto* dx_t = context.Output(framework::GradVarName("X")); - auto* dy_t = context.Output(framework::GradVarName("Y")); - auto* dout = dout_t->data(); - T* dx = - (dx_t != nullptr) ? dx_t->mutable_data(context.GetPlace()) : nullptr; - T* dy = - (dy_t != nullptr) ? dy_t->mutable_data(context.GetPlace()) : nullptr; - - auto stream = context.cuda_device_context().stream(); - auto& dev_ctx = - context.template device_context(); - auto config = GetGpuLaunchConfig1D(dev_ctx, condition->numel()); - WhereGradCUDAKernel< - T><<>>( - numel, dout, cond_data, dx, dy); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - where, paddle::operators::WhereKernel, - paddle::operators::WhereKernel, - paddle::operators::WhereKernel, - paddle::operators::WhereKernel); -REGISTER_OP_CUDA_KERNEL( - where_grad, - paddle::operators::WhereGradKernel, - paddle::operators::WhereGradKernel, - paddle::operators::WhereGradKernel, - paddle::operators::WhereGradKernel); diff --git a/paddle/fluid/operators/where_op.h b/paddle/fluid/operators/where_op.h deleted file mode 100644 index 5398ee024a2..00000000000 --- a/paddle/fluid/operators/where_op.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class WhereKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - auto* X = context.Input("X"); - auto* Y = context.Input("Y"); - auto* out = context.Output("Out"); - - const bool* cond_data = condition->data(); - const T* x_data = X->data(); - const T* y_data = Y->data(); - T* out_data = out->mutable_data(context.GetPlace()); - - auto x_numel = X->numel(); - for (int i = 0; i < x_numel; i++) { - out_data[i] = cond_data[i] ? x_data[i] : y_data[i]; - } - } -}; - -template -class WhereGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - const auto* cond_data = condition->data(); - auto numel = condition->numel(); - - auto* dout_t = - context.Input(framework::GradVarName("Out")); - auto* dx_t = context.Output(framework::GradVarName("X")); - auto* dy_t = context.Output(framework::GradVarName("Y")); - - auto* dout = dout_t->data(); - if (dx_t != nullptr) { - auto* dx = dx_t->mutable_data(context.GetPlace()); - for (int i = 0; i < numel; i++) { - dx[i] = dout[i] * (cond_data[i] ? 1. : 0.); - } - } - if (dy_t != nullptr) { - auto* dy = dy_t->mutable_data(context.GetPlace()); - for (int i = 0; i < numel; i++) { - dy[i] = dout[i] * (cond_data[i] ? 0. : 1.); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/where_op_npu.cc b/paddle/fluid/operators/where_op_npu.cc index d4294393daa..35508950941 100755 --- a/paddle/fluid/operators/where_op_npu.cc +++ b/paddle/fluid/operators/where_op_npu.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/where_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/where_op_xpu.cc b/paddle/fluid/operators/where_op_xpu.cc index 3a4875c0700..41232c8b5e8 100644 --- a/paddle/fluid/operators/where_op_xpu.cc +++ b/paddle/fluid/operators/where_op_xpu.cc @@ -14,7 +14,7 @@ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/where_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 1905e33bd03..675e68af743 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -306,8 +306,7 @@ void CrossInferMeta(const MetaTensor& x, } void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { - auto in_dims = x.dims(); - out->set_dims(in_dims); + out->share_meta(x); } void BCELossInferMeta(const MetaTensor& input, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 8857c2cf424..7634e5e01ac 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -133,4 +133,29 @@ void ConcatInferMeta(const std::vector& x, out->share_lod(*x.at(0)); } +void WhereInferMeta(const MetaTensor& condition, + const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out) { + auto cond_dims = condition.dims(); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + PADDLE_ENFORCE_EQ( + cond_dims, + x_dims, + phi::errors::InvalidArgument( + "The dims of Inputs(Condition) and Inputs(X) should be same. " + "But received Condition's shape is [%s], X's shape is [%s]", + cond_dims, + x_dims)); + PADDLE_ENFORCE_EQ(x_dims, + y_dims, + phi::errors::InvalidArgument( + "The dims of Inputs(X) and Inputs(Y) should be same. " + "But received X's shape is [%s], Y's shape is [%s]", + x_dims, + y_dims)); + out->share_meta(x); +} + } // namespace phi diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 473845c6e40..2afb79daa35 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -30,4 +30,8 @@ void ConcatInferMeta(const std::vector& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void WhereInferMeta(const MetaTensor& condition, + const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/cpu/atan2_grad_kernel.cc b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc index 6ff7431f0c8..7a519aab0ad 100644 --- a/paddle/phi/kernels/cpu/atan2_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/atan2_grad_kernel.h" +#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" + #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" PD_REGISTER_KERNEL(atan2_grad, CPU, diff --git a/paddle/phi/kernels/cpu/atan2_kernel.cc b/paddle/phi/kernels/cpu/atan2_kernel.cc index eb38a6c90b7..df6f5f59ac0 100644 --- a/paddle/phi/kernels/cpu/atan2_kernel.cc +++ b/paddle/phi/kernels/cpu/atan2_kernel.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/atan2_kernel.h" +#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" + #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" PD_REGISTER_KERNEL(atan2, CPU, diff --git a/paddle/phi/kernels/cpu/where_grad_kernel.cc b/paddle/phi/kernels/cpu/where_grad_kernel.cc new file mode 100644 index 00000000000..67c8cee1038 --- /dev/null +++ b/paddle/phi/kernels/cpu/where_grad_kernel.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/where_grad_kernel.h" + +namespace phi { + +template +void WhereGradKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + const auto* cond_data = condition.data(); + auto numel = condition.numel(); + auto* dout = out_grad.data(); + + if (x_grad != nullptr) { + auto* dx = ctx.template Alloc(x_grad); + for (int i = 0; i < numel; i++) { + dx[i] = dout[i] * (cond_data[i] ? 1. : 0.); + } + } + if (y_grad != nullptr) { + auto* dy = ctx.template Alloc(y_grad); + for (int i = 0; i < numel; i++) { + dy[i] = dout[i] * (cond_data[i] ? 0. : 1.); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(where_grad, + CPU, + ALL_LAYOUT, + phi::WhereGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/where_kernel.cc b/paddle/phi/kernels/cpu/where_kernel.cc new file mode 100644 index 00000000000..f624c13c262 --- /dev/null +++ b/paddle/phi/kernels/cpu/where_kernel.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/where_kernel.h" + +namespace phi { + +template +void WhereKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + const bool* cond_data = condition.data(); + const T* x_data = x.data(); + const T* y_data = y.data(); + auto x_numel = x.numel(); + + T* out_data = ctx.template Alloc(out); + + for (int i = 0; i < x_numel; i++) { + out_data[i] = cond_data[i] ? 
x_data[i] : y_data[i]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + where, CPU, ALL_LAYOUT, phi::WhereKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/atan2_grad_kernel.cu b/paddle/phi/kernels/gpu/atan2_grad_kernel.cu index 1cc3311c363..6652d242de5 100644 --- a/paddle/phi/kernels/gpu/atan2_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/atan2_grad_kernel.cu @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/atan2_grad_kernel.h" -#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" PD_REGISTER_KERNEL(atan2_grad, GPU, diff --git a/paddle/phi/kernels/gpu/atan2_kernel.cu b/paddle/phi/kernels/gpu/atan2_kernel.cu index 702c959b78f..dd0bba177de 100644 --- a/paddle/phi/kernels/gpu/atan2_kernel.cu +++ b/paddle/phi/kernels/gpu/atan2_kernel.cu @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/atan2_kernel.h" -#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" PD_REGISTER_KERNEL(atan2, GPU, diff --git a/paddle/phi/kernels/gpu/where_grad_kernel.cu b/paddle/phi/kernels/gpu/where_grad_kernel.cu new file mode 100644 index 00000000000..f21aca80e21 --- /dev/null +++ b/paddle/phi/kernels/gpu/where_grad_kernel.cu @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/where_grad_kernel.h" + +namespace phi { + +template +__global__ void WhereGradCUDAKernel( + const int N, const T* dout, const bool* cond, T* dx, T* dy) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + for (; idx < N; idx += blockDim.x * gridDim.x) { + if (dx != nullptr) { + dx[idx] = cond[idx] ? dout[idx] : 0.; + } + if (dy != nullptr) { + dy[idx] = cond[idx] ? 0. : dout[idx]; + } + } +} + +template +void WhereGradKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + const bool* cond_data = condition.data(); + auto numel = condition.numel(); + auto* dout = out_grad.data(); + + T* dx = (x_grad != nullptr) ? ctx.template Alloc(x_grad) : nullptr; + T* dy = (y_grad != nullptr) ? 
ctx.template Alloc(y_grad) : nullptr; + + auto stream = ctx.stream(); + auto config = backends::gpu::GetGpuLaunchConfig1D(ctx, numel); + WhereGradCUDAKernel< + T><<>>( + numel, dout, cond_data, dx, dy); +} + +} // namespace phi + +PD_REGISTER_KERNEL(where_grad, + GPU, + ALL_LAYOUT, + phi::WhereGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/where_kernel.cu b/paddle/phi/kernels/gpu/where_kernel.cu new file mode 100644 index 00000000000..03c24eea3a9 --- /dev/null +++ b/paddle/phi/kernels/gpu/where_kernel.cu @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/where_kernel.h" + +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" + +namespace phi { + +// Cond +template +struct CondFunctor { + inline HOSTDEVICE T operator()(const bool cond, const T x, const T y) const { + return cond ? x : y; + } +}; + +template +void WhereKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + std::vector ins = {&condition, &x, &y}; + std::vector outs = {out}; + ctx.template Alloc(out); + + CondFunctor func; + funcs::BroadcastKernel( + ctx, ins, &outs, -1, func); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + where, GPU, ALL_LAYOUT, phi::WhereKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h b/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h index d0dd1829851..0eff1378f41 100644 --- a/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h @@ -14,9 +14,10 @@ #pragma once -#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/atan2_grad_kernel.h" -#include "paddle/phi/kernels/funcs/for_range.h" + +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/impl/atan2_kernel_impl.h b/paddle/phi/kernels/impl/atan2_kernel_impl.h index 2cae914e2f6..7653032f211 100644 --- a/paddle/phi/kernels/impl/atan2_kernel_impl.h +++ b/paddle/phi/kernels/impl/atan2_kernel_impl.h @@ -14,9 +14,10 @@ #pragma once -#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/atan2_kernel.h" -#include "paddle/phi/kernels/funcs/for_range.h" + +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/core/dense_tensor.h" namespace phi { template diff --git a/paddle/phi/kernels/where_grad_kernel.h b/paddle/phi/kernels/where_grad_kernel.h new file mode 100644 index 00000000000..1a3c66ee6ed --- /dev/null +++ b/paddle/phi/kernels/where_grad_kernel.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void WhereGradKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/where_kernel.h b/paddle/phi/kernels/where_kernel.h new file mode 100644 index 00000000000..254271ac9c7 --- /dev/null +++ b/paddle/phi/kernels/where_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void WhereKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/where_grad_sig.cc b/paddle/phi/ops/compat/where_grad_sig.cc new file mode 100644 index 00000000000..71984a26d35 --- /dev/null +++ b/paddle/phi/ops/compat/where_grad_sig.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
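The migrated where and where_grad kernels keep the existing user-facing behaviour of paddle.where. A minimal dygraph sketch covering both the forward and the backward path, assuming a Paddle build that contains this patch (tensor values are arbitrary):

import paddle

cond = paddle.to_tensor([True, False, True])
x = paddle.to_tensor([1.0, 2.0, 3.0], stop_gradient=False)
y = paddle.to_tensor([10.0, 20.0, 30.0], stop_gradient=False)

out = paddle.where(cond, x, y)  # [1.0, 20.0, 3.0]
out.sum().backward()
print(x.grad)  # gradient reaches x only where cond is True
print(y.grad)  # gradient reaches y only where cond is False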
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature WhereGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("where_grad", + {"Condition", "X", "Y", GradVarName("Out")}, + {}, + {GradVarName("X"), GradVarName("Y")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(where_grad, phi::WhereGradOpArgumentMapping); -- GitLab From a7acfc5b357b8d7de29bd3cf240309c2deb72a2e Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Tue, 1 Mar 2022 13:16:45 +0800 Subject: [PATCH 016/272] update error_string when target is out of bound (#40001) --- python/paddle/nn/functional/loss.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index e59ef5ebfb0..e6efde83628 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1667,11 +1667,11 @@ def cross_entropy(input, label_min = paddle.min(valid_label) label_max = paddle.max(valid_label) if label_min < 0: - raise ValueError("label should not out of bound, but got{}". - format(label_min)) + raise ValueError("Target {} is out of lower bound.".format( + label_min.item())) if label_max >= input.shape[axis]: - raise ValueError("label should not out of bound, but got{}". - format(label_max)) + raise ValueError("Target {} is out of upper bound.".format( + label_max.item())) if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): _, _, out = _C_ops.softmax_with_cross_entropy( input, label, 'soft_label', soft_label, 'ignore_index', -- GitLab From 4204b97ab350298812dd56fb4a5eac504b848aae Mon Sep 17 00:00:00 2001 From: pangyoki Date: Tue, 1 Mar 2022 13:53:39 +0800 Subject: [PATCH 017/272] change tests_v2 to dynamic_tests_v2 in CI op benchmark (#39995) --- tools/ci_op_benchmark.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh index 80efd32ecf1..1db79418b2d 100644 --- a/tools/ci_op_benchmark.sh +++ b/tools/ci_op_benchmark.sh @@ -106,7 +106,7 @@ function prepare_benchmark_environment { [ $? -ne 0 ] && LOG "[FATAL] Clone benchmark repo fail." && exit -1 LOG "[INFO] Collect api info ..." python benchmark/api/deploy/collect_api_info.py \ - --test_module_name tests_v2 \ + --test_module_name dynamic_tests_v2 \ --info_file api_info.txt >& 2 [ $? -ne 0 ] && LOG "[FATAL] Collect api info fail." && exit -1 [ ! -f benchmark/ci/scripts/op_benchmark.config ] && LOG "[FATAL] Missing op_benchmark.config!" && exit -1 @@ -185,7 +185,7 @@ function run_op_benchmark_test { logs_dir="$(pwd)/logs-${branch_name}" [ -d $logs_dir ] && rm -rf $logs_dir/* || mkdir -p $logs_dir pushd benchmark/api > /dev/null - bash deploy/main_control.sh tests_v2 \ + bash deploy/main_control.sh dynamic_tests_v2 \ tests_v2/configs \ $logs_dir \ $VISIBLE_DEVICES \ @@ -212,7 +212,7 @@ function check_op_benchmark_result { # there is no need to recompile and install paddle LOG "[INFO] retry ${retry_time} times ..." 
pushd benchmark/api > /dev/null - bash deploy/main_control.sh tests_v2 \ + bash deploy/main_control.sh dynamic_tests_v2 \ tests_v2/configs \ ${logs_dir} \ $VISIBLE_DEVICES \ -- GitLab From 9de798928509d5bc7e213c385ef565fc7ecfa3dc Mon Sep 17 00:00:00 2001 From: Guoxia Wang Date: Tue, 1 Mar 2022 14:45:38 +0800 Subject: [PATCH 018/272] add MasterParam and MasterParamOut for sparse_momentum op (#39969) --- paddle/fluid/pybind/op_function_generator.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 2b07a439d33..d23b3dd64ab 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -60,7 +60,8 @@ std::map> op_ins_map = { {"momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}}, {"merged_momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}}, - {"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}}, + {"sparse_momentum", + {"Param", "Grad", "Velocity", "Index", "LearningRate", "MasterParam"}}, {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, {"run_program", {"X", "Params"}}, {"fused_feedforward", @@ -124,7 +125,7 @@ std::map> op_outs_map = { {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, - {"sparse_momentum", {"ParamOut", "VelocityOut"}}, + {"sparse_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"rnn", {"DropoutState", "Reserve", "Out", "State"}}, {"run_program", {"DOut"}}, {"adam", @@ -181,7 +182,7 @@ std::map> op_passing_outs_map = { "out_old_num_accumulates", "out_num_updates"}}, {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, - {"sparse_momentum", {"ParamOut", "VelocityOut"}}, + {"sparse_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"batch_norm", {"MeanOut", "VarianceOut"}}, {"sync_batch_norm", {"MeanOut", "VarianceOut"}}, {"accuracy", {"Correct", "Total"}}, -- GitLab From 6d26b332d9fee77f16a8655c8ead3f21f2805975 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Tue, 1 Mar 2022 14:52:54 +0800 Subject: [PATCH 019/272] [bf16] add bf16 kernel: scale gather sum (#39683) * add scale gather sum * refine CUDA_ATOMIC_WRAPPER ADD for bf16 * add gather unittest * solve conflict * add scale uinttest * add sum unittest * solve conflict * refine gather unittest * refine unittest --- paddle/fluid/operators/gather_op.cc | 6 +- paddle/fluid/operators/gather_op.cu | 6 +- .../operators/math/selected_rows_functor.cu | 2 + paddle/fluid/operators/sum_op.cu | 3 +- .../platform/device/gpu/gpu_primitives.h | 67 +++++++++++++++++++ paddle/phi/kernels/gpu/scale_kernel.cu | 1 + .../paddle/fluid/tests/unittests/op_test.py | 7 +- .../fluid/tests/unittests/test_gather_op.py | 35 +++++++++- .../fluid/tests/unittests/test_scale_op.py | 19 +++++- .../fluid/tests/unittests/test_sum_op.py | 26 +++++++ 10 files changed, 164 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index cf4d7b1d670..8a405cc6fc1 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -201,12 +201,14 @@ REGISTER_OPERATOR(gather_grad, ops::GatherGradOp, REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel, 
ops::GatherOpKernel, ops::GatherOpKernel, ops::GatherOpKernel, - ops::GatherOpKernel); + ops::GatherOpKernel, + ops::GatherOpKernel); REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel); + ops::GatherGradientOpKernel, + ops::GatherGradientOpKernel); REGISTER_OP_VERSION(gather) .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC", paddle::framework::compatible::OpVersionDesc().NewInput( diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index 19568835a6e..a502a130409 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -130,9 +130,11 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel); + ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel); REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel); + ops::GatherGradOpCUDAKernel, + ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 8563d8b05b1..a4678550cf7 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -445,6 +446,7 @@ template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; +template struct MergeAdd; template struct MergeAdd>; template struct MergeAdd>; diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 3e2d2a5495b..33590c1d7cc 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -258,4 +258,5 @@ REGISTER_OP_CUDA_KERNEL( ops::SumKernel, ops::SumKernel, ops::SumKernel, - ops::SumKernel); + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index 8aec8e840f3..803674779e7 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #endif #include +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -244,6 +245,72 @@ __device__ __forceinline__ void VectorizedAtomicAddPerBlock( #endif #endif +// NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16. 
+inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) { + bfloat16 low_half; + // the bfloat16 in lower 16bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = static_cast(static_cast(low_half) + x); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) { + bfloat16 high_half; + // the bfloat16 in higher 16bits + high_half.x = static_cast(val >> 16); + high_half = static_cast(static_cast(high_half) + x); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +static __device__ __forceinline__ bfloat16 CUDABF16ToPDBF16(__nv_bfloat16 x) { + return *reinterpret_cast(&x); +} + +static __device__ __forceinline__ __nv_bfloat16 PDBF16ToCUDABF16(bfloat16 x) { + return *reinterpret_cast<__nv_bfloat16 *>(&x); +} + +CUDA_ATOMIC_WRAPPER(Add, bfloat16) { + return CUDABF16ToPDBF16(atomicAdd(reinterpret_cast<__nv_bfloat16 *>(address), + PDBF16ToCUDABF16(val))); +} +#else +CUDA_ATOMIC_WRAPPER(Add, bfloat16) { + // concrete packed bfloat16 value may exsits in lower or higher 16bits + // of the 32bits address. + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t sum; + uint32_t newval; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // the bfloat16 value stay at lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, + bf16_add_to_low_half(assumed, val_f)); + } while (old != assumed); + bfloat16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // the bfloat16 value stay at higher 16 bits of the address. 
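The two helpers above emulate a bfloat16 atomic add by updating one half of an aligned 32-bit word through atomicCAS. As a host-side illustration of the same bit packing (plain Python/numpy, truncating where the device code rounds; not part of the patch):

import numpy as np

def bf16_bits_to_float(bits):
    # A bfloat16 is the upper 16 bits of an IEEE-754 float32 pattern.
    return float(np.array([bits << 16], dtype=np.uint32).view(np.float32)[0])

def float_to_bf16_bits(value):
    # Truncation for brevity; the real conversion rounds to nearest-even.
    return int(np.array([value], dtype=np.float32).view(np.uint32)[0] >> 16)

def add_to_low_half(val, x):
    # Only the bfloat16 packed in the lower 16 bits of `val` is updated.
    low = bf16_bits_to_float(val & 0xFFFF)
    return (val & 0xFFFF0000) | float_to_bf16_bits(low + x)

def add_to_high_half(val, x):
    # Only the bfloat16 packed in the upper 16 bits of `val` is updated.
    high = bf16_bits_to_float(val >> 16)
    return (val & 0xFFFF) | (float_to_bf16_bits(high + x) << 16)

# 0x3F80 is bfloat16 1.0; adding 1.0 to the low half yields 0x4000 (2.0).
assert add_to_low_half(0xAAAA3F80, 1.0) == 0xAAAA4000
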
+ do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, + bf16_add_to_high_half(assumed, val_f)); + } while (old != assumed); + bfloat16 ret; + ret.x = old >> 16; + return ret; + } +} +#endif + CUDA_ATOMIC_WRAPPER(Add, complex) { float *real = reinterpret_cast(address); float *imag = real + 1; diff --git a/paddle/phi/kernels/gpu/scale_kernel.cu b/paddle/phi/kernels/gpu/scale_kernel.cu index d9c8de21c5b..930c50a24be 100644 --- a/paddle/phi/kernels/gpu/scale_kernel.cu +++ b/paddle/phi/kernels/gpu/scale_kernel.cu @@ -70,6 +70,7 @@ PD_REGISTER_KERNEL(scale, float, double, phi::dtype::float16, + phi::dtype::bfloat16, uint8_t, int8_t, int16_t, diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 848ebae0706..5694ef25c79 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -482,7 +482,12 @@ class OpTest(unittest.TestCase): op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) "infer datatype from inputs and outputs for this test case" - self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) + if self.is_bfloat16_op(): + self.dtype = np.uint16 + self.__class__.dtype = self.dtype + self.output_dtype = np.uint16 + else: + self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) inputs = append_input_output(block, op_proto, self.inputs, True, self.dtype) outputs = append_input_output(block, op_proto, self.outputs, False, diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py index 83b39a62f15..978a3d86d88 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid from paddle.framework import core @@ -117,6 +117,39 @@ class TestCase6(TestGatherOp): self.index_type = "int32" +class TestGatherBF16Op(OpTest): + def setUp(self): + self.op_type = "gather" + self.dtype = np.uint16 + self.config() + xnp = np.random.random(self.x_shape).astype(np.float32) + axis_np = np.array(self.axis).astype(self.axis_type) + index_np = np.array(self.index).astype(self.index_type) + self.inputs = { + 'X': convert_float_to_uint16(xnp), + 'Index': index_np, + 'Axis': axis_np + } + out = gather_numpy(self.inputs['X'], index_np, axis_np[0]) + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', numeric_grad_delta=0.5) + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (3, 88, 3) + self.index = [1, 3, 5] + self.index_type = "int32" + self.axis = [1] + self.axis_type = "int32" + + class TestGatherOp1(OpTest): def setUp(self): self.op_type = "gather" diff --git a/python/paddle/fluid/tests/unittests/test_scale_op.py b/python/paddle/fluid/tests/unittests/test_scale_op.py index c1ce032f506..d432b8057f6 100644 --- a/python/paddle/fluid/tests/unittests/test_scale_op.py +++ b/python/paddle/fluid/tests/unittests/test_scale_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid import paddle.fluid.core as core @@ -153,6 +153,23 @@ class 
TestScaleFp16Op(TestScaleOp): place, ["X"], "Out", max_relative_error=0.05) +class TestScaleBF16Op(OpTest): + def setUp(self): + self.op_type = "scale" + self.dtype = np.uint16 + self.attrs = {'scale': -2.3} + x = np.random.random((10, 10)).astype(np.float32) + out = x * np.float32(self.attrs['scale']) + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', numeric_grad_delta=0.8) + + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestScaleFp16OpSelectedRows(TestScaleOpSelectedRows): diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index eddccd4ff24..7040145a768 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -298,6 +298,32 @@ def create_test_sum_fp16_class(parent): globals()[cls_name] = TestSumFp16Case +#----------- test bf16 ----------- +class TestSumBF16Op(OpTest): + def setUp(self): + self.op_type = "sum" + self.init_kernel_type() + x0 = np.random.random((3, 40)).astype(np.float32) + x1 = np.random.random((3, 40)).astype(np.float32) + x2 = np.random.random((3, 40)).astype(np.float32) + y = x0 + x1 + x2 + self.inputs = { + "X": [("x0", convert_float_to_uint16(x0)), + ("x1", convert_float_to_uint16(x1)), + ("x2", convert_float_to_uint16(x2))] + } + self.outputs = {'Out': convert_float_to_uint16(y)} + + def init_kernel_type(self): + self.dtype = np.uint16 + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', numeric_grad_delta=0.5) + + class API_Test_Add_n(unittest.TestCase): def test_api(self): with fluid.program_guard(fluid.Program(), fluid.Program()): -- GitLab From 25650774d9623a3975567fa9f9b9a35b928ffce2 Mon Sep 17 00:00:00 2001 From: zhangchunle Date: Tue, 1 Mar 2022 14:57:54 +0800 Subject: [PATCH 020/272] add test_warpctc_op in mac (#39983) --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 2361bd27062..7d64cf7bd89 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -590,7 +590,7 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4) -if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) +if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL OR APPLE) py_test_modules(test_warpctc_op MODULES test_warpctc_op) set_tests_properties(test_warpctc_op PROPERTIES TIMEOUT 120) endif() -- GitLab From fc06be9dbd82da832c8eed8cac8573d0166638ba Mon Sep 17 00:00:00 2001 From: wenbin Date: Tue, 1 Mar 2022 15:08:27 +0800 Subject: [PATCH 021/272] remove conv_affine_channel_fuse_pass (#39817) * remove * pass * more pass --- paddle/fluid/framework/ir/CMakeLists.txt | 1 - .../ir/conv_affine_channel_fuse_pass.cc | 420 ------------------ .../ir/conv_affine_channel_fuse_pass.h | 54 --- .../inference/api/paddle_pass_builder.cc | 56 ++- .../quantization/quant2_int8_mkldnn_pass.py | 3 - .../test_conv_affine_channel_fuse_pass.py | 160 ------- ...onv_eltwiseadd_affine_channel_fuse_pass.py | 183 -------- 
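The bfloat16 unit tests added above (TestGatherBF16Op, TestScaleBF16Op, TestSumBF16Op) all feed inputs through convert_float_to_uint16, storing bfloat16 data in uint16 arrays. A rough numpy equivalent of that round trip, shown here with truncation while the op_test helper rounds, purely for orientation:

import numpy as np

def to_bf16_uint16(x):
    # Keep the upper 16 bits of each float32 bit pattern.
    return (np.asarray(x, dtype=np.float32).view(np.uint32) >> 16).astype(np.uint16)

def from_bf16_uint16(x):
    # Re-expand to float32 by restoring the dropped lower 16 bits as zeros.
    return (np.asarray(x, dtype=np.uint16).astype(np.uint32) << 16).view(np.float32)

x = np.random.random((3, 40)).astype(np.float32)
x_bf16 = to_bf16_uint16(x)
# bfloat16 keeps ~8 mantissa bits, so values in [0, 1) survive to within ~1e-2.
assert np.allclose(from_bf16_uint16(x_bf16), x, atol=1e-2)
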
tools/parallel_UT_rule.py | 2 - tools/static_mode_white_list.py | 1 - 9 files changed, 25 insertions(+), 855 deletions(-) delete mode 100644 paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc delete mode 100644 paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h delete mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py delete mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_affine_channel_fuse_pass.py diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index dad5358590c..0d53a54ff82 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -78,7 +78,6 @@ pass_library(is_test_pass base) pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference) -pass_library(conv_affine_channel_fuse_pass inference) pass_library(transpose_flatten_concat_fuse_pass inference) pass_library(identity_scale_op_clean_pass base) pass_library(sync_batch_norm_pass base) diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc deleted file mode 100644 index f28c9988bd8..00000000000 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ /dev/null @@ -1,420 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h" - -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_version_registry.h" - -namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace framework { -namespace ir { - -class Node; - -#define GET_CONV_BN_NODES(pattern_name) \ - /* OPERATORS */ \ - GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ - GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \ - /* CONV inputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \ - /* CONV outputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \ - /* Affine Channel inputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \ - GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \ - /* Affine channel outputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */ - -void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, - const ir::Node& ac_scale, - const LoDTensor& ac_bias_tensor, - LoDTensor* eltwise_y_in_tensor) { - using EigenVectorArrayMap = - Eigen::Map>; - using ConstEigenVectorArrayMap = - Eigen::Map>; - using EigenMatrixArrayMap = Eigen::Map< - Eigen::Array>; - - // Re-compute bias of conv2d from AffineChannel - PADDLE_ENFORCE_EQ( - eltwise_y_in_tensor->dims(), ac_bias_tensor.dims(), - platform::errors::InvalidArgument( - "Tensor elementwise y(%d) and activation bias(%d) must have same " - "dimension.", - eltwise_y_in_tensor->dims().size(), ac_bias_tensor.dims().size())); - - auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable(); - - ConstEigenVectorArrayMap scale_array(scale_tensor->data(), - scale_tensor->numel(), 1); - ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data(), - ac_bias_tensor.numel(), 1); - - EigenVectorArrayMap eltwise_y_in_array( - eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), - eltwise_y_in_tensor->numel(), 1); - - eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array; - - // Re-compute weight of conv2d from AffineChannel - auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); - auto weights_shape = weights->dims(); - auto weights_shape_2d = phi::flatten_to_2d(weights_shape, 1); - auto* weights_data = weights->mutable_data(platform::CPUPlace()); - - EigenMatrixArrayMap weights_array_2d(weights_data, weights_shape_2d[0], - weights_shape_2d[1]); - - weights_array_2d.colwise() *= scale_array; - - // Check for subnormal values that slows down convolution execution - for (int i = 0; i < weights->numel(); ++i) { - if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0; - } -} - -ConvAffineChannelFusePass::ConvAffineChannelFusePass() { - AddOpCompat(OpCompat("conv2d")) - .AddInput("Input") - .IsTensor() - .End() - .AddInput("Filter") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddInput("ResidualData") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Output") - .IsTensor() - .End() - .AddAttr("strides") - .IsType>() - .End() - .AddAttr("paddings") - .IsType>() - .End() - .AddAttr("padding_algorithm") - .IsOptional() - .IsStringIn({"EXPLICIT", "SAME", "VALID"}) - .End() - .AddAttr("groups") - .IsNumGE(1) - .End() - .AddAttr("dilations") - .IsType>() - .End() - .AddAttr("data_format") - .IsStringIn({"NCHW", "AnyLayout"}) - 
.End(); - - AddOpCompat(OpCompat("affine_channel")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Scale") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("data_layout") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - - AddOpCompat(OpCompat("elementwise_add")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("axis") - .IsNumEQ(1) - .End(); -} - -void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init(name_scope_, graph); - - auto* scope = param_scope(); - PADDLE_ENFORCE_NOT_NULL( - scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); - - GraphPatternDetector gpd; - auto* conv_input = - gpd.mutable_pattern() - ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) - ->AsInput() - ->assert_is_op_input("conv2d", "Input"); - patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), - name_scope_); - conv_ac_pattern(conv_input, false /*with_eltwise_add*/); - - int found_conv_ac_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "ConvAffineChannelFusePass in op compat failed."; - return; - } - - VLOG(4) << "handle ConvAffineChannel fuse"; - - GET_CONV_BN_NODES(conv_ac_pattern); - - auto data_format = conv->Op()->GetAttrIfExists("data_format"); - if (data_format == "AnyLayout") { - LOG_FIRST_N(WARNING, 1) << "conv_affine_channel_fuse_pass is enabled, " - "it's wrong if data_format of conv is not " - "NCHW."; - } - - // Get affine_channel bias for resizing eltwise_y! - auto* ac_bias_tensor = - scope->FindVar(ac_bias->Name())->GetMutable(); - - // Create eltwise_y (conv bias) variable - VarDesc eltwise_y_in_desc( - patterns::PDNodeName(name_scope_, "eltwise_y_in")); - // Set shape && datatype manually - eltwise_y_in_desc.SetShape(phi::vectorize(ac_bias_tensor->dims())); - eltwise_y_in_desc.SetDataType( - framework::TransToProtoVarType(ac_bias_tensor->dtype())); - eltwise_y_in_desc.SetLoDLevel(ac_bias->Var()->GetLoDLevel()); - eltwise_y_in_desc.SetPersistable(true); - - // Initialize eltwise_y - auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc); - auto* eltwise_y_in_tensor = - scope->Var(eltwise_y_in_node->Name())->GetMutable(); - eltwise_y_in_tensor->Resize(ac_bias_tensor->dims()); - std::fill_n(eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), - eltwise_y_in_tensor->numel(), 0.0f); - - // update weights and biases - recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, - eltwise_y_in_tensor); - - // create an elementwise add node. - OpDesc desc; - desc.SetInput("X", std::vector({conv_out->Name()})); - desc.SetInput("Y", std::vector({eltwise_y_in_node->Name()})); - desc.SetOutput("Out", std::vector({ac_out->Name()})); - desc.SetType("elementwise_add"); - desc.SetAttr("axis", 1); - desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists("use_mkldnn")); - - auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. 
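The folding performed by recompute_bias_and_weights above rests on a simple identity: an affine_channel applied after a convolution can be absorbed by rescaling the conv weights per output channel and adding the channel bias. A tiny numpy check of that identity for a 1x1-style case (an illustrative sketch with made-up shapes, not the pass itself; the conv-only variant starts from a zero conv bias):

import numpy as np

np.random.seed(0)
x = np.random.rand(8, 4)        # 8 spatial positions, 4 input channels
w = np.random.rand(4, 6)        # behaves like a 1x1 conv: 4 in, 6 out channels
scale = np.random.rand(6)       # affine_channel per-output-channel scale
bias = np.random.rand(6)        # affine_channel per-output-channel bias

y_ref = x.dot(w) * scale + bias       # conv2d -> affine_channel
y_fused = x.dot(w * scale) + bias     # conv2d with rescaled weights -> elementwise_add
assert np.allclose(y_ref, y_fused)
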
- - GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel}); - - IR_NODE_LINK_TO(conv_out, eltwise_op); - IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); - IR_NODE_LINK_TO(eltwise_op, ac_out); - found_conv_ac_count++; - }; - - gpd(graph, handler); - - AddStatis(found_conv_ac_count); -} - -ConvEltwiseAddAffineChannelFusePass::ConvEltwiseAddAffineChannelFusePass() { - AddOpCompat(OpCompat("conv2d")) - .AddInput("Input") - .IsTensor() - .End() - .AddInput("Filter") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddInput("ResidualData") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Output") - .IsTensor() - .End() - .AddAttr("strides") - .IsType>() - .End() - .AddAttr("paddings") - .IsType>() - .End() - .AddAttr("padding_algorithm") - .IsOptional() - .IsStringIn({"EXPLICIT", "SAME", "VALID"}) - .End() - .AddAttr("groups") - .IsNumGE(1) - .End() - .AddAttr("dilations") - .IsType>() - .End() - .AddAttr("data_format") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - AddOpCompat(OpCompat("affine_channel")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Scale") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("data_layout") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - AddOpCompat(OpCompat("elementwise_add")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("axis") - .IsNumEQ(1) - .End(); -} - -void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init(name_scope_, graph); - - auto* scope = param_scope(); - PADDLE_ENFORCE_NOT_NULL( - scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); - - GraphPatternDetector gpd; - auto* conv_input = - gpd.mutable_pattern() - ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) - ->AsInput() - ->assert_is_op_input("conv2d", "Input"); - patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), - name_scope_); - conv_ac_pattern(conv_input, true /*with_eltwise_add*/); - - int found_conv_ac_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - if (!IsCompat(subgraph, g)) { - LOG(WARNING) - << "ConvEltwiseAddAffineChannelFusePass in op compat failed."; - return; - } - - VLOG(4) << "handle ConvBN fuse"; - - GET_CONV_BN_NODES(conv_ac_pattern); - auto data_format = conv->Op()->GetAttrIfExists("data_format"); - if (data_format == "AnyLayout") { - LOG_FIRST_N(WARNING, 1) << "conv_eltwiseadd_affine_channel_fuse_pass is " - "enabled, it's wrong if data_format of conv " - "is not NCHW."; - } - // OPERATORS - GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern); - // BIAS inputs - GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern); - // BIAS outputs - GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern); - - // Get eltwise_y (conv bias) variable - auto* eltwise_y_in_tensor = - scope->FindVar(eltwise_y_in->Name())->GetMutable(); - - // Get batch norm bias - auto* ac_bias_tensor = - scope->FindVar(ac_bias->Name())->GetMutable(); - - recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, - eltwise_y_in_tensor); - - // Update the elementwise_add node - eltwise->Op()->SetAttr("axis", 1); - eltwise->Op()->SetOutput("Out", std::vector({ac_out->Name()})); - - 
GraphSafeRemoveNodes(graph, - {ac_scale, ac_bias, affine_channel, eltwise_out}); - - IR_NODE_LINK_TO(eltwise, ac_out); - - found_conv_ac_count++; - }; - - gpd(graph, handler); - AddStatis(found_conv_ac_count); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(conv_affine_channel_fuse_pass, - paddle::framework::ir::ConvAffineChannelFusePass); -REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass, - paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass); -REGISTER_PASS_CAPABILITY(conv_affine_channel_fuse_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("conv2d", 1) - .EQ("affine_channel", 0)); -REGISTER_PASS_CAPABILITY(conv_eltwiseadd_affine_channel_fuse_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("conv2d", 1) - .LE("elementwise_add", 1) - .EQ("affine_channel", 0)); diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h deleted file mode 100644 index 8cfaf5c6a89..00000000000 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" - -namespace paddle { -namespace framework { -namespace ir { - -/* - * Fuse the Conv and ConvAffineChannel. 
- */ -class Graph; - -class ConvAffineChannelFusePass : public FusePassBase { - public: - ConvAffineChannelFusePass(); - virtual ~ConvAffineChannelFusePass() {} - - protected: - void ApplyImpl(ir::Graph*) const override; - const std::string name_scope_{"conv_affine_channel_fuse"}; -}; - -class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { - public: - ConvEltwiseAddAffineChannelFusePass(); - virtual ~ConvEltwiseAddAffineChannelFusePass() {} - - protected: - void ApplyImpl(ir::Graph*) const override; - const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; -}; - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 313e1f2faea..f5f36d805b4 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -75,13 +75,11 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { void PaddlePassBuilder::ClearPasses() { passes_.clear(); } const std::vector kTRTSubgraphPasses({ - "conv_affine_channel_fuse_pass", // - "adaptive_pool2d_convert_global_pass", - "conv_eltwiseadd_affine_channel_fuse_pass", // - "shuffle_channel_detect_pass", // - "quant_conv2d_dequant_fuse_pass", // - "delete_quant_dequant_op_pass", // - "delete_quant_dequant_filter_op_pass", // + "adaptive_pool2d_convert_global_pass", + "shuffle_channel_detect_pass", // + "quant_conv2d_dequant_fuse_pass", // + "delete_quant_dequant_op_pass", // + "delete_quant_dequant_filter_op_pass", // // "fc_fuse_pass", // "simplify_with_basic_ops_pass", // "embedding_eltwise_layernorm_fuse_pass", // @@ -134,22 +132,20 @@ const std::vector kLiteSubgraphPasses({ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { passes_.assign({ // "identity_scale_op_clean_pass", // - "is_test_pass", // - "simplify_with_basic_ops_pass", // - "conv_affine_channel_fuse_pass", // - "conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_bn_fuse_pass", // - "conv_eltwiseadd_bn_fuse_pass", // - "embedding_eltwise_layernorm_fuse_pass", // - "multihead_matmul_fuse_pass_v2", // - "gpu_cpu_squeeze2_matmul_fuse_pass", // - "gpu_cpu_reshape2_matmul_fuse_pass", // - "gpu_cpu_flatten2_matmul_fuse_pass", // - "gpu_cpu_map_matmul_v2_to_mul_pass", // - "gpu_cpu_map_matmul_v2_to_matmul_pass", // - "gpu_cpu_map_matmul_to_mul_pass", // - "fc_fuse_pass", // - "fc_elementwise_layernorm_fuse_pass", // + "is_test_pass", // + "simplify_with_basic_ops_pass", // + "conv_bn_fuse_pass", // + "conv_eltwiseadd_bn_fuse_pass", // + "embedding_eltwise_layernorm_fuse_pass", // + "multihead_matmul_fuse_pass_v2", // + "gpu_cpu_squeeze2_matmul_fuse_pass", // + "gpu_cpu_reshape2_matmul_fuse_pass", // + "gpu_cpu_flatten2_matmul_fuse_pass", // + "gpu_cpu_map_matmul_v2_to_mul_pass", // + "gpu_cpu_map_matmul_v2_to_matmul_pass", // + "gpu_cpu_map_matmul_to_mul_pass", // + "fc_fuse_pass", // + "fc_elementwise_layernorm_fuse_pass", // #if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be // guaranteed at least v7 // cudnn8.0 has memory leak problem in conv + eltwise + act, so we @@ -236,14 +232,12 @@ void CpuPassStrategy::EnableMKLDNN() { passes_.insert(passes_.begin(), "mkldnn_placement_pass"); for (auto &pass : std::vector({ - "depthwise_conv_mkldnn_pass", // - "conv_bn_fuse_pass", // Execute BN passes again to - "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order - "conv_affine_channel_fuse_pass", // - 
"conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_transpose_bn_fuse_pass", // - "conv_transpose_eltwiseadd_bn_fuse_pass", // - "conv_bias_mkldnn_fuse_pass", // + "depthwise_conv_mkldnn_pass", // + "conv_bn_fuse_pass", // Execute BN passes again to + "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order + "conv_transpose_bn_fuse_pass", // + "conv_transpose_eltwiseadd_bn_fuse_pass", // + "conv_bias_mkldnn_fuse_pass", // "conv_transpose_bias_mkldnn_fuse_pass", // TODO(baoachun): Need to support 5-dimensional input. // "conv3d_bias_mkldnn_fuse_pass", // diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index d5bc2e6b530..9d9fbd39a57 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -426,9 +426,6 @@ class Quant2Int8MkldnnPass(object): graph = self._apply_pass(graph, 'depthwise_conv_mkldnn_pass') graph = self._apply_pass(graph, 'conv_bn_fuse_pass') graph = self._apply_pass(graph, 'conv_eltwiseadd_bn_fuse_pass') - graph = self._apply_pass(graph, 'conv_affine_channel_fuse_pass') - graph = self._apply_pass(graph, - 'conv_eltwiseadd_affine_channel_fuse_pass') graph = self._apply_pass(graph, 'conv_transpose_bn_fuse_pass') graph = self._apply_pass(graph, 'conv_transpose_eltwiseadd_bn_fuse_pass') diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py deleted file mode 100644 index 5afaf08eec3..00000000000 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from auto_scan_test import PassAutoScanTest, IgnoreReasons -from program_config import TensorConfig, ProgramConfig, OpConfig -import numpy as np -import paddle.inference as paddle_infer -from functools import partial -from typing import Optional, List, Callable, Dict, Any, Set -import unittest - -import hypothesis -from hypothesis import given, settings, seed, example, assume, reproduce_failure -import hypothesis.strategies as st - - -class TestConvAffineChannelFusePass(PassAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: - return True - - def sample_program_config(self, draw): - padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) - groups = draw(st.integers(min_value=1, max_value=3)) - data_format = draw(st.sampled_from(["NCHW", "NHWC"])) - axis = draw(st.sampled_from([1])) - filter_channel = draw(st.integers(min_value=1, max_value=16)) * 4 - filter_size = draw(st.integers(min_value=1, max_value=4)) - in_channel = groups * filter_channel - out_channel_factor = draw(st.integers(min_value=1, max_value=16)) * 4 - out_channel = groups * out_channel_factor - batch_size = draw(st.integers(min_value=1, max_value=4)) - dilations = draw( - st.lists( - st.integers( - min_value=1, max_value=2), min_size=2, max_size=2)) - paddings = draw( - st.lists( - st.integers( - min_value=0, max_value=2), min_size=2, max_size=2)) - strides = draw( - st.lists( - st.integers( - min_value=1, max_value=2), min_size=2, max_size=2)) - has_bias = draw(st.booleans()) - - x_shape = [ - batch_size, in_channel, 64, 64 - ] if data_format == "NCHW" else [batch_size, 64, 64, in_channel] - w_shape = [out_channel, filter_channel, filter_size, filter_size] - scale_shape = [out_channel] - bias_shape = [out_channel] - - def generate_input(): - return np.random.random(x_shape).astype(np.float32) - - def generate_weight(): - return np.random.random(w_shape).astype(np.float32) - - def generate_bias(): - return np.random.random(bias_shape).astype(np.float32) - - def generate_scale_bias(): - return np.random.random(bias_shape).astype(np.float32) - - conv2d_op = OpConfig( - "conv2d", - inputs={ - "Input": ["input_data"], - "Filter": ["conv2d_weight"], - }, - outputs={"Output": ["conv_output"]}, - data_format=data_format, - dilations=dilations, - padding_algorithm=padding_algorithm, - groups=groups, - paddings=paddings, - strides=strides, - has_bias=has_bias, - is_test=True) - ac_op = OpConfig( - "affine_channel", - inputs={ - "X": ["conv_output"], - "Scale": ["affine_channel_scale"], - "Bias": ["affine_channel_bias"] - }, - outputs={"Out": ["affine_channel_ouput"]}, - data_layout=data_format) - if has_bias == True: - conv2d_op.inputs["Bias"] = ["conv2d_bias"] - ops = [conv2d_op, ac_op] - - program_config = ProgramConfig( - ops=ops, - inputs={ - "input_data": TensorConfig(data_gen=partial(generate_input)), - }, - weights={ - "conv2d_weight": - TensorConfig(data_gen=partial(generate_weight)), - "affine_channel_scale": - TensorConfig(data_gen=partial(generate_scale_bias)), - "affine_channel_bias": - TensorConfig(data_gen=partial(generate_scale_bias)), - }, - outputs=["affine_channel_ouput"]) - if has_bias == True: - program_config.weights["conv2d_bias"] = TensorConfig( - data_gen=partial(generate_bias)) - return program_config - - def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_gpu=True) - yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) - - config = self.create_inference_config(use_mkldnn=True) - yield config, ['conv2d', 
'elementwise_add'], (1e-4, 1e-4) - - def add_ignore_pass_case(self): - # If the problem has been fixed, the judgment - # in is_program_valid needs to be deleted!!! - def teller1(program_config, predictor_config): - if program_config.ops[0].attrs['data_format'] == "NHWC": - return True - return False - - # mkldnn Output has diff with bias! - def teller2(program_config, predictor_config): - return predictor_config.mkldnn_enabled() and program_config.ops[ - 0].attrs['has_bias'] == True - - self.add_ignore_check_case( - teller1, IgnoreReasons.PASS_ACCURACY_ERROR, - "The output format of conv2d is wrong when data_format attribute is NHWC, \ - because currently its fused op (Conv2DFusion) only supports data format of channel first (NCHW)." - ) - - self.add_ignore_check_case( - teller2, IgnoreReasons.PASS_ACCURACY_ERROR, - "Currently mkldnn Output has diff with bias!") - - def test(self): - self.run_and_statis( - quant=False, - passes=["conv_affine_channel_fuse_pass"], ) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_affine_channel_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_affine_channel_fuse_pass.py deleted file mode 100644 index a8bfdb79ca1..00000000000 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_affine_channel_fuse_pass.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from auto_scan_test import PassAutoScanTest, IgnoreReasons -from program_config import TensorConfig, ProgramConfig, OpConfig -import numpy as np -import paddle.inference as paddle_infer -from functools import partial -from typing import Optional, List, Callable, Dict, Any, Set -import unittest - -import hypothesis -from hypothesis import given, settings, seed, example, assume -import hypothesis.strategies as st - - -class TestConvEltwiseAddAffineChannelFusePass(PassAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: - attrs = [ - program_config.ops[i].attrs - for i in range(len(program_config.ops)) - ] - - if attrs[0]['data_format'] == "NHWC" and attrs[1]['axis'] != 3: - return False - - return True - - def sample_program_config(self, draw): - padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) - groups = draw(st.integers(min_value=1, max_value=3)) - data_format = draw(st.sampled_from(["NCHW", "NHWC"])) - axis = draw(st.sampled_from([1])) - filter_channel = draw(st.integers(min_value=1, max_value=16)) * 4 - filter_size = draw(st.integers(min_value=1, max_value=4)) - in_channel = groups * filter_channel - out_channel_factor = draw(st.integers(min_value=1, max_value=16)) * 4 - out_channel = groups * out_channel_factor - batch_size = draw(st.integers(min_value=1, max_value=4)) - dilations = draw( - st.lists( - st.integers( - min_value=1, max_value=2), min_size=2, max_size=2)) - paddings = draw( - st.lists( - st.integers( - min_value=0, max_value=2), min_size=2, max_size=2)) - strides = draw( - st.lists( - st.integers( - min_value=1, max_value=2), min_size=2, max_size=2)) - has_bias = draw(st.booleans()) - - x_shape = [ - batch_size, in_channel, 64, 64 - ] if data_format == "NCHW" else [batch_size, 64, 64, in_channel] - w_shape = [out_channel, filter_channel, filter_size, filter_size] - scale_shape = [out_channel] - bias_shape = [out_channel] - - def generate_input(): - return np.random.random(x_shape).astype(np.float32) - - def generate_weight(): - return np.random.random(w_shape).astype(np.float32) - - def generate_bias(): - return np.random.random(bias_shape).astype(np.float32) - - def generate_scale_bias(): - return np.random.random(bias_shape).astype(np.float32) - - conv2d_op = OpConfig( - "conv2d", - inputs={ - "Input": ["input_data"], - "Filter": ["conv2d_weight"], - }, - outputs={"Output": ["conv_output"]}, - data_format=data_format, - dilations=dilations, - padding_algorithm=padding_algorithm, - groups=groups, - paddings=paddings, - strides=strides, - has_bias=has_bias, - is_test=True) - eltwise_op = OpConfig( - "elementwise_add", - inputs={"X": ["conv_output"], - "Y": ["conv2d_bias"]}, - outputs={"Out": ["elementwise_output"]}, - axis=axis) - ac_op = OpConfig( - "affine_channel", - inputs={ - "X": ["elementwise_output"], - "Scale": ["affine_channel_scale"], - "Bias": ["affine_channel_bias"] - }, - outputs={"Out": ["affine_channel_ouput"]}, - data_layout=data_format) - if has_bias == True: - conv2d_op.inputs["Bias"] = ["conv2d_bias"] - ops = [conv2d_op, eltwise_op, ac_op] - program_config = ProgramConfig( - ops=ops, - inputs={ - "input_data": TensorConfig(data_gen=partial(generate_input)), - }, - weights={ - "conv2d_weight": - TensorConfig(data_gen=partial(generate_weight)), - "conv2d_bias": TensorConfig(data_gen=partial(generate_bias)), - "affine_channel_scale": - TensorConfig(data_gen=partial(generate_scale_bias)), - "affine_channel_bias": - TensorConfig(data_gen=partial(generate_scale_bias)), - }, - 
outputs=["affine_channel_ouput"]) - return program_config - - def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_gpu=True) - yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) - - config = self.create_inference_config(use_mkldnn=True) - yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) - - # TRT - config = self.create_trt_inference_config() - config.enable_tensorrt_engine( - workspace_size=1 << 20, - max_batch_size=4, - min_subgraph_size=1, - precision_mode=paddle_infer.PrecisionType.Float32, - use_static=False, - use_calib_mode=False) - yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) - - def add_ignore_pass_case(self): - # If the problem has been fixed, the judgment - # in is_program_valid needs to be deleted!!! - def teller1(program_config, predictor_config): - if program_config.ops[0].attrs['data_format'] == "NHWC": - return True - return False - - # mkldnn Output has diff with bias! - def teller2(program_config, predictor_config): - return predictor_config.mkldnn_enabled() and program_config.ops[ - 0].attrs['has_bias'] == True - - self.add_ignore_check_case( - teller1, IgnoreReasons.PASS_ACCURACY_ERROR, - "The output format of conv2d is wrong when data_format attribute is NHWC, \ - it will trigger Broadcast dimension mismatch bug \ - when data_format attribute is NHWC and axis of eltwise op is 1 for this pass." - ) - - self.add_ignore_check_case( - teller2, IgnoreReasons.PASS_ACCURACY_ERROR, - "Currently mkldnn Output has diff with bias!") - - def test(self): - self.run_and_statis( - quant=False, - passes=["conv_eltwiseadd_affine_channel_fuse_pass"], ) - - -if __name__ == "__main__": - unittest.main() diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 4df27bfe4e9..7f8e516496f 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -958,7 +958,6 @@ FOURTH_HIGH_PARALLEL_JOB_NEW = [ 'test_dynamic_rnn_stop_gradient', 'test_raw_program_optimizer', 'test_pow', 'test_inplace_softmax_with_cross_entropy', 'test_transforms', 'test_unfold_op', 'test_assign_op', 'test_isinstance', - 'test_conv_affine_channel_fuse_pass', 'auto_growth_best_fit_allocator_facade_test', 'test_cholesky_op', 'test_adaptive_avg_pool3d', 'test_paddle_save_load_binary', 'test_fused_fc_elementwise_layernorm_op', 'test_sequence_enumerate_op', @@ -1873,7 +1872,6 @@ TETRAD_PARALLEL_JOB = [ 'test_dataloader_unkeep_order', 'test_parallel_executor_profiler', 'test_correlation', - 'test_conv_affine_channel_fuse_pass', 'test_ir_inplace_pass', 'test_moving_average_abs_max_scale_op', 'test_flatten_contiguous_range_op', diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 694283264ca..7356f0c8db0 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -578,7 +578,6 @@ STATIC_MODE_TESTING_LIST = [ 'test_ir_embedding_eltwise_layernorm_fuse_pass', 'test_ir_fc_fuse_pass', 'test_ir_skip_layernorm_pass', - 'test_conv_affine_channel_fuse_pass', 'test_conv_bias_mkldnn_fuse_pass', 'test_conv_bn_fuse_pass', 'test_conv_elementwise_add2_act_fuse_pass', -- GitLab From ce8ed978cbfce2e0fa503690d31d2e3244066b31 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Tue, 1 Mar 2022 16:11:28 +0800 Subject: [PATCH 022/272] [bf16] add bf16 kernel: layer_norm p_norm reduce_sum (#39843) * add layer norm * add p norm * add reduce sum * refine layer norm register bf16 for cudnn811 * add bf16 cast for hip * add unittest * refine rocm * refine 
layer_norm unittest * refine reduce op * refine unittest * enhance atol for reduce unittest --- paddle/fluid/operators/cast_op.cu | 4 - paddle/fluid/operators/layer_norm_kernel.cu.h | 6 +- paddle/fluid/operators/layer_norm_op.cu | 15 ++++ paddle/fluid/operators/p_norm_op.cu | 12 +++ .../reduce_ops/reduce_sum_op.part.cu | 1 + paddle/phi/kernels/gpu/cast_kernel.cu | 4 - paddle/phi/kernels/gpu/math_kernel.cu | 1 + paddle/phi/kernels/math_kernel.cc | 1 + .../paddle/fluid/tests/unittests/op_test.py | 2 +- .../tests/unittests/test_layer_norm_op.py | 47 ++++++++++++ .../fluid/tests/unittests/test_norm_all.py | 76 ++++++++++++++++++- .../fluid/tests/unittests/test_reduce_op.py | 33 +++++++- 12 files changed, 188 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index 5c7dd0e2561..eb51215790b 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -29,9 +29,5 @@ using CUDA = paddle::platform::CUDADeviceContext; ops::CastOpKernel>, \ ops::CastOpKernel>, ##__VA_ARGS__); -#if !defined(PADDLE_WITH_HIP) // See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc REGISTER_CAST_CUDA_BASE(transfer_dtype, ops::CastOpKernel) -#else -REGISTER_CAST_CUDA_BASE(transfer_dtype) -#endif diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index b31c7a1cde0..62c21dd2eee 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -474,11 +474,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( for (int it = 0; it < LDGS; it++) { #pragma unroll for (int jt = 0; jt < VecSize; jt++) { - U x_tmp = x[it][jt]; + U x_tmp = static_cast(x[it][jt]); U y_tmp = var_cur_row * (x_tmp - mean_cur_row); U dy_tmp = static_cast(gamma[it][jt]) * - static_cast(dout[it][jt]); // scale * dy - U dout_tmp = dout[it][jt]; // dy + static_cast(dout[it][jt]); // scale * dy + U dout_tmp = static_cast(dout[it][jt]); // dy // used for get dx (row reduction) sum_loss1 += dy_tmp; // scale * dy, sum_1 diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index d439b3220d9..dfe73d37271 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -259,6 +259,21 @@ REGISTER_OP_CUDA_KERNEL( ops::LayerNormGradKernel, ops::LayerNormGradKernel); +#elif CUDNN_VERSION_MIN(8, 1, 0) +REGISTER_OP_CUDA_KERNEL( + layer_norm, + ops::LayerNormKernel, + ops::LayerNormKernel, + ops::LayerNormKernel, + ops::LayerNormKernel); +REGISTER_OP_CUDA_KERNEL( + layer_norm_grad, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); #else REGISTER_OP_CUDA_KERNEL( layer_norm, diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index f2cb427a0a5..d0b78b9b064 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -39,6 +39,11 @@ __device__ __forceinline__ int sgn(T val) { __device__ __forceinline__ platform::float16 inline_abs(platform::float16 x) { return static_cast(abs(static_cast(x))); } + +__device__ __forceinline__ platform::bfloat16 inline_abs(platform::bfloat16 x) { + return static_cast(abs(static_cast(x))); +} + __device__ __forceinline__ float inline_abs(float x) { return abs(x); } __device__ __forceinline__ double inline_abs(double x) { return abs(x); } @@ -53,6 +58,11 @@ __device__ __forceinline__ 
platform::float16 inline_pow( return static_cast( pow(static_cast(base), static_cast(exponent))); } +__device__ __forceinline__ platform::bfloat16 inline_pow( + platform::bfloat16 base, platform::bfloat16 exponent) { + return static_cast( + pow(static_cast(base), static_cast(exponent))); +} __device__ __forceinline__ float inline_pow(float base, float exponent) { return pow(base, exponent); } @@ -202,9 +212,11 @@ using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(p_norm, ops::PnormCUDAKernel, + ops::PnormCUDAKernel, ops::PnormCUDAKernel, ops::PnormCUDAKernel); REGISTER_OP_CUDA_KERNEL( p_norm_grad, ops::PnormGradCUDAKernel, + ops::PnormGradCUDAKernel, ops::PnormGradCUDAKernel, ops::PnormGradCUDAKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu index c3d3e0cf6ec..2f6bf127518 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu @@ -23,6 +23,7 @@ REGISTER_OP_CUDA_KERNEL( reduce_sum_grad, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, + CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel>, CUDAReduceSumGradKernel>); diff --git a/paddle/phi/kernels/gpu/cast_kernel.cu b/paddle/phi/kernels/gpu/cast_kernel.cu index 7a6c99c5fe1..569a46f56d5 100644 --- a/paddle/phi/kernels/gpu/cast_kernel.cu +++ b/paddle/phi/kernels/gpu/cast_kernel.cu @@ -80,8 +80,4 @@ void CastKernel(const Context& dev_ctx, paddle::experimental::DataType::UNDEFINED); \ } -#if !defined(PADDLE_WITH_HIP) PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, phi::dtype::bfloat16) -#else -PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast) -#endif diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu index 56e8b16ccbe..fc73ccca6de 100644 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ b/paddle/phi/kernels/gpu/math_kernel.cu @@ -155,6 +155,7 @@ PD_REGISTER_KERNEL(sum_raw, float, double, float16, + bfloat16, int16_t, int, int64_t, diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/math_kernel.cc index 3cb7b66ddf7..480eb56c8b0 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/math_kernel.cc @@ -165,6 +165,7 @@ PD_REGISTER_KERNEL(sum, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int16_t, int, int64_t, diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 5694ef25c79..628791afef5 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -1140,7 +1140,7 @@ class OpTest(unittest.TestCase): else: atol = 2 else: - atol = 1e-2 + atol = 1e-1 if no_check_set is not None: if self.op_type not in no_check_set_white_list.no_check_set_white_list: diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py index 7dd310d2b88..ca9a489c749 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py @@ -375,6 +375,53 @@ class TestFP16ScaleBiasLayerNorm(unittest.TestCase): assert_equal(b_g_np_1, b_g_np_2) +class TestBF16ScaleBiasLayerNorm(unittest.TestCase): + def check_main(self, x_np, weight_np, bias_np, dtype): + paddle.disable_static() + + x = paddle.to_tensor(x_np) + weight = paddle.to_tensor(weight_np) + bias = paddle.to_tensor(bias_np) + + if dtype 
== "bfloat16": + x = x.cast(paddle.fluid.core.VarDesc.VarType.BF16) + + x.stop_gradient = False + weight.stop_gradient = False + bias.stop_gradient = False + + y = F.layer_norm(x, x.shape[1:], weight, bias) + x_g, w_g, b_g = paddle.grad(y, [x, weight, bias]) + + y_np = y.cast('float32').numpy() + x_g_np = x_g.cast('float32').numpy() + w_g_np = w_g.cast('float32').numpy() + b_g_np = b_g.cast('float32').numpy() + + paddle.enable_static() + return y_np, x_g_np, w_g_np, b_g_np + + def test_main(self): + if (not core.is_compiled_with_cuda()) or (core.cudnn_version() < 8100): + return + x_np = np.random.random([10, 20]).astype('float32') + weight_np = np.random.random([20]).astype('float32') + bias_np = np.random.random([20]).astype('float32') + + y_np_1, x_g_np_1, w_g_np_1, b_g_np_1 = self.check_main( + x_np, weight_np, bias_np, 'float32') + y_np_2, x_g_np_2, w_g_np_2, b_g_np_2 = self.check_main( + x_np, weight_np, bias_np, 'bfloat16') + + def assert_equal(x, y): + self.assertTrue(np.allclose(x, y, atol=1.e-1)) + + assert_equal(y_np_1, y_np_2) + assert_equal(x_g_np_1, x_g_np_2) + assert_equal(w_g_np_1, w_g_np_2) + assert_equal(b_g_np_1, b_g_np_2) + + class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase): def test_main(self): self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py index b20305b78ef..575bc653618 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_all.py +++ b/python/paddle/fluid/tests/unittests/test_norm_all.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid import paddle.fluid.core as core @@ -282,6 +282,80 @@ class TestPnormOpFP161(TestPnormOpFP16): self.asvector = True +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestPnormBF16Op(OpTest): + def setUp(self): + self.op_type = "p_norm" + self.init_test_case() + self.x = (np.random.random(self.shape) + 0.5).astype(np.float32) + self.norm = p_norm(self.x, self.axis, self.porder, self.keepdim, + self.asvector) + self.gradient = self.calc_gradient() + self.inputs = {'X': convert_float_to_uint16(self.x)} + self.attrs = { + 'epsilon': self.epsilon, + 'axis': self.axis, + 'keepdim': self.keepdim, + 'porder': float(self.porder), + 'asvector': self.asvector + } + self.outputs = {'Out': convert_float_to_uint16(self.norm)} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-3) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', user_defined_grads=self.gradient) + + def init_test_case(self): + self.shape = [2, 3, 4, 5] + self.axis = 1 + self.epsilon = 1e-12 + self.porder = 2.0 + self.keepdim = False + self.dtype = np.uint16 + self.asvector = False + + def calc_gradient(self): + self.attrs = { + 'epsilon': self.epsilon, + 'axis': self.axis, + 'keepdim': self.keepdim, + 'porder': float(self.porder), + 'asvector': self.asvector + } + x = self.x + porder = self.attrs["porder"] + axis = self.attrs["axis"] + asvector = self.attrs["asvector"] + x_dtype = x.dtype + x = x.astype(np.float32) if x.dtype == np.float16 else x + if porder == 0: + grad = np.zeros(x.shape).astype(x.dtype) + elif porder in [float("inf"), float("-inf")]: + norm = p_norm( + x, axis=axis, porder=porder, 
keepdims=True, reduce_all=asvector) + x_abs = np.abs(x) + grad = np.sign(x) + grad[x_abs != norm] = 0.0 + else: + norm = p_norm( + x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector) + grad = np.power(norm, 1 - porder) * np.power( + np.abs(x), porder - 1) * np.sign(x) + + numel = 1 + for s in x.shape: + numel *= s + divisor = numel if asvector else x.shape[axis] + numel /= divisor + return [grad.astype(x_dtype) * 1 / numel] + + def run_fro(self, p, axis, shape_x, dtype, keep_dim, check_dim=False): with fluid.program_guard(fluid.Program()): data = fluid.data(name="X", shape=shape_x, dtype=dtype) diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index faa67e1d6da..d246356b4ec 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest, skip_check_grad_ci +from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 import paddle import paddle.fluid.core as core import paddle.fluid as fluid @@ -61,6 +61,37 @@ class TestSumOp_fp16(OpTest): self.check_grad(['X'], 'Out', user_defined_grads=self.gradient) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSumOp_bf16(OpTest): + def setUp(self): + np.random.seed(100) + self.op_type = "reduce_sum" + self.dtype = np.uint16 + self.x = np.random.uniform(0, 0.1, (2, 5, 10)).astype(np.float32) + self.attrs = {'dim': [0, 1, 2]} + self.out = self.x.sum(axis=tuple(self.attrs['dim'])) + self.gradient = self.calc_gradient() + + self.inputs = {'X': convert_float_to_uint16(self.x)} + self.outputs = {'Out': convert_float_to_uint16(self.out)} + self.gradient = self.calc_gradient() + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', user_defined_grads=self.gradient) + + def calc_gradient(self): + x = self.x + grad = np.ones(x.shape, dtype=x.dtype) + return [grad] + + class TestSumOp_fp16_withInt(OpTest): def setUp(self): self.op_type = "reduce_sum" -- GitLab From eb7c211a762c0961915c0f9a5d7b0010cd2746e2 Mon Sep 17 00:00:00 2001 From: "joanna.wozna.intel" Date: Tue, 1 Mar 2022 11:33:10 +0100 Subject: [PATCH 023/272] Add mobilenetv3_large performance test for bf16 and int8 (#39738) * Add mobilenetv3_large performance test * Disable the BF16 test if the device does not support BF16 computations * Change test timeout --- .../fluid/inference/tests/api/CMakeLists.txt | 29 ++++++++++++++++++ ...er_bfloat16_image_classification_tester.cc | 15 ++++++++-- ...alyzer_int8_image_classification_tester.cc | 7 ++++- .../fluid/inference/tests/api/tester_helper.h | 30 +++++++++++-------- .../fluid/contrib/slim/tests/CMakeLists.txt | 11 +++++-- 5 files changed, 75 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 37214534f3c..0281fd91765 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -453,6 +453,23 @@ if(WITH_MKLDNN) download_int8_data_without_verify(${INT8_GOOGLENET_MODEL_DIR} "GoogleNet_int8_model.tar.gz" ) inference_analysis_api_int8_test_run_custom_warmup_batch_size(test_analyzer_int8_googlenet ${INT8_IMG_CLASS_TEST_APP} 
${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH} 10) + # mobilenetv3_large_x1_0 int8 + set(INT8_MOBILENETV3_LARGE_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv3_large") + set(INT8_MOBILENETV3_FILE_NAME "MobileNetV3_large_x1_0_infer.tar") + if (NOT EXISTS ${INT8_MOBILENETV3_LARGE_MODEL_DIR}/${INT8_MOBILENETV3_FILE_NAME}) + inference_download_and_uncompress_without_verify(${INT8_MOBILENETV3_LARGE_MODEL_DIR} "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/" ${INT8_MOBILENETV3_FILE_NAME}) + endif() + inference_analysis_test_run(test_analyzer_int8_mobilenetv3_large + COMMAND ${INT8_IMG_CLASS_TEST_APP} + ARGS --infer_model=${INT8_MOBILENETV3_LARGE_MODEL_DIR}/MobileNetV3_large_x1_0_infer + --infer_data=${IMAGENET_DATA_PATH} + --warmup_batch_size=50 + --batch_size=1 + --enable_int8=true + --cpu_num_threads=${CPU_NUM_THREADS_ON_CI} + --iterations=100 + --with_accuracy_layer=false) + ### BFLOAT16 tests # build test binary to be used in subsequent tests @@ -472,6 +489,17 @@ if(WITH_MKLDNN) # mobilenetv2 bfloat16 inference_analysis_api_bfloat16_test_run(test_analyzer_bfloat16_mobilenetv2 ${BF16_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV2_MODEL_DIR} ${IMAGENET_DATA_PATH}) + # mobilenetv3_large + inference_analysis_test_run(test_analyzer_bfloat16_mobilenetv3_large + COMMAND ${BF16_IMG_CLASS_TEST_APP} + ARGS --infer_model=${INT8_MOBILENETV3_LARGE_MODEL_DIR}/MobileNetV3_large_x1_0_infer + --infer_data=${IMAGENET_DATA_PATH} + --batch_size=1 + --enable_bf16=true + --paddle_num_threads=${CPU_NUM_THREADS_ON_CI} + --iterations=100 + --with_accuracy_layer=false) + ### Object detection models set(PASCALVOC_DATA_PATH "${INT8_DATA_DIR}/pascalvoc_val_head_300.bin") set(INT8_OBJ_DETECT_TEST_APP "test_analyzer_int8_object_detection") @@ -739,6 +767,7 @@ if(WITH_MKLDNN) set_tests_properties(test_analyzer_quant_performance_benchmark PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_int8_mobilenetv2 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_int8_mobilenetv1 PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_int8_mobilenetv3_large PROPERTIES TIMEOUT 120) endif() set_tests_properties(lite_resnet50_test PROPERTIES TIMEOUT 120) diff --git a/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc index 3b16b0d34fd..f267f0f28d6 100644 --- a/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc @@ -14,13 +14,19 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace inference { namespace analysis { void SetConfig(AnalysisConfig *cfg) { - cfg->SetModel(FLAGS_infer_model); + std::ifstream model_file(FLAGS_infer_model + "/__model__"); + if (model_file.good()) + cfg->SetModel(FLAGS_infer_model); + else + cfg->SetModel(FLAGS_infer_model + "/inference.pdmodel", + FLAGS_infer_model + "/inference.pdiparams"); cfg->DisableGpu(); cfg->SwitchIrOptim(); cfg->SwitchSpecifyInputNames(); @@ -38,7 +44,12 @@ TEST(Analyzer_bfloat16_image_classification, bfloat16) { // read data from file and prepare batches with test data std::vector> input_slots_all; SetInputs(&input_slots_all); - b_cfg.EnableMkldnnBfloat16(); + if (FLAGS_enable_bf16 && + platform::MayIUse(platform::cpu_isa_t::avx512_bf16)) { + b_cfg.EnableMkldnnBfloat16(); + } else { + FLAGS_enable_bf16 = false; + } CompareBFloat16AndAnalysis(&cfg, &b_cfg, input_slots_all); } diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc index 8f8b7304423..b07163b518b 100644 --- a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc @@ -22,7 +22,12 @@ namespace inference { namespace analysis { void SetConfig(AnalysisConfig *cfg) { - cfg->SetModel(FLAGS_infer_model); + std::ifstream model_file(FLAGS_infer_model + "/__model__"); + if (model_file.good()) + cfg->SetModel(FLAGS_infer_model); + else + cfg->SetModel(FLAGS_infer_model + "/inference.pdmodel", + FLAGS_infer_model + "/inference.pdiparams"); cfg->DisableGpu(); cfg->SwitchIrOptim(); cfg->SwitchSpecifyInputNames(); diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 637fa16e31b..e63dfd14175 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -213,15 +213,15 @@ std::shared_ptr> GetWarmupData( element_in_batch * 3 * 224 * 224, 3 * 224 * 224, static_cast(images.data.data()) + i * 3 * 224 * 224); - - std::copy_n(static_cast(test_data[batch][1].data.data()) + - element_in_batch, - 1, static_cast(labels.data.data()) + i); + if (FLAGS_with_accuracy_layer) + std::copy_n(static_cast(test_data[batch][1].data.data()) + + element_in_batch, + 1, static_cast(labels.data.data()) + i); } - - auto warmup_data = std::make_shared>(2); + auto warmup_data = std::make_shared>( + FLAGS_with_accuracy_layer ? 
2 : 1); (*warmup_data)[0] = std::move(images); - (*warmup_data)[1] = std::move(labels); + if (FLAGS_with_accuracy_layer) (*warmup_data)[1] = std::move(labels); return warmup_data; } @@ -254,9 +254,13 @@ void SetInputs(std::vector> *inputs, } for (auto i = 0; i < iterations; i++) { auto images = image_reader.NextBatch(); - auto labels = label_reader.NextBatch(); - inputs->emplace_back( - std::vector{std::move(images), std::move(labels)}); + std::vector tmp_vec; + tmp_vec.push_back(std::move(images)); + if (FLAGS_with_accuracy_layer) { + auto labels = label_reader.NextBatch(); + tmp_vec.push_back(std::move(labels)); + } + inputs->push_back(std::move(tmp_vec)); } } @@ -825,7 +829,8 @@ void CompareQuantizedAndAnalysis( SummarizePerformance("FP32", sample_latency_fp32, "INT8", sample_latency_int8); - CompareAccuracy(quantized_outputs, analysis_outputs, compared_idx); + if (FLAGS_with_accuracy_layer) + CompareAccuracy(quantized_outputs, analysis_outputs, compared_idx); } void CompareBFloat16AndAnalysis( @@ -864,7 +869,8 @@ void CompareBFloat16AndAnalysis( SummarizePerformance("FP32", sample_latency_fp32, "BF16", sample_latency_bf16); - CompareAccuracy(bf16_outputs, analysis_outputs, compared_idx); + if (FLAGS_with_accuracy_layer) + CompareAccuracy(bf16_outputs, analysis_outputs, compared_idx); } void CompareAnalysisAndAnalysis( diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 494ea969797..f75a0fa50a5 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -25,6 +25,12 @@ function(inference_analysis_python_api_int8_test_mkldnn target model_dir data_pa _inference_analysis_python_api_int8_test(${target} ${model_dir} ${data_path} ${filename} True) endfunction() +function(download_data install_dir url data_file check_sum) + if (NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress(${install_dir} ${url} ${data_file} ${check_sum}) + endif() +endfunction() + function(download_quant_data install_dir data_file check_sum) if (NOT EXISTS ${install_dir}/${data_file}) inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file} ${check_sum}) @@ -290,8 +296,9 @@ if(LINUX AND WITH_MKLDNN) ### PTQ INT8 # PTQ int8 lstm model - set(LSTM_DATA_ARCHIVE "unittest_model_data/quant_lstm_input_data.tar.gz") - download_quant_data(${QUANT2_INT8_LSTM_SAVE_PATH} ${LSTM_DATA_ARCHIVE} add84c754e9b792fea1fbd728d134ab7) + set(LSTM_DATA_FILE "quant_lstm_input_data.tar.gz") + set(LSTM_URL "${INFERENCE_URL}/int8/unittest_model_data") + download_data(${QUANT2_INT8_LSTM_SAVE_PATH} ${LSTM_URL} ${LSTM_DATA_FILE} add84c754e9b792fea1fbd728d134ab7) set(QUANT2_FP32_LSTM_MODEL_ARCHIVE "lstm_fp32_model.tar.gz") download_lstm_model(${QUANT2_INT8_LSTM_SAVE_PATH} ${QUANT2_FP32_LSTM_MODEL_ARCHIVE} eecd9f44d69a84acc1cf2235c4b8b743) inference_quant2_int8_lstm_model_test(test_quant2_int8_lstm_mkldnn ${QUANT2_INT8_LSTM_SAVE_PATH}/lstm_fp32_model ${QUANT2_LSTM_MODEL_DIR}/lstm_quant ${QUANT2_INT8_LSTM_SAVE_PATH}/quant_lstm_input_data) -- GitLab From 2592805ba0bc121bef82331214cd5d233c08d636 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 1 Mar 2022 18:46:16 +0800 Subject: [PATCH 024/272] Fixed auto codegen for intermediate tensors (#39797) * Refactored GradNodeAccumulation data structure and behaviour * Fixed CI issues * Fix compilation issues * Fixed minor issues * Reverted changes for intermediate and OverwriteOutput * fixed minor issue * Fixed 
auto codegen for intermediate tensors * Removed restriction on AccumulationNode modification * Fixed CI Coverage issues * Adjusted Log contents * Fixed CI issues --- paddle/fluid/eager/api/utils/hook_utils.cc | 63 +++++++++---------- .../auto_code_generator/eager_generator.cc | 25 +++++--- paddle/fluid/eager/backward.cc | 7 ++- paddle/fluid/eager/grad_node_info.cc | 2 +- paddle/fluid/eager/utils.cc | 9 +++ 5 files changed, 58 insertions(+), 48 deletions(-) diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc index c7927716300..9abd7be49d4 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.cc +++ b/paddle/fluid/eager/api/utils/hook_utils.cc @@ -52,49 +52,44 @@ void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, } } -static void RetainGradForRegularNode( - const paddle::experimental::Tensor& tensor) { - AutogradMeta* meta = EagerUtils::unsafe_autograd_meta(tensor); - if (meta->RetainGrads()) { +void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { + if (IsLeafTensor(tensor)) { + // Leaf tensor's grad will always be retained + // Refer to implementation of AccumulationNode for more details return; } else { - meta->SetRetainGrads(true); - } + AutogradMeta* meta = EagerUtils::unsafe_autograd_meta(tensor); + if (meta->RetainGrads()) { + return; + } else { + meta->SetRetainGrads(true); + } - std::weak_ptr weak_grad_tensor = - meta->WeakGrad(); + std::weak_ptr weak_grad_tensor = + meta->WeakGrad(); - // Define Hook - auto hook = [weak_grad_tensor](const paddle::experimental::Tensor& t) { - if (!weak_grad_tensor.expired()) { - auto grad_tensor = weak_grad_tensor.lock(); - if (t.defined()) { - VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name(); - // Simply Copy impl() to grad_tensor - grad_tensor->set_impl(t.impl()); - return *grad_tensor.get(); + // Define Hook + auto hook = [weak_grad_tensor](const paddle::experimental::Tensor& t) { + if (!weak_grad_tensor.expired()) { + auto grad_tensor = weak_grad_tensor.lock(); + if (t.defined()) { + VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name(); + // Simply Copy impl() to grad_tensor + grad_tensor->set_impl(t.impl()); + return *grad_tensor.get(); + } else { + VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; + return paddle::experimental::Tensor(); + } } else { VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; return paddle::experimental::Tensor(); } - } else { - VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; - return paddle::experimental::Tensor(); - } - }; + }; - // Append to GradientHooks - RegisterGradientHookForTensor(tensor, - std::make_shared(hook)); -} - -void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { - if (IsLeafTensor(tensor)) { - // Leaf tensor's grad will always be retained - // Refer to implementation of AccumulationNode for more details - return; - } else { - RetainGradForRegularNode(tensor); + // Append to GradientHooks + RegisterGradientHookForTensor(tensor, + std::make_shared(hook)); } } diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index a8e0ed7a41a..102fad56373 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1156,11 +1156,13 @@ static std::string GenerateGradNodeCreationContent( grad_node_creation_str += paddle::string::Sprintf( SET_OUT_RANK_TEMPLATE, 
output_autograd_name, output_position); - const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); - + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; + grad_node_creation_str += + paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + } const char* SET_GRAD_IN_META_TEMPLATE = " grad_node->SetGradInMeta(&%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( @@ -1173,17 +1175,20 @@ static std::string GenerateGradNodeCreationContent( grad_node_creation_str += paddle::string::Sprintf( SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); - const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(%s, grad_node);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); - + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " egr::EagerUtils::SetHistory(%s, grad_node);\n"; + grad_node_creation_str += + paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + } const char* SET_GRAD_IN_META_TEMPLATE = " grad_node->SetGradInMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position); } + // Intermediate Tensor does not require CheckAndRetainGrad if (!output.intermediate()) { VLOG(6) << "Generated Call RetainGradForTensor"; const char* RETAIN_GRAD_TEMPLATE = diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 7073ca8f052..356fdcaf054 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -221,10 +221,11 @@ void RunBackward(const std::vector& tensors, << " 's name is: " << grad_output_tensor.name(); auto* next_node = next_node_shared.get(); - if (!node_input_buffers_dict.count(next_node)) { - node_input_buffers_dict[next_node] = - std::make_unique(next_node->InputMeta()); + const auto& input_meta = next_node->InputMeta(); + auto grad_tensor_holder = + std::make_unique(input_meta); + node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); } VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first << ", rank: " << edge_rank.second; diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 35416281f18..b1189106b8f 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -244,7 +244,7 @@ GradNodeBase::ApplyGradientHooks( if (!out.defined() || !out.initialized()) { out = (*hook)(tensors[slot_id][rank]); } else { - // If more than one hook is registered, the input to the next hook func + // If more than one hook is registered, the input to the next hook func // should be the output of the previous hook out = (*hook)(out); } diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index a7e5931f1f9..39861c80522 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -122,12 +122,21 @@ paddle::experimental::Tensor* EagerUtils::mutable_grad( void EagerUtils::SetHistory(std::vector* autograd_metas, const std::shared_ptr& grad_node) { for (const auto& autograd_meta : *autograd_metas) { + if (dynamic_cast(autograd_meta->GradNode())) { + VLOG(6) << "Warning: Reseting GradNodeAccumulation for leaf tensor is " + "detected"; + } 
autograd_meta->SetGradNode(grad_node); } } void EagerUtils::SetHistory(AutogradMeta* autograd_meta, const std::shared_ptr& grad_node) { + if (dynamic_cast(autograd_meta->GradNode())) { + VLOG(6) + << "Warning: Reseting GradNodeAccumulation for leaf tensor is detected"; + } + autograd_meta->SetGradNode(grad_node); } -- GitLab From 255bf609e5d9289dfc6d5122e7fda746c933b6e2 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Tue, 1 Mar 2022 18:48:02 +0800 Subject: [PATCH 025/272] Add function description for Kernel Primitive API (#39884) * Add function description for Kernel Primitive API 1. Set cumsum and sort share memory size = 1024 2.sort and cumsum api limitation : blockDim.x must be less than 512 (blockDim.x <= 512) --- .../kernels/primitive/compute_primitives.h | 284 +++++++++++++----- .../primitive/compute_primitives_xpu2.h | 23 ++ .../kernels/primitive/datamover_primitives.h | 32 ++ .../primitive/datamover_primitives_xpu2.h | 41 +++ 4 files changed, 311 insertions(+), 69 deletions(-) diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index 4f3c069f3b2..19427551fb3 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h +++ b/paddle/phi/kernels/primitive/compute_primitives.h @@ -136,7 +136,9 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { return shared_memory[threadIdx.x]; } -// Swap data +/** + * @brief Swap data + */ template __device__ __forceinline__ void Swap(T* first_value, T* second_value) { T t_value; @@ -145,7 +147,9 @@ __device__ __forceinline__ void Swap(T* first_value, T* second_value) { (*second_value) = t_value; } -// swap with monotonic_type +/** + * @brief Swap data according to monotonic_type. + */ template __device__ __forceinline__ void Comparator(T* first_value, T* second_value, @@ -155,6 +159,9 @@ __device__ __forceinline__ void Comparator(T* first_value, } } +/** + * @brief Swap data and data index according to monotonic_type. + */ template __device__ __forceinline__ void ComparatorWithIndex(T* first_value, @@ -170,6 +177,18 @@ __device__ __forceinline__ void ComparatorWithIndex(T* first_value, } } +/** + * @brief get the last pow of 2 + */ +__device__ inline int GetLastPow2(int n) { + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(1, n - (n >> 1)); +} + } // namespace details /** @@ -453,6 +472,29 @@ __device__ __forceinline__ void Reduce(T* out, } } +/* +* @brief Fill register with a constant according to OpFunc +* +* @template paraments +* InT: The data type of in1 and in2. +* OutT: The data type of out. +* NX: The number of data columns loaded by each thread. +* NY: The number of data rows loaded by each thread. +* BlockSize: Identifies the current device thread index method. Currently only +* GPU was supported. +* OpFunc: Compute functor which has an operator() as following +* template +* struct XxxFunctor { +* HOSTDEVICE InT operator()() +* const { +* return a; +* } +* }; +* +* @param +* out: The register pointer of out, the size is NX * NY. +* compute: Compute function which was declared like OpFunc(). +*/ template +* struct XxxFunctor { +* HOSTDEVICE InT operator()(StateType state) +* const { +* return ranomd(state); // Returns ReturnsCount random numbers with +* data type T +* } +* }; +* +* @param +* out: The register pointer of out, the size is NX * NY. +* compute: Compute function which was declared like OpFunc(). 
+*/ + template +/* +* @brief Complete the prefix and in the block, each thread calculates 2 data, +* the size of out and in is 2, and BlockDim.x must be less then 512. +* +* @template paraments +* InT: the type of input register. +* OutT: the type of out register. +* BlockSize: Identifies the current device thread index method. Currently only +* GPU was supported. +* OpFunc: Compute functor which has an operator() as following +* template +* struct XxxFunctor { +* HOSTDEVICE InT operator()(T a, T b) +* const { +* return a + b; +* } +* }; +* +* @param +* out: The register pointer of out, the size is 2; +* in: The register pointer of input, the size is 2; +* compute: Compute function which was declared like OpFunc(). +*/ + +#define SHARED_SIZE_LIMIT 512 +template __device__ __forceinline__ void Cumsum(OutT* out, const InT* in, OpFunc compute) { - __shared__ InT temp[shared_size * 2 + (shared_size * 2) / 32]; + constexpr int kSize = SHARED_SIZE_LIMIT * 2 + (SHARED_SIZE_LIMIT * 2) / 32; + __shared__ InT temp[kSize]; + int stride_size = blockDim.x; int tidx = threadIdx.x; temp[tidx + tidx / 32] = in[0]; - temp[shared_size + tidx + (shared_size + tidx) / 32] = in[1]; - for (int stride = 1; stride <= blockDim.x; stride *= 2) { + temp[stride_size + tidx + (stride_size + tidx) / 32] = in[1]; + for (int stride = 1; stride <= stride_size; stride *= 2) { __syncthreads(); int index = (tidx + 1) * 2 * stride - 1; if (index < (blockDim.x * 2)) { - temp[index + index / 32] += temp[index - stride + (index - stride) / 32]; + temp[index + index / 32] = + compute(temp[index + index / 2], + temp[index - stride + (index - stride) / 32]); } } for (int stride = (blockDim.x * 2) / 4; stride > 0; stride /= 2) { __syncthreads(); int index = (tidx + 1) * 2 * stride - 1; if ((index + stride) < (blockDim.x * 2)) { - temp[index + stride + (stride + index) / 32] += - temp[index + (index) / 32]; + temp[index + stride + (stride + index) / 32] = + compute(temp[index + stride + (stride + index) / 32], + temp[index + (index) / 32]); } } __syncthreads(); out[0] = static_cast(temp[tidx + tidx / 32]); out[1] = - static_cast(temp[tidx + shared_size + (tidx + shared_size) / 32]); + static_cast(temp[tidx + stride_size + (tidx + stride_size) / 32]); } - -#define SHARED_SIZE_LIMIT \ - 1024 // each thread load 2 data from global memory so SHARED_SIZE_LIMIT must - // larger than blockDim.x * 2 -// if monotonic_type = 1 then increase -// if gridDim.x > 1 please set monotonic_type = blockIdx.x & 1; blockIdx.x % 2 -// == 1 the increase -template -__device__ __forceinline__ void Sort(T* dst, - const T* src_data, +#undef SHARED_SIZE_LIMIT + +/* +* @brief Sort data in this block, each thread calculates 2 data, the size of out +* and in is 2, and BlockDim.x must be less then 512. +* +* @template paraments +* InT: the type of input register. +* OutT: the type of out register. +* BlockSize: Identifies the current device thread index method. Currently only +* GPU was supported. +* +* @param +* out: The register pointer of out, the size is 2. +* in: The register pointer of input, the size is 2. +* num: The num of this block +* monotonic_type: if monotonic_type = 1 then sorted in ascending order, eles +* sorted in escending. 
+*/ +#define SHARED_SIZE_LIMIT 1024 +// each thread load 2 data from global memory so SHARED_SIZE_LIMIT must +// larger than blockDim.x * 2 +template +__device__ __forceinline__ void Sort(OutT* out, + const InT* in, int num, int monotonic_type) { - // todo: set num = Pow2(num) + int upper_bound = blockDim.x; + // update upper_bound + upper_bound = std::min(details::GetLastPow2(num), upper_bound); // shareMem for value and index num must smaller than SHARED_SIZE_LIMIT / 2 - __shared__ T value[SHARED_SIZE_LIMIT]; // shareMem's size must larger than - // blockDim * 2 - // Copy value and index from src and src_index - value[threadIdx.x] = src_data[0]; - value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = src_data[1]; + __shared__ InT value[SHARED_SIZE_LIMIT]; + int stride_size = blockDim.x; + // shareMem's size must larger than blockDim * 2 + // Copy value from in + value[threadIdx.x] = in[0]; + value[threadIdx.x + stride_size] = in[1]; // make bitonicSort - for (int size = 2; size < num; size <<= 1) { + for (int size = 2; size < upper_bound; size <<= 1) { int bitonic_type = (threadIdx.x & (size / 2)) != 0; for (int stride = size / 2; stride > 0; stride >>= 1) { __syncthreads(); int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - details::Comparator(&value[pos], &value[pos + stride], bitonic_type); + details::Comparator(&value[pos], &value[pos + stride], bitonic_type); } } // last sort - for (int stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) { + for (int stride = stride_size; stride > 0; stride >>= 1) { __syncthreads(); int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); // last sort when monotonic_type = 1 then increase - details::Comparator(&value[pos], &value[pos + stride], monotonic_type); + details::Comparator(&value[pos], &value[pos + stride], monotonic_type); } __syncthreads(); - dst[0] = value[threadIdx.x]; - dst[1] = value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + out[0] = static_cast(value[threadIdx.x]); + out[1] = static_cast(value[threadIdx.x + stride_size]); } -template -__device__ __forceinline__ void Sort(T* dst, - IndexType* dst_index, - const T* src_data, - IndexType* src_index, +/* +* @brief Sort data with data_index in this block, each thread calculates 2 data, +* the size of out and in is 2, and BlockDim.x must be less then 512. +* +* @template paraments +* InT: The type of input register. +* OutT: The type of out register. +* IndexType: The type of index. +* BlockSize: Identifies the current device thread index method. Currently only +* GPU was supported. +* +* @param +* out: The register pointer of out, the size is 2. +* out_index: The register pointer of out_index, the size is 2. +* in: The register pointer of input, the size is 2. +* in_index: The register pointer of in_index, the size is 2. +* num: The num of this block. +* monotonic_type: if monotonic_type = 1 then sorted in ascending order, eles +* sorted in escending. 
+*/ +template +__device__ __forceinline__ void Sort(OutT* out, + IndexType* out_index, + const InT* in, + IndexType* in_index, int num, int monotonic_type) { - // todo: set num = Pow2(num) + int upper_bound = blockDim.x; + // update upper_bound + upper_bound = std::min(details::GetLastPow2(num), upper_bound); // shareMem for value and index num must smaller than SHARED_SIZE_LIMIT / 2 - __shared__ T value[SHARED_SIZE_LIMIT]; // shareMem's size must larger than - // blockDim * 2 + __shared__ InT value[SHARED_SIZE_LIMIT]; + // shareMem's size must larger than blockDim * 2 __shared__ IndexType index[SHARED_SIZE_LIMIT]; - // Copy value and index from src and src_index - value[threadIdx.x] = src_data[0]; - value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = src_data[1]; + // Copy value and index from in and in_index + int stride_size = blockDim.x; + value[threadIdx.x] = in[0]; + value[threadIdx.x + stride_size] = in[1]; // index - index[threadIdx.x] = src_index[0]; - index[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = src_index[1]; + index[threadIdx.x] = in_index[0]; + index[threadIdx.x + stride_size] = in_index[1]; // make bitonicSort - for (int size = 2; size < num; size <<= 1) { + for (int size = 2; size < upper_bound; size <<= 1) { int bitonic_type = (threadIdx.x & (size / 2)) != 0; for (int stride = size / 2; stride > 0; stride >>= 1) { __syncthreads(); int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - details::ComparatorWithIndex(&value[pos], - &value[pos + stride], - &index[pos], - &index[pos + stride], - bitonic_type); + details::ComparatorWithIndex(&value[pos], + &value[pos + stride], + &index[pos], + &index[pos + stride], + bitonic_type); } } - for (int stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) { + for (int stride = stride_size; stride > 0; stride >>= 1) { __syncthreads(); int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); // last sort when monotonic_type = 1 then increase - details::ComparatorWithIndex(&value[pos], - &value[pos + stride], - &index[pos], - &index[pos + stride], - monotonic_type); + details::ComparatorWithIndex(&value[pos], + &value[pos + stride], + &index[pos], + &index[pos + stride], + monotonic_type); } __syncthreads(); - dst[0] = value[threadIdx.x]; - dst[1] = value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; - dst_index[0] = index[threadIdx.x]; - dst_index[1] = index[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + out[0] = static_cast(value[threadIdx.x]); + out[1] = static_cast(value[threadIdx.x + stride_size]); + out_index[0] = index[threadIdx.x]; + out_index[1] = index[threadIdx.x + stride_size]; +} + +template +HOSTDEVICE __forceinline__ void OperatorTernary( + OutT* out, const T1* in1, const T2* in2, OpFunc func, int num) { + func(out, in1, in2, num); +} + +template +HOSTDEVICE __forceinline__ void OperatorBinary(OutT* out, + const InT* in, + OpFunc func, + int num) { + func(out, in, num); } } // namespace kps diff --git a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h index a445f4a02ea..1f4ef2ed932 100644 --- a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h @@ -348,6 +348,29 @@ __device__ __forceinline__ void Reduce(T* out, } } +/* +* @brief Fill register with a constant according to OpFunc +* +* @template paraments +* InT: The data type of in1 and in2. +* OutT: The data type of out. +* NX: The number of data columns loaded by each thread. +* NY: The number of data rows loaded by each thread. 
+* BlockSize: Identifies the current device thread index method. For xpu, +* core_id() is used as the index. +* OpFunc: Compute functor which has an operator() as following +* template +* struct XxxFunctor { +* HOSTDEVICE InT operator()() +* const { +* return a; +* } +* }; +* +* @param +* out: The register pointer of out, the size is NX * NY. +* compute: Compute function which was declared like OpFunc(). +*/ template or std::tuple + * Index: The index of data stored in dst. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index. Currently only GPU was supported. + * IsBoundary: Whether to make an out-of-bounds judgment on access to memory. + * When the number of data processed by this block is less than + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access + * crossing the boundary. + * + * @param: + * dst: The register pointer of the thread, the size is NX * NY. + * src: The data pointer of the current block. + * size: The current block needs to load size data continuously. */ template __device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) { int thread_offset = block_offset + threadIdx.x * NX; diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h index 75b2dbaf7e6..53a8b7d0c9e 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h @@ -244,6 +244,24 @@ __device__ __inline__ void ReadData(T* dst, /** * @brief Read 1D data from global memory to register. The difference * from the above function is that it supports different data types of inputs. + * + * @template paraments + * T: The type of data. + * NX: Each thread load NX data from global memory continuously. + * NY: Each thread need to load NY rows, only NY = 1 was supported. + * ArgsT: The Type if dst, ArgsT can be std::tuple or std::tuple + * Index: The index of data stored in dst. + * BlockSize: Identifies the current device thread index method. For xpu, + * core_id() is used as the index. + * IsBoundary: Whether to make an out-of-bounds judgment on access to memory. + * When the number of data processed by this block is less than + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access + * crossing the boundary. + * + * @param: + * dst: The register pointer of the thread, the size is NX * NY. + * src: The data pointer of the current block. + * size: The current block needs to load size data continuously. 
*/ template +__device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) { + int thread_offset = block_offset + core_id() * NX; +#pragma unroll + for (int nx = 0; nx < NX; ++nx) { + dst[nx] = static_cast(thread_offset + nx); + } +} + } // namespace kps } // namespace phi -- GitLab From 197da15ae4a5a127d1ce1208e2bed4bab05f836a Mon Sep 17 00:00:00 2001 From: xiongkun Date: Tue, 1 Mar 2022 19:00:30 +0800 Subject: [PATCH 026/272] [phi] tranfer the selu_op and pass the CI (#39819) * tranfer the selu_op and pass the CI * add sig files * fix code * fix by code review * remove TOOD * change the include position * change the head position --- paddle/fluid/operators/selu_op.cc | 8 -- paddle/fluid/operators/selu_op.cu | 22 ---- paddle/fluid/operators/selu_op.h | 123 ------------------ paddle/phi/kernels/cpu/selu_grad_kernel.cc | 21 +++ paddle/phi/kernels/cpu/selu_kernel.cc | 21 +++ paddle/phi/kernels/gpu/selu_grad_kernel.cu | 22 ++++ paddle/phi/kernels/gpu/selu_kernel.cu | 21 +++ .../phi/kernels/impl/selu_grad_kernel_impl.h | 35 +++++ paddle/phi/kernels/impl/selu_kernel_impl.h | 88 +++++++++++++ paddle/phi/kernels/selu_grad_kernel.h | 29 +++++ paddle/phi/kernels/selu_kernel.h | 28 ++++ paddle/phi/ops/compat/selu_sig.cc | 28 ++++ 12 files changed, 293 insertions(+), 153 deletions(-) delete mode 100644 paddle/fluid/operators/selu_op.cu delete mode 100644 paddle/fluid/operators/selu_op.h create mode 100644 paddle/phi/kernels/cpu/selu_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/selu_kernel.cc create mode 100644 paddle/phi/kernels/gpu/selu_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/selu_kernel.cu create mode 100644 paddle/phi/kernels/impl/selu_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/selu_kernel_impl.h create mode 100644 paddle/phi/kernels/selu_grad_kernel.h create mode 100644 paddle/phi/kernels/selu_kernel.h create mode 100644 paddle/phi/ops/compat/selu_sig.cc diff --git a/paddle/fluid/operators/selu_op.cc b/paddle/fluid/operators/selu_op.cc index 0adf61d7ce3..88ef1f3ea4a 100644 --- a/paddle/fluid/operators/selu_op.cc +++ b/paddle/fluid/operators/selu_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/selu_op.h" - #include #include #include @@ -127,9 +125,3 @@ REGISTER_OPERATOR(selu, ops::SeluOp, ops::SeluOpMaker, ops::SeluOpInferVarType, ops::SeluGradMaker, ops::SeluGradMaker); REGISTER_OPERATOR(selu_grad, ops::SeluGradOp); -REGISTER_OP_CPU_KERNEL( - selu, ops::SeluKernel, - ops::SeluKernel); -REGISTER_OP_CPU_KERNEL( - selu_grad, ops::SeluGradKernel, - ops::SeluGradKernel); diff --git a/paddle/fluid/operators/selu_op.cu b/paddle/fluid/operators/selu_op.cu deleted file mode 100644 index fb3245ab760..00000000000 --- a/paddle/fluid/operators/selu_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/selu_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - selu, ops::SeluKernel, - ops::SeluKernel); -REGISTER_OP_CUDA_KERNEL( - selu_grad, ops::SeluGradKernel, - ops::SeluGradKernel); diff --git a/paddle/fluid/operators/selu_op.h b/paddle/fluid/operators/selu_op.h deleted file mode 100644 index b2fc834c42f..00000000000 --- a/paddle/fluid/operators/selu_op.h +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -struct SeluFunctor { - SeluFunctor(const T* x_data_ptr, float alpha, float scale, T* y_data_ptr) - : x_data_ptr_(x_data_ptr), - alpha_(alpha), - scale_(scale), - y_data_ptr_(y_data_ptr) {} - - HOSTDEVICE void operator()(size_t idx) const { - T x_ele = x_data_ptr_[idx]; - if (x_ele <= 0) { - x_ele = alpha_ * real_exp(x_ele) - alpha_; - } - y_data_ptr_[idx] = scale_ * x_ele; - } - const T* x_data_ptr_; - const float alpha_; - const float scale_; - T* y_data_ptr_; -}; - -template -struct SeluGradFunctor { - SeluGradFunctor(const T* y_data_ptr, const T* dy_data_ptr, float alpha, - float scale, T* dx_data_ptr) - : y_data_ptr_(y_data_ptr), - dy_data_ptr_(dy_data_ptr), - alpha_(alpha), - scale_(scale), - la_(alpha * scale), - dx_data_ptr_(dx_data_ptr) {} - - HOSTDEVICE void operator()(size_t idx) const { - T y_ele = y_data_ptr_[idx]; - T dy_ele = dy_data_ptr_[idx]; - - float tmp = scale_; - if (y_ele <= 0) { - tmp = y_ele + la_; - } - dx_data_ptr_[idx] = dy_ele * tmp; - } - const T* y_data_ptr_; - const T* dy_data_ptr_; - const float alpha_; - const float scale_; - const float la_; - T* dx_data_ptr_; -}; - -template -class SeluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - - float alpha = context.Attr("alpha"); - float scale = context.Attr("scale"); - - auto out_ptr = out->mutable_data(context.GetPlace()); - - SeluFunctor functor(x->data(), alpha, scale, out_ptr); - - auto& dev_ctx = context.template device_context(); - size_t limit = static_cast(x->numel()); - platform::ForRange for_range(dev_ctx, limit); - for_range(functor); - } -}; - -template -class SeluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using Tensor = framework::Tensor; - - auto* out = context.Input("Out"); - auto* dout = context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - - float alpha = context.Attr("alpha"); - float scale = context.Attr("scale"); - - auto dx_ptr 
= dx->mutable_data(context.GetPlace()); - - SeluGradFunctor functor(out->data(), dout->data(), alpha, scale, - dx_ptr); - - auto& dev_ctx = context.template device_context(); - size_t limit = static_cast(out->numel()); - platform::ForRange for_range(dev_ctx, limit); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/kernels/cpu/selu_grad_kernel.cc b/paddle/phi/kernels/cpu/selu_grad_kernel.cc new file mode 100644 index 00000000000..32101b19132 --- /dev/null +++ b/paddle/phi/kernels/cpu/selu_grad_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selu_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/selu_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + selu_grad, CPU, ALL_LAYOUT, phi::SeluGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/selu_kernel.cc b/paddle/phi/kernels/cpu/selu_kernel.cc new file mode 100644 index 00000000000..bc5a0616a72 --- /dev/null +++ b/paddle/phi/kernels/cpu/selu_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selu_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/selu_kernel_impl.h" + +PD_REGISTER_KERNEL(selu, CPU, ALL_LAYOUT, phi::SeluKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/selu_grad_kernel.cu b/paddle/phi/kernels/gpu/selu_grad_kernel.cu new file mode 100644 index 00000000000..0ed299413c1 --- /dev/null +++ b/paddle/phi/kernels/gpu/selu_grad_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/selu_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/selu_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + selu_grad, GPU, ALL_LAYOUT, phi::SeluGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/selu_kernel.cu b/paddle/phi/kernels/gpu/selu_kernel.cu new file mode 100644 index 00000000000..99303d8c18a --- /dev/null +++ b/paddle/phi/kernels/gpu/selu_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selu_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/selu_kernel_impl.h" + +PD_REGISTER_KERNEL(selu, GPU, ALL_LAYOUT, phi::SeluKernel, float, double) {} diff --git a/paddle/phi/kernels/impl/selu_grad_kernel_impl.h b/paddle/phi/kernels/impl/selu_grad_kernel_impl.h new file mode 100644 index 00000000000..d09c87b0a4e --- /dev/null +++ b/paddle/phi/kernels/impl/selu_grad_kernel_impl.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/kernels/impl/selu_kernel_impl.h" + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +template +void SeluGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& dout, + float scale, + float alpha, + DenseTensor* dx) { + auto dx_ptr = dev_ctx.template Alloc(dx); + SeluGradFunctor functor( + out.data(), dout.data(), alpha, scale, dx_ptr); + size_t limit = static_cast(out.numel()); + paddle::platform::ForRange for_range(dev_ctx, limit); + for_range(functor); +} +} // namespace phi diff --git a/paddle/phi/kernels/impl/selu_kernel_impl.h b/paddle/phi/kernels/impl/selu_kernel_impl.h new file mode 100644 index 00000000000..888bac42bfd --- /dev/null +++ b/paddle/phi/kernels/impl/selu_kernel_impl.h @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/operators/math.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +struct SeluFunctor { + SeluFunctor(const T* x_data_ptr, float alpha, float scale, T* y_data_ptr) + : x_data_ptr_(x_data_ptr), + alpha_(alpha), + scale_(scale), + y_data_ptr_(y_data_ptr) {} + + HOSTDEVICE void operator()(size_t idx) const { + T x_ele = x_data_ptr_[idx]; + if (x_ele <= 0) { + x_ele = alpha_ * paddle::operators::real_exp(x_ele) - alpha_; + } + y_data_ptr_[idx] = scale_ * x_ele; + } + const T* x_data_ptr_; + const float alpha_; + const float scale_; + T* y_data_ptr_; +}; + +template +struct SeluGradFunctor { + SeluGradFunctor(const T* y_data_ptr, + const T* dy_data_ptr, + float alpha, + float scale, + T* dx_data_ptr) + : y_data_ptr_(y_data_ptr), + dy_data_ptr_(dy_data_ptr), + alpha_(alpha), + scale_(scale), + la_(alpha * scale), + dx_data_ptr_(dx_data_ptr) {} + + HOSTDEVICE void operator()(size_t idx) const { + T y_ele = y_data_ptr_[idx]; + T dy_ele = dy_data_ptr_[idx]; + + float tmp = scale_; + if (y_ele <= 0) { + tmp = y_ele + la_; + } + dx_data_ptr_[idx] = dy_ele * tmp; + } + const T* y_data_ptr_; + const T* dy_data_ptr_; + const float alpha_; + const float scale_; + const float la_; + T* dx_data_ptr_; +}; + +template +void SeluKernel(const Context& dev_ctx, + const DenseTensor& x, + float scale, + float alpha, + DenseTensor* out) { + auto out_ptr = dev_ctx.template Alloc(out); + SeluFunctor functor(x.data(), alpha, scale, out_ptr); + size_t limit = static_cast(x.numel()); + paddle::platform::ForRange for_range(dev_ctx, limit); + for_range(functor); +} +} // namespace phi diff --git a/paddle/phi/kernels/selu_grad_kernel.h b/paddle/phi/kernels/selu_grad_kernel.h new file mode 100644 index 00000000000..42cde6deabe --- /dev/null +++ b/paddle/phi/kernels/selu_grad_kernel.h @@ -0,0 +1,29 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SeluGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& d_out, + float scale, + float alpha, + DenseTensor* d_x); +} // namespace phi diff --git a/paddle/phi/kernels/selu_kernel.h b/paddle/phi/kernels/selu_kernel.h new file mode 100644 index 00000000000..cd5d27e98cc --- /dev/null +++ b/paddle/phi/kernels/selu_kernel.h @@ -0,0 +1,28 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SeluKernel(const Context& dev_ctx, + const DenseTensor& x, + float scale, + float alpha, + DenseTensor* out); +} // phi diff --git a/paddle/phi/ops/compat/selu_sig.cc b/paddle/phi/ops/compat/selu_sig.cc new file mode 100644 index 00000000000..23f5cc34515 --- /dev/null +++ b/paddle/phi/ops/compat/selu_sig.cc @@ -0,0 +1,28 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature SeluGradGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("selu_grad", + {"Out", GradVarName("Out")}, + {"scale", "alpha"}, + {GradVarName("X")}); +} +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(selu_grad, phi::SeluGradGradOpArgumentMapping); -- GitLab From 090396368c80360fc33d09dfb1df7492f7dfb544 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Tue, 1 Mar 2022 19:23:04 +0800 Subject: [PATCH 027/272] [Phi]rm reduce infershape (#39820) * modify infershape utils and rm reduce infershape * merge develop * fix infermete bug * add IsForInferShape func in ArgumentMappingContext * add reduce_mean infermeta * modify annotation * add default dims --- paddle/fluid/framework/infershape_utils.cc | 6 +- paddle/fluid/framework/operator.h | 2 + .../operators/reduce_ops/reduce_mean_op.cc | 10 +++- .../operators/reduce_ops/reduce_sum_op.cc | 10 +++- .../dialect/phi/pass/proto_arg_map_context.h | 2 + paddle/phi/core/compat/arg_map_context.h | 4 ++ paddle/phi/infermeta/unary.cc | 60 +++++++++++++++---- paddle/phi/infermeta/unary.h | 15 +++-- paddle/phi/kernels/math_kernel.h | 2 +- paddle/phi/ops/compat/reduce_sig.cc | 34 +++++++---- paddle/phi/tests/ops/test_op_signature.h | 2 + python/paddle/utils/code_gen/api.yaml | 2 +- 12 files changed, 117 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index d9287b9a624..57fb68e8042 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -88,6 +88,8 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { return var_types[0] == proto::VarType::SELECTED_ROWS; } + bool IsForInferShape() const override { return true; } + private: const InferShapeContext& ctx_; }; @@ -127,7 +129,9 @@ class 
CompatMetaTensor : public phi::MetaTensor { } } else { auto* var = BOOST_GET_CONST(VarDesc*, var_); - return phi::make_ddim(var->GetShape()); + + return var->GetShape().empty() ? phi::make_ddim({0UL}) + : phi::make_ddim(var->GetShape()); } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 16718a31651..e33d4feb82a 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -489,6 +489,8 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { return ctx_.OutputVar(name)->IsType(); } + bool IsForInferShape() const override { return false; } + private: const ExecutionContext& ctx_; }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index e80df5f95bb..6157a3a925d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -18,6 +18,10 @@ #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -92,9 +96,13 @@ class __reduce_meanMaker__ : public ops::ReduceOpMaker { virtual std::string GetOpType() const { return "Reduce reduce_mean"; } }; +DELCARE_INFER_SHAPE_FUNCTOR(reduce_mean, ReduceMeanInferShapeFunctor, + PT_INFER_META(phi::MeanRawInferMeta)); + REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, __reduce_meanMaker__, ops::ReduceMeanOpGradMaker, - ops::ReduceMeanOpGradMaker); + ops::ReduceMeanOpGradMaker, + ReduceMeanInferShapeFunctor); REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp, ops::ReduceMeanDoubleGradDescMaker, ops::ReduceMeanDoubleGradOpBaseMaker, diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index bdab14a18a0..8ef0712dc7a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -16,6 +16,10 @@ #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class OpDesc; @@ -98,10 +102,14 @@ class ReduceSumOpMaker : public ops::ReduceOpMaker { virtual std::string GetOpType() const { return "Reduce reduce_sum"; } }; +DELCARE_INFER_SHAPE_FUNCTOR(reduce_sum, ReduceSumInferShapeFunctor, + PT_INFER_META(phi::ReduceInferMetaBase)); + REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker, ops::ReduceSumVarTypeInference, ops::ReduceSumOpGradMaker, - ops::ReduceSumOpGradMaker); + ops::ReduceSumOpGradMaker, + ReduceSumInferShapeFunctor); REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp, ops::ReduceSumDoubleOpGradMaker, ops::ReduceSumDoubleOpGradMaker, diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h index 843b19d217f..ca8a22a7e75 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h @@ -46,6 +46,8 @@ class ProtoArgumentMappingContext : public phi::ArgumentMappingContext { bool IsDenseTensorOutput(const std::string& name) const override; bool IsSelectedRowsOutput(const std::string& name) const override; + bool IsForInferShape() const override { return false; } + private: mlir::Operation* op_; const std::unordered_map& input_map_; diff --git a/paddle/phi/core/compat/arg_map_context.h 
b/paddle/phi/core/compat/arg_map_context.h index af29b3bab5c..f625d57df2e 100644 --- a/paddle/phi/core/compat/arg_map_context.h +++ b/paddle/phi/core/compat/arg_map_context.h @@ -91,6 +91,10 @@ class ArgumentMappingContext { virtual bool IsDenseTensorOutput(const std::string& name) const = 0; virtual bool IsSelectedRowsOutput(const std::string& name) const = 0; + + // use this function to mark it comes from InferShapeArgumentMappingContext + // and will be used in infershape + virtual bool IsForInferShape() const = 0; }; } // namespace phi diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 4696187bd23..983e0162264 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -375,7 +375,7 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, ReshapeInferMeta(x, shape, out, config); } -/* Why not use ReduceInferMeta directly? +/* Why not use ReduceInferMetaBase directly? Because we need make InferMetaFunction's args follow the design of api.yaml */ void SumInferMeta(const MetaTensor& x, @@ -383,22 +383,53 @@ void SumInferMeta(const MetaTensor& x, DataType dtype, bool keep_dim, MetaTensor* out) { - ReduceInferMetaBase(x, axis, keep_dim, dtype, out); + bool reduce_all = false; + ReduceInferMetaBase(x, axis, keep_dim, reduce_all, dtype, out); } void ReduceInferMetaBase(const MetaTensor& x, const std::vector& axis, bool keep_dim, + bool reduce_all, DataType dtype, MetaTensor* out) { - bool reduce_all = true; - std::set dims_set(axis.begin(), axis.end()); + auto x_rank = x.dims().size(); + + std::vector formated_axis = axis; + for (size_t i = 0; i < axis.size(); ++i) { + PADDLE_ENFORCE_LT(axis[i], + x_rank, + errors::InvalidArgument( + "The reduce dim index %d should be in the " + "range [-dimension(X), dimension(X)] " + "which dimesion = %d. But received dim index = %d.", + i, + x_rank, + axis[i])); + PADDLE_ENFORCE_GE(axis[i], + -x_rank, + errors::InvalidArgument( + "The reduce dim index %d should be in the " + "range [-dimension(X), dimension(X)] " + "which dimesion = %d. 
But received dim index = %d.", + i, + x_rank, + axis[i])); + + if (axis[i] < 0) { + formated_axis[i] = axis[i] + x_rank; + } + } + + bool full_dim = true; + std::set dims_set(formated_axis.begin(), formated_axis.end()); for (int64_t i = 0; i < x.dims().size(); ++i) { if (dims_set.find(i) == dims_set.end()) { - reduce_all = false; + full_dim = false; break; } } + reduce_all = reduce_all || full_dim; std::vector out_dim_vector; if (keep_dim) { @@ -441,11 +472,20 @@ void ReduceInferMetaBase(const MetaTensor& x, out->set_layout(x.layout()); } -void ReduceInferMeta(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - MetaTensor* out) { - ReduceInferMetaBase(x, axis, keep_dim, DataType::UNDEFINED, out); +void MeanRawInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + MetaTensor* out) { + ReduceInferMetaBase(x, axis, keep_dim, reduce_all, DataType::UNDEFINED, out); +} + +void MeanInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + MetaTensor* out) { + bool reduce_all = false; + ReduceInferMetaBase(x, axis, keep_dim, reduce_all, DataType::UNDEFINED, out); } void TransferLayoutInferMeta(const MetaTensor& x, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index b3929b9d2b4..a2d779e0f70 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -86,13 +86,20 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, void ReduceInferMetaBase(const MetaTensor& x, const std::vector& axis, bool keep_dim, + bool reduce_all, DataType dtype, MetaTensor* out); -void ReduceInferMeta(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - MetaTensor* out); +void MeanRawInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + MetaTensor* out); + +void MeanInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + MetaTensor* out); void SumInferMeta(const MetaTensor& x, const std::vector& axis, diff --git a/paddle/phi/kernels/math_kernel.h b/paddle/phi/kernels/math_kernel.h index c6036f4a042..342393d79bd 100644 --- a/paddle/phi/kernels/math_kernel.h +++ b/paddle/phi/kernels/math_kernel.h @@ -156,7 +156,7 @@ DenseTensor Mean(const Context& dev_ctx, bool keep_dim) { auto dense_out = phi::Empty(dev_ctx); MetaTensor meta_out(&dense_out); - ReduceInferMetaBase(x, axis, keep_dim, x.dtype(), &meta_out); + ReduceInferMetaBase(x, axis, keep_dim, false, x.dtype(), &meta_out); MeanKernel(dev_ctx, x, axis, keep_dim, &dense_out); return dense_out; } diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index 74704671f8b..6395486ed2b 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -17,28 +17,36 @@ limitations under the License. */ namespace phi { KernelSignature ReduceSumOpArgumentMapping(const ArgumentMappingContext& ctx) { - bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); if (ctx.IsDenseTensorInput("X")) { - if (!reduce_all) { - return KernelSignature( - "sum", {"X"}, {"dim", "out_dtype", "keep_dim"}, {"Out"}); + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "sum_raw" KernelSignature. + // And the InferMeta function(i.e. 
ReduceInferMetaBase) is accordance with + // the "sum_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature("sum_raw", + {"X"}, + {"dim", "keep_dim", "reduce_all", "out_dtype"}, + {"Out"}); } - return KernelSignature("sum_raw", - {"X"}, - {"dim", "keep_dim", "reduce_all", "out_dtype"}, - {"Out"}); + return KernelSignature( + "sum", {"X"}, {"dim", "out_dtype", "keep_dim"}, {"Out"}); } return KernelSignature("unregistered", {}, {}, {}); } KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { - bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); if (ctx.IsDenseTensorInput("X")) { - if (!reduce_all) { - return KernelSignature("mean", {"X"}, {"dim", "keep_dim"}, {"Out"}); + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "mean_raw" KernelSignature. + // And the InferMeta function(i.e. MeanRawInferMeta) is accordance with the + // "mean_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "mean_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); } - return KernelSignature( - "mean_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + return KernelSignature("mean", {"X"}, {"dim", "keep_dim"}, {"Out"}); } return KernelSignature("unregistered", {}, {}, {}); } diff --git a/paddle/phi/tests/ops/test_op_signature.h b/paddle/phi/tests/ops/test_op_signature.h index fcd2d397fa2..06048f33d94 100644 --- a/paddle/phi/tests/ops/test_op_signature.h +++ b/paddle/phi/tests/ops/test_op_signature.h @@ -80,6 +80,8 @@ class TestArgumentMappingContext : public phi::ArgumentMappingContext { return selected_rows_outputs.count(name) > 0; } + bool IsForInferShape() const override { return false; } + private: const std::unordered_set dense_tensor_inputs; const std::unordered_set selected_rows_inputs; diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 7ea8493b67f..45a6aae5e6d 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -124,7 +124,7 @@ args : (Tensor x, int64_t[] axis={}, bool keep_dim=false) output : Tensor infer_meta : - func : ReduceInferMeta + func : MeanInferMeta kernel : func : mean -- GitLab From 69ab270021c51ce70345f484e52eadb5165b9c54 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Tue, 1 Mar 2022 20:11:33 +0800 Subject: [PATCH 028/272] fix compiling and running with ipu (#39920) --- paddle/fluid/framework/phi_utils.cc | 9 + .../fluid/platform/device/ipu/ipu_strategy.cc | 306 ++++++++++-------- .../fluid/platform/device/ipu/ipu_strategy.h | 72 +++-- paddle/fluid/pybind/pybind.cc | 2 + .../fluid/tests/unittests/ipu/CMakeLists.txt | 8 + 5 files changed, 231 insertions(+), 166 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 93bc2c02d57..14997dd9610 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -125,6 +125,15 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); } +#endif +#ifdef PADDLE_WITH_IPU + if (platform::is_ipu_place(expected_kernel_key.place_)) { + VLOG(3) << "pten missing IPU kernel: " << op.Type() + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + 
return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), + kernel_key.dtype()); + } #endif return phi::KernelKey(); } diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index 943dfcc6cff..e806b0b30e4 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -120,121 +120,151 @@ IpuStrategy::IpuStrategy() { RegisterGetter(options_getter, options_type, #name, "string", \ [&]() { return popart_options.aliased_name; }) -#define ADD_POPART_ENUM_OPTION(name, EnumType) \ - ADD_POPART_ENUM_OPTION_ALIAS(name, name, EnumType) - -#define ADD_POPART_BOOL_OPTION(name) ADD_POPART_BOOL_OPTION_ALIAS(name, name) - -#define ADD_POPART_UINT64_OPTION(name) \ - ADD_POPART_UINT64_OPTION_ALIAS(name, name) - -#define ADD_POPART_DOUBLE_OPTION(name) \ - ADD_POPART_DOUBLE_OPTION_ALIAS(name, name) - -#define ADD_POPART_STRING_OPTION(name) \ - ADD_POPART_STRING_OPTION_ALIAS(name, name) - - ADD_POPART_ENUM_OPTION(autodiffSettings.stitchStrategy, - AutodiffStitchStrategy); - ADD_POPART_ENUM_OPTION(batchSerializationSettings.transformContext, - BatchSerializationTransformContext); - ADD_POPART_ENUM_OPTION(batchSerializationSettings.method, - BatchSerializationMethod); - ADD_POPART_ENUM_OPTION(batchSerializationSettings.batchSchedule, - BatchSerializationBatchSchedule); - ADD_POPART_ENUM_OPTION(autoRecomputation, RecomputationType); - ADD_POPART_ENUM_OPTION(mergeVarUpdate, MergeVarUpdateType); - ADD_POPART_ENUM_OPTION(virtualGraphMode, VirtualGraphMode); - ADD_POPART_ENUM_OPTION(syntheticDataMode, SyntheticDataMode); - ADD_POPART_ENUM_OPTION(subgraphCopyingStrategy, SubgraphCopyingStrategy); - ADD_POPART_ENUM_OPTION(accumulationAndReplicationReductionType, - ReductionType); - ADD_POPART_ENUM_OPTION(meanAccumulationAndReplicationReductionStrategy, - MeanReductionStrategy); - - ADD_POPART_STRING_OPTION(logDir); - ADD_POPART_STRING_OPTION(cachePath); - ADD_POPART_STRING_OPTION(partialsTypeMatMuls); - ADD_POPART_STRING_OPTION(customCodeletCompileFlags); - ADD_POPART_STRING_OPTION(serializedPoprithmsShiftGraphsDir); - ADD_POPART_STRING_OPTION(kahnTieBreaker); - - ADD_POPART_UINT64_OPTION(executionPhaseSettings.phases); - ADD_POPART_UINT64_OPTION(executionPhaseSettings.stages); - ADD_POPART_UINT64_OPTION(batchSerializationSettings.factor); - ADD_POPART_UINT64_OPTION(firstDotOp); - ADD_POPART_UINT64_OPTION(finalDotOp); - ADD_POPART_UINT64_OPTION(numIOTiles); - ADD_POPART_UINT64_OPTION(mergeVarUpdateMemThreshold); - ADD_POPART_UINT64_OPTION(looseThresholdAtPeak); - ADD_POPART_UINT64_OPTION(accumulationFactor); - ADD_POPART_UINT64_OPTION(swapLimitScheduler); - ADD_POPART_UINT64_OPTION(globalReplicationFactor); - ADD_POPART_UINT64_OPTION(globalReplicaOffset); - ADD_POPART_UINT64_OPTION(defaultPrefetchBufferingDepth); - ADD_POPART_UINT64_OPTION(compilationProgressTotal); - ADD_POPART_UINT64_OPTION(transitiveClosureOptimizationThreshold); - - ADD_POPART_BOOL_OPTION(batchSerializationSettings.concatOnVirtualGraphChange); - ADD_POPART_BOOL_OPTION( + ADD_POPART_ENUM_OPTION_ALIAS(autodiff_settings.stitch_strategy, + autodiffSettings.stitchStrategy, + AutodiffStitchStrategy); + ADD_POPART_ENUM_OPTION_ALIAS(batch_serialization_settings.transform_context, + batchSerializationSettings.transformContext, + BatchSerializationTransformContext); + ADD_POPART_ENUM_OPTION_ALIAS(batch_serialization_settings.method, + batchSerializationSettings.method, + BatchSerializationMethod); + 
ADD_POPART_ENUM_OPTION_ALIAS(batch_serialization_settings.batch_schedule, + batchSerializationSettings.batchSchedule, + BatchSerializationBatchSchedule); + ADD_POPART_ENUM_OPTION_ALIAS(auto_recomputation, autoRecomputation, + RecomputationType); + ADD_POPART_ENUM_OPTION_ALIAS(merge_var_update, mergeVarUpdate, + MergeVarUpdateType); + ADD_POPART_ENUM_OPTION_ALIAS(virtual_graph_mode, virtualGraphMode, + VirtualGraphMode); + ADD_POPART_ENUM_OPTION_ALIAS(synthetic_data_mode, syntheticDataMode, + SyntheticDataMode); + ADD_POPART_ENUM_OPTION_ALIAS(subgraph_copying_strategy, + subgraphCopyingStrategy, + SubgraphCopyingStrategy); + ADD_POPART_ENUM_OPTION_ALIAS(accumulation_and_replication_reduction_type, + accumulationAndReplicationReductionType, + ReductionType); + ADD_POPART_ENUM_OPTION_ALIAS( + mean_accumulation_and_replication_reduction_strategy, + meanAccumulationAndReplicationReductionStrategy, MeanReductionStrategy); + + ADD_POPART_STRING_OPTION_ALIAS(log_dir, logDir); + ADD_POPART_STRING_OPTION_ALIAS(cache_path, cachePath); + ADD_POPART_STRING_OPTION_ALIAS(partials_type_matmuls, partialsTypeMatMuls); + ADD_POPART_STRING_OPTION_ALIAS(custom_codelet_compile_flags, + customCodeletCompileFlags); + ADD_POPART_STRING_OPTION_ALIAS(serialized_poprithms_shift_graphs_dir, + serializedPoprithmsShiftGraphsDir); + ADD_POPART_STRING_OPTION_ALIAS(kahn_tie_breaker, kahnTieBreaker); + + ADD_POPART_UINT64_OPTION_ALIAS(execution_phase_settings.phases, + executionPhaseSettings.phases); + ADD_POPART_UINT64_OPTION_ALIAS(execution_phase_settings.stages, + executionPhaseSettings.stages); + ADD_POPART_UINT64_OPTION_ALIAS(batch_serialization_settings.factor, + batchSerializationSettings.factor); + ADD_POPART_UINT64_OPTION_ALIAS(first_dot_op, firstDotOp); + ADD_POPART_UINT64_OPTION_ALIAS(final_dot_op, finalDotOp); + ADD_POPART_UINT64_OPTION_ALIAS(num_io_tiles, numIOTiles); + ADD_POPART_UINT64_OPTION_ALIAS(merge_var_update_mem_threshold, + mergeVarUpdateMemThreshold); + ADD_POPART_UINT64_OPTION_ALIAS(loose_threshold_at_peak, looseThresholdAtPeak); + ADD_POPART_UINT64_OPTION_ALIAS(accumulation_factor, accumulationFactor); + ADD_POPART_UINT64_OPTION_ALIAS(swap_limit_scheduler, swapLimitScheduler); + ADD_POPART_UINT64_OPTION_ALIAS(global_replication_factor, + globalReplicationFactor); + ADD_POPART_UINT64_OPTION_ALIAS(global_replica_offset, globalReplicaOffset); + ADD_POPART_UINT64_OPTION_ALIAS(default_prefetch_buffering_depth, + defaultPrefetchBufferingDepth); + ADD_POPART_UINT64_OPTION_ALIAS(compilation_progress_total, + compilationProgressTotal); + ADD_POPART_UINT64_OPTION_ALIAS(transitive_closure_optimization_threshold, + transitiveClosureOptimizationThreshold); + + ADD_POPART_BOOL_OPTION_ALIAS( + batch_serialization_settings.concat_on_virtual_graph_change, + batchSerializationSettings.concatOnVirtualGraphChange); + ADD_POPART_BOOL_OPTION_ALIAS( + batch_serialization_settings.concat_on_execution_phase_change, batchSerializationSettings.concatOnExecutionPhaseChange); - ADD_POPART_BOOL_OPTION( + ADD_POPART_BOOL_OPTION_ALIAS( + batch_serialization_settings.concat_on_pipeline_stage_change, batchSerializationSettings.concatOnPipelineStageChange); - ADD_POPART_BOOL_OPTION(strictOpVersions); - ADD_POPART_BOOL_OPTION(opxAliasChecking); - ADD_POPART_BOOL_OPTION(opxModifyChecking); - ADD_POPART_BOOL_OPTION(dotOpNames); - ADD_POPART_BOOL_OPTION(exportPoplarComputationGraph); - ADD_POPART_BOOL_OPTION(exportPoplarVertexGraph); - ADD_POPART_BOOL_OPTION(separateCallOpPdfs); - ADD_POPART_BOOL_OPTION(enableOutlining); - 
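// --- Illustrative aside, not part of the patch ------------------------------
// The net effect of switching these registrations to the *_ALIAS form is that
// Paddle-facing code now addresses every option by a snake_case name, while
// the popart::SessionOptions fields keep their camelCase identifiers. A
// minimal assumed usage sketch (the option values here are made up):
//
//   IpuStrategy strategy;
//   strategy.AddBoolOption("enable_outlining", true);       // was "enableOutlining"
//   strategy.AddStringOption("cache_path", "/tmp/popart");  // was "cachePath"
//   VLOG(10) << "enable_outlining = " << strategy.GetOption("enable_outlining");
// -----------------------------------------------------------------------------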
ADD_POPART_BOOL_OPTION(enableOutliningCopyCostPruning); - ADD_POPART_BOOL_OPTION(rearrangeAnchorsOnHost); - ADD_POPART_BOOL_OPTION(enablePrefetchDatastreams); - ADD_POPART_BOOL_OPTION(enableNonStableSoftmax); - ADD_POPART_BOOL_OPTION(enableReplicatedGraphs); - ADD_POPART_BOOL_OPTION(enableGradientAccumulation); - ADD_POPART_BOOL_OPTION(instrumentWithHardwareCycleCounter); - ADD_POPART_BOOL_OPTION(enablePipelining); + ADD_POPART_BOOL_OPTION_ALIAS(strict_op_versions, strictOpVersions); + ADD_POPART_BOOL_OPTION_ALIAS(opx_alias_checking, opxAliasChecking); + ADD_POPART_BOOL_OPTION_ALIAS(opx_modify_checking, opxModifyChecking); + ADD_POPART_BOOL_OPTION_ALIAS(dot_op_names, dotOpNames); + ADD_POPART_BOOL_OPTION_ALIAS(export_poplar_computation_graph, + exportPoplarComputationGraph); + ADD_POPART_BOOL_OPTION_ALIAS(export_poplar_vertex_graph, + exportPoplarVertexGraph); + ADD_POPART_BOOL_OPTION_ALIAS(separate_call_op_pdfs, separateCallOpPdfs); + ADD_POPART_BOOL_OPTION_ALIAS(enable_outlining, enableOutlining); + ADD_POPART_BOOL_OPTION_ALIAS(enable_outlining_copy_cost_pruning, + enableOutliningCopyCostPruning); + ADD_POPART_BOOL_OPTION_ALIAS(rearrange_anchors_on_host, + rearrangeAnchorsOnHost); + ADD_POPART_BOOL_OPTION_ALIAS(enable_prefetch_datastreams, + enablePrefetchDatastreams); + ADD_POPART_BOOL_OPTION_ALIAS(enable_non_stable_softmax, + enableNonStableSoftmax); + ADD_POPART_BOOL_OPTION_ALIAS(enable_replicated_graphs, + enableReplicatedGraphs); + ADD_POPART_BOOL_OPTION_ALIAS(enable_gradient_accumulation, + enableGradientAccumulation); + ADD_POPART_BOOL_OPTION_ALIAS(instrument_with_hardware_cycle_counter, + instrumentWithHardwareCycleCounter); ADD_POPART_BOOL_OPTION_ALIAS(enable_pipelining, enablePipelining); - ADD_POPART_BOOL_OPTION(disableGradAccumulationTensorStreams); - ADD_POPART_BOOL_OPTION(compileEngine); - ADD_POPART_BOOL_OPTION(constantWeights); - ADD_POPART_BOOL_OPTION(enableEngineCaching); - ADD_POPART_BOOL_OPTION(enableMergeExchange); - ADD_POPART_BOOL_OPTION(enableFloatingPointChecks); - ADD_POPART_BOOL_OPTION(enableStochasticRounding); + ADD_POPART_BOOL_OPTION_ALIAS(disable_grad_accumulation_tensor_streams, + disableGradAccumulationTensorStreams); + ADD_POPART_BOOL_OPTION_ALIAS(compile_engine, compileEngine); + ADD_POPART_BOOL_OPTION_ALIAS(constant_weights, constantWeights); + ADD_POPART_BOOL_OPTION_ALIAS(enable_engine_caching, enableEngineCaching); + ADD_POPART_BOOL_OPTION_ALIAS(enable_merge_exchange, enableMergeExchange); + ADD_POPART_BOOL_OPTION_ALIAS(enable_floating_point_checks, + enableFloatingPointChecks); ADD_POPART_BOOL_OPTION_ALIAS(enable_stochastic_rounding, enableStochasticRounding); - ADD_POPART_BOOL_OPTION(explicitRecomputation); - ADD_POPART_BOOL_OPTION(enableExplicitMainLoops); - ADD_POPART_BOOL_OPTION(useHostCopyOps); - ADD_POPART_BOOL_OPTION(aliasZeroCopy); - ADD_POPART_BOOL_OPTION(delayVarUpdates); - ADD_POPART_BOOL_OPTION(enableFullyConnectedPass); - ADD_POPART_BOOL_OPTION(enableSerializedMatmuls); - ADD_POPART_BOOL_OPTION(enableStableNorm); - ADD_POPART_BOOL_OPTION(decomposeGradSum); - ADD_POPART_BOOL_OPTION(enableDistributedReplicatedGraphs); - ADD_POPART_BOOL_OPTION(groupHostSync); - ADD_POPART_BOOL_OPTION(automaticLossScalingSettings.enabled); - ADD_POPART_BOOL_OPTION(instrumentWithHardwareCycleCounter); - ADD_POPART_BOOL_OPTION(enableSupportedDataTypeCasting); - ADD_POPART_BOOL_OPTION(groupNormStridedChannelGrouping); - ADD_POPART_BOOL_OPTION(scheduleNonWeightUpdateGradientConsumersEarly); - - ADD_POPART_DOUBLE_OPTION(outlineSequenceBreakCost); - 
ADD_POPART_DOUBLE_OPTION(outlineThreshold); - ADD_POPART_DOUBLE_OPTION(timeLimitScheduler); - ADD_POPART_DOUBLE_OPTION(automaticLossScalingSettings.binEdgeLocation); - ADD_POPART_DOUBLE_OPTION( + ADD_POPART_BOOL_OPTION_ALIAS(explicit_recomputation, explicitRecomputation); + ADD_POPART_BOOL_OPTION_ALIAS(enable_explicit_main_loops, + enableExplicitMainLoops); + ADD_POPART_BOOL_OPTION_ALIAS(use_host_copy_ops, useHostCopyOps); + ADD_POPART_BOOL_OPTION_ALIAS(alias_zero_copy, aliasZeroCopy); + ADD_POPART_BOOL_OPTION_ALIAS(delay_var_updates, delayVarUpdates); + ADD_POPART_BOOL_OPTION_ALIAS(enable_fully_connected_pass, + enableFullyConnectedPass); + ADD_POPART_BOOL_OPTION_ALIAS(enable_serialized_matmuls, + enableSerializedMatmuls); + ADD_POPART_BOOL_OPTION_ALIAS(enable_stable_norm, enableStableNorm); + ADD_POPART_BOOL_OPTION_ALIAS(decompose_grad_sum, decomposeGradSum); + ADD_POPART_BOOL_OPTION_ALIAS(enable_distributed_replicated_graphs, + enableDistributedReplicatedGraphs); + ADD_POPART_BOOL_OPTION_ALIAS(group_host_sync, groupHostSync); + ADD_POPART_BOOL_OPTION_ALIAS(automatic_loss_scaling_settings.enabled, + automaticLossScalingSettings.enabled); + ADD_POPART_BOOL_OPTION_ALIAS(instrument_with_hardware_cycle_counter, + instrumentWithHardwareCycleCounter); + ADD_POPART_BOOL_OPTION_ALIAS(enable_supported_data_type_casting, + enableSupportedDataTypeCasting); + ADD_POPART_BOOL_OPTION_ALIAS(group_norm_strided_channel_grouping, + groupNormStridedChannelGrouping); + ADD_POPART_BOOL_OPTION_ALIAS( + schedule_non_weight_update_gradient_consumers_early, + scheduleNonWeightUpdateGradientConsumersEarly); + + ADD_POPART_DOUBLE_OPTION_ALIAS(outline_sequence_break_cost, + outlineSequenceBreakCost); + ADD_POPART_DOUBLE_OPTION_ALIAS(outline_threshold, outlineThreshold); + ADD_POPART_DOUBLE_OPTION_ALIAS(time_limit_scheduler, timeLimitScheduler); + ADD_POPART_DOUBLE_OPTION_ALIAS( + automatic_loss_scaling_settings.bin_edge_location, + automaticLossScalingSettings.binEdgeLocation); + ADD_POPART_DOUBLE_OPTION_ALIAS( + automatic_loss_scaling_settings.threshold_upper_count_proportion, automaticLossScalingSettings.thresholdUpperCountProportion); -#undef ADD_POPART_STRING_OPTION -#undef ADD_POPART_DOUBLE_OPTION -#undef ADD_POPART_UINT64_OPTION -#undef ADD_POPART_BOOL_OPTION -#undef ADD_POPART_ENUM_OPTION #undef ADD_POPART_STRING_OPTION_ALIAS #undef ADD_POPART_DOUBLE_OPTION_ALIAS #undef ADD_POPART_UINT64_OPTION_ALIAS @@ -278,14 +308,14 @@ IpuStrategy::IpuStrategy() { }); RegisterSetter( - container_options, "dotChecks", + container_options, "dot_checks", [&](const std::pair& p) { std::uint64_t value = std::stoul(p.first); popart_options.dotChecks.insert(static_cast(value)); }); RegisterGetter( - vector_options_getter, options_type, "dotChecks", "vector", [&]() { + vector_options_getter, options_type, "dot_checks", "vector", [&]() { std::vector res; for (auto x : popart_options.dotChecks) { res.push_back(std::to_string(static_cast(x))); @@ -293,7 +323,7 @@ IpuStrategy::IpuStrategy() { return res; }); - RegisterSetter(container_options, "hardwareInstrumentations", + RegisterSetter(container_options, "hardware_instrumentations", [&](const std::pair& p) { std::uint64_t value = std::stoul(p.first); popart_options.hardwareInstrumentations.insert( @@ -301,8 +331,8 @@ IpuStrategy::IpuStrategy() { }); RegisterGetter( - vector_options_getter, options_type, "hardwareInstrumentations", "vector", - [&]() { + vector_options_getter, options_type, "hardware_instrumentations", + "vector", [&]() { std::vector res; for (auto x : 
popart_options.hardwareInstrumentations) { res.push_back(std::to_string(static_cast(x))); @@ -310,12 +340,12 @@ IpuStrategy::IpuStrategy() { return res; }); - RegisterSetter(container_options, "customCodelets", + RegisterSetter(container_options, "custom_codelets", [&](const std::pair& p) { popart_options.customCodelets.push_back(p.first); }); - RegisterGetter(vector_options_getter, options_type, "customCodelets", + RegisterGetter(vector_options_getter, options_type, "custom_codelets", "vector", [&]() { std::vector res; for (auto x : popart_options.customCodelets) { @@ -324,44 +354,44 @@ IpuStrategy::IpuStrategy() { return res; }); - RegisterSetter(container_options, "engineOptions", + RegisterSetter(container_options, "engine_options", [&](const std::pair& p) { popart_options.engineOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "engineOptions", "map", + RegisterGetter(map_options_getter, options_type, "engine_options", "map", [&]() { return popart_options.engineOptions; }); - RegisterSetter(container_options, "reportOptions", + RegisterSetter(container_options, "report_options", [&](const std::pair& p) { popart_options.reportOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "reportOptions", "map", + RegisterGetter(map_options_getter, options_type, "report_options", "map", [&]() { return popart_options.reportOptions; }); - RegisterSetter(container_options, "convolutionOptions", + RegisterSetter(container_options, "convolution_options", [&](const std::pair& p) { popart_options.convolutionOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "convolutionOptions", "map", + RegisterGetter(map_options_getter, options_type, "convolution_options", "map", [&]() { return popart_options.convolutionOptions; }); - RegisterSetter(container_options, "lstmOptions", + RegisterSetter(container_options, "lstm_options", [&](const std::pair& p) { popart_options.lstmOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "lstmOptions", "map", + RegisterGetter(map_options_getter, options_type, "lstm_options", "map", [&]() { return popart_options.lstmOptions; }); - RegisterSetter(container_options, "gclOptions", + RegisterSetter(container_options, "gcl_options", [&](const std::pair& p) { popart_options.gclOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "gclOptions", "map", + RegisterGetter(map_options_getter, options_type, "gcl_options", "map", [&]() { return popart_options.gclOptions; }); } @@ -415,21 +445,21 @@ void IpuStrategy::SetTensorLocation(const std::string& tensor, "Unknown tensor location: %s", tensor)); } - if (opt == "minElementsForOffChip") { + if (opt == "min_elements_for_off_chip") { settings->minElementsForOffChip = value; - } else if (opt == "minElementsForReplicatedTensorSharding") { + } else if (opt == "min_elements_for_replicated_tensor_sharding") { settings->minElementsForReplicatedTensorSharding = value; - } else if (opt == "onChip") { + } else if (opt == "on_chip") { settings->location.storage = value > 0 ? popart::TensorStorage::OnChip : popart::TensorStorage::OffChip; - } else if (opt == "useReplicatedTensorSharding") { + } else if (opt == "use_replicated_tensor_sharding") { settings->location.replicatedTensorSharding = value > 0 ? popart::ReplicatedTensorSharding::On : popart::ReplicatedTensorSharding::Off; - } else if (opt == "useIOTilesToLoad") { + } else if (opt == "use_io_tiles_to_load") { settings->location.loadTileSet = value > 0 ? 
popart::TileSet::IO : popart::TileSet::Compute; - } else if (opt == "useIOTilesToStore") { + } else if (opt == "use_io_tiles_to_store") { settings->location.storageTileSet = value > 0 ? popart::TileSet::IO : popart::TileSet::Compute; } else { @@ -464,6 +494,20 @@ std::string IpuStrategy::GetOptionType(const std::string& option) { return options_type[option]; } +std::vector IpuStrategy::GetAllOptionNames() { + std::vector names; + for (auto& option : options_getter) { + names.push_back(option.first); + } + for (auto& option : vector_options_getter) { + names.push_back(option.first); + } + for (auto& option : map_options_getter) { + names.push_back(option.first); + } + return names; +} + void IpuStrategy::EnablePattern(const std::string& t) { VLOG(10) << "enable popart pattern: " << t; popart_patterns.enablePattern(t, true); diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 64436dc14fe..571fb1e1637 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -24,7 +24,8 @@ namespace paddle { namespace platform { namespace ipu { -struct IpuStrategy { +class IpuStrategy { + public: IpuStrategy(); // TODO(alleng) create PaddleOptions @@ -75,22 +76,30 @@ struct IpuStrategy { // custom ops std::vector custom_ops; - private: - std::map> bool_options; - std::map> uint64_options; - std::map> double_options; - std::map> string_options; - std::map)>> - container_options; + public: + void AddBoolOption(const std::string &option, bool value); + void AddUint64Option(const std::string &option, std::uint64_t value); + void AddDoubleOption(const std::string &option, double value); + void AddStringOption(const std::string &option, const std::string &value); + void InsertStringOption(const std::string &option, const std::string &value); + void InsertStringPairOption(const std::string &option, const std::string &key, + const std::string &value); + void SetTensorLocation(const std::string &tensor, const std::string &option, + std::uint64_t value); + void AddCustomOp(const std::string &paddle_op, const std::string &popart_op, + const std::string &domain, int version); - std::map> options_getter; - std::map()>> - vector_options_getter; - std::map()>> - map_options_getter; - std::map options_type; + std::string GetOption(const std::string &); + std::vector GetVectorOption(const std::string &); + std::map GetMapOption(const std::string &); + std::string GetOptionType(const std::string &); + std::vector GetAllOptionNames(); + + void EnablePattern(const std::string &t); + void DisablePattern(const std::string &t); + const bool IsPatternEnabled(const std::string &t); + private: template void set( const std::string &key, ValueType value, @@ -117,27 +126,20 @@ struct IpuStrategy { return it->second(); } - public: - void AddBoolOption(const std::string &option, bool value); - void AddUint64Option(const std::string &option, std::uint64_t value); - void AddDoubleOption(const std::string &option, double value); - void AddStringOption(const std::string &option, const std::string &value); - void InsertStringOption(const std::string &option, const std::string &value); - void InsertStringPairOption(const std::string &option, const std::string &key, - const std::string &value); - void SetTensorLocation(const std::string &tensor, const std::string &option, - std::uint64_t value); - void AddCustomOp(const std::string &paddle_op, const std::string &popart_op, - const std::string &domain, int version); - - 
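// --- Illustrative aside, not part of the patch ------------------------------
// The new GetAllOptionNames() pairs with the existing GetOptionType() accessor
// (and backs the get_all_option_names binding added in pybind.cc further down),
// so callers can enumerate every registered option. Assumed usage sketch:
//
//   IpuStrategy strategy;
//   for (const auto &name : strategy.GetAllOptionNames()) {
//     VLOG(10) << name << " : " << strategy.GetOptionType(name);
//   }
// -----------------------------------------------------------------------------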
std::string GetOption(const std::string &); - std::vector GetVectorOption(const std::string &); - std::map GetMapOption(const std::string &); - std::string GetOptionType(const std::string &); + std::map> bool_options; + std::map> uint64_options; + std::map> double_options; + std::map> string_options; + std::map)>> + container_options; - void EnablePattern(const std::string &t); - void DisablePattern(const std::string &t); - const bool IsPatternEnabled(const std::string &t); + std::map> options_getter; + std::map()>> + vector_options_getter; + std::map()>> + map_options_getter; + std::map options_type; }; } // namespace ipu diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 6e553ad2e60..3d8815e2eb6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3919,6 +3919,8 @@ All parameter, weight, gradient are variables in Paddle. } return res; }) + .def("get_all_option_names", + &platform::ipu::IpuStrategy::GetAllOptionNames) .def("enable_pattern", &platform::ipu::IpuStrategy::EnablePattern) .def("disable_pattern", &platform::ipu::IpuStrategy::DisablePattern) .def("is_pattern_enabled", &platform::ipu::IpuStrategy::IsPatternEnabled); diff --git a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt new file mode 100644 index 00000000000..959700ad743 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt @@ -0,0 +1,8 @@ +if(WITH_IPU) + file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") + string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + + foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) + endforeach(TEST_OP) +endif() -- GitLab From 4617c1b2da8b061015d4a23f01ad81109ea931a7 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Tue, 1 Mar 2022 20:13:14 +0800 Subject: [PATCH 029/272] fix bug of paddle.to_tensor and paddle.moveaxis (#39662) * fix bug of paddle.to_tensor and paddle.moveaxis * fix CI --- .../tests/unittests/test_transpose_op.py | 8 +++++ .../fluid/tests/unittests/test_var_base.py | 4 +++ python/paddle/tensor/creation.py | 31 +++++++++---------- python/paddle/tensor/manipulation.py | 7 +++-- 4 files changed, 31 insertions(+), 19 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index 0fc56726c5d..13b880b28bf 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -423,6 +423,14 @@ class TestMoveAxis(unittest.TestCase): self.assertEqual(np.array_equal(out.numpy(), expected), True) paddle.enable_static() + def test_moveaxis3(self): + paddle.disable_static() + x = paddle.to_tensor( + [[1 + 1j, -1 - 1j], [1 + 1j, -1 - 1j], [1 + 1j, -1 - 1j]]) + out = x.moveaxis(0, 1) + self.assertEqual(out.shape, [2, 3]) + paddle.enable_static() + def test_error(self): x = paddle.randn([2, 3, 4, 5]) # src must have the same number with dst diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 541df6659c2..dbd40c349bb 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -51,6 +51,10 @@ class TestVarBase(unittest.TestCase): np.array_equal(x.numpy(), np.array([1.2], 'float16'))) self.assertEqual(x.dtype, core.VarDesc.VarType.FP16) + # set_default_dtype take effect on int + x = paddle.to_tensor(1, 
place=place) + self.assertTrue(x.dtype, core.VarDesc.VarType.INT64) + # set_default_dtype take effect on float x = paddle.to_tensor(1.2, place=place, stop_gradient=False) self.assertTrue( diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index ae563e641e3..bddc45bc961 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -110,12 +110,6 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace, paddle.NPUPlace, paddle.XPUPlace, paddle.CustomPlace" ) - #Todo(zhouwei): Support allocate tensor on any other specified card - if isinstance(place, core.CUDAPlace) and isinstance( - _current_expected_place(), core.CUDAPlace) and place._get_device_id( - ) != _current_expected_place()._get_device_id(): - place = _current_expected_place() - if not isinstance(data, np.ndarray): def _handle_dtype(data, dtype): @@ -139,7 +133,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): data.stop_gradient = stop_gradient return data elif isinstance(data, (core.LoDTensor, core.Tensor)): - # Note(zhouwei25): should't expose it to users, just for internal use. + # should't expose it to users, just for internal use. # convert core.Tensor/core.LoDTensor to VarBase first # Currenly, there is no copy when places are same data = paddle.Tensor(data) @@ -152,15 +146,20 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): raise TypeError( "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|numpy.ndarray|paddle.Tensor". format(type(data))) - if not dtype and data.dtype in [ - 'float16', 'float32', 'float64', 'complex64', 'complex128' - ]: - default_type = paddle.get_default_dtype() - if np.iscomplexobj(data): - default_type = 'complex64' if default_type in [ - 'float16', 'float32' - ] else 'complex128' - data = data.astype(default_type) + if not dtype: + if data.dtype in [ + 'float16', 'float32', 'float64', 'complex64', 'complex128' + ]: + default_type = paddle.get_default_dtype() + if np.iscomplexobj(data): + default_type = 'complex64' if default_type in [ + 'float16', 'float32' + ] else 'complex128' + data = data.astype(default_type) + # Windows default type is 'int32', while Linux/Mac is 'int64'. Unify they. 
+ if data.dtype in ['int32']: + default_type = "int64" + data = data.astype(default_type) if dtype and convert_dtype(dtype) != data.dtype: data = data.astype(convert_dtype(dtype)) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 53bb9a88075..fbd6197c1b9 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -2737,9 +2737,10 @@ def moveaxis(x, source, destination, name=None): out, _ = _C_ops.transpose2(x, 'axis', perm) return out - check_variable_and_dtype( - x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], - 'moveaxis') + check_variable_and_dtype(x, 'x', [ + 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', + 'complex128' + ], 'moveaxis') helper = LayerHelper('moveaxis', **locals()) out = helper.create_variable_for_type_inference(x.dtype) -- GitLab From 72e462cd0115b41b9a855c3edb9ee0622b241527 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Tue, 1 Mar 2022 20:40:19 +0800 Subject: [PATCH 030/272] [ROCM] fix to get rocm number in script, test=develop (#39938) --- paddle/scripts/paddle_build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 4d7451f4352..8528ba34e21 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1266,7 +1266,7 @@ function card_test() { elif [ "${WITH_ASCEND_CL}" == "ON" ];then CUDA_DEVICE_COUNT=1 elif [ "${WITH_ROCM}" == "ON" ];then - CUDA_DEVICE_COUNT=4 + CUDA_DEVICE_COUNT=$(rocm-smi -i | grep GPU | wc -l) else CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l) fi -- GitLab From 852a872f6dafb3f8f32b30567d8402651f8e9e1e Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 1 Mar 2022 21:00:59 +0800 Subject: [PATCH 031/272] Added attr & tensor type mapping for final state codegen (#39997) --- .../final_state_generator/eager_gen.py | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index c6e56e34627..02183e2ca5c 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -24,6 +24,17 @@ core_ops_args_info = {} core_ops_args_type_info = {} +yaml_types_mapping = { + 'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 'size_t' : 'size_t', \ + 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ + 'Backend' : 'Backend', 'DataLayout' : 'DataLayout', 'DataType' : 'DataType', \ + 'int64_t[]' : 'std::vector', 'int[]' : 'std::vector', + 'Tensor' : 'Tensor', + 'Tensor[]' : 'std::vector', + 'Tensor[Tensor[]]' : 'std::vector>' +} + + def ParseArguments(): parser = argparse.ArgumentParser( description='Eager Code Generator Args Parser') @@ -59,7 +70,9 @@ def IsPlainTensorType(string): def IsVectorTensorType(string): - vector_tensor_types = ['list(Tensor)'] + vector_tensor_types = [ + 'std::vector>', 'std::vector' + ] if string in vector_tensor_types: return True return False @@ -180,6 +193,9 @@ def ParseYamlArgs(string): arg_name = m.group(3).split("=")[0].strip() default_value = m.group(3).split("=")[1].strip() if len( m.group(3).split("=")) > 1 else None + + assert arg_type in yaml_types_mapping.keys() + arg_type = yaml_types_mapping[arg_type] if "Tensor" in arg_type: assert default_value is None inputs_list.append([arg_name, arg_type, i]) @@ 
-219,6 +235,10 @@ def ParseYamlReturnsWithName(string): m = re.search(pattern, ret) ret_type = m.group(1) ret_name = m.group(2) + + assert ret_type in yaml_types_mapping.keys() + ret_type = yaml_types_mapping[ret_type] + assert "Tensor" in ret_type returns_list.append([ret_name, ret_type, i]) -- GitLab From acdf0663ae98fee60ea61ef25bb3e8af7d88f6b4 Mon Sep 17 00:00:00 2001 From: Shang Zhizhou Date: Wed, 2 Mar 2022 09:42:20 +0800 Subject: [PATCH 032/272] update pd_2_trt lower pass (#40019) * update pd_2_trt lower pass * update pd_2_trt lower pass * update style * udpate * change trt.graph to trt.create_engine * update comments * update comments * add test --- .../dialect/tensorrt/trt_graph_fuse_pass.cc | 20 +++++++++--------- .../dialect/tensorrt/trt_graph_fuse_pass.h | 21 ++++++++++++------- .../dialect/tensorrt/trt_graph_split_pass.cc | 7 +++---- .../dialect/tensorrt/trt_graph_split_pass.h | 9 ++++++-- .../dialect/tensorrt/trt_op_converter_pass.cc | 12 +++++------ .../dialect/tensorrt/trt_op_converter_pass.h | 8 +++---- .../dialect/tensorrt/trt_op_teller_pass.cc | 17 +++++++-------- .../dialect/tensorrt/trt_op_teller_pass.h | 17 +++++++++------ paddle/infrt/dialect/tensorrt/trt_ops.h | 1 + paddle/infrt/dialect/tensorrt/trt_ops.td | 15 ++----------- .../{disabled_trt_ops.mlir => trt_ops.mlir} | 1 + paddle/infrt/tests/lit.cfg.py.in | 3 ++- 12 files changed, 67 insertions(+), 64 deletions(-) rename paddle/infrt/tests/dialect/{disabled_trt_ops.mlir => trt_ops.mlir} (98%) diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index 17633a4e8e9..fa0095363c5 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -53,9 +53,9 @@ bool reverseDfs(std::vector source, } // merge the first&second graph op to a new graph op. -void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder, // NOLINT - mlir::pd::GraphOp first, - mlir::pd::GraphOp second) { +void mergeTwoAdjacentCreateEngineOp(mlir::OpBuilder &builder, // NOLINT + CreateEngineOp first, + CreateEngineOp second) { // comput inputs and outputs ::llvm::SmallVector inputs(first.getOperands()), outputs; for (mlir::Value input : second.getOperands()) { @@ -84,7 +84,8 @@ void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder, // NOLINT // create the new graph op builder.setInsertionPoint(first); auto loc = first.getLoc(); - auto graph_op = builder.create(loc, return_types, inputs); + auto graph_op = + builder.create(loc, return_types, inputs, true); mlir::Block *block = new mlir::Block; auto copy_range = second.getBody()->without_terminator(); block->getOperations().splice(block->begin(), @@ -97,7 +98,7 @@ void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder, // NOLINT copy_range.begin(), copy_range.end()); builder.setInsertionPointToEnd(block); - builder.create(loc, outputs); + builder.create<::infrt::dialect::ReturnOp>(loc, outputs); graph_op.body().push_back(block); // mapping the output @@ -149,13 +150,12 @@ void TRTGraphFusePass::runOnFunction() { do { changed = false; for (auto &op : body) { - mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null(&op); + CreateEngineOp graph_op = ::llvm::dyn_cast_or_null(&op); if (nullptr == graph_op) continue; for (auto user_op : op.getUsers()) { - mlir::pd::GraphOp user_graph_op = - ::llvm::dyn_cast_or_null(user_op); + CreateEngineOp user_graph_op = + ::llvm::dyn_cast_or_null(user_op); if (nullptr == user_graph_op) continue; // get all dst input nodes except src. 
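// The check below is the safety condition for fusing: source_nodes collects
// every producer feeding user_graph_op except graph_op itself, and
// reverseDfs() asks whether graph_op is reachable from any of them by walking
// operands backwards. If it is, user_graph_op also depends on graph_op through
// some intermediate op, and merging the two create_engine ops would break the
// topological order, so the pair is skipped; only independent adjacent engines
// are fused.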
std::vector source_nodes; @@ -168,7 +168,7 @@ void TRTGraphFusePass::runOnFunction() { // Reverse DFS from the source_nodes. if (!reverseDfs(source_nodes, [&op](const mlir::Operation *n) { return n == &op; })) { - mergeTwoAdjacentGraphOp(builder, graph_op, user_graph_op); + mergeTwoAdjacentCreateEngineOp(builder, graph_op, user_graph_op); changed = true; break; } diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h index ebd7a4ac4bd..350add905aa 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h @@ -14,6 +14,8 @@ #pragma once #include +#include "paddle/infrt/dialect/infrt_base.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { namespace trt { @@ -26,28 +28,28 @@ namespace trt { * * func @main() -> tensor { * %a = "pd.feed"()... - * %c = "pd.graph"(%a) { + * %c = "trt.create_engine"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.return" %m + * "Infrt.return" %m * } ... - * %d = "pd.graph"(%c) { + * %d = "trt.create_engine"(%c) { * %m = "pd.conv3d"(%c)... - * "pd.return" %m + * "Infrt.return" %m * } ... - * %f = "pd.graph"(%a) { + * %f = "trt.create_engine"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.return" %m + * "Infrt.return" %m * } ... * "pd.fetch" %d, %f * * destination func: * func @main() -> tensor { * %a = "pd.feed"()... - * %d, %f = "pd.graph"(%a) { + * %d, %f = "trt.create_engine"(%a) { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "pd.return" %n, %s + * "Infrt.return" %n, %s * } ... * "pd.fetch" %d, %f * } @@ -55,6 +57,9 @@ namespace trt { class TRTGraphFusePass : public mlir::PassWrapper { public: + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + } ::llvm::StringRef getName() const override { return "trtGraphFusePass"; } void runOnFunction() override; }; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index f24b9cc40cd..5ee7b23213a 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -22,18 +22,17 @@ namespace infrt { namespace trt { // Implementation of the trtGraphSplitPass。 void TRTGraphSplitPass::runOnFunction() { - std::vector worklist; + std::vector worklist; mlir::Block& block = getFunction().front(); for (auto& op : block) { - mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null(&op); + CreateEngineOp graph_op = ::llvm::dyn_cast_or_null(&op); if (nullptr != graph_op && graph_op.getBody()->getOperations().size() <= min_subgraph_size_) { worklist.push_back(graph_op); } } while (!worklist.empty()) { - mlir::pd::GraphOp graph_op = worklist.back(); + CreateEngineOp graph_op = worklist.back(); worklist.pop_back(); mlir::Block* body = graph_op.getBody(); auto return_op = body->getTerminator(); diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h index 51f84227243..28078e2bc2d 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h @@ -14,6 +14,8 @@ #pragma once #include +#include "paddle/infrt/dialect/infrt_base.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { namespace trt { @@ -27,11 +29,11 @@ namespace trt { * * func @main() -> tensor { * %a = "pd.feed"()... 
- * %d, %f = "pd.graph"(%a) { + * %d, %f = "trt.create_engine"(%a) { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "pd.return" (%n, %s) + * "Infrt.return" (%n, %s) * } ... * "pd.fetch" (%d, %f) * } @@ -49,6 +51,9 @@ class TRTGraphSplitPass : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtGraphSplitPass"; } + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + } void runOnFunction() override; explicit TRTGraphSplitPass(size_t min_subgraph_size = 3) : min_subgraph_size_(min_subgraph_size) {} diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc index e34308a2f0f..8d81e739d9c 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" -#include "mlir/IR/Builders.h" -#include "mlir/Transforms/DialectConversion.h" +#include +#include #include "paddle/infrt/dialect/infrt_base.h" #include "paddle/infrt/dialect/pd_ops.h" @@ -22,12 +22,10 @@ namespace trt { #include "paddle/infrt/dialect/tensorrt/pd_lower_to_trt.cpp.inc" // NOLINT -using namespace mlir; - void TRTOpConverterPass::runOnOperation() { // The first thing to define is the conversion target. This will define the // final target for this lowering. - ConversionTarget target(getContext()); + ::mlir::ConversionTarget target(getContext()); // We define the specific operations, or dialects, that are legal targets for // this lowering. In our case, we are lowering to TensorRTDialect from @@ -36,13 +34,13 @@ void TRTOpConverterPass::runOnOperation() { // Now that the conversion target has been defined, we just need to provide // the set of patterns that will lower the TensorRT operations. - RewritePatternSet patterns(&getContext()); + ::mlir::RewritePatternSet patterns(&getContext()); populateWithGenerated(patterns); // With the target and rewrite patterns defined, we can now attempt the // conversion. The conversion will signal failure if any of our `illegal` // operations were not converted successfully. - if (failed( + if (::mlir::failed( applyPartialConversion(getOperation(), target, std::move(patterns)))) signalPassFailure(); } diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h index 0adbf11b891..a8128a585ee 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h @@ -25,11 +25,11 @@ namespace trt { * source ir: * func @main() -> tensor { * %a = "pd.feed"()... - * %d, %f = "pd.graph"(%a) { + * %d, %f = "trt.create_engine"(%a) { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "pd.return" %n, %s + * "Infrt.return" %n, %s * } ... * "pd.fetch" %d, %f * } @@ -37,11 +37,11 @@ namespace trt { * destination ir: * func @main() -> tensor { * %a = "pd.feed"()... - * %d, %f = "pd.graph"(%a) { + * %d, %f = "trt.create_engine"(%a) { * %m = "trt.Convolution"(%a)... * %n = "trt.Convolution"(%m)... * %s = "trt.Convolution"(%a)... - * "pd.return" %n, %s + * "Infrt.return" %n, %s * } ... 
* "pd.fetch" %d, %f * } diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 176fdb7a2e0..17e893a383a 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -15,6 +15,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" #include +#include "paddle/infrt/dialect/basic_kernels.h" #include "paddle/infrt/dialect/pd_ops.h" namespace infrt { @@ -33,16 +34,14 @@ void TRTOpTellerPass::runOnFunction() { auto *op = worklist.back(); worklist.pop_back(); if (op == nullptr) continue; - auto op1 = ::llvm::dyn_cast_or_null(op); - if (op1) continue; - auto op2 = ::llvm::dyn_cast_or_null(op); - if (op2) continue; - auto op3 = ::llvm::dyn_cast_or_null(op); - if (op3) continue; + if (::llvm::dyn_cast_or_null(op)) continue; + if (::llvm::dyn_cast_or_null(op)) continue; + if (::llvm::dyn_cast_or_null(op)) continue; + if (::llvm::dyn_cast_or_null(op)) continue; builder.setInsertionPoint(op); auto loc = getFunction().getLoc(); - auto graph_op = builder.create( - loc, op->getResultTypes(), op->getOperands()); + auto graph_op = builder.create( + loc, op->getResultTypes(), op->getOperands(), true); ::llvm::SmallVector tblgen_repl_values; for (auto v : @@ -55,7 +54,7 @@ void TRTOpTellerPass::runOnFunction() { graph_op.body().push_back(block); op->moveBefore(block, block->begin()); builder.setInsertionPointToEnd(block); - builder.create(loc, op->getResults()); + builder.create<::infrt::dialect::ReturnOp>(loc, op->getResults()); } } } // namespace trt diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h index 8b9a16376ce..471eafa9f9b 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h @@ -14,6 +14,8 @@ #pragma once #include +#include "paddle/infrt/dialect/infrt_base.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { namespace trt { @@ -35,17 +37,17 @@ namespace trt { * destination func: * func @main() -> tensor { * %a = "pd.feed"()... - * %c = "pd.graph"(%a) { + * %c = "trt.create_engine"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.return" (%m) + * "Infrt.return" (%m) * } ... - * %d = "pd.graph"(%c) { + * %d = "trt.create_engine"(%c) { * %m = "pd.conv3d"(%c)... - * "pd.return" (%m) + * "Infrt.return" (%m) * } ... - * %f = "pd.graph"(%a) { + * %f = "trt.create_engine"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.return" (%m) + * "Infrt.return" (%m) * } ... 
* "pd.fetch" (%d, %f) * } @@ -55,6 +57,9 @@ namespace trt { class TRTOpTellerPass : public mlir::PassWrapper { public: + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + } ::llvm::StringRef getName() const override { return "trtOpTellerPass"; } void runOnFunction() override; }; diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h index a37491ec1ab..95b2ed41fdf 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.h +++ b/paddle/infrt/dialect/tensorrt/trt_ops.h @@ -28,6 +28,7 @@ #include #include #include +#include "paddle/infrt/dialect/basic_kernels.h" namespace infrt { namespace trt { diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.td b/paddle/infrt/dialect/tensorrt/trt_ops.td index 8e3dfffff54..31142a5157b 100755 --- a/paddle/infrt/dialect/tensorrt/trt_ops.td +++ b/paddle/infrt/dialect/tensorrt/trt_ops.td @@ -7,25 +7,14 @@ include "mlir/Interfaces/CallInterfaces.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/tensorrt/trt_op_base.td" -def TRT_FetchOp : TRT_Op<"fetch", [Terminator]> { - let summary = "TensorRT engine return operation"; - let description = [{ - The `trt.fetch` operation terminates and returns values for the - `trt.graph` operation. - }]; - - let arguments = (ins Variadic:$inputs); -} - -def TRT_GraphOp : TRT_Op<"graph", [SingleBlockImplicitTerminator<"FetchOp">]> { +def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator<"::infrt::dialect::ReturnOp">]> { let summary = "trt Graph Op"; let description = [{ Describe a tensorrt subgraph. }]; let regions = (region SizedRegion<1>:$body); - let arguments = (ins Variadic:$inputs); + let arguments = (ins Variadic:$inputs, DefaultValuedAttr:$run_once); let results = (outs Variadic:$outputs); - } def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> { diff --git a/paddle/infrt/tests/dialect/disabled_trt_ops.mlir b/paddle/infrt/tests/dialect/trt_ops.mlir similarity index 98% rename from paddle/infrt/tests/dialect/disabled_trt_ops.mlir rename to paddle/infrt/tests/dialect/trt_ops.mlir index b59cfb04816..49510bc542d 100644 --- a/paddle/infrt/tests/dialect/disabled_trt_ops.mlir +++ b/paddle/infrt/tests/dialect/trt_ops.mlir @@ -1,3 +1,4 @@ +// RUN: trt-exec %s // CHECK-LABEL: @main func @main() -> tensor { %bias = "pd.feed"() {name="input0"} : () -> tensor diff --git a/paddle/infrt/tests/lit.cfg.py.in b/paddle/infrt/tests/lit.cfg.py.in index 19ee0076b55..d47957dac92 100644 --- a/paddle/infrt/tests/lit.cfg.py.in +++ b/paddle/infrt/tests/lit.cfg.py.in @@ -21,10 +21,11 @@ build_dir = "@CMAKE_BINARY_DIR@" config.llvm_tools_dir = os.path.join(build_dir, "third_party/install/llvm/bin") config.llvm_tools_dir = os.path.join(build_dir, "/third_party/install/llvm/lib") infrtopt_bin = os.path.join(build_dir, "paddle/infrt/dialect/") +trtexec_bin = os.path.join(build_dir, "paddle/infrt/dialect/tensorrt/") infrtexec_bin = os.path.join(build_dir, "paddle/infrt/host_context/") llvm_bin = os.path.join(build_dir, "third_party/install/llvm/bin/") config.environment['PATH'] = os.path.pathsep.join( - (infrtopt_bin, infrtexec_bin, llvm_bin, config.environment['PATH'])) + (infrtopt_bin, infrtexec_bin, trtexec_bin, llvm_bin, config.environment['PATH'])) config.suffixes = ['.mlir'] -- GitLab From fb0cadfd2fa159a3d949357300a668a9cff75802 Mon Sep 17 00:00:00 2001 From: From00 Date: Wed, 2 Mar 2022 10:05:45 +0800 Subject: [PATCH 033/272] Fix bug for prepare phi OP (#40033) --- paddle/fluid/imperative/prepared_operator.cc | 7 +++---- 1 
file changed, 3 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 9dd1dacc02c..2317bfdd7c0 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -186,11 +186,10 @@ PreparedOp PrepareImpl(const NameVarMap& ins, << " | kernel key: " << pt_kernel_key << " | kernel: " << pt_kernel; - if (platform::is_cpu_place(expected_kernel_key.place_)) { - auto* cpu_ctx = pool.Get(paddle::platform::CPUPlace()); - return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, - pt_kernel, cpu_ctx); + if (expected_kernel_key.place_ != place) { + dev_ctx = pool.Get(expected_kernel_key.place_); } + // TODO(chenweihang): using CPUKernel when miss device kernel case return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, pt_kernel, dev_ctx); -- GitLab From dbcf879758db039d68b5c6018b9229f4548e8702 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Wed, 2 Mar 2022 10:17:29 +0800 Subject: [PATCH 034/272] [Eager] Support gnn ptb_rnn in eager mode (#39993) --- .../paddle/fluid/tests/unittests/test_imperative_gnn.py | 8 +++++++- .../unittests/test_imperative_ptb_rnn_sorted_gradient.py | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py index c813aeede6f..a5a90461551 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py @@ -23,6 +23,7 @@ import paddle.fluid.core as core from paddle.fluid.optimizer import AdamOptimizer from test_imperative_base import new_program_scope from paddle.fluid.dygraph.base import to_variable +from paddle.fluid.framework import _test_eager_guard def gen_data(): @@ -60,7 +61,7 @@ class GCN(fluid.Layer): class TestDygraphGNN(unittest.TestCase): - def test_gnn_float32(self): + def func_gnn_float32(self): paddle.seed(90) paddle.framework.random._manual_program_seed(90) startup = fluid.Program() @@ -168,6 +169,11 @@ class TestDygraphGNN(unittest.TestCase): self.assertTrue(np.allclose(static_weight, model2_gc_weight_value)) sys.stderr.write('%s %s\n' % (static_loss, loss_value)) + def test_gnn_float32(self): + with _test_eager_guard(): + self.func_gnn_float32() + self.func_gnn_float32() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py index e5453eed136..f659d834354 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py @@ -26,10 +26,11 @@ from test_imperative_base import new_program_scope from test_imperative_ptb_rnn import PtbModel import numpy as np import six +from paddle.fluid.framework import _test_eager_guard class TestDygraphPtbRnnSortGradient(unittest.TestCase): - def test_ptb_rnn_sort_gradient(self): + def func_ptb_rnn_sort_gradient(self): for is_sparse in [True, False]: self.ptb_rnn_sort_gradient_cpu_float32(is_sparse) @@ -171,6 +172,11 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase): for key, value in six.iteritems(static_param_updated): self.assertTrue(np.array_equal(value, dy_param_updated[key])) + def test_ptb_rnn_sort_gradient(self): + with _test_eager_guard(): + self.func_ptb_rnn_sort_gradient() + 
self.func_ptb_rnn_sort_gradient() + if __name__ == '__main__': unittest.main() -- GitLab From e4dba69a2fdc793ca399042e688256108e0098fb Mon Sep 17 00:00:00 2001 From: Feiyu Chan Date: Wed, 2 Mar 2022 10:23:15 +0800 Subject: [PATCH 035/272] [Pten] Gru lstm migration (#39729) * move sequence2batch * move lstm and gru * Add phi/kernels directory into exclusion to stop using hipcc to compile non .cu files in it. --- cmake/generic.cmake | 4 +- .../fused/fused_embedding_fc_lstm_op.cc | 6 +- paddle/fluid/operators/fused/fusion_gru_op.cc | 6 +- .../fluid/operators/fused/fusion_lstm_op.cc | 6 +- paddle/fluid/operators/fused/multi_gru_op.cc | 2 +- paddle/fluid/operators/gru_op.cc | 28 +- paddle/fluid/operators/gru_op.cu.cc | 12 +- paddle/fluid/operators/gru_op.h | 22 +- paddle/fluid/operators/lstm_op.h | 38 +- paddle/fluid/operators/lstmp_op.h | 68 +-- paddle/fluid/operators/math/CMakeLists.txt | 6 +- paddle/fluid/operators/math/gru_compute.h | 80 ---- paddle/fluid/operators/math/lstm_compute.cc | 93 ---- paddle/fluid/operators/math/lstm_compute.cu | 59 --- paddle/fluid/operators/rnn_op.h | 64 +-- paddle/phi/kernels/funcs/CMakeLists.txt | 4 + .../kernels/funcs}/detail/CMakeLists.txt | 0 .../funcs}/detail/activation_functions.h | 68 +-- .../kernels/funcs}/detail/avx_functions.cc | 19 +- .../kernels/funcs}/detail/avx_mathfun.h | 6 +- .../kernels/funcs}/detail/gru_cpu_kernel.h | 451 ++++++++++++------ .../kernels/funcs}/detail/gru_gpu_kernel.h | 106 ++-- .../kernels/funcs}/detail/gru_kernel.h | 150 +++--- .../kernels/funcs}/detail/lstm_cpu_kernel.h | 266 ++++++++--- .../kernels/funcs}/detail/lstm_gpu_kernel.h | 159 ++++-- .../kernels/funcs}/detail/lstm_kernel.h | 123 +++-- paddle/phi/kernels/funcs/gru_compute.cc | 373 +++++++++++++++ paddle/phi/kernels/funcs/gru_compute.cu | 349 ++++++++++++++ paddle/phi/kernels/funcs/gru_compute.h | 88 ++++ paddle/phi/kernels/funcs/lstm_compute.cc | 103 ++++ paddle/phi/kernels/funcs/lstm_compute.cu | 76 +++ .../math => phi/kernels/funcs}/lstm_compute.h | 39 +- .../kernels/funcs}/sequence2batch.cc | 62 +-- .../kernels/funcs}/sequence2batch.cu | 72 +-- .../kernels/funcs}/sequence2batch.h | 66 +-- 35 files changed, 2181 insertions(+), 893 deletions(-) delete mode 100644 paddle/fluid/operators/math/gru_compute.h delete mode 100644 paddle/fluid/operators/math/lstm_compute.cc delete mode 100644 paddle/fluid/operators/math/lstm_compute.cu rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/CMakeLists.txt (100%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/activation_functions.h (75%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/avx_functions.cc (87%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/avx_mathfun.h (99%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/gru_cpu_kernel.h (60%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/gru_gpu_kernel.h (74%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/gru_kernel.h (64%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/lstm_cpu_kernel.h (65%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/lstm_gpu_kernel.h (68%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/detail/lstm_kernel.h (59%) create mode 100644 paddle/phi/kernels/funcs/gru_compute.cc create mode 100644 paddle/phi/kernels/funcs/gru_compute.cu create mode 100644 paddle/phi/kernels/funcs/gru_compute.h create mode 100644 paddle/phi/kernels/funcs/lstm_compute.cc create mode 100644 
paddle/phi/kernels/funcs/lstm_compute.cu rename paddle/{fluid/operators/math => phi/kernels/funcs}/lstm_compute.h (56%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/sequence2batch.cc (56%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/sequence2batch.cu (55%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/sequence2batch.h (80%) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 51ed537ce5d..da81575188f 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -580,8 +580,8 @@ function(hip_library TARGET_NAME) cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if(hip_library_SRCS) # FindHIP.cmake defined hip_add_library, HIP_SOURCE_PROPERTY_FORMAT is requried if no .cu files found - if(NOT ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators") - set_source_files_properties(${hip_library_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + if(NOT (${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators" OR ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/phi/kernels")) + set_source_files_properties(${hip_library_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) endif() if (hip_library_SHARED OR hip_library_shared) # build *.so hip_add_library(${TARGET_NAME} SHARED ${hip_library_SRCS}) diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 56c2c86e1a7..0c83c36b475 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -15,9 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h" #include #include "paddle/fluid/operators/math/cpu_vec.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { namespace operators { @@ -473,7 +473,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { hidden_out->mutable_data(place); cell_out->mutable_data(place); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& dev_ctx = ctx.template device_context(); auto blas = phi::funcs::GetBlas(dev_ctx); @@ -591,7 +591,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { #undef MOVE_ONE_BATCH #undef DEFINE_CUR - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batched_h_out->set_lod(batched_lod); to_seq(dev_ctx, *batched_h_out, hidden_out); batched_c_out->set_lod(batched_lod); diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 41a69031c54..3311e3b4ebc 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -19,8 +19,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/fc.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -368,7 +368,7 @@ class FusionGRUKernel : public framework::OpKernel { hidden_out->mutable_data(place); auto& dev_ctx = ctx.template device_context(); auto blas = phi::funcs::GetBlas(dev_ctx); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; math::FCFunctor fc; if (M > D3) { @@ -463,7 +463,7 @@ class FusionGRUKernel : public framework::OpKernel { batched_input_data = cur_batched_data; } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batched_out->set_lod(batched_lod); to_seq(dev_ctx, *batched_out, hidden_out); } diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 06d406867f0..00be8b09d12 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/fc.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -421,7 +421,7 @@ class FuisonLSTMKernel : public framework::OpKernel { hidden_out->mutable_data(place); cell_out->mutable_data(place); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& dev_ctx = ctx.template device_context(); auto blas = phi::funcs::GetBlas(dev_ctx); math::FCFunctor fc; @@ -514,7 +514,7 @@ class FuisonLSTMKernel : public framework::OpKernel { batched_input_data = cur_in_data; } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batched_h_out->set_lod(batched_lod); to_seq(dev_ctx, *batched_h_out, hidden_out); batched_c_out->set_lod(batched_lod); diff --git a/paddle/fluid/operators/fused/multi_gru_op.cc b/paddle/fluid/operators/fused/multi_gru_op.cc index 84826ff3993..c2260c53b2e 100644 --- a/paddle/fluid/operators/fused/multi_gru_op.cc +++ b/paddle/fluid/operators/fused/multi_gru_op.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/fc.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index 88530b5352d..d7cf03ddd61 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -15,9 +15,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/gru_op.h" #include #include -#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" -#include "paddle/fluid/operators/math/detail/gru_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h" +#include "paddle/phi/kernels/funcs/detail/gru_kernel.h" DECLARE_int32(paddle_num_threads); @@ -316,7 +316,7 @@ class GRUCPUKernel : public framework::OpKernel { batch_hidden->mutable_data(context.GetPlace()); bool is_reverse = context.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& dev_ctx = context.template device_context(); to_batch(dev_ctx, *input, batch_gate, true, is_reverse); @@ -326,7 +326,7 @@ class GRUCPUKernel : public framework::OpKernel { } int frame_size = hidden_dims[1]; - math::GRUMetaValue gru_value; + phi::funcs::GRUMetaValue gru_value; gru_value.gate_weight = const_cast(weight_data); gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); @@ -347,9 +347,9 @@ class GRUCPUKernel : public framework::OpKernel { } auto batch_starts = batch_gate->lod()[0]; size_t seq_len = batch_starts.size() - 1; - auto active_node = math::detail::GetActivationType( + auto active_node = phi::funcs::detail::GetActivationType( context.Attr("activation")); - auto active_gate = math::detail::GetActivationType( + auto active_gate = phi::funcs::detail::GetActivationType( context.Attr("gate_activation")); #ifdef PADDLE_WITH_MKLML @@ -396,9 +396,9 @@ class GRUCPUKernel : public framework::OpKernel { frame_size * 2, T(1), gru_value.gate_value, frame_size * 3); } - math::detail::forward_reset_output( - math::detail::forward::gru_resetOutput(), gru_value, frame_size, - cur_batch_size, active_gate); + phi::funcs::detail::forward_reset_output( + phi::funcs::detail::forward::gru_resetOutput(), gru_value, + frame_size, cur_batch_size, active_gate); if (gru_value.prev_out_value) { blas.GEMM_COMPUTE( @@ -408,9 +408,9 @@ class GRUCPUKernel : public framework::OpKernel { frame_size * 3); } - math::detail::forward_final_output( - math::detail::forward::gru_finalOutput(), gru_value, frame_size, - cur_batch_size, active_node, origin_mode); + phi::funcs::detail::forward_final_output( + phi::funcs::detail::forward::gru_finalOutput(), gru_value, + frame_size, cur_batch_size, active_node, origin_mode); gru_value.prev_out_value = gru_value.output_value; } @@ -432,7 +432,7 @@ class GRUCPUKernel : public framework::OpKernel { gru_value.gate_value = gate_t.data(); gru_value.reset_output_value = reset_hidden_prev_t.data(); - math::GRUUnitFunctor::compute( + phi::funcs::GRUUnitFunctor::compute( dev_ctx, gru_value, frame_size, cur_batch_size, active_node, active_gate, origin_mode); @@ -441,7 +441,7 @@ class GRUCPUKernel : public framework::OpKernel { #ifdef PADDLE_WITH_MKLML } #endif - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_hidden->set_lod(batch_gate->lod()); to_seq(dev_ctx, *batch_hidden, hidden); } diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc index 7d055240916..5be0acc1543 100644 --- a/paddle/fluid/operators/gru_op.cu.cc +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -65,7 +65,7 @@ class GRUKernel : public framework::OpKernel { batch_hidden->mutable_data(context.GetPlace()); bool is_reverse = context.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& dev_ctx = context.template device_context(); 
to_batch(dev_ctx, *input, batch_gate, true, is_reverse); @@ -75,7 +75,7 @@ class GRUKernel : public framework::OpKernel { } int frame_size = hidden_dims[1]; - math::GRUMetaValue gru_value; + phi::funcs::GRUMetaValue gru_value; gru_value.gate_weight = const_cast(weight_data); gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); @@ -96,9 +96,9 @@ class GRUKernel : public framework::OpKernel { } auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; - auto active_node = math::detail::GetActivationType( + auto active_node = phi::funcs::detail::GetActivationType( context.Attr("activation")); - auto active_gate = math::detail::GetActivationType( + auto active_gate = phi::funcs::detail::GetActivationType( context.Attr("gate_activation")); for (size_t n = 0; n < num_batch; n++) { int bstart = static_cast(batch_starts[n]); @@ -111,13 +111,13 @@ class GRUKernel : public framework::OpKernel { gru_value.output_value = hidden_t.data(); gru_value.gate_value = gate_t.data(); gru_value.reset_output_value = reset_hidden_prev_t.data(); - math::GRUUnitFunctor::compute( + phi::funcs::GRUUnitFunctor::compute( dev_ctx, gru_value, frame_size, cur_batch_size, active_node, active_gate, origin_mode); gru_value.prev_out_value = gru_value.output_value; } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_hidden->set_lod(batch_gate->lod()); to_seq(dev_ctx, *batch_hidden, hidden); } diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h index 130b10c7390..852655034c8 100644 --- a/paddle/fluid/operators/gru_op.h +++ b/paddle/fluid/operators/gru_op.h @@ -16,10 +16,10 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/gru_compute.h" -#include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/gru_compute.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { namespace operators { @@ -32,7 +32,7 @@ inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, framework::Vector index_lod, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + phi::funcs::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index_lod, dst, indexed_src); } @@ -63,7 +63,7 @@ class GRUGradKernel : public framework::OpKernel { auto hidden_dims = hidden->dims(); int frame_size = hidden_dims[1]; - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad; batch_hidden_grad.mutable_data(hidden_dims, context.GetPlace()); batch_gate_grad.mutable_data(gate_dims, context.GetPlace()); @@ -93,12 +93,12 @@ class GRUGradKernel : public framework::OpKernel { batch_hidden_grad.set_lod(batch_hidden->lod()); to_batch(dev_ctx, *hidden_grad, &batch_hidden_grad, false, is_reverse); - math::GRUMetaValue gru_value; + phi::funcs::GRUMetaValue gru_value; gru_value.gate_weight = const_cast(weight_data); gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); - math::GRUMetaGrad gru_grad; + phi::funcs::GRUMetaGrad gru_grad; if (weight_grad) { 
gru_grad.gate_weight_grad = weight_grad->mutable_data(context.GetPlace()); @@ -112,9 +112,9 @@ class GRUGradKernel : public framework::OpKernel { auto batch_starts = batch_hidden_grad.lod()[0]; size_t num_batch = batch_starts.size() - 1; - auto active_node = math::detail::GetActivationType( + auto active_node = phi::funcs::detail::GetActivationType( context.Attr("activation")); - auto active_gate = math::detail::GetActivationType( + auto active_gate = phi::funcs::detail::GetActivationType( context.Attr("gate_activation")); for (int n = static_cast(num_batch) - 1; n >= 0; n--) { int bstart = static_cast(batch_starts[n]); @@ -145,13 +145,13 @@ class GRUGradKernel : public framework::OpKernel { gru_grad.prev_out_grad = hidden_prev_grad_t.data(); } gru_value.output_value = nullptr; - math::GRUUnitGradFunctor::compute( + phi::funcs::GRUUnitGradFunctor::compute( dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, active_node, active_gate, origin_mode); } if (input_grad) { input_grad->mutable_data(context.GetPlace()); - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_gate_grad.set_lod(batch_gate->lod()); to_seq(dev_ctx, batch_gate_grad, input_grad); } diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h index 62f9cd26c41..4ec3072a96d 100644 --- a/paddle/fluid/operators/lstm_op.h +++ b/paddle/fluid/operators/lstm_op.h @@ -15,10 +15,10 @@ limitations under the License. */ #pragma once #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/lstm_compute.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { namespace operators { @@ -31,7 +31,7 @@ inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, framework::Vector index_lod, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + phi::funcs::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index_lod, dst, indexed_src); } @@ -64,7 +64,7 @@ class LSTMKernel : public framework::OpKernel { cell_out->mutable_data(ctx.GetPlace()); bool is_reverse = ctx.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& device_ctx = ctx.template device_context(); to_batch(device_ctx, *input, batch_gate, true, is_reverse); @@ -80,7 +80,7 @@ class LSTMKernel : public framework::OpKernel { add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } - math::LstmMetaValue lstm_value; + phi::funcs::LstmMetaValue lstm_value; if (bias && ctx.Attr("use_peepholes")) { T* bias_data = const_cast(bias->data()); // the code style in LstmMetaValue will be updated later. 
@@ -121,11 +121,11 @@ class LSTMKernel : public framework::OpKernel { auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; - auto gate_act = math::detail::GetActivationType( + auto gate_act = phi::funcs::detail::GetActivationType( ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( + auto cell_act = phi::funcs::detail::GetActivationType( ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( + auto cand_act = phi::funcs::detail::GetActivationType( ctx.Attr("candidate_activation")); auto blas = phi::funcs::GetBlas(device_ctx); @@ -166,13 +166,13 @@ class LSTMKernel : public framework::OpKernel { lstm_value.state_value = cell_t.data(); lstm_value.state_active_value = cell_pre_act_t.data(); T cell_clip = 0.0; - math::LstmUnitFunctor::compute( + phi::funcs::LstmUnitFunctor::compute( device_ctx, lstm_value, frame_size, cur_batch_size, cell_clip, gate_act, cell_act, cand_act); lstm_value.prev_state_value = lstm_value.state_value; } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_hidden.set_lod(batch_gate->lod()); // restore the output hidden in LoDTensor from the batch hidden to_seq(device_ctx, batch_hidden, hidden_out); @@ -241,7 +241,7 @@ class LSTMGradKernel : public framework::OpKernel { ") should be %d, but received %d in LSTM@Grad operator.", frame_size, out_dims[1])); - math::LstmMetaValue lstm_value; + phi::funcs::LstmMetaValue lstm_value; if (bias && ctx.Attr("use_peepholes")) { T* bias_data = const_cast(bias->data()); lstm_value.check_ig = bias_data + 4 * frame_size; @@ -253,7 +253,7 @@ class LSTMGradKernel : public framework::OpKernel { lstm_value.check_og = nullptr; } - math::LstmMetaGrad lstm_grad; + phi::funcs::LstmMetaGrad lstm_grad; if (bias && bias_g) { bias_g->mutable_data(ctx.GetPlace()); @@ -270,7 +270,7 @@ class LSTMGradKernel : public framework::OpKernel { lstm_grad.check_og_grad = nullptr; } - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto ToBatch = [&batch_gate, &to_batch]( const DeviceContext& ctx, const framework::LoDTensor& src, @@ -293,11 +293,11 @@ class LSTMGradKernel : public framework::OpKernel { batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); batch_gate_g.set_lod(batch_gate->lod()); - auto gate_act = math::detail::GetActivationType( + auto gate_act = phi::funcs::detail::GetActivationType( ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( + auto cell_act = phi::funcs::detail::GetActivationType( ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( + auto cand_act = phi::funcs::detail::GetActivationType( ctx.Attr("candidate_activation")); auto batch_starts = batch_gate->lod()[0]; @@ -338,7 +338,7 @@ class LSTMGradKernel : public framework::OpKernel { lstm_grad.state_active_grad = nullptr; int cur_batch_size = bend - bstart; T cell_clip = 0.0; - math::LstmUnitGradFunctor::compute( + phi::funcs::LstmUnitGradFunctor::compute( device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size, cell_clip, gate_act, cell_act, cand_act); @@ -369,7 +369,7 @@ class LSTMGradKernel : public framework::OpKernel { } } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; if (in_g) { /* backward data */ in_g->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h index 96c074f1efb..5d24c0b70d3 100644 --- a/paddle/fluid/operators/lstmp_op.h +++ 
b/paddle/fluid/operators/lstmp_op.h @@ -18,12 +18,12 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/lstm_compute.h" -#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/transform.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { namespace operators { @@ -72,7 +72,7 @@ inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, framework::Vector index, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + phi::funcs::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index, dst, indexed_src); } @@ -81,15 +81,15 @@ template class LSTMPKernel : public framework::OpKernel { public: template - void ActCompute(const math::detail::ActivationType act_type, const Device& d, - X x, Y y, platform::Place place) const { - if (act_type == math::detail::ActivationType::kIdentity) { + void ActCompute(const phi::funcs::detail::ActivationType act_type, + const Device& d, X x, Y y, platform::Place place) const { + if (act_type == phi::funcs::detail::ActivationType::kIdentity) { y.device(d) = x; - } else if (act_type == math::detail::ActivationType::kSigmoid) { + } else if (act_type == phi::funcs::detail::ActivationType::kSigmoid) { SigmoidFunctor()(d, x, y); - } else if (act_type == math::detail::ActivationType::kTanh) { + } else if (act_type == phi::funcs::detail::ActivationType::kTanh) { TanhFunctor()(d, x, y); - } else if (act_type == math::detail::ActivationType::kReLU) { + } else if (act_type == phi::funcs::detail::ActivationType::kReLU) { if (place == platform::CPUPlace()) ReluCPUFunctor()(d, x, y); else @@ -120,7 +120,7 @@ class LSTMPKernel : public framework::OpKernel { cell_out->mutable_data(ctx.GetPlace()); bool is_reverse = ctx.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto& device_ctx = ctx.template device_context(); to_batch(device_ctx, *input, batch_gate, true, is_reverse); @@ -137,7 +137,7 @@ class LSTMPKernel : public framework::OpKernel { add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } - math::LstmMetaValue lstmp_value; + phi::funcs::LstmMetaValue lstmp_value; if (bias && ctx.Attr("use_peepholes")) { T* bias_data = const_cast(bias->data()); // the code style in LstmpMetaValue will be updated later. 
@@ -176,13 +176,13 @@ class LSTMPKernel : public framework::OpKernel { auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; - auto gate_act = math::detail::GetActivationType( + auto gate_act = phi::funcs::detail::GetActivationType( ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( + auto cell_act = phi::funcs::detail::GetActivationType( ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( + auto cand_act = phi::funcs::detail::GetActivationType( ctx.Attr("candidate_activation")); - auto proj_act = math::detail::GetActivationType( + auto proj_act = phi::funcs::detail::GetActivationType( ctx.Attr("proj_activation")); auto& place = *ctx.template device_context().eigen_device(); auto blas = phi::funcs::GetBlas(device_ctx); @@ -222,13 +222,13 @@ class LSTMPKernel : public framework::OpKernel { lstmp_value.output_value = hidden_t.data(); lstmp_value.state_value = cell_t.data(); lstmp_value.state_active_value = cell_pre_act_t.data(); - math::LstmUnitFunctor::compute( + phi::funcs::LstmUnitFunctor::compute( device_ctx, lstmp_value, frame_size, cur_batch_size, cell_clip, gate_act, cell_act, cand_act); lstmp_value.prev_state_value = lstmp_value.state_value; blas.MatMul(hidden_t, false, *proj_weight, false, static_cast(1.0), &proj_t, static_cast(0.0)); - if (proj_act != math::detail::ActivationType::kIdentity) { + if (proj_act != phi::funcs::detail::ActivationType::kIdentity) { auto proj_t_dev = EigenMatrix::From(proj_t); ActCompute(cell_act, place, proj_t_dev, proj_t_dev, ctx.GetPlace()); } @@ -242,7 +242,7 @@ class LSTMPKernel : public framework::OpKernel { } } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; batch_proj.set_lod(batch_gate->lod()); // restore the output hidden in LoDTensor from the batch hidden to_seq(device_ctx, batch_proj, proj_out); @@ -257,16 +257,16 @@ template class LSTMPGradKernel : public framework::OpKernel { public: template - void ActGradCompute(const math::detail::ActivationType act_type, + void ActGradCompute(const phi::funcs::detail::ActivationType act_type, const Device& d, X x, Y y, DX dx, DY dy) const { // x is dummy and won't be used even in Relu(use y instead) - if (act_type == math::detail::ActivationType::kIdentity) + if (act_type == phi::funcs::detail::ActivationType::kIdentity) dx.device(d) = dy; - else if (act_type == math::detail::ActivationType::kSigmoid) + else if (act_type == phi::funcs::detail::ActivationType::kSigmoid) SigmoidGradFunctor()(d, x, y, dy, dx); - else if (act_type == math::detail::ActivationType::kTanh) + else if (act_type == phi::funcs::detail::ActivationType::kTanh) TanhGradFunctor()(d, x, y, dy, dx); - else if (act_type == math::detail::ActivationType::kReLU) + else if (act_type == phi::funcs::detail::ActivationType::kReLU) ReluGradFunctor()(d, x, y, dy, dx); else PADDLE_THROW( @@ -340,7 +340,7 @@ class LSTMPGradKernel : public framework::OpKernel { "but received %d in LSTMP@Grad operator.", frame_size, out_dims[1])); - math::LstmMetaValue lstmp_value; + phi::funcs::LstmMetaValue lstmp_value; if (bias && ctx.Attr("use_peepholes")) { T* bias_data = const_cast(bias->data()); lstmp_value.check_ig = bias_data + 4 * frame_size; @@ -352,7 +352,7 @@ class LSTMPGradKernel : public framework::OpKernel { lstmp_value.check_og = nullptr; } - math::LstmMetaGrad lstmp_grad; + phi::funcs::LstmMetaGrad lstmp_grad; if (bias && bias_g) { bias_g->mutable_data(ctx.GetPlace()); @@ -369,7 +369,7 @@ class LSTMPGradKernel : public 
framework::OpKernel { lstmp_grad.check_og_grad = nullptr; } - math::LoDTensor2BatchFunctor to_batch; + phi::funcs::LoDTensor2BatchFunctor to_batch; auto ToBatch = [&batch_gate, &to_batch]( const DeviceContext& ctx, const framework::LoDTensor& src, @@ -393,13 +393,13 @@ class LSTMPGradKernel : public framework::OpKernel { batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); batch_gate_g.set_lod(batch_gate->lod()); - auto gate_act = math::detail::GetActivationType( + auto gate_act = phi::funcs::detail::GetActivationType( ctx.Attr("gate_activation")); - auto cell_act = math::detail::GetActivationType( + auto cell_act = phi::funcs::detail::GetActivationType( ctx.Attr("cell_activation")); - auto cand_act = math::detail::GetActivationType( + auto cand_act = phi::funcs::detail::GetActivationType( ctx.Attr("candidate_activation")); - auto proj_act = math::detail::GetActivationType( + auto proj_act = phi::funcs::detail::GetActivationType( ctx.Attr("proj_activation")); auto& place = *ctx.template device_context().eigen_device(); @@ -423,7 +423,7 @@ class LSTMPGradKernel : public framework::OpKernel { _ClipGradFunctor(-1.0 * proj_clip, proj_clip)); } - if (proj_act != math::detail::ActivationType::kIdentity) { + if (proj_act != phi::funcs::detail::ActivationType::kIdentity) { auto cur_proj_dev = EigenMatrix::From(cur_proj); auto proj_g_dev = EigenMatrix::From(proj_g); ActGradCompute(cell_act, place, cur_proj_dev, cur_proj_dev, proj_g_dev, @@ -470,7 +470,7 @@ class LSTMPGradKernel : public framework::OpKernel { lstmp_value.output_value = nullptr; lstmp_grad.state_active_grad = nullptr; - math::LstmUnitGradFunctor::compute( + phi::funcs::LstmUnitGradFunctor::compute( device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size, cell_clip, gate_act, cell_act, cand_act); @@ -503,7 +503,7 @@ class LSTMPGradKernel : public framework::OpKernel { } } - math::Batch2LoDTensorFunctor to_seq; + phi::funcs::Batch2LoDTensorFunctor to_seq; if (in_g) { /* backward data */ in_g->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index ac6566a8703..ba047355ad7 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -1,5 +1,3 @@ -add_subdirectory(detail) - if (WITH_ASCEND_CL) cc_library(beam_search_npu SRCS beam_search_npu.cc DEPS npu_op_runner) endif() @@ -18,8 +16,7 @@ math_library(im2col) math_library(sample_prob) math_library(sampler DEPS generator) -math_library(gru_compute DEPS activation_functions math_function) -math_library(lstm_compute DEPS activation_functions) +# math_library(math_function DEPS blas dense_tensor tensor) math_library(maxouting) math_library(pooling) @@ -29,7 +26,6 @@ else() math_library(selected_rows_functor DEPS selected_rows_utils math_function blas) endif() -math_library(sequence2batch) math_library(sequence_padding) math_library(sequence_pooling DEPS math_function jit_kernel_helper) math_library(sequence_scale) diff --git a/paddle/fluid/operators/math/gru_compute.h b/paddle/fluid/operators/math/gru_compute.h deleted file mode 100644 index 70cbfecefc8..00000000000 --- a/paddle/fluid/operators/math/gru_compute.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace math { - -template -struct GRUMetaValue { - const T *gate_weight; - const T *state_weight; - const T *reset_bias; - T *gate_value; - T *reset_output_value; - T *output_value; - const T *prev_out_value; -}; - -template -struct GRUMetaGrad { - T *gate_weight_grad; - T *state_weight_grad; - T *gate_grad; - T *reset_output_grad; - T *output_grad; - T *prev_out_grad; - T *bias_hh_grad; -}; - -template -struct GRUUnitFunctor { - static void compute(const DeviceContext &context, GRUMetaValue value, - int frame_size, int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate, - bool origin_mode); -}; - -template -struct GRUUnitGradFunctor { - static void compute(const DeviceContext &context, GRUMetaValue value, - GRUMetaGrad grad, int frame_size, int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate, - bool origin_mode); -}; - -template -struct GRUUnitFunctorV2 { - static void compute(const DeviceContext &context, GRUMetaValue value, - int frame_size, int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate); -}; - -template -struct GRUUnitGradFunctorV2 { - static void compute(const DeviceContext &context, GRUMetaValue value, - GRUMetaGrad grad, int frame_size, int batch_size, - const detail::ActivationType active_node, - const detail::ActivationType active_gate); -}; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/lstm_compute.cc b/paddle/fluid/operators/math/lstm_compute.cc deleted file mode 100644 index aa4fe65a520..00000000000 --- a/paddle/fluid/operators/math/lstm_compute.cc +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/math/lstm_compute.h" - -#include "paddle/fluid/operators/math/detail/lstm_cpu_kernel.h" -#include "paddle/fluid/operators/math/detail/lstm_kernel.h" - -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { - -template -struct LstmUnitFunctor { - static void compute(const platform::CPUDeviceContext& context, - LstmMetaValue value, int frame_size, int batch_size, - T cell_clip, const detail::ActivationType& gate_act, - const detail::ActivationType& cell_act, - const detail::ActivationType& cand_act, - bool old_api_version = true) { - for (int b = 0; b < batch_size; b++) { - detail::cpu_lstm_forward(context, detail::forward::lstm(), value, - frame_size, cell_clip, cand_act, gate_act, - cell_act, old_api_version); - value.gate_value += frame_size * 4; - value.state_value += frame_size; - value.state_active_value += frame_size; - value.output_value += frame_size; - if (value.prev_state_value) { - value.prev_state_value += frame_size; - } - } - } -}; - -template -struct LstmUnitGradFunctor { - static void compute(const platform::CPUDeviceContext& context, - LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, T cell_clip, - const detail::ActivationType& gate_act, - const detail::ActivationType& cell_act, - const detail::ActivationType& cand_act, - bool old_api_version = true) { - for (int b = 0; b < batch_size; b++) { - detail::cpu_lstm_backward(context, detail::backward::lstm(), value, - grad, frame_size, cell_clip, cand_act, gate_act, - cell_act, old_api_version); - - value.gate_value += frame_size * 4; - value.state_value += frame_size; - value.state_active_value += frame_size; - value.output_value += frame_size; - if (value.prev_state_value) { - value.prev_state_value += frame_size; - } - - grad.gate_grad += frame_size * 4; - grad.state_grad += frame_size; - grad.state_active_grad += frame_size; - grad.output_grad += frame_size; - if (grad.prev_state_grad) { - grad.prev_state_grad += frame_size; - } - } - } -}; - -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/lstm_compute.cu b/paddle/fluid/operators/math/lstm_compute.cu deleted file mode 100644 index 4342cb7b799..00000000000 --- a/paddle/fluid/operators/math/lstm_compute.cu +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/math/detail/lstm_gpu_kernel.h" -#include "paddle/fluid/operators/math/detail/lstm_kernel.h" -#include "paddle/fluid/operators/math/lstm_compute.h" - -namespace paddle { -namespace operators { -namespace math { - -template -struct LstmUnitFunctor { - static void compute(const platform::CUDADeviceContext& context, - LstmMetaValue value, int frame_size, int batch_size, - T cell_clip, const detail::ActivationType& gate_act, - const detail::ActivationType& cell_act, - const detail::ActivationType& cand_act, - bool old_api_version = true) { - detail::gpu_lstm_forward(context, detail::forward::lstm(), value, - frame_size, batch_size, cell_clip, cand_act, - gate_act, cell_act); - } -}; - -template -struct LstmUnitGradFunctor { - static void compute(const platform::CUDADeviceContext& context, - LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, T cell_clip, - const detail::ActivationType& gate_act, - const detail::ActivationType& cell_act, - const detail::ActivationType& cand_act, - bool old_api_version = true) { - detail::gpu_lstm_backward(context, detail::backward::lstm(), value, grad, - frame_size, batch_size, cell_clip, cand_act, - gate_act, cell_act); - } -}; - -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/rnn_op.h b/paddle/fluid/operators/rnn_op.h index c18570af775..b636184ae45 100644 --- a/paddle/fluid/operators/rnn_op.h +++ b/paddle/fluid/operators/rnn_op.h @@ -20,13 +20,13 @@ limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/fc.h" -#include "paddle/fluid/operators/math/gru_compute.h" -#include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/unique_op.h" #include "paddle/fluid/operators/utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/gru_compute.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -100,7 +100,7 @@ struct Cell { }; template class EigenActivationFunctor, - math::detail::ActivationType act_type> + phi::funcs::detail::ActivationType act_type> struct SimpleRNNCell : Cell { void operator()(const platform::CPUDeviceContext* device_ctx, Tensor* input, const Tensor* weight_hh, const Tensor* init_h, @@ -148,7 +148,7 @@ struct GRUCell : Cell { size_t frame_size = init_h->dims()[2]; size_t batch_size = init_h->dims()[1]; - math::GRUMetaValue gru_value; + phi::funcs::GRUMetaValue gru_value; gru_value.gate_weight = weight_hh->data(); gru_value.state_weight = weight_hh->data() + 2 * frame_size * frame_size; gru_value.reset_bias = bias_hh->data() + 2 * frame_size; @@ -158,10 +158,10 @@ struct GRUCell : Cell { gru_value.output_value = output->data(); gru_value.prev_out_value = init_h->data(); - auto gate_act = math::detail::GetActivationType("sigmoid_v2"); - auto cand_act = math::detail::GetActivationType("tanh_v2"); + auto gate_act = phi::funcs::detail::GetActivationType("sigmoid_v2"); + auto cand_act = phi::funcs::detail::GetActivationType("tanh_v2"); - 
math::GRUUnitFunctorV2::compute( + phi::funcs::GRUUnitFunctorV2::compute( *device_ctx, gru_value, frame_size, batch_size, cand_act, gate_act); } }; @@ -184,14 +184,14 @@ struct LSTMCell : Cell { blas.MatMul(*init_h, mat_dim_a, *weight_hh, mat_dim_b, static_cast(1.0), input, static_cast(1.0)); - math::LstmMetaValue lstm_value; + phi::funcs::LstmMetaValue lstm_value; lstm_value.check_ig = nullptr; lstm_value.check_fg = nullptr; lstm_value.check_og = nullptr; - auto gate_act = math::detail::GetActivationType("sigmoid_v2"); - auto cell_act = math::detail::GetActivationType("tanh_v2"); - auto cand_act = math::detail::GetActivationType("tanh_v2"); + auto gate_act = phi::funcs::detail::GetActivationType("sigmoid_v2"); + auto cell_act = phi::funcs::detail::GetActivationType("tanh_v2"); + auto cand_act = phi::funcs::detail::GetActivationType("tanh_v2"); size_t frame_size = init_h->dims()[2]; size_t batch_size = init_h->dims()[1]; @@ -208,7 +208,7 @@ struct LSTMCell : Cell { lstm_value.state_value = last_c->data(); lstm_value.state_active_value = last_c_act->data(); T cell_clip = 0.0; - math::LstmUnitFunctor::compute( + phi::funcs::LstmUnitFunctor::compute( *device_ctx, lstm_value, frame_size, batch_size, cell_clip, gate_act, cell_act, cand_act, false); } @@ -986,18 +986,18 @@ class RNNCPUKernel : public framework::OpKernel { seed, reserve_data); } else if (is_rnn_relu(ctx)) { gate_num = 1; - RnnFunc< - SimpleRNNCell, - Layer, SingleLayer, BidirLayer, T>( + RnnFunc, + Layer, SingleLayer, BidirLayer, T>( ctx, input, weight_list, pre_state[0], nullptr, sequence_length, state[0], nullptr, output, dropout_mask, num_layers, gate_num, input_size, hidden_size, is_bidirec, mode, dropout_prob, is_test, seed, reserve_data); } else if (is_rnn_tanh(ctx)) { gate_num = 1; - RnnFunc< - SimpleRNNCell, - Layer, SingleLayer, BidirLayer, T>( + RnnFunc, + Layer, SingleLayer, BidirLayer, T>( ctx, input, weight_list, pre_state[0], nullptr, sequence_length, state[0], nullptr, output, dropout_mask, num_layers, gate_num, input_size, hidden_size, is_bidirec, mode, dropout_prob, is_test, @@ -1014,14 +1014,14 @@ class RNNCPUKernel : public framework::OpKernel { }; template -void create_lstm_value(math::LstmMetaValue* lstm_value) { +void create_lstm_value(phi::funcs::LstmMetaValue* lstm_value) { lstm_value->check_ig = nullptr; lstm_value->check_fg = nullptr; lstm_value->check_og = nullptr; } template -void create_lstm_grad(math::LstmMetaGrad* lstm_grad) { +void create_lstm_grad(phi::funcs::LstmMetaGrad* lstm_grad) { lstm_grad->check_ig_grad = nullptr; lstm_grad->check_fg_grad = nullptr; lstm_grad->check_og_grad = nullptr; @@ -1686,8 +1686,8 @@ struct GRUGradCell : GradCell { // zero pre_hidden phi::funcs::SetConstant zero; zero(device_ctx, grad_pre_hidden, static_cast(0.0)); - math::GRUMetaValue gru_value; - math::GRUMetaGrad gru_grad; + phi::funcs::GRUMetaValue gru_value; + phi::funcs::GRUMetaGrad gru_grad; gru_value.gate_value = gate_tensor->data(); gru_value.prev_out_value = pre_hidden->data(); gru_value.reset_output_value = state_tensor->data(); @@ -1703,9 +1703,9 @@ struct GRUGradCell : GradCell { grad_weight_hh->data() + 2 * frame_size * frame_size; gru_grad.bias_hh_grad = grad_bias_hh->data(); - auto act_gate = math::detail::GetActivationType("sigmoid_v2"); - auto act_node = math::detail::GetActivationType("tanh_v2"); - math::GRUUnitGradFunctorV2::compute( + auto act_gate = phi::funcs::detail::GetActivationType("sigmoid_v2"); + auto act_node = phi::funcs::detail::GetActivationType("tanh_v2"); + 
phi::funcs::GRUUnitGradFunctorV2::compute( device_ctx, gru_value, gru_grad, frame_size, batch_size, act_node, act_gate); @@ -1738,8 +1738,8 @@ struct LSTMGradCell : GradCell { backup_tensor(context, &grad_pre_state_bak, grad_pre_state); } - math::LstmMetaValue lstm_value; - math::LstmMetaGrad lstm_grad; + phi::funcs::LstmMetaValue lstm_value; + phi::funcs::LstmMetaGrad lstm_grad; create_lstm_value(&lstm_value); create_lstm_grad(&lstm_grad); lstm_value.gate_value = gate_tensor->data(); @@ -1755,12 +1755,12 @@ struct LSTMGradCell : GradCell { lstm_value.output_value = nullptr; lstm_grad.state_active_grad = nullptr; - auto gate_act = math::detail::GetActivationType("sigmoid_v2"); - auto state_act = math::detail::GetActivationType("tanh_v2"); - auto cand_act = math::detail::GetActivationType("tanh_v2"); + auto gate_act = phi::funcs::detail::GetActivationType("sigmoid_v2"); + auto state_act = phi::funcs::detail::GetActivationType("tanh_v2"); + auto cand_act = phi::funcs::detail::GetActivationType("tanh_v2"); T cell_clip = 0.0; - math::LstmUnitGradFunctor::compute( + phi::funcs::LstmUnitGradFunctor::compute( device_ctx, lstm_value, lstm_grad, frame_size, batch_size, cell_clip, gate_act, state_act, cand_act, false); this->update_pre_hidden_grad( diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index aa4fac16920..8b8697b6df1 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -1,6 +1,10 @@ add_subdirectory(eigen) add_subdirectory(blas) add_subdirectory(lapack) +add_subdirectory(detail) math_library(math_function DEPS blas dense_tensor tensor) +math_library(sequence2batch) +math_library(gru_compute DEPS activation_functions math_function) +math_library(lstm_compute DEPS activation_functions) math_library(concat_and_split_functor DEPS dense_tensor) diff --git a/paddle/fluid/operators/math/detail/CMakeLists.txt b/paddle/phi/kernels/funcs/detail/CMakeLists.txt similarity index 100% rename from paddle/fluid/operators/math/detail/CMakeLists.txt rename to paddle/phi/kernels/funcs/detail/CMakeLists.txt diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/phi/kernels/funcs/detail/activation_functions.h similarity index 75% rename from paddle/fluid/operators/math/detail/activation_functions.h rename to paddle/phi/kernels/funcs/detail/activation_functions.h index 1fac60e7cb8..475557f1642 100644 --- a/paddle/fluid/operators/math/detail/activation_functions.h +++ b/paddle/phi/kernels/funcs/detail/activation_functions.h @@ -19,9 +19,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/core/hostdevice.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { namespace detail { #define SIGMOID_THRESHOLD_MIN -40.0 @@ -132,25 +131,35 @@ struct Active { #ifdef PADDLE_WITH_CUDA -static DEVICE Active::Act kActFloat[] = { - &forward::Sigmoid, &forward::SigmoidV2, - &forward::Relu, &forward::Tanh, - &forward::TanhV2, &forward::Identity}; +static DEVICE Active::Act kActFloat[] = {&forward::Sigmoid, + &forward::SigmoidV2, + &forward::Relu, + &forward::Tanh, + &forward::TanhV2, + &forward::Identity}; static DEVICE Active::ActGrad kActGradFloat[] = { - &backward::Sigmoid, &backward::Sigmoid, - &backward::Relu, &backward::Tanh, - &backward::Tanh, &backward::Identity}; - -static DEVICE Active::Act kActDouble[] = { - &forward::Sigmoid, &forward::SigmoidV2, - &forward::Relu, &forward::Tanh, - &forward::TanhV2, &forward::Identity}; + &backward::Sigmoid, + &backward::Sigmoid, + &backward::Relu, + &backward::Tanh, + &backward::Tanh, + &backward::Identity}; + +static DEVICE Active::Act kActDouble[] = {&forward::Sigmoid, + &forward::SigmoidV2, + &forward::Relu, + &forward::Tanh, + &forward::TanhV2, + &forward::Identity}; static DEVICE Active::ActGrad kActGradDouble[] = { - &backward::Sigmoid, &backward::Sigmoid, - &backward::Relu, &backward::Tanh, - &backward::Tanh, &backward::Identity}; + &backward::Sigmoid, + &backward::Sigmoid, + &backward::Relu, + &backward::Tanh, + &backward::Tanh, + &backward::Identity}; namespace forward { inline DEVICE float activation(float a, int index) { @@ -287,13 +296,19 @@ __m256 Identity(const __m256 a, const __m256 b); } // namespace avx } // namespace backward -static Active<__m256>::Act kActAvx[] = { - &forward::avx::Sigmoid, &forward::avx::SigmoidV2, &forward::avx::Relu, - &forward::avx::Tanh, &forward::avx::TanhV2, &forward::avx::Identity}; +static Active<__m256>::Act kActAvx[] = {&forward::avx::Sigmoid, + &forward::avx::SigmoidV2, + &forward::avx::Relu, + &forward::avx::Tanh, + &forward::avx::TanhV2, + &forward::avx::Identity}; -static Active<__m256>::ActGrad kActGradAvx[] = { - &backward::avx::Sigmoid, &backward::avx::Sigmoid, &backward::avx::Relu, - &backward::avx::Tanh, &backward::avx::Tanh, &backward::avx::Identity}; +static Active<__m256>::ActGrad kActGradAvx[] = {&backward::avx::Sigmoid, + &backward::avx::Sigmoid, + &backward::avx::Relu, + &backward::avx::Tanh, + &backward::avx::Tanh, + &backward::avx::Identity}; namespace forward { inline __m256 activation(__m256 a, int index) { return kActAvx[index](a); } @@ -308,6 +323,5 @@ inline __m256 activation(__m256 a, __m256 b, int index) { #endif } // namespace detail -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/detail/avx_functions.cc b/paddle/phi/kernels/funcs/detail/avx_functions.cc similarity index 87% rename from paddle/fluid/operators/math/detail/avx_functions.cc rename to paddle/phi/kernels/funcs/detail/avx_functions.cc index 89e2c825c24..51af97857df 100644 --- a/paddle/fluid/operators/math/detail/avx_functions.cc +++ b/paddle/phi/kernels/funcs/detail/avx_functions.cc @@ -14,12 +14,11 @@ limitations under the License. 
*/ #ifdef __AVX__ -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/detail/avx_mathfun.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/detail/avx_mathfun.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { namespace detail { __m256 Exp(__m256 a) { return exp256_ps(a); } @@ -77,8 +76,9 @@ namespace backward { namespace avx { __m256 Relu(const __m256 a, const __m256 b) { return _mm256_mul_ps( - a, _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS), - _mm256_set1_ps(1.0f))); + a, + _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS), + _mm256_set1_ps(1.0f))); } __m256 Sigmoid(const __m256 a, const __m256 b) { @@ -96,8 +96,7 @@ __m256 Identity(const __m256 a, const __m256 b) { return a; } } // namespace backward } // namespace detail -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi #endif diff --git a/paddle/fluid/operators/math/detail/avx_mathfun.h b/paddle/phi/kernels/funcs/detail/avx_mathfun.h similarity index 99% rename from paddle/fluid/operators/math/detail/avx_mathfun.h rename to paddle/phi/kernels/funcs/detail/avx_mathfun.h index d7cf91134e4..e5e7388d51d 100644 --- a/paddle/fluid/operators/math/detail/avx_mathfun.h +++ b/paddle/phi/kernels/funcs/detail/avx_mathfun.h @@ -49,9 +49,9 @@ typedef __m256 v8sf; // vector of 8 float (avx) typedef __m256i v8si; // vector of 8 int (avx) typedef __m128i v4si; // vector of 8 int (avx) -#define _PI32AVX_CONST(Name, Val) \ - static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = {Val, Val, \ - Val, Val} +#define _PI32AVX_CONST(Name, Val) \ + static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { \ + Val, Val, Val, Val} _PI32AVX_CONST(1, 1); _PI32AVX_CONST(inv1, ~1); diff --git a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h similarity index 60% rename from paddle/fluid/operators/math/detail/gru_cpu_kernel.h rename to paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h index cbbfbc321b5..cb37daa680e 100644 --- a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h @@ -16,24 +16,28 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/gru_compute.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/gru_compute.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { namespace detail { using Array1 = Eigen::DSizes; -template -using EigenVector = framework::EigenVector; +using EigenVector = paddle::framework::EigenVector; #if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group for GRU CPU template -void hl_naive_gru_forward_reset_output( - OpResetOutput op_reset_output, T *gate_value, T *reset_output_value, - const T *prev_output_value, int frame_size, ActivationType active_gate, - bool old_version = true, const T *reset_bias = nullptr) { +void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, + T *gate_value, + T *reset_output_value, + const T *prev_output_value, + int frame_size, + ActivationType active_gate, + bool old_version = true, + const T *reset_bias = nullptr) { T r_value_update_gate; T r_value_reset_gate; T r_value_reset_output; @@ -59,8 +63,12 @@ void hl_naive_gru_forward_reset_output( r_prev_out = prev_output_value[i]; } - op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out, - &r_value_reset_output, active_gate, &r_reset_bias, + op_reset_output(&r_value_update_gate, + &r_value_reset_gate, + &r_prev_out, + &r_value_reset_output, + active_gate, + &r_reset_bias, old_version); update_gate[i] = r_value_update_gate; @@ -70,10 +78,14 @@ void hl_naive_gru_forward_reset_output( } template -void hl_naive_gru_forward_final_output( - OpFinalOutput op_final_output, T *gate_value, const T *prev_output_value, - T *output_value, int frame_size, ActivationType active_node, - bool origin_mode, bool old_version = true) { +void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, + T *gate_value, + const T *prev_output_value, + T *output_value, + int frame_size, + ActivationType active_node, + bool origin_mode, + bool old_version = true) { T r_value_update_gate; T r_value_frame_state; T r_prev_out = 0; @@ -93,8 +105,12 @@ void hl_naive_gru_forward_final_output( r_prev_out = prev_output_value[i]; } - op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out, - &r_output, active_node, origin_mode); + op_final_output(&r_value_update_gate, + &r_value_frame_state, + &r_prev_out, + &r_output, + active_node, + origin_mode); frame_state[i] = r_value_frame_state; output_value[i] = r_output; @@ -103,8 +119,10 @@ void hl_naive_gru_forward_final_output( template void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, - T *gate_value, T *reset_output_value, - const T *prev_output_value, int frame_size, + T *gate_value, + T *reset_output_value, + const T *prev_output_value, + int frame_size, ActivationType active_gate, bool old_version = true, const T *reset_bias = nullptr) { @@ -152,8 +170,12 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, _mm256_loadu_ps((const float *)(reset_output_value + i)); } - op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out, - &r_value_reset_output, active_gate, &r_reset_bias, + op_reset_output(&r_value_update_gate, + &r_value_reset_gate, + &r_prev_out, + &r_value_reset_output, + active_gate, + &r_reset_bias, old_version); _mm256_storeu_ps(reinterpret_cast(update_gate + i), @@ -167,9 +189,13 @@ 
void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, if (rest > 0) { i = n - block; - op_reset_output(&r_value_update_gate_last, &r_value_reset_gate_last, - &r_prev_out_last, &r_value_reset_output, active_gate, - &r_reset_bias, old_version); + op_reset_output(&r_value_update_gate_last, + &r_value_reset_gate_last, + &r_prev_out_last, + &r_value_reset_output, + active_gate, + &r_reset_bias, + old_version); _mm256_storeu_ps(reinterpret_cast(update_gate + i), r_value_update_gate_last); @@ -183,8 +209,10 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, template void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, - T *gate_value, const T *prev_output_value, - T *output_value, int frame_size, + T *gate_value, + const T *prev_output_value, + T *output_value, + int frame_size, ActivationType active_node, bool origin_mode, bool old_version = true) { @@ -226,8 +254,12 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, r_prev_out = _mm256_loadu_ps((const float *)(prev_output_value + i)); } - op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out, - &r_output, active_node, origin_mode); + op_final_output(&r_value_update_gate, + &r_value_frame_state, + &r_prev_out, + &r_output, + active_node, + origin_mode); _mm256_storeu_ps(reinterpret_cast(frame_state + i), r_value_frame_state); @@ -236,8 +268,12 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, if (rest > 0) { i = n - block; - op_final_output(&r_value_update_gate_last, &r_value_frame_state_last, - &r_prev_out_last, &r_output, active_node, origin_mode); + op_final_output(&r_value_update_gate_last, + &r_value_frame_state_last, + &r_prev_out_last, + &r_output, + active_node, + origin_mode); _mm256_storeu_ps(reinterpret_cast(frame_state + i), r_value_frame_state_last); @@ -248,8 +284,10 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, } template -inline void forward_reset_outputV2(const platform::CPUDeviceContext &context, - GRUMetaValue value, int frame_size) { +inline void forward_reset_outputV2( + const paddle::platform::CPUDeviceContext &context, + phi::funcs::GRUMetaValue value, + int frame_size) { auto &place = *context.eigen_device(); auto value_reset_gate = typename EigenVector::Type(value.gate_value, Array1(frame_size)); @@ -259,17 +297,23 @@ inline void forward_reset_outputV2(const platform::CPUDeviceContext &context, value.reset_output_value, Array1(frame_size)); auto value_reset_bias = typename EigenVector::ConstType(value.reset_bias, Array1(frame_size)); - SigmoidFunctor()(place, value_reset_gate, value_reset_gate); - SigmoidFunctor()(place, value_update_gate, value_update_gate); + paddle::operators::SigmoidFunctor()( + place, value_reset_gate, value_reset_gate); + paddle::operators::SigmoidFunctor()( + place, value_update_gate, value_update_gate); value_reset_output.device(place) = (value_reset_output + value_reset_bias) * value_reset_gate; } template inline void forward_reset_output( - OpResetOutput op_reset_output, GRUMetaValue value, int frame_size, - int batch_size, ActivationType active_gate, bool old_version = true, - const platform::CPUDeviceContext *context = nullptr) { + OpResetOutput op_reset_output, + phi::funcs::GRUMetaValue value, + int frame_size, + int batch_size, + ActivationType active_gate, + bool old_version = true, + const paddle::platform::CPUDeviceContext *context = nullptr) { for (int b = 0; b < batch_size; b++) { if (!old_version) { // use eigen @@ -277,15 +321,23 @@ inline void 
forward_reset_output( } else { if (OpResetOutput::avx && (frame_size > static_cast(8 - 1)) && (sizeof(T) == 4)) { - hl_avx_gru_forward_reset_output( - op_reset_output, value.gate_value, value.reset_output_value, - value.prev_out_value, frame_size, active_gate, old_version, - value.reset_bias); + hl_avx_gru_forward_reset_output(op_reset_output, + value.gate_value, + value.reset_output_value, + value.prev_out_value, + frame_size, + active_gate, + old_version, + value.reset_bias); } else { - hl_naive_gru_forward_reset_output( - op_reset_output, value.gate_value, value.reset_output_value, - value.prev_out_value, frame_size, active_gate, old_version, - value.reset_bias); + hl_naive_gru_forward_reset_output(op_reset_output, + value.gate_value, + value.reset_output_value, + value.prev_out_value, + frame_size, + active_gate, + old_version, + value.reset_bias); } } value.gate_value += frame_size * 3; @@ -297,8 +349,10 @@ inline void forward_reset_output( } template -inline void forward_final_outputV2(const platform::CPUDeviceContext &context, - GRUMetaValue value, int frame_size) { +inline void forward_final_outputV2( + const paddle::platform::CPUDeviceContext &context, + phi::funcs::GRUMetaValue value, + int frame_size) { auto &place = *context.eigen_device(); auto value_update_gate = typename EigenVector::Type( value.gate_value + frame_size, Array1(frame_size)); @@ -306,7 +360,8 @@ inline void forward_final_outputV2(const platform::CPUDeviceContext &context, value.gate_value + 2 * frame_size, Array1(frame_size)); auto value_output = typename EigenVector::Type(value.output_value, Array1(frame_size)); - TanhFunctor()(place, value_frame_state, value_frame_state); + paddle::operators::TanhFunctor()( + place, value_frame_state, value_frame_state); value_output.device(place) = (static_cast(1.0) - value_update_gate) * value_frame_state; if (value.prev_out_value) { @@ -319,10 +374,14 @@ inline void forward_final_outputV2(const platform::CPUDeviceContext &context, template inline void forward_final_output( - OpFinalOutput op_final_output, GRUMetaValue value, int frame_size, - int batch_size, ActivationType active_node, bool origin_mode, + OpFinalOutput op_final_output, + phi::funcs::GRUMetaValue value, + int frame_size, + int batch_size, + ActivationType active_node, + bool origin_mode, bool old_version = true, - const platform::CPUDeviceContext *context = nullptr) { + const paddle::platform::CPUDeviceContext *context = nullptr) { for (int b = 0; b < batch_size; b++) { if (!old_version) { // eigen @@ -330,15 +389,23 @@ inline void forward_final_output( } else { if (OpFinalOutput::avx && (frame_size > static_cast(8 - 1)) && (sizeof(T) == 4)) { - hl_avx_gru_forward_final_output(op_final_output, value.gate_value, + hl_avx_gru_forward_final_output(op_final_output, + value.gate_value, value.prev_out_value, - value.output_value, frame_size, - active_node, origin_mode, old_version); + value.output_value, + frame_size, + active_node, + origin_mode, + old_version); } else { - hl_naive_gru_forward_final_output( - op_final_output, value.gate_value, value.prev_out_value, - value.output_value, frame_size, active_node, origin_mode, - old_version); + hl_naive_gru_forward_final_output(op_final_output, + value.gate_value, + value.prev_out_value, + value.output_value, + frame_size, + active_node, + origin_mode, + old_version); } } value.gate_value += frame_size * 3; @@ -350,9 +417,12 @@ inline void forward_final_output( } template -void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, - T 
*gate_grad, const T *prev_out_value, - T *prev_out_grad, T *output_grad, +void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, + T *gate_value, + T *gate_grad, + const T *prev_out_value, + T *prev_out_grad, + T *output_grad, int frame_size, ActivationType active_node, bool origin_mode) { @@ -379,9 +449,15 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, r_prev_out_grad = prev_out_grad[i]; } - op_state_grad(&r_update_gate_value, &r_update_gate_grad, - &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value, - &r_prev_out_grad, &r_out_grad, active_node, origin_mode); + op_state_grad(&r_update_gate_value, + &r_update_gate_grad, + &r_frame_state_value, + &r_frame_state_grad, + &r_prev_out_value, + &r_prev_out_grad, + &r_out_grad, + active_node, + origin_mode); update_gate_grad[i] = r_update_gate_grad; frame_state_grad[i] = r_frame_state_grad; @@ -392,9 +468,12 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, } template -void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, - T *gate_grad, const T *prev_out_value, - T *prev_out_grad, T *reset_output_grad, +void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, + T *gate_value, + T *gate_grad, + const T *prev_out_value, + T *prev_out_grad, + T *reset_output_grad, int frame_size, ActivationType active_gate) { T r_update_gate_value; @@ -424,9 +503,14 @@ void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, r_prev_out_grad = prev_out_grad[i]; } - op_reset_grad(&r_update_gate_value, &r_update_gate_grad, - &r_reset_gate_value, &r_reset_gate_grad, &r_prev_out_value, - &r_prev_out_grad, &r_reset_output_grad, active_gate); + op_reset_grad(&r_update_gate_value, + &r_update_gate_grad, + &r_reset_gate_value, + &r_reset_gate_grad, + &r_prev_out_value, + &r_prev_out_grad, + &r_reset_output_grad, + active_gate); update_gate_grad[i] = r_update_gate_grad; reset_gate_grad[i] = r_reset_gate_grad; @@ -437,10 +521,14 @@ void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, } template -void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, - T *gate_grad, const T *prev_out_value, - T *prev_out_grad, T *output_grad, - int frame_size, ActivationType active_node, +void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, + T *gate_value, + T *gate_grad, + const T *prev_out_value, + T *prev_out_grad, + T *output_grad, + int frame_size, + ActivationType active_node, bool origin_mode) { #ifdef __AVX__ __m256 r_update_gate_value; @@ -468,9 +556,15 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i]; } - op_state_grad(&r_update_gate_value, &r_update_gate_grad, - &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value, - &r_prev_out_grad, &r_out_grad, active_node, origin_mode); + op_state_grad(&r_update_gate_value, + &r_update_gate_grad, + &r_frame_state_value, + &r_frame_state_grad, + &r_prev_out_value, + &r_prev_out_grad, + &r_out_grad, + active_node, + origin_mode); update_gate_grad[i] = r_update_gate_grad; frame_state_grad[i] = r_frame_state_grad; @@ -482,9 +576,12 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, } template -void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, - T *gate_grad, const T *prev_out_value, - T *prev_out_grad, T *reset_output_grad, +void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, 
+ T *gate_value, + T *gate_grad, + const T *prev_out_value, + T *prev_out_grad, + T *reset_output_grad, int frame_size, ActivationType active_gate) { #ifdef __AVX__ @@ -516,9 +613,14 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i]; } - op_reset_grad(&r_update_gate_value, &r_update_gate_grad, - &r_reset_gate_value, &r_reset_gate_grad, &r_prev_out_value, - &r_prev_out_grad, &r_reset_output_grad, active_gate); + op_reset_grad(&r_update_gate_value, + &r_update_gate_grad, + &r_reset_gate_value, + &r_reset_gate_grad, + &r_prev_out_value, + &r_prev_out_grad, + &r_reset_output_grad, + active_gate); update_gate_grad[i] = r_update_gate_grad; reset_gate_grad[i] = r_reset_gate_grad; @@ -530,11 +632,16 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, } template -inline void hl_naive_gru_backward(OpGruGrad op_gru_grad, T *gate_value, - T *gate_grad, const T *prev_out_value, - T *prev_out_grad, T *reset_output_value, - T *reset_output_grad, T *output_grad, - int frame_size, ActivationType active_node, +inline void hl_naive_gru_backward(OpGruGrad op_gru_grad, + T *gate_value, + T *gate_grad, + const T *prev_out_value, + T *prev_out_grad, + T *reset_output_value, + T *reset_output_grad, + T *output_grad, + int frame_size, + ActivationType active_node, ActivationType active_gate) { T r_value_reset_gate; T r_grad_reset_gate; @@ -573,10 +680,18 @@ inline void hl_naive_gru_backward(OpGruGrad op_gru_grad, T *gate_value, r_grad_reset_output = reset_output_grad[i]; } - op_gru_grad(&r_value_reset_gate, &r_grad_reset_gate, &r_value_update_gate, - &r_grad_update_gate, &r_value_frame_state, &r_grad_frame_state, - &r_value_prev_out, &r_grad_prev_out, &r_grad_output, - &r_value_reset_output, &r_grad_reset_output, active_node, + op_gru_grad(&r_value_reset_gate, + &r_grad_reset_gate, + &r_value_update_gate, + &r_grad_update_gate, + &r_value_frame_state, + &r_grad_frame_state, + &r_value_prev_out, + &r_grad_prev_out, + &r_grad_output, + &r_value_reset_output, + &r_grad_reset_output, + active_node, active_gate); reset_gate_grad[i] = r_grad_reset_gate; @@ -592,11 +707,16 @@ inline void hl_naive_gru_backward(OpGruGrad op_gru_grad, T *gate_value, } template -inline void hl_avx_gru_backward(OpGruGrad op_gru_grad, T *gate_value, - T *gate_grad, const T *prev_out_value, - T *prev_out_grad, T *reset_output_value, - T *reset_output_grad, T *output_grad, - int frame_size, ActivationType active_node, +inline void hl_avx_gru_backward(OpGruGrad op_gru_grad, + T *gate_value, + T *gate_grad, + const T *prev_out_value, + T *prev_out_grad, + T *reset_output_value, + T *reset_output_grad, + T *output_grad, + int frame_size, + ActivationType active_node, ActivationType active_gate) { #ifdef __AVX__ __m256 r_value_reset_gate; @@ -639,10 +759,18 @@ inline void hl_avx_gru_backward(OpGruGrad op_gru_grad, T *gate_value, r_grad_reset_output = (reinterpret_cast<__m256 *>(reset_output_grad))[i]; } - op_gru_grad(&r_value_reset_gate, &r_grad_reset_gate, &r_value_update_gate, - &r_grad_update_gate, &r_value_frame_state, &r_grad_frame_state, - &r_value_prev_out, &r_grad_prev_out, &r_grad_output, - &r_value_reset_output, &r_grad_reset_output, active_node, + op_gru_grad(&r_value_reset_gate, + &r_grad_reset_gate, + &r_value_update_gate, + &r_grad_update_gate, + &r_value_frame_state, + &r_grad_frame_state, + &r_value_prev_out, + &r_grad_prev_out, + &r_grad_output, + &r_value_reset_output, + &r_grad_reset_output, + active_node, 
active_gate); reset_gate_grad[i] = r_grad_reset_gate; @@ -660,20 +788,33 @@ inline void hl_avx_gru_backward(OpGruGrad op_gru_grad, T *gate_value, template inline void backward_state_grad(OpStateGrad op_state_grad, - GRUMetaValue value, GRUMetaGrad grad, - int frame_size, int batch_size, - ActivationType active_node, bool origin_mode) { + phi::funcs::GRUMetaValue value, + phi::funcs::GRUMetaGrad grad, + int frame_size, + int batch_size, + ActivationType active_node, + bool origin_mode) { for (int b = 0; b < batch_size; b++) { if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { - hl_avx_gru_backward_state_grad(op_state_grad, value.gate_value, - grad.gate_grad, value.prev_out_value, - grad.prev_out_grad, grad.output_grad, - frame_size, active_node, origin_mode); + hl_avx_gru_backward_state_grad(op_state_grad, + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.output_grad, + frame_size, + active_node, + origin_mode); } else { - hl_naive_gru_backward_state_grad(op_state_grad, value.gate_value, - grad.gate_grad, value.prev_out_value, - grad.prev_out_grad, grad.output_grad, - frame_size, active_node, origin_mode); + hl_naive_gru_backward_state_grad(op_state_grad, + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.output_grad, + frame_size, + active_node, + origin_mode); } value.gate_value += frame_size * 3; @@ -691,18 +832,30 @@ inline void backward_state_grad(OpStateGrad op_state_grad, template inline void backward_reset_grad(OpResetGrad op_reset_grad, - GRUMetaValue value, GRUMetaGrad grad, - int frame_size, int batch_size, + phi::funcs::GRUMetaValue value, + phi::funcs::GRUMetaGrad grad, + int frame_size, + int batch_size, ActivationType active_gate) { for (int b = 0; b < batch_size; b++) { if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { - hl_avx_gru_backward_reset_grad( - op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value, - grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate); + hl_avx_gru_backward_reset_grad(op_reset_grad, + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.reset_output_grad, + frame_size, + active_gate); } else { - hl_naive_gru_backward_reset_grad( - op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value, - grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate); + hl_naive_gru_backward_reset_grad(op_reset_grad, + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.reset_output_grad, + frame_size, + active_gate); } value.gate_value += frame_size * 3; @@ -719,8 +872,9 @@ inline void backward_reset_grad(OpResetGrad op_reset_grad, } template -inline void gru_backward(const platform::CPUDeviceContext &context, - GRUMetaValue value, GRUMetaGrad grad, +inline void gru_backward(const paddle::platform::CPUDeviceContext &context, + phi::funcs::GRUMetaValue value, + phi::funcs::GRUMetaGrad grad, int frame_size) { auto &place = *context.eigen_device(); @@ -747,13 +901,19 @@ inline void gru_backward(const platform::CPUDeviceContext &context, if (value.prev_out_value) { auto value_prev_out = typename EigenVector::ConstType( value.prev_out_value, Array1(frame_size)); - SigmoidGradFunctor()(place, 1 /*useless*/, value_update_gate, - (value_prev_out - value_frame_state) * grad_output, - grad_update_gate); + paddle::operators::SigmoidGradFunctor()( + place, + 1 /*useless*/, + value_update_gate, + (value_prev_out - value_frame_state) 
* grad_output, + grad_update_gate); } else { - SigmoidGradFunctor()( - place, 1 /*useless*/, value_update_gate, - static_cast(-1) * value_frame_state * grad_output, grad_update_gate); + paddle::operators::SigmoidGradFunctor()( + place, + 1 /*useless*/, + value_update_gate, + static_cast(-1) * value_frame_state * grad_output, + grad_update_gate); } if (grad.prev_out_grad) { auto grad_prev_out = @@ -761,11 +921,16 @@ inline void gru_backward(const platform::CPUDeviceContext &context, grad_prev_out.device(place) = grad_prev_out + grad_output * value_update_gate; } - TanhGradFunctor()(place, 1 /*useless*/, value_frame_state, - grad_output * (static_cast(1.0) - value_update_gate), - grad_frame_state); - SigmoidGradFunctor()( - place, 1 /*useless*/, value_reset_gate, + paddle::operators::TanhGradFunctor()( + place, + 1 /*useless*/, + value_frame_state, + grad_output * (static_cast(1.0) - value_update_gate), + grad_frame_state); + paddle::operators::SigmoidGradFunctor()( + place, + 1 /*useless*/, + value_reset_gate, value_reset_output / value_reset_gate * grad_frame_state, grad_reset_gate); if (value.prev_out_value && grad.prev_out_grad) { @@ -774,10 +939,13 @@ inline void gru_backward(const platform::CPUDeviceContext &context, } template -inline void cpu_gru_backward(const platform::CPUDeviceContext &context, - OpGruGrad op_gru_grad, GRUMetaValue value, - GRUMetaGrad grad, int frame_size, - int batch_size, ActivationType active_node, +inline void cpu_gru_backward(const paddle::platform::CPUDeviceContext &context, + OpGruGrad op_gru_grad, + phi::funcs::GRUMetaValue value, + phi::funcs::GRUMetaGrad grad, + int frame_size, + int batch_size, + ActivationType active_node, ActivationType active_gate) { for (int b = 0; b < batch_size; ++b) { // eigen @@ -801,6 +969,5 @@ inline void cpu_gru_backward(const platform::CPUDeviceContext &context, #endif // @} End Group for GRU CPU } // namespace detail -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h similarity index 74% rename from paddle/fluid/operators/math/detail/gru_gpu_kernel.h rename to paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h index 75d4809a462..6657417beac 100644 --- a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h @@ -14,14 +14,13 @@ limitations under the License. 
*/ #pragma once #include -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/gru_compute.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/gru_compute.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { namespace detail { /* @@ -30,9 +29,11 @@ namespace detail { */ template __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output, - T *gate_value, T *reset_output_value, + T *gate_value, + T *reset_output_value, const T *prev_output_value, - int frame_size, int batch_size, + int frame_size, + int batch_size, ActivationType active_gate) { const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; if (frame_idx >= frame_size) return; @@ -55,8 +56,11 @@ __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output, r_prev_out = prev_output_value[frame_idx]; } - op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out, - &r_value_reset_output, active_gate); + op_reset_output(&r_value_update_gate, + &r_value_reset_gate, + &r_prev_out, + &r_value_reset_output, + active_gate); gate_value[frame_idx + frame_size * 0] = r_value_update_gate; gate_value[frame_idx + frame_size * 1] = r_value_reset_gate; @@ -68,10 +72,14 @@ __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output, * grid(frame_blocks, batch_blocks) */ template -__global__ void KeGruForwardFinalOutput( - OpFinalOutput op_final_output, T *gate_value, const T *prev_output_value, - T *output_value, int frame_size, int batch_size, ActivationType active_node, - bool origin_mode) { +__global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output, + T *gate_value, + const T *prev_output_value, + T *output_value, + int frame_size, + int batch_size, + ActivationType active_node, + bool origin_mode) { const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; if (frame_idx >= frame_size) return; int batch_idx = 0; @@ -92,8 +100,12 @@ __global__ void KeGruForwardFinalOutput( r_prev_out = prev_output_value[frame_idx]; } - op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out, - &r_output, active_node, origin_mode); + op_final_output(&r_value_update_gate, + &r_value_frame_state, + &r_prev_out, + &r_output, + active_node, + origin_mode); gate_value[frame_idx + frame_size * 2] = r_value_frame_state; output_value[frame_idx] = r_output; @@ -106,7 +118,8 @@ __global__ void KeGruForwardFinalOutput( template __global__ void KeFastCollectiveGruGate(T *gate_value, const T *prev_output_value, - const T *gate_weight, T *reset_output, + const T *gate_weight, + T *reset_output, int frame_size, ActivationType active_node) { T xt_0 = 0.0f; @@ -164,9 +177,12 @@ __global__ void KeFastCollectiveGruGate(T *gate_value, */ template __global__ void KeFastCollectiveGruOut(const T *gate_weight, - const T *prev_out_value, T *output_value, - T *gate_value, T *reset_value, - int frame_size, ActivationType act_node, + const T *prev_out_value, + T *output_value, + T *gate_value, + T *reset_value, + int frame_size, + ActivationType act_node, bool origin_mode) { int COL = blockIdx.x * blockDim.x + threadIdx.x; @@ -221,10 +237,14 @@ __global__ void KeFastCollectiveGruOut(const T *gate_weight, * grid(frame_blocks, batch_blocks) */ template -__global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value, - T *gate_grad, const 
T *prev_out_value, - T *prev_out_grad, T *output_grad, - int frame_size, int batch_size, +__global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, + T *gate_value, + T *gate_grad, + const T *prev_out_value, + T *prev_out_grad, + T *output_grad, + int frame_size, + int batch_size, ActivationType active_node, bool origin_mode) { const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -254,9 +274,15 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value, r_prev_out_grad = prev_out_grad[frame_idx]; } - op_state_grad(&r_update_gate_value, &r_update_gate_grad, &r_frame_state_value, - &r_frame_state_grad, &r_prev_out_value, &r_prev_out_grad, - &r_out_grad, active_node, origin_mode); + op_state_grad(&r_update_gate_value, + &r_update_gate_grad, + &r_frame_state_value, + &r_frame_state_grad, + &r_prev_out_value, + &r_prev_out_grad, + &r_out_grad, + active_node, + origin_mode); gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad; @@ -270,10 +296,14 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value, * grid(frame_blocks, batch_blocks) */ template -__global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value, - T *gate_grad, const T *prev_out_value, - T *prev_out_grad, T *reset_output_grad, - int frame_size, int batch_size, +__global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, + T *gate_value, + T *gate_grad, + const T *prev_out_value, + T *prev_out_grad, + T *reset_output_grad, + int frame_size, + int batch_size, ActivationType active_gate) { const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; if (frame_idx >= frame_size) return; @@ -302,9 +332,14 @@ __global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value, r_reset_output_grad = reset_output_grad[frame_idx]; } - op_reset_grad(&r_update_gate_value, &r_update_gate_grad, &r_reset_gate_value, - &r_reset_gate_grad, &r_prev_out_value, &r_prev_out_grad, - &r_reset_output_grad, active_gate); + op_reset_grad(&r_update_gate_value, + &r_update_gate_grad, + &r_reset_gate_value, + &r_reset_gate_grad, + &r_prev_out_value, + &r_prev_out_grad, + &r_reset_output_grad, + active_gate); gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad; @@ -313,6 +348,5 @@ __global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value, } } } // namespace detail -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/detail/gru_kernel.h b/paddle/phi/kernels/funcs/detail/gru_kernel.h similarity index 64% rename from paddle/fluid/operators/math/detail/gru_kernel.h rename to paddle/phi/kernels/funcs/detail/gru_kernel.h index 082c2a180da..db53fc4576d 100644 --- a/paddle/fluid/operators/math/detail/gru_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_kernel.h @@ -14,13 +14,12 @@ limitations under the License. 
*/ #pragma once #include -#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" // TODO(guosheng): refine code style in gru_kernel -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { namespace detail { namespace forward { @@ -28,8 +27,10 @@ namespace forward { template class gru_resetOutput { public: - HOSTDEVICE void operator()(T *value_update_gate, T *value_reset_gate, - T *prev_out, T *value_reset_output, + HOSTDEVICE void operator()(T *value_update_gate, + T *value_reset_gate, + T *prev_out, + T *value_reset_output, ActivationType act_gate, T *value_reset_bias = nullptr, bool old_version = true) { @@ -48,7 +49,8 @@ class gru_resetOutput { #else static const bool avx = true; HOSTDEVICE void operator()(__m256 *value_update_gate, - __m256 *value_reset_gate, __m256 *prev_out, + __m256 *value_reset_gate, + __m256 *prev_out, __m256 *value_reset_output, ActivationType act_gate, __m256 *value_reset_bias = nullptr, @@ -71,9 +73,12 @@ class gru_resetOutput { template class gru_finalOutput { public: - HOSTDEVICE void operator()(T *value_update_gate, T *value_frame_state, - T *prev_out, T *value_output, - ActivationType act_input, bool origin_mode) { + HOSTDEVICE void operator()(T *value_update_gate, + T *value_frame_state, + T *prev_out, + T *value_output, + ActivationType act_input, + bool origin_mode) { *value_frame_state = activation(*value_frame_state, act_input); if (origin_mode) { *value_output = ((*value_update_gate) * (*prev_out)) + @@ -90,8 +95,10 @@ class gru_finalOutput { #else static const bool avx = true; HOSTDEVICE void operator()(__m256 *value_update_gate, - __m256 *value_frame_state, __m256 *prev_out, - __m256 *value_output, ActivationType act_input, + __m256 *value_frame_state, + __m256 *prev_out, + __m256 *value_output, + ActivationType act_input, bool origin_mode) { *value_frame_state = activation(*value_frame_state, act_input); if (origin_mode) { @@ -116,10 +123,14 @@ namespace backward { template class gru_stateGrad { public: - HOSTDEVICE void operator()(T *value_update_gate, T *grad_update_gate, - T *value_frame_state, T *grad_frame_state, - T *value_prev_out, T *grad_prev_out, - T *grad_output, ActivationType act_input, + HOSTDEVICE void operator()(T *value_update_gate, + T *grad_update_gate, + T *value_frame_state, + T *grad_frame_state, + T *value_prev_out, + T *grad_prev_out, + T *grad_output, + ActivationType act_input, bool origin_mode) { if (origin_mode) { *grad_update_gate = @@ -127,14 +138,15 @@ class gru_stateGrad { *grad_prev_out += (*grad_output * (*value_update_gate)); *grad_frame_state = activation( *grad_output * (static_cast(1.0) - (*value_update_gate)), - *value_frame_state, act_input); + *value_frame_state, + act_input); } else { *grad_update_gate = (*grad_output) * ((*value_frame_state) - (*value_prev_out)); *grad_prev_out += (*grad_output * (static_cast(1.0) - *value_update_gate)); - *grad_frame_state = activation(*grad_output * (*value_update_gate), - *value_frame_state, act_input); + *grad_frame_state = activation( + *grad_output * (*value_update_gate), *value_frame_state, act_input); } } #if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU state grad @@ -145,28 +157,35 @@ class gru_stateGrad { HOSTDEVICE void operator()(__m256 *value_update_gate, __m256 *grad_update_gate, __m256 *value_frame_state, - __m256 *grad_frame_state, __m256 *value_prev_out, - __m256 *grad_prev_out, 
__m256 *grad_output, - ActivationType act_input, bool origin_mode) { + __m256 *grad_frame_state, + __m256 *value_prev_out, + __m256 *grad_prev_out, + __m256 *grad_output, + ActivationType act_input, + bool origin_mode) { if (origin_mode) { *grad_update_gate = _mm256_mul_ps( *grad_output, _mm256_sub_ps(*value_prev_out, *value_frame_state)); *grad_prev_out = _mm256_add_ps( *grad_prev_out, _mm256_mul_ps(*grad_output, *value_update_gate)); *grad_frame_state = activation( - _mm256_mul_ps(*grad_output, _mm256_sub_ps(_mm256_set1_ps(1.0f), - *value_update_gate)), - *value_frame_state, act_input); + _mm256_mul_ps( + *grad_output, + _mm256_sub_ps(_mm256_set1_ps(1.0f), *value_update_gate)), + *value_frame_state, + act_input); } else { *grad_update_gate = _mm256_mul_ps( *grad_output, _mm256_sub_ps(*value_frame_state, *value_prev_out)); *grad_prev_out = _mm256_add_ps( *grad_prev_out, - _mm256_mul_ps(*grad_output, _mm256_sub_ps(_mm256_set1_ps(1.0f), - *value_update_gate))); + _mm256_mul_ps( + *grad_output, + _mm256_sub_ps(_mm256_set1_ps(1.0f), *value_update_gate))); *grad_frame_state = activation(_mm256_mul_ps(*grad_output, *value_update_gate), - *value_frame_state, act_input); + *value_frame_state, + act_input); } } #endif @@ -176,10 +195,14 @@ class gru_stateGrad { template class gru_resetGrad { public: - HOSTDEVICE void operator()(T *value_update_gate, T *grad_update_gate, - T *value_reset_gate, T *grad_reset_gate, - T *value_prev_out, T *grad_prev_out, - T *grad_reset_output, ActivationType act_gate) { + HOSTDEVICE void operator()(T *value_update_gate, + T *grad_update_gate, + T *value_reset_gate, + T *grad_reset_gate, + T *value_prev_out, + T *grad_prev_out, + T *grad_reset_output, + ActivationType act_gate) { *grad_reset_gate = (*grad_reset_output * (*value_prev_out)); *grad_prev_out += (*grad_reset_output * (*value_reset_gate)); *grad_update_gate = @@ -193,9 +216,12 @@ class gru_resetGrad { #else static const bool avx = true; HOSTDEVICE void operator()(__m256 *value_update_gate, - __m256 *grad_update_gate, __m256 *value_reset_gate, - __m256 *grad_reset_gate, __m256 *value_prev_out, - __m256 *grad_prev_out, __m256 *grad_reset_output, + __m256 *grad_update_gate, + __m256 *value_reset_gate, + __m256 *grad_reset_gate, + __m256 *value_prev_out, + __m256 *grad_prev_out, + __m256 *grad_reset_output, ActivationType act_gate) { *grad_reset_gate = _mm256_mul_ps(*grad_reset_output, *value_prev_out); *grad_prev_out = _mm256_add_ps( @@ -211,23 +237,31 @@ class gru_resetGrad { template class gru { public: - HOSTDEVICE void operator()(T *value_reset_gate, T *grad_reset_gate, - T *value_update_gate, T *grad_update_gate, - T *value_frame_state, T *grad_frame_state, - T *value_prev_out, T *grad_prev_out, - T *grad_output, T *value_reset_output, - T *grad_reset_output, ActivationType act_node, + HOSTDEVICE void operator()(T *value_reset_gate, + T *grad_reset_gate, + T *value_update_gate, + T *grad_update_gate, + T *value_frame_state, + T *grad_frame_state, + T *value_prev_out, + T *grad_prev_out, + T *grad_output, + T *value_reset_output, + T *grad_reset_output, + ActivationType act_node, ActivationType act_gate) { *grad_update_gate = activation((*grad_output) * ((*value_prev_out) - (*value_frame_state)), - (*value_update_gate), act_gate); + (*value_update_gate), + act_gate); *grad_prev_out += (*grad_output * (*value_update_gate)); *grad_frame_state = activation(*grad_output * (static_cast(1.0) - (*value_update_gate)), - *value_frame_state, act_node); + *value_frame_state, + act_node); T reset_output = 
(*value_reset_output) / (*value_reset_gate); - *grad_reset_gate = activation(reset_output * (*grad_frame_state), - *value_reset_gate, act_gate); + *grad_reset_gate = activation( + reset_output * (*grad_frame_state), *value_reset_gate, act_gate); *grad_reset_output = (*value_reset_gate) * (*grad_frame_state); } #if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU CPU @@ -235,29 +269,36 @@ class gru { static const bool avx = false; #else static const bool avx = true; - HOSTDEVICE void operator()(__m256 *value_reset_gate, __m256 *grad_reset_gate, + HOSTDEVICE void operator()(__m256 *value_reset_gate, + __m256 *grad_reset_gate, __m256 *value_update_gate, __m256 *grad_update_gate, __m256 *value_frame_state, - __m256 *grad_frame_state, __m256 *value_prev_out, - __m256 *grad_prev_out, __m256 *grad_output, + __m256 *grad_frame_state, + __m256 *value_prev_out, + __m256 *grad_prev_out, + __m256 *grad_output, __m256 *value_reset_output, - __m256 *grad_reset_output, ActivationType act_node, + __m256 *grad_reset_output, + ActivationType act_node, ActivationType act_gate) { *grad_update_gate = activation( _mm256_mul_ps(*grad_output, _mm256_sub_ps(*value_prev_out, *value_frame_state)), - *value_update_gate, act_gate); + *value_update_gate, + act_gate); *grad_prev_out = _mm256_add_ps( *grad_prev_out, _mm256_mul_ps(*grad_output, *value_update_gate)); *grad_frame_state = activation( _mm256_mul_ps(*grad_output, _mm256_sub_ps(_mm256_set1_ps(1.0f), *value_update_gate)), - *value_frame_state, act_node); + *value_frame_state, + act_node); __m256 reset_output = _mm256_div_ps(*value_reset_output, *value_reset_gate); *grad_reset_gate = activation(_mm256_mul_ps(reset_output, *grad_frame_state), - *value_reset_gate, act_gate); + *value_reset_gate, + act_gate); *grad_reset_output = _mm256_mul_ps(*value_reset_gate, *grad_frame_state); } #endif @@ -267,6 +308,5 @@ class gru { } // namespace backward } // namespace detail -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h similarity index 65% rename from paddle/fluid/operators/math/detail/lstm_cpu_kernel.h rename to paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h index 169c5488bb5..10dbf27d348 100644 --- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/lstm_compute.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" #if defined(_WIN32) #if defined(__AVX2__) || defined(__AVX__) @@ -25,21 +25,23 @@ inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); } #endif #endif -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { namespace detail { using Array1 = Eigen::DSizes; -template -using EigenVector = framework::EigenVector; +using EigenVector = paddle::framework::EigenVector; #if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group LSTM CPU template -void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, - int frame_size, T cell_clip, +void naive_lstm_forward_one_sequence(Op op, + phi::funcs::LstmMetaValue value, + int frame_size, + T cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state, @@ -79,9 +81,21 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, r_prev_state = value.prev_state_value[i]; } - op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state, - &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO, - &cell_clip, active_node, active_gate, active_state); + op(&r_value_in, + &r_value_ig, + &r_value_fg, + &r_value_og, + &r_prev_state, + &r_state, + &r_state_atv, + &r_out, + &r_checkI, + &r_checkF, + &r_checkO, + &cell_clip, + active_node, + active_gate, + active_state); value_in[i] = r_value_in; value_ig[i] = r_value_ig; @@ -94,9 +108,12 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, } template -void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, - LstmMetaGrad grad, int frame_size, - T cell_clip, ActivationType active_node, +void naive_lstm_backward_one_sequence(Op op, + phi::funcs::LstmMetaValue value, + phi::funcs::LstmMetaGrad grad, + int frame_size, + T cell_clip, + ActivationType active_node, ActivationType active_gate, ActivationType active_state, bool old_api_version) { @@ -157,11 +174,30 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, r_prev_state = value.prev_state_value[i]; } - op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, - &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, - &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, - &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, - &cell_clip, active_node, active_gate, active_state); + op(&r_value_in, + &r_value_ig, + &r_value_fg, + &r_value_og, + &r_grad_in, + &r_grad_ig, + &r_grad_fg, + &r_grad_og, + &r_prev_state, + &r_prev_state_grad, + &r_state, + &r_state_grad, + &r_state_atv, + &r_output_grad, + &r_checkI, + &r_checkF, + &r_checkO, + &r_checkIGrad, + &r_checkFGrad, + &r_checkOGrad, + &cell_clip, + active_node, + active_gate, + active_state); grad_in[i] = r_grad_in; grad_ig[i] = r_grad_ig; @@ -179,8 +215,10 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, } template -void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, - int frame_size, T cell_clip, +void avx_lstm_forward_one_sequence(Op op, + phi::funcs::LstmMetaValue value, + int frame_size, + T cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state, @@ -226,9 +264,21 @@ void 
avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, (reinterpret_cast<__m256 const *>(value.prev_state_value))[i]; } - op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state, - &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO, - &cell_clip, active_node, active_gate, active_state); + op(&r_value_in, + &r_value_ig, + &r_value_fg, + &r_value_og, + &r_prev_state, + &r_state, + &r_state_atv, + &r_out, + &r_checkI, + &r_checkF, + &r_checkO, + &cell_clip, + active_node, + active_gate, + active_state); value_in[i] = r_value_in; value_ig[i] = r_value_ig; @@ -242,9 +292,12 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, } template -void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, - LstmMetaGrad grad, int frame_size, - T cell_clip, ActivationType active_node, +void avx_lstm_backward_one_sequence(Op op, + phi::funcs::LstmMetaValue value, + phi::funcs::LstmMetaGrad grad, + int frame_size, + T cell_clip, + ActivationType active_node, ActivationType active_gate, ActivationType active_state, bool old_api_version) { @@ -311,11 +364,30 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, (reinterpret_cast<__m256 const *>(value.prev_state_value))[i]; } - op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, - &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, - &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, - &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, - &cell_clip, active_node, active_gate, active_state); + op(&r_value_in, + &r_value_ig, + &r_value_fg, + &r_value_og, + &r_grad_in, + &r_grad_ig, + &r_grad_fg, + &r_grad_og, + &r_prev_state, + &r_prev_state_grad, + &r_state, + &r_state_grad, + &r_state_atv, + &r_output_grad, + &r_checkI, + &r_checkF, + &r_checkO, + &r_checkIGrad, + &r_checkFGrad, + &r_checkOGrad, + &cell_clip, + active_node, + active_gate, + active_state); grad_in[i] = r_grad_in; grad_ig[i] = r_grad_ig; @@ -338,8 +410,10 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, } template -void eigen_lstm_forward_one_sequence(const platform::CPUDeviceContext &context, - LstmMetaValue value, int frame_size) { +void eigen_lstm_forward_one_sequence( + const paddle::platform::CPUDeviceContext &context, + phi::funcs::LstmMetaValue value, + int frame_size) { auto eigen_value_ig = typename EigenVector::Type(value.gate_value, Array1(frame_size)); auto eigen_value_fg = typename EigenVector::Type( @@ -356,10 +430,10 @@ void eigen_lstm_forward_one_sequence(const platform::CPUDeviceContext &context, typename EigenVector::Type(value.output_value, Array1(frame_size)); auto &place = *context.eigen_device(); - TanhFunctor()(place, eigen_value_in, eigen_value_in); - SigmoidFunctor()(place, eigen_value_ig, eigen_value_ig); - SigmoidFunctor()(place, eigen_value_fg, eigen_value_fg); - SigmoidFunctor()(place, eigen_value_og, eigen_value_og); + paddle::operators::TanhFunctor()(place, eigen_value_in, eigen_value_in); + paddle::operators::SigmoidFunctor()(place, eigen_value_ig, eigen_value_ig); + paddle::operators::SigmoidFunctor()(place, eigen_value_fg, eigen_value_fg); + paddle::operators::SigmoidFunctor()(place, eigen_value_og, eigen_value_og); eigen_state.device(place) = eigen_value_in * eigen_value_ig; if (value.prev_state_value) { @@ -368,14 +442,16 @@ void eigen_lstm_forward_one_sequence(const platform::CPUDeviceContext &context, eigen_state.device(place) = eigen_state + eigen_prev_state * eigen_value_fg; } - TanhFunctor()(place, eigen_state, 
eigen_state_act); + paddle::operators::TanhFunctor()(place, eigen_state, eigen_state_act); eigen_output.device(place) = eigen_value_og * eigen_state_act; } template -void eigen_lstm_backward_one_sequence(const platform::CPUDeviceContext &context, - LstmMetaValue value, - LstmMetaGrad grad, int frame_size) { +void eigen_lstm_backward_one_sequence( + const paddle::platform::CPUDeviceContext &context, + phi::funcs::LstmMetaValue value, + phi::funcs::LstmMetaGrad grad, + int frame_size) { auto eigen_value_ig = typename EigenVector::Type(value.gate_value, Array1(frame_size)); auto eigen_value_fg = typename EigenVector::Type( @@ -401,23 +477,38 @@ void eigen_lstm_backward_one_sequence(const platform::CPUDeviceContext &context, typename EigenVector::Type(grad.state_grad, Array1(frame_size)); auto &place = *context.eigen_device(); - SigmoidGradFunctor()(place, 1 /*useless*/, eigen_value_og, - eigen_grad_output * eigen_state_act, eigen_grad_og); + paddle::operators::SigmoidGradFunctor()( + place, + 1 /*useless*/, + eigen_value_og, + eigen_grad_output * eigen_state_act, + eigen_grad_og); eigen_grad_state.device(place) = eigen_grad_state + eigen_grad_output * eigen_value_og * (static_cast(1) - eigen_state_act * eigen_state_act); - TanhGradFunctor()(place, 1, eigen_value_in, - eigen_grad_state * eigen_value_ig, eigen_grad_in); - SigmoidGradFunctor()(place, 1, eigen_value_ig, - eigen_grad_state * eigen_value_in, eigen_grad_ig); + paddle::operators::TanhGradFunctor()(place, + 1, + eigen_value_in, + eigen_grad_state * eigen_value_ig, + eigen_grad_in); + paddle::operators::SigmoidGradFunctor()(place, + 1, + eigen_value_ig, + eigen_grad_state * eigen_value_in, + eigen_grad_ig); if (value.prev_state_value) { auto eigen_prev_state = typename EigenVector::ConstType( value.prev_state_value, Array1(frame_size)); - SigmoidGradFunctor()(place, 1, eigen_value_fg, - eigen_grad_state * eigen_prev_state, eigen_grad_fg); + paddle::operators::SigmoidGradFunctor()( + place, + 1, + eigen_value_fg, + eigen_grad_state * eigen_prev_state, + eigen_grad_fg); } else { - SigmoidGradFunctor()(place, 1, eigen_value_fg, 0, eigen_grad_fg); + paddle::operators::SigmoidGradFunctor()( + place, 1, eigen_value_fg, 0, eigen_grad_fg); } if (grad.prev_state_grad) { auto eigen_grad_pre_state = @@ -427,42 +518,74 @@ void eigen_lstm_backward_one_sequence(const platform::CPUDeviceContext &context, } template -void cpu_lstm_forward(const platform::CPUDeviceContext &context, Op op, - LstmMetaValue value, int frame_size, T cell_clip, - ActivationType active_node, ActivationType active_gate, - ActivationType active_state, bool old_api_version) { +void cpu_lstm_forward(const paddle::platform::CPUDeviceContext &context, + Op op, + phi::funcs::LstmMetaValue value, + int frame_size, + T cell_clip, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state, + bool old_api_version) { if (!old_api_version) { eigen_lstm_forward_one_sequence(context, value, frame_size); } else { if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { - avx_lstm_forward_one_sequence(op, value, frame_size, cell_clip, - active_node, active_gate, active_state, + avx_lstm_forward_one_sequence(op, + value, + frame_size, + cell_clip, + active_node, + active_gate, + active_state, old_api_version); } else { - naive_lstm_forward_one_sequence(op, value, frame_size, cell_clip, - active_node, active_gate, active_state, + naive_lstm_forward_one_sequence(op, + value, + frame_size, + cell_clip, + active_node, + active_gate, + active_state, 
old_api_version); } } } template -void cpu_lstm_backward(const platform::CPUDeviceContext &context, Op op, - LstmMetaValue value, LstmMetaGrad grad, - int frame_size, T cell_clip, ActivationType active_node, - ActivationType active_gate, ActivationType active_state, +void cpu_lstm_backward(const paddle::platform::CPUDeviceContext &context, + Op op, + phi::funcs::LstmMetaValue value, + phi::funcs::LstmMetaGrad grad, + int frame_size, + T cell_clip, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state, bool old_api_version) { if (!old_api_version) { eigen_lstm_backward_one_sequence(context, value, grad, frame_size); } else { if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { - avx_lstm_backward_one_sequence(op, value, grad, frame_size, cell_clip, - active_node, active_gate, active_state, + avx_lstm_backward_one_sequence(op, + value, + grad, + frame_size, + cell_clip, + active_node, + active_gate, + active_state, old_api_version); } else { - naive_lstm_backward_one_sequence(op, value, grad, frame_size, - cell_clip, active_node, active_gate, - active_state, old_api_version); + naive_lstm_backward_one_sequence(op, + value, + grad, + frame_size, + cell_clip, + active_node, + active_gate, + active_state, + old_api_version); } } } @@ -470,6 +593,5 @@ void cpu_lstm_backward(const platform::CPUDeviceContext &context, Op op, #endif // @{ End Group LSTM CPU } // namespace detail -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h similarity index 68% rename from paddle/fluid/operators/math/detail/lstm_gpu_kernel.h rename to paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h index 851a62dbe9a..6d4c430d9e6 100644 --- a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h @@ -15,14 +15,13 @@ limitations under the License. 
*/ #pragma once #include -#include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { namespace detail { /* @@ -30,8 +29,11 @@ namespace detail { * grid(frame_blocks, batch_blocks) */ template -__global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, - int batch_size, T cell_clip, +__global__ void KeLstmForward(Op op, + phi::funcs::LstmMetaValue value, + int frame_size, + int batch_size, + T cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { @@ -71,9 +73,21 @@ __global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, r_prev_state = value.prev_state_value[frame_idx]; } - op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state, - &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO, - &cell_clip, active_node, active_gate, active_state); + op(&r_value_in, + &r_value_ig, + &r_value_fg, + &r_value_og, + &r_prev_state, + &r_state, + &r_state_atv, + &r_out, + &r_checkI, + &r_checkF, + &r_checkO, + &cell_clip, + active_node, + active_gate, + active_state); value.gate_value[frame_idx] = r_value_in; value.gate_value[frame_idx + frame_size] = r_value_ig; @@ -90,9 +104,12 @@ __global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, * grid(frame_blocks, batch_blocks) */ template -__global__ void KeLstmBackward(Op op, LstmMetaValue value, - LstmMetaGrad grad, int frame_size, - int batch_size, T cell_clip, +__global__ void KeLstmBackward(Op op, + phi::funcs::LstmMetaValue value, + phi::funcs::LstmMetaGrad grad, + int frame_size, + int batch_size, + T cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { @@ -147,11 +164,30 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue value, r_prev_state = value.prev_state_value[frame_idx]; } - op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, &r_grad_ig, - &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, &r_state, - &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, &r_checkF, - &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, &cell_clip, - active_node, active_gate, active_state); + op(&r_value_in, + &r_value_ig, + &r_value_fg, + &r_value_og, + &r_grad_in, + &r_grad_ig, + &r_grad_fg, + &r_grad_og, + &r_prev_state, + &r_prev_state_grad, + &r_state, + &r_state_grad, + &r_state_atv, + &r_output_grad, + &r_checkI, + &r_checkF, + &r_checkO, + &r_checkIGrad, + &r_checkFGrad, + &r_checkOGrad, + &cell_clip, + active_node, + active_gate, + active_state); grad.gate_grad[frame_idx] = r_grad_in; grad.gate_grad[frame_idx + frame_size] = r_grad_ig; @@ -185,10 +221,15 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue value, } template -void gpu_lstm_forward(const platform::DeviceContext& context, Op op, - LstmMetaValue value, int frame_size, int batch_size, - T cell_clip, ActivationType active_node, - ActivationType active_gate, ActivationType active_state) { +void gpu_lstm_forward(const paddle::platform::DeviceContext& context, + Op op, + phi::funcs::LstmMetaValue value, + int frame_size, + int batch_size, + T cell_clip, + ActivationType active_node, + ActivationType active_gate, + 
ActivationType active_state) { dim3 threads; dim3 grid; if (batch_size == 1) { @@ -203,25 +244,45 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op, } auto stream = - reinterpret_cast(context).stream(); + reinterpret_cast(context) + .stream(); if (batch_size == 1) { - KeLstmForward<<>>( - op, value, frame_size, batch_size, cell_clip, active_node, active_gate, + op, + value, + frame_size, + batch_size, + cell_clip, + active_node, + active_gate, active_state); } else { - KeLstmForward<<>>( - op, value, frame_size, batch_size, cell_clip, active_node, active_gate, + op, + value, + frame_size, + batch_size, + cell_clip, + active_node, + active_gate, active_state); } } template -void gpu_lstm_backward(const platform::DeviceContext& context, Op op, - LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, T cell_clip, - ActivationType active_node, ActivationType active_gate, +void gpu_lstm_backward(const paddle::platform::DeviceContext& context, + Op op, + phi::funcs::LstmMetaValue value, + phi::funcs::LstmMetaGrad grad, + int frame_size, + int batch_size, + T cell_clip, + ActivationType active_node, + ActivationType active_gate, ActivationType active_state) { dim3 threads; dim3 grid; @@ -237,21 +298,37 @@ void gpu_lstm_backward(const platform::DeviceContext& context, Op op, } auto stream = - reinterpret_cast(context).stream(); + reinterpret_cast(context) + .stream(); if (batch_size == 1) { - KeLstmBackward<<>>( - op, value, grad, frame_size, batch_size, cell_clip, active_node, - active_gate, active_state); + op, + value, + grad, + frame_size, + batch_size, + cell_clip, + active_node, + active_gate, + active_state); } else { - KeLstmBackward<<>>( - op, value, grad, frame_size, batch_size, cell_clip, active_node, - active_gate, active_state); + op, + value, + grad, + frame_size, + batch_size, + cell_clip, + active_node, + active_gate, + active_state); } } } // namespace detail -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/detail/lstm_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_kernel.h similarity index 59% rename from paddle/fluid/operators/math/detail/lstm_kernel.h rename to paddle/phi/kernels/funcs/detail/lstm_kernel.h index 2d4e7dd59fb..8b429264125 100644 --- a/paddle/fluid/operators/math/detail/lstm_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_kernel.h @@ -14,12 +14,11 @@ limitations under the License. 
*/ #pragma once #include -#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { namespace detail { namespace forward { @@ -27,9 +26,18 @@ namespace forward { template class lstm { public: - HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og, - T *prev_state, T *state, T *state_atv, T *output, - T *checkI, T *checkF, T *checkO, T *cell_clip, + HOSTDEVICE void operator()(T *value_in, + T *value_ig, + T *value_fg, + T *value_og, + T *prev_state, + T *state, + T *state_atv, + T *output, + T *checkI, + T *checkF, + T *checkO, + T *cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { @@ -57,11 +65,18 @@ class lstm { // Only float support AVX optimization static const bool avx = std::is_same::value; - HOSTDEVICE void operator()(__m256 *value_in, __m256 *value_ig, - __m256 *value_fg, __m256 *value_og, - __m256 *prev_state, __m256 *state, - __m256 *state_atv, __m256 *output, __m256 *checkI, - __m256 *checkF, __m256 *checkO, T *cell_clip, + HOSTDEVICE void operator()(__m256 *value_in, + __m256 *value_ig, + __m256 *value_fg, + __m256 *value_og, + __m256 *prev_state, + __m256 *state, + __m256 *state_atv, + __m256 *output, + __m256 *checkI, + __m256 *checkF, + __m256 *checkO, + T *cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { @@ -97,12 +112,27 @@ namespace backward { template class lstm { public: - HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og, - T *grad_in, T *grad_ig, T *grad_fg, T *grad_og, - T *prev_state, T *prev_state_grad, T *state, - T *state_grad, T *state_atv, T *output_grad, - T *checkI, T *checkF, T *checkO, T *checkIGrad, - T *checkFGrad, T *checkOGrad, T *cell_clip, + HOSTDEVICE void operator()(T *value_in, + T *value_ig, + T *value_fg, + T *value_og, + T *grad_in, + T *grad_ig, + T *grad_fg, + T *grad_og, + T *prev_state, + T *prev_state_grad, + T *state, + T *state_grad, + T *state_atv, + T *output_grad, + T *checkI, + T *checkF, + T *checkO, + T *checkIGrad, + T *checkFGrad, + T *checkOGrad, + T *cell_clip, ActivationType active_node, ActivationType active_gate, ActivationType active_state) { @@ -138,17 +168,32 @@ class lstm { #else // Only float support AVX optimization static const bool avx = std::is_same::value; - HOSTDEVICE void operator()( - __m256 *value_in, __m256 *value_ig, __m256 *value_fg, __m256 *value_og, - __m256 *grad_in, __m256 *grad_ig, __m256 *grad_fg, __m256 *grad_og, - __m256 *prev_state, __m256 *prev_state_grad, __m256 *state, - __m256 *state_grad, __m256 *state_atv, __m256 *output_grad, - __m256 *checkI, __m256 *checkF, __m256 *checkO, __m256 *checkIGrad, - __m256 *checkFGrad, __m256 *checkOGrad, T *cell_clip, - ActivationType active_node, ActivationType active_gate, - ActivationType active_state) { - *grad_og = activation(_mm256_mul_ps(*output_grad, *state_atv), *value_og, - active_gate); + HOSTDEVICE void operator()(__m256 *value_in, + __m256 *value_ig, + __m256 *value_fg, + __m256 *value_og, + __m256 *grad_in, + __m256 *grad_ig, + __m256 *grad_fg, + __m256 *grad_og, + __m256 *prev_state, + __m256 *prev_state_grad, + __m256 *state, + __m256 *state_grad, + __m256 *state_atv, + __m256 *output_grad, + __m256 *checkI, + __m256 *checkF, + __m256 *checkO, + __m256 *checkIGrad, + __m256 *checkFGrad, + 
__m256 *checkOGrad, + T *cell_clip, + ActivationType active_node, + ActivationType active_gate, + ActivationType active_state) { + *grad_og = activation( + _mm256_mul_ps(*output_grad, *state_atv), *value_og, active_gate); if (*cell_clip > 0.0f) { T *state_ = reinterpret_cast(state); if (*state_ >= (*cell_clip) || *state_ <= (0.0f - (*cell_clip))) { @@ -156,18 +201,19 @@ class lstm { } else { *state_grad = _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og), - *state_atv, active_state), + *state_atv, + active_state), *state_grad); *state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad); } } - *grad_in = activation(_mm256_mul_ps(*state_grad, *value_ig), *value_in, - active_node); - *grad_ig = activation(_mm256_mul_ps(*state_grad, *value_in), *value_ig, - active_gate); - *grad_fg = activation(_mm256_mul_ps(*state_grad, *prev_state), *value_fg, - active_gate); + *grad_in = activation( + _mm256_mul_ps(*state_grad, *value_ig), *value_in, active_node); + *grad_ig = activation( + _mm256_mul_ps(*state_grad, *value_in), *value_ig, active_gate); + *grad_fg = activation( + _mm256_mul_ps(*state_grad, *prev_state), *value_fg, active_gate); *prev_state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_ig, *checkI), _mm256_mul_ps(*grad_fg, *checkF)); *prev_state_grad = @@ -183,6 +229,5 @@ class lstm { } // namespace backward } // namespace detail -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/gru_compute.cc b/paddle/phi/kernels/funcs/gru_compute.cc new file mode 100644 index 00000000000..4f159fd28af --- /dev/null +++ b/paddle/phi/kernels/funcs/gru_compute.cc @@ -0,0 +1,373 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/funcs/gru_compute.h" + +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h" +#include "paddle/phi/kernels/funcs/detail/gru_kernel.h" + +namespace phi { +namespace funcs { + +template +struct GRUUnitFunctor { + static void compute(const paddle::platform::CPUDeviceContext &context, + GRUMetaValue value, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate, + bool origin_mode) { +#if !defined(__NVCC__) && !defined(__HIPCC___) + auto blas = + phi::funcs::GetBlas(context); + if (value.prev_out_value) { + blas.GEMM(false, + false, + batch_size, + frame_size * 2, + frame_size, + 1, + value.prev_out_value, + frame_size, + value.gate_weight, + frame_size * 2, + 1, + value.gate_value, + frame_size * 3); + } + + detail::forward_reset_output( + phi::funcs::detail::forward::gru_resetOutput(), + value, + frame_size, + batch_size, + active_gate, + true, + nullptr); + + if (value.prev_out_value) { + blas.GEMM(false, + false, + batch_size, + frame_size, + frame_size, + 1, + value.reset_output_value, + frame_size, + value.state_weight, + frame_size, + 1, + value.gate_value + frame_size * 2, + frame_size * 3); + } + + detail::forward_final_output( + phi::funcs::detail::forward::gru_finalOutput(), + value, + frame_size, + batch_size, + active_node, + origin_mode, + true, + nullptr); +#endif + } +}; + +template +struct GRUUnitGradFunctor { + static void compute(const paddle::platform::CPUDeviceContext &context, + GRUMetaValue value, + GRUMetaGrad grad, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate, + bool origin_mode) { +#if !defined(__NVCC__) && !defined(__HIPCC___) + detail::backward_state_grad( + phi::funcs::detail::backward::gru_stateGrad(), + value, + grad, + frame_size, + batch_size, + active_node, + origin_mode); + auto blas = + phi::funcs::GetBlas(context); + if (value.prev_out_value && grad.prev_out_grad) { + blas.GEMM(false, + true, + batch_size, + frame_size, + frame_size, + 1, + grad.gate_grad + frame_size * 2, + frame_size * 3, + value.state_weight, + frame_size, + 0, + grad.reset_output_grad, + frame_size); + + if (grad.state_weight_grad) { + blas.GEMM(true, + false, + frame_size, + frame_size, + batch_size, + 1, + value.reset_output_value, + frame_size, + grad.gate_grad + frame_size * 2, + frame_size * 3, + 1, + grad.state_weight_grad, + frame_size); + } + } + + detail::backward_reset_grad( + phi::funcs::detail::backward::gru_resetGrad(), + value, + grad, + frame_size, + batch_size, + active_gate); + if (grad.prev_out_grad && value.prev_out_value) { + blas.GEMM(false, + true, + batch_size, + frame_size, + frame_size * 2, + 1, + grad.gate_grad, + frame_size * 3, + value.gate_weight, + frame_size * 2, + 1, + grad.prev_out_grad, + frame_size); + + if (grad.gate_weight_grad) { + blas.GEMM(true, + false, + frame_size, + frame_size * 2, + batch_size, + 1, + value.prev_out_value, + frame_size, + grad.gate_grad, + frame_size * 3, + 1, + grad.gate_weight_grad, + frame_size * 2); + } + } +#endif + } +}; + +template +struct GRUUnitFunctorV2 { + static void compute(const paddle::platform::CPUDeviceContext &context, + GRUMetaValue value, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate) { +#if !defined(__NVCC__) && 
!defined(__HIPCC___) + auto blas = + phi::funcs::GetBlas(context); + if (value.prev_out_value) { + blas.GEMM(CblasNoTrans, + CblasTrans, + batch_size, + frame_size, + frame_size, + 1, + value.prev_out_value, + value.state_weight, + 0, + value.reset_output_value); + } + detail::forward_reset_output( + phi::funcs::detail::forward::gru_resetOutput(), + value, + frame_size, + batch_size, + active_gate, + false, + &context); + + T *cell_state_value = value.gate_value + 2 * frame_size; + T *reset_output_value = value.reset_output_value; + for (int b = 0; b < batch_size; ++b) { + blas.VADD( + frame_size, cell_state_value, reset_output_value, cell_state_value); + cell_state_value += frame_size * 3; + reset_output_value += frame_size; + } + + detail::forward_final_output( + phi::funcs::detail::forward::gru_finalOutput(), + value, + frame_size, + batch_size, + active_node, + true, + false, + &context); +#endif + } +}; + +template +struct GRUUnitGradFunctorV2 { + static void compute(const paddle::platform::CPUDeviceContext &context, + GRUMetaValue value, + GRUMetaGrad grad, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate) { +#if !defined(__NVCC__) && !defined(__HIPCC___) + // calculate grad_update_gate, grad_frame_state, + // grad_reset_output, grad_reset_gate + detail::cpu_gru_backward(context, + phi::funcs::detail::backward::gru(), + value, + grad, + frame_size, + batch_size, + active_node, + active_gate); + auto blas = + phi::funcs::GetBlas(context); + if (grad.prev_out_grad && value.prev_out_value) { + // update prev_out_grad + blas.GEMM(false, + false, + batch_size, + frame_size, + frame_size, + 1, + grad.gate_grad, + frame_size * 3, + value.gate_weight, + frame_size, + 1, + grad.prev_out_grad, + frame_size); + blas.GEMM(false, + false, + batch_size, + frame_size, + frame_size, + 1, + grad.gate_grad + frame_size, + frame_size * 3, + value.gate_weight + frame_size * frame_size, + frame_size, + 1, + grad.prev_out_grad, + frame_size); + blas.GEMM(false, + false, + batch_size, + frame_size, + frame_size, + 1, + grad.reset_output_grad, + frame_size, + value.state_weight, + frame_size, + 1, + grad.prev_out_grad, + frame_size); + // update weight_hh_grad + if (grad.gate_weight_grad) { + // reset gate + blas.GEMM(true, + false, + frame_size, + frame_size, + batch_size, + 1, + grad.gate_grad, + frame_size * 3, + value.prev_out_value, + frame_size, + 1, + grad.gate_weight_grad, + frame_size); + // update gate + blas.GEMM(true, + false, + frame_size, + frame_size, + batch_size, + 1, + grad.gate_grad + frame_size, + frame_size * 3, + value.prev_out_value, + frame_size, + 1, + grad.gate_weight_grad + frame_size * frame_size, + frame_size); + // cell state + blas.GEMM(true, + false, + frame_size, + frame_size, + batch_size, + 1, + grad.reset_output_grad, + frame_size, + value.prev_out_value, + frame_size, + 1, + grad.state_weight_grad, + frame_size); + } + } + // update bias_hh_grad + T *gate_grad = grad.gate_grad; + T *bias_hh_grad = grad.bias_hh_grad; + T *state_bias_grad = grad.bias_hh_grad + 2 * frame_size; + T *reset_output_grad = grad.reset_output_grad; + for (int b = 0; b < batch_size; ++b) { + blas.VADD(2 * frame_size, bias_hh_grad, gate_grad, bias_hh_grad); + blas.VADD( + frame_size, state_bias_grad, reset_output_grad, state_bias_grad); + gate_grad += 3 * frame_size; + reset_output_grad += frame_size; + } +#endif + } +}; + +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template 
struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; + +template struct GRUUnitFunctorV2; +template struct GRUUnitFunctorV2; +template struct GRUUnitGradFunctorV2; +template struct GRUUnitGradFunctorV2; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu new file mode 100644 index 00000000000..7666206b7f7 --- /dev/null +++ b/paddle/phi/kernels/funcs/gru_compute.cu @@ -0,0 +1,349 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" +#include "paddle/phi/kernels/funcs/detail/gru_kernel.h" +#include "paddle/phi/kernels/funcs/gru_compute.h" + +namespace phi { +namespace funcs { + +template +struct GRUUnitFunctor { + static void compute(const paddle::platform::CUDADeviceContext &context, + GRUMetaValue value, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate, + bool origin_mode) { + auto stream = context.stream(); + dim3 threads; + dim3 grid; + if (batch_size == 1) { + if (context.GetComputeCapability() >= 70) { + if (frame_size < 16) { + constexpr int tiled_size = 8; + int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size; + threads = dim3(tiled_size, 1); + grid = dim3(frame_blocks, 1); + detail::KeFastCollectiveGruGate< + T, + tiled_size><<>>( + value.gate_value, + value.prev_out_value, + value.gate_weight, + value.reset_output_value, + frame_size, + active_gate); + + frame_blocks = (frame_size + tiled_size - 1) / tiled_size; + grid = dim3(frame_blocks, 1); + detail::KeFastCollectiveGruOut< + T, + tiled_size><<>>( + value.state_weight, + value.prev_out_value, + value.output_value, + value.gate_value, + value.reset_output_value, + frame_size, + active_node, + origin_mode); + } else { + constexpr int tiled_size = 16; + int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size; + threads = dim3(tiled_size, 1); + grid = dim3(frame_blocks, 1); + detail::KeFastCollectiveGruGate< + T, + tiled_size><<>>( + value.gate_value, + value.prev_out_value, + value.gate_weight, + value.reset_output_value, + frame_size, + active_gate); + + frame_blocks = (frame_size + tiled_size - 1) / tiled_size; + grid = dim3(frame_blocks, 1); + detail::KeFastCollectiveGruOut< + T, + tiled_size><<>>( + value.state_weight, + value.prev_out_value, + value.output_value, + value.gate_value, + value.reset_output_value, + frame_size, + active_node, + origin_mode); + } + return; + } else { + int frame_per_block = frame_size <= 1024 ? 
frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); + } + } else { + threads = dim3(32, 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + auto blas = + phi::funcs::GetBlas(context); + if (value.prev_out_value) { + blas.GEMM(false, + false, + batch_size, + frame_size * 2, + frame_size, + 1, + value.prev_out_value, + frame_size, + value.gate_weight, + frame_size * 2, + 1, + value.gate_value, + frame_size * 3); + } + + if (batch_size == 1) { + detail::KeGruForwardResetOutput< + phi::funcs::detail::forward::gru_resetOutput, + /* is_batch= */ false, + T><<>>( + phi::funcs::detail::forward::gru_resetOutput(), + value.gate_value, + value.reset_output_value, + value.prev_out_value, + frame_size, + batch_size, + active_gate); + } else { + detail::KeGruForwardResetOutput< + phi::funcs::detail::forward::gru_resetOutput, + /* is_batch= */ true, + T><<>>( + phi::funcs::detail::forward::gru_resetOutput(), + value.gate_value, + value.reset_output_value, + value.prev_out_value, + frame_size, + batch_size, + active_gate); + } + + if (value.prev_out_value) { + blas.GEMM(false, + false, + batch_size, + frame_size, + frame_size, + 1, + value.reset_output_value, + frame_size, + value.state_weight, + frame_size, + 1, + value.gate_value + frame_size * 2, + frame_size * 3); + } + + if (batch_size == 1) { + detail::KeGruForwardFinalOutput< + phi::funcs::detail::forward::gru_finalOutput, + /* is_batch= */ false, + T><<>>( + phi::funcs::detail::forward::gru_finalOutput(), + value.gate_value, + value.prev_out_value, + value.output_value, + frame_size, + batch_size, + active_node, + origin_mode); + } else { + detail::KeGruForwardFinalOutput< + phi::funcs::detail::forward::gru_finalOutput, + /* is_batch= */ true, + T><<>>( + phi::funcs::detail::forward::gru_finalOutput(), + value.gate_value, + value.prev_out_value, + value.output_value, + frame_size, + batch_size, + active_node, + origin_mode); + } + } +}; + +template +struct GRUUnitGradFunctor { + static void compute(const paddle::platform::CUDADeviceContext &context, + GRUMetaValue value, + GRUMetaGrad grad, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate, + bool origin_mode) { + auto stream = context.stream(); + dim3 threads; + dim3 grid; + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? 
frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); + } else { + threads = dim3(32, 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); + } + + if (batch_size == 1) { + detail::KeGruBackwardStateGrad< + phi::funcs::detail::backward::gru_stateGrad, + /* is_batch= */ false><<>>( + phi::funcs::detail::backward::gru_stateGrad(), + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.output_grad, + frame_size, + batch_size, + active_node, + origin_mode); + } else { + detail::KeGruBackwardStateGrad< + phi::funcs::detail::backward::gru_stateGrad, + /* is_batch= */ true><<>>( + phi::funcs::detail::backward::gru_stateGrad(), + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.output_grad, + frame_size, + batch_size, + active_node, + origin_mode); + } + + auto blas = + phi::funcs::GetBlas(context); + + if (value.prev_out_value && grad.prev_out_grad) { + blas.GEMM(false, + true, + batch_size, + frame_size, + frame_size, + 1, + grad.gate_grad + frame_size * 2, + frame_size * 3, + value.state_weight, + frame_size, + 0, + grad.reset_output_grad, + frame_size); + + if (grad.state_weight_grad) { + blas.GEMM(true, + false, + frame_size, + frame_size, + batch_size, + 1, + value.reset_output_value, + frame_size, + grad.gate_grad + frame_size * 2, + frame_size * 3, + 1, + grad.state_weight_grad, + frame_size); + } + } + + if (batch_size == 1) { + detail::KeGruBackwardResetGrad< + phi::funcs::detail::backward::gru_resetGrad, + /* is_batch= */ false><<>>( + phi::funcs::detail::backward::gru_resetGrad(), + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.reset_output_grad, + frame_size, + batch_size, + active_gate); + } else { + detail::KeGruBackwardResetGrad< + phi::funcs::detail::backward::gru_resetGrad, + /* is_batch= */ true><<>>( + phi::funcs::detail::backward::gru_resetGrad(), + value.gate_value, + grad.gate_grad, + value.prev_out_value, + grad.prev_out_grad, + grad.reset_output_grad, + frame_size, + batch_size, + active_gate); + } + + if (grad.prev_out_grad && value.prev_out_value) { + blas.GEMM(false, + true, + batch_size, + frame_size, + frame_size * 2, + 1, + grad.gate_grad, + frame_size * 3, + value.gate_weight, + frame_size * 2, + 1, + grad.prev_out_grad, + frame_size); + + if (grad.gate_weight_grad) { + blas.GEMM(true, + false, + frame_size, + frame_size * 2, + batch_size, + 1, + value.prev_out_value, + frame_size, + grad.gate_grad, + frame_size * 3, + 1, + grad.gate_weight_grad, + frame_size * 2); + } + } + } +}; + +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/gru_compute.h b/paddle/phi/kernels/funcs/gru_compute.h new file mode 100644 index 00000000000..02b2b91423c --- /dev/null +++ b/paddle/phi/kernels/funcs/gru_compute.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" + +namespace phi { +namespace funcs { + +template +struct GRUMetaValue { + const T *gate_weight; + const T *state_weight; + const T *reset_bias; + T *gate_value; + T *reset_output_value; + T *output_value; + const T *prev_out_value; +}; + +template +struct GRUMetaGrad { + T *gate_weight_grad; + T *state_weight_grad; + T *gate_grad; + T *reset_output_grad; + T *output_grad; + T *prev_out_grad; + T *bias_hh_grad; +}; + +template +struct GRUUnitFunctor { + static void compute(const DeviceContext &context, + GRUMetaValue value, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate, + bool origin_mode); +}; + +template +struct GRUUnitGradFunctor { + static void compute(const DeviceContext &context, + GRUMetaValue value, + GRUMetaGrad grad, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate, + bool origin_mode); +}; + +template +struct GRUUnitFunctorV2 { + static void compute(const DeviceContext &context, + GRUMetaValue value, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate); +}; + +template +struct GRUUnitGradFunctorV2 { + static void compute(const DeviceContext &context, + GRUMetaValue value, + GRUMetaGrad grad, + int frame_size, + int batch_size, + const phi::funcs::detail::ActivationType active_node, + const phi::funcs::detail::ActivationType active_gate); +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/lstm_compute.cc b/paddle/phi/kernels/funcs/lstm_compute.cc new file mode 100644 index 00000000000..19932c62b01 --- /dev/null +++ b/paddle/phi/kernels/funcs/lstm_compute.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/funcs/lstm_compute.h" +#include "paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h" +#include "paddle/phi/kernels/funcs/detail/lstm_kernel.h" + +namespace phi { +namespace funcs { + +template +struct LstmUnitFunctor { + static void compute(const paddle::platform::CPUDeviceContext& context, + LstmMetaValue value, + int frame_size, + int batch_size, + T cell_clip, + const phi::funcs::detail::ActivationType& gate_act, + const phi::funcs::detail::ActivationType& cell_act, + const phi::funcs::detail::ActivationType& cand_act, + bool old_api_version = true) { + for (int b = 0; b < batch_size; b++) { + detail::cpu_lstm_forward(context, + phi::funcs::detail::forward::lstm(), + value, + frame_size, + cell_clip, + cand_act, + gate_act, + cell_act, + old_api_version); + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; + } + } + } +}; + +template +struct LstmUnitGradFunctor { + static void compute(const paddle::platform::CPUDeviceContext& context, + LstmMetaValue value, + LstmMetaGrad grad, + int frame_size, + int batch_size, + T cell_clip, + const phi::funcs::detail::ActivationType& gate_act, + const phi::funcs::detail::ActivationType& cell_act, + const phi::funcs::detail::ActivationType& cand_act, + bool old_api_version = true) { + for (int b = 0; b < batch_size; b++) { + detail::cpu_lstm_backward(context, + phi::funcs::detail::backward::lstm(), + value, + grad, + frame_size, + cell_clip, + cand_act, + gate_act, + cell_act, + old_api_version); + + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; + } + + grad.gate_grad += frame_size * 4; + grad.state_grad += frame_size; + grad.state_active_grad += frame_size; + grad.output_grad += frame_size; + if (grad.prev_state_grad) { + grad.prev_state_grad += frame_size; + } + } + } +}; + +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/lstm_compute.cu b/paddle/phi/kernels/funcs/lstm_compute.cu new file mode 100644 index 00000000000..b2057cfc4f9 --- /dev/null +++ b/paddle/phi/kernels/funcs/lstm_compute.cu @@ -0,0 +1,76 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h" +#include "paddle/phi/kernels/funcs/detail/lstm_kernel.h" +#include "paddle/phi/kernels/funcs/lstm_compute.h" + +namespace phi { +namespace funcs { + +template +struct LstmUnitFunctor { + static void compute(const paddle::platform::CUDADeviceContext& context, + LstmMetaValue value, + int frame_size, + int batch_size, + T cell_clip, + const phi::funcs::detail::ActivationType& gate_act, + const phi::funcs::detail::ActivationType& cell_act, + const phi::funcs::detail::ActivationType& cand_act, + bool old_api_version = true) { + detail::gpu_lstm_forward(context, + phi::funcs::detail::forward::lstm(), + value, + frame_size, + batch_size, + cell_clip, + cand_act, + gate_act, + cell_act); + } +}; + +template +struct LstmUnitGradFunctor { + static void compute(const paddle::platform::CUDADeviceContext& context, + LstmMetaValue value, + LstmMetaGrad grad, + int frame_size, + int batch_size, + T cell_clip, + const phi::funcs::detail::ActivationType& gate_act, + const phi::funcs::detail::ActivationType& cell_act, + const phi::funcs::detail::ActivationType& cand_act, + bool old_api_version = true) { + detail::gpu_lstm_backward(context, + phi::funcs::detail::backward::lstm(), + value, + grad, + frame_size, + batch_size, + cell_clip, + cand_act, + gate_act, + cell_act); + } +}; + +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/lstm_compute.h b/paddle/phi/kernels/funcs/lstm_compute.h similarity index 56% rename from paddle/fluid/operators/math/lstm_compute.h rename to paddle/phi/kernels/funcs/lstm_compute.h index cc91f784f39..d51b92fc4fd 100644 --- a/paddle/fluid/operators/math/lstm_compute.h +++ b/paddle/phi/kernels/funcs/lstm_compute.h @@ -14,13 +14,12 @@ limitations under the License. 
*/ #pragma once -#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/kernels/funcs/detail/activation_functions.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { template struct LstmMetaValue { @@ -49,25 +48,31 @@ struct LstmMetaGrad { template class LstmUnitFunctor { public: - static void compute(const DeviceContext &context, LstmMetaValue value, - int frame_size, int batch_size, T cell_clip, - const detail::ActivationType &gate_act, - const detail::ActivationType &cell_act, - const detail::ActivationType &cand_act, + static void compute(const DeviceContext &context, + LstmMetaValue value, + int frame_size, + int batch_size, + T cell_clip, + const phi::funcs::detail::ActivationType &gate_act, + const phi::funcs::detail::ActivationType &cell_act, + const phi::funcs::detail::ActivationType &cand_act, bool old_api_version = true); }; template class LstmUnitGradFunctor { public: - static void compute(const DeviceContext &context, LstmMetaValue value, - LstmMetaGrad grad, int frame_size, int batch_size, - T cell_clip, const detail::ActivationType &gate_act, - const detail::ActivationType &cell_act, - const detail::ActivationType &cand_act, + static void compute(const DeviceContext &context, + LstmMetaValue value, + LstmMetaGrad grad, + int frame_size, + int batch_size, + T cell_clip, + const phi::funcs::detail::ActivationType &gate_act, + const phi::funcs::detail::ActivationType &cell_act, + const phi::funcs::detail::ActivationType &cand_act, bool old_api_version = true); }; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/phi/kernels/funcs/sequence2batch.cc similarity index 56% rename from paddle/fluid/operators/math/sequence2batch.cc rename to paddle/phi/kernels/funcs/sequence2batch.cc index 852700fa7ff..0d75ba877db 100644 --- a/paddle/fluid/operators/math/sequence2batch.cc +++ b/paddle/phi/kernels/funcs/sequence2batch.cc @@ -12,47 +12,45 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { template -class CopyMatrixRowsFunctor { +class CopyMatrixRowsFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& src, - framework::Vector index_lod, framework::Tensor* dst, + void operator()(const paddle::platform::CPUDeviceContext& context, + const paddle::framework::Tensor& src, + paddle::framework::Vector index_lod, + paddle::framework::Tensor* dst, bool is_src_index) { size_t* index = index_lod.data(); auto src_dims = src.dims(); auto dst_dims = dst->dims(); - PADDLE_ENFORCE_EQ(src_dims.size(), 2UL, - platform::errors::InvalidArgument( + PADDLE_ENFORCE_EQ(src_dims.size(), + 2UL, + phi::errors::InvalidArgument( "The source tensor must be a matrix with rank 2, but " "got the source tensor rank is %lu. 
" "Please check the rank of the source tensor", src_dims.size())); - PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL, - platform::errors::InvalidArgument( + PADDLE_ENFORCE_EQ(dst_dims.size(), + 2UL, + phi::errors::InvalidArgument( "The destination tensor must be a matrix with rank, " "but got the destination tensor rank is %lu. " "Please check the rank of the destination tensor", dst_dims.size())); PADDLE_ENFORCE_EQ( - src_dims[1], dst_dims[1], - platform::errors::InvalidArgument( + src_dims[1], + dst_dims[1], + phi::errors::InvalidArgument( "The width of the source tensor and the destination tensor must be " "same. But got %lu != %lu.Please check the rank of the source " "tensor", - src_dims.size(), dst_dims.size())); + src_dims.size(), + dst_dims.size())); auto height = dst_dims[0]; auto width = dst_dims[1]; auto* src_data = src.data(); @@ -70,14 +68,18 @@ class CopyMatrixRowsFunctor { } }; -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template class Batch2LoDTensorFunctor; +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/phi/kernels/funcs/sequence2batch.cu similarity index 55% rename from paddle/fluid/operators/math/sequence2batch.cu rename to paddle/phi/kernels/funcs/sequence2batch.cu index f56c5293971..a66030e6426 100644 --- a/paddle/fluid/operators/math/sequence2batch.cu +++ b/paddle/phi/kernels/funcs/sequence2batch.cu @@ -11,15 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/phi/kernels/funcs/sequence2batch.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { template -__global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index, - int64_t height, int64_t width, +__global__ void CopyMatrixRowsKernel(const T* src, + T* dst, + const size_t* index, + int64_t height, + int64_t width, bool is_src_index) { int idx = threadIdx.x; int idy = threadIdx.y; @@ -37,33 +39,38 @@ __global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index, } template -class CopyMatrixRowsFunctor { +class CopyMatrixRowsFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& src, - framework::Vector index_lod, framework::Tensor* dst, + void operator()(const paddle::platform::CUDADeviceContext& context, + const paddle::framework::Tensor& src, + paddle::framework::Vector index_lod, + paddle::framework::Tensor* dst, bool is_src_index) { auto src_dims = src.dims(); auto dst_dims = dst->dims(); - PADDLE_ENFORCE_EQ(src_dims.size(), 2, - platform::errors::InvalidArgument( + PADDLE_ENFORCE_EQ(src_dims.size(), + 2, + phi::errors::InvalidArgument( "The source tensor must be a matrix with rank 2, but " "got the source tensor rank is %lu. 
" "Please check the rank of the source tensor", src_dims.size())); - PADDLE_ENFORCE_EQ(dst_dims.size(), 2, - platform::errors::InvalidArgument( + PADDLE_ENFORCE_EQ(dst_dims.size(), + 2, + phi::errors::InvalidArgument( "The destination tensor must be a matrix with rank, " "but got the destination tensor rank is %lu. " "Please check the rank of the destination tensor", dst_dims.size())); PADDLE_ENFORCE_EQ( - src_dims[1], dst_dims[1], - platform::errors::InvalidArgument( + src_dims[1], + dst_dims[1], + phi::errors::InvalidArgument( "The width of the source tensor and the destination tensor must be " "same. But got %lu != %lu.Please check the rank of the source " "tensor", - src_dims.size(), dst_dims.size())); + src_dims.size(), + dst_dims.size())); auto height = dst_dims[0]; auto width = dst_dims[1]; auto* src_data = src.data(); @@ -74,19 +81,28 @@ class CopyMatrixRowsFunctor { auto stream = context.stream(); paddle::framework::MixVector mix_index_lod(&index_lod); CopyMatrixRowsKernel<<>>( - src_data, dst_data, mix_index_lod.CUDAData(context.GetPlace()), height, - width, is_src_index); + src_data, + dst_data, + mix_index_lod.CUDAData(context.GetPlace()), + height, + width, + is_src_index); } }; -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template class Batch2LoDTensorFunctor; +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/phi/kernels/funcs/sequence2batch.h similarity index 80% rename from paddle/fluid/operators/math/sequence2batch.h rename to paddle/phi/kernels/funcs/sequence2batch.h index 6aa513e4d10..e7c387fb99b 100644 --- a/paddle/fluid/operators/math/sequence2batch.h +++ b/paddle/phi/kernels/funcs/sequence2batch.h @@ -20,13 +20,13 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { -template -using EigenMatrix = framework::EigenMatrix; +using EigenMatrix = paddle::framework::EigenMatrix; template class CopyMatrixRowsFunctor { @@ -36,8 +36,10 @@ class CopyMatrixRowsFunctor { // If is_src_index is false, // copy the input src to the indexed rows of output dst. // The indexed rows are based on the input index. 
- void operator()(const DeviceContext& context, const framework::Tensor& src, - framework::Vector index_lod, framework::Tensor* dst, + void operator()(const DeviceContext& context, + const paddle::framework::Tensor& src, + paddle::framework::Vector index_lod, + paddle::framework::Tensor* dst, bool is_src_index); }; @@ -59,32 +61,37 @@ class LoDTensor2BatchFunctor { public: void operator()(const DeviceContext& context, - const framework::LoDTensor& lod_tensor, - framework::LoDTensor* batch, bool is_cal_batch_lod, + const paddle::framework::LoDTensor& lod_tensor, + paddle::framework::LoDTensor* batch, + bool is_cal_batch_lod, bool is_reverse = false) const { if (!is_cal_batch_lod) { auto lods = batch->lod(); PADDLE_ENFORCE_GT( - lods.size(), 2UL, - platform::errors::InvalidArgument( + lods.size(), + 2UL, + phi::errors::InvalidArgument( "The LoD of LoDTensor should inlcude at least 2-level " "sequence information, but got the LoD level is %lu. Please " "check the input value.", lods.size())); PADDLE_ENFORCE_EQ( - lods[1].size(), static_cast(lod_tensor.dims()[0]), - platform::errors::InvalidArgument( + lods[1].size(), + static_cast(lod_tensor.dims()[0]), + phi::errors::InvalidArgument( "The LoD information should be consistent with the dims, but got " "%lu != %lu. Please check the input value.", - lods[1].size(), static_cast(lod_tensor.dims()[0]))); + lods[1].size(), + static_cast(lod_tensor.dims()[0]))); CopyMatrixRowsFunctor to_batch; to_batch(context, lod_tensor, lods[1], batch, true); return; } auto lods = lod_tensor.lod(); - PADDLE_ENFORCE_EQ(lods.size(), 1UL, - platform::errors::InvalidArgument( + PADDLE_ENFORCE_EQ(lods.size(), + 1UL, + phi::errors::InvalidArgument( "Only support one level sequence now, but got the " "LoD level is %lu. Please check the input value.", lods.size())); @@ -97,8 +104,9 @@ class LoDTensor2BatchFunctor { seq_info.emplace_back(lod[seq_id], length, seq_id); } - std::sort(seq_info.begin(), seq_info.end(), - [](SeqInfo a, SeqInfo b) { return a.length > b.length; }); + std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) { + return a.length > b.length; + }); // Calculate the start position of each batch. // example: sequences = {s0, s1, s2} @@ -169,27 +177,29 @@ template class Batch2LoDTensorFunctor { public: void operator()(const DeviceContext& context, - const framework::LoDTensor& batch, - framework::LoDTensor* lod_tensor) const { + const paddle::framework::LoDTensor& batch, + paddle::framework::LoDTensor* lod_tensor) const { auto in_lod = batch.lod(); PADDLE_ENFORCE_GT( - in_lod.size(), 2UL, - platform::errors::InvalidArgument( + in_lod.size(), + 2UL, + phi::errors::InvalidArgument( "The LoD of LoDTensor should inlcude at least 2-level " "sequence information, but got the LoD level is %lu. Please check " "the input value.", in_lod.size())); PADDLE_ENFORCE_EQ( - in_lod[1].size(), static_cast(lod_tensor->dims()[0]), - platform::errors::InvalidArgument( + in_lod[1].size(), + static_cast(lod_tensor->dims()[0]), + phi::errors::InvalidArgument( "The LoD information should be consistent with the dims, but got " "%lu != %lu. 
Please check the input value.", - in_lod[1].size(), static_cast(lod_tensor->dims()[0]))); + in_lod[1].size(), + static_cast(lod_tensor->dims()[0]))); CopyMatrixRowsFunctor to_seq; to_seq(context, batch, in_lod[1], lod_tensor, false); } }; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi -- GitLab From 1ff1c1e09a835123fdfe48cc7660f0d190c64e1e Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Wed, 2 Mar 2022 10:44:15 +0800 Subject: [PATCH 036/272] add share external data interface (#39809) --- .../api/analysis_predictor_tester.cc | 82 +++++++++++++++++ .../inference/api/details/zero_copy_tensor.cc | 87 +++++++++++++++++++ paddle/fluid/inference/api/paddle_tensor.h | 13 +++ 3 files changed, 182 insertions(+) diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index a15a1cd84b1..9c7e5c6b27e 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -13,6 +13,9 @@ // limitations under the License. #include "paddle/fluid/inference/api/analysis_predictor.h" +#if defined(PADDLE_WITH_CUDA) +#include +#endif #include #include #include // NOLINT @@ -405,4 +408,83 @@ TEST(Predictor, Run) { predictor->TryShrinkMemory(); } +TEST(Tensor, CpuShareExternalData) { + Config config; + config.SetModel(FLAGS_dirname); + + auto predictor = CreatePredictor(config); + + auto w0 = predictor->GetInputHandle("firstw"); + auto w1 = predictor->GetInputHandle("secondw"); + auto w2 = predictor->GetInputHandle("thirdw"); + auto w3 = predictor->GetInputHandle("forthw"); + + std::vector> input_data(4, {0, 1, 2, 3}); + w0->ShareExternalData(input_data[0].data(), {4, 1}, PlaceType::kCPU); + w1->ShareExternalData(input_data[1].data(), {4, 1}, PlaceType::kCPU); + w2->ShareExternalData(input_data[2].data(), {4, 1}, PlaceType::kCPU); + w3->ShareExternalData(input_data[3].data(), {4, 1}, PlaceType::kCPU); + + auto out = predictor->GetOutputHandle("fc_1.tmp_2"); + auto out_shape = out->shape(); + std::vector out_data; + out_data.resize(std::accumulate(out_shape.begin(), out_shape.end(), 1, + std::multiplies())); + out->ShareExternalData(out_data.data(), out_shape, PlaceType::kCPU); + + predictor->Run(); + + PlaceType place; + int size = 0; + out->data(&place, &size); + LOG(INFO) << "output size: " << size / sizeof(float); + predictor->TryShrinkMemory(); +} + +#if defined(PADDLE_WITH_CUDA) +TEST(Tensor, GpuShareExternalData) { + Config config; + config.SetModel(FLAGS_dirname); + config.EnableUseGpu(100, 0); + + auto predictor = CreatePredictor(config); + + auto w0 = predictor->GetInputHandle("firstw"); + auto w1 = predictor->GetInputHandle("secondw"); + auto w2 = predictor->GetInputHandle("thirdw"); + auto w3 = predictor->GetInputHandle("forthw"); + + std::vector> input_data(4, {0, 1, 2, 3}); + std::vector input_gpu(4, nullptr); + + for (size_t i = 0; i < 4; ++i) { + cudaMalloc(reinterpret_cast(&input_gpu[i]), 4 * sizeof(int64_t)); + cudaMemcpy(input_gpu[i], input_data[i].data(), 4 * sizeof(int64_t), + cudaMemcpyHostToDevice); + } + + w0->ShareExternalData(input_gpu[0], {4, 1}, PlaceType::kGPU); + w1->ShareExternalData(input_gpu[1], {4, 1}, PlaceType::kGPU); + w2->ShareExternalData(input_gpu[2], {4, 1}, PlaceType::kGPU); + w3->ShareExternalData(input_gpu[3], {4, 1}, PlaceType::kGPU); + + auto out = predictor->GetOutputHandle("fc_1.tmp_2"); + auto out_shape = out->shape(); + 
float* out_data; + auto out_size = std::accumulate(out_shape.begin(), out_shape.end(), 1, + std::multiplies()) * + sizeof(float); + cudaMalloc(reinterpret_cast(out_data), out_size * sizeof(float)); + out->ShareExternalData(out_data, out_shape, PlaceType::kGPU); + + predictor->Run(); + + PlaceType place; + int size = 0; + out->data(&place, &size); + LOG(INFO) << "output size: " << size / sizeof(float); + predictor->TryShrinkMemory(); +} +#endif + } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 1d09b01f8f8..18b1d09f0e8 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/allocator.h" namespace paddle_infer { @@ -205,6 +206,73 @@ void Tensor::CopyFromCpu(const T *data) { } } +template +struct DataTypeInfo; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::FLOAT32; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::FLOAT16; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::INT64; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::INT8; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::UINT8; +}; + +template <> +struct DataTypeInfo { + paddle::experimental::DataType TYPE = paddle::experimental::DataType::INT32; +}; + +paddle::experimental::DataLayout LayoutConvert(DataLayout layout) { + PADDLE_ENFORCE_EQ( + layout, DataLayout::kNCHW, + paddle::platform::errors::InvalidArgument("Only NCHW is supported now.")); + return paddle::experimental::DataLayout::NCHW; +} + +template +void Tensor::ShareExternalData(const T *data, const std::vector &shape, + PlaceType place, DataLayout layout) { + EAGER_GET_TENSOR(paddle::framework::LoDTensor) + size_t size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * + sizeof(T); + phi::DenseTensorMeta meta(DataTypeInfo().TYPE, phi::make_ddim(shape), + LayoutConvert(layout)); + if (place == PlaceType::kCPU) { + phi::DenseTensor dtensor( + std::make_shared(const_cast(data), size, + paddle::platform::CPUPlace()), + meta); + *tensor = std::move(dtensor); + } else if (place == PlaceType::kGPU) { + phi::DenseTensor dtensor( + std::make_shared(const_cast(data), size, + paddle::platform::CUDAPlace(device_)), + meta); + *tensor = std::move(dtensor); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "PlaceType must be PlaceType::kCPU or PlaceType::kGPU.")); + } +} + void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) { EAGER_GET_TENSOR(paddle_infer::Strings); PADDLE_ENFORCE_GE(tensor->size(), 0, @@ -334,6 +402,25 @@ template PD_INFER_DECL void Tensor::CopyFromCpu(const uint8_t *data); template PD_INFER_DECL void Tensor::CopyFromCpu(const int8_t *data); template PD_INFER_DECL void Tensor::CopyFromCpu(const float16 *data); +template PD_INFER_DECL void Tensor::ShareExternalData( + const float *data, const std::vector &shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( + const int64_t *data, const std::vector 
&shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( + const int32_t *data, const std::vector &shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( + const uint8_t *data, const std::vector &shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( + const int8_t *data, const std::vector &shape, PlaceType place, + DataLayout layout); +template PD_INFER_DECL void Tensor::ShareExternalData( + const float16 *data, const std::vector &shape, PlaceType place, + DataLayout layout); + template PD_INFER_DECL void Tensor::CopyToCpu(float *data) const; template PD_INFER_DECL void Tensor::CopyToCpu(int64_t *data) const; template PD_INFER_DECL void Tensor::CopyToCpu(int32_t *data) const; diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 81eecbb2c14..5a98d109aed 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -47,6 +47,8 @@ enum DataType { enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU, kIPU }; +enum class DataLayout { kUNK = -1, kAny, kNHWC, kNCHW }; + /// \brief Represents an n-dimensional array of values. /// The Tensor is used to store the input or output of the network. /// Zero copy means that the tensor supports direct copy of host or device data @@ -92,6 +94,17 @@ class PD_INFER_DECL Tensor { template void CopyFromCpu(const T* data); + /// \brief Share the data with tensor data. + /// It's usually used to set the tensor data. + /// \param data The pointer of the data, from which the tensor will share. + /// \param shape The shape of data. + /// \param place The place of data. + /// \param layout The layout of data. Only NCHW is supported now. + template + void ShareExternalData(const T* data, const std::vector& shape, + PlaceType place, + DataLayout layout = DataLayout::kNCHW); + /// \brief Experimental interface. /// It's usually used to set the input tensor data with Strings data type. /// \param data The pointer of the data, from which the tensor will copy. 
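For quick reference, a minimal CPU-side usage sketch of the interface documented above; the wrapper function, the input name "x", the 4x1 shape and the public header name are illustrative assumptions rather than part of this patch, and the call mirrors the CpuShareExternalData test added earlier in this commit.

#include <vector>
#include "paddle_inference_api.h"  // assumed public inference header

void RunWithSharedInput(paddle_infer::Predictor* predictor) {
  // Caller-owned buffer; it must stay alive until Run() finishes, because the
  // tensor wraps the external pointer instead of copying the data.
  std::vector<float> input(4, 0.f);
  auto in = predictor->GetInputHandle("x");  // "x" is a placeholder input name
  in->ShareExternalData(input.data(), {4, 1}, paddle_infer::PlaceType::kCPU);
  // The layout argument defaults to DataLayout::kNCHW, per the declaration above.
  predictor->Run();
}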
-- GitLab From 26e2b918d80bb60855b9d1f8c0251d81e7c9e569 Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Wed, 2 Mar 2022 11:14:04 +0800 Subject: [PATCH 037/272] ernie: revert skip_layernorm_fp16 (#39991) --- paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 71c4348685e..753cd707276 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -105,7 +105,7 @@ class SkipLayerNormOpConverter : public OpConverter { "in CustomSkipLayerNormPluginDynamic hidden " "dimension should > 0")); if (enable_int8) { - type = static_cast(nvinfer1::DataType::kINT8); + type = static_cast(nvinfer1::DataType::kHALF); } const std::vector fields{ -- GitLab From 9af72957520e4dffa6356bc637e0532bd799ab75 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Wed, 2 Mar 2022 11:14:25 +0800 Subject: [PATCH 038/272] [Eager] open eager when WITH_PYTHON (#39979) * open eager when WITH_PYTHON, test=develop * refine, test=develop * refine, test=develop * add DWITH_PYTHON for gen_fluid_lib, test=develop --- paddle/fluid/eager/CMakeLists.txt | 2 +- paddle/fluid/eager/api/generated/CMakeLists.txt | 2 +- .../eager_generated/backwards/CMakeLists.txt | 2 +- .../eager_generated/forwards/CMakeLists.txt | 2 +- paddle/fluid/eager/tests/CMakeLists.txt | 2 +- paddle/fluid/eager/tests/task_tests/CMakeLists.txt | 2 +- paddle/fluid/pybind/CMakeLists.txt | 8 ++++---- paddle/fluid/pybind/pybind.cc | 8 ++------ paddle/scripts/paddle_build.sh | 14 ++++++++++---- python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 +- 10 files changed, 23 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 5e16ab2b391..8cb69caf663 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -2,7 +2,7 @@ set(eager_deps phi phi_api hook_utils tensor_utils utils global_utils backward p set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) set(generated_deps dygraph_function dygraph_node) -if(NOT ON_INFER) +if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) message("Performing Eager Dygraph Auto Code Generation") add_subdirectory(auto_code_generator) endif() diff --git a/paddle/fluid/eager/api/generated/CMakeLists.txt b/paddle/fluid/eager/api/generated/CMakeLists.txt index ebbef286f79..4f634c6884b 100644 --- a/paddle/fluid/eager/api/generated/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/CMakeLists.txt @@ -1,5 +1,5 @@ add_subdirectory(eager_generated) -if(NOT ON_INFER) +if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_subdirectory(fluid_generated) endif() diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt index 77d8ec57efc..81ff07b8963 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(scale_node SRCS scale_node.cc DEPS global_utils phi phi_api grad_node_info) -if(NOT ON_INFER) +if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library(final_dygraph_node SRCS nodes.cc DEPS ${eager_deps}) add_dependencies(final_dygraph_node eager_final_state_codegen) endif() diff --git 
a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt index 60b35340eab..c70bb80c35c 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(eager_scale SRCS scale.cc DEPS phi_api phi autograd_meta scale_node) -if(NOT ON_INFER) +if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library(final_dygraph_function SRCS dygraph_functions.cc DEPS ${eager_deps}) add_dependencies(final_dygraph_function eager_final_state_codegen) endif() diff --git a/paddle/fluid/eager/tests/CMakeLists.txt b/paddle/fluid/eager/tests/CMakeLists.txt index c1506d8139b..2bfb9937c8c 100644 --- a/paddle/fluid/eager/tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/CMakeLists.txt @@ -1,6 +1,6 @@ add_subdirectory(data_structure_tests) add_subdirectory(task_tests) -if(NOT ON_INFER) +if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_subdirectory(performance_tests) endif() diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt index dbdb52eb536..c65ad4641cf 100644 --- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt @@ -6,7 +6,7 @@ cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} ea cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) -if(NOT ON_INFER) +if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_test(test_egr_task_hook_intermidiate SRCS hook_test_intermidiate.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node) cc_test(test_egr_task_autocodegen SRCS generated_test.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps}) endif() diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index c61e8212b02..48d42f803a8 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -242,7 +242,7 @@ if(WITH_PYTHON) COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file} COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" DEPENDS ${OP_IMPL_DEPS}) - if(NOT ON_INFER) + if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_custom_command(OUTPUT ${eager_impl_file} COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/eager_op_function_generator_retry.bat COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_eager_impl_file} ${eager_impl_file} @@ -276,7 +276,7 @@ if(WITH_PYTHON) COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" DEPENDS ${OP_IMPL_DEPS} VERBATIM) - if(NOT ON_INFER) + if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_custom_command(OUTPUT ${eager_impl_file} COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:." 
"${CMAKE_CURRENT_BINARY_DIR}/eager_op_function_generator" @@ -288,7 +288,7 @@ if(WITH_PYTHON) endif() endif(WIN32) add_custom_target(op_function_generator_cmd ALL DEPENDS ${impl_file}) - if(NOT ON_INFER) + if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_custom_target(eager_op_function_generator_cmd ALL DEPENDS ${eager_impl_file}) endif() @@ -296,7 +296,7 @@ if(WITH_PYTHON) cc_library(op_function_common SRCS op_function_common.cc DEPS ${PYBIND_DEPS}) list(APPEND PYBIND_DEPS op_function_common) - if(NOT ON_INFER) + if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_library(paddle_eager SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc DEPS eager_api autograd_meta backward grad_node_info phi op_function_common final_dygraph_function final_dygraph_node dygraph_function dygraph_node accumulation_node global_utils utils python) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3d8815e2eb6..2d9272dd0ed 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -79,12 +79,10 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/pybind/cuda_streams_py.h" #include "paddle/fluid/pybind/distributed_py.h" -#include "paddle/phi/core/compat/convert_utils.h" -#include "paddle/phi/core/lod_utils.h" -#ifndef PADDLE_ON_INFERENCE #include "paddle/fluid/pybind/eager.h" -#endif #include "paddle/fluid/pybind/io.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/lod_utils.h" #include "paddle/utils/none.h" #ifdef PADDLE_WITH_ASCEND #include "paddle/fluid/pybind/ascend_wrapper_py.h" @@ -529,9 +527,7 @@ PYBIND11_MODULE(core_avx, m) { PYBIND11_MODULE(core_noavx, m) { #endif -#ifndef PADDLE_ON_INFERENCE BindEager(&m); -#endif BindCudaStream(&m); // Not used, just make sure cpu_info.cc is linked. diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 8528ba34e21..9bef7e12851 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -2374,7 +2374,7 @@ EOF fi startTime_s=`date +%s` set +e - cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON -DWITH_TENSORRT=ON -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-Auto};build_error=$? + cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON -DWITH_TENSORRT=ON -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-Auto} -DWITH_PYTHON=${WITH_PYTHON:-ON};build_error=$? 
# reset ccache zero stats for collect PR's actual hit rate ccache -z @@ -2739,7 +2739,9 @@ function main() { test_fluid_lib ;; build_inference_lib) - python ${PADDLE_ROOT}/tools/remove_grad_op_and_kernel.py + if [ "${WITH_PYTHON}" == "OFF" ] ; then + python ${PADDLE_ROOT}/tools/remove_grad_op_and_kernel.py + fi cmake_gen ${PYTHON_ABI:-""} gen_fluid_lib ${parallel_number} ;; @@ -2790,7 +2792,9 @@ function main() { ;; test_inference) PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )" - python ${PADDLE_ROOT}/tools/remove_grad_op_and_kernel.py + if [ "${WITH_PYTHON}" == "OFF" ] ; then + python ${PADDLE_ROOT}/tools/remove_grad_op_and_kernel.py + fi gen_fluid_lib ${parallel_number} test_fluid_lib #test_fluid_lib_train @@ -2800,7 +2804,9 @@ function main() { ;; build_inference) PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )" - python ${PADDLE_ROOT}/tools/remove_grad_op_and_kernel.py + if [ "${WITH_PYTHON}" == "OFF" ] ; then + python ${PADDLE_ROOT}/tools/remove_grad_op_and_kernel.py + fi gen_fluid_lib ${parallel_number} ;; gpu_inference) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 7d64cf7bd89..2f6df075478 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -116,7 +116,7 @@ foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) endforeach() -if(ON_INFER) +if(NOT WITH_PYTHON AND ON_INFER) LIST(REMOVE_ITEM TEST_OPS test_eager_trace_op) endif() -- GitLab From fb63508931868bd00d55af2abc34dfbd5c59915d Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Wed, 2 Mar 2022 11:15:10 +0800 Subject: [PATCH 039/272] optimize CUDA implementaion of randint OP (#39952) * change CUDA implementaion of randint OP,move distribution common func to phi * fix CI * fix CI --- .../phi/kernels/funcs/distribution_helper.h | 94 +++++++++++++++---- paddle/phi/kernels/gpu/bernoulli_kernel.cu | 4 +- paddle/phi/kernels/gpu/randint_kernel.cu | 56 ++++++----- .../phi/kernels/gpu/uniform_random_kernel.cu | 6 +- .../tests/unittests/test_cuda_random_seed.py | 6 +- .../fluid/tests/unittests/test_randint_op.py | 45 +++++++++ 6 files changed, 162 insertions(+), 49 deletions(-) diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h index 49e1c82482c..f0793fb9d27 100644 --- a/paddle/phi/kernels/funcs/distribution_helper.h +++ b/paddle/phi/kernels/funcs/distribution_helper.h @@ -21,12 +21,11 @@ limitations under the License. */ #include #endif +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/generator.h" - -#include "paddle/phi/kernels/funcs/index_impl.cu.h" +#include "paddle/phi/core/hostdevice.h" #if defined(__NVCC__) || defined(__HIPCC__) #include "paddle/phi/kernels/primitive/kernel_primitives.h" @@ -40,7 +39,7 @@ limitations under the License. 
*/ #endif namespace phi { -namespace distribution { +namespace funcs { /********************* Transformation Function **********************/ template @@ -64,8 +63,9 @@ struct exponential_transform { }; template -struct uniform_transform { - explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {} +struct uniform_real_transform { + explicit uniform_real_transform(T min, T max) + : range_(max - min), min_(min) {} HOSTDEVICE inline T operator()(T val) const { if (UNLIKELY(val == static_cast(1.0))) { @@ -80,6 +80,22 @@ struct uniform_transform { T min_; }; +template +struct uniform_int_transform { + explicit uniform_int_transform(int min, int max) { + range_ = static_cast(max - min); + min_ = min; + } + + HOSTDEVICE inline T operator()(R rand) const { + return static_cast(static_cast(rand % range_) + min_); + } + + private: + uint32_t range_; + int min_; +}; + template struct normal_transform { explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {} @@ -120,6 +136,27 @@ struct uniform_distribution { static constexpr int kReturnsCount = 2; }; +template <> +struct uniform_distribution { + __device__ inline uint4 operator()(curandStatePhilox4_32_10_t *state) const { + return curand4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline ulonglong2 operator()( + curandStatePhilox4_32_10_t *state) const { + ulonglong2 result; + uint4 rand = curand4(state); + result.x = (uint64_t)rand.x << 32 | rand.y; + result.y = (uint64_t)rand.z << 32 | rand.w; + return result; + } + static constexpr int kReturnsCount = 2; +}; + template <> struct normal_distribution { __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { @@ -156,6 +193,27 @@ struct uniform_distribution { static constexpr int kReturnsCount = 2; }; +template <> +struct uniform_distribution { + __device__ inline uint4 operator()(hiprandStatePhilox4_32_10_t *state) const { + return hiprand4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline ulonglong2 operator()( + hiprandStatePhilox4_32_10_t *state) const { + ulonglong2 result; + uint4 rand = hiprand4(state); + result.x = (uint64_t)rand.x << 32 | rand.y; + result.y = (uint64_t)rand.z << 32 | rand.w; + return result; + } + static constexpr int kReturnsCount = 2; +}; + template <> struct normal_distribution { __device__ inline float4 operator()( @@ -209,19 +267,21 @@ __global__ void DistributionKernel(size_t size, } template -void distribution_and_transform(const GPUContext &dev_ctx, +void distribution_and_transform(const GPUContext &ctx, DenseTensor *out, DistOp dist, TransformOp trans) { - T *out_data = dev_ctx.template Alloc(out); + T *out_data = ctx.template Alloc(out); auto size = out->numel(); - - int64_t device_id = dev_ctx.GetPlace().GetDeviceId(); - auto gen_cuda = dev_ctx.GetGenerator(); + if (size == 0) return; + auto gen_cuda = ctx.GetGenerator(); size_t block_size = 256; size_t expect_grid_size = (size + block_size - 1) / block_size; - const auto &prop = backends::gpu::GetDeviceProperties(device_id); + + int64_t device_id = ctx.GetPlace().GetDeviceId(); + const auto &prop = phi::backends::gpu::GetDeviceProperties(device_id); + size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) * prop.multiProcessorCount; size_t grid_size = @@ -237,13 +297,13 @@ void distribution_and_transform(const GPUContext &dev_ctx, uint64_t seed = seed_offset.first; uint64_t offset = 
seed_offset.second; - DistributionKernel< - T, - DistOp, - TransformOp><<>>( + DistributionKernel<<>>( size, seed, offset, dist, trans, out_data, total_thread); } #endif -} // namespace distribution + +} // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu index ac69d398b8a..2b6140d2fde 100644 --- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu +++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu @@ -29,9 +29,9 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/bernoulli_kernel.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/platform/transform.h" DECLARE_bool(use_curand); @@ -77,7 +77,7 @@ __global__ void bernoulli_cuda_kernel( size_t total_thread = gridDim.x * blockDim.x; for (size_t i = 4 * thread_idx; i < size; i += total_thread * 4) { - paddle::distribution::uniform_distribution dist; + funcs::uniform_distribution dist; float4 rand = dist(&state); #pragma unroll for (size_t j = 0; j < 4; j++) { diff --git a/paddle/phi/kernels/gpu/randint_kernel.cu b/paddle/phi/kernels/gpu/randint_kernel.cu index 66dc5f72a5c..d4cbd5c73fe 100644 --- a/paddle/phi/kernels/gpu/randint_kernel.cu +++ b/paddle/phi/kernels/gpu/randint_kernel.cu @@ -18,10 +18,13 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/memory/memcpy.h" +DECLARE_bool(use_curand); + namespace phi { template @@ -32,34 +35,39 @@ void RandintRawKernel(const Context& dev_ctx, DataType dtype, int seed, DenseTensor* out) { - DenseTensor tmp; - tmp.Resize(phi::make_ddim(shape.GetData())); - T* tmp_data = dev_ctx.template HostAlloc(&tmp); - - out->Resize(tmp.dims()); + out->Resize(phi::make_ddim(shape.GetData())); T* data = dev_ctx.template Alloc(out); - - std::shared_ptr engine; - if (seed) { - engine = std::make_shared(); - engine->seed(seed); + if (FLAGS_use_curand) { + funcs::uniform_distribution dist; + funcs::uniform_int_transform trans(low, high); + funcs::distribution_and_transform(dev_ctx, out, dist, trans); } else { - engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); - } + DenseTensor tmp; + tmp.Resize(phi::make_ddim(shape.GetData())); + T* tmp_data = dev_ctx.template HostAlloc(&tmp); - std::uniform_int_distribution dist(low, high - 1); - auto numel = out->numel(); - for (int64_t i = 0; i < numel; ++i) { - tmp_data[i] = dist(*engine); - } + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); + } + + std::uniform_int_distribution dist(low, high - 1); + auto numel = out->numel(); + for (int64_t i = 0; i < numel; ++i) { + tmp_data[i] = dist(*engine); + } - paddle::memory::Copy( - out->place(), - data, - tmp.place(), - tmp_data, - numel * paddle::experimental::SizeOf(out->dtype()), - 0); + paddle::memory::Copy( + out->place(), + data, + tmp.place(), + tmp_data, + numel * paddle::experimental::SizeOf(out->dtype()), + 0); + } } template diff --git a/paddle/phi/kernels/gpu/uniform_random_kernel.cu b/paddle/phi/kernels/gpu/uniform_random_kernel.cu index 7f24a6667e5..cdab9faf6aa 100644 --- a/paddle/phi/kernels/gpu/uniform_random_kernel.cu +++ 
b/paddle/phi/kernels/gpu/uniform_random_kernel.cu @@ -116,9 +116,9 @@ void UniformRandomRawKernel(const Context& dev_ctx, if (generator->GetIsInitPy() && seed_flag) { if (FLAGS_use_curand) { using MT = typename kps::details::MPTypeTrait::Type; - distribution::uniform_distribution dist; - distribution::uniform_transform trans(min, max); - distribution::distribution_and_transform(dev_ctx, out, dist, trans); + funcs::uniform_distribution dist; + funcs::uniform_real_transform trans(min, max); + funcs::distribution_and_transform(dev_ctx, out, dist, trans); } else { auto seed_offset = generator->IncrementOffset(1); int64_t gen_offset = size * seed_offset.second; diff --git a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py index 686e738b8e0..69760192102 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py @@ -93,11 +93,11 @@ class TestGeneratorSeed(unittest.TestCase): fluid.enable_dygraph() - gen = paddle.seed(12312321111) + paddle.seed(12312321111) x = paddle.randint(low=10, shape=[10], dtype="int32") - st1 = gen.get_state() + st1 = paddle.get_cuda_rng_state() x1 = paddle.randint(low=10, shape=[10], dtype="int32") - gen.set_state(st1) + paddle.set_cuda_rng_state(st1) x2 = paddle.randint(low=10, shape=[10], dtype="int32") paddle.seed(12312321111) x3 = paddle.randint(low=10, shape=[10], dtype="int32") diff --git a/python/paddle/fluid/tests/unittests/test_randint_op.py b/python/paddle/fluid/tests/unittests/test_randint_op.py index 82bfb88d54d..5f58054d7ef 100644 --- a/python/paddle/fluid/tests/unittests/test_randint_op.py +++ b/python/paddle/fluid/tests/unittests/test_randint_op.py @@ -20,6 +20,9 @@ from op_test import OpTest import paddle from paddle.fluid import core from paddle.static import program_guard, Program +import os + +paddle.enable_static() def output_hist(out): @@ -156,5 +159,47 @@ class TestRandintImperative(unittest.TestCase): paddle.enable_static() +class TestRandomValue(unittest.TestCase): + def test_fixed_random_number(self): + # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' + if not paddle.is_compiled_with_cuda(): + return + + # Different GPU generatte different random value. Only test V100 here. 
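        # Context for the fixed-value checks below: with FLAGS_use_curand the
        # GPU randint path samples from the counter-based Philox4_32-10
        # generator via uniform_int_transform above, so a fixed seed gives a
        # reproducible sequence on a given device family; int64 draws combine
        # two 32-bit words as (uint64_t)hi << 32 | lo. A rough NumPy sketch of
        # the value mapping only (illustrative, not the kernel itself):
        #   words = np.array([123456789, 987654321], dtype=np.uint32)
        #   vals  = low + (words % np.uint32(high - low))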
+ if not "V100" in paddle.device.cuda.get_device_name(): + return + + if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): + return + + print("Test Fixed Random number on GPU------>") + paddle.disable_static() + paddle.set_device('gpu') + paddle.seed(100) + + x = paddle.randint( + -10000, 10000, [32, 3, 1024, 1024], dtype='int32').numpy() + self.assertTrue(x.mean(), -0.7517569760481516) + self.assertTrue(x.std(), 5773.696619107639) + expect = [2535, 2109, 5916, -5011, -261] + self.assertTrue(np.array_equal(x[10, 0, 100, 100:105], expect)) + expect = [3465, 7206, -8660, -9628, -6574] + self.assertTrue(np.array_equal(x[20, 1, 600, 600:605], expect)) + expect = [881, 1560, 1100, 9664, 1669] + self.assertTrue(np.array_equal(x[30, 2, 1000, 1000:1005], expect)) + + x = paddle.randint( + -10000, 10000, [32, 3, 1024, 1024], dtype='int64').numpy() + self.assertTrue(x.mean(), -1.461287518342336) + self.assertTrue(x.std(), 5773.023477548159) + expect = [7213, -9597, 754, 8129, -1158] + self.assertTrue(np.array_equal(x[10, 0, 100, 100:105], expect)) + expect = [-7159, 8054, 7675, 6980, 8506] + self.assertTrue(np.array_equal(x[20, 1, 600, 600:605], expect)) + expect = [3581, 3420, -8027, -5237, -2436] + self.assertTrue(np.array_equal(x[30, 2, 1000, 1000:1005], expect)) + paddle.enable_static() + + if __name__ == "__main__": unittest.main() -- GitLab From aa47297a5cf94fcd56b8647332ee92f971565d86 Mon Sep 17 00:00:00 2001 From: lkylkylky <48178838+daidaiershidi@users.noreply.github.com> Date: Wed, 2 Mar 2022 11:25:18 +0800 Subject: [PATCH 040/272] fix unittests for eignvalsh (#39841) --- .../fluid/tests/unittests/test_eigvalsh_op.py | 40 ++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_eigvalsh_op.py b/python/paddle/fluid/tests/unittests/test_eigvalsh_op.py index db023722676..93745d9561f 100644 --- a/python/paddle/fluid/tests/unittests/test_eigvalsh_op.py +++ b/python/paddle/fluid/tests/unittests/test_eigvalsh_op.py @@ -60,8 +60,12 @@ class TestEigvalshGPUCase(unittest.TestCase): self.dtype = "float32" np.random.seed(123) self.x_np = np.random.random(self.x_shape).astype(self.dtype) - self.rtol = 1e-5 - self.atol = 1e-5 + if (paddle.version.cuda() >= "11.6"): + self.rtol = 5e-6 + self.atol = 6e-5 + else: + self.rtol = 1e-5 + self.atol = 1e-5 def test_check_output_gpu(self): if paddle.is_compiled_with_cuda(): @@ -75,23 +79,29 @@ class TestEigvalshGPUCase(unittest.TestCase): class TestEigvalshAPI(unittest.TestCase): def setUp(self): - self.init_input_shape() + self.x_shape = [5, 5] self.dtype = "float32" self.UPLO = 'L' - self.rtol = 1e-6 - self.atol = 1e-6 + if (paddle.version.cuda() >= "11.6"): + self.rtol = 5e-6 + self.atol = 6e-5 + else: + self.rtol = 1e-5 + self.atol = 1e-5 self.place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() \ else paddle.CPUPlace() np.random.seed(123) + self.init_input_data() + + def init_input_data(self): self.real_data = np.random.random(self.x_shape).astype(self.dtype) - self.complex_data = np.random.random(self.x_shape).astype( + complex_data = np.random.random(self.x_shape).astype( self.dtype) + 1J * np.random.random(self.x_shape).astype(self.dtype) self.trans_dims = list(range(len(self.x_shape) - 2)) + [ len(self.x_shape) - 1, len(self.x_shape) - 2 ] - - def init_input_shape(self): - self.x_shape = [5, 5] + self.complex_symm = np.divide( + complex_data + np.conj(complex_data.transpose(self.trans_dims)), 2) def compare_result(self, actual_w, expected_w): 
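    # The init_input_data change above feeds eigvalsh a genuinely Hermitian
    # matrix: for any complex A, (A + conj(A).T) / 2 equals its own conjugate
    # transpose, which is the precondition eigvalsh assumes. A rough NumPy
    # sketch of the same construction (illustrative only):
    #   a = np.random.rand(5, 5) + 1j * np.random.rand(5, 5)
    #   herm = (a + a.conj().T) / 2
    #   assert np.allclose(herm, herm.conj().T)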
np.testing.assert_allclose( @@ -122,9 +132,9 @@ class TestEigvalshAPI(unittest.TestCase): output_w = paddle.linalg.eigvalsh(input_x) exe = paddle.static.Executor(self.place) expected_w = exe.run(main_prog, - feed={"input_x": self.complex_data}, + feed={"input_x": self.complex_symm}, fetch_list=[output_w]) - actual_w = np.linalg.eigvalsh(self.complex_data) + actual_w = np.linalg.eigvalsh(self.complex_symm) self.compare_result(actual_w, expected_w[0]) def test_in_static_mode(self): @@ -139,14 +149,14 @@ class TestEigvalshAPI(unittest.TestCase): actual_w = paddle.linalg.eigvalsh(input_real_data) self.compare_result(actual_w, expected_w) - input_complex_data = paddle.to_tensor(self.complex_data) - expected_w = np.linalg.eigvalsh(self.complex_data) - actual_w = paddle.linalg.eigvalsh(input_complex_data) + input_complex_symm = paddle.to_tensor(self.complex_symm) + expected_w = np.linalg.eigvalsh(self.complex_symm) + actual_w = paddle.linalg.eigvalsh(input_complex_symm) self.compare_result(actual_w, expected_w) def test_eigvalsh_grad(self): paddle.disable_static(self.place) - x = paddle.to_tensor(self.complex_data, stop_gradient=False) + x = paddle.to_tensor(self.complex_symm, stop_gradient=False) w = paddle.linalg.eigvalsh(x) (w.sum()).backward() np.testing.assert_allclose( -- GitLab From 4e00d2bb338082dc9e3f1ee44b5887c930c8bb60 Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Wed, 2 Mar 2022 12:41:15 +0800 Subject: [PATCH 041/272] add_new_comm_primitive (#40040) --- .../distributed/collective/ProcessGroup.h | 20 ++- .../collective/ProcessGroupNCCL.cc | 156 ++++++++++++++++++ .../distributed/collective/ProcessGroupNCCL.h | 17 ++ paddle/fluid/distributed/collective/Types.h | 4 + paddle/fluid/pybind/distributed_py.cc | 33 ++++ .../tests/unittests/process_group_nccl.py | 30 ++++ 6 files changed, 259 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index dde8622d900..e4f27205202 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -96,7 +96,25 @@ class ProcessGroup { std::vector& /* tensors */, const BroadcastOptions& = BroadcastOptions()) { PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support allreduce", GetBackendName())); + "ProcessGroup%s does not support broadcast", GetBackendName())); + } + + virtual std::shared_ptr Barrier( + const BarrierOptions& = BarrierOptions()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support barrier", GetBackendName())); + } + + virtual std::shared_ptr Send( + std::vector& tensors /* tensors */, int dst_rank) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support send", GetBackendName())); + } + + virtual std::shared_ptr Recv( + std::vector& tensors /* tensors */, int src_rank) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support receive", GetBackendName())); } protected: diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index fe2325423b4..5d96e730aa4 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -14,6 +14,9 @@ #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include 
"paddle/fluid/platform/place.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/common/place.h" DECLARE_bool(nccl_blocking_wait); DECLARE_bool(use_stream_safe_cuda_allocator); @@ -139,6 +142,14 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); } } + + if (!barrierTensors_.empty()) { + // If we use the work to do barrier, we should block cpu + for (auto& place : places_) { + platform::CUDADeviceGuard gpuGuard(place); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); + } + } return true; } @@ -193,6 +204,10 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( nccl_ids.resize(1); auto& nccl_id = nccl_ids.front(); + for (auto& place : places) { + used_place_ids_.insert(place.GetDeviceId()); + } + if (rank_ == 0) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id)); } @@ -274,6 +289,54 @@ std::shared_ptr ProcessGroupNCCL::Collective( return task; } +template +std::shared_ptr ProcessGroupNCCL::PointToPoint( + std::vector& tensors, Fn fn, int dst_rank, CommType op_type) { + const auto places = GetPlaceList(tensors); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) { + CreateNCCLManagerCache(key, places); + } + } + + auto& nccl_comms = places_to_ncclcomm_[key]; + + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, tensors); + + // construct uninitialize guard for device + platform::CUDADeviceGuard cuda_guard; + + if (FLAGS_use_stream_safe_cuda_allocator) { + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + auto dense_tensor = + std::dynamic_pointer_cast(tensors[i].impl()); + memory::RecordStream(dense_tensor->Holder(), + places_to_ctx_[key][i]->stream()); + } + } + + { + platform::NCCLGroupGuard nccl_guard; + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + const auto& nccl_stream = places_to_ctx_[key][i]->stream(); + fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank); + } + } + + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + std::shared_ptr ProcessGroupNCCL::AllReduce( std::vector& tensors, const AllreduceOptions& opts) { PADDLE_ENFORCE_EQ( @@ -317,5 +380,98 @@ std::shared_ptr ProcessGroupNCCL::Broadcast( CommType::BROADCAST); } +std::shared_ptr ProcessGroupNCCL::Barrier( + const BarrierOptions& opts) { + std::vector places; + + if (!opts.place_ids.empty()) { + for (auto place_id : opts.place_ids) { + places.emplace_back(place_id); + } + } else if (!used_place_ids_.empty()) { + for (auto place_id : used_place_ids_) { + places.emplace_back(place_id); + } + } else { + auto numGPUs = GetSize(); + int place_id = static_cast(rank_ % numGPUs); + places.emplace_back(place_id); + } + + std::vector barrierTensors; + barrierTensors.reserve(places.size()); + + platform::CUDADeviceGuard gpuGuard; + for (auto& place : places) { + gpuGuard.SetDeviceIndex(place.GetDeviceId()); + auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::Backend::GPU); + barrierTensors.push_back(dt); + } + auto task = ProcessGroupNCCL::AllReduce(barrierTensors); + auto nccl_task = dynamic_cast(task.get()); + nccl_task->barrierTensors_ = std::move(barrierTensors); + return task; +} + +void CheckTensorsInDifferentDevices(const 
std::vector& tensors, + const size_t num_devices) { + PADDLE_ENFORCE_EQ( + tensors.size() == 0, false, + platform::errors::InvalidArgument("Tensor list must be nonempty.")); + PADDLE_ENFORCE_LE( + tensors.size(), num_devices, + platform::errors::InvalidArgument( + "Tensor list mustn't be larger than the number of available GPUs.")); + + std::set used_devices; + + for (const auto& t : tensors) { + PADDLE_ENFORCE_EQ(t.is_cuda() && t.is_dense_tensor(), true, + platform::errors::InvalidArgument( + "Tensors must be CUDA and dense tensor.")); + + const auto inserted = used_devices.insert(t.inner_place()).second; + PADDLE_ENFORCE_EQ(inserted, true, + platform::errors::InvalidArgument( + "Tensors must be on distinct GPU devices.")); + } +} + +std::shared_ptr ProcessGroupNCCL::Send( + std::vector& tensors, int dst_rank) { + CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); + + auto task = PointToPoint( + tensors, + [&](Tensor& input, ncclComm_t comm, const gpuStream_t& stream, + int dst_rank) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + return platform::dynload::ncclSend( + input_tensor->data(), input_tensor->numel(), + platform::ToNCCLDataType(input.type()), dst_rank, comm, stream); + }, + dst_rank, CommType::SEND); + return task; +} + +std::shared_ptr ProcessGroupNCCL::Recv( + std::vector& tensors, int src_rank) { + CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); + + auto task = PointToPoint( + tensors, + [&](Tensor& output, ncclComm_t comm, const gpuStream_t& stream, + int src_rank) { + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::ncclRecv( + output_tensor->data(), output_tensor->numel(), + platform::ToNCCLDataType(output.type()), src_rank, comm, stream); + }, + src_rank, CommType::RECV); + return task; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 9f06566d1c8..cfeb6467f0d 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -65,6 +65,7 @@ class ProcessGroupNCCL : public ProcessGroup { virtual ~NCCLTask(); std::vector control_events_; + std::vector barrierTensors_; protected: std::vector places_; @@ -88,6 +89,15 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector& tensors, const BroadcastOptions& = BroadcastOptions()) override; + std::shared_ptr Barrier( + const BarrierOptions& = BarrierOptions()) override; + + std::shared_ptr Send(std::vector& tensors, + int dst_rank) override; + + std::shared_ptr Recv(std::vector& tensors, + int src_rank) override; + protected: virtual std::shared_ptr CreateTask( std::vector places, int rank, CommType opType, @@ -106,6 +116,8 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector>> places_to_ctx_; + std::set used_place_ids_; + private: void BcastNCCLId(std::vector& nccl_ids, int root, // NOLINT int server_fd); @@ -118,6 +130,11 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector& outputs, // NOLINT Fn fn, CommType op_type); + template + std::shared_ptr PointToPoint( + std::vector& tensors, // NOLINT + Fn fn, int dst_rank, CommType op_type); + void CreateNCCLManagerCache(const std::string& places_key, const std::vector& places); }; diff --git a/paddle/fluid/distributed/collective/Types.h b/paddle/fluid/distributed/collective/Types.h index 654d0668695..699222ac452 100644 --- a/paddle/fluid/distributed/collective/Types.h 
+++ b/paddle/fluid/distributed/collective/Types.h @@ -32,5 +32,9 @@ struct BroadcastOptions { int source_root = 0; }; +struct BarrierOptions { + std::vector place_ids; +}; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 7b59188a9f3..a4a1d07db2c 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -60,6 +60,10 @@ void BindDistributed(py::module *m) { .def_readwrite("source_root", &distributed::BroadcastOptions::source_root); + py::class_(*m, "BarrierOptions") + .def(py::init<>()) + .def_readwrite("place_ids", &distributed::BarrierOptions::place_ids); + auto ProcessGroup = py::class_>(*m, "ProcessGroup") @@ -88,6 +92,35 @@ void BindDistributed(py::module *m) { return self.Broadcast(tensors, opts); }, py::arg("tensor"), py::arg("source_rank"), + py::call_guard()) + + .def("barrier", + [](distributed::ProcessGroup &self, std::vector place_ids) { + distributed::BarrierOptions opts; + opts.place_ids = place_ids; + return self.Barrier(opts); + }, + py::arg("place_ids") = std::vector{}, + py::call_guard()) + + .def("send", + [](distributed::ProcessGroup &self, py::handle py_tensor, + int dst) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + std::vector tensors = {tensor}; + return self.Send(tensors, dst); + }, + py::arg("tensor"), py::arg("dst"), + py::call_guard()) + + .def("recv", + [](distributed::ProcessGroup &self, py::handle py_tensor, + int src) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + std::vector tensors = {tensor}; + return self.Recv(tensors, src); + }, + py::arg("tensor"), py::arg("src"), py::call_guard()); #if defined(PADDLE_WITH_NCCL) diff --git a/python/paddle/fluid/tests/unittests/process_group_nccl.py b/python/paddle/fluid/tests/unittests/process_group_nccl.py index d999aad63ec..8ec5d13c569 100644 --- a/python/paddle/fluid/tests/unittests/process_group_nccl.py +++ b/python/paddle/fluid/tests/unittests/process_group_nccl.py @@ -132,6 +132,36 @@ class TestProcessGroupFp32(unittest.TestCase): print("test broadcast api ok") + # test barrier + # rank 0 + if pg.rank() == 0: + task = pg.barrier() + task.wait() + # rank 1 + else: + task = pg.barrier() + task.wait() + + print("test barrier api ok\n") + + # test send/recv + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + if pg.rank() == 0: + task = pg.send(tensor_x, dst=1) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + task = pg.recv(tensor_y, src=0) + task.wait() + paddle.device.cuda.synchronize() + assert np.array_equal(tensor_x, tensor_y) + print("test send/recv api ok\n") + class TestProcessGroupFp16(TestProcessGroupFp32): def setUp(self): -- GitLab From 4cab812e04c4af2a67752e1da3de1d8acf7dba5c Mon Sep 17 00:00:00 2001 From: fwenguang <95677191+fwenguang@users.noreply.github.com> Date: Wed, 2 Mar 2022 13:12:57 +0800 Subject: [PATCH 042/272] [MLU] add transpose2 mlu kernel (#39994) --- paddle/fluid/operators/mlu/mlu_baseop.h | 13 +- .../operators/reduce_ops/reduce_max_op_mlu.cc | 4 +- .../operators/reduce_ops/reduce_min_op_mlu.cc | 4 +- .../softmax_with_cross_entropy_op_mlu.cc | 6 +- paddle/fluid/operators/transpose_op_mlu.cc | 74 ++++ .../unittests/mlu/test_transpose_op_mlu.py | 393 ++++++++++++++++++ 6 files changed, 482 insertions(+), 12 deletions(-) create mode 100644 paddle/fluid/operators/transpose_op_mlu.cc 
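# The ProcessGroupNCCL changes above build barrier() on top of the existing
# allreduce: a one-element dummy tensor is reduced on every device the group
# has used, and Wait() additionally calls cudaDeviceSynchronize so the host
# blocks until all ranks have passed the collective. A rough sketch of the
# resulting Python-side pattern (mirroring the unit test above):
#   task = pg.barrier();             task.wait()   # all ranks sync here
#   task = pg.send(tensor_x, dst=1); task.wait()   # rank 0 -> rank 1
#   task = pg.recv(tensor_y, src=0); task.wait()   # rank 1 <- rank 0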
create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 2cbecba9fa0..2a54a8392c7 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -1157,19 +1157,22 @@ inline void TransposeFromMLUTensor(const ExecutionContext& ctx, const Tensor* transformed_input, Tensor* transformed_output, bool need_reshape_or_alloc) { - auto in_dims_vec = phi::vectorize(transformed_input->dims()); + const int dim_size = perm.size(); if (need_reshape_or_alloc) { + std::vector output_shape; + auto input_dims = transformed_input->dims(); + for (int i = 0; i < dim_size; ++i) { + output_shape.push_back(input_dims[perm[i]]); + } transformed_output->mutable_data( - {in_dims_vec[perm[0]], in_dims_vec[perm[1]], in_dims_vec[perm[2]], - in_dims_vec[perm[3]]}, - ctx.GetPlace()); + framework::DDim(output_shape.data(), dim_size), ctx.GetPlace()); } MLUCnnlTensorDesc trans_in_desc(*transformed_input, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); MLUCnnlTensorDesc trans_out_desc(*transformed_output, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); - MLUCnnl::Transpose(ctx, perm, in_dims_vec.size(), trans_in_desc.get(), + MLUCnnl::Transpose(ctx, perm, dim_size, trans_in_desc.get(), GetBasePtr(transformed_input), trans_out_desc.get(), GetBasePtr(transformed_output)); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc index 7e02f0268b5..1abec24c0d3 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc @@ -27,11 +27,11 @@ class ReduceMaxMLUKernel : public framework::OpKernel { int out_dtype = context.Attr("out_dtype"); bool reduce_all = context.Attr("reduce_all"); auto dims = context.Attr>("dim"); - auto input_dims = framework::vectorize(input->dims()); + auto input_dims = input->dims(); const auto& input_dim_size = input->dims().size(); std::vector reduce_dims; if (reduce_all) { - for (size_t i = 0; i < input_dims.size(); i++) { + for (int i = 0; i < input_dims.size(); i++) { reduce_dims.push_back(static_cast(i)); } } else { diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc index daf5965fd54..d80cce74221 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc @@ -27,11 +27,11 @@ class ReduceMinMLUKernel : public framework::OpKernel { int out_dtype = context.Attr("out_dtype"); bool reduce_all = context.Attr("reduce_all"); auto dims = context.Attr>("dim"); - auto input_dims = framework::vectorize(input->dims()); + auto input_dims = input->dims(); const auto& input_dim_size = input->dims().size(); std::vector reduce_dims; if (reduce_all) { - for (size_t i = 0; i < input_dims.size(); i++) { + for (int i = 0; i < input_dims.size(); i++) { reduce_dims.push_back(static_cast(i)); } } else { diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc index 1cd6f8b7698..34650c2e062 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc @@ -37,7 +37,7 @@ class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel { "the mlu kernel of softmax_with_cross_entropy.")); const int rank = logits->dims().size(); - const int axis = 
CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); loss->mutable_data(ctx.GetPlace()); backprop->mutable_data(ctx.GetPlace()); @@ -45,10 +45,10 @@ class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel { // cnnl softmax only support 3-dims, regard all shape as [d1, d2, d3] const int cnnl_softmax_dims = 3; - const int d1 = SizeToAxis(axis, logits->dims()); + const int d1 = phi::funcs::SizeToAxis(axis, logits->dims()); const int d2_logits = logits->dims()[axis]; const int d2_labels = labels->dims()[axis]; - const int d3 = SizeOutAxis(axis, logits->dims()); + const int d3 = phi::funcs::SizeOutAxis(axis, logits->dims()); // CNNL_SOFTMAX_MODE_LOW_DIMENSION has better perfermence, use it as much as // possible. diff --git a/paddle/fluid/operators/transpose_op_mlu.cc b/paddle/fluid/operators/transpose_op_mlu.cc new file mode 100644 index 00000000000..40cb22bab50 --- /dev/null +++ b/paddle/fluid/operators/transpose_op_mlu.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/transpose_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class TransposeMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + std::vector axis = ctx.Attr>("axis"); + out->mutable_data(ctx.device_context().GetPlace()); + + TransposeFromMLUTensor(ctx, axis, x, out, + false /*need_reshape_or_alloc*/); + } +}; + +template +class TransposeGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* x_grad = + ctx.Output(framework::GradVarName("X")); + std::vector axis = ctx.Attr>("axis"); + std::vector reversed_axis(axis); + for (size_t i = 0; i < axis.size(); i++) { + reversed_axis[axis[i]] = i; + } + x_grad->mutable_data(ctx.GetPlace()); + + TransposeFromMLUTensor(ctx, reversed_axis, out_grad, x_grad, + false /*need_reshape_or_alloc*/); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(transpose2, ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel, + ops::TransposeMLUKernel); + +REGISTER_OP_MLU_KERNEL(transpose2_grad, ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel, + ops::TransposeGradMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py new file mode 100644 index 00000000000..6f1bda477f0 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/mlu/test_transpose_op_mlu.py @@ -0,0 +1,393 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append('..') +from op_test import OpTest, convert_float_to_uint16 +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import paddle.fluid.core as core + +paddle.enable_static() + + +class TestTransposeOp(OpTest): + def setUp(self): + self.init_op_type() + self.initKernelType() + self.initTestCase() + self.inputs = {'X': np.random.random(self.shape).astype("float32")} + self.attrs = {'axis': list(self.axis), } + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} + + def init_op_type(self): + self.op_type = "transpose2" + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def initTestCase(self): + self.shape = (3, 40) + self.axis = (1, 0) + + def initKernelType(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + +class TestCase0(TestTransposeOp): + def initTestCase(self): + self.shape = (100, ) + self.axis = (0, ) + + +class TestCase1(TestTransposeOp): + def initTestCase(self): + self.shape = (3, 4, 10) + self.axis = (0, 2, 1) + + +class TestCase2(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5) + self.axis = (0, 2, 3, 1) + + +class TestCase3(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.axis = (4, 2, 3, 1, 0) + + +class TestCase4(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6, 1) + self.axis = (4, 2, 3, 1, 0, 5) + + +class TestCase5(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 16, 96) + self.axis = (0, 2, 1) + + +class TestCase6(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 10, 12, 16) + self.axis = (3, 1, 2, 0) + + +class TestCase7(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 10, 2, 16) + self.axis = (0, 1, 3, 2) + + +class TestCase8(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (0, 1, 3, 2, 4, 5, 6, 7) + + +class TestCase9(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (6, 1, 3, 5, 0, 2, 4, 7) + + +class TestTransposeOpBool(TestTransposeOp): + def test_check_grad(self): + pass + + +class TestTransposeOpBool1D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (100, ) + self.axis = (0, ) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} + + +class TestTransposeOpBool2D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (3, 40) + self.axis = (1, 0) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = {'Out': 
self.inputs['X'].transpose(self.axis)} + + +class TestTransposeOpBool3D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (3, 4, 10) + self.axis = (0, 2, 1) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} + + +class TestTransposeOpBool4D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (2, 3, 4, 5) + self.axis = (0, 2, 3, 1) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} + + +class TestTransposeOpBool5D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6) + self.axis = (4, 2, 3, 1, 0) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} + + +class TestTransposeOpBool6D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (2, 3, 4, 5, 6, 1) + self.axis = (4, 2, 3, 1, 0, 5) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} + + +class TestTransposeOpBool7D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (2, 3, 2, 3, 2, 4, 3) + self.axis = (0, 1, 3, 2, 4, 5, 6) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} + + +class TestTransposeOpBool8D(TestTransposeOpBool): + def initTestCase(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (6, 1, 3, 5, 0, 2, 4, 7) + self.inputs = {'X': np.random.random(self.shape).astype("bool")} + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} + + +class TestTransposeOpError(unittest.TestCase): + def test_errors(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = fluid.layers.data(name='x', shape=[10, 5, 3], dtype='float32') + + def test_x_Variable_check(): + # the Input(x)'s type must be Variable + fluid.layers.transpose("not_variable", perm=[1, 0, 2]) + + self.assertRaises(TypeError, test_x_Variable_check) + + def test_perm_list_check(): + # Input(perm)'s type must be list + fluid.layers.transpose(x, perm="[1, 0, 2]") + + self.assertRaises(TypeError, test_perm_list_check) + + def test_perm_length_and_x_dim_check(): + # Input(perm) is the permutation of dimensions of Input(input) + # its length should be equal to dimensions of Input(input) + fluid.layers.transpose(x, perm=[1, 0, 2, 3, 4]) + + self.assertRaises(ValueError, test_perm_length_and_x_dim_check) + + def test_each_elem_value_check(): + # Each element in Input(perm) should be less than Input(x)'s dimension + fluid.layers.transpose(x, perm=[3, 5, 7]) + + self.assertRaises(ValueError, test_each_elem_value_check) + + +class TestTransposeApi(unittest.TestCase): + def test_static_out(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name='x', shape=[2, 3, 4], dtype='float32') + x_trans1 = paddle.transpose(x, perm=[1, 0, 2]) + x_trans2 = paddle.transpose(x, perm=(2, 1, 0)) + place = paddle.MLUPlace(0) + exe = paddle.static.Executor(place) + x_np = np.random.random([2, 3, 4]).astype("float32") + result1, result2 = exe.run(feed={"x": x_np}, + fetch_list=[x_trans1, x_trans2]) + expected_result1 = np.transpose(x_np, [1, 0, 2]) + expected_result2 = np.transpose(x_np, (2, 1, 0)) + + np.testing.assert_array_equal(result1, expected_result1) + np.testing.assert_array_equal(result2, expected_result2) + + def 
test_dygraph_out(self): + # This is an old test before 2.0 API so we need to disable static + # to trigger dygraph + paddle.disable_static() + x = paddle.randn([2, 3, 4]) + x_trans1 = paddle.transpose(x, perm=[1, 0, 2]) + x_trans2 = paddle.transpose(x, perm=(2, 1, 0)) + x_np = x.numpy() + expected_result1 = np.transpose(x_np, [1, 0, 2]) + expected_result2 = np.transpose(x_np, (2, 1, 0)) + + np.testing.assert_array_equal(x_trans1.numpy(), expected_result1) + np.testing.assert_array_equal(x_trans2.numpy(), expected_result2) + # This is an old test before 2.0 API so we enable static again after + # dygraph test + paddle.enable_static() + + +class TestTAPI(unittest.TestCase): + def test_out(self): + with fluid.program_guard(fluid.Program()): + data = fluid.data(shape=[10], dtype="float32", name="data") + data_t = paddle.t(data) + place = fluid.MLUPlace(0) + exe = fluid.Executor(place) + data_np = np.random.random([10]).astype("float32") + result, = exe.run(feed={"data": data_np}, fetch_list=[data_t]) + expected_result = np.transpose(data_np) + self.assertEqual((result == expected_result).all(), True) + + with fluid.program_guard(fluid.Program()): + data = fluid.data(shape=[10, 5], dtype="float32", name="data") + data_t = paddle.t(data) + place = fluid.MLUPlace(0) + exe = fluid.Executor(place) + data_np = np.random.random([10, 5]).astype("float32") + result, = exe.run(feed={"data": data_np}, fetch_list=[data_t]) + expected_result = np.transpose(data_np) + self.assertEqual((result == expected_result).all(), True) + + with fluid.program_guard(fluid.Program()): + data = fluid.data(shape=[1, 5], dtype="float32", name="data") + data_t = paddle.t(data) + place = fluid.MLUPlace(0) + exe = fluid.Executor(place) + data_np = np.random.random([1, 5]).astype("float32") + result, = exe.run(feed={"data": data_np}, fetch_list=[data_t]) + expected_result = np.transpose(data_np) + self.assertEqual((result == expected_result).all(), True) + + with fluid.dygraph.guard(): + np_x = np.random.random([10]).astype("float32") + data = fluid.dygraph.to_variable(np_x) + z = paddle.t(data) + np_z = z.numpy() + z_expected = np.array(np.transpose(np_x)) + self.assertEqual((np_z == z_expected).all(), True) + + with fluid.dygraph.guard(): + np_x = np.random.random([10, 5]).astype("float32") + data = fluid.dygraph.to_variable(np_x) + z = paddle.t(data) + np_z = z.numpy() + z_expected = np.array(np.transpose(np_x)) + self.assertEqual((np_z == z_expected).all(), True) + + with fluid.dygraph.guard(): + np_x = np.random.random([1, 5]).astype("float32") + data = fluid.dygraph.to_variable(np_x) + z = paddle.t(data) + np_z = z.numpy() + z_expected = np.array(np.transpose(np_x)) + self.assertEqual((np_z == z_expected).all(), True) + + def test_errors(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data(name='x', shape=[10, 5, 3], dtype='float32') + + def test_x_dimension_check(): + paddle.t(x) + + self.assertRaises(ValueError, test_x_dimension_check) + + +class TestMoveAxis(unittest.TestCase): + def test_moveaxis1(self): + x_np = np.random.randn(2, 3, 4, 5, 7).astype('float32') + expected = np.moveaxis(x_np, [0, 4, 3, 2], [1, 3, 2, 0]) + paddle.enable_static() + with paddle.static.program_guard(fluid.Program()): + x = paddle.static.data("x", shape=[2, 3, 4, 5, 7], dtype='float32') + out = paddle.moveaxis(x, [0, 4, 3, 2], [1, 3, 2, 0]) + + exe = paddle.static.Executor() + out_np = exe.run(feed={"x": x_np}, fetch_list=[out])[0] + + self.assertEqual(np.array_equal(out_np, expected), True) + + paddle.disable_static() + x = 
paddle.to_tensor(x_np) + out = paddle.moveaxis(x, [0, 4, 3, 2], [1, 3, 2, 0]) + self.assertEqual(out.shape, [4, 2, 5, 7, 3]) + self.assertEqual(np.array_equal(out.numpy(), expected), True) + paddle.enable_static() + + def test_moveaxis2(self): + x_np = np.random.randn(2, 3, 5).astype('float32') + expected = np.moveaxis(x_np, -2, -1) + paddle.enable_static() + with paddle.static.program_guard(fluid.Program()): + x = paddle.static.data("x", shape=[2, 3, 5], dtype='float32') + out = x.moveaxis(-2, -1) + + exe = paddle.static.Executor() + out_np = exe.run(feed={"x": x_np}, fetch_list=[out])[0] + + self.assertEqual(np.array_equal(out_np, expected), True) + + paddle.disable_static() + x = paddle.to_tensor(x_np) + out = x.moveaxis(-2, -1) + self.assertEqual(out.shape, [2, 5, 3]) + self.assertEqual(np.array_equal(out.numpy(), expected), True) + paddle.enable_static() + + def test_error(self): + x = paddle.randn([2, 3, 4, 5]) + # src must have the same number with dst + with self.assertRaises(AssertionError): + paddle.moveaxis(x, [1, 0], [2]) + + # each element of src must be unique + with self.assertRaises(ValueError): + paddle.moveaxis(x, [1, 1], [0, 2]) + + # each element of dst must be unique + with self.assertRaises(ValueError): + paddle.moveaxis(x, [0, 1], [2, 2]) + + # each element of src must be integer + with self.assertRaises(AssertionError): + paddle.moveaxis(x, [0.5], [1]) + + # each element of dst must be integer + with self.assertRaises(AssertionError): + paddle.moveaxis(x, [0], [1.5]) + + # each element of src must be in the range of [-4, 3) + with self.assertRaises(AssertionError): + paddle.moveaxis(x, [-10, 1], [2, 3]) + + # each element of dst must be in the range of [-4, 3) + with self.assertRaises(AssertionError): + paddle.moveaxis(x, [2, 1], [10, 3]) + + +if __name__ == '__main__': + unittest.main() -- GitLab From 36660d4c356d4c6b71eb8df51e094ea36bfa2c06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Wed, 2 Mar 2022 14:02:42 +0800 Subject: [PATCH 043/272] [infrt] speed up the infrt ci. 
test=devvelop (#40032) --- paddle/scripts/infrt_build.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index 8d858647ea6..a0132501387 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -102,9 +102,11 @@ function infrt_gen_and_build() { function create_fake_models() { cd ${PADDLE_ROOT}/build + cd python/dist/ # create multi_fc model, this will generate "multi_fc_model" python3 -m pip uninstall -y paddlepaddle - python3 -m pip install paddlepaddle + python3 -m pip install *whl + cd ${PADDLE_ROOT}/build python3 ${PADDLE_ROOT}/tools/infrt/fake_models/multi_fc.py } -- GitLab From 9070d5c5d85e15a04324b6a5f2f1e2c9a7ecc1b6 Mon Sep 17 00:00:00 2001 From: zhangchunle Date: Wed, 2 Mar 2022 14:08:19 +0800 Subject: [PATCH 044/272] test=document_fix;record py3 case time (#40018) --- paddle/scripts/paddle_build.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 9bef7e12851..ed70a8638bf 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -776,7 +776,9 @@ set +x tmpfile=$tmp_dir/$tmpfile_rand ctest -R "$UT_list_prec_1" -E "$disable_ut_quickly" -LE ${nightly_label} --output-on-failure -j $2 | tee $tmpfile fi - + ut_total_endTime_s=`date +%s` + echo "TestCases Total Time: $[ $ut_total_endTime_s - $ut_actual_total_startTime_s ]s" + collect_failed_tests rm -f $tmp_dir/* exec_times=0 -- GitLab From b4d931e8bce97a12e9ac7a12ff6c0a11499002c7 Mon Sep 17 00:00:00 2001 From: qipengh Date: Wed, 2 Mar 2022 14:23:35 +0800 Subject: [PATCH 045/272] [MLU] adapt matmul op (#39727) * [MLU] adapt matmul op * [MLU] fix phi namespace --- paddle/fluid/imperative/CMakeLists.txt | 6 +- paddle/fluid/operators/matmul_op_mlu.cc | 337 ++++++++++++++++++ .../tests/unittests/mlu/test_matmul_op_mlu.py | 329 +++++++++++++++++ 3 files changed, 671 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/matmul_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index f198919b0c8..e1ce705533a 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -46,8 +46,12 @@ if(WITH_GLOO) endif() endif() +if(WITH_MLU) + SET(MLU_DEPS mlu_baseop) +endif() + if(NOT WITH_ASCEND_CL) -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function phi_tensor) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function phi_tensor ${MLU_DEPS}) else() cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner phi_tensor) endif() diff --git a/paddle/fluid/operators/matmul_op_mlu.cc b/paddle/fluid/operators/matmul_op_mlu.cc new file mode 100644 index 00000000000..d0c84c4751e --- /dev/null +++ b/paddle/fluid/operators/matmul_op_mlu.cc @@ -0,0 +1,337 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +static void Mul(const framework::ExecutionContext& ctx, const Tensor& X, + const Tensor& Y, Tensor* Out, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), x_desc.get(), GetBasePtr(&X), + y_desc.get(), GetBasePtr(&Y), out_desc.get(), + GetBasePtr(Out), ToCnnlDataType(), alpha); +} + +template +static void MatMul2D(const framework::ExecutionContext& ctx, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_LT(fabs(alpha - 1.0), std::numeric_limits::epsilon(), + platform::errors::InvalidArgument( + "MLU(matmul): alpha should be equal to 1.0! " + "Other values are not supported yet." + "But received alpha is %d.", + alpha)); + + MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnl::Matmul(ctx, trans_x, trans_y, x_desc.get(), GetBasePtr(&X), + y_desc.get(), GetBasePtr(&Y), out_desc.get(), + GetBasePtr(Out)); +} + +template +static void MatMulND(const framework::ExecutionContext& ctx, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y, const float alpha) { + if (!Out->initialized()) { + Out->mutable_data(ctx.GetPlace()); + } + + PADDLE_ENFORCE_LT(fabs(alpha - 1.0), std::numeric_limits::epsilon(), + platform::errors::InvalidArgument( + "MLU(matmul): alpha should be equal to 1.0! " + "Other values are not supported yet." 
+ "But received alpha is %d.", + alpha)); + + MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnl::BatchMatmul(ctx, trans_x, trans_y, x_desc.get(), GetBasePtr(&X), + y_desc.get(), GetBasePtr(&Y), out_desc.get(), + GetBasePtr(Out)); +} + +template +static void ReduceDims(const framework::ExecutionContext& ctx, + const std::vector& dims, + const std::vector& bcast_dims, const Tensor& in, + Tensor* out) { + std::vector axes; + int64_t size = bcast_dims.size(); + int64_t diff = bcast_dims.size() - dims.size(); + for (int64_t i = 0; i < size; ++i) { + if (i < diff) { + axes.push_back(i); + continue; + } + if (bcast_dims[i] > dims[i - diff]) { + axes.push_back(i); + } + } + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc in_desc(in, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + std::vector reduce_dims(axes.begin(), axes.end()); + MLUCnnlReduceDesc reduce_desc(reduce_dims, CNNL_REDUCE_ADD, + ToCnnlDataType(), CNNL_NOT_PROPAGATE_NAN, + CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduce_desc.get(), nullptr, + in_desc.get(), GetBasePtr(&in), 0 /*indices_size*/, nullptr, + nullptr, out_desc.get(), GetBasePtr(out)); +} + +template +class MatMulMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); + bool transpose_x = ctx.Attr("transpose_X"); + bool transpose_y = ctx.Attr("transpose_Y"); + float alpha = static_cast(ctx.Attr("alpha")); + + std::vector x_dims = phi::vectorize(X->dims()); + std::vector y_dims = phi::vectorize(Y->dims()); + std::vector out_dims = phi::vectorize(Out->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + + // Case 1: [K] x [K] = [1] + // Equal: [1, K] x [K, 1] = [1, 1] => [1] + const bool all_one_dim = (x_ndim == 1 && y_ndim == 1); + if (all_one_dim) { + Out->Resize({1, 1}); + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + x_temp.Resize(phi::make_ddim(x_dims)); + x_ndim = 2; + // matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim` + if (out_dims.size() < y_dims.size()) { + std::vector temp_out_dims(out_dims.begin(), out_dims.end()); + temp_out_dims.insert(temp_out_dims.end() - 1, 1); + Out->Resize(phi::make_ddim(temp_out_dims)); + } + } + if (y_ndim == 1) { + y_dims.push_back(1); + y_temp.Resize(phi::make_ddim(y_dims)); + y_ndim = 2; + // matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim` + if (out_dims.size() < x_dims.size()) { + std::vector temp_out_dims(out_dims.begin(), out_dims.end()); + temp_out_dims.push_back(1); + Out->Resize(phi::make_ddim(temp_out_dims)); + } + } + + const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + if (transpose_y) { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 1, K, y_ndim - 1, y_dims[y_ndim - 1])); + } else { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." 
+ "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 2, K, y_ndim - 2, y_dims[y_ndim - 2])); + } + + if (x_ndim == 2 && y_ndim == 2) { + // Case 2: [M, K] x [K, N] = [M, N] + MatMul2D(ctx, x_temp, y_temp, Out, transpose_x, transpose_y, alpha); + } else { + // Case 3: [B, M, K] x [K, N] = [B, M, N] + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + MatMulND(ctx, x_temp, y_temp, Out, transpose_x, transpose_y, alpha); + } + + if (phi::vectorize(Out->dims()) != out_dims) { + Out->Resize(phi::make_ddim(out_dims)); + } + } +}; + +template +class MatMulGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); + bool transpose_x = ctx.Attr("transpose_X"); + bool transpose_y = ctx.Attr("transpose_Y"); + float alpha = static_cast(ctx.Attr("alpha")); + + std::vector x_dims = phi::vectorize(X->dims()); + std::vector y_dims = phi::vectorize(Y->dims()); + std::vector out_dims = phi::vectorize(dOut->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); + + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + if (dX) { + Mul(ctx, *dOut, *Y, dX, alpha); + } + if (dY) { + Mul(ctx, *dOut, *X, dY, alpha); + } + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp, dout_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + dout_temp.ShareDataWith(*dOut); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(phi::make_ddim(x_dims)); + dout_temp.Resize(phi::make_ddim(out_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(phi::make_ddim(y_dims)); + dout_temp.Resize(phi::make_ddim(out_dims)); + y_ndim = 2; + out_ndim += 1; + } + + // Case 2: [M, K] x [K, N] = [M, N] + if (out_ndim == 2) { + if (dX) { + dX->Resize(phi::make_ddim(x_dims)); + if (transpose_x) { + MatMul2D(ctx, y_temp, dout_temp, dX, transpose_y, true, alpha); + } else { + MatMul2D(ctx, dout_temp, y_temp, dX, false, !transpose_y, alpha); + } + dX->Resize(X->dims()); + } + if (dY) { + dY->Resize(phi::make_ddim(y_dims)); + if (transpose_y) { + MatMul2D(ctx, dout_temp, x_temp, dY, true, transpose_x, alpha); + } else { + MatMul2D(ctx, x_temp, dout_temp, dY, !transpose_x, false, alpha); + } + dY->Resize(Y->dims()); + } + return; + } + + // Case 3: [B, M, K] x [K, N] = [B, M, N] + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_bcast_dims(out_ndim, 1); + std::vector y_bcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_bcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_bcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_bcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_bcast_dims.end() - 2); + + if (dX) { + Tensor dx_temp(X->type()); + if (x_dims != x_bcast_dims) { + dx_temp.Resize(phi::make_ddim(x_bcast_dims)); + } else { + dX->mutable_data(ctx.GetPlace()); + dx_temp.ShareDataWith(*dX); + } + + if (transpose_x) { + MatMulND(ctx, y_temp, dout_temp, &dx_temp, transpose_y, true, alpha); + } else { + MatMulND(ctx, dout_temp, y_temp, &dx_temp, false, !transpose_y, + alpha); + } + + if (x_dims != x_bcast_dims) { + ReduceDims(ctx, x_dims, 
x_bcast_dims, dx_temp, dX); + } + } + + if (dY) { + Tensor dy_temp(Y->type()); + if (y_dims != y_bcast_dims) { + dy_temp.Resize(phi::make_ddim(y_bcast_dims)); + } else { + dY->mutable_data(ctx.GetPlace()); + dy_temp.ShareDataWith(*dY); + } + + if (transpose_y) { + MatMulND(ctx, dout_temp, x_temp, &dy_temp, true, transpose_x, alpha); + } else { + MatMulND(ctx, x_temp, dout_temp, &dy_temp, !transpose_x, false, + alpha); + } + + if (y_dims != y_bcast_dims) { + ReduceDims(ctx, y_dims, y_bcast_dims, dy_temp, dY); + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(matmul, ops::MatMulMLUKernel, + ops::MatMulMLUKernel); +REGISTER_OP_MLU_KERNEL(matmul_grad, ops::MatMulGradMLUKernel, + ops::MatMulGradMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py new file mode 100644 index 00000000000..adfff112e6b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py @@ -0,0 +1,329 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2022 + + +def reference_matmul(X, Y, transpose_X=False, transpose_Y=False, scale=1.0): + """Reference forward implementation using np.matmul.""" + # np.matmul does not support the transpose flags, so we manually + # transpose X and Y appropriately. + if transpose_X: + if X.ndim == 1: + X = X.reshape((X.size, )) + elif X.ndim == 2: + X = X.T + else: + dim = [i for i in range(len(X.shape))] + dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] + X = np.transpose(X, tuple(dim)) + if transpose_Y: + if Y.ndim == 1: + Y = Y.reshape((Y.size, )) + else: + dim = [i for i in range(len(Y.shape))] + dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] + Y = np.transpose(Y, tuple(dim)) + + Out = np.matmul(X, Y) + if not Out.shape: + # We do not support 0-dimensional Tensors (scalars). So where + # np.matmul outputs a scalar, we must convert to a Tensor of + # shape (1, ) instead. + # Everywhere else, we are compatible with np.matmul. 
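        # Illustration, assuming plain NumPy semantics:
        #   np.matmul(np.ones(3), np.ones(3))  ->  array(3.)   (0-d result)
        #   np.array([3.0], dtype="float64")   ->  shape (1,)
        # so reference_matmul always returns at least a 1-D array.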
+ Out = np.array([Out], dtype="float64") + if abs(scale - 1.0) > 1e-09: + Out = Out * scale + return Out + + +class TestMatMulOp(OpTest): + """ + basic case + """ + + def setUp(self): + self.set_mlu() + self.op_type = "matmul" + self.init_dtype() + self.init_alpha() + self.config() + + X = np.random.random(self.x_shape).astype(self.dtype) + Y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + X = -0.1 + 0.2 * X + Y = -0.1 + 0.2 * Y + + Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y, + self.alpha) + Out = Out.astype(self.dtype) + self.inputs = {'X': X, 'Y': Y} + self.attrs = { + 'transpose_X': self.transpose_X, + 'transpose_Y': self.transpose_Y, + 'alpha': self.alpha + } + self.outputs = {'Out': Out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def config(self): + self.x_shape = (100, ) + self.y_shape = (100, ) + self.transpose_X = False + self.transpose_Y = False + + def init_alpha(self): + self.alpha = 1.0 + + def init_dtype(self): + self.dtype = "float32" + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-7) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + + +class TestMatMulOp1(TestMatMulOp): + """ + case x_ndim == 1, y_ndim != 1 + """ + + def config(self): + self.x_shape = (100, ) + self.y_shape = (1, 3, 2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp2(TestMatMulOp): + """ + case x_ndim != 1, y_ndim == 1 + """ + + def config(self): + self.x_shape = (1, 2, 100, 1) + self.y_shape = (100, ) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp3(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp4(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp5(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (100, 2) + self.y_shape = (100, 2) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp6(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 2, 25) + self.y_shape = (25, 4) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp7(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 2, 25) + self.y_shape = (4, 25) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp8(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 25, 4) + self.y_shape = (25, 4) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp9(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 5, 10) + self.y_shape = (2, 10, 5) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp10(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 10, 5) + self.y_shape = (2, 10, 5) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp11(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 
5, 10) + self.y_shape = (2, 5, 10) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp12(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = (100) + self.y_shape = (1, 2, 2, 100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp13(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = (2, 1, 100) + self.y_shape = (100) + self.transpose_X = False + self.transpose_Y = False + + +# TODO(mlu): alpha will be supported in next version +#--------------------test matmul alpha-------------------- +# def create_test_alpha_class(parent): +# class TestMatMulOpAlphaCase(parent): +# def init_alpha(self): +# self.alpha = 0.125 + +# cls_name = "{0}_{1}".format(parent.__name__, "Alpha") +# TestMatMulOpAlphaCase.__name__ = cls_name +# globals()[cls_name] = TestMatMulOpAlphaCase + +# create_test_alpha_class(TestMatMulOp) +# create_test_alpha_class(TestMatMulOp1) +# create_test_alpha_class(TestMatMulOp2) +# create_test_alpha_class(TestMatMulOp3) +# create_test_alpha_class(TestMatMulOp4) +# create_test_alpha_class(TestMatMulOp5) +# create_test_alpha_class(TestMatMulOp6) +# create_test_alpha_class(TestMatMulOp9) +# create_test_alpha_class(TestMatMulOp10) +# create_test_alpha_class(TestMatMulOp11) +# create_test_alpha_class(TestMatMulOp12) +# create_test_alpha_class(TestMatMulOp13) + + +#--------------------test matmul fp16-------------------- +def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): + class TestMatMulOpFp16Case(parent): + def init_kernel_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=atol) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X', 'Y'], + 'Out', + max_relative_error=max_relative_error) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestMatMulOpFp16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpFp16Case + + +create_test_fp16_class(TestMatMulOp) +create_test_fp16_class(TestMatMulOp1) +create_test_fp16_class(TestMatMulOp2) +create_test_fp16_class(TestMatMulOp3) +create_test_fp16_class(TestMatMulOp4) +create_test_fp16_class(TestMatMulOp5) +create_test_fp16_class(TestMatMulOp6) +create_test_fp16_class(TestMatMulOp9) +create_test_fp16_class(TestMatMulOp10) +create_test_fp16_class(TestMatMulOp11) +create_test_fp16_class(TestMatMulOp12) +create_test_fp16_class(TestMatMulOp13) + +if __name__ == "__main__": + unittest.main() -- GitLab From 0764fda25bb016bf143fc0a3aa93a3fb56b0cd73 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 2 Mar 2022 15:07:34 +0800 Subject: [PATCH 046/272] [Phi] Unify complex type trait and fix real imag bug (#40036) * unify complex type trait and fix real imag bug * add unittest for type tratis --- paddle/fluid/operators/angle_op.h | 6 +- paddle/fluid/operators/eig_op.h | 26 ++-- paddle/fluid/operators/eigh_op.h | 2 +- paddle/fluid/operators/eigvals_op.h | 14 +- paddle/fluid/operators/imag_op.cc | 2 +- paddle/fluid/operators/lstsq_op.h | 4 +- .../operators/math/eigen_values_vectors.h | 8 +- paddle/fluid/operators/math/inclusive_scan.h | 2 +- paddle/fluid/operators/qr_op.cu | 14 +- paddle/fluid/operators/qr_op.h | 18 +-- paddle/fluid/operators/real_op.cc | 2 +- paddle/fluid/operators/svd_helper.h | 12 +- paddle/fluid/operators/svd_op.h | 12 +- paddle/phi/common/type_traits.h | 96 ++++++++++++++ paddle/phi/infermeta/unary.cc | 7 + paddle/phi/infermeta/unary.h 
| 2 + paddle/phi/kernels/cpu/abs_kernel.cc | 6 +- paddle/phi/kernels/cpu/complex_kernel.cc | 8 +- paddle/phi/kernels/funcs/complex_functors.h | 123 ++++++------------ paddle/phi/kernels/gpu/abs_kernel.cu | 10 +- paddle/phi/kernels/gpu/complex_kernel.cu | 8 +- .../phi/kernels/impl/abs_grad_kernel_impl.h | 2 +- .../kernels/impl/complex_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/complex_kernel_impl.h | 8 +- paddle/phi/tests/common/test_data_type.cc | 16 +++ 25 files changed, 247 insertions(+), 165 deletions(-) create mode 100644 paddle/phi/common/type_traits.h diff --git a/paddle/fluid/operators/angle_op.h b/paddle/fluid/operators/angle_op.h index db5a3ea2961..116a8053db3 100644 --- a/paddle/fluid/operators/angle_op.h +++ b/paddle/fluid/operators/angle_op.h @@ -36,8 +36,8 @@ class AngleKernel : public framework::OpKernel { auto numel = x->numel(); auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - context.GetPlace(), size_t(x->numel() * sizeof(phi::funcs::Real))); + auto* out_data = out->mutable_data>( + context.GetPlace(), size_t(x->numel() * sizeof(phi::dtype::Real))); auto& dev_ctx = context.template device_context(); platform::ForRange for_range(dev_ctx, numel); @@ -57,7 +57,7 @@ class AngleGradKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("X")); auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); + auto* dout_data = d_out->data>(); auto* x_data = x->data(); auto* dx_data = d_x->mutable_data( ctx.GetPlace(), static_cast(numel * sizeof(T))); diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index 03b25c6705a..e9c6c1eb7ec 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -87,19 +87,19 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, int values_stride = values->dims()[values->dims().size() - 1]; Tensor rwork; - phi::funcs::Real* rwork_data = nullptr; + phi::dtype::Real* rwork_data = nullptr; rwork.Resize(phi::make_ddim({lda * 2})); - rwork_data = rwork.mutable_data>(context.GetPlace()); + rwork_data = rwork.mutable_data>(context.GetPlace()); // call lapackEig once to compute the size of work; T computed_work_size; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( jobvl, jobvr, order, input_data, lda, values_data, lvector_data, ldvl, rvector_data, ldvr, &computed_work_size, lwork, rwork_data, &info); lwork = std::max( - 1, static_cast(phi::funcs::Real(computed_work_size))); + 1, static_cast(phi::dtype::Real(computed_work_size))); Tensor work; work.Resize(phi::make_ddim({lwork})); T* work_data = work.mutable_data(context.GetPlace()); @@ -109,7 +109,7 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, T* current_values = &values_data[i * values_stride]; T* current_rvectors = &rvector_data[i * matrix_stride]; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( jobvl, jobvr, order, current_matrix, lda, current_values, lvector_data, ldvl, current_rvectors, ldvr, work_data, lwork, rwork_data, &info); PADDLE_ENFORCE_EQ( @@ -207,23 +207,23 @@ class EigKernel : public framework::OpKernel { origin_dim.push_back(last_item * 2); framework::DDim big_dim = phi::make_ddim(origin_dim); - real_values.mutable_data>(big_dim, + real_values.mutable_data>(big_dim, context.GetPlace()); - real_vectors.mutable_data>(x->dims(), + real_vectors.mutable_data>(x->dims(), context.GetPlace()); - ApplyEigKernel>( + ApplyEigKernel>( *x, &real_values, &real_vectors, context); auto dito = math::DeviceIndependenceTensorOperations< - DeviceContext, 
phi::funcs::Real, Tout>(context); + DeviceContext, phi::dtype::Real, Tout>(context); // 1. extract real part & imag part from real_values Tensor real_part = dito.Slice(real_values, {-1}, {0}, {order}); Tensor imag_part = dito.Slice(real_values, {-1}, {order}, {order * 2}); // 2. construct complex values - auto* real_part_data = real_part.data>(); - auto* imag_part_data = imag_part.data>(); + auto* real_part_data = real_part.data>(); + auto* imag_part_data = imag_part.data>(); int out_values_numel = out_values->numel(); platform::ForRange for_range( context.template device_context(), out_values_numel); @@ -236,7 +236,7 @@ class EigKernel : public framework::OpKernel { Tensor real_vector_trans = dito.Transpose(real_vectors); Tensor out_vectors_trans; out_vectors_trans.mutable_data(x->dims(), context.GetPlace()); - ConstructComplexVectors, Tout>( + ConstructComplexVectors, Tout>( &out_vectors_trans, *out_values, real_vector_trans, context, batch_count, order); TransposeTwoAxis(out_vectors_trans, out_vectors, @@ -272,7 +272,7 @@ void ComputeBackwardForComplexInput( // turn diag_unsqueezed into complex auto numel = diag_unsqueezed.numel(); Tensor diag_unsqueezed_complex; - auto* data_diag_un = diag_unsqueezed.data>(); + auto* data_diag_un = diag_unsqueezed.data>(); auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( diag_unsqueezed.dims(), context.GetPlace(), static_cast(numel * sizeof(Tout))); diff --git a/paddle/fluid/operators/eigh_op.h b/paddle/fluid/operators/eigh_op.h index 294794877b3..5279ec75093 100644 --- a/paddle/fluid/operators/eigh_op.h +++ b/paddle/fluid/operators/eigh_op.h @@ -40,7 +40,7 @@ template class EighGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; auto& x_grad = *ctx.Output(framework::GradVarName("X")); x_grad.mutable_data(ctx.GetPlace()); auto& output_w = *ctx.Input("Eigenvalues"); diff --git a/paddle/fluid/operators/eigvals_op.h b/paddle/fluid/operators/eigvals_op.h index 59eabfb29b9..4627acc0d07 100644 --- a/paddle/fluid/operators/eigvals_op.h +++ b/paddle/fluid/operators/eigvals_op.h @@ -48,7 +48,7 @@ struct PaddleComplex< template using PaddleCType = typename PaddleComplex::type; template -using Real = typename phi::funcs::Real; +using Real = typename phi::dtype::Real; static void SpiltBatchSquareMatrix(const Tensor& input, std::vector* output) { @@ -144,7 +144,7 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, required_work_mem, work_mem)); int64_t rwork_mem = rwork->memory_size(); - int64_t required_rwork_mem = (n_dim << 1) * sizeof(phi::funcs::Real); + int64_t required_rwork_mem = (n_dim << 1) * sizeof(phi::dtype::Real); PADDLE_ENFORCE_GE( rwork_mem, required_rwork_mem, platform::errors::InvalidArgument( @@ -154,11 +154,11 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, required_rwork_mem, rwork_mem)); int info = 0; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( 'N', 'N', static_cast(n_dim), a.template data(), static_cast(n_dim), output->template data(), NULL, 1, NULL, 1, work->template data(), static_cast(work_mem / sizeof(T)), - rwork->template data>(), &info); + rwork->template data>(), &info); std::string name = "framework::platform::dynload::cgeev_"; if (framework::TransToProtoVarType(input.dtype()) == @@ -188,10 +188,10 @@ class EigvalsKernel : public framework::OpKernel { // query workspace size T qwork; int info; - 
phi::funcs::lapackEig>( + phi::funcs::lapackEig>( 'N', 'N', static_cast(n_dim), input_matrices[0].template data(), static_cast(n_dim), NULL, NULL, 1, NULL, 1, &qwork, -1, - static_cast*>(NULL), &info); + static_cast*>(NULL), &info); int64_t lwork = static_cast(qwork); Tensor work, rwork; @@ -208,7 +208,7 @@ class EigvalsKernel : public framework::OpKernel { } if (framework::IsComplexType( framework::TransToProtoVarType(input->dtype()))) { - rwork.mutable_data>(phi::make_ddim({n_dim << 1}), + rwork.mutable_data>(phi::make_ddim({n_dim << 1}), ctx.GetPlace()); } diff --git a/paddle/fluid/operators/imag_op.cc b/paddle/fluid/operators/imag_op.cc index 33b68d68992..567a69f383d 100644 --- a/paddle/fluid/operators/imag_op.cc +++ b/paddle/fluid/operators/imag_op.cc @@ -83,7 +83,7 @@ DECLARE_INPLACE_OP_INFERER(ImagGradOpInplaceInferer, } // namespace paddle DELCARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); + PT_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index a4c3d1c81fb..3cbbc62e7be 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -46,7 +46,7 @@ template class LstsqCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; const Tensor& x = *context.Input("X"); auto y = context.Input("Y"); @@ -169,7 +169,7 @@ class LstsqCPUKernel : public framework::OpKernel { &rank_32, &wkopt, lwork, &rwkopt, &info); } - lwork = std::max(1, static_cast(phi::funcs::Real(wkopt))); + lwork = std::max(1, static_cast(phi::dtype::Real(wkopt))); Tensor work; work.Resize(phi::make_ddim({lwork})); T* work_data = work.mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index 9b6ebf73d9b..1ade2190bb9 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -63,7 +63,7 @@ struct MatrixEighFunctor { void operator()(const framework::ExecutionContext &ctx, const Tensor &input, Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, bool has_vectors) { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); auto dito = @@ -123,7 +123,7 @@ struct MatrixEighFunctor { for (auto i = 0; i < batch_size; i++) { auto *value_data = out_value + i * values_stride; auto *input_data = input_vector + i * vector_stride; - phi::funcs::lapackEigh>( + phi::funcs::lapackEigh>( jobz, uplo, n, input_data, lda, value_data, work_data, lwork, rwork_data, lrwork, iwork_data, liwork, &info); CheckEighResult(i, info); @@ -151,7 +151,7 @@ struct MatrixEighFunctor { void operator()(const framework::ExecutionContext &ctx, const Tensor &input, Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, bool has_vectors) { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); @@ -233,7 +233,7 @@ struct MatrixEighFunctor { } } - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; inline void EvdBuffer(cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, int n, const T *A, int lda, const ValueType *W, int 
*lwork) const; diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 38692a64611..9994ccc10cb 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -115,7 +115,7 @@ static __global__ void InclusiveScanInnerDimCUDAKernel(const T *x, T *y, size_t num_rows, size_t row_size, T init, BinaryOp op) { - using RealT = phi::funcs::Real; + using RealT = phi::dtype::Real; constexpr auto kSharedBufferSize = framework::IsComplex::value ? 4 * kThreadNumX : 2 * kThreadNumX; __shared__ RealT sbuf[kThreadNumY][kSharedBufferSize]; diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index 5e841a097fe..a57a8d5cf8b 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -56,13 +56,13 @@ class QrGPUKernel : public framework::OpKernel { int tau_stride = min_mn; if (compute_q) { - q.mutable_data>( + q.mutable_data>( context.GetPlace(), - size_t(batch_size * m * k * sizeof(phi::funcs::Real))); + size_t(batch_size * m * k * sizeof(phi::dtype::Real))); } - r.mutable_data>( + r.mutable_data>( context.GetPlace(), - size_t(batch_size * k * n * sizeof(phi::funcs::Real))); + size_t(batch_size * k * n * sizeof(phi::dtype::Real))); auto dito = math::DeviceIndependenceTensorOperations { // Note: allocate temporary tensors because of lacking in-place operatios. // Prepare qr Tensor qr; - qr.mutable_data>( + qr.mutable_data>( context.GetPlace(), - size_t(batch_size * m * n * sizeof(phi::funcs::Real))); + size_t(batch_size * m * n * sizeof(phi::dtype::Real))); // BatchedGeqrf performs computation in-place and 'qr' must be a copy of // input paddle::framework::TensorCopy(x, context.GetPlace(), &qr); @@ -126,7 +126,7 @@ class QrGPUKernel : public framework::OpKernel { for (int i = 0; i < batch_size; ++i) { memory::Copy(dev_ctx.GetPlace(), (new_qr_data + i * new_qr_stride), dev_ctx.GetPlace(), (qr_data + i * qr_stride), - qr_stride * sizeof(phi::funcs::Real), + qr_stride * sizeof(phi::dtype::Real), dev_ctx.stream()); } BatchedOrgqr( diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index cef9371fea0..f09a07e96cd 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -74,19 +74,19 @@ class QrCPUKernel : public framework::OpKernel { int q_stride = m * k; int r_stride = k * n; - auto* x_data = x.data>(); + auto* x_data = x.data>(); T* q_data = nullptr; if (compute_q) { - q_data = q.mutable_data>( + q_data = q.mutable_data>( context.GetPlace(), - size_t(batch_size * m * k * sizeof(phi::funcs::Real))); + size_t(batch_size * m * k * sizeof(phi::dtype::Real))); memset(q_data, 0, - size_t(batch_size * m * k * sizeof(phi::funcs::Real))); + size_t(batch_size * m * k * sizeof(phi::dtype::Real))); } - auto* r_data = r.mutable_data>( + auto* r_data = r.mutable_data>( context.GetPlace(), - size_t(batch_size * k * n * sizeof(phi::funcs::Real))); - memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::funcs::Real))); + size_t(batch_size * k * n * sizeof(phi::dtype::Real))); + memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::dtype::Real))); // Implement QR by calling Eigen for (int i = 0; i < batch_size; ++i) { @@ -142,7 +142,7 @@ class QrGradKernel : public framework::OpKernel { // Use a different name dA instead of dX framework::Tensor& dA = *ctx.Output(framework::GradVarName("X")); - dA.mutable_data>(ctx.GetPlace()); + dA.mutable_data>(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); 
phi::funcs::SetConstant()(dev_ctx, &dA, T(0)); @@ -224,7 +224,7 @@ class QrGradKernel : public framework::OpKernel { } else { // If m < n for input matrices A, we partition A = [X|Y] and R = [U|V] // Calculate dX and dY individually and concatenate them to get dA - dA.mutable_data>(ctx.GetPlace()); + dA.mutable_data>(ctx.GetPlace()); auto Y = dito.Slice(A, {-1}, {m}, {n}); auto U = dito.Slice(R, {-1}, {0}, {m}); diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc index 1f3691978b5..28a8484f539 100644 --- a/paddle/fluid/operators/real_op.cc +++ b/paddle/fluid/operators/real_op.cc @@ -83,7 +83,7 @@ DECLARE_INPLACE_OP_INFERER(RealGradOpInplaceInferer, } // namespace paddle DELCARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); + PT_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index bcb3ee44f04..166f49999d5 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -105,7 +105,7 @@ struct RealMulComplexFunctor { "The image part of y must to be 0" "but got [%d]", y.imag)); - return platform::complex>(x.real * y.real, + return platform::complex>(x.real * y.real, x.imag * y.real); } }; @@ -391,11 +391,11 @@ struct DeviceIndependenceTensorOperations { // batch_diag for CPU only Tensor BatchDiag(const Tensor& x, int batch) { Tensor out; - auto* x_data = x.data>(); + auto* x_data = x.data>(); auto numel = x.numel(); - auto* out_data = out.mutable_data>( + auto* out_data = out.mutable_data>( x.dims(), context.GetPlace(), - static_cast(numel * sizeof(phi::funcs::Real))); + static_cast(numel * sizeof(phi::dtype::Real))); auto x_dims = x.dims(); int num_dims = x_dims.size(); @@ -661,9 +661,9 @@ struct DeviceIndependenceTensorOperations { Tensor Real(const Tensor& x) { Tensor out; auto numel = x.numel(); - auto* out_data = out.mutable_data>( + auto* out_data = out.mutable_data>( x.dims(), context.GetPlace(), - static_cast(numel * sizeof(phi::funcs::Real))); + static_cast(numel * sizeof(phi::dtype::Real))); auto* x_data = x.data(); auto for_range = GetForRange(numel); phi::funcs::RealFunctor functor(x_data, out_data, numel); diff --git a/paddle/fluid/operators/svd_op.h b/paddle/fluid/operators/svd_op.h index f5e451ac705..42a847206a3 100644 --- a/paddle/fluid/operators/svd_op.h +++ b/paddle/fluid/operators/svd_op.h @@ -46,14 +46,14 @@ class SvdCPUKernel : public framework::OpKernel { int col_u = full ? rows : k; int col_v = full ? 
cols : k; int batches = numel / (rows * cols); - auto* U_out = U->mutable_data>( + auto* U_out = U->mutable_data>( context.GetPlace(), - size_t(batches * rows * col_u * sizeof(phi::funcs::Real))); - auto* VH_out = VH->mutable_data>( + size_t(batches * rows * col_u * sizeof(phi::dtype::Real))); + auto* VH_out = VH->mutable_data>( context.GetPlace(), - size_t(batches * col_v * cols * sizeof(phi::funcs::Real))); - auto* S_out = S->mutable_data>( - context.GetPlace(), size_t(batches * k * sizeof(phi::funcs::Real))); + size_t(batches * col_v * cols * sizeof(phi::dtype::Real))); + auto* S_out = S->mutable_data>( + context.GetPlace(), size_t(batches * k * sizeof(phi::dtype::Real))); /*SVD Use the Eigen Library*/ math::BatchSvd(x_data, U_out, VH_out, S_out, rows, cols, batches, full); } diff --git a/paddle/phi/common/type_traits.h b/paddle/phi/common/type_traits.h new file mode 100644 index 00000000000..ef894eee468 --- /dev/null +++ b/paddle/phi/common/type_traits.h @@ -0,0 +1,96 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/data_type.h" + +namespace phi { +namespace dtype { + +template +struct cond { + static constexpr bool value = B; + using type = T; +}; + +template +struct eval_if { + using type = typename TrueF::type; +}; + +template +struct eval_if { + using type = typename FalseF::type; +}; + +template +using eval_if_t = typename eval_if::type; + +template +struct select { + using type = eval_if_t>; +}; + +template +struct select { + using type = T; +}; + +template +struct select> { + // last one had better be true! + static_assert(B, "No match select type!"); + using type = T; +}; + +template +using select_t = typename select::type; + +// runtime real and complex type conversion + +template +using Real = select_t>::value, float>, + cond>::value, double>, + T>; + +template +using Complex = select_t::value, complex>, + cond::value, complex>, + T>; + +inline DataType ToReal(DataType dtype) { + switch (dtype) { + case phi::DataType::COMPLEX64: + return phi::DataType::FLOAT32; + case phi::DataType::COMPLEX128: + return phi::DataType::FLOAT64; + default: + return dtype; + } +} + +inline DataType ToComplex(DataType dtype) { + switch (dtype) { + case phi::DataType::FLOAT32: + return phi::DataType::COMPLEX64; + case phi::DataType::FLOAT64: + return phi::DataType::COMPLEX128; + default: + return dtype; + } +} + +} // namespace dtype +} // namespace phi diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 983e0162264..fbd9259a83f 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/kernels/funcs/unfold_functor.h" @@ -51,6 +52,12 @@ void UnchangedInferMetaCheckAxis(const MetaTensor& x, out->share_meta(x); } +void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out) { + out->set_dims(x.dims()); + out->set_dtype(dtype::ToReal(x.dtype())); + out->set_layout(x.layout()); +} + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index a2d779e0f70..3c0628981af 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -39,6 +39,8 @@ void UnchangedInferMetaCheckAxis(const MetaTensor& x, int axis, MetaTensor* out); +void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out); + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, diff --git a/paddle/phi/kernels/cpu/abs_kernel.cc b/paddle/phi/kernels/cpu/abs_kernel.cc index efe7d090405..9f89fc27a71 100644 --- a/paddle/phi/kernels/cpu/abs_kernel.cc +++ b/paddle/phi/kernels/cpu/abs_kernel.cc @@ -25,9 +25,9 @@ template void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { auto numel = x.numel(); auto* x_data = x.data(); - ctx.template Alloc>( - out, size_t(x.numel() * sizeof(phi::funcs::Real))); - auto* out_data = out->data>(); + ctx.template Alloc>( + out, size_t(x.numel() * sizeof(phi::dtype::Real))); + auto* out_data = out->data>(); phi::funcs::ForRange for_range(ctx, numel); phi::funcs::AbsFunctor functor(x_data, out_data, numel); diff --git a/paddle/phi/kernels/cpu/complex_kernel.cc b/paddle/phi/kernels/cpu/complex_kernel.cc index 801502e1673..859d5a84527 100644 --- a/paddle/phi/kernels/cpu/complex_kernel.cc +++ b/paddle/phi/kernels/cpu/complex_kernel.cc @@ -37,11 +37,15 @@ PD_REGISTER_KERNEL(real, ALL_LAYOUT, phi::RealKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} PD_REGISTER_KERNEL(imag, CPU, ALL_LAYOUT, phi::ImagKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/funcs/complex_functors.h b/paddle/phi/kernels/funcs/complex_functors.h index 86dbdd099ec..8b292cb5dc5 100644 --- a/paddle/phi/kernels/funcs/complex_functors.h +++ b/paddle/phi/kernels/funcs/complex_functors.h @@ -20,56 +20,12 @@ limitations under the License. */ #include #include "paddle/phi/common/complex.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/hostdevice.h" namespace phi { namespace funcs { -template -struct cond { - static constexpr bool value = B; - using type = T; -}; - -template -struct eval_if { - using type = typename TrueF::type; -}; - -template -struct eval_if { - using type = typename FalseF::type; -}; - -template -using eval_if_t = typename eval_if::type; - -template -struct select { - using type = eval_if_t>; -}; - -template -struct select { - using type = T; -}; - -template -struct select> { - // last one had better be true! 
- static_assert(B, "No match select type!"); - using type = T; -}; - -template -using select_t = typename select::type; - -template -using Real = - select_t>::value, float>, - cond>::value, double>, - T>; - template using Complex = typename std::enable_if::value>::type; @@ -91,9 +47,9 @@ template struct RealFunctor; template -struct RealFunctor>> { +struct RealFunctor>> { public: - RealFunctor(const T* input, Real* output, int64_t numel) + RealFunctor(const T* input, dtype::Real* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -102,7 +58,7 @@ struct RealFunctor>> { private: const T* input_; - Real* output_; + dtype::Real* output_; int64_t numel_; }; @@ -110,8 +66,8 @@ template struct ImagFunctor; template -struct ImagFunctor>> { - ImagFunctor(const T* input, Real* output, int64_t numel) +struct ImagFunctor>> { + ImagFunctor(const T* input, dtype::Real* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -119,7 +75,7 @@ struct ImagFunctor>> { } const T* input_; - Real* output_; + dtype::Real* output_; int64_t numel_; }; @@ -127,8 +83,8 @@ template struct AbsFunctor; template -struct AbsFunctor>> { - AbsFunctor(const T* input, Real* output, int64_t numel) +struct AbsFunctor>> { + AbsFunctor(const T* input, dtype::Real* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -136,12 +92,12 @@ struct AbsFunctor>> { } const T* input_; - Real* output_; + dtype::Real* output_; int64_t numel_; }; template -struct AbsFunctor>> { +struct AbsFunctor>> { AbsFunctor(const T* input, T* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} @@ -203,7 +159,10 @@ struct AbsGradCUDAFunctor> { template struct AbsGradFunctor { - AbsGradFunctor(const Real* dout, const T* x, T* output, int64_t numel) + AbsGradFunctor(const dtype::Real* dout, + const T* x, + T* output, + int64_t numel) : dout_(dout), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -214,7 +173,7 @@ struct AbsGradFunctor { } } - const Real* dout_; + const dtype::Real* dout_; const T* x_; T* output_; int64_t numel_; @@ -334,8 +293,8 @@ template struct RealToComplexFunctor; template -struct RealToComplexFunctor>> { - RealToComplexFunctor(const Real* input, T* output, int64_t numel) +struct RealToComplexFunctor>> { + RealToComplexFunctor(const dtype::Real* input, T* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -343,7 +302,7 @@ struct RealToComplexFunctor>> { output_[idx].imag = 0; } - const Real* input_; + const dtype::Real* input_; T* output_; int64_t numel_; }; @@ -352,8 +311,8 @@ template struct ImagToComplexFunctor; template -struct ImagToComplexFunctor>> { - ImagToComplexFunctor(const Real* input, T* output, int64_t numel) +struct ImagToComplexFunctor>> { + ImagToComplexFunctor(const dtype::Real* input, T* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -361,7 +320,7 @@ struct ImagToComplexFunctor>> { output_[idx].imag = input_[idx]; } - const Real* input_; + const dtype::Real* input_; T* output_; int64_t numel_; }; @@ -370,9 +329,9 @@ template struct RealImagToComplexFunctor; template -struct RealImagToComplexFunctor>> { - RealImagToComplexFunctor(const Real* input_real, - const Real* input_imag, +struct 
RealImagToComplexFunctor>> { + RealImagToComplexFunctor(const dtype::Real* input_real, + const dtype::Real* input_imag, T* output, int64_t numel) : input_real_(input_real), @@ -385,8 +344,8 @@ struct RealImagToComplexFunctor>> { output_[idx].imag = input_imag_[idx]; } - const Real* input_real_; - const Real* input_imag_; + const dtype::Real* input_real_; + const dtype::Real* input_imag_; T* output_; int64_t numel_; }; @@ -423,8 +382,8 @@ struct AngleFunctor; // angel function for complex template -struct AngleFunctor>> { - AngleFunctor(const T* input, phi::funcs::Real* output, int64_t numel) +struct AngleFunctor>> { + AngleFunctor(const T* input, dtype::Real* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -432,13 +391,13 @@ struct AngleFunctor>> { } const T* input_; - phi::funcs::Real* output_; + dtype::Real* output_; int64_t numel_; }; // angel function for real template -struct AngleFunctor>> { +struct AngleFunctor>> { AngleFunctor(const T* input, T* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} @@ -456,25 +415,22 @@ struct AngleGradFunctor; // angle grad for complex template -struct AngleGradFunctor>> { - AngleGradFunctor(const phi::funcs::Real* dout, - const T* x, - T* dx, - int64_t numel) +struct AngleGradFunctor>> { + AngleGradFunctor(const dtype::Real* dout, const T* x, T* dx, int64_t numel) : dout_(dout), x_(x), dx_(dx), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { if (x_[idx] == T(0)) { dx_[idx] = T(0); } else { - const phi::funcs::Real r_square = + const phi::dtype::Real r_square = x_[idx].real * x_[idx].real + x_[idx].imag * x_[idx].imag; dx_[idx] = T(-dout_[idx] * x_[idx].imag / r_square, dout_[idx] * x_[idx].real / r_square); } } - const phi::funcs::Real* dout_; + const phi::dtype::Real* dout_; const T* x_; T* dx_; int64_t numel_; @@ -482,16 +438,13 @@ struct AngleGradFunctor>> { // angle grad for real template -struct AngleGradFunctor>> { - AngleGradFunctor(const phi::funcs::Real* dout, - const T* x, - T* dx, - int64_t numel) +struct AngleGradFunctor>> { + AngleGradFunctor(const dtype::Real* dout, const T* x, T* dx, int64_t numel) : dout_(dout), x_(x), dx_(dx), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { dx_[idx] = 0; } - const phi::funcs::Real* dout_; + const dtype::Real* dout_; const T* x_; T* dx_; int64_t numel_; diff --git a/paddle/phi/kernels/gpu/abs_kernel.cu b/paddle/phi/kernels/gpu/abs_kernel.cu index e122e6b1e9c..5c424316a83 100644 --- a/paddle/phi/kernels/gpu/abs_kernel.cu +++ b/paddle/phi/kernels/gpu/abs_kernel.cu @@ -27,14 +27,14 @@ template struct CudaAbsFunctor; template -struct CudaAbsFunctor>> { - __device__ __forceinline__ phi::funcs::Real operator()(const T x) const { +struct CudaAbsFunctor>> { + __device__ __forceinline__ phi::dtype::Real operator()(const T x) const { return abs(x); } }; template -struct CudaAbsFunctor>> { +struct CudaAbsFunctor>> { __device__ __forceinline__ T operator()(const T x) const { return std::abs(x); } @@ -42,12 +42,12 @@ struct CudaAbsFunctor>> { template void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { - ctx.template Alloc>(out); + ctx.template Alloc>(out); std::vector ins = {&x}; std::vector outs = {out}; auto functor = CudaAbsFunctor(); - funcs::ElementwiseKernel>(ctx, ins, &outs, functor); + funcs::ElementwiseKernel>(ctx, ins, &outs, functor); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu 
b/paddle/phi/kernels/gpu/complex_kernel.cu index d0b086718a4..e03e079581a 100644 --- a/paddle/phi/kernels/gpu/complex_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_kernel.cu @@ -38,11 +38,15 @@ PD_REGISTER_KERNEL(real, ALL_LAYOUT, phi::RealKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} PD_REGISTER_KERNEL(imag, GPU, ALL_LAYOUT, phi::ImagKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h index 78c25200bbd..9dad40b57c9 100644 --- a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h @@ -47,7 +47,7 @@ void AbsGradKernel(const Context& ctx, const DenseTensor& dout, DenseTensor* dx) { auto numel = dout.numel(); - auto* dout_data = dout.data>(); + auto* dout_data = dout.data>(); auto* x_data = x.data(); ctx.template Alloc(dx, static_cast(numel * sizeof(T))); diff --git a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h index a10481284b1..03896a2353d 100644 --- a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h @@ -24,7 +24,7 @@ void RealGradKernel(const Context& dev_ctx, const DenseTensor& dout, DenseTensor* dx) { auto numel = dout.numel(); - auto* dout_data = dout.data>(); + auto* dout_data = dout.data>(); auto* dx_data = dev_ctx.template Alloc(dx, static_cast(numel * sizeof(T))); @@ -38,7 +38,7 @@ void ImagGradKernel(const Context& dev_ctx, const DenseTensor& dout, DenseTensor* dx) { auto numel = dout.numel(); - auto* dout_data = dout.data>(); + auto* dout_data = dout.data>(); auto* dx_data = dev_ctx.template Alloc(dx, static_cast(numel * sizeof(T))); diff --git a/paddle/phi/kernels/impl/complex_kernel_impl.h b/paddle/phi/kernels/impl/complex_kernel_impl.h index ff5cf86ed2e..72b13288339 100644 --- a/paddle/phi/kernels/impl/complex_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_kernel_impl.h @@ -39,8 +39,8 @@ void RealKernel(const Context& dev_ctx, DenseTensor* out) { auto numel = x.numel(); auto* x_data = x.data(); - auto* out_data = dev_ctx.template Alloc>( - out, static_cast(numel * sizeof(phi::funcs::Real))); + auto* out_data = dev_ctx.template Alloc>( + out, static_cast(numel * sizeof(phi::dtype::Real))); phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::RealFunctor functor(x_data, out_data, numel); @@ -53,8 +53,8 @@ void ImagKernel(const Context& dev_ctx, DenseTensor* out) { auto numel = x.numel(); auto* x_data = x.data(); - auto* out_data = dev_ctx.template Alloc>( - out, static_cast(numel * sizeof(phi::funcs::Real))); + auto* out_data = dev_ctx.template Alloc>( + out, static_cast(numel * sizeof(phi::dtype::Real))); phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::ImagFunctor functor(x_data, out_data, numel); diff --git a/paddle/phi/tests/common/test_data_type.cc b/paddle/phi/tests/common/test_data_type.cc index c962c68b4d5..5a1b41d796d 100644 --- a/paddle/phi/tests/common/test_data_type.cc +++ b/paddle/phi/tests/common/test_data_type.cc @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/phi/api/ext/exception.h" #include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/type_traits.h" namespace phi { namespace tests { @@ -71,5 +72,20 @@ TEST(DataType, OStream) { } } +TEST(TypeTraits, Complex) { + EXPECT_EQ(phi::dtype::ToReal(phi::DataType::COMPLEX64), + phi::DataType::FLOAT32); + EXPECT_EQ(phi::dtype::ToReal(phi::DataType::COMPLEX128), + phi::DataType::FLOAT64); + EXPECT_EQ(phi::dtype::ToReal(phi::DataType::FLOAT32), phi::DataType::FLOAT32); + + EXPECT_EQ(phi::dtype::ToComplex(phi::DataType::FLOAT32), + phi::DataType::COMPLEX64); + EXPECT_EQ(phi::dtype::ToComplex(phi::DataType::FLOAT64), + phi::DataType::COMPLEX128); + EXPECT_EQ(phi::dtype::ToComplex(phi::DataType::COMPLEX64), + phi::DataType::COMPLEX64); +} + } // namespace tests } // namespace phi -- GitLab From 90ab7403753acad5c93b425f6a909a526aa57a3d Mon Sep 17 00:00:00 2001 From: Lijunhui <1578034415@qq.com> Date: Wed, 2 Mar 2022 15:11:42 +0800 Subject: [PATCH 047/272] [KP] Activation op registration for XPU2. part 1/2 (#40002) --- .../{activation_op.cu => activation_op.kps} | 64 +++++++++++++++++++ .../platform/device/xpu/xpu_op_kpfirst_list.h | 26 ++++++++ 2 files changed, 90 insertions(+) rename paddle/fluid/operators/{activation_op.cu => activation_op.kps} (94%) diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.kps similarity index 94% rename from paddle/fluid/operators/activation_op.cu rename to paddle/fluid/operators/activation_op.kps index e578ad899e7..e1afb3919f8 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.kps @@ -1861,3 +1861,67 @@ REGISTER_OP_CUDA_KERNEL( __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ CudaHardSwishGradFunctor); FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) + +#ifdef PADDLE_WITH_XPU_KP +#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, op_name, functor, \ + grad_functor) \ + REGISTER_OP_KERNEL( \ + act_type, KP, plat::XPUPlace, \ + ops::ActivationCudaKernel>); \ + REGISTER_OP_KERNEL(act_type##_grad, KP, plat::XPUPlace, \ + ops::ActivationGradCudaKernel>); + +REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, + CudaLeakyReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(relu, Relu, CudaReluFunctor, + CudaReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, + CudaSigmoidGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(exp, Exp, CudaExpFunctor, CudaExpGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor, + CudaReciprocalGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(softplus, Softplus, CudaSoftplusFunctor, + CudaSoftplusGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, HardSwish, CudaHardSwishFunctor, + CudaHardSwishGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(elu, Elu, CudaELUFunctor, CudaELUGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(celu, Celu, CudaCELUFunctor, + CudaCELUGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, + CudaSqrtGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(square, Square, CudaSquareFunctor, + CudaSquareGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(silu, Silu, CudaSiluFunctor, + CudaSiluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, + CudaLogSigmoidGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor, + CudaSoftShrinkGradFunctor); 
+REGISTER_ACTIVATION_XPU_KERNEL(ceil, Ceil, CudaCeilFunctor, + CudaZeroGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(floor, Floor, CudaFloorFunctor, + CudaZeroGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(log1p, Log1p, CudaLog1pFunctor, + CudaLog1pGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(brelu, BRelu, CudaBReluFunctor, + CudaBReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor, + CudaSoftReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(softsign, Softsign, CudaSoftsignFunctor, + CudaSoftsignGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(relu6, Relu6, CudaRelu6Functor, + CudaRelu6GradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor, + CudaHardShrinkGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(hard_sigmoid, HardSigmoid, + CudaHardSigmoidFunctor, + CudaHardSigmoidGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(swish, Swish, CudaSwishFunctor, + CudaSwishGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(thresholded_relu, ThresholdedRelu, + CudaThresholdedReluFunctor, + CudaThresholdedReluGradFunctor); + +#endif // PADDLE_WITH_XPU_KP diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index f79ef8505d8..c5dff84723c 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -30,6 +30,32 @@ XPUOpMap& get_kp_ops() { static XPUOpMap s_xpu_kp_kernels{ {"elementwise_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + // activation op + {"exp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"leaky_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softplus", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reciprocal", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"celu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sqrt", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"square", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"silu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logsigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softshrink", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"ceil", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"floor", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log1p", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"brelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"soft_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softsign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu6", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_shrink", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_sigmoid", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, }; return s_xpu_kp_kernels; -- GitLab From 244ae318c2fbfea0ab4315a17f6e6296c6be2624 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Wed, 2 Mar 2022 15:24:36 +0800 Subject: [PATCH 048/272] [fleet_executor] Add entrance of 
FleetExecutor in AnalysisPredictor for distributed inference (#39992) --- .../distributed/fleet_executor/carrier.cc | 24 +- .../distributed/fleet_executor/carrier.h | 7 +- .../fleet_executor/fleet_executor.cc | 48 ++- .../fleet_executor/fleet_executor.h | 10 +- .../distributed/fleet_executor/task_node.cc | 11 +- .../distributed/fleet_executor/task_node.h | 2 +- paddle/fluid/inference/api/analysis_config.cc | 3 + .../fluid/inference/api/analysis_predictor.cc | 289 +++++++++++++++++- .../fluid/inference/api/analysis_predictor.h | 59 ++++ .../inference/api/paddle_analysis_config.h | 57 ++++ .../fluid/inference/tests/api/CMakeLists.txt | 6 + .../tests/api/analyzer_dist_model_tester.cc | 72 +++++ paddle/fluid/pybind/bind_fleet_executor.cc | 2 +- paddle/fluid/pybind/inference_api.cc | 19 +- python/paddle/fluid/executor.py | 5 +- 15 files changed, 581 insertions(+), 33 deletions(-) create mode 100644 paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 56d8da3eca4..0d5d328fd32 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include + #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" @@ -46,7 +48,8 @@ void Carrier::Init( const std::unordered_map& interceptor_id_to_rank, const std::unordered_map& interceptor_id_to_node, const framework::ProgramDesc& program, framework::Scope* scope, - int64_t num_micro_batches, const platform::Place& place) { + int64_t num_micro_batches, const platform::Place& place, + const std::vector& inference_root_scope_vars) { rank_ = rank; interceptor_id_to_rank_ = interceptor_id_to_rank; interceptor_id_to_node_ = interceptor_id_to_node; @@ -60,7 +63,7 @@ void Carrier::Init( microbatch_scopes_.resize(num_micro_batches); for (int i = 0; i < num_micro_batches; ++i) { microbatch_scopes_[i] = &minibatch_scope_->NewScope(); - CopyParameters(i, program); + CopyParameters(i, program, inference_root_scope_vars); } // TODO(fleet_exe dev): thread pool @@ -80,12 +83,23 @@ void Carrier::Release() { Carrier::~Carrier() { VLOG(3) << "Carrier's destructor."; } -void Carrier::CopyParameters(int microbatch_id, - const framework::ProgramDesc& program) { +void Carrier::CopyParameters( + int microbatch_id, const framework::ProgramDesc& program, + const std::vector& inference_root_scope_vars) { auto& global_block = program.Block(0); + std::map inference_root_scope_var_map; + for (auto var_name : inference_root_scope_vars) { + inference_root_scope_var_map.insert({var_name, 1}); + } for (auto& var : global_block.AllVars()) { - if (var->Persistable() && microbatch_id == 0) { + std::string var_name = var->Name(); + bool force_root = inference_root_scope_var_map.find(var_name) != + inference_root_scope_var_map.end(); + if (force_root) { + VLOG(4) << var_name << " will be forced to be created in the root scope."; + } + if ((var->Persistable() || force_root) && microbatch_id == 0) { auto* ptr = root_scope_->Var(var->Name()); InitializeVariable(ptr, var->GetType()); VLOG(5) << "Create persistable var: " << var->Name() diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index 
9a74fa78c0e..d35a3260915 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -57,9 +57,12 @@ class Carrier final { const std::unordered_map& interceptor_id_to_rank, const std::unordered_map& interceptor_id_to_node, const framework::ProgramDesc& program, framework::Scope* scope, - int64_t num_micro_batches, const platform::Place& place); + int64_t num_micro_batches, const platform::Place& place, + const std::vector& inference_root_scope_vars = {}); - void CopyParameters(int microbatch_id, const framework::ProgramDesc& program); + void CopyParameters( + int microbatch_id, const framework::ProgramDesc& program, + const std::vector& inference_root_scope_vars); void Release(); void Wait(); diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index 457549a27b4..e946d78550f 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/global.h" @@ -52,7 +53,8 @@ void FleetExecutor::Init( const std::string& carrier_id, const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place, int64_t num_micro_batches, const std::vector& task_nodes, - const std::unordered_map& task_id_to_rank) { + const std::unordered_map& task_id_to_rank, + const std::vector& inference_root_scope_vars) { PADDLE_ENFORCE_GT(task_nodes.size(), 0, platform::errors::InvalidArgument( "Fleet executor is inited with empty task node")); @@ -64,6 +66,37 @@ void FleetExecutor::Init( } } auto unused_vars = framework::GetUnusedVars(program_desc.Block(0), ops, {}); + // NOTE: For inference, the vars in inference_root_scope_vars + // shouldn't be deleted during inf, for that they may be the result of the + // inf. If they are GCed, it will cause error during ZeroCopy the result. 
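The hunk that follows prunes the inference output variables out of the garbage-collection bookkeeping returned by GetUnusedVars. As a minimal standalone sketch of just that container manipulation, with plain std::string keys standing in for framework::OperatorBase* and purely hypothetical op/var names (it assumes nothing about Paddle itself):

#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  // op -> vars that would normally be freed right after the op runs
  std::map<std::string, std::vector<std::string>> unused_vars{
      {"matmul", {"tmp_0", "fetch_out"}}, {"scale", {"tmp_1"}}};
  // inference results that must stay alive for ZeroCopy fetching
  const std::vector<std::string> keep_alive{"fetch_out"};

  for (auto& pair : unused_vars) {
    auto& unused = pair.second;
    for (const auto& name : keep_alive) {
      auto it = std::find(unused.begin(), unused.end(), name);
      if (it != unused.end()) unused.erase(it);
    }
  }
  // drop ops whose unused-var list became empty
  for (auto it = unused_vars.begin(); it != unused_vars.end();) {
    if (it->second.empty()) {
      it = unused_vars.erase(it);
    } else {
      ++it;
    }
  }
  for (const auto& pair : unused_vars) {
    std::cout << pair.first << " still frees " << pair.second.size()
              << " var(s)\n";
  }
  return 0;
}

The same erase-then-compact pattern is what keeps the fetched results visible in the root scope after the carrier finishes a run.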
+ std::vector changed_ops; + for (auto pair : unused_vars) { + const framework::OperatorBase* op = pair.first; + std::vector unused = pair.second; + for (auto name : inference_root_scope_vars) { + auto iter = std::find(unused.begin(), unused.end(), name); + if (iter != unused.end()) { + VLOG(3) << "Removing var: [" << name + << "] from the unused vars list of op: [" << op->Type() << "]"; + unused.erase(iter); + if (std::find(changed_ops.begin(), changed_ops.end(), op) == + changed_ops.end()) { + // record the op whose unused vars have been updated + changed_ops.emplace_back(op); + } + } + } + // update the unused vars list in the map + unused_vars[op] = unused; + } + for (auto op : changed_ops) { + auto iter = unused_vars.find(op); + if (iter->second.empty()) { + // remove those ops in the map that have empty unused vars list + VLOG(3) << "Removing op: [" << op->Type() << "] from unused_vars map."; + unused_vars.erase(iter); + } + } runtime_graph_ = std::make_shared(); std::unordered_map interceptor_id_to_task; for (auto task_node : task_nodes) { @@ -82,17 +115,18 @@ void FleetExecutor::Init( carrier_ids_.insert(carrier_id); // Set current running carrier GlobalVal::Set(new std::string(carrier_id)); - InitCarrier(carrier, scope, place, num_micro_batches, program_desc); + InitCarrier(carrier, scope, place, num_micro_batches, program_desc, + inference_root_scope_vars); GlobalVal::Get()->Barrier(); } -void FleetExecutor::InitCarrier(Carrier* carrier, framework::Scope* scope, - const platform::Place& place, - int64_t num_micro_batches, - const framework::ProgramDesc& program_desc) { +void FleetExecutor::InitCarrier( + Carrier* carrier, framework::Scope* scope, const platform::Place& place, + int64_t num_micro_batches, const framework::ProgramDesc& program_desc, + const std::vector& inference_root_scope_vars) { carrier->Init(exe_desc_.cur_rank(), runtime_graph_->interceptor_id_to_rank(), runtime_graph_->interceptor_id_to_node(), program_desc, scope, - num_micro_batches, place); + num_micro_batches, place, inference_root_scope_vars); } void FleetExecutor::InitMessageBus() { diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h index fa65309127b..ccdb3dcc459 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.h +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h @@ -42,15 +42,17 @@ class FleetExecutor final { const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place, int64_t num_micro_batches, const std::vector& task_nodes, - const std::unordered_map& task_id_to_rank); + const std::unordered_map& task_id_to_rank, + const std::vector& inference_root_scope_vars = {}); void Run(const std::string& carrier_id); private: DISABLE_COPY_AND_ASSIGN(FleetExecutor); void InitMessageBus(); - void InitCarrier(Carrier* carrier, framework::Scope* scope, - const platform::Place& place, int64_t num_micro_batches, - const framework::ProgramDesc& program_desc); + void InitCarrier( + Carrier* carrier, framework::Scope* scope, const platform::Place& place, + int64_t num_micro_batches, const framework::ProgramDesc& program_desc, + const std::vector& inference_root_scope_vars = {}); FleetExecutorDesc exe_desc_; std::shared_ptr runtime_graph_; std::unordered_set carrier_ids_; diff --git a/paddle/fluid/distributed/fleet_executor/task_node.cc b/paddle/fluid/distributed/fleet_executor/task_node.cc index 6de7038b323..95e4c733059 100644 --- 
a/paddle/fluid/distributed/fleet_executor/task_node.cc +++ b/paddle/fluid/distributed/fleet_executor/task_node.cc @@ -52,11 +52,20 @@ void TaskNode::SetProgram(paddle::framework::ProgramDesc* program) { program_ = program; } -void TaskNode::Init() { +void TaskNode::Init(bool use_feed_fetch_ops) { + if (!use_feed_fetch_ops) { + VLOG(3) << "TaskNode will be inited without feed and fetch ops"; + } if (ops_.empty()) { // Q (for fleet executor dev): should we need another reset funct? VLOG(3) << "Task node will be inited by calling Init()."; for (const auto& op_desc : program_->Block(0).AllOps()) { + if (!use_feed_fetch_ops && + (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) { + VLOG(3) << "TaskNode will skip [" << op_desc->Input("X")[0] << "], " + << op_desc->Type() << " -> " << op_desc->Output("Out")[0]; + continue; + } ops_vec_.emplace_back(framework::OpRegistry::CreateOp(*op_desc)); } for (const auto& op : ops_vec_) { diff --git a/paddle/fluid/distributed/fleet_executor/task_node.h b/paddle/fluid/distributed/fleet_executor/task_node.h index b655d140d37..4764d4fd4af 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.h +++ b/paddle/fluid/distributed/fleet_executor/task_node.h @@ -46,7 +46,7 @@ class TaskNode final { ~TaskNode() = default; void SetProgram(paddle::framework::ProgramDesc* program); - void Init(); + void Init(bool use_feed_fetch_ops = true); int64_t rank() const { return rank_; } int64_t task_id() const { return task_id_; } int32_t role() const { return role_; } diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index fd2ccffae3b..9c33d700306 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -274,6 +274,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(ipu_available_memory_proportion_); CP_MEMBER(ipu_enable_half_partial_); + // fleet exe related + CP_MEMBER(dist_config_); + if (use_gpu_) { PADDLE_ENFORCE_EQ(use_xpu_, false, platform::errors::InvalidArgument( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index cd6e3a3c759..5492c3b0d26 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -30,6 +30,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/version.h" @@ -47,6 +48,14 @@ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/utils/string/split.h" + +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" +#include "paddle/fluid/distributed/fleet_executor/task_node.h" +#endif #ifdef PADDLE_WITH_MKLML #include "paddle/fluid/platform/dynload/mklml.h" @@ -186,14 +195,14 @@ bool AnalysisPredictor::Init( return false; } + // Get the feed_target_names and fetch_target_names + PrepareFeedFetch(); + // Prepare executor, create local variables. 
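PrepareFeedFetch is moved ahead of PrepareExecutor here because the fleet-executor path added below reads idx2feeds_/idx2fetches_ before building the task node. The feed/fetch filtering introduced in TaskNode::Init above reduces to a type check while copying op descs; a standalone sketch with a hypothetical block and no Paddle types:

#include <iostream>
#include <string>
#include <vector>

// Collect the op types a task node would instantiate, optionally dropping
// the feed/fetch ops that the inference path serves through the root scope.
std::vector<std::string> SelectOps(const std::vector<std::string>& block_ops,
                                   bool use_feed_fetch_ops) {
  std::vector<std::string> selected;
  for (const auto& type : block_ops) {
    if (!use_feed_fetch_ops && (type == "feed" || type == "fetch")) {
      continue;  // skipped, mirroring TaskNode::Init(false)
    }
    selected.push_back(type);
  }
  return selected;
}

int main() {
  const std::vector<std::string> block{"feed", "conv2d", "relu", "fetch"};
  for (const auto& t : SelectOps(block, /*use_feed_fetch_ops=*/false)) {
    std::cout << t << "\n";  // prints conv2d and relu only
  }
  return 0;
}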
if (!PrepareExecutor()) { return true; } - // Get the feed_target_names and fetch_target_names - PrepareFeedFetch(); - return true; } @@ -359,6 +368,13 @@ static void DisablePrepareDataOpt( } bool AnalysisPredictor::PrepareExecutor() { +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + VLOG(3) << "use_dist_model is enabled, will init FleetExecutor."; + return PrepareFleetExecutor(); + } +#endif DisablePrepareDataOpt(inference_program_, 0, false); executor_->Prepare(sub_scope_, *inference_program_, 0, @@ -371,6 +387,226 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) +bool AnalysisPredictor::PrepareFleetExecutor() { + VLOG(3) << "AnalysisPredictor::PrepareFleetExecutor()"; + if (config_.dist_config().nranks() > 1 && !CommInit()) { + return false; + } + task_node_.reset(new distributed::TaskNode(inference_program_.get(), + config_.dist_config().rank())); + // With auto cut, there is no concept of pp, no need to add dependency. + task_node_->SetType("Compute"); + task_node_->Init(config_.use_feed_fetch_ops_enabled()); + executor_desc_ = distributed::FleetExecutorDesc(); + executor_desc_.set_cur_rank(config_.dist_config().rank()); + std::unordered_map id_to_rank; + for (int i = 0; i < config_.dist_config().nranks(); ++i) { + distributed::RankInfo *rank_info = executor_desc_.add_cluster_info(); + rank_info->set_rank(i); + rank_info->set_ip_port(config_.dist_config().trainer_endpoints()[i]); + id_to_rank.insert({i, i}); + } + fleet_exe_.reset(new distributed::FleetExecutor(executor_desc_)); + // NOTE: Vars of feed fetch ops are not persistable, + // which will result in that those vars will be created in + // the subscope (microscope) in fleet executor. This will + // cause that the GetInputTensor/GetOutputTensor funct + // in analysis predictor cannot find those vars in the scope + // returned by the DistModel, since DistModel only return the + // root scope. 
So, those vars must to be created in the root + // scope instead of in the microscope + std::vector feed_fetch_vars; + for (auto pair : idx2feeds_) { + feed_fetch_vars.emplace_back(pair.second); + } + for (auto pair : idx2fetches_) { + feed_fetch_vars.emplace_back(pair.second); + } + fleet_exe_->Init(config_.dist_config().carrier_id(), + *(inference_program_.get()), scope_.get(), place_, 1, + {task_node_.get()}, id_to_rank, feed_fetch_vars); + return true; +} + +bool AnalysisPredictor::CommInit() { + std::map> ring_id_to_ranks{}; + std::map> rank_to_ring_ids{}; + if (!LoadConverterConfig(&ring_id_to_ranks, &rank_to_ring_ids)) { + VLOG(3) << "Load converter config failed, DistModel init failed."; + return false; + } + std::unique_ptr comm_init_program( + new framework::ProgramDesc()); + framework::BlockDesc *comm_init_block = comm_init_program->MutableBlock(0); + std::vector &ring_ids = + rank_to_ring_ids[config_.dist_config().rank()]; + int64_t order = 0; + std::string var_name_base = "comm_init_"; + for (int64_t ring_id : ring_ids) { + VLOG(3) << "Init comm for ring id: " << ring_id; + int64_t ranks_in_group = ring_id_to_ranks[ring_id].size(); + int64_t rank_in_group = 0; + std::vector &ranks = ring_id_to_ranks[ring_id]; + for (int64_t rank : ranks) { + if (config_.dist_config().rank() == rank) { + break; + } + rank_in_group += 1; + } + std::vector peer_endpoints; + for (int64_t rank : ranks) { + if (config_.dist_config().rank() == rank) { + continue; + } + peer_endpoints.emplace_back( + config_.dist_config().trainer_endpoints()[rank]); + } + InsertCommOp(var_name_base + std::to_string(order), ranks_in_group, + rank_in_group, peer_endpoints, comm_init_block, ring_id); + order += 1; + } + framework::NaiveExecutor e(place_); + e.CreateVariables(*comm_init_program, 0, true, scope_.get()); + e.Prepare(scope_.get(), *comm_init_program, 0, false); + e.Run(); + VLOG(3) << "Comm init successful."; + return true; +} + +void AnalysisPredictor::InsertCommOp( + std::string tmp_var_name, int nranks, int rank, + const std::vector &peer_endpoints, framework::BlockDesc *block, + int ring_id) { + /* + * tmp_var_name: the var name for var comm_id + * nranks: number of total ranks + * rank: the rank of local rank in the comm group + * peer_endpoints: peer's endpoints + * block: the block where to insert the comm ops + * ring_id: the ring_id to be inited + */ + const std::string &endpoint = config_.dist_config().current_endpoint(); + std::stringstream ss; + ss << "Init comm with tmp var: " << tmp_var_name + << ". The ring id is: " << ring_id << ". The group has: " << nranks + << " ranks. Current rank in the group is: " << rank + << ". The endpoint is: " << endpoint << ". 
Peer endpoints are: "; + for (auto ep : peer_endpoints) { + ss << ep << ", "; + } + VLOG(3) << ss.str(); + if (config_.use_gpu()) { + framework::VarDesc *new_var = block->Var(tmp_var_name); + new_var->SetType(framework::proto::VarType::RAW); + new_var->SetPersistable(true); + framework::OpDesc *gen_nccl_id_op = block->AppendOp(); + gen_nccl_id_op->SetType("c_gen_nccl_id"); + gen_nccl_id_op->SetOutput("Out", {tmp_var_name}); + gen_nccl_id_op->SetAttr("rank", rank); + gen_nccl_id_op->SetAttr("endpoint", + config_.dist_config().current_endpoint()); + gen_nccl_id_op->SetAttr("other_endpoints", peer_endpoints); + gen_nccl_id_op->SetAttr("ring_id", ring_id); + gen_nccl_id_op->SetAttr("op_role", + static_cast(framework::OpRole::kForward)); + gen_nccl_id_op->CheckAttrs(); + framework::OpDesc *comm_init_op = block->AppendOp(); + comm_init_op->SetType("c_comm_init"); + comm_init_op->SetInput("X", {tmp_var_name}); + comm_init_op->SetAttr("rank", rank); + comm_init_op->SetAttr("nranks", nranks); + comm_init_op->SetAttr("ring_id", ring_id); + comm_init_op->SetAttr("op_role", + static_cast(framework::OpRole::kForward)); + comm_init_op->CheckAttrs(); + } else { + LOG(WARNING) << "DistModelInf doesn't init comm."; + // TODO(fleet exe dev): comm init for more devices + } +} + +bool AnalysisPredictor::LoadConverterConfig( + std::map> *ring_id_to_ranks, + std::map> *rank_to_ring_ids) { + VLOG(3) << "Going to load converter config from: " + << config_.dist_config().comm_init_config() << "\n"; + std::ifstream fin(config_.dist_config().comm_init_config(), std::ios::in); + PADDLE_ENFORCE_EQ( + static_cast(fin.is_open()), true, + platform::errors::NotFound( + "Cannot open file %s, please confirm whether the file is normal.", + config_.dist_config().comm_init_config())); + std::string line; + bool ring_to_rank{true}; + // Reading config from file, the config file should like these format + // [ring_id -> ranks] + // 0,0,1,2,3 + // 1,0,1 + // 2,2,3 + // 21,0,1 + // 22,1,2 + // 23,2,3 + // [rank -> ring_ids] + // 0,0,1,21 + // 1,0,1,21,22 + // 2,0,2,22,23 + // 3,0,2,23 + while (std::getline(fin, line)) { + std::vector one_line = paddle::string::Split(line, ','); + if (one_line.size() == 1) { + // start a new section of the config + if (line == "[ring_id -> ranks]") { + ring_to_rank = true; + } else if (line == "[rank -> ring_ids]") { + ring_to_rank = false; + } + } else { + // parse key - values pairs in one section + int64_t key = std::stoll(one_line[0]); + for (size_t i = 1; i < one_line.size(); ++i) { + int64_t val = std::stoll(one_line[i]); + if (ring_to_rank) { + if (ring_id_to_ranks->find(key) == ring_id_to_ranks->end()) { + ring_id_to_ranks->insert({key, std::vector()}); + } + ring_id_to_ranks->at(key).emplace_back(val); + } else { + if (rank_to_ring_ids->find(key) == rank_to_ring_ids->end()) { + rank_to_ring_ids->insert({key, std::vector()}); + } + rank_to_ring_ids->at(key).emplace_back(val); + } + // NOTE: add more configuration sections here + } + } + } + std::stringstream ss; + ss << "Loaded the following converter config:\n"; + ss << "ring_id_to_ranks:\n"; + for (auto pair : *ring_id_to_ranks) { + int64_t key = pair.first; + ss << "\t" << key << "\t->\t"; + for (auto value : pair.second) { + ss << value << "\t"; + } + ss << "\n"; + } + ss << "rank_to_ring_ids:\n"; + for (auto pair : *rank_to_ring_ids) { + int64_t key = pair.first; + ss << "\t" << key << "\t->\t"; + for (auto value : pair.second) { + ss << value << "\t"; + } + ss << "\n"; + } + VLOG(3) << ss.str(); + return true; +} +#endif + void 
AnalysisPredictor::MkldnnPreSet(const std::vector &inputs) { #ifdef PADDLE_WITH_MKLDNN std::vector> inputs_shape; @@ -946,13 +1182,24 @@ std::vector AnalysisPredictor::GetOutputNames() { std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { + framework::Scope *scope; +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + scope = scope_.get(); + } else { + scope = executor_->scope(); + } +#else + scope = executor_->scope(); +#endif PADDLE_ENFORCE_NOT_NULL( - executor_->scope()->FindVar(name), + scope->FindVar(name), platform::errors::PreconditionNotMet( - "The variable named %s is not found in the scope of the exector.", + "The variable named %s is not found in the scope of the executor.", name)); std::unique_ptr res( - new ZeroCopyTensor(static_cast(executor_->scope()))); + new ZeroCopyTensor(static_cast(scope))); res->input_or_output_ = true; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -985,13 +1232,24 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( std::unique_ptr AnalysisPredictor::GetOutputTensor( const std::string &name) { + framework::Scope *scope; +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + scope = scope_.get(); + } else { + scope = executor_->scope(); + } +#else + scope = executor_->scope(); +#endif PADDLE_ENFORCE_NOT_NULL( - executor_->scope()->FindVar(name), + scope->FindVar(name), platform::errors::PreconditionNotMet( - "he variable named %s is not found in the scope of the exector.", + "The variable named %s is not found in the scope of the executor.", name)); std::unique_ptr res( - new ZeroCopyTensor(static_cast(executor_->scope()))); + new ZeroCopyTensor(static_cast(scope))); res->input_or_output_ = false; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -1023,6 +1281,18 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( } bool AnalysisPredictor::ZeroCopyRun() { +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + VLOG(3) << "ZeroCopyRun will use the fleet executor."; + inference::Timer timer; + timer.tic(); + fleet_exe_->Run(config_.dist_config().carrier_id()); + VLOG(3) << "Fleet executor inf runs once use: " + << std::to_string(timer.toc()) << "ms"; + return true; + } +#endif paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); #ifdef PADDLE_WITH_MKLDNN if (config_.use_mkldnn_) { @@ -1035,7 +1305,6 @@ bool AnalysisPredictor::ZeroCopyRun() { MkldnnPreSet(shape_vector); } #endif - executor_->Run(); if (config_.shape_range_info_collected()) { diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index a8e56101d37..8ed183dae0b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -18,6 +18,10 @@ #include #include #include +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#endif #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/op_compatible_info.h" #include "paddle/fluid/inference/analysis/analyzer.h" @@ -391,6 +395,53 @@ class AnalysisPredictor : public PaddlePredictor { void StatisticShapeRangeInfo(); 
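The comm-init converter config consumed by LoadConverterConfig (implemented earlier in this patch and declared just below) is a plain text file with two comma-separated sections. A minimal standalone parser over an in-memory string is sketched here; only the two section headers come from the patch, the sample values are made up:

#include <cstdint>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

int main() {
  // Same layout as the comment in LoadConverterConfig describes.
  std::istringstream fin(
      "[ring_id -> ranks]\n0,0,1,2,3\n1,0,1\n"
      "[rank -> ring_ids]\n0,0,1\n1,0,1\n");
  std::map<int64_t, std::vector<int64_t>> ring_id_to_ranks;
  std::map<int64_t, std::vector<int64_t>> rank_to_ring_ids;
  bool ring_to_rank = true;
  std::string line;
  while (std::getline(fin, line)) {
    if (line == "[ring_id -> ranks]") { ring_to_rank = true; continue; }
    if (line == "[rank -> ring_ids]") { ring_to_rank = false; continue; }
    std::stringstream ss(line);
    std::string field;
    std::vector<int64_t> fields;
    while (std::getline(ss, field, ',')) fields.push_back(std::stoll(field));
    if (fields.size() < 2) continue;
    auto* dst = ring_to_rank ? &ring_id_to_ranks : &rank_to_ring_ids;
    (*dst)[fields[0]].assign(fields.begin() + 1, fields.end());
  }
  std::cout << "parsed " << ring_id_to_ranks.size() << " rings and "
            << rank_to_ring_ids.size() << " ranks\n";
  return 0;
}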
void CollectShapeRangeInfo(); +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + // fleet exe related + + /// + /// \brief prepare for fleet executor to run + /// + /// Used in AnalysisPredictor::Init(), + /// + bool PrepareFleetExecutor(); + + /// + /// \brief init NCCL env for multi gpus inference + /// + /// Used in AnalysisPredictor::PrepareFleetExecutor() + /// + bool CommInit(); + + /// + /// \brief read the config to init NCCL env + /// + /// Used in AnalysisPredictor::CommInit() + /// + /// \param[in] ring_id_to_ranks: a ptr to ring_id_to_ranks + /// \param[in] rank_to_ring_ids: a ptr to rank_to_ring_ids + /// + bool LoadConverterConfig( + std::map> *ring_id_to_ranks, + std::map> *rank_to_ring_ids); + + /// + /// \brief add ops and run them with NaiveExecutor to init NCCL env + /// + /// Used in AnalysisPredictor::CommInit() + /// + /// \param[in] tmp_var_name: var name to hold NCCL unique id + /// \param[in] nranks: number of ranks in one comm group + /// \param[in] rank: relative rank of current rank in the comm group + /// \param[in] peer_endpoints: group's peers' endpoints + /// \param[in] block: the block to insert comm ops + /// \param[in] ring_id: the ring id to be used to init NCCL env + /// + void InsertCommOp(std::string tmp_var_name, int nranks, int rank, + const std::vector &peer_endpoints, + framework::BlockDesc *block, int ring_id); +#endif + private: AnalysisConfig config_; Argument argument_; @@ -436,6 +487,14 @@ class AnalysisPredictor : public PaddlePredictor { std::map>> shape_info_; int clone_num_{1}; + +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + // fleet executor related + distributed::FleetExecutorDesc executor_desc_; + std::shared_ptr fleet_exe_; + std::shared_ptr task_node_; +#endif }; } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 180c028c6a6..b4a35839440 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -76,6 +76,54 @@ struct LiteNNAdapterConfig { LiteNNAdapterConfig& Disable(); }; +struct DistConfig { + bool use_dist_model() const { return use_dist_model_; } + void EnableDistModel(bool use_dist_model) { + use_dist_model_ = use_dist_model; + } + + std::vector trainer_endpoints() const { + return trainer_endpoints_; + } + + std::string current_endpoint() const { return current_endpoint_; } + + void SetEndpoints(const std::vector& trainer_endpoints, + const std::string& current_endpoint) { + trainer_endpoints_ = trainer_endpoints; + current_endpoint_ = current_endpoint; + } + + int64_t nranks() const { return nranks_; } + + int64_t rank() const { return rank_; } + + void SetRanks(int64_t nranks, int64_t rank) { + nranks_ = nranks; + rank_ = rank; + } + + std::string comm_init_config() const { return comm_init_config_; } + + void SetCommInitConfig(const std::string& comm_init_config) { + comm_init_config_ = comm_init_config; + } + + void SetCarrierId(const std::string& carrier_id) { carrier_id_ = carrier_id; } + + std::string carrier_id() const { return carrier_id_; } + + protected: + // DistModel Inference related + bool use_dist_model_{false}; // whether use DistModel or not + std::vector trainer_endpoints_{}; // all trainers' endpoints + std::string current_endpoint_{}; // current trainer's endpoint + int64_t nranks_{1}; // total ranks (number of trainers) + 
int64_t rank_{0}; // rank + std::string comm_init_config_{}; // converter config path + std::string carrier_id_{"inference"}; +}; + /// /// \brief configuration manager for AnalysisPredictor. /// \since 1.7.0 @@ -763,6 +811,12 @@ struct PD_INFER_DECL AnalysisConfig { LiteNNAdapterConfig& NNAdapter() { return nnadapter_config_; } + void SetDistConfig(const DistConfig& dist_config) { + dist_config_ = dist_config; + } + + const DistConfig& dist_config() const { return dist_config_; } + protected: // Update the config. void Update(); @@ -902,6 +956,9 @@ struct PD_INFER_DECL AnalysisConfig { mutable bool is_valid_{true}; std::string opt_cache_dir_; friend class paddle_infer::experimental::InternalUtils; + + // fleet exe related + DistConfig dist_config_{}; }; } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 0281fd91765..8c96499a022 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -720,6 +720,12 @@ inference_analysis_test(test_analyzer_zerocopytensor_tensor SRCS analyzer_zeroco EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model) +if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) + inference_analysis_test(test_analyzer_dist_model SRCS analyzer_dist_model_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${OCR_INSTALL_DIR}/model) +endif() + inference_analysis_test(test_analyzer_paddletensor_tensor SRCS analyzer_paddle_tensor_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model --infer_data=${OCR_INSTALL_DIR}/data.txt --refer_result=${OCR_INSTALL_DIR}/result.txt) diff --git a/paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc new file mode 100644 index 00000000000..7cf6e2adfc6 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/inference/utils/singleton.h" + +namespace paddle { +namespace inference { + +TEST(test_dist_model, dist_model) { + std::cout << "Analysis Predictor DistModel test." 
<< std::endl; + AnalysisConfig config; + config.SetModel(FLAGS_infer_model + "/__model__", + FLAGS_infer_model + "/__params__"); + config.SwitchUseFeedFetchOps(false); + config.EnableUseGpu(100, 0); + DistConfig dist_config; + dist_config.SetRanks(1, 0); + dist_config.EnableDistModel(true); + dist_config.SetEndpoints({""}, ""); + config.SetDistConfig(dist_config); + + auto predictor = paddle_infer::CreatePredictor(config); + int batch_size = 1; + int channels = 1; + int height = 48; + int width = 512; + int nums = batch_size * channels * height * width; + std::cout << "Created predictor." << std::endl; + + float* input = new float[nums]; + for (int i = 0; i < nums; ++i) input[i] = 0; + auto input_names = predictor->GetInputNames(); + + auto input_t = predictor->GetInputHandle(input_names[0]); + input_t->Reshape({batch_size, channels, height, width}); + input_t->CopyFromCpu(input); + std::cout << "Input data." << std::endl; + + predictor->Run(); + std::cout << "Zero Copy Run." << std::endl; + + std::vector out_data; + auto output_names = predictor->GetOutputNames(); + auto output_t = predictor->GetOutputHandle(output_names[0]); + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + out_data.resize(out_num); + output_t->CopyToCpu(out_data.data()); + std::cout << "Output data." << std::endl; + delete[] input; +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index b29cc10e8f5..8491d1e2249 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -168,7 +168,7 @@ void BindFleetExecutor(py::module* m) { .def("set_run_at_offset", &TaskNode::SetRunAtOffset) .def("set_type", &TaskNode::SetType) .def("role", &TaskNode::role) - .def("init", &TaskNode::Init) + .def("init", [](TaskNode& self) { self.Init(); }) .def("set_program", &TaskNode::SetProgram); py::class_(*m, "DistModelConfig") diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index eafd5baab7d..9b5041154c9 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -658,7 +658,24 @@ void BindAnalysisConfig(py::module *m) { return dynamic_cast(self.pass_builder()); }, py::return_value_policy::reference) - .def("nnadapter", &AnalysisConfig::NNAdapter); + .def("nnadapter", &AnalysisConfig::NNAdapter) + .def("set_dist_config", &AnalysisConfig::SetDistConfig) + .def("dist_config", &AnalysisConfig::dist_config); + + py::class_(*m, "DistConfig") + .def(py::init<>()) + .def("set_carrier_id", &DistConfig::SetCarrierId) + .def("set_comm_init_config", &DistConfig::SetCommInitConfig) + .def("set_endpoints", &DistConfig::SetEndpoints) + .def("set_ranks", &DistConfig::SetRanks) + .def("enable_dist_model", &DistConfig::EnableDistModel) + .def("carrier_id", &DistConfig::carrier_id) + .def("current_endpoint", &DistConfig::current_endpoint) + .def("trainer_endpoints", &DistConfig::trainer_endpoints) + .def("nranks", &DistConfig::nranks) + .def("rank", &DistConfig::rank) + .def("comm_init_config", &DistConfig::comm_init_config) + .def("use_dist_model", &DistConfig::use_dist_model); } void BindLiteNNAdapterConfig(py::module *m) { diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index e372727b0f0..a7971763f53 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -2034,8 +2034,11 @@ 
class Executor(object): fleet_opt['task_id_to_rank'] = task_id_to_rank place = core.Place() place.set_place(self.place) + # NOTE: the last argument is used to force create some vars in root scope, + # won't be used during train. self._fleet_executor.init(carrier_id, program.desc, scope, place, - num_micro_batches, tasks, task_id_to_rank) + num_micro_batches, tasks, task_id_to_rank, + []) def _run_using_fleet_executor(self, program=None, -- GitLab From bc113e10487115fd91cfc738c4279372eeb7c2a2 Mon Sep 17 00:00:00 2001 From: joeqiao12 <45232181+joeqiao12@users.noreply.github.com> Date: Wed, 2 Mar 2022 15:29:24 +0800 Subject: [PATCH 049/272] add logic kernel for mlu (#39940) --- .../operators/controlflow/compare_op_mlu.cc | 200 ++++++++++++++++++ .../unittests/mlu/test_compare_op_mlu.py | 157 ++++++++++++++ 2 files changed, 357 insertions(+) create mode 100644 paddle/fluid/operators/controlflow/compare_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py diff --git a/paddle/fluid/operators/controlflow/compare_op_mlu.cc b/paddle/fluid/operators/controlflow/compare_op_mlu.cc new file mode 100644 index 00000000000..9dc287ab76a --- /dev/null +++ b/paddle/fluid/operators/controlflow/compare_op_mlu.cc @@ -0,0 +1,200 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class EqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_EQ, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class NotEqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_NE, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class LessThanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_LT, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class LessEqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_LE, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class GreaterThanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_GT, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class GreaterEqualMLUKernel : 
public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_GE, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL( + equal, ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel); + +REGISTER_OP_MLU_KERNEL( + not_equal, ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel); + +REGISTER_OP_MLU_KERNEL( + less_than, ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel); + +REGISTER_OP_MLU_KERNEL( + less_equal, ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel); + +REGISTER_OP_MLU_KERNEL( + greater_than, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel); + +REGISTER_OP_MLU_KERNEL( + greater_equal, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py new file mode 100644 index 00000000000..87997acce02 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py @@ -0,0 +1,157 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +def create_test_class(op_type, typename, callback): + class Cls(OpTest): + def setUp(self): + self.set_mlu() + self.place = paddle.MLUPlace(0) + x = np.random.random(size=(10, 7)).astype(typename) + y = np.random.random(size=(10, 7)).astype(typename) + out = callback(x, y) + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': out} + self.op_type = op_type + + def set_mlu(self): + self.__class__.use_mlu = True + + def test_output(self): + self.check_output_with_place(place=self.place) + + def test_errors(self): + paddle.enable_static() + with program_guard(Program(), Program()): + a = fluid.layers.data(name='a', shape=[2], dtype='float32') + b = fluid.layers.data(name='b', shape=[2], dtype='float32') + c = fluid.layers.data(name='c', shape=[2], dtype='int16') + d = fluid.create_lod_tensor(np.array([[-1]]), [[1]], self.place) + + op = eval("fluid.layers.%s" % self.op_type) + self.assertRaises(TypeError, op, x=a, y=b, axis=True) + self.assertRaises(TypeError, op, x=a, y=b, force_cpu=1) + self.assertRaises(TypeError, op, x=a, y=b, cond=1) + self.assertRaises(TypeError, op, x=a, y=c) + self.assertRaises(TypeError, op, x=c, y=a) + self.assertRaises(TypeError, op, x=a, y=d) + self.assertRaises(TypeError, op, x=d, y=a) + self.assertRaises(TypeError, op, x=c, y=d) + + def test_dynamic_api(self): + paddle.disable_static() + paddle.set_device('mlu:0') + x = np.random.random(size=(10, 7)).astype(typename) + y = np.random.random(size=(10, 7)).astype(typename) + real_result = callback(x, y) + x = paddle.to_tensor(x, dtype=typename) + y = paddle.to_tensor(y, dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + self.assertEqual((out.numpy() == real_result).all(), True) + + @unittest.skipIf(typename == 'float16', "float16 is not supported now") + def test_broadcast_api_1(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.static.data( + name='x', shape=[1, 2, 1, 3], dtype=typename) + y = paddle.static.data( + name='y', shape=[1, 2, 3], dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(typename) + input_y = np.arange(0, 6).reshape((1, 2, 3)).astype(typename) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + + @unittest.skipIf(typename == 'float16', "float16 is not supported now") + def test_broadcast_api_2(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.static.data( + name='x', shape=[1, 2, 3], dtype=typename) + y = paddle.static.data( + name='y', shape=[1, 2, 1, 3], dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.arange(0, 6).reshape((1, 2, 3)).astype(typename) + input_y = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(typename) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + + @unittest.skipIf(typename == 'float16', "float16 is not supported now") + def test_broadcast_api_3(self): + paddle.enable_static() + with 
program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[5], dtype=typename) + y = paddle.static.data(name='y', shape=[3, 1], dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.arange(0, 5).reshape((5)).astype(typename) + input_y = np.array([5, 3, 2]).reshape((3, 1)).astype(typename) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + + @unittest.skipIf(typename == 'float16', "float16 is not supported now") + def test_attr_name(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = fluid.layers.data(name='x', shape=[4], dtype=typename) + y = fluid.layers.data(name='y', shape=[4], dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x=x, y=y, name="name_%s" % (self.op_type)) + self.assertEqual("name_%s" % (self.op_type) in out.name, True) + + cls_name = "{0}_{1}".format(op_type, typename) + Cls.__name__ = cls_name + globals()[cls_name] = Cls + + +for _type_name in {'float16', 'float32', 'int32', 'bool'}: + if _type_name == 'int32' or _type_name == 'bool': + create_test_class('equal', _type_name, lambda _a, _b: _a == _b) + continue + create_test_class('equal', _type_name, lambda _a, _b: _a == _b) + create_test_class('not_equal', _type_name, lambda _a, _b: _a != _b) + create_test_class('less_than', _type_name, lambda _a, _b: _a < _b) + create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b) + create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b) + create_test_class('greater_equal', _type_name, lambda _a, _b: _a >= _b) + +if __name__ == '__main__': + unittest.main() -- GitLab From 0c3f7fbcfe68bfb34b0ed5d9aad6e3a8c0cca43f Mon Sep 17 00:00:00 2001 From: chenjian Date: Wed, 2 Mar 2022 15:30:09 +0800 Subject: [PATCH 050/272] Upgrade new profiler (#39984) * add new profiler components * fix bug * upgrade new profiler * fix operator.cc * fix operator.cc * fix cmakelists.txt * fix bug * fix according to pr * fix bug * fix cmake * fix bug * fix a bug * fix bug * fix bug --- paddle/fluid/framework/operator.cc | 8 +- paddle/fluid/platform/profiler/CMakeLists.txt | 10 +- .../platform/profiler/chrometracing_logger.cc | 320 ++++++++++++++---- .../platform/profiler/chrometracing_logger.h | 11 + .../platform/profiler/cpu_utilization.cc | 47 ++- .../platform/profiler/dump/CMakeLists.txt | 3 - .../profiler/dump/deserialization_reader.cc | 16 +- .../profiler/dump/deserialization_reader.h | 4 +- .../platform/profiler/dump/nodetree.proto | 27 +- .../profiler/dump/serialization_logger.cc | 12 + .../profiler/dump/serialization_logger.h | 5 + .../dump/test_serialization_logger.cc | 28 +- .../fluid/platform/profiler/event_python.cc | 122 +++++++ paddle/fluid/platform/profiler/event_python.h | 26 +- paddle/fluid/platform/profiler/profiler.cc | 35 +- paddle/fluid/platform/profiler/profiler.h | 10 +- .../fluid/platform/profiler/profiler_test.cc | 11 +- paddle/fluid/platform/profiler/trace_event.h | 2 + 18 files changed, 578 insertions(+), 119 deletions(-) mode change 100755 => 100644 paddle/fluid/platform/profiler/dump/serialization_logger.h create mode 100644 paddle/fluid/platform/profiler/event_python.cc diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b12ad552aba..b91ee3c2d63 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -264,10 +264,10 @@ 
void OperatorBase::Run(const Scope& scope, const platform::Place& place) { // and different op name cost time,we set two event. platform::RecordEvent op_type_record_event( Type(), platform::TracerEventType::Operator, 1); - // auto op_name = platform::OpName(outputs_, Type()); - // platform::RecordEvent op_name_record_event( - // op_name, platform::TracerEventType::Operator, 1, - // platform::EventRole::kUniqueOp); + auto op_name = platform::OpName(outputs_, Type()); + platform::RecordEvent op_name_record_event( + op_name, platform::TracerEventType::Operator, 10, + platform::EventRole::kUniqueOp); RunImpl(scope, place); } diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index 5acdfa39569..c903a52530c 100755 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -2,10 +2,12 @@ cc_library(host_tracer SRCS host_tracer.cc DEPS enforce) cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) cc_library(event_node SRCS event_node.cc DEPS enforce) cc_library(profiler_utils SRCS utils.cc DEPS enforce glog) -cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node profiler_utils) -cc_test(test_event_node SRCS test_event_node.cc DEPS event_node chrometracinglogger) add_subdirectory(dump) +cc_library(profiler_logger SRCS chrometracing_logger.cc dump/serialization_logger.cc dump/deserialization_reader.cc DEPS nodetreeproto event_node profiler_utils) +cc_library(event_bind SRCS event_python.cc DEPS profiler_logger) cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog) +cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind) +cc_test(test_event_node SRCS test_event_node.cc DEPS event_node profiler_logger) cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils) -cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization) -cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node) +cc_test(test_serialization_logger SRCS dump/test_serialization_logger.cc DEPS event_bind) +cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler) diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 7b207ea7b20..4061e2d4d49 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -18,40 +18,17 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/os_info.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler/chrometracing_logger.h" #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/utils.h" namespace paddle { namespace platform { static const char* kSchemaVersion = "1.0.0"; static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.json"; -static uint32_t num_span = 0; - -static int64_t nsToUs(int64_t ns) { return ns / 1000; } - -template -std::string string_format(const std::string& format, Args... args) { - int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) 
+ - 1; // Extra space for '\0' - PADDLE_ENFORCE_GE(size_s, 0, platform::errors::Fatal( - "Error during profiler data formatting.")); - auto size = static_cast(size_s); - auto buf = std::make_unique(size); - std::snprintf(buf.get(), size, format.c_str(), args...); - return std::string(buf.get(), size - 1); // exclude the '\0' -} - -std::string GetStringFormatLocalTime() { - std::time_t rawtime; - std::tm* timeinfo; - char buf[100]; - std::time(&rawtime); - timeinfo = std::localtime(&rawtime); - std::strftime(buf, 100, "%F-%X", timeinfo); - return std::string(buf); -} +static uint32_t span_indx = 0; static std::string DefaultFileName() { auto pid = GetProcessId(); @@ -60,16 +37,19 @@ static std::string DefaultFileName() { } const char* ChromeTracingLogger::categary_name_[] = { - "operator", "dataloader", "profile_step", "cuda_runtime", "kernel", - "memcpy", "memset", "user_defined", "others"}; + "Operator", "Dataloader", "ProfileStep", "CudaRuntime", + "Kernel", "Memcpy", "Memset", "UserDefined", + "OperatorInner", "Forward", "Backward", "Optimization", + "Communication", "PythonOp", "PythonUserDefined"}; void ChromeTracingLogger::OpenFile() { output_file_stream_.open(filename_, std::ofstream::out | std::ofstream::trunc); if (!output_file_stream_) { - VLOG(2) << "Unable to open file for writing profiling data." << std::endl; + LOG(WARNING) << "Unable to open file for writing profiling data." + << std::endl; } else { - VLOG(0) << "writing profiling data to " << filename_ << std::endl; + LOG(INFO) << "writing profiling data to " << filename_ << std::endl; } } @@ -122,21 +102,54 @@ void ChromeTracingLogger::LogHostTraceEventNode( if (!output_file_stream_) { return; } - output_file_stream_ << string_format( - std::string( - R"JSON( + switch (host_node.Type()) { + case TracerEventType::ProfileStep: + case TracerEventType::Forward: + case TracerEventType::Backward: + case TracerEventType::Dataloader: + case TracerEventType::Optimization: + case TracerEventType::PythonOp: + case TracerEventType::PythonUserDefined: + output_file_stream_ << string_format( + std::string( + R"JSON( { - "name": "%s", "pid": %lld, "tid": %lld, + "name": "%s", "pid": %lld, "tid": "%lld(Python)", "ts": %lld, "dur": %lld, "ph": "X", "cat": "%s", "args": { - + "start_ns": %lld, + "end_ns": %lld } }, )JSON"), - host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(), - nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()), - categary_name_[static_cast(host_node.Type())]); + host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(), + nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()), + categary_name_[static_cast(host_node.Type())], + host_node.StartNs(), host_node.EndNs()); + break; + default: + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "%s", "pid": %lld, "tid": "%lld(C++)", + "ts": %lld, "dur": %lld, + "ph": "X", "cat": "%s", + "args": { + "start_ns": %lld, + "end_ns": %lld + } + }, + )JSON"), + host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(), + nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()), + categary_name_[static_cast(host_node.Type())], + host_node.StartNs(), host_node.EndNs()); + break; + } + + pid_tid_set_.insert({host_node.ProcessId(), host_node.ThreadId()}); } void ChromeTracingLogger::LogRuntimeTraceEventNode( @@ -148,11 +161,13 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": %lld, + "name": "%s", "pid": %lld, "tid": "%lld(C++)", "ts": %lld, 
"dur": %lld, "ph": "X", "cat": "%s", "args": { - "correlation id": %d + "correlation id": %d, + "start_ns": %lld, + "end_ns": %lld } }, )JSON"), @@ -160,7 +175,23 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( runtime_node.ThreadId(), nsToUs(runtime_node.StartNs()), nsToUs(runtime_node.Duration()), categary_name_[static_cast(runtime_node.Type())], - runtime_node.CorrelationId()); + runtime_node.CorrelationId(), runtime_node.StartNs(), + runtime_node.EndNs()); + pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()}); + + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "launch", "id": %d, "pid": %lld, "tid": "%lld(C++)", + "ts": %lld, + "ph": "s", "cat": "async" + }, + )JSON"), + runtime_node.CorrelationId(), runtime_node.ProcessId(), + runtime_node.ThreadId(), + nsToUs((runtime_node.StartNs() + runtime_node.EndNs()) >> 1)); + pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()}); } void ChromeTracingLogger::LogDeviceTraceEventNode( @@ -180,6 +211,36 @@ void ChromeTracingLogger::LogDeviceTraceEventNode( default: break; } + if (nsToUs(device_node.Duration()) == 0) { + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "launch", "id": %d, "pid": %lld, "tid": %lld, + "ts": %lld, + "ph": "f", "cat": "async" + }, + )JSON"), + device_node.CorrelationId(), device_node.DeviceId(), + device_node.StreamId(), nsToUs(device_node.StartNs())); + deviceid_streamid_set_.insert( + {device_node.DeviceId(), device_node.StreamId()}); + } else { + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "launch", "id": %d, "pid": %lld, "tid": %lld, + "ts": %lld, + "ph": "f", "cat": "async", "bp": "e" + }, + )JSON"), + device_node.CorrelationId(), device_node.DeviceId(), + device_node.StreamId(), + nsToUs((device_node.StartNs() + device_node.EndNs()) >> 1)); + deviceid_streamid_set_.insert( + {device_node.DeviceId(), device_node.StreamId()}); + } } void ChromeTracingLogger::HandleTypeKernel( @@ -188,16 +249,21 @@ void ChromeTracingLogger::HandleTypeKernel( float blocks_per_sm = 0.0; float warps_per_sm = 0.0; float occupancy = 0.0; -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUPTI) constexpr int threads_per_warp = 32; const gpuDeviceProp& device_property = GetDeviceProperties(device_node.DeviceId()); - blocks_per_sm = - (kernel_info.grid_x * kernel_info.grid_y * kernel_info.grid_z) / - device_property.multiProcessorCount; + blocks_per_sm = static_cast(kernel_info.grid_x * kernel_info.grid_y * + kernel_info.grid_z) / + device_property.multiProcessorCount; warps_per_sm = blocks_per_sm * (kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) / threads_per_warp; + occupancy = CalculateEstOccupancy( + device_node.DeviceId(), kernel_info.registers_per_thread, + kernel_info.static_shared_memory, kernel_info.dynamic_shared_memory, + kernel_info.block_x, kernel_info.block_y, kernel_info.block_z, + blocks_per_sm); #endif output_file_stream_ << string_format( @@ -208,15 +274,17 @@ void ChromeTracingLogger::HandleTypeKernel( "ts": %lld, "dur": %lld, "ph": "X", "cat": "%s", "args": { + "start_ns": %lld, + "end_ns": %lld, "device": %d, "context": %d, "stream": %d, "correlation id": %d, "registers per thread": %d, - "shared memory": %f, + "shared memory": %d, "blocks per SM": %f, "warps per SM": %f, "grid": [%d, %d, %d], "block": [%d, %d, %d], - "est. 
achieved occupancy %": %f + "theoretical achieved occupancy %%": %f } }, )JSON"), @@ -224,12 +292,13 @@ void ChromeTracingLogger::HandleTypeKernel( device_node.StreamId(), nsToUs(device_node.StartNs()), nsToUs(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.DeviceId(), device_node.ContextId(), device_node.StreamId(), + device_node.StartNs(), device_node.EndNs(), device_node.DeviceId(), + device_node.ContextId(), device_node.StreamId(), device_node.CorrelationId(), kernel_info.registers_per_thread, kernel_info.static_shared_memory + kernel_info.dynamic_shared_memory, blocks_per_sm, warps_per_sm, kernel_info.grid_x, kernel_info.grid_y, kernel_info.grid_z, kernel_info.block_x, kernel_info.block_y, - kernel_info.block_z, occupancy); + kernel_info.block_z, occupancy * 100); } void ChromeTracingLogger::HandleTypeMemcpy( @@ -247,6 +316,8 @@ void ChromeTracingLogger::HandleTypeMemcpy( "ts": %lld, "dur": %lld, "ph": "X", "cat": "%s", "args": { + "start_ns": %lld, + "end_ns": %lld, "stream": %d, "correlation id": %d, "bytes": %d, "memory bandwidth (GB/s)": %f } @@ -256,8 +327,8 @@ void ChromeTracingLogger::HandleTypeMemcpy( device_node.StreamId(), nsToUs(device_node.StartNs()), nsToUs(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.StreamId(), device_node.CorrelationId(), - memcpy_info.num_bytes, memory_bandwidth); + device_node.StartNs(), device_node.EndNs(), device_node.StreamId(), + device_node.CorrelationId(), memcpy_info.num_bytes, memory_bandwidth); } void ChromeTracingLogger::HandleTypeMemset( @@ -271,6 +342,8 @@ void ChromeTracingLogger::HandleTypeMemset( "ts": %lld, "dur": %lld, "ph": "X", "cat": "%s", "args": { + "start_ns": %lld, + "end_ns": %lld, "device": %d, "context": %d, "stream": %d, "correlation id": %d, "bytes": %d, "value": %d @@ -281,7 +354,8 @@ void ChromeTracingLogger::HandleTypeMemset( device_node.StreamId(), nsToUs(device_node.StartNs()), nsToUs(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.DeviceId(), device_node.ContextId(), device_node.StreamId(), + device_node.StartNs(), device_node.EndNs(), device_node.DeviceId(), + device_node.ContextId(), device_node.StreamId(), device_node.CorrelationId(), memset_info.num_bytes, memset_info.value); } @@ -290,10 +364,10 @@ void ChromeTracingLogger::StartLog() { R"JSON( { "schemaVersion": "%s", - "displayTimeUnit": "us", - "SpanNumber": "%d", + "displayTimeUnit": "ms", + "span_indx": "%d", )JSON"), - kSchemaVersion, num_span); + kSchemaVersion, span_indx++); // add device property information #if defined(PADDLE_WITH_CUDA) output_file_stream_ << std::string(R"JSON( @@ -358,11 +432,143 @@ void ChromeTracingLogger::StartLog() { )JSON"); } -void ChromeTracingLogger::EndLog() { +void ChromeTracingLogger::LogMetaInfo( + const std::unordered_map extra_info) { + RefineDisplayName(extra_info); output_file_stream_ << std::string( R"JSON( {} - ] + ], + )JSON"); + output_file_stream_ << std::string(R"JSON( + "ExtraInfo": {)JSON"); + size_t count = extra_info.size(); + for (const auto& kv : extra_info) { + if (count > 1) { + output_file_stream_ << string_format(std::string(R"JSON( + "%s": "%s", + )JSON"), + kv.first.c_str(), kv.second.c_str()); + } else { + output_file_stream_ << string_format(std::string(R"JSON( + "%s": "%s" + )JSON"), + kv.first.c_str(), kv.second.c_str()); + } + count--; + } + output_file_stream_ << std::string(R"JSON( + })JSON"); +} + +void ChromeTracingLogger::RefineDisplayName( + std::unordered_map 
extra_info) { + for (auto it = pid_tid_set_.begin(); it != pid_tid_set_.end(); ++it) { + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "process_name", "pid": %lld, "tid": "%lld(Python)", + "ph": "M", + "args": { + "name": "Process %lld (CPU)" + } + }, + { + "name": "process_name", "pid": %lld, "tid": "%lld(C++)", + "ph": "M", + "args": { + "name": "Process %lld (CPU)" + } + }, + { + "name": "thread_name", "pid": %lld, "tid": "%lld(Python)", + "ph": "M", + "args": { + "name": "thread %lld:%s(Python)" + } + }, + { + "name": "thread_name", "pid": %lld, "tid": "%lld(C++)", + "ph": "M", + "args": { + "name": "thread %lld:%s(C++)" + } + }, + { + "name": "process_sort_index", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "sort_index": %lld + } + }, + { + "name": "thread_sort_index", "pid": %lld, "tid": "%lld(Python)", + "ph": "M", + "args": { + "sort_index": %lld + } + }, + { + "name": "thread_sort_index", "pid": %lld, "tid": "%lld(C++)", + "ph": "M", + "args": { + "sort_index": %lld + } + }, + )JSON"), + (*it).first, (*it).second, (*it).first, (*it).first, (*it).second, + (*it).first, (*it).first, (*it).second, (*it).second, + extra_info[string_format(std::string("%lld"), (*it).second)].c_str(), + (*it).first, (*it).second, (*it).second, + extra_info[string_format(std::string("%lld"), (*it).second)].c_str(), + (*it).first, (*it).second, (*it).first, (*it).first, (*it).second, + (*it).second * 2, (*it).first, (*it).second, (*it).second * 2 + 1); + } + + for (auto it = deviceid_streamid_set_.begin(); + it != deviceid_streamid_set_.end(); ++it) { + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "process_name", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "name": "Deivce %lld (GPU)" + } + }, + { + "name": "thread_name", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "name": "stream %lld" + } + }, + { + "name": "process_sort_index", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "sort_index": %lld + } + }, + { + "name": "thread_sort_index", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "sort_index": %lld + } + }, + )JSON"), + (*it).first, (*it).second, (*it).first, (*it).first, (*it).second, + (*it).second, (*it).first, (*it).second, (*it).first + 0x10000000, + (*it).first, (*it).second, (*it).second); + } +} + +void ChromeTracingLogger::EndLog() { + output_file_stream_ << std::string( + R"JSON( } )JSON"); } diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h index 06734418609..20a924a54ca 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.h +++ b/paddle/fluid/platform/profiler/chrometracing_logger.h @@ -13,11 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include #include "paddle/fluid/platform/profiler/output_logger.h" namespace paddle { namespace platform { +// Dump a NodeTrees into a chrome tracing file. +// A ChromeTracingLogger object can only dump a NodeTrees object, +// creates a file in the constructor and closes the file in the destructor. +// should only call LogNodeTrees and LogMetaInfo in order. 
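+//
+// A minimal usage sketch (illustrative only, not part of this header):
+// `trees` stands for a populated NodeTrees and `extra_info` for the
+// extra-info string map produced by the profiler; both names are placeholders.
+//
+//   ChromeTracingLogger logger("trace.json");   // constructor opens the output file
+//   logger.LogNodeTrees(trees);                 // dumps host/runtime/device events
+//   logger.LogMetaInfo(extra_info);             // closes the event array and
+//                                               // appends the "ExtraInfo" section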
class ChromeTracingLogger : public BaseLogger { public: explicit ChromeTracingLogger(const std::string& filename); @@ -28,6 +35,7 @@ class ChromeTracingLogger : public BaseLogger { void LogHostTraceEventNode(const HostTraceEventNode&) override; void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; void LogNodeTrees(const NodeTrees&) override; + void LogMetaInfo(const std::unordered_map); private: void OpenFile(); @@ -36,9 +44,12 @@ class ChromeTracingLogger : public BaseLogger { void HandleTypeMemcpy(const DeviceTraceEventNode&); void StartLog(); void EndLog(); + void RefineDisplayName(std::unordered_map); std::string filename_; std::ofstream output_file_stream_; static const char* categary_name_[]; + std::set> pid_tid_set_; + std::set> deviceid_streamid_set_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/cpu_utilization.cc b/paddle/fluid/platform/profiler/cpu_utilization.cc index 672a9a15453..ce2e49a1ccd 100644 --- a/paddle/fluid/platform/profiler/cpu_utilization.cc +++ b/paddle/fluid/platform/profiler/cpu_utilization.cc @@ -54,19 +54,16 @@ void CpuUtilization::RecordBeginTimeInfo() { if (stat_file != nullptr) { char temp_str[200]; uint64_t temp_lu; - while (true) { - int retval = fscanf( - stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 - "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, - temp_str, &system_tms_start_.tms_utime, &nice_time_start_, - &system_tms_start_.tms_stime, &idle_start_, &iowait_start_, - &irq_start_, &softirq_start_, &steal_start_, &temp_lu, &temp_lu); - if (std::string(temp_str).find("cpu") != 0) { - break; - } - if (retval != 11) { - return; - } + int retval = fscanf( + stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_start_.tms_utime, &nice_time_start_, + &system_tms_start_.tms_stime, &idle_start_, &iowait_start_, &irq_start_, + &softirq_start_, &steal_start_, &temp_lu, &temp_lu); + if (retval != 11) { + LOG(WARNING) + << "Failed to read cpu utilization information at record beginning." + << std::endl; } fclose(stat_file); } @@ -90,19 +87,17 @@ void CpuUtilization::RecordEndTimeInfo() { if (stat_file != nullptr) { char temp_str[200]; uint64_t temp_lu; - while (true) { - int retval = fscanf( - stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 - "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, - temp_str, &system_tms_end_.tms_utime, &nice_time_end_, - &system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_, - &softirq_end_, &steal_end_, &temp_lu, &temp_lu); - if (std::string(temp_str).find("cpu") != 0) { - break; - } - if (retval != 11) { - return; - } + int retval = fscanf( + stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_end_.tms_utime, &nice_time_end_, + &system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_, + &softirq_end_, &steal_end_, &temp_lu, &temp_lu); + + if (retval != 11) { + LOG(WARNING) + << "Failed to read cpu utilization information at record end." 
+ << std::endl; } fclose(stat_file); } diff --git a/paddle/fluid/platform/profiler/dump/CMakeLists.txt b/paddle/fluid/platform/profiler/dump/CMakeLists.txt index e25333f7a8a..5045c56afbc 100644 --- a/paddle/fluid/platform/profiler/dump/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/dump/CMakeLists.txt @@ -1,4 +1 @@ proto_library(nodetreeproto SRCS nodetree.proto) -cc_library(serialization_logger SRCS serialization_logger.cc DEPS nodetreeproto event_node) -cc_library(deserialization_reader SRCS deserialization_reader.cc DEPS nodetreeproto event_node) -cc_test(test_serialization_logger SRCS test_serialization_logger.cc DEPS serialization_logger deserialization_reader event_node) diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index d1049a7dc19..de3411579d3 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" - #include +#include "paddle/fluid/platform/profiler/extra_info.h" namespace paddle { namespace platform { @@ -36,11 +36,19 @@ void DeserializationReader::OpenFile() { } } -std::unique_ptr DeserializationReader::Parse() { +std::unique_ptr DeserializationReader::Parse() { if (!node_trees_proto_->ParseFromIstream(&input_file_stream_)) { VLOG(2) << "Unable to load node trees in protobuf." << std::endl; return nullptr; } + // restore extra info + ExtraInfo extrainfo; + for (auto indx = 0; indx < node_trees_proto_->extra_info_size(); indx++) { + ExtraInfoMap extra_info_map = node_trees_proto_->extra_info(indx); + extrainfo.AddExtraInfo(extra_info_map.key(), std::string("%s"), + extra_info_map.value().c_str()); + } + // restore NodeTrees std::map thread_event_trees_map; for (int node_tree_index = 0; node_tree_index < node_trees_proto_->thread_trees_size(); @@ -95,7 +103,9 @@ std::unique_ptr DeserializationReader::Parse() { } } // restore NodeTrees object - return std::unique_ptr(new NodeTrees(thread_event_trees_map)); + std::unique_ptr tree(new NodeTrees(thread_event_trees_map)); + return std::unique_ptr( + new ProfilerResult(std::move(tree), extrainfo)); } DeserializationReader::~DeserializationReader() { diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h index 1ad2dabf229..e6feb4f9489 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.h +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/platform/profiler/dump/nodetree.pb.h" -#include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/event_python.h" namespace paddle { namespace platform { @@ -24,7 +24,7 @@ class DeserializationReader { explicit DeserializationReader(const std::string& filename); explicit DeserializationReader(const char* filename); ~DeserializationReader(); - std::unique_ptr Parse(); + std::unique_ptr Parse(); private: void OpenFile(); diff --git a/paddle/fluid/platform/profiler/dump/nodetree.proto b/paddle/fluid/platform/profiler/dump/nodetree.proto index 37dac0e597c..7016745059d 100644 --- a/paddle/fluid/platform/profiler/dump/nodetree.proto +++ b/paddle/fluid/platform/profiler/dump/nodetree.proto @@ -32,9 +32,21 @@ enum TracerEventTypeProto { Memset = 6; // Used to mark record defined by user UserDefined = 7; - // A flag to denote the number of current types - NumTypes = 8; -} + // Used to mark operator detail, (such as infer shape, compute) + OperatorInner = 8; + // Used to mark model training or testing perspective, forward process + Forward = 9; + // Used to mark model training perspective, backward process + Backward = 10; + // Used to mark model training perspective, optimization process + Optimization = 11; + // Used to mark distributed training perspective + Communication = 12; + // Used to mark python api + PythonOp = 13; + // Used to mark python level userdefined + PythonUserDefined = 14; +}; message KernelEventInfoProto { // The X-dimension block size for the kernel. @@ -175,7 +187,14 @@ message ThreadNodeTreeProto { repeated HostTraceEventNodeProto host_nodes = 2; } +message ExtraInfoMap { + required string key = 1; + required string value = 2; +} + message NodeTreesProto { required string version = 1; - repeated ThreadNodeTreeProto thread_trees = 2; + required uint32 span_indx = 2; + repeated ThreadNodeTreeProto thread_trees = 3; + repeated ExtraInfoMap extra_info = 4; } diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index d9ed84bd438..73021f4362a 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -13,6 +13,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler/dump/serialization_logger.h" #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/utils.h" namespace paddle { @@ -20,6 +21,7 @@ namespace platform { static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb"; static const char* version = "1.0.0"; +static uint32_t span_indx = 0; static std::string DefaultFileName() { auto pid = GetProcessId(); @@ -39,6 +41,7 @@ void SerializationLogger::OpenFile() { } node_trees_proto_ = new NodeTreesProto(); node_trees_proto_->set_version(std::string(version)); + node_trees_proto_->set_span_indx(span_indx++); } void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { @@ -240,6 +243,15 @@ void SerializationLogger::HandleTypeMemset( device_trace_event); } +void SerializationLogger::LogMetaInfo( + const std::unordered_map extra_info) { + for (const auto& kv : extra_info) { + ExtraInfoMap* extra_info_map = node_trees_proto_->add_extra_info(); + extra_info_map->set_key(kv.first); + extra_info_map->set_value(kv.second); + } +} + SerializationLogger::SerializationLogger(const std::string& filename) { filename_ = filename.empty() ? DefaultFileName() : filename; OpenFile(); diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h old mode 100755 new mode 100644 index 1295be95d45..378834cff59 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.h +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -11,6 +11,8 @@ limitations under the License. */ #pragma once +#include + #include "paddle/fluid/platform/profiler/dump/nodetree.pb.h" #include "paddle/fluid/platform/profiler/output_logger.h" @@ -20,6 +22,7 @@ namespace platform { // Dump a NodeTrees into a profobuf file. // A SerializationLogger object can only dump a NodeTrees object, // creates a file in the constructor and closes the file in the destructor. +// Should only call LogNodeTrees and LogMetaInfo. 
class SerializationLogger : public BaseLogger { public: explicit SerializationLogger(const std::string& filename); @@ -30,12 +33,14 @@ class SerializationLogger : public BaseLogger { void LogHostTraceEventNode(const HostTraceEventNode&) override; void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; void LogNodeTrees(const NodeTrees&) override; + void LogMetaInfo(const std::unordered_map); private: void OpenFile(); void HandleTypeKernel(const DeviceTraceEventNode&); void HandleTypeMemset(const DeviceTraceEventNode&); void HandleTypeMemcpy(const DeviceTraceEventNode&); + std::string filename_; std::ofstream output_file_stream_; NodeTreesProto* node_trees_proto_; diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc index 2fe9626ec76..dee1019da2b 100644 --- a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" #include "paddle/fluid/platform/profiler/dump/serialization_logger.h" #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/event_python.h" using paddle::platform::SerializationLogger; using paddle::platform::DeserializationReader; @@ -31,6 +32,7 @@ using paddle::platform::TracerEventType; using paddle::platform::KernelEventInfo; using paddle::platform::MemcpyEventInfo; using paddle::platform::MemsetEventInfo; +using paddle::platform::ProfilerResult; TEST(SerializationLoggerTest, dump_case0) { std::list host_events; @@ -149,7 +151,8 @@ TEST(SerializationLoggerTest, dump_case1) { TEST(DeserializationReaderTest, restore_case0) { DeserializationReader reader("test_serialization_logger_case0.pb"); - std::unique_ptr tree = reader.Parse(); + auto profiler_result = reader.Parse(); + auto& tree = profiler_result->GetNodeTrees(); std::map> nodes = tree->Traverse(true); EXPECT_EQ(nodes[10].size(), 4u); @@ -172,3 +175,26 @@ TEST(DeserializationReaderTest, restore_case0) { } } } + +TEST(DeserializationReaderTest, restore_case1) { + DeserializationReader reader("test_serialization_logger_case1.pb"); + auto profiler_result = reader.Parse(); + auto& tree = profiler_result->GetNodeTrees(); + std::map> nodes = + tree->Traverse(true); + EXPECT_EQ(nodes[10].size(), 1u); + EXPECT_EQ(nodes[11].size(), 1u); + std::vector thread1_nodes = nodes[10]; + std::vector thread2_nodes = nodes[11]; + for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 3u); + } + } + for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } +} diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc new file mode 100644 index 00000000000..1a6f19d2f93 --- /dev/null +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -0,0 +1,122 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/profiler/event_python.h" +#include "paddle/fluid/platform/profiler/chrometracing_logger.h" +#include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" +#include "paddle/fluid/platform/profiler/dump/serialization_logger.h" +#include "paddle/fluid/platform/profiler/extra_info.h" + +namespace paddle { +namespace platform { + +HostPythonNode::~HostPythonNode() { + // delete all runtime or device nodes and recursive delete children + for (auto it = children_node_ptrs.begin(); it != children_node_ptrs.end(); + ++it) { + delete *it; + } + for (auto it = runtime_node_ptrs.begin(); it != runtime_node_ptrs.end(); + ++it) { + delete *it; + } + for (auto it = device_node_ptrs.begin(); it != device_node_ptrs.end(); ++it) { + delete *it; + } +} + +HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { + // Copy and transfer EventNode in NodeTree to PythonNode + if (root == nullptr) { + return nullptr; + } + // copy HostTraceEventNode and its children + HostPythonNode* host_python_node = new HostPythonNode(); + host_python_node->name = root->Name(); + host_python_node->type = root->Type(); + host_python_node->start_ns = root->StartNs(); + host_python_node->end_ns = root->EndNs(); + host_python_node->process_id = root->ProcessId(); + host_python_node->thread_id = root->ThreadId(); + for (auto it = root->GetChildren().begin(); it != root->GetChildren().end(); + ++it) { + host_python_node->children_node_ptrs.push_back(CopyTree(*it)); + } + // copy its CudaRuntimeTraceEventNode + for (auto runtimenode = root->GetRuntimeTraceEventNodes().begin(); + runtimenode != root->GetRuntimeTraceEventNodes().end(); ++runtimenode) { + HostPythonNode* runtime_python_node = new HostPythonNode(); + runtime_python_node->name = (*runtimenode)->Name(); + runtime_python_node->type = (*runtimenode)->Type(); + runtime_python_node->start_ns = (*runtimenode)->StartNs(); + runtime_python_node->end_ns = (*runtimenode)->EndNs(); + runtime_python_node->process_id = (*runtimenode)->ProcessId(); + runtime_python_node->thread_id = (*runtimenode)->ThreadId(); + host_python_node->runtime_node_ptrs.push_back(runtime_python_node); + // copy DeviceTraceEventNode + for (auto devicenode = (*runtimenode)->GetDeviceTraceEventNodes().begin(); + devicenode != (*runtimenode)->GetDeviceTraceEventNodes().end(); + ++devicenode) { + DevicePythonNode* device_python_node = new DevicePythonNode(); + device_python_node->name = (*devicenode)->Name(); + device_python_node->type = (*devicenode)->Type(); + device_python_node->start_ns = (*devicenode)->StartNs(); + device_python_node->end_ns = (*devicenode)->EndNs(); + device_python_node->device_id = (*devicenode)->DeviceId(); + device_python_node->context_id = (*devicenode)->ContextId(); + device_python_node->stream_id = (*devicenode)->StreamId(); + runtime_python_node->device_node_ptrs.push_back(device_python_node); + } + } + return host_python_node; +} + +ProfilerResult::ProfilerResult(std::unique_ptr tree, + const ExtraInfo& extra_info) + : tree_(std::move(tree)), extra_info_(extra_info) { + if (tree_ != nullptr) { + std::map nodetrees = 
tree_->GetNodeTrees(); + for (auto it = nodetrees.begin(); it != nodetrees.end(); ++it) { + thread_event_trees_map_[it->first] = CopyTree(it->second); + } + } +} + +ProfilerResult::~ProfilerResult() { + // delete all root nodes + for (auto it = thread_event_trees_map_.begin(); + it != thread_event_trees_map_.end(); ++it) { + delete it->second; + } +} + +void ProfilerResult::Save(const std::string& file_name, + const std::string format) { + if (format == std::string("json")) { + ChromeTracingLogger logger(file_name); + tree_->LogMe(&logger); + logger.LogMetaInfo(GetExtraInfo()); + } else if (format == std::string("pb")) { + SerializationLogger logger(file_name); + tree_->LogMe(&logger); + logger.LogMetaInfo(GetExtraInfo()); + } + return; +} + +std::unique_ptr LoadProfilerResult(std::string filename) { + DeserializationReader reader(filename); + std::unique_ptr result = reader.Parse(); + return result; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h index b0d8eaa2427..12ecb9fde32 100644 --- a/paddle/fluid/platform/profiler/event_python.h +++ b/paddle/fluid/platform/profiler/event_python.h @@ -15,8 +15,11 @@ limitations under the License. */ #pragma once #include +#include +#include #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/extra_info.h" namespace paddle { namespace platform { @@ -66,18 +69,29 @@ struct HostPythonNode { class ProfilerResult { public: ProfilerResult() : tree_(nullptr) {} - explicit ProfilerResult(NodeTrees* tree); + explicit ProfilerResult(std::unique_ptr tree, + const ExtraInfo& extra_info); ~ProfilerResult(); std::map GetData() { - return thread_event_trees_map; + return thread_event_trees_map_; } - void Save(const std::string& file_name); + std::unordered_map GetExtraInfo() { + return extra_info_.GetExtraInfo(); + } + + void Save(const std::string& file_name, + const std::string format = std::string("json")); + + std::unique_ptr& GetNodeTrees() { return tree_; } private: - std::map thread_event_trees_map; - NodeTrees* tree_; - HostPythonNode* CopyTree(HostTraceEventNode* node); + std::map thread_event_trees_map_; + std::unique_ptr tree_; + ExtraInfo extra_info_; + HostPythonNode* CopyTree(HostTraceEventNode* root); }; +std::unique_ptr LoadProfilerResult(std::string filename); + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index 5784d6e671b..35dbc96874d 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -25,8 +25,10 @@ #endif #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler/cuda_tracer.h" +#include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/host_tracer.h" #include "paddle/fluid/platform/profiler/trace_event_collector.h" +#include "paddle/fluid/platform/profiler/utils.h" namespace paddle { namespace platform { @@ -44,10 +46,15 @@ std::unique_ptr Profiler::Create(const ProfilerOptions& options) { Profiler::Profiler(const ProfilerOptions& options) { options_ = options; - HostTracerOptions host_tracer_options; - host_tracer_options.trace_level = options.trace_level; - tracers_.emplace_back(new HostTracer(host_tracer_options), true); - tracers_.emplace_back(&CudaTracer::GetInstance(), false); + std::bitset<32> trace_switch(options_.trace_switch); + if 
(trace_switch.test(kProfileCPUOptionBit)) { + HostTracerOptions host_tracer_options; + host_tracer_options.trace_level = options_.trace_level; + tracers_.emplace_back(new HostTracer(host_tracer_options), true); + } + if (trace_switch.test(kProfileGPUOptionBit)) { + tracers_.emplace_back(&CudaTracer::GetInstance(), false); + } } Profiler::~Profiler() { alive_.store(false); } @@ -63,9 +70,10 @@ void Profiler::Start() { for (auto& tracer : tracers_) { tracer.Get().StartTracing(); } + cpu_utilization_.RecordBeginTimeInfo(); } -std::unique_ptr Profiler::Stop() { +std::unique_ptr Profiler::Stop() { SynchronizeAllDevice(); TraceEventCollector collector; for (auto& tracer : tracers_) { @@ -75,7 +83,22 @@ std::unique_ptr Profiler::Stop() { std::unique_ptr tree(new NodeTrees(collector.HostEvents(), collector.RuntimeEvents(), collector.DeviceEvents())); - return tree; + cpu_utilization_.RecordEndTimeInfo(); + ExtraInfo extrainfo; + extrainfo.AddExtraInfo(std::string("System Cpu Utilization"), + std::string("%f"), + cpu_utilization_.GetCpuUtilization()); + extrainfo.AddExtraInfo(std::string("Process Cpu Utilization"), + std::string("%f"), + cpu_utilization_.GetCpuCurProcessUtilization()); + const std::unordered_map thread_names = + collector.ThreadNames(); + for (const auto& kv : thread_names) { + extrainfo.AddExtraInfo(string_format(std::string("%llu"), kv.first), + kv.second); + } + return std::unique_ptr( + new platform::ProfilerResult(std::move(tree), extrainfo)); } } // namespace platform diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h index 4fc1c6daf96..f9a8ece0504 100644 --- a/paddle/fluid/platform/profiler/profiler.h +++ b/paddle/fluid/platform/profiler/profiler.h @@ -15,12 +15,15 @@ #pragma once #include +#include #include #include #include #include #include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/profiler/cpu_utilization.h" #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/event_python.h" #include "paddle/fluid/platform/profiler/tracer_base.h" DECLARE_int64(host_trace_level); @@ -28,7 +31,11 @@ DECLARE_int64(host_trace_level); namespace paddle { namespace platform { +static constexpr uint32_t kProfileCPUOptionBit = 0; +static constexpr uint32_t kProfileGPUOptionBit = 1; + struct ProfilerOptions { + uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu uint32_t trace_level = FLAGS_host_trace_level; }; @@ -40,7 +47,7 @@ class Profiler { void Start(); - std::unique_ptr Stop(); + std::unique_ptr Stop(); ~Profiler(); @@ -70,6 +77,7 @@ class Profiler { ProfilerOptions options_; uint64_t start_ns_ = UINT64_MAX; std::list tracers_; + CpuUtilization cpu_utilization_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/profiler_test.cc b/paddle/fluid/platform/profiler/profiler_test.cc index 160c801dc6e..32310b9e862 100644 --- a/paddle/fluid/platform/profiler/profiler_test.cc +++ b/paddle/fluid/platform/profiler/profiler_test.cc @@ -22,6 +22,7 @@ #ifdef PADDLE_WITH_HIP #include #endif +#include "paddle/fluid/platform/profiler/event_python.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/profiler.h" @@ -30,8 +31,10 @@ TEST(ProfilerTest, TestHostTracer) { using paddle::platform::Profiler; using paddle::platform::RecordInstantEvent; using paddle::platform::TracerEventType; + using paddle::platform::ProfilerResult; ProfilerOptions options; options.trace_level = 2; + options.trace_switch = 3; auto profiler = 
Profiler::Create(options); EXPECT_TRUE(profiler); profiler->Prepare(); @@ -42,7 +45,8 @@ TEST(ProfilerTest, TestHostTracer) { RecordInstantEvent("TestTraceLevel_record2", TracerEventType::UserDefined, 3); } - auto nodetree = profiler->Stop(); + auto profiler_result = profiler->Stop(); + auto& nodetree = profiler_result->GetNodeTrees(); std::set host_events; for (const auto pair : nodetree->Traverse(true)) { for (const auto evt : pair.second) { @@ -56,8 +60,10 @@ TEST(ProfilerTest, TestHostTracer) { TEST(ProfilerTest, TestCudaTracer) { using paddle::platform::ProfilerOptions; using paddle::platform::Profiler; + using paddle::platform::ProfilerResult; ProfilerOptions options; options.trace_level = 0; + options.trace_switch = 3; auto profiler = Profiler::Create(options); EXPECT_TRUE(profiler); profiler->Prepare(); @@ -72,7 +78,8 @@ TEST(ProfilerTest, TestCudaTracer) { hipStreamCreate(&stream); hipStreamSynchronize(stream); #endif - auto nodetree = profiler->Stop(); + auto profiler_result = profiler->Stop(); + auto& nodetree = profiler_result->GetNodeTrees(); std::vector runtime_events; for (const auto pair : nodetree->Traverse(true)) { for (const auto host_node : pair.second) { diff --git a/paddle/fluid/platform/profiler/trace_event.h b/paddle/fluid/platform/profiler/trace_event.h index 61f96218560..16ef62fb515 100644 --- a/paddle/fluid/platform/profiler/trace_event.h +++ b/paddle/fluid/platform/profiler/trace_event.h @@ -48,6 +48,8 @@ enum class TracerEventType { Communication = 12, // Used to mark python api PythonOp = 13, + // Used to mark python level userdefined + PythonUserDefined = 14, // A flag to denote the number of current types NumTypes }; -- GitLab From 1db188f318ae0b0292984e08afd626898e3170da Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Wed, 2 Mar 2022 15:37:29 +0800 Subject: [PATCH 051/272] [IPU] update ipu unittests p0 (#39707) * update ipu UTs part0 * rename UT * sync api changes * update uts for new api * use_ipumodel() as classmethod --- .../tests/unittests/ipu/ernie_training.py | 934 ------------------ .../fluid/tests/unittests/ipu/op_test_ipu.py | 73 +- .../unittests/ipu/test_activation_x_op_ipu.py | 133 +++ .../unittests/ipu/test_arg_max_op_ipu.py | 117 +++ .../tests/unittests/ipu/test_assign_op_ipu.py | 102 ++ .../tests/unittests/ipu/test_avg_shard_ipu.py | 112 ++- .../unittests/ipu/test_batch_norm_op_ipu.py | 108 +- ....py => test_batchs_per_step_simple_ipu.py} | 22 +- .../tests/unittests/ipu/test_cast_op_ipu.py | 111 ++- .../tests/unittests/ipu/test_concat_op_ipu.py | 93 +- .../tests/unittests/ipu/test_conv_op_ipu.py | 127 +-- .../ipu/test_cross_entropy2_op_ipu.py | 128 ++- .../tests/unittests/ipu/test_cumsum_op_ipu.py | 123 +++ 13 files changed, 950 insertions(+), 1233 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/ipu/ernie_training.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py rename python/paddle/fluid/tests/unittests/ipu/{test_ipu_batchs_per_step_simple.py => test_batchs_per_step_simple_ipu.py} (79%) create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py diff --git a/python/paddle/fluid/tests/unittests/ipu/ernie_training.py b/python/paddle/fluid/tests/unittests/ipu/ernie_training.py deleted file mode 100644 index ddda666db2c..00000000000 --- a/python/paddle/fluid/tests/unittests/ipu/ernie_training.py +++ 
/dev/null @@ -1,934 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# refrenece : https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/language_model/ernie - -import os -import copy -import argparse -from contextlib import contextmanager -from functools import partial - -import numpy as np -import paddle -import paddle.static -import paddle.fluid as fluid -import paddle.fluid.layers as layers -import paddle.fluid.compiler as compiler -paddle.enable_static() - -SEED = 2021 -INT_DTYPE = None - -# ernie related block -ernie_config = { - "emb_size": 128, - "emb_mapping_in": False, - "hidden_size": 192, - "num_hidden_layers": 2, - "n_layer_per_block": 2, - "num_attention_heads": 12, - "vocab_size": 300, - "max_position_embeddings": 512, - "sent_type_vocab_size": 4, - "task_type_vocab_size": 16, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.0, - "attention_probs_dropout_prob": 0.0, - "preln": False, - "pre_encoder_cmd": "n", - "preprocess_cmd": "", - "postprocess_cmd": "an", - "epsilon": 1e-12, - "initializer_range": 0.02, - "seq_len": 32 -} - - -def gelu(x): - """Gaussian Error Linear Unit. - - This is a smoother version of the RELU. - Original paper: https://arxiv.org/abs/1606.08415 - Args: - x: float Tensor to perform activation. - - Returns: - `x` with the GELU activation applied. - """ - cdf = 0.5 * (1.0 + fluid.layers.tanh( - (np.sqrt(2.0 / np.pi) * (x + 0.044715 * fluid.layers.pow(x, 3.0))))) - return x * cdf - - -def pre_post_process_layer(prev_out, - out, - process_cmd, - dropout_rate=0., - epsilon=1e-12, - name=''): - """ - Add residual connection, layer normalization and droput to the out tensor - optionally according to the value of process_cmd. - This will be used before or after multi-head attention and position-wise - feed-forward networks. - """ - for cmd in process_cmd: - if cmd == "a": # add residual connection - out = out + prev_out if prev_out else out - elif cmd == "n": # add layer normalization - out = layers.layer_norm( - out, - begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.ParamAttr( - name=name + '_layer_norm_scale', - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr( - name=name + '_layer_norm_bias', - initializer=fluid.initializer.Constant(0.)), - epsilon=epsilon) - elif cmd == "d": # add dropout - if dropout_rate: - out = layers.dropout( - out, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - return out - - -pre_process_layer = partial(pre_post_process_layer, None) -post_process_layer = pre_post_process_layer - - -def positionwise_feed_forward(x, - d_inner_hid, - d_hid, - dropout_rate, - hidden_act, - param_initializer=None, - name='ffn'): - """ - Position-wise Feed-Forward Networks. - This module consists of two linear transformations with a ReLU activation - in between, which is applied to each position separately and identically. 
- """ - - #assert hidden_act == 'gelu.approximate' - hidden = layers.fc(input=x, - size=d_inner_hid, - num_flatten_dims=2, - act=None, - param_attr=fluid.ParamAttr( - name=name + '_fc_0.w_0', - initializer=param_initializer), - bias_attr=name + '_fc_0.b_0') - hidden = gelu(hidden) - - if dropout_rate: - hidden = layers.dropout( - hidden, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - - out = layers.fc(input=hidden, - size=d_hid, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_fc_1.w_0', initializer=param_initializer), - bias_attr=name + '_fc_1.b_0') - - return out - - -def multi_head_attention(queries, - keys, - values, - attn_bias, - d_key, - d_value, - d_model, - n_head=1, - dropout_rate=0., - cache=None, - param_initializer=None, - name='multi_head_att'): - """ - Multi-Head Attention. Note that attn_bias is added to the logit before - computing softmax activiation to mask certain selected positions so that - they will not considered in attention weights. - """ - keys = queries if keys is None else keys - values = keys if values is None else values - - def __compute_qkv(queries, keys, values, n_head, d_key, d_value): - """ - Add linear projection to queries, keys, and values. - """ - q = layers.fc(input=queries, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_query_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_query_fc.b_0') - k = layers.fc(input=keys, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_key_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_key_fc.b_0') - v = layers.fc(input=values, - size=d_value * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_value_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_value_fc.b_0') - - return q, k, v - - def __split_heads(x, n_head): - """ - Reshape the last dimension of inpunt tensor x so that it becomes two - dimensions and then transpose. Specifically, input a tensor with shape - [bs, max_sequence_length, n_head * hidden_dim] then output a tensor - with shape [bs, n_head, max_sequence_length, hidden_dim]. - """ - hidden_size = x.shape[-1] - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - reshaped = layers.reshape( - x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=False) - - # permuate the dimensions into: - # [batch_size, n_head, max_sequence_len, hidden_size_per_head] - return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) - - def __combine_heads(x): - """ - Transpose and then reshape the last two dimensions of inpunt tensor x - so that it becomes one dimension, which is reverse to __split_heads. - """ - if len(x.shape) == 3: return x - if len(x.shape) != 4: - raise ValueError("Input(x) should be a 4-D Tensor.") - - trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. 
- return layers.reshape( - x=trans_x, - shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], - inplace=False) - - def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): - """ - Scaled Dot-Product Attention - """ - scaled_q = layers.scale(x=q, scale=d_key**-0.5) - product = layers.matmul(x=scaled_q, y=k, transpose_y=True) - - if attn_bias: - product += attn_bias - weights = layers.softmax(product) - if dropout_rate: - weights = layers.dropout( - weights, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.matmul(weights, v) - return out - - q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) - - if cache is not None: # use cache and concat time steps - # Since the inplace reshape in __split_heads changes the shape of k and - # v, which is the cache input for next time step, reshape the cache - # input from the previous time step first. - k = cache["k"] = layers.concat( - [layers.reshape( - cache["k"], shape=[0, 0, d_model]), k], axis=1) - v = cache["v"] = layers.concat( - [layers.reshape( - cache["v"], shape=[0, 0, d_model]), v], axis=1) - - q = __split_heads(q, n_head) - k = __split_heads(k, n_head) - v = __split_heads(v, n_head) - - ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, - dropout_rate) - - out = __combine_heads(ctx_multiheads) - - # Project back to the model size. - proj_out = layers.fc(input=out, - size=d_model, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_output_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_output_fc.b_0') - - return proj_out - - -def encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name='', - epsilon=1e-12): - """The encoder layers that can be stacked to form a deep encoder. - This module consits of a multi-head (self) attention followed by - position-wise feed-forward networks and both the two components companied - with the post_process_layer to add residual connection, layer normalization - and droput. - """ - - attn_output = multi_head_attention( - enc_input, - None, - None, - attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - param_initializer=param_initializer, - name=name + '_multi_head_att') - - attn_output = post_process_layer( - enc_input, - attn_output, - 'an', - prepostprocess_dropout, - name=name + '_post_att', - epsilon=epsilon) - - ffd_output = positionwise_feed_forward( - attn_output, - d_inner_hid, - d_model, - relu_dropout, - hidden_act, - param_initializer=param_initializer, - name=name + '_ffn') - - post_output = post_process_layer( - attn_output, - ffd_output, - 'an', - prepostprocess_dropout, - name=name + '_post_ffn', - epsilon=epsilon) - - return post_output - - -def encoder_inner_share(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - epsilon, - param_initializer=None, - name='', - n_layer_per_block=1): - """ - The encoder_inner_share is composed of n_layer_per_block layers returned by calling - encoder_layer. 
- """ - - for i in range(n_layer_per_block): - enc_output = encoder_layer( - enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - param_initializer=param_initializer, - name=name + '_layer_' + str(i), - epsilon=epsilon) - - enc_input = enc_output - - return enc_output - - -def encoder(enc_input, - attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - epsilon, - n_layer_per_block, - param_initializer=None, - name='', - preln=False): - """ - The encoder is composed of a stack of identical layers returned by calling - encoder_layer . - """ - - for _ in range(n_layer // n_layer_per_block): - attn_bias.stop_gradient = True - attn_bias.persistable = False - enc_output = encoder_inner_share( - enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - epsilon, - param_initializer=param_initializer, - name=name, - n_layer_per_block=n_layer_per_block) - - enc_input = enc_output - - if preln: - enc_output = post_process_layer( - None, - enc_output, - 'n', - prepostprocess_dropout, - name='post_encoder', - epsilon=epsilon) - - enc_output = pre_process_layer( - enc_output, - preprocess_cmd, - prepostprocess_dropout, - name="post_encoder", - epsilon=epsilon) - - return enc_output - - -class ErnieModel(object): - def __init__(self, src_ids, sent_ids, pos_ids, input_mask, config): - - self._emb_size = config['emb_size'] if config[ - 'emb_mapping_in'] else config['hidden_size'] - self._hidden_size = config['hidden_size'] - self._n_layer = config['num_hidden_layers'] - self._n_head = config['num_attention_heads'] - self._voc_size = config['vocab_size'] - self._max_position_seq_len = config['max_position_embeddings'] - self._sent_types = config['sent_type_vocab_size'] - self._task_types = config['task_type_vocab_size'] - self._hidden_act = config['hidden_act'] - self._prepostprocess_dropout = config['hidden_dropout_prob'] - self._attention_dropout = config['attention_probs_dropout_prob'] - self.config = config - self.preln = config['preln'] if 'preln' in config.keys() else False - self.pre_encoder_cmd = "" if self.preln else self.config[ - 'pre_encoder_cmd'] - - self._word_emb_name = "word_embedding" - self._pos_emb_name = "pos_embedding" - self._sent_emb_name = "sent_embedding" - self._task_emb_name = "task_embedding" - self._dtype = "float32" - self._emb_dtype = "float32" - - # Initialize all weigths by truncated normal initializer, and all biases - # will be initialized by constant zero by default. 
- self._param_initializer = fluid.initializer.TruncatedNormal( - scale=config['initializer_range']) - - self.src_ids = src_ids - self.sent_ids = sent_ids - self.pos_ids = pos_ids - self.input_mask = input_mask - ''' - _build_position_ids: range op doesn't support - _build_input_mask: logic_not op doesn't support - ''' - - self._build_model() - - def _build_model(self, emb=None): - with fluid.ipu_shard(ipu_index=0, ipu_stage=0): - # padding id in vocabulary must be set to 0 - self.emb_out = fluid.layers.embedding( - input=self.src_ids, - size=[self._voc_size, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=self._word_emb_name, - initializer=self._param_initializer), - is_sparse=False) - - self.position_emb_out = fluid.layers.embedding( - input=self.pos_ids, - size=[self._max_position_seq_len, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=self._pos_emb_name, - initializer=self._param_initializer)) - - self.sent_emb_out = fluid.layers.embedding( - self.sent_ids, - size=[self._sent_types, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=self._sent_emb_name, - initializer=self._param_initializer)) - - sum_emb = self.emb_out + self.position_emb_out + self.sent_emb_out - - sum_emb = pre_process_layer( - sum_emb, - self.config['pre_encoder_cmd'], - self._prepostprocess_dropout, - name='pre_encoder', - epsilon=self.config['epsilon']) - - if self.config['emb_mapping_in']: - sum_emb = fluid.layers.fc( - input=sum_emb, - num_flatten_dims=2, - size=self._hidden_size, - param_attr=fluid.ParamAttr( - name='emb_hidden_mapping', - initializer=self._param_initializer), - bias_attr='emb_hidden_mapping_bias') - - self_attn_mask = fluid.layers.matmul( - x=self.input_mask, y=self.input_mask, transpose_y=True) - - self_attn_mask = fluid.layers.scale( - x=self_attn_mask, - scale=10000.0, - bias=-1.0, - bias_after_scale=False) - - with fluid.ipu_shard(ipu_index=1, ipu_stage=1): - n_head_self_attn_mask = fluid.layers.stack( - x=[self_attn_mask] * self._n_head, - axis=1) # [bs, _n_head, seqlen, seq_len] - n_head_self_attn_mask.stop_gradient = True - - self._enc_out = encoder( - enc_input=sum_emb, - attn_bias=n_head_self_attn_mask, - n_layer=self._n_layer, - n_head=self._n_head, - d_key=self._hidden_size // self._n_head, - d_value=self._hidden_size // self._n_head, - d_model=self._hidden_size, - d_inner_hid=self._hidden_size * 4, - prepostprocess_dropout=self._prepostprocess_dropout, - attention_dropout=self._attention_dropout, - relu_dropout=0, - hidden_act=self._hidden_act, - preprocess_cmd=self.config['preprocess_cmd'], - postprocess_cmd=self.config['postprocess_cmd'], - param_initializer=self._param_initializer, - name='encoder', - epsilon=self.config['epsilon'], - n_layer_per_block=self.config['n_layer_per_block'], - preln=self.preln) - - def _build_position_ids(self): - d_shape = fluid.layers.shape(self.src_ids) - d_seqlen = d_shape[1] - d_batch = d_shape[0] - position_ids = fluid.layers.reshape( - fluid.layers.range( - 0, d_seqlen, 1, dtype='int32'), [1, d_seqlen, 1], - inplace=False) - position_ids = fluid.layers.expand(position_ids, [d_batch, 1, 1]) - position_ids = fluid.layers.cast(position_ids, INT_DTYPE) - position_ids.stop_gradient = True - return position_ids - - def _build_input_mask(self): - zero = fluid.layers.fill_constant([1], dtype=INT_DTYPE, value=0) - input_mask = fluid.layers.logical_not( - fluid.layers.equal(self.src_ids, zero)) # assume pad id == 0 - input_mask = fluid.layers.cast(input_mask, 
'float32') - input_mask.stop_gradient = True - return input_mask - - def get_sequence_output(self): - return self._enc_out - - def get_pooled_output(self): - """Get the first feature of each sequence for classification""" - next_sent_feat = fluid.layers.slice( - input=self._enc_out, axes=[1], starts=[0], ends=[1]) - - next_sent_feat = fluid.layers.fc( - input=next_sent_feat, - size=self._hidden_size, - act="tanh", - param_attr=fluid.ParamAttr( - name="pooled_fc.w_0", initializer=self._param_initializer), - bias_attr="pooled_fc.b_0") - return next_sent_feat - - def get_next_sentence_output(self, labels): - next_sent_feat = self.get_pooled_output() - next_sent_fc_out = fluid.layers.fc( - input=next_sent_feat, - num_flatten_dims=1, - size=33, - param_attr=fluid.ParamAttr( - name="next_sent_fc.w_0", initializer=self._param_initializer), - bias_attr="next_sent_fc.b_0") - next_sent_fc_out = fluid.layers.reshape( - next_sent_fc_out, [-1, 33], inplace=False) - #next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy( - # logits=next_sent_fc_out, label=labels, return_softmax=True) - next_sent_softmax = fluid.layers.softmax(next_sent_fc_out) - next_sent_loss = fluid.layers.cross_entropy(next_sent_softmax, labels) - next_sent_acc = fluid.layers.accuracy( - input=next_sent_softmax, label=labels) - mean_next_sent_loss = fluid.layers.mean(next_sent_loss, - "mean_next_sent_loss") - return next_sent_acc, mean_next_sent_loss - - def get_lm_output(self, mask_label, mask_pos): - """Get the loss & accuracy for pretraining""" - mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') - - # extract the first token feature in each sentence - reshaped_emb_out = fluid.layers.reshape( - x=self._enc_out, shape=[-1, self._hidden_size]) - - # extract masked tokens' feature - mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) - if self._dtype == "float16": - mask_feat = fluid.layers.cast(x=mask_feat, dtype=self._emb_dtype) - - # transform: fc - if self._hidden_act == 'gelu' or self._hidden_act == 'gelu.precise': - _hidden_act = 'gelu' - else: - _hidden_act = None - - mask_trans_feat = fluid.layers.fc( - input=mask_feat, - size=self._emb_size, - act=_hidden_act, - param_attr=fluid.ParamAttr( - name='mask_lm_trans_fc.w_0', - initializer=self._param_initializer), - bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0')) - - if self._hidden_act == 'gelu' or self._hidden_act == 'gelu.precise': - pass - else: - mask_trans_feat = gelu(mask_trans_feat) - - # transform: layer norm - mask_trans_feat = fluid.layers.layer_norm( - mask_trans_feat, - begin_norm_axis=len(mask_trans_feat.shape) - 1, - param_attr=fluid.ParamAttr( - name='mask_lm_trans_layer_norm_scale', - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr( - name='mask_lm_trans_layer_norm_bias', - initializer=fluid.initializer.Constant(0.)), - epsilon=self.config['epsilon']) - - mask_lm_out_bias_attr = fluid.ParamAttr( - name="mask_lm_out_fc.b_0", - initializer=fluid.initializer.Constant(value=0.0)) - - fc_out = fluid.layers.fc(input=mask_trans_feat, - size=self._voc_size, - param_attr=fluid.ParamAttr( - name="mask_lm_out_fc.w_0", - initializer=self._param_initializer), - bias_attr=mask_lm_out_bias_attr) - #mask_lm_loss = fluid.layers.softmax_with_cross_entropy( - # logits=fc_out, label=mask_label) - mask_lm_softmax = fluid.layers.softmax(fc_out) - mask_lm_loss = fluid.layers.cross_entropy(mask_lm_softmax, mask_label) - mean_mask_lm_loss = fluid.layers.mean( - mask_lm_loss, name="mean_mask_lm_loss") - - 
return mask_lm_loss, mean_mask_lm_loss - - def get_task_output(self, task, task_labels): - task_fc_out = fluid.layers.fc(input=self.next_sent_feat, - size=task["num_labels"], - param_attr=fluid.ParamAttr( - name=task["task_name"] + "_fc.w_0", - initializer=self._param_initializer), - bias_attr=task["task_name"] + "_fc.b_0") - #task_loss, task_softmax = fluid.layers.softmax_with_cross_entropy( - # logits=task_fc_out, label=task_labels, return_softmax=True) - task_softmax = fluid.layers.softmax(task_fc_out) - task_loss = fluid.layers.cross_entropy(task_softmax, task_labels) - task_acc = fluid.layers.accuracy(input=task_softmax, label=task_labels) - mean_task_loss = fluid.layers.mean(task_loss) - return mean_task_loss, task_acc - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(__doc__) - parser.add_argument( - "--run_on_ipu", type=bool, default=True, help="Run model with IPU") - parser.add_argument( - "--is_training", type=bool, default=True, help="Train of inference") - parser.add_argument( - "--num_ipus", type=int, default=2, help="Number of ipus") - parser.add_argument( - "--enable_pipelining", type=bool, default=False, help="Pipelining") - parser.add_argument( - "--save_model", type=bool, default=False, help="Save model or not") - parser.add_argument( - "--model_path", type=str, default="ernie", help="Save model to where") - parser.add_argument( - "--model_name", type=str, default="ernie", help="Save model name") - parser.add_argument( - "--ipu_run_steps", type=int, default=10, help="Number steps exe.run()") - parser.add_argument( - "--export_ops", type=bool, default=False, help="Export ops to ops.txt") - parser.add_argument( - "--export_ipu_idx", type=bool, default=False, help="Export op-idx pair") - args = parser.parse_args() - - # set random seed - np.random.seed(SEED) - paddle.static.default_startup_program().random_seed = SEED - paddle.static.default_main_program().random_seed = SEED - - # IPU doesn't support int64, so we change here - INT_DTYPE = "int32" if args.run_on_ipu else "int64" - - # paddle input placeholder, batch_size = 1 - micro_bs = 1 - seq_len = ernie_config["seq_len"] - input_shape = [micro_bs, seq_len, 1] - input_fields = { - 'names': [ - 'src_ids', 'sent_ids', 'pos_ids', 'input_mask', 'mask_label', - 'mask_pos' - ], - 'shapes': [ - input_shape, input_shape, input_shape, input_shape, [micro_bs, 1], - [micro_bs, 1] - ], - 'dtypes': - [INT_DTYPE, INT_DTYPE, INT_DTYPE, 'float32', INT_DTYPE, INT_DTYPE], - 'range': [[0, seq_len], [0, 4], [0, seq_len], None, [0, seq_len], - [0, seq_len]], - 'lod_levels': [0, 0, 0, 0, 0, 0], - } - - inputs = [ - fluid.data( - name=input_fields['names'][i], - shape=input_fields['shapes'][i], - dtype=input_fields['dtypes'][i], - lod_level=input_fields['lod_levels'][i]) - for i in range(len(input_fields['names'])) - ] - - # total_samples: assum disable pipelining - batches_per_step = 1 - if args.enable_pipelining: - batches_per_step = \ - ((args.num_ipus+1) if args.is_training else args.num_ipus) - total_samples = args.ipu_run_steps * batches_per_step - - total_steps = args.ipu_run_steps - if not args.run_on_ipu: # run on cpu - total_steps = total_samples // micro_bs - - # synthetic data - np_inputs = [] - for i in range(len(input_fields['names'])): - field_name = input_fields['names'][i] - if field_name == 'input_mask': - src_ids = np_inputs[0] - dtype = input_fields['dtypes'][i] - data = np.where(src_ids > 0, - np.ones_like(src_ids), - np.zeros_like(src_ids)).astype(dtype) - else: - shape = 
copy.copy(input_fields['shapes'][i]) - shape[0] = total_samples - min_val, max_val = input_fields['range'][i] - data = np.random.randint( - min_val, max_val, shape, dtype=input_fields['dtypes'][i]) - np_inputs.append(data) - - # paddle input placeholder - (src_ids, sent_ids, pos_ids, input_mask, mask_label, mask_pos) = inputs - - # ernie model - ernie = ErnieModel(src_ids, sent_ids, pos_ids, input_mask, ernie_config) - fetch_node = ernie.get_sequence_output() - if args.is_training: - with fluid.ipu_shard(ipu_index=1, ipu_stage=1): - _, mean_mask_lm_loss = ernie.get_lm_output(mask_label, mask_pos) - fetch_node = mean_mask_lm_loss - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(mean_mask_lm_loss) - - # place = paddle.CPUPlace() - if args.run_on_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - executor = paddle.static.Executor(place) - - # feed & fetch list - if args.is_training: - feed_list = input_fields['names'] - else: - feed_list = input_fields['names'][:4] - fetch_list = [fetch_node.name] - - # program - startup_prog = paddle.static.default_startup_program() - executor.run(startup_prog) - - main_prog = paddle.static.default_main_program() - paddle.static.save(main_prog, "model/ernie") - paddle.static.load(main_prog, "model/ernie") - - if args.run_on_ipu: - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( - num_ipus=args.num_ipus, - is_training=args.is_training, - enable_manual_shard=args.num_ipus > 1) - ipu_strategy.SetPipeliningConfig( - enable_pipelining=args.enable_pipelining, - batches_per_step=args.num_ipus + 1) - - ipu_compiler = compiler.IPUCompiledProgram( - main_prog, ipu_strategy=ipu_strategy) - program = ipu_compiler.compile(feed_list, fetch_list) - else: - program = main_prog - - # executor run - results = [] - for i in range(total_steps): - start = i * (batches_per_step if args.run_on_ipu else 1) - end = start + (batches_per_step if args.run_on_ipu else 1) - feed_dict = { - src_ids.name: np_inputs[0][start:end], - sent_ids.name: np_inputs[1][start:end], - pos_ids.name: np_inputs[2][start:end], - input_mask.name: np_inputs[3][start:end] - } - if args.is_training: - feed_dict[mask_label.name] = np_inputs[4][start:end] - feed_dict[mask_pos.name] = np_inputs[5][start:end] - - res = executor.run(program, feed=feed_dict, fetch_list=[fetch_node]) - results.append(res) - - paddle.static.save(main_prog, "model/ernie") - - results = np.asarray(results).flatten() - if results.size > 32: - results = results[-32:] - print(results) - - if args.save_model: - full_name = args.model_path + '/' + args.model_name - if args.is_training: - fluid.save(program=main_prog, model_path=full_name) - else: - with fluid.ipu_shard(ipu_index=1, ipu_stage=1): - paddle.static.save_inference_model( - full_name, [src_ids, sent_ids, pos_ids, input_mask], - [fetch_node], executor) - - if args.export_ops: - op_type_list = [] - for op in main_prog.global_block().ops: - op_type_list.append(op.desc.type()) - - with open("ops.txt", "w") as fp: - for op_type in set(op_type_list): - fp.write(op_type + os.linesep) - - if args.export_ipu_idx: - op_ipu_idx_list = [] - for op in main_prog.global_block().ops: - if op._is_backward_op(): - continue - - op_ipu_idx_pair = [op.desc.type()] - if op.desc.has_attr("ipu_index"): - op_ipu_idx_pair.append(op.desc.attr("ipu_index")) - else: - op_ipu_idx_pair.append(-1) # not assign ipu_index - op_ipu_idx_list.append(op_ipu_idx_pair) - op_ipu_idx_list.sort(key=lambda item: item[-1]) - - with open("ops_ipu_idx.txt", "w") 
as fp: - for op_ipu_idx_pair in op_ipu_idx_list: - fp.write(str(op_ipu_idx_pair) + os.linesep) diff --git a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py index 0d09f604060..790388f30ea 100644 --- a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py @@ -12,17 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import random import unittest - import numpy as np -from paddle.fluid.tests.unittests.op_test import _set_use_system_allocator -from typing import Optional -import paddle.fluid.compiler as compiler - -SEED = 2021 +from enum import Enum -ipu_compiler_ref: Optional[compiler.IPUCompiledProgram] = None +import paddle +import paddle.static map_np_dtype_to_fluid_dtype = { 'bool': "bool", @@ -36,6 +33,19 @@ map_np_dtype_to_fluid_dtype = { } +class ExecutionMode(Enum): + CPU_FP32 = 1 + IPU_FP32 = 2 + # enable_fp16 through ipu_strategy.enable_fp16 + IPU_POPART_FP16 = 3 + + def __lt__(self, other): + return self.value < other.value + + def __gt__(self, other): + return self.value > other.value + + def np_dtype_to_fluid_str(dtype: np.dtype) -> str: return map_np_dtype_to_fluid_dtype[dtype.name] @@ -43,14 +53,16 @@ def np_dtype_to_fluid_str(dtype: np.dtype) -> str: class IPUOpTest(unittest.TestCase): @classmethod def setUpClass(cls): + # Get random seeds cls._np_rand_state = np.random.get_state() cls._py_rand_state = random.getstate() - cls.SEED = SEED + cls.SEED = 2021 np.random.seed(cls.SEED) random.seed(cls.SEED) - cls._use_system_allocator = _set_use_system_allocator(True) + # Enable paddle static graph mode + paddle.enable_static() @classmethod def tearDownClass(cls): @@ -58,14 +70,47 @@ class IPUOpTest(unittest.TestCase): np.random.set_state(cls._np_rand_state) random.setstate(cls._py_rand_state) - _set_use_system_allocator(cls._use_system_allocator) - # unittest will to trigger IPUCompiledProgram.__del__ automatically - global ipu_compiler_ref - ipu_compiler_ref is not None and ipu_compiler_ref.clean() + @classmethod + def use_ipumodel(cls): + if 'POPLAR_IPUMODEL' not in os.environ: + return False + else: + flag = os.environ['POPLAR_IPUMODEL'] + if flag.upper() in ['1', "TRUE"]: + return True def set_atol(self): - self.atol = 1e-5 + self.atol = 1e-10 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 def set_training(self): self.is_training = False self.epoch = 1 + + def check(self, outputs, check_shape=False): + cpu_fp32 = outputs[ExecutionMode.CPU_FP32] + ipu_fp32 = outputs[ExecutionMode.IPU_FP32] + max_diff = np.abs(cpu_fp32 - ipu_fp32).max() + fp32_flag = np.allclose( + cpu_fp32, ipu_fp32, rtol=self.rtol, atol=self.atol) + self.assertTrue(fp32_flag, "max diff is %f" % (max_diff)) + + if check_shape: + self.assertTrue(cpu_fp32.shape == ipu_fp32.shape) + + ipu_popart_fp16 = None + if ExecutionMode.IPU_POPART_FP16 in outputs.keys(): + ipu_popart_fp16 = outputs[ExecutionMode.IPU_POPART_FP16] + max_diff = np.abs(ipu_popart_fp16.astype(np.float32) - + cpu_fp32).max() + fp16_flag = np.allclose( + ipu_popart_fp16.astype(np.float32), + cpu_fp32, + rtol=self.rtol_fp16, + atol=self.atol_fp16) + self.assertTrue(fp16_flag, "max diff is %f" % (max_diff)) + + if check_shape: + self.assertTrue(ipu_popart_fp16.shape == cpu_fp32.shape) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py new 
file mode 100644 index 00000000000..138365b650f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py @@ -0,0 +1,133 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.nn.functional as F +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestRelu(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_test_op() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return True + + def set_test_op(self): + self.op = paddle.fluid.layers.relu + self.op_attrs = {} + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + out = self.op(x, **self.op_attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +class TestTanh(TestRelu): + def set_test_op(self): + self.op = F.tanh + self.op_attrs = {} + + +class TestLog(TestRelu): + def set_test_op(self): + self.op = paddle.fluid.layers.log + self.op_attrs = {} + + +class TestSigmoid(TestRelu): + def set_test_op(self): + self.op = F.sigmoid + self.op_attrs = {} + + +class TestSqrt(TestRelu): + def set_test_op(self): + self.op = 
paddle.fluid.layers.sqrt + self.op_attrs = {} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py new file mode 100644 index 00000000000..d14eba98ef5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py @@ -0,0 +1,117 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[10, 1000]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_op_attrs(self): + self.attrs = {"axis": -1} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + out = paddle.fluid.layers.argmax(x, **self.attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0].astype(np.int32) + + def test_base(self): + output_dict_fp32 = {} + output_dict_fp16 = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + + if mode > ExecutionMode.IPU_FP32: + output_dict_fp16[mode] = self._test_base(mode).flatten() + else: + output_dict_fp32[mode] = self._test_base(mode).flatten() + + 
self.check(output_dict_fp32) + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = {"axis": 0} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py new file mode 100644 index 00000000000..4f17c90de72 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py @@ -0,0 +1,102 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[2, 3, 1]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + assign = paddle.assign(x) + out = paddle.fluid.layers.elementwise_add(assign, assign) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py 
b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py index a23cacf4763..f34e5b0d8b9 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py @@ -16,13 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,78 +26,89 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() - self.set_attrs() - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([1, 3, 128, 128]) - - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) - - self.feed_list = list(self.feed.keys()) - - def set_attrs(self): - self.attrs = {} - - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 2e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 128, 128]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') - conv1 = paddle.static.nn.conv2d( + + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.static.nn.conv2d( x, num_filters=3, filter_size=3, bias_attr=False) - conv2 = paddle.static.nn.conv2d( - conv1, num_filters=3, filter_size=3, bias_attr=False) - conv3 = paddle.static.nn.conv2d( - conv2, num_filters=3, filter_size=3, bias_attr=False) - conv4 = paddle.static.nn.conv2d( - conv3, num_filters=3, filter_size=3, bias_attr=False) - fetch_list = [conv4.name] + fetch_list = [x.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( - num_ipus=2, - is_training=self.is_training, - enable_manual_shard=True, - need_avg_shard=True) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + ipu_strategy.set_options({'need_avg_shard': True}) + if exec_mode == 
ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + self.check(output_dict) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py index 87f783dbd1c..1dab958c1ec 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py @@ -16,13 +16,9 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,76 +27,100 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() - self.set_attrs() - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([1, 3, 10, 10]) - - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) - - self.feed_list = list(self.feed.keys()) - - def set_attrs(self): + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): self.attrs = {} self.attrs['is_test'] = False self.attrs['data_layout'] = 'NCHW' self.attrs['in_place'] = False - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + conv1 = paddle.static.nn.conv2d( x, num_filters=3, filter_size=3, bias_attr=False) out = paddle.fluid.layers.batch_norm(conv1, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == 
ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_atol(self): + self.atol = 1e-7 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_op_attrs(self): self.attrs = {} self.attrs['is_test'] = True self.attrs['data_layout'] = 'NCHW' @@ -108,7 +128,13 @@ class TestCase1(TestBase): class TestCase2(TestBase): - def set_attrs(self): + def set_atol(self): + self.atol = 1e-7 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_op_attrs(self): self.attrs = {} self.attrs['is_test'] = True self.attrs['data_layout'] = 'NCHW' diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py b/python/paddle/fluid/tests/unittests/ipu/test_batchs_per_step_simple_ipu.py similarity index 79% rename from python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py rename to python/paddle/fluid/tests/unittests/ipu/test_batchs_per_step_simple_ipu.py index 9b485d7794d..ef61e651b2a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batchs_per_step_simple_ipu.py @@ -17,8 +17,7 @@ from __future__ import print_function import numpy as np import unittest import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler +import paddle.static paddle.enable_static() SEED = 2021 @@ -28,7 +27,7 @@ SEED = 2021 "core is not compiled with IPU") class TestFunc(unittest.TestCase): def _test_func(self, run_ipu=True): - scope = fluid.core.Scope() + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() main_prog.random_seed = SEED @@ -40,22 +39,20 @@ class TestFunc(unittest.TestCase): c, h, w = 3, 10, 10 np_image = np.random.uniform(size=[1 * bps, c, h, w]).astype(np.float32) - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): image = paddle.static.data( name='image', shape=[n, c, h, w], dtype='float32') conv2d = paddle.static.nn.conv2d( image, num_filters=3, filter_size=3, bias_attr=False) - # paddle.mean oshape on ipu is [bps], need another mean() - # paddle.mean oshape on cpu is [1] - # out = paddle.mean(conv2d) out = 
conv2d if run_ipu: place = paddle.IPUPlace() else: place = paddle.CPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) @@ -63,14 +60,9 @@ class TestFunc(unittest.TestCase): feed_list = [image.name] fetch_list = [out.name] ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( - num_ipus=2, - is_training=False, - enable_manual_shard=True, - need_avg_shard=True) - ipu_strategy.SetPipeliningConfig( - enable_pipelinin=True, batches_per_step=bps) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=False) + ipu_strategy.set_pipelining_config(batches_per_step=bps) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py index 6e58f809046..5f0eeaa2f99 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,14 +26,14 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() def set_atol(self): self.atol = 1e-3 - def set_feed(self): + def set_data_feed(self): self.feed = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'), } @@ -47,23 +41,20 @@ class TestBase(IPUOpTest): def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed.values()] self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_dtype = [x.dtype for x in self.feed.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float16' def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], @@ -82,8 +73,8 @@ class TestBase(IPUOpTest): if run_ipu: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: @@ -103,27 +94,91 @@ class TestBase(IPUOpTest): self.assertTrue(res0.shape == res1.shape) -class TestCase1(TestBase): - def set_attrs(self): +class TestCase2(TestBase): + def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'), + } + + def set_op_attrs(self): + self.attrs = {} 
+ self.attrs['dtype'] = 'float32' + + +class TestCase3(TestBase): + def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'), + } + + def set_op_attrs(self): + self.attrs = {} + self.attrs['dtype'] = 'int32' + + +class TestCase4(TestBase): + def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('int32'), + } + + def set_op_attrs(self): + self.attrs = {} + self.attrs['dtype'] = 'float32' + + +class TestCase5(TestBase): + def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'), + } + + def set_op_attrs(self): + self.attrs = {} + self.attrs['dtype'] = 'int32' + + +class TestCase6(TestBase): + def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('int32'), + } + + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float16' @unittest.skip('float64 is not supported') class TestCase2(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float64' @unittest.skip('skip float16 to float32') class TestCase3(TestBase): - def set_feed(self): + def set_data_feed(self): self.feed = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'), } - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float32' @@ -133,13 +188,13 @@ class TestCase4(TestBase): def set_atol(self): self.atol = 1 - def set_feed(self): + def set_data_feed(self): self.feed = { "x": np.random.randint( low=1, high=100, size=[1, 3, 3, 3]).astype('int32'), } - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'int8' diff --git a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py index 094b19ce99d..c5a80902839 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py @@ -16,14 +16,9 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,81 +27,95 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data1 = np.random.uniform(size=[1, 3, 10, 10]) + data2 = np.random.uniform(size=[1, 3, 10, 10]) - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - "y": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), + self.feed_fp32 = { + 'x': data1.astype(np.float32), + 'y': data2.astype(np.float32) + } + self.feed_fp16 = { + 'x': data1.astype(np.float16), + 'y': data2.astype(np.float16) } def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in 
self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axis": 0} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') + out = paddle.fluid.layers.concat([x, y], **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axis": 1} diff --git a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py index f28733de6b1..ade54fda869 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py @@ -16,13 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,20 +26,30 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() - self.set_attrs() - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([1, 3, 10, 10]) - - self.feed = {} - self.feed["in_0"] = np.random.uniform( - 
size=self.feed_shape[0]).astype(np.float32) - - self.feed_list = list(self.feed.keys()) - - def set_attrs(self): + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): self.attrs = {} self.attrs['num_filters'] = 3 self.attrs['filter_size'] = 3 @@ -54,104 +59,112 @@ class TestBase(IPUOpTest): self.attrs['groups'] = 1 self.attrs['data_format'] = 'NCHW' - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): image = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.conv2d(image, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['num_filters'] = 1 class TestCase2(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['filter_size'] = [3, 3] class TestCase2_1(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['filter_size'] = [3, 2] class TestCase3(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['stride'] = [2, 3] 
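Note: every rewritten test in this series funnels its per-mode outputs through IPUOpTest.check(), which treats CPU FP32 as the reference, compares the IPU FP32 result with the fp32 tolerances, and casts the IPU_POPART_FP16 result up to float32 before comparing with the looser fp16 tolerances. A condensed NumPy sketch of that comparison, using stand-in arrays and the tolerances this conv test configures (atol=rtol=1e-6 for fp32, 1e-3 for fp16), not part of the patch:

import numpy as np

# Stand-in outputs; the real values come from exe.run() in the three modes.
cpu_fp32 = np.random.rand(16).astype(np.float32)
ipu_fp32 = cpu_fp32 + 1e-7                     # pretend IPU fp32 result
ipu_popart_fp16 = cpu_fp32.astype(np.float16)  # pretend IPU popart fp16 result

# fp32 path: compare against the CPU reference with the fp32 tolerances.
assert np.allclose(cpu_fp32, ipu_fp32, rtol=1e-6, atol=1e-6)

# fp16 path: cast up to fp32, then use the looser fp16 tolerances.
assert np.allclose(ipu_popart_fp16.astype(np.float32), cpu_fp32, rtol=1e-3, atol=1e-3)

print("max fp32 diff:", np.abs(cpu_fp32 - ipu_fp32).max())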
class TestCase4(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['dilation'] = [2, 2] class TestCase5(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['groups'] = 3 class TestCase6(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['padding'] = 2 class TestCase7(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['padding'] = [2, 3] class TestCase8(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['padding'] = [1, 2, 2, 3] diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py index 3987c6cd5b3..3a21f0cb007 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py @@ -16,14 +16,8 @@ import unittest import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,44 +26,54 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() - - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[3, 7]).astype('float32'), - "label": np.arange(3).reshape([3]).astype(np.int64), + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.random.uniform(size=[3, 7]) + label = np.arange(3).reshape([3, 1]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) } def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {'soft_label': False, } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def np_nll_loss(self): + tmp = -np.log(self.feed_fp32['x']) + label = self.feed_fp32['label'] + indice = [range(label.shape[0]), label.flatten()] + self.np_ref = tmp[indice] + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype="float32") - # [warning] Copying (host) tensor input/1 from INT64 to INT32. 
- # Will only warn once - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: label = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], @@ -80,52 +84,78 @@ class TestBase(IPUOpTest): shape=self.feed_shape[1], dtype='int64') - out = fluid.layers.cross_entropy( + out = paddle.fluid.layers.cross_entropy( input=x, label=label, **self.attrs) + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + if exec_mode != ExecutionMode.CPU_FP32: + feed['label'] = feed['label'].astype(np.int32) - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] - self.assertTrue(res0.shape == res1.shape) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + self.np_nll_loss() + + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { 'soft_label': False, 'ignore_index': 1, } -@unittest.skip("soft_label=True id not supported") class TestCase2(TestBase): - def set_attrs(self): + def set_data_feed(self): + x = np.random.uniform(size=[30, 70]) + label = np.arange(30).reshape([30, 1]) + + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) + } + + +@unittest.skip("soft_label=True is not supported") +class TestCase3(TestBase): + def set_op_attrs(self): self.attrs = {'soft_label': True, } diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py new file mode 100644 index 00000000000..2f1d86daf00 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py @@ -0,0 +1,123 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
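Note: the new cumsum test whose license header appears just above exercises the exclusive and reverse attributes (see the TestCase1 to TestCase3 variants further down). As a quick reference, and assuming the ONNX-style cumsum conventions that popart consumes, the variants behave like this NumPy sketch (toy input, not part of the patch):

import numpy as np

x = np.array([1., 2., 3., 4.])

inclusive = np.cumsum(x)                               # [ 1.  3.  6. 10.]
exclusive = np.concatenate(([0.], np.cumsum(x)[:-1]))  # [ 0.  1.  3.  6.]
reverse = np.cumsum(x[::-1])[::-1]                     # [10.  9.  7.  4.]

print(inclusive, exclusive, reverse)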
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + # popart unsupport fp16 cumsum + @property + def fp16_enabled(self): + return False + + def set_data_feed(self): + x = np.random.uniform(size=[1, 128]) + self.feed_fp32 = {"x": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_op_attrs(self): + self.attrs = {} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + + out = paddle.fluid.layers.cumsum(x, **self.attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = {"exclusive": True, "reverse": False} + + +class TestCase2(TestBase): + def set_op_attrs(self): + self.attrs = {"exclusive": False, "reverse": True} + + +class TestCase3(TestBase): + def set_op_attrs(self): + self.attrs = {"exclusive": True, "reverse": True} + + +if __name__ == "__main__": + unittest.main() -- GitLab From 6af2729e615a8d6b3b4f96964f1c71d20b8f5517 Mon Sep 17 00:00:00 2001 From: crystal <62974595+Zjq9409@users.noreply.github.com> Date: Wed, 2 Mar 2022 15:45:28 +0800 Subject: [PATCH 052/272] =?UTF-8?q?=E3=80=90phi=E3=80=91migrate=20gather?= =?UTF-8?q?=5Ftree,reduce=5Fprod=20to=20phi=20(#39844)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * move to phi * migrate gather_tree_op into phi * move reduce_prod tp phi * optimize code --- paddle/fluid/operators/gather_tree_op.cc | 4 +- paddle/fluid/operators/gather_tree_op.cu | 84 ------------------- paddle/fluid/operators/gather_tree_op.h | 66 --------------- .../operators/reduce_ops/reduce_prod_op.cc | 10 +-- 
.../operators/reduce_ops/reduce_prod_op.h | 7 -- paddle/phi/kernels/cpu/gather_tree_kernel.cc | 62 ++++++++++++++ paddle/phi/kernels/cpu/reduce_prod_kernel.cc | 44 ++++++++++ paddle/phi/kernels/funcs/reduce_functor.h | 8 ++ .../kernels/gather_tree_kernel.h} | 21 +++-- paddle/phi/kernels/gpu/gather_tree_kernel.cu | 79 +++++++++++++++++ paddle/phi/kernels/gpu/reduce_prod_kernel.cu | 43 ++++++++++ paddle/phi/kernels/reduce_prod_kernel.h | 29 +++++++ paddle/phi/ops/compat/reduce_sig.cc | 6 ++ 13 files changed, 285 insertions(+), 178 deletions(-) delete mode 100644 paddle/fluid/operators/gather_tree_op.cu delete mode 100644 paddle/fluid/operators/gather_tree_op.h create mode 100644 paddle/phi/kernels/cpu/gather_tree_kernel.cc create mode 100644 paddle/phi/kernels/cpu/reduce_prod_kernel.cc rename paddle/{fluid/operators/reduce_ops/reduce_prod_op.cu => phi/kernels/gather_tree_kernel.h} (51%) create mode 100644 paddle/phi/kernels/gpu/gather_tree_kernel.cu create mode 100644 paddle/phi/kernels/gpu/reduce_prod_kernel.cu create mode 100644 paddle/phi/kernels/reduce_prod_kernel.h diff --git a/paddle/fluid/operators/gather_tree_op.cc b/paddle/fluid/operators/gather_tree_op.cc index 830134e57e0..2868c3697ed 100644 --- a/paddle/fluid/operators/gather_tree_op.cc +++ b/paddle/fluid/operators/gather_tree_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather_tree_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -73,5 +73,3 @@ selected ids. namespace ops = paddle::operators; REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker); -REGISTER_OP_CPU_KERNEL(gather_tree, ops::GatherTreeOpKernel, - ops::GatherTreeOpKernel); diff --git a/paddle/fluid/operators/gather_tree_op.cu b/paddle/fluid/operators/gather_tree_op.cu deleted file mode 100644 index 829682764a6..00000000000 --- a/paddle/fluid/operators/gather_tree_op.cu +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_tree_op.h" - -namespace paddle { -namespace operators { - -template -__global__ void GatherTree(const T *ids_data, const T *parents_data, - T *out_data, const int64_t max_length, - const int64_t batch_size, const int64_t beam_size) { - CUDA_KERNEL_LOOP(i, batch_size * beam_size) { - int batch = i / beam_size; - int beam = i % beam_size; - auto idx = - (max_length - 1) * batch_size * beam_size + batch * beam_size + beam; - out_data[idx] = ids_data[idx]; - auto parent = parents_data[idx]; - for (int step = max_length - 2; step >= 0; step--) { - idx = step * batch_size * beam_size + batch * beam_size; - out_data[idx + beam] = ids_data[idx + parent]; - parent = parents_data[idx + parent]; - } - } -} - -template -class GatherTreeOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids = ctx.Input("Ids"); - auto *parents = ctx.Input("Parents"); - auto *out = ctx.Output("Out"); - - const auto *ids_data = ids->data(); - const auto *parents_data = parents->data(); - auto *out_data = out->mutable_data(ctx.GetPlace()); - - PADDLE_ENFORCE_NOT_NULL( - ids_data, platform::errors::InvalidArgument( - "Input(Ids) of gather_tree should not be null.")); - - PADDLE_ENFORCE_NOT_NULL( - parents_data, platform::errors::InvalidArgument( - "Input(Parents) of gather_tree should not be null.")); - - auto &ids_dims = ids->dims(); - int64_t max_length = ids_dims[0]; - int64_t batch_size = ids_dims[1]; - int64_t beam_size = ids_dims[2]; - - auto &dev_ctx = ctx.cuda_device_context(); - - const int block = 512; - int max_threads = - std::min(static_cast(dev_ctx.GetMaxPhysicalThreadCount()), - batch_size * beam_size); - const int grid = std::max(max_threads / block, 1); - GatherTree<<>>(ids_data, parents_data, out_data, max_length, - batch_size, beam_size); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(gather_tree, ops::GatherTreeOpCUDAKernel, - ops::GatherTreeOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_tree_op.h b/paddle/fluid/operators/gather_tree_op.h deleted file mode 100644 index e035a30e795..00000000000 --- a/paddle/fluid/operators/gather_tree_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class GatherTreeOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids = ctx.Input("Ids"); - auto *parents = ctx.Input("Parents"); - auto *out = ctx.Output("Out"); - - const auto *ids_data = ids->data(); - const auto *parents_data = parents->data(); - auto *out_data = out->mutable_data(ctx.GetPlace()); - - auto &ids_dims = ids->dims(); - auto max_length = ids_dims[0]; - auto batch_size = ids_dims[1]; - auto beam_size = ids_dims[2]; - - PADDLE_ENFORCE_NOT_NULL( - ids_data, platform::errors::InvalidArgument( - "Input(Ids) of gather_tree should not be null.")); - - PADDLE_ENFORCE_NOT_NULL( - parents_data, platform::errors::InvalidArgument( - "Input(Parents) of gather_tree should not be null.")); - - for (int batch = 0; batch < batch_size; batch++) { - for (int beam = 0; beam < beam_size; beam++) { - auto idx = (max_length - 1) * batch_size * beam_size + - batch * beam_size + beam; - out_data[idx] = ids_data[idx]; - auto parent = parents_data[idx]; - for (int step = max_length - 2; step >= 0; step--) { - idx = step * batch_size * beam_size + batch * beam_size; - out_data[idx + beam] = ids_data[idx + parent]; - parent = parents_data[idx + parent]; - } - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc index 50df75d9ad3..eb745ab9c56 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc @@ -27,15 +27,7 @@ class CPUDeviceContext; } // namespace paddle REGISTER_REDUCE_OP(reduce_prod); -REGISTER_OP_CPU_KERNEL(reduce_prod, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); + REGISTER_OP_CPU_KERNEL(reduce_prod_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.h b/paddle/fluid/operators/reduce_ops/reduce_prod_op.h index 103e108e4bd..60dedf8d6ff 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.h @@ -19,13 +19,6 @@ namespace paddle { namespace operators { -struct ProdFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->prod(dim); - } -}; - struct ProdGradFunctor { template diff --git a/paddle/phi/kernels/cpu/gather_tree_kernel.cc b/paddle/phi/kernels/cpu/gather_tree_kernel.cc new file mode 100644 index 00000000000..25fb870d851 --- /dev/null +++ b/paddle/phi/kernels/cpu/gather_tree_kernel.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
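Note on semantics: gather_tree reconstructs full beam-search sequences by starting from the last time step and walking the parent pointers backwards, exactly as the deleted fluid kernels above and the new phi kernels below do. A NumPy sketch of that backtracking (ref_gather_tree is a made-up name for illustration, not an in-tree helper):

# ---- illustrative sketch, not part of the diff ----
import numpy as np

def ref_gather_tree(ids, parents):
    # ids, parents: integer arrays of shape [max_length, batch_size, beam_size]
    max_length, batch_size, beam_size = ids.shape
    out = np.empty_like(ids)
    for batch in range(batch_size):
        for beam in range(beam_size):
            # the last step is copied as-is, then the parent chain is followed backwards
            out[max_length - 1, batch, beam] = ids[max_length - 1, batch, beam]
            parent = parents[max_length - 1, batch, beam]
            for step in range(max_length - 2, -1, -1):
                out[step, batch, beam] = ids[step, batch, parent]
                parent = parents[step, batch, parent]
    return out
# ---- end sketch ----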
+ +#include "paddle/phi/kernels/gather_tree_kernel.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void GatherTreeKernel(const Context &dev_ctx, + const DenseTensor &ids, + const DenseTensor &parents, + DenseTensor *out) { + const auto *ids_data = ids.data(); + const auto *parents_data = parents.data(); + + T *out_data = dev_ctx.template Alloc(out); + + auto &ids_dims = ids.dims(); + auto max_length = ids_dims[0]; + auto batch_size = ids_dims[1]; + auto beam_size = ids_dims[2]; + + PADDLE_ENFORCE_NOT_NULL(ids_data, + phi::errors::InvalidArgument( + "Input(Ids) of gather_tree should not be null.")); + + PADDLE_ENFORCE_NOT_NULL( + parents_data, + phi::errors::InvalidArgument( + "Input(Parents) of gather_tree should not be null.")); + + for (int batch = 0; batch < batch_size; batch++) { + for (int beam = 0; beam < beam_size; beam++) { + auto idx = + (max_length - 1) * batch_size * beam_size + batch * beam_size + beam; + out_data[idx] = ids_data[idx]; + auto parent = parents_data[idx]; + for (int step = max_length - 2; step >= 0; step--) { + idx = step * batch_size * beam_size + batch * beam_size; + out_data[idx + beam] = ids_data[idx + parent]; + parent = parents_data[idx + parent]; + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + gather_tree, CPU, ALL_LAYOUT, phi::GatherTreeKernel, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc new file mode 100644 index 00000000000..cf0179124eb --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/reduce_prod_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void ReduceProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(reduce_prod, + CPU, + ALL_LAYOUT, + phi::ReduceProdKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h index ce8e095e8ac..aebd155ac59 100644 --- a/paddle/phi/kernels/funcs/reduce_functor.h +++ b/paddle/phi/kernels/funcs/reduce_functor.h @@ -33,5 +33,13 @@ struct MeanFunctor { } }; +//////// Prod Functor /////// +struct ProdFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->prod(dim); + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/phi/kernels/gather_tree_kernel.h similarity index 51% rename from paddle/fluid/operators/reduce_ops/reduce_prod_op.cu rename to paddle/phi/kernels/gather_tree_kernel.h index 2de647df8b1..e5a1a684dae 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/phi/kernels/gather_tree_kernel.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" +#pragma once -REGISTER_OP_CUDA_KERNEL( - reduce_prod, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void GatherTreeKernel(const Context &dev_ctx, + const DenseTensor &ids, + const DenseTensor &parents, + DenseTensor *out); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/gather_tree_kernel.cu b/paddle/phi/kernels/gpu/gather_tree_kernel.cu new file mode 100644 index 00000000000..a9e73ec37c8 --- /dev/null +++ b/paddle/phi/kernels/gpu/gather_tree_kernel.cu @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gather_tree_kernel.h" + +namespace phi { + +template +__global__ void GatherTree(const T *ids_data, + const T *parents_data, + T *out_data, + const int64_t max_length, + const int64_t batch_size, + const int64_t beam_size) { + CUDA_KERNEL_LOOP(i, batch_size * beam_size) { + int batch = i / beam_size; + int beam = i % beam_size; + auto idx = + (max_length - 1) * batch_size * beam_size + batch * beam_size + beam; + out_data[idx] = ids_data[idx]; + auto parent = parents_data[idx]; + for (int step = max_length - 2; step >= 0; step--) { + idx = step * batch_size * beam_size + batch * beam_size; + out_data[idx + beam] = ids_data[idx + parent]; + parent = parents_data[idx + parent]; + } + } +} + +template +void GatherTreeKernel(const Context &dev_ctx, + const DenseTensor &ids, + const DenseTensor &parents, + DenseTensor *out) { + const auto *ids_data = ids.data(); + const auto *parents_data = parents.data(); + T *out_data = dev_ctx.template Alloc(out); + + PADDLE_ENFORCE_NOT_NULL(ids_data, + phi::errors::InvalidArgument( + "Input(Ids) of gather_tree should not be null.")); + + PADDLE_ENFORCE_NOT_NULL( + parents_data, + phi::errors::InvalidArgument( + "Input(Parents) of gather_tree should not be null.")); + + auto &ids_dims = ids.dims(); + int64_t max_length = ids_dims[0]; + int64_t batch_size = ids_dims[1]; + int64_t beam_size = ids_dims[2]; + + const int block = 512; + int max_threads = + std::min(static_cast(dev_ctx.GetMaxPhysicalThreadCount()), + batch_size * beam_size); + const int grid = std::max(max_threads / block, 1); + GatherTree<<>>( + ids_data, parents_data, out_data, max_length, batch_size, beam_size); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + gather_tree, GPU, ALL_LAYOUT, phi::GatherTreeKernel, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu new file mode 100644 index 00000000000..14084d0f4f3 --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/reduce_prod_kernel.h" + +namespace phi { + +template +void ReduceProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(reduce_prod, + GPU, + ALL_LAYOUT, + phi::ReduceProdKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/reduce_prod_kernel.h b/paddle/phi/kernels/reduce_prod_kernel.h new file mode 100644 index 00000000000..5e92b6c4db1 --- /dev/null +++ b/paddle/phi/kernels/reduce_prod_kernel.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ReduceProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index 6395486ed2b..92839fb3030 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -51,6 +51,11 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("unregistered", {}, {}, {}); } +KernelSignature ReduceProdOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "reduce_prod", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); @@ -58,3 +63,4 @@ PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_prod, phi::ReduceProdOpArgumentMapping); -- GitLab From c9cd47d96b2cccb34d8dc269a055f5b64346a10e Mon Sep 17 00:00:00 2001 From: JZ-LIANG Date: Wed, 2 Mar 2022 15:58:57 +0800 Subject: [PATCH 053/272] [Auto Parallel] Adapt Partitioner & DistOp for ERNIE3.0 Inference and cache (#39895) * adapot dist op * add dist_fill_constant_batch_size_like * remvoe print * update compitable * add unitest --- .../auto_parallel/operators/__init__.py | 1 + .../auto_parallel/operators/dist_eltwise.py | 0 .../auto_parallel/operators/dist_embedding.py | 5 +- .../dist_fill_constant_batch_size_like.py | 127 ++++++++++++++++++ .../auto_parallel/operators/dist_matmul.py | 8 +- .../distributed/auto_parallel/partitioner.py | 3 + .../test_auto_parallel_while_op.py | 28 ++++ 7 files changed, 168 insertions(+), 4 deletions(-) mode change 100755 => 100644 python/paddle/distributed/auto_parallel/operators/dist_eltwise.py create mode 100644 python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py diff 
--git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index 9f84df2d896..db6f909f8ca 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -27,3 +27,4 @@ from . import dist_eltwise from . import dist_check_finite_and_unscale from . import dist_update_loss_scaling from . import dist_split +from . import dist_fill_constant_batch_size_like diff --git a/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py b/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py old mode 100755 new mode 100644 diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py index 94eb0d2d469..32f8e2acef5 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py @@ -155,7 +155,7 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl): kwargs['Out']) Ids_var = main_block.var(kwargs['Ids'][0]) - Weight_var = main_block.var(kwargs['W'][0]) + Weight_var = main_block._var_recursive(kwargs['W'][0]) Out_var = main_block.var(kwargs['Out'][0]) # got dist attribute info @@ -277,7 +277,8 @@ class DistributedEmbeddingImpl(DistributedOperatorImpl): # param initialization sync if Weight_var.is_parameter and not op_dist_attr.is_recompute: - assert Weight_var.name not in dist_op_context.already_init_sync_vars + if Weight_var.name in dist_op_context.already_init_sync_vars: + return dist_op_context.already_init_sync_vars.add(Weight_var.name) param = startup_block.var(Weight_var.name) param_dist_attr = ctx.get_tensor_dist_attr_for_program(param) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py new file mode 100644 index 00000000000..0c9d9eda02e --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py @@ -0,0 +1,127 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License + +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from ..utils import is_dim_shard +from ..utils import is_dim_replicate +from ..utils import is_valid_list_index +from ..utils import compute_compatible_dim_mapping +from ..utils import compute_compatible_dims_mapping +from ..utils import compute_compatible_and_update_dim_mapping +from ..utils import set_dist_op_desc_original_id +from paddle.fluid import core, unique_name +from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.framework import Program, Parameter, Variable, program_guard +from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype +from .dist_default import DistributedDefaultImpl0 + + +class DistributedFillConstantBatchSizeLike(DistributedOperatorImplContainer): + def __init__(self, op_type): + super(DistributedFillConstantBatchSizeLike, self).__init__(op_type) + + +register_distributed_operator_impl_container( + DistributedFillConstantBatchSizeLike("fill_constant_batch_size_like")) + + +class DistributedFillConstantBatchSizeLikeImpl0(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedFillConstantBatchSizeLikeImpl0, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + + return True + + def is_output_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + shape_list = op_desc.attr("shape") + + if len(shape_list) != len(out_dims_mapping): + return False + + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + in_name = op_desc.input('Input')[0] + in_dims_mapping = op_dist_attr.get_input_dims_mapping(in_name) + + # the dim_mapping of batch dimension should be the same + return out_dims_mapping[0] == in_dims_mapping[0] + + def update_dims_mapping(self, dist_op): + changed = False + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_name = op_desc.output('Out')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + # only the batch size dimemsion of input and output are relative. 
+ dim_changed = compute_compatible_and_update_dim_mapping( + [x_dims_mapping, out_dims_mapping], [0, 0]) + if dim_changed: + changed = True + + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + dist_op_context = ctx.dist_op_context + src_op = dist_op_context.cur_src_op + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + main_block = dist_op_context.work_block + op = main_block.ops[-1] + assert op.type == "fill_constant_batch_size_like" + + # modify shape attr according to how output are partitioned + out_name = op.output('Out')[0] + dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + process_mesh_shape = op_dist_attr.process_mesh.topology + shape_list = op.attr("shape") + # modify target shape + for idx, axis in enumerate(dims_mapping): + if axis >= 0: + shape_list[idx] = shape_list[idx] // process_mesh_shape[axis] + + op._set_attr("shape", shape_list) + main_block._sync_with_cpp() + + @staticmethod + def backward(ctx, *args, **kwargs): + DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + + +register_distributed_operator_impl( + "fill_constant_batch_size_like", + DistributedFillConstantBatchSizeLikeImpl0("fill_by_shape")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 9eb24a65e60..058ae1d0a9f 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -433,8 +433,8 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): def _init_param_sync(Weight_var, dist_op_context, startup_block, ctx, rank_id): - assert Weight_var.name not in dist_op_context.already_init_sync_vars, "{} is in {}.".format( - Weight_var.name, dist_op_context.already_init_sync_vars) + if Weight_var.name in dist_op_context.already_init_sync_vars: + return assert startup_block.has_var(Weight_var.name) dist_op_context.already_init_sync_vars.add(Weight_var.name) param = startup_block.var(Weight_var.name) @@ -819,6 +819,8 @@ class DistributedMatmulImpl1(DistributedOperatorImpl): out_var_dist_attr) intermediate_var_0 = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ["c_allreduce_sum", 'tmp'])), shape=Out_var.shape, dtype=Out_var.dtype, type=Out_var.type, @@ -1323,6 +1325,8 @@ class DistributedMatmulV2Impl1(DistributedOperatorImpl): out_var_dist_attr) intermediate_var_0 = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ["c_allreduce_sum", 'tmp'])), shape=Out_var.shape, dtype=Out_var.dtype, type=Out_var.type, diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index 2f88407c093..ed5ec85d84f 100644 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -285,6 +285,9 @@ def _get_dist_shape(var, dist_attr): var_shape = var.shape mapping = dist_attr.dims_mapping mesh = dist_attr.process_mesh.topology + if mapping == []: + return var_shape + assert len(var_shape) == len( mapping ), "variable shape [{}] and dim_mapping [{}] is NOT match !".format( diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py index 1cd8f8f3e70..07e6a2c4346 100644 
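The forward() above only has to rescale the shape attribute so that every rank materialises its local shard: any axis whose dims_mapping entry is non-negative is divided by the size of the mapped process-mesh dimension, and partitioner._get_dist_shape applies the same rule to variables. A standalone sketch (shard_shape is a made-up name), which also reproduces the expectation checked by the unit test that follows, where a [-1, 16, 0, 48] target mapped as [-1, 0, -1, -1] on a two-rank mesh axis becomes [-1, 8, 0, 48]:

# ---- illustrative sketch, not part of the diff ----
def shard_shape(shape, dims_mapping, mesh_topology):
    local = list(shape)
    for idx, axis in enumerate(dims_mapping):
        if axis >= 0:                       # this dim is sharded across mesh axis `axis`
            local[idx] = local[idx] // mesh_topology[axis]
    return local

# shard_shape([-1, 16, 0, 48], [-1, 0, -1, -1], [2]) -> [-1, 8, 0, 48]
# ---- end sketch ----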
--- a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py @@ -174,6 +174,7 @@ def get_program(): dtype='float32') label = static.data( name="label", shape=[batch_size, sequence_len, 1], dtype='float32') + data_holder = [input, label] # dataloader dataloader = paddle.io.DataLoader.from_generator( @@ -194,6 +195,17 @@ def get_program(): "dims_mapping": [-1, -1, -1] }) + # fill constant bsz like + tmp = paddle.fluid.layers.fill_constant_batch_size_like( + input=input, shape=[-1, 16, 0, 48], dtype='float32', value=0) + auto.shard_tensor( + tmp, + dist_attr={ + "process_mesh": _g_process_mesh, + "dims_mapping": [-1, 0, -1, -1] + }) + + # model mlp_start = MLPLayer( hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -395,6 +407,9 @@ def completion(train_program, start_program, dist_context): op_dist_attr.impl_idx = 0 else: op_dist_attr.impl_idx = 1 + elif op.type == "fill_constant_batch_size_like": + op_dist_attr.impl_type = "fill_constant_batch_size_like" + op_dist_attr.impl_idx = 0 else: op_dist_attr.impl_type = "default" op_dist_attr.impl_idx = 0 @@ -428,6 +443,12 @@ class TestMLP(unittest.TestCase): dist_main_prog, dist_startup_prog = partition( train_program, start_program, dist_context) global_block_ops = dist_main_prog.blocks[0].ops + + fill_op = None + for op in global_block_ops: + if op.type == "fill_constant_batch_size_like": + fill_op = op + global_block_ops = [op.type for op in global_block_ops] sub_block_ops = dist_main_prog.blocks[1].ops sub_block_ops = [op.type for op in sub_block_ops] @@ -435,6 +456,13 @@ class TestMLP(unittest.TestCase): self.assertTrue("c_allreduce_sum" in global_block_ops) self.assertTrue("c_allreduce_sum" in sub_block_ops) + # test fill_constant_batch_size_like + + self.assertTrue(fill_op is not None) + ref_shape = [-1, 8, 0, 48] + shape = fill_op.attr("shape") + self.assertTrue(ref_shape == shape) + if __name__ == "__main__": unittest.main() -- GitLab From 4a4215ffad5efada31dcdae9262a806635b1f226 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Wed, 2 Mar 2022 16:14:31 +0800 Subject: [PATCH 054/272] [bf16] add bf16 kernel: softmax & log_softmax (#39999) * add softmax log_softmax * refine rocm * refine unittest --- paddle/fluid/operators/log_softmax_op.cu | 16 ++-- paddle/fluid/operators/math/softmax.cu | 13 +++ paddle/fluid/operators/math/softmax_impl.h | 91 +++++++++++++++++++ .../platform/device/gpu/rocm/miopen_helper.h | 17 ++++ paddle/phi/common/amp_type_traits.h | 42 +++++++++ paddle/phi/common/bfloat16.h | 18 ++-- paddle/phi/common/float16.h | 12 --- paddle/phi/kernels/gpu/softmax_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/softmax_kernel.cu | 4 +- paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 12 +++ .../gpudnn/softmax_grad_kernel_gpudnn.cu | 14 ++- .../kernels/gpudnn/softmax_kernel_gpudnn.cu | 14 ++- .../fluid/tests/unittests/test_log_softmax.py | 30 +++++- .../fluid/tests/unittests/test_softmax_op.py | 52 ++++++++++- 14 files changed, 305 insertions(+), 34 deletions(-) create mode 100644 paddle/phi/common/amp_type_traits.h diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu index 034e67568b3..8770abdac83 100644 --- a/paddle/fluid/operators/log_softmax_op.cu +++ b/paddle/fluid/operators/log_softmax_op.cu @@ -13,9 +13,9 @@ // limitations under the License. 
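The log_softmax hunks here swap the fluid fp16 type-traits header for the new phi amp_type_traits and add bfloat16 registrations; the math itself is the usual numerically stable formulation that the unit tests also compare against via a NumPy reference. For orientation, one way to write that reference (details of the in-tree ref_log_softmax helper may differ):

# ---- illustrative sketch, not part of the diff ----
import numpy as np

def stable_log_softmax(x, axis=-1):
    # subtract the per-axis max first so exp() cannot overflow
    shifted = x - np.max(x, axis=axis, keepdims=True)
    return shifted - np.log(np.sum(np.exp(shifted), axis=axis, keepdims=True))
# ---- end sketch ----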
#include -#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/log_softmax_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/functors.h" @@ -311,7 +311,7 @@ void LaunchLogSoftmaxForwardCUDAKernelNotLastAxis(T *output_data, template class LogSoftmaxKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; + using MPDType = typename phi::dtype::MPTypeTrait::Type; public: void Compute(const framework::ExecutionContext &context) const override { @@ -433,7 +433,7 @@ void LaunchSoftmaxBackwardForLastAxis(T *grad_input, const T *grad_output, template class LogSoftmaxGradKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; + using MPDType = typename phi::dtype::MPTypeTrait::Type; public: void Compute(const framework::ExecutionContext &context) const override { @@ -468,16 +468,18 @@ class LogSoftmaxGradKernel } }; -} // operators -} // paddle +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( log_softmax, ops::LogSoftmaxKernel, ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); + ops::LogSoftmaxKernel, + ops::LogSoftmaxKernel); REGISTER_OP_CUDA_KERNEL( log_softmax_grad, ops::LogSoftmaxGradKernel, ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); + ops::LogSoftmaxGradKernel, + ops::LogSoftmaxGradKernel); diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index fd879e9e6ff..83b124902eb 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -120,6 +120,10 @@ template class SoftmaxCUDNNFunctor; template class SoftmaxCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; +#if CUDNN_VERSION_MIN(8, 1, 0) +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +#endif // MIOPEN do not support double #ifndef PADDLE_WITH_HIP @@ -131,6 +135,10 @@ template class SoftmaxFunctor; template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; @@ -139,9 +147,13 @@ template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; @@ -149,6 +161,7 @@ template class SoftmaxFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index d51d638e0c1..9833b4447ec 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -156,6 +156,65 @@ class SoftmaxEigen { } }; +template +class SoftmaxEigen { + public: + void operator()(const DeviceContext& context, const int axis_dim, + const framework::Tensor* X, framework::Tensor* Y) { + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + constexpr int kAxisDim = 1; + 
+ auto logits = EigenMatrix::From(*X); + auto softmax = EigenMatrix::From(*Y); + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_axis(kAxisDim); + Eigen::DSizes batch_classes(batch_size, num_classes); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_one_remain(batch_size, 1, num_remain); + Eigen::DSizes one_axis_one(1, axis_dim, 1); + Eigen::DSizes one_axis(1, axis_dim); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + + // For numerical stability, logits should be shifted by maximum number along + // axis, calculate shifted_logits into softmax tensor for memory reuse. + if (num_remain == 1) { + // axis == -1, axis and class in same dimension, calculate along + // class dimension directly for higher performance + softmax.device(*context.eigen_device()) = + (logits - + logits.maximum(along_axis) + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); + } else { + // axis != -1, class dimension split into (axis, remain), max and sum + // should be calculated along axis dimension + softmax.device(*context.eigen_device()) = + (logits.reshape(batch_axis_remain) - + logits.reshape(batch_axis_remain) + .maximum(along_axis) + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) + .unaryExpr(ValueClip()); + } + + softmax.device(*context.eigen_device()) = softmax.exp(); + softmax.device(*context.eigen_device()) = + (softmax * + softmax.reshape(batch_axis_remain) + .sum(along_axis) + .inverse() + .broadcast(one_axis)); + } +}; + template void SoftmaxFunctor::operator()( const DeviceContext& context, const int axis_dim, @@ -289,6 +348,38 @@ class SoftmaxGradEigen { } }; +template +class SoftmaxGradEigen { + public: + void operator()(const DeviceContext& context, const int axis_dim, + const framework::Tensor* y, const framework::Tensor* y_grad, + framework::Tensor* x_grad) { + auto softmax = EigenMatrix::From(*y); + auto softmax_grad = EigenMatrix::From(*y_grad); + auto logits_grad = EigenMatrix::From(*x_grad); + + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + + const int batch_size = softmax.dimension(kBatchDim); + const int num_classes = softmax.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + Eigen::DSizes one_axis(1, axis_dim); + + auto dot = (softmax * softmax_grad) + .reshape(batch_axis_remain) + .sum(along_class) + .broadcast(one_axis); + logits_grad.device(*context.eigen_device()) = + (softmax_grad - dot) * softmax; + } +}; + template void SoftmaxGradFunctor::operator()( const DeviceContext& context, const int axis_dim, diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h index 34b9d57e055..1a514d2aca2 100644 --- a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h @@ -140,6 +140,23 @@ class CudnnDataType { } }; +template <> +class CudnnDataType { + public: + static const miopenDataType_t type = miopenBFloat16; + // The scaling param type is float for HALF and FLOAT tensors + using ScalingParamType = const float; + using BatchNormParamType = float; + static 
ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + template <> class CudnnDataType { public: diff --git a/paddle/phi/common/amp_type_traits.h b/paddle/phi/common/amp_type_traits.h new file mode 100644 index 00000000000..ce3a469f5ae --- /dev/null +++ b/paddle/phi/common/amp_type_traits.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" + +namespace phi { +namespace dtype { + +template +class MPTypeTrait { + public: + using Type = T; +}; + +template <> +class MPTypeTrait { + public: + using Type = float; +}; + +template <> +class MPTypeTrait { + public: + using Type = float; +}; + +} // namespace dtype +} // namespace phi diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h index 3fd8eb1b268..cf99bb8f19a 100644 --- a/paddle/phi/common/bfloat16.h +++ b/paddle/phi/common/bfloat16.h @@ -377,31 +377,31 @@ struct numeric_limits { static const bool traps = true; static const bool tinyness_before = false; - static phi::dtype::bfloat16(min)() { + HOSTDEVICE static phi::dtype::bfloat16(min)() { return phi::dtype::raw_uint16_to_bfloat16(0x007f); } - static phi::dtype::bfloat16 lowest() { + HOSTDEVICE static phi::dtype::bfloat16 lowest() { return phi::dtype::raw_uint16_to_bfloat16(0xff7f); } - static phi::dtype::bfloat16(max)() { + HOSTDEVICE static phi::dtype::bfloat16(max)() { return phi::dtype::raw_uint16_to_bfloat16(0x7f7f); } - static phi::dtype::bfloat16 epsilon() { + HOSTDEVICE static phi::dtype::bfloat16 epsilon() { return phi::dtype::raw_uint16_to_bfloat16(0x3400); } - static phi::dtype::bfloat16 round_error() { + HOSTDEVICE static phi::dtype::bfloat16 round_error() { return phi::dtype::bfloat16(0.5); } - static phi::dtype::bfloat16 infinity() { + HOSTDEVICE static phi::dtype::bfloat16 infinity() { return phi::dtype::raw_uint16_to_bfloat16(0x7f80); } - static phi::dtype::bfloat16 quiet_NaN() { + HOSTDEVICE static phi::dtype::bfloat16 quiet_NaN() { return phi::dtype::raw_uint16_to_bfloat16(0xffc1); } - static phi::dtype::bfloat16 signaling_NaN() { + HOSTDEVICE static phi::dtype::bfloat16 signaling_NaN() { return phi::dtype::raw_uint16_to_bfloat16(0xff81); } - static phi::dtype::bfloat16 denorm_min() { + HOSTDEVICE static phi::dtype::bfloat16 denorm_min() { return phi::dtype::raw_uint16_to_bfloat16(0x0001); } }; diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 6ed9c88d705..1cdcdef2c12 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -988,18 +988,6 @@ inline std::ostream& operator<<(std::ostream& os, const float16& a) { return os; } -template -class MPTypeTrait { - public: - using Type = T; -}; - -template <> -class MPTypeTrait { - public: - using Type = float; -}; - } // namespace dtype } // namespace phi diff --git 
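Across this commit, bfloat16 kernels accumulate in float: phi::dtype::MPTypeTrait maps bfloat16 (like float16) to float as the math type, and the new tests build their inputs with convert_float_to_uint16. A bit-level sketch of why that matters, assuming the common convention that bfloat16 is simply the upper 16 bits of an IEEE float32 (the in-tree conversion helper may round rather than truncate):

# ---- illustrative sketch, not part of the diff ----
import numpy as np

def float_to_bf16_bits(x):
    # keep only the top 16 bits of the float32 pattern (plain truncation)
    return (np.asarray(x, dtype=np.float32).view(np.uint32) >> 16).astype(np.uint16)

def bf16_bits_to_float(bits):
    return (np.asarray(bits, dtype=np.uint32) << 16).view(np.float32)

print(bf16_bits_to_float(float_to_bf16_bits(np.float32(3.14159))))  # 3.140625: ~3 decimal digits survive
# ---- end sketch ----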
a/paddle/phi/kernels/gpu/softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/softmax_grad_kernel.cu index aa496d3cd39..04052e0dfc3 100644 --- a/paddle/phi/kernels/gpu/softmax_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/softmax_grad_kernel.cu @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/phi/kernels/softmax_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/softmax_grad_kernel_impl.h" @@ -25,4 +26,5 @@ PD_REGISTER_KERNEL(softmax_grad, phi::SoftmaxGradKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/softmax_kernel.cu b/paddle/phi/kernels/gpu/softmax_kernel.cu index 32efb9b7764..03c5714b967 100644 --- a/paddle/phi/kernels/gpu/softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/softmax_kernel.cu @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/phi/kernels/softmax_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/softmax_kernel_impl.h" @@ -25,4 +26,5 @@ PD_REGISTER_KERNEL(softmax, phi::SoftmaxRawKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 45798b88bb5..c9c549379bb 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -15,6 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/axis_utils.h" @@ -47,6 +49,11 @@ class VecT4 { public: using Type = int2; }; +template <> +class VecT4 { + public: + using Type = int2; +}; // Vectorization trait 2 * sizeof(T) template @@ -66,6 +73,11 @@ class VecT2 { public: using Type = int; }; +template <> +class VecT2 { + public: + using Type = int; +}; static inline int log2_ceil(int value) { int log2_value = 0; diff --git a/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu index 56e5fef6e37..45ab645d373 100644 --- a/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu +++ b/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu @@ -38,7 +38,18 @@ PD_REGISTER_KERNEL(softmax_grad, ALL_LAYOUT, phi::SoftmaxGradGPUDNNKernel, float, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(softmax_grad, + GPUDNN, + ALL_LAYOUT, + phi::SoftmaxGradGPUDNNKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} #else PD_REGISTER_KERNEL(softmax_grad, GPUDNN, @@ -48,3 +59,4 @@ PD_REGISTER_KERNEL(softmax_grad, double, phi::dtype::float16) {} #endif +#endif diff --git a/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu index 427d1729a13..7685c7dbb68 100644 --- a/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu +++ b/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu @@ -37,7 +37,18 @@ PD_REGISTER_KERNEL(softmax, ALL_LAYOUT, phi::SoftmaxRawGPUDNNKernel, 
float, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(softmax, + GPUDNN, + ALL_LAYOUT, + phi::SoftmaxRawGPUDNNKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} #else PD_REGISTER_KERNEL(softmax, GPUDNN, @@ -47,3 +58,4 @@ PD_REGISTER_KERNEL(softmax, double, phi::dtype::float16) {} #endif +#endif diff --git a/python/paddle/fluid/tests/unittests/test_log_softmax.py b/python/paddle/fluid/tests/unittests/test_log_softmax.py index d1437ca9c96..16f954708d4 100644 --- a/python/paddle/fluid/tests/unittests/test_log_softmax.py +++ b/python/paddle/fluid/tests/unittests/test_log_softmax.py @@ -14,8 +14,9 @@ import unittest import numpy as np -from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 import paddle +import paddle.fluid.core as core import paddle.nn.functional as F np.random.seed(10) @@ -74,6 +75,33 @@ class TestLogSoftmaxAxis(TestLogSoftmaxOp): self.axis = 1 +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestLogSoftmaxBF16Op(OpTest): + def setUp(self): + self.op_type = 'log_softmax' + self.dtype = np.uint16 + self.shape = [2, 3, 4, 5] + self.axis = -1 + + x = np.random.uniform(0.1, 1., self.shape).astype(np.float32) + out = np.apply_along_axis(ref_log_softmax, self.axis, x) + self.x_grad = ref_log_softmax_grad(x, self.axis) + + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': convert_float_to_uint16(out)} + self.attrs = {'axis': self.axis} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], ['Out'], user_defined_grads=[self.x_grad]) + + class TestNNLogSoftmaxAPI(unittest.TestCase): def setUp(self): self.x_shape = [2, 3, 4, 5] diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index a1cbefa40f3..4f1c37a2424 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard @@ -296,6 +296,56 @@ class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp): return [2, 3, 4, 5] +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxBF16Op(OpTest): + def setUp(self): + self.op_type = "softmax" + self.use_cudnn = self.init_cudnn() + self.use_mkldnn = False + self.dtype = np.uint16 + self.shape = [10, 10] + self.axis = -1 + + np.random.seed(0) + x = np.random.uniform(0.1, 1, self.shape).astype(np.float32) + out = np.apply_along_axis(stable_softmax, self.axis, x) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(x)) + } + self.outputs = {'Out': convert_float_to_uint16(out)} + self.attrs = { + 'axis': self.axis, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn + } + + def init_cudnn(self): + return False + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place( + place, check_dygraph=(self.use_mkldnn == False)) + + def test_check_grad(self): + 
place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ["X"], + "Out", + numeric_grad_delta=0.05, + check_dygraph=(self.use_mkldnn == False)) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, + "core is not compiled with CUDA and cudnn version need larger than 8.1.0") +class TestSoftmaxBF16CUDNNOp(TestSoftmaxBF16Op): + def init_cudnn(self): + return True + + class TestSoftmaxAPI(unittest.TestCase): def setUp(self): self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( -- GitLab From 07dad6d6ec415758d520e33960a0c53e50ef2ab5 Mon Sep 17 00:00:00 2001 From: huzhiqiang <912790387@qq.com> Date: Wed, 2 Mar 2022 02:16:04 -0600 Subject: [PATCH 055/272] [Infrt]add phi kernel dialect (#39726) --- .gitignore | 3 + .../pybind/kernel_signature_generator.cc | 26 +- paddle/infrt/dialect/infrt/common_type.h | 18 +- paddle/infrt/dialect/infrt/infrt_ops_base.td | 7 +- paddle/infrt/dialect/init_infrt_dialects.cc | 4 + paddle/infrt/dialect/phi/ir/CMakeLists.txt | 7 +- .../infrt/dialect/phi/ir/infrt_phi_kernel.td | 24 +- .../infrt/dialect/phi/ir/infrt_phi_tensor.td | 11 +- paddle/infrt/dialect/phi/ir/phi_kernels.cc | 44 +++ paddle/infrt/dialect/phi/ir/phi_kernels.h | 42 +++ .../infrt/dialect/phi/pass/kernel_op_desc.cc | 45 ++- paddle/infrt/host_context/mlir_exec.cc | 2 + paddle/infrt/kernel/phi/context_kernels.cc | 8 +- paddle/infrt/kernel/phi/context_kernels.h | 3 +- .../infrt/kernel/phi/dense_tensor_kernels.cc | 34 ++- .../infrt/kernel/phi/dense_tensor_kernels.h | 3 +- .../infershaped/infershape_launchers_test.cc | 2 +- paddle/infrt/kernel/phi/registry.cc | 2 + .../tests/dialect/pten/dense_tensor.mlir | 12 +- paddle/scripts/infrt_build.sh | 4 +- tools/infrt/generate_phi_kernel_dialect.py | 276 ++++++++++++++++++ tools/infrt/get_phi_kernel_info.py | 12 +- 22 files changed, 536 insertions(+), 53 deletions(-) create mode 100644 paddle/infrt/dialect/phi/ir/phi_kernels.cc create mode 100644 paddle/infrt/dialect/phi/ir/phi_kernels.h create mode 100644 tools/infrt/generate_phi_kernel_dialect.py diff --git a/.gitignore b/.gitignore index cecd6fa91c7..debec551d9c 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,9 @@ tools/__pycache__ # This file is automatically generated. # TODO(zhiqiang) Move this file to build directory. 
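Further below, kernel_signature_generator.cc stops printing a trailing comma after every element and "deleting" it with a backspace character, and instead emits the first element followed by comma-prefixed ones, keyed by kernel name with inputs/attrs/outputs lists. For comparison, producing that kind of signature map from Python is a one-liner with the json module (the entries here are examples, not the tool's actual output):

# ---- illustrative sketch, not part of the diff ----
import json

signatures = {
    "reduce_prod": {"inputs": ["X"],
                    "attrs": ["dim", "keep_dim", "reduce_all"],
                    "outputs": ["Out"]},
}
print(json.dumps(signatures))   # quoting and comma placement handled by the library
# ---- end sketch ----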
paddle/infrt/dialect/pd_ops.td +paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td +paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td +tools/infrt/kernels.json paddle/infrt/dialect/pd_ops_info.h .lit_test_times.txt paddle/infrt/tests/dialect/Output diff --git a/paddle/fluid/pybind/kernel_signature_generator.cc b/paddle/fluid/pybind/kernel_signature_generator.cc index 8283a249ded..f0d5a4e477f 100644 --- a/paddle/fluid/pybind/kernel_signature_generator.cc +++ b/paddle/fluid/pybind/kernel_signature_generator.cc @@ -49,24 +49,30 @@ int main(int argc, char **argv) { if (kernel_signature_map.Has(op_kernel_pair.first)) { std::cout << "\"" << op_kernel_pair.first << "\":{"; auto &args = kernel_signature_map.Get(op_kernel_pair.first).args; + std::cout << "\"inputs\":["; - for (auto name : std::get<0>(args)) { - std::cout << "\"" << name << "\","; + auto inputs_ = std::get<0>(args); + if (inputs_.size() > 0) std::cout << inputs_[0]; + for (size_t i = 1; i < inputs_.size(); i++) { + std::cout << ",\"" << inputs_[i] << "\""; } - if (std::get<0>(args).size() > 0) std::cout << "\b"; + std::cout << "],\"attrs\":["; - for (auto name : std::get<1>(args)) { - std::cout << "\"" << name << "\","; + auto attrs_ = std::get<1>(args); + if (attrs_.size() > 0) std::cout << attrs_[0]; + for (size_t i = 1; i < attrs_.size(); i++) { + std::cout << ",\"" << attrs_[i] << "\""; } - if (std::get<1>(args).size() > 0) std::cout << "\b"; + std::cout << "],\"outputs\":["; - for (auto name : std::get<2>(args)) { - std::cout << "\"" << name << "\","; + auto outputs_ = std::get<2>(args); + for (size_t i = 1; i < outputs_.size(); i++) { + std::cout << ",\"" << outputs_[i] << "\""; } - if (std::get<2>(args).size() > 0) std::cout << "\b"; + std::cout << "]},"; } } - std::cout << "\b}" << std::endl; + std::cout << "}" << std::endl; return 0; } diff --git a/paddle/infrt/dialect/infrt/common_type.h b/paddle/infrt/dialect/infrt/common_type.h index d6d6503c03b..436e7920ca5 100644 --- a/paddle/infrt/dialect/infrt/common_type.h +++ b/paddle/infrt/dialect/infrt/common_type.h @@ -21,8 +21,22 @@ namespace infrt { enum class TargetType : uint8_t { CPU, GPU, UNK }; -enum class PrecisionType : uint8_t { FLOAT32, FLOAT16, UNK }; -enum class LayoutType : uint8_t { NCHW, NHWC, UNK }; +enum class LayoutType : uint8_t { NCHW, NHWC, ANY, UNK }; +enum class PrecisionType : uint8_t { + UINT8, + INT8, + INT16, + INT32, + INT64, + FLOAT16, + BFLOAT16, + FLOAT32, + FLOAT64, + COMPLEX64, + COMPLEX128, + BOOL, + UNK +}; struct Place { TargetType target; diff --git a/paddle/infrt/dialect/infrt/infrt_ops_base.td b/paddle/infrt/dialect/infrt/infrt_ops_base.td index 978b126d754..f19912dc0cd 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops_base.td +++ b/paddle/infrt/dialect/infrt/infrt_ops_base.td @@ -34,9 +34,10 @@ def DenseTensor : Infrt_Type<"DenseTensor"> { let summary = "infrt dense tensor"; let description = [{dense_tensor<, 3>}]; let parameters = (ins - "TargetType":$target, - "PrecisionType":$precision, - "LayoutType":$layout + "::infrt::TargetType":$target, + "::infrt::PrecisionType":$precision, + "::infrt::LayoutType":$layout + ); } diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc index c5c81b4b0f2..5eae0171936 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -23,6 +23,8 @@ #include "paddle/infrt/dialect/pd_ops.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" +#include 
"paddle/infrt/dialect/phi/ir/phi_kernels.h" + #include "paddle/infrt/dialect/tensor_shape.h" namespace infrt { @@ -34,6 +36,8 @@ void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT mlir::pd::PaddleDialect, #ifdef INFRT_WITH_PHI phi::PHIDenseTensorDialect, + phi::PHICPUKernelDialect, + phi::PHIGPUKernelDialect, phi::PHIDialect #endif >(); diff --git a/paddle/infrt/dialect/phi/ir/CMakeLists.txt b/paddle/infrt/dialect/phi/ir/CMakeLists.txt index 8c1d75629d0..0497b983211 100644 --- a/paddle/infrt/dialect/phi/ir/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/ir/CMakeLists.txt @@ -1,9 +1,12 @@ #mlir_tablegen_on(infrt_phi_base DIALECT phi) add_mlir_dialect(infrt_phi_base phi) add_mlir_dialect(infrt_phi_tensor phi_dt) -add_mlir_dialect(infrt_phi_kernel phi_kernel) +add_mlir_dialect(phi_cpu_kernels phi_cpu) +add_mlir_dialect(phi_gpu_kernels phi_gpu) + #mlir_tablegen_on(infrt_phi_tensor) gather_srcs(infrt_src SRCS phi_base.cc - infrt_phi_tensor.cc) + infrt_phi_tensor.cc + phi_kernels.cc) diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td index 37bf0b5ef21..ee23470fc75 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td @@ -6,24 +6,32 @@ include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/infrt_base.td" include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" -def PHI_KernelDialect : Dialect { - let name = "phi_kernel"; +def PHI_CPUKernelDialect : Dialect { + let name = "phi_cpu"; let description = [{ - The PHI Kernel dialect. + The PHI CPU Kernel dialect. + }]; + + let cppNamespace = "::infrt::phi"; +} + +def PHI_GPUKernelDialect : Dialect { + let name = "phi_gpu"; + + let description = [{ + The PHI GPU Kernel dialect. }]; let cppNamespace = "::infrt::phi"; } // PHI Kernel related ops. -class PDT_Kernel traits = []> : Op { +class PDTCPU_Kernel traits = []> : Op { } -def PDCK_AbsOp : PDT_Kernel<"phi.abs.host.fp32"> { - let arguments = (ins CPU_Context:$dev_ctx, DenseTensor:$x); - let results = (outs DenseTensor:$output); +// PHI Kernel related ops. +class PDTGPU_Kernel traits = []> : Op { } #endif - diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index dc3a4b340d7..39677871ff8 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -34,6 +34,14 @@ class FillDenseTensorOp : attr_type:$value ); let results = (outs); + let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; +} + +class PrintDenseTensorOp: + PDT_Op<"print_tensor"> { + let arguments = (ins DenseTensor:$input); + let results = (outs); + let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; } class CreateCPUAllocatorOp @@ -44,7 +52,7 @@ class CreateCPUAllocatorOp class CreateCPUContextOp : PDT_Op<"create_context." 
# "cpu", [NoSideEffect]> { - let arguments = (ins); + let arguments = (ins CPU_Allocator:$input); let results = (outs CPU_Context:$output); } @@ -52,6 +60,7 @@ def PDT_CreateDenseTensorOp_cpu_f32_nchw : CreateDenseTensorOp<"cpu", "f32", "nc def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp; def PDT_CreateAllocatorOp_cpu : CreateCPUAllocatorOp; def PDT_CreateContextOp_cpu : CreateCPUContextOp; +def PDT_PrintDenseTensor_cpu : PrintDenseTensorOp; def FakeKernelOp : PDT_Op<"fake_phi_kernel"> { let arguments = (ins CPU_Context:$dev_ctx, DenseTensor:$x, DenseTensor:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y); diff --git a/paddle/infrt/dialect/phi/ir/phi_kernels.cc b/paddle/infrt/dialect/phi/ir/phi_kernels.cc new file mode 100644 index 00000000000..c7a837b83fc --- /dev/null +++ b/paddle/infrt/dialect/phi/ir/phi_kernels.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/phi/ir/phi_kernels.h" +#include + +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernelsDialect.cpp.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernels.cpp.inc" // NOLINT + +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernelsDialect.cpp.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernels.cpp.inc" // NOLINT + +namespace infrt { +namespace phi { + +void PHICPUKernelDialect::initialize() { +#define GET_OP_LIST + addOperations< +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernels.cpp.inc" // NOLINT + >(); +} + +void PHIGPUKernelDialect::initialize() { +#define GET_OP_LIST + addOperations< +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernels.cpp.inc" // NOLINT + >(); +} + +} // namespace phi +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/ir/phi_kernels.h b/paddle/infrt/dialect/phi/ir/phi_kernels.h new file mode 100644 index 00000000000..b84d1b2b729 --- /dev/null +++ b/paddle/infrt/dialect/phi/ir/phi_kernels.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" + +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernelsDialect.h.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernels.h.inc" + +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernelsDialect.h.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernels.h.inc" diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc index 63869b7d7b9..6c0f6df8921 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -41,26 +41,49 @@ TargetType cvtTargetFromPhi(phi::Backend backend) { } phi::DataType cvtPrecision2Phi(PrecisionType precision) { +#define CONVERT_PRECISION_TO_PHI(Precision) \ + case PrecisionType::Precision: \ + return phi::DataType::Precision; + switch (precision) { - case PrecisionType::FLOAT32: - return phi::DataType::FLOAT32; - break; - case PrecisionType::FLOAT16: - return phi::DataType::FLOAT16; + CONVERT_PRECISION_TO_PHI(FLOAT32) + CONVERT_PRECISION_TO_PHI(FLOAT16) + CONVERT_PRECISION_TO_PHI(FLOAT64) + CONVERT_PRECISION_TO_PHI(UINT8) + CONVERT_PRECISION_TO_PHI(INT8) + CONVERT_PRECISION_TO_PHI(INT16) + CONVERT_PRECISION_TO_PHI(INT32) + CONVERT_PRECISION_TO_PHI(INT64) + CONVERT_PRECISION_TO_PHI(COMPLEX64) + CONVERT_PRECISION_TO_PHI(COMPLEX128) + CONVERT_PRECISION_TO_PHI(BOOL) default: return phi::DataType::UNDEFINED; } +#undef CONVERT_PRECISION_TO_PHI } PrecisionType cvtPrecisionFromPhi(phi::DataType datatype) { +#define CONVERT_PRECISION_FROM_PHI(Precision) \ + case phi::DataType::Precision: \ + return PrecisionType::Precision; + switch (datatype) { - case phi::DataType::FLOAT32: - return PrecisionType::FLOAT32; - case phi::DataType::FLOAT16: - return PrecisionType::FLOAT16; + CONVERT_PRECISION_FROM_PHI(FLOAT32) + CONVERT_PRECISION_FROM_PHI(FLOAT16) + CONVERT_PRECISION_FROM_PHI(FLOAT64) + CONVERT_PRECISION_FROM_PHI(UINT8) + CONVERT_PRECISION_FROM_PHI(INT8) + CONVERT_PRECISION_FROM_PHI(INT16) + CONVERT_PRECISION_FROM_PHI(INT32) + CONVERT_PRECISION_FROM_PHI(INT64) + CONVERT_PRECISION_FROM_PHI(COMPLEX64) + CONVERT_PRECISION_FROM_PHI(COMPLEX128) + CONVERT_PRECISION_FROM_PHI(BOOL) default: return PrecisionType::UNK; } +#undef CONVERT_PRECISION_FROM_PHI } phi::DataLayout cvtLayout2Phi(LayoutType layout) { @@ -69,6 +92,8 @@ phi::DataLayout cvtLayout2Phi(LayoutType layout) { return phi::DataLayout::NCHW; case LayoutType::NHWC: return phi::DataLayout::NHWC; + case LayoutType::ANY: + return phi::DataLayout::ANY; default: return phi::DataLayout::UNDEFINED; } @@ -80,6 +105,8 @@ LayoutType cvtLayoutFromPhi(phi::DataLayout layout) { return LayoutType::NCHW; case phi::DataLayout::NHWC: return LayoutType::NHWC; + case phi::DataLayout::ANY: + return LayoutType::ANY; default: return LayoutType::UNK; } diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index 79717ba2cc0..7823681079f 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -29,6 +29,7 @@ #include "paddle/infrt/kernel/tensor_shape_kernels.h" #include "paddle/infrt/kernel/test_kernels.h" #ifdef INFRT_WITH_PHI +#include 
"paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" #include "paddle/infrt/kernel/phi/registry.h" #endif @@ -58,6 +59,7 @@ int main(int argc, char** argv) { kernel::RegisterControlFlowKernels(®istry); #ifdef INFRT_WITH_PHI kernel::RegisterPhiKernels(®istry); + kernel::RegisterInferShapeLaunchers(®istry); #endif // load extra shared library diff --git a/paddle/infrt/kernel/phi/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc index 5284f499916..3caaf1788e3 100644 --- a/paddle/infrt/kernel/phi/context_kernels.cc +++ b/paddle/infrt/kernel/phi/context_kernels.cc @@ -18,7 +18,13 @@ namespace infrt { namespace kernel { namespace phi { -::phi::CPUContext CreateCpuContext() { return {}; } +::phi::CPUContext CreateCpuContext( + infrt::backends::CpuPhiAllocator* allocator) { + ::phi::CPUContext context; + context.SetAllocator(allocator); + context.Init(); + return context; +} } // namespace phi } // namespace kernel diff --git a/paddle/infrt/kernel/phi/context_kernels.h b/paddle/infrt/kernel/phi/context_kernels.h index 8082dc6c2ff..7f1e7ef6cd3 100644 --- a/paddle/infrt/kernel/phi/context_kernels.h +++ b/paddle/infrt/kernel/phi/context_kernels.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/infrt/backends/host/phi_context.h" #include "paddle/phi/core/dense_tensor.h" @@ -21,7 +22,7 @@ namespace infrt { namespace kernel { namespace phi { -::phi::CPUContext CreateCpuContext(); +::phi::CPUContext CreateCpuContext(::infrt::backends::CpuPhiAllocator*); } // namespace phi } // namespace kernel diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index ce9200b9918..871336e8762 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" - +#include namespace infrt { namespace kernel { namespace phi { @@ -30,8 +30,38 @@ namespace phi { } void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, - host_context::Attribute> values) {} + host_context::Attribute> values) { + auto place = ::phi::CPUPlace(); + float* a_data = dense_tensor->mutable_data(place); + for (int64_t i = 0; i < dense_tensor->numel(); ++i) { + a_data[i] = (values.get())[i]; + } +} +void PrintDenseTensor(::phi::DenseTensor* dense_tensor) { +#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ + case ::phi::DataType::PHI_DATATYPE: { \ + DTYPE* data = dense_tensor->data(); \ + if (dense_tensor->numel() == 0) break; \ + std::cout << data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << data[i]; \ + } \ + break; \ + } + + ::phi::DDim dims = dense_tensor->dims(); + std::cout << "dense_tensor: shape=shape" << dims.to_str() << "," + << " values=["; + switch (dense_tensor->dtype()) { + PRINT_META_DATA(FLOAT32, float); + PRINT_META_DATA(INT32, int32_t); + default: + std::cout << "Error! 
Unsupported data type!\n"; + } + std::cout << "]\n"; +#undef PRINT_META_DATA +} } // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h index 25daf7027e8..920c0b1c8af 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -28,7 +28,8 @@ namespace phi { host_context::Attribute> lod); void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, - host_context::Attribute> values); + host_context::Attribute> values); +void PrintDenseTensor(::phi::DenseTensor* dense_tensor); } // namespace phi } // namespace kernel diff --git a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc index 2161e98fac8..37f9197edb7 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc @@ -54,7 +54,7 @@ TEST(ElementwiseAdd, launcher_registry) { host_context::KernelRegistry registry; RegisterInferShapeLaunchers(®istry); ASSERT_GE(registry.size(), 1UL); - auto creator = registry.GetKernel("pten.add.cpu.any.fp32"); + auto creator = registry.GetKernel("phi_cpu.add.any.float32"); const phi::DDim dims({1, 2}); const phi::DataType dtype{phi::DataType::FLOAT32}; diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc index 5d79814d4be..15e2d21005e 100644 --- a/paddle/infrt/kernel/phi/registry.cc +++ b/paddle/infrt/kernel/phi/registry.cc @@ -42,6 +42,8 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensorCpuF32Nchw)); registry->AddKernel("phi_dt.fill_dense_tensor.f32", INFRT_KERNEL(infrt::kernel::phi::FillDenseTensorF32)); + registry->AddKernel("phi_dt.print_tensor", + INFRT_KERNEL(infrt::kernel::phi::PrintDenseTensor)); registry->AddKernel( "phi_dt.fake_phi_kernel", std::bind(&KernelLauncherFunc !phi.CPU_allocator - %ctx = "phi_dt.create_context.cpu" (): () -> !phi.CPU_context + %ctx = "phi_dt.create_context.cpu" (%allocator): (!phi.CPU_allocator) -> !phi.CPU_context %t = "phi_dt.create_dense_tensor.cpu.f32.nchw" (%allocator) {dims=[1:i64], lod=[1:i64]}: (!phi.CPU_allocator) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () + %e = "phi_cpu.sign.any.float32"(%ctx, %t) : (!phi.CPU_context, !infrt.dense_tensor) -> (!infrt.dense_tensor) - // CHECK: @FakePhiKernel@ - %d = "phi_dt.fake_phi_kernel" (%ctx, %t, %t) {transpose_x=false, transpose_y=false} : (!phi.CPU_context, !infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor) + // CHECK: dense_tensor: shape=shape[1], values=[1] + "phi_dt.print_tensor" (%e) : (!infrt.dense_tensor) -> () Infrt.return } diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index a0132501387..75b27e4165d 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -33,14 +33,16 @@ function update_pd_ops() { rm -rf ${PADDLE_ROOT}/build && mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build cmake .. 
-DWITH_PYTHON=ON -DWITH_GPU=OFF -DPYTHON_EXECUTABLE=`which python3` -DWITH_XBYAK=OFF -DWITH_NCCL=OFF -DWITH_RCCL=OFF -DWITH_CRYPTO=OFF - make -j8 paddle_python + make -j8 paddle_python print_pten_kernels cd ${PADDLE_ROOT}/build + ./paddle/phi/tools/print_pten_kernels > ../tools/infrt/kernels.json cd python/dist/ python3 -m pip uninstall -y paddlepaddle python3 -m pip install *whl # update pd_ops.td cd ${PADDLE_ROOT}/tools/infrt/ python3 generate_pd_op_dialect_from_paddle_op_maker.py + python3 generate_phi_kernel_dialect.py ./kernels.json } function init() { diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py new file mode 100644 index 00000000000..80cf3958b15 --- /dev/null +++ b/tools/infrt/generate_phi_kernel_dialect.py @@ -0,0 +1,276 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import sys + +attr_type_converter = {"i": 'SI32Attr', "b": 'BoolAttr', "l": 'SI64Attr'} +supported_kernels = ['sign', 'dot', 'digamma', 'conj'] + +target_type_converter = {"CPU": "CPU", "GPU": "GPU"} +layout_type_converter = { + "NCHW": "NCHW", + "NHWC": "NHWC", + "Undefined(AnyLayout)": "ANY" +} +precision_type_converter = { + "uint8": "UINT8", + "int8": "INT8", + "int16": "INT16", + "int32": "INT32", + "int64": "INT64", + "float16": "FLOAT16", + "bfloat16": "BFLOAT16", + "float32": "FLOAT32", + "float64": "FLOAT64", + "complex64": "COMPLEX64", + "complex128": "COMPLEX128", + "bool": "BOOL" +} + + +def generate_kernel_name(op_name, place_str): + [target_, layout_, precision_] = place_str[1:-1].split(',') + target_ = target_type_converter[target_.strip()] + layout_ = layout_type_converter[layout_.strip()] + precision_ = precision_type_converter[precision_.strip()] + alias_ = "{}.{}".format(op_name, ".".join( + [target_.strip(), layout_.strip(), precision_.strip()])) + return alias_ + + +def generate_attrs_info(op_name, attrs_info): + kernel_attrs_names = { + 'split': ['sections', 'num', 'axis', 'mkldnn_data_type'], + 'sign': [], + 'masked_select': [], + 'trace': ['offset', 'axis1', 'axis2'], + 'concat': ['axis'], + 'empty': ['shape', 'dtype'], + 'conj': [], + 'norm': ['axis', 'epsilon', 'is_test'], + 'histogram': ['bins', 'min', 'max'], + 'dot': [], + 'scale': ['scale', 'bias', 'bias_after_scale'], + 'digamma': [], + 'lerp': [], + 'cast': ['out_dtype', 'in_dtype'], + 'abs': [] + } + attrs_args_ = "" + if len(kernel_attrs_names[op_name]) == len(attrs_info): + for index in range(len(attrs_info)): + attr_name = kernel_attrs_names[op_name][index] + attr_type = attr_type_converter[attrs_info[index]] + attrs_args_ += '{type_}:${name_},'.format( + type_=attr_type, name_=attr_name) + return attrs_args_[:-1] + + +def generate_inputs_info(input_info): + input_args_ = "" + for index in range(len(input_info)): + [target_, layout_, precision_] = input_info[index].split(',') + # todo: check vadility + target_ = target_type_converter[target_.strip()] + layout_ = 
layout_type_converter[layout_.strip()] + precision_ = precision_type_converter[precision_.strip()] + input_args_ += " DenseTensor<\"{}\",\"{}\",\"{}\">:$in{},".format( + target_.strip(), precision_.strip(), layout_.strip(), str(index)) + input_args_ = input_args_[:-1] + return input_args_ + + +def generate_arguments_info(op_name, input_info, attr_info): + input_args = generate_inputs_info(input_info) + attr_args = generate_attrs_info(op_name, attr_info) + context_args = "CPU_Context:$dev_ctx" + argument_ = "{},{},{}".format(context_args, input_args, attr_args) + return (("let arguments = (ins {});".format(argument_.strip(",")))) + + +def generate_results_info(output_info): + output_args_ = "let results = (outs " + for index in range(len(output_info)): + [target_, layout_, precision_] = output_info[index].split(',') + # todo: check vadility + target_ = target_type_converter[target_.strip()] + layout_ = layout_type_converter[layout_.strip()] + precision_ = precision_type_converter[precision_.strip()] + output_args_ += " DenseTensor<\"{}\",\"{}\",\"{}\">:$out{},".format( + target_.strip(), precision_.strip(), layout_.strip(), str(index)) + return ("{});".format(output_args_[:-1])) + + +def generate_supported_kernel_list(load_dict): + supported_kernels_list_ = [] + for op_name in load_dict: + kernel_list = load_dict[op_name] + for kernel_info in kernel_list: + for kernel_alias_ in kernel_info: + attributes = kernel_info[kernel_alias_]["attribute"] + flag = True + for attribute in attributes: + if attribute not in attr_type_converter: + flag = False + if flag: + supported_kernels_list_.append(op_name) + + alias_ = generate_kernel_dialect(op_name, kernel_alias_, + kernel_info[kernel_alias_]) + supported_kernels_list_ = list(set(supported_kernels_list_)) + print(supported_kernels_list_) + + +def scan_kernel_info(load_dict): + target_type_ = [] + layout_type_ = [] + precision_type_ = [] + for op_name in load_dict: + kernel_list = load_dict[op_name] + for kernel_info in kernel_list: + for kernel_alias_ in kernel_info: + [target_, layout_, precision_] = kernel_alias_[1:-1].split(',') + target_type_.append(target_.strip()) + layout_type_.append(layout_.strip()) + precision_type_.append(precision_.strip()) + target_type_ = list(set(target_type_)) + layout_type_ = list(set(layout_type_)) + precision_type_ = list(set(precision_type_)) + print(target_type_) + print(layout_type_) + print(precision_type_) + + +def generate_cpu_kernel_dialect(op_name, kernel_alias_, kernel_info): + + alias = generate_kernel_name(op_name, kernel_alias_) + summary = 'let summary = "{name}";'.format(name=alias) + dialect_name = alias.split(".") + dialect_name = dialect_name[0] + "." + dialect_name[2] + "." 
+ dialect_name[ + 3] + + header = 'def {kernel_name} : PDTCPU_Kernel<"{name}",[NoSideEffect]> {left_brace}'.format( + kernel_name=alias.replace(".", ""), + name=dialect_name.lower(), + left_brace="{") + + inputs_ = kernel_info["input"] + attributes = kernel_info["attribute"] + arguments = generate_arguments_info(op_name, inputs_, attributes) + + outputs = kernel_info["output"] + results = generate_results_info(outputs) + + kernel_dialect = '{header_}\n {summary_}\n {arguments_}\n {results_}\n{right_brace}\n'.format( + header_=header, + summary_=summary, + arguments_=arguments, + results_=results, + right_brace="}") + return kernel_dialect + + +def generate_gpu_kernel_dialect(op_name, kernel_alias_, kernel_info): + + alias = generate_kernel_name(op_name, kernel_alias_) + summary = 'let summary = "{name}";'.format(name=alias) + dialect_name = alias.split(".") + dialect_name = dialect_name[0] + "." + dialect_name[2] + "." + dialect_name[ + 3] + + header = 'def {kernel_name} : PDTGPU_Kernel<"{name}",[NoSideEffect]> {left_brace}'.format( + kernel_name=alias.replace(".", ""), + name=dialect_name.lower(), + left_brace="{") + inputs_ = kernel_info["input"] + attributes = kernel_info["attribute"] + arguments = generate_arguments_info(op_name, inputs_, attributes) + + outputs = kernel_info["output"] + results = generate_results_info(outputs) + + kernel_dialect = '{header_}\n {summary_}\n {arguments_}\n {results_}\n{right_brace}\n'.format( + header_=header, + summary_=summary, + arguments_=arguments, + results_=results, + right_brace="}") + return kernel_dialect + + +def generate_dialect_head(): + comment_ = "/*===- TableGen'source file -----------------------------------------------===*\\\n\ +|* *|\n\ +|* Kernel Definitions *|\n\ +|* *|\n\ +|* Automatically generated file, do not edit! 
*|\n\ +|* Generated by tools/infrt/generate_pten_kernel_dialect.py *|\n\ +|* *|\n\ +\*===----------------------------------------------------------------------===*/\n" + + includes_ = "#ifndef PTEN_KERNELS\n\ +#define PTEN_KERNELS\n\ +include \"mlir/Interfaces/InferTypeOpInterface.td\"\n\ +include \"mlir/Interfaces/LoopLikeInterface.td\"\n\ +include \"mlir/IR/OpBase.td\"\n\ +include \"paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td\"" + + return (comment_ + includes_) + + +def get_kernel_target(kernel_alias_): + target = kernel_alias_[1:-1].split(",") + return target[0] + + +def main(path_): + with open(path_, "r") as f: + load_dict = json.load(f) + + head = generate_dialect_head() + + cpu_registry_ = "" + gpu_registry_ = "" + for op_name in load_dict: + if op_name not in supported_kernels: + continue + kernel_list = load_dict[op_name] + for kernel_info in kernel_list: + for kernel_alias_ in kernel_info: + if get_kernel_target(kernel_alias_) == "CPU": + kernel_registry = generate_cpu_kernel_dialect( + op_name, kernel_alias_, kernel_info[kernel_alias_]) + cpu_registry_ += kernel_registry + elif get_kernel_target(kernel_alias_) == "GPU": + kernel_registry = generate_gpu_kernel_dialect( + op_name, kernel_alias_, kernel_info[kernel_alias_]) + gpu_registry_ += kernel_registry + else: + print("Unsupported backend:" + get_kernel_target( + kernel_alias_)) + end = "#endif // PTEN_KERNELS" + with open("../../paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td", + "w") as dst: + dst.write('{start_}\n{dialect_}\n{end_}'.format( + start_=head, dialect_=cpu_registry_, end_=end)) + with open("../../paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td", + "w") as dst: + dst.write('{start_}\n{dialect_}\n{end_}'.format( + start_=head, dialect_=gpu_registry_, end_=end)) + + +if __name__ == '__main__': + path = sys.argv[1] + main(path) diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py index f3e9f345da2..9ea3fef0030 100644 --- a/tools/infrt/get_phi_kernel_info.py +++ b/tools/infrt/get_phi_kernel_info.py @@ -150,19 +150,19 @@ def gen_dtype(vals: List[str]): ir_dtypes, origin_dtypes = [], [] for val in vals: if val == "float": - ir_dtypes.append("fp32") + ir_dtypes.append("float32") origin_dtypes.append("float") elif val == "double": - ir_dtypes.append("fp64") + ir_dtypes.append("float64") origin_dtypes.append("double") elif val == "float16": - ir_dtypes.append("fp16") + ir_dtypes.append("float16") origin_dtypes.append("paddle::experimental::float16") elif val == "bfloat16": ir_dtypes.append("bf16") origin_dtypes.append("paddle::experimental::bfloat16") elif val == "bool": - ir_dtypes.append("int1") + ir_dtypes.append("bool") origin_dtypes.append("bool") elif val == "int8_t": ir_dtypes.append("int8") @@ -219,8 +219,8 @@ def gen_register_info(resources: List[List[str]]): for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes): kernel_func = gen_kernel_func(update_item[3], ctx_name, origin_dtype) - ir_name = 'pten.' + '.'.join( - [it.lower() for it in update_item[:3]]) + "." + ir_dtype + ir_name = 'phi_cpu.' + update_item[0].lower() + '.' + update_item[ + 2].lower() + '.' 
+ ir_dtype res += f""" registry->AddKernel("{ir_name}",""" -- GitLab From f30b3f810d1b7e341507450313503cf4702f7d8a Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 2 Mar 2022 16:17:43 +0800 Subject: [PATCH 056/272] support checking `phi` directory in CI op benchmark (#40026) * support phi checking in CI op benchmark * add sparse/gpu * remove h file in cpu directory --- tools/ci_op_benchmark.sh | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh index 1db79418b2d..0937ebe5343 100644 --- a/tools/ci_op_benchmark.sh +++ b/tools/ci_op_benchmark.sh @@ -43,20 +43,33 @@ function match_cu_file_directory { do [ "${cu_file_dir}" == "paddle/fluid/operators${sub_dir}" ] && return 0 done - for sub_dir in "" "/gpu" "/hybird" + for sub_dir in "" "/gpu" "/gpudnn" "/sparse/gpu" do [ "${cu_file_dir}" == "paddle/phi/kernels${sub_dir}" ] && return 0 done return 1 } +# Limit h file directory +function match_h_file_directory { + LOG "[INFO] run function match_h_file_directory" + local sub_dir h_file_dir + h_file_dir=$(dirname ${1}) + # '.h' file should not in directory below + for sub_dir in "" "/cpu" + do + [ "${h_file_dir}" == "paddle/phi/kernels${sub_dir}" ] && return 1 + done + return 0 +} + # Load op files by header file function load_CHANGE_OP_FILES_by_header_file { LOG "[INFO] run function load_CHANGE_OP_FILES_by_header_file" local change_file for change_file in $(grep -rl "${1}" paddle/fluid/operators paddle/phi/kernels/) do - if [[ "$change_file" =~ "_op.cu" ]] + if [[ "$change_file" =~ "_op.cu" || "$change_file" =~ "_kernel.cu" || "$change_file" =~ "_kernel_gpudnn.cu" ]] then # match cu file directory limit match_cu_file_directory $change_file || continue @@ -64,6 +77,7 @@ function load_CHANGE_OP_FILES_by_header_file { CHANGE_OP_FILES[${#CHANGE_OP_FILES[@]}]="$change_file" elif [[ "$change_file" =~ ".h" ]] then + match_h_file_directory $change_file || continue [ -n "${INCLUDE_SEARCH_MAP[$change_file]}" ] && continue LOG "[INFO] Found \"${1}\" include by \"${change_file}\", keep searching." INCLUDE_SEARCH_MAP[$change_file]="searched" @@ -82,7 +96,7 @@ function load_CHANGE_OP_FILES { # match directory limit [[ "$change_file" =~ "paddle/fluid/operators/" ]] || [[ "$change_file" =~ "paddle/phi/kernels/" ]] || continue # match file name limit - if [[ "$change_file" =~ "_op.cu" ]] + if [[ "$change_file" =~ "_op.cu" || "$change_file" =~ "_kernel.cu" || "$change_file" =~ "_kernel_gpudnn.cu" ]] then # match cu file directory limit match_cu_file_directory $change_file || continue @@ -90,6 +104,7 @@ function load_CHANGE_OP_FILES { CHANGE_OP_FILES[${#CHANGE_OP_FILES[@]}]="$change_file" elif [[ "$change_file" =~ ".h" ]] then + match_h_file_directory $change_file || continue LOG "[INFO] Found \"${change_file}\" changed, keep searching." INCLUDE_SEARCH_MAP[${change_file}]="searched" load_CHANGE_OP_FILES_by_header_file $change_file @@ -131,6 +146,8 @@ function load_CHANGE_OP_MAP { op_name=${change_file_name##*/} op_name=${op_name%_cudnn_op*} op_name=${op_name%_op*} + op_name=${op_name%_grad_kernel*} + op_name=${op_name%_kernel*} [ -n "${SKIP_OP_MAP[$op_name]}" ] && continue LOG "[INFO] Load op: \"${op_name}\"." 
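(Editorial sketch, not part of the patch.) The ci_op_benchmark.sh hunk above now derives a benchmark op name from both the legacy `*_op.cu` files and the new phi `*_kernel.cu` / `*_kernel_gpudnn.cu` files by stripping suffixes with shell parameter expansion. A minimal Python rendering of that stripping rule, for illustration only; the helper name `derive_op_name` and the sample paths are assumptions:

    # Illustrative only: mirrors the ${op_name%_cudnn_op*} / %_op* / %_grad_kernel* /
    # %_kernel* expansions in load_CHANGE_OP_MAP; each pattern strips from the
    # right-most match, applied in order.
    def derive_op_name(change_file):
        name = change_file.rsplit('/', 1)[-1]
        for pat in ('_cudnn_op', '_op', '_grad_kernel', '_kernel'):
            idx = name.rfind(pat)
            if idx != -1:
                name = name[:idx]
        return name

    assert derive_op_name("paddle/phi/kernels/gpu/abs_grad_kernel.cu") == "abs"
    assert derive_op_name("paddle/fluid/operators/concat_op.cu") == "concat"
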
CHANGE_OP_MAP[${op_name}]="$change_file" -- GitLab From 1c4e3e5dd0d32a4216bdad0b1cafcab4ca5ed5bb Mon Sep 17 00:00:00 2001 From: ziyoujiyi <73728031+ziyoujiyi@users.noreply.github.com> Date: Wed, 2 Mar 2022 16:23:52 +0800 Subject: [PATCH 057/272] new fleet_desc builder (#39948) * delete gloo connect retry * the_one_ps dirs reconstruct * . * . * create the_one_ps dirs * create the_one_ps dirs * create the_one_ps dirs * create the_one_ps dirs * create the_one_ps dirs * create the_one_ps dirs * the one ps dirs modify * the one ps dirs modify * the one ps dirs modify * the one ps dirs modify * refactor ps optimize * refactor ps optimize * refactor ps optimize * . * . * . * . * . * . * refactor theoneps * the_one_ps * add ps pass unittest * add ps pass unittest * ps unitest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * add cpu_async_ps_mode test * add cpu_async_ps_mode test * add cpu_async_ps_mode test * ps unittest ready * ps unittest ready * solve dist_pass init conflict * solve import CommContext error * unittest ok * implement AllocateFrom * solve setup.py.in conflict * solve conflict * solve conflict * solve conflict * . * . * cpu-async-ps minimize test ok & gpu minimize test ok * add heter 2stage unittest * add heter 2stage unittest * add heter 2stage unittest * sync/geo test ok & fix heter_worker program ok * . * new fleet desc generator * new fleet_desc builder * new fleet_desc builder * . * . * correct ps.proto compile * . Co-authored-by: zkh2016 --- paddle/fluid/distributed/ps/ps.proto | 13 - paddle/fluid/framework/CMakeLists.txt | 5 +- paddle/fluid/framework/ps.proto | 213 ++++ .../fleet/meta_optimizers/ps_optimizer.py | 1 + python/paddle/distributed/ps/README.md | 3 - python/paddle/distributed/ps/the_one_ps.py | 1022 ++++++++--------- .../paddle/distributed/ps/utils/ps_factory.py | 4 +- .../ps/utils/ps_program_builder.py | 5 +- python/paddle/distributed/ps/utils/public.py | 4 +- .../fluid/tests/unittests/CMakeLists.txt | 2 +- .../distributed_passes/ps_pass_test_base.py | 54 +- .../test_ps_trainer_pass.py | 122 +- .../fluid/tests/unittests/ps/CMakeLists.txt | 4 +- .../tests/unittests/ps/ps_dnn_trainer.py | 86 +- .../tests/unittests/ps/test_the_one_ps.py | 92 +- .../fluid/tests/unittests/ps_dnn_model.py | 1 + 16 files changed, 961 insertions(+), 670 deletions(-) delete mode 100755 paddle/fluid/distributed/ps/ps.proto mode change 100644 => 100755 paddle/fluid/framework/CMakeLists.txt create mode 100755 paddle/fluid/framework/ps.proto delete mode 100755 python/paddle/distributed/ps/README.md mode change 100644 => 100755 python/paddle/fluid/tests/unittests/CMakeLists.txt mode change 100644 => 100755 python/paddle/fluid/tests/unittests/ps/CMakeLists.txt mode change 100644 => 100755 python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py diff --git a/paddle/fluid/distributed/ps/ps.proto b/paddle/fluid/distributed/ps/ps.proto deleted file mode 100755 index 2691f637527..00000000000 --- a/paddle/fluid/distributed/ps/ps.proto +++ /dev/null @@ -1,13 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ \ No newline at end of file diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt old mode 100644 new mode 100755 index 14aecb5fd43..02d90b9c6da --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -235,6 +235,7 @@ if(WITH_PYTHON) py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto) py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto) py_proto_compile(pass_desc_py_proto SRCS pass_desc.proto) + py_proto_compile(ps_py_proto SRCS ps.proto) #Generate an empty \ #__init__.py to make framework_py_proto as a valid python module. add_custom_target(fleet_proto_init ALL @@ -242,12 +243,13 @@ if(WITH_PYTHON) COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py ) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) - add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto) + add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto ps_py_proto) if (NOT WIN32) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMAND cp ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) add_custom_target(fleet_executor_proto_init ALL DEPENDS fleet_proto_init fleet_executor_desc_py_proto @@ -259,6 +261,7 @@ if(WITH_PYTHON) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto COMMAND copy /Y *.py ${proto_dstpath} + COMMAND copy /Y ps_pb2.py ${fleet_proto_dstpath} COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath} COMMENT "Copy generated python proto into directory paddle/fluid/proto." COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto." diff --git a/paddle/fluid/framework/ps.proto b/paddle/fluid/framework/ps.proto new file mode 100755 index 00000000000..0ae87812bce --- /dev/null +++ b/paddle/fluid/framework/ps.proto @@ -0,0 +1,213 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; +package paddle.distributed; +option cc_generic_services = true; +option cc_enable_arenas = true; + +message FsClientParameter { + enum FsApiType { + HDFS = 0; + AFS = 1; + } + optional FsApiType fs_type = 1 [ default = HDFS ]; + optional string uri = 2; // such as afs://xxx.afs.com:9902 + optional string user = 3; // user_name to access fs + optional string passwd = 4; // password + optional int32 buffer_size = 5; // buffer for read/write + optional string hadoop_bin = 51; + optional string afs_conf = 101; +} + +message PSParameter { + optional string worker_class = 1; + optional string server_class = 2; + optional string instance_class = 3; + optional string init_gflags = 4 [ default = "" ]; + optional WorkerParameter worker_param = 101; + optional ServerParameter server_param = 102; + repeated DownpourTrainerParameter trainer_param = 301; + optional FsClientParameter fs_client_param = 501; +} + +message WorkerParameter { + optional DownpourWorkerParameter downpour_worker_param = 1; +} + +message DownpourWorkerParameter { + repeated TableParameter downpour_table_param = 1; +} + +message DownpourServerParameter { + repeated TableParameter downpour_table_param = 1; + optional ServerServiceParameter service_param = 2; +} + +message ServerParameter { + optional DownpourServerParameter downpour_server_param = 1; +} + +message DownpourTrainerParameter { + repeated DenseTableParameter dense_table = 1; + repeated SparseTableParameter sparse_table = 2; + optional int32 push_sparse_per_batch = 3; + optional int32 push_dense_per_batch = 4; + repeated string skip_op = 5; + repeated ProgramConfig program_config = 6; +} + +message DenseTableParameter { + optional int32 table_id = 1; + repeated string dense_variable_name = 2; + repeated string dense_gradient_variable_name = 3; + optional int32 fea_dim = 4; +} + +message SparseTableParameter { + optional int32 table_id = 1; + optional int32 feature_dim = 2; + repeated string slot_key = 3; + repeated string slot_value = 4; + repeated string slot_gradient = 5; +} + +message ServerServiceParameter { + optional string server_class = 1 [ default = "BrpcPsServer" ]; + optional string client_class = 2 [ default = "BrpcPsClient" ]; + optional string service_class = 3 [ default = "BrpcPsService" ]; + optional uint32 start_server_port = 4 + [ default = 0 ]; // will find a avaliable port from it + optional uint32 server_thread_num = 5 [ default = 12 ]; +} + +message ProgramConfig { + required string program_id = 1; + repeated int32 push_sparse_table_id = 2; + repeated int32 push_dense_table_id = 3; + repeated int32 pull_sparse_table_id = 4; + repeated int32 pull_dense_table_id = 5; +} + +enum TableType { + PS_SPARSE_TABLE = 0; + PS_DENSE_TABLE = 1; + PS_OTHER_TABLE = 2; +} + +message TableParameter { + optional uint64 table_id = 1; + optional string table_class = 2; + optional uint64 shard_num = 3 [ default = 1000 ]; + optional TableAccessorParameter accessor = 4; + optional TensorAccessorParameter tensor = 5; + optional CommonAccessorParameter common = 6; + optional TableType type = 7; + optional bool compress_in_save = 8 [ default = false ]; +} + +message TableAccessorParameter { + optional string accessor_class = 1; + optional uint32 fea_dim = 4 [ default = 11 ]; + optional uint32 embedx_dim = 5 [ default = 8 ]; + optional uint32 embedx_threshold = 6 [ default = 10 ]; + optional CtrAccessorParameter ctr_accessor_param = 7; + repeated 
TableAccessorSaveParameter table_accessor_save_param = 8; + optional SparseCommonSGDRuleParameter embed_sgd_param = 10; + optional SparseCommonSGDRuleParameter embedx_sgd_param = 11; +} + +message CtrAccessorParameter { + optional float nonclk_coeff = 1 + [ default = 0.1 ]; // to calculate show_click_score + optional float click_coeff = 2 + [ default = 1 ]; // to calculate show_click_score + optional float base_threshold = 3 [ + default = 1.5 + ]; // show_click_score > base_threshold, this feature can be saved + optional float delta_threshold = 4 + [ default = + 0.25 ]; // delta_score > delta_threshold, this feature can be saved + optional float delta_keep_days = 5 + [ default = + 16 ]; // unseen_day < delta_keep_days, this feature can be saved + optional float show_click_decay_rate = 6 [ + default = 0.98 + ]; // show/click will update to show/click * show_click_decay_rate after a day + optional float delete_threshold = 7 + [ default = 0.8 ]; // threshold to shrink a feasign + optional float delete_after_unseen_days = 8 + [ default = 30 ]; // unseen_day > delete_after_unseen_days, this feature + // will be delete in shrink_model + optional int32 ssd_unseenday_threshold = 9 + [ default = 1 ]; // threshold to save ssd +} + +message TensorAccessorParameter { + optional string feed_var_name = 1; + optional string fetch_var_name = 2; + optional int64 startup_program_id = 3; + optional int64 main_program_id = 4; + optional string tensor_table_class = 6; +} + +message CommonAccessorParameter { + optional string name = 1; + optional string table_name = 2; + repeated string attributes = 3; + repeated string params = 4; + repeated uint32 dims = 5; + repeated string initializers = 6; + optional string entry = 7; + optional int32 trainer_num = 8; + optional bool sync = 9; + optional uint32 table_num = 10; + optional uint32 table_dim = 11; +} + +message TableAccessorSaveParameter { + optional uint32 param = 1; + optional string converter = 2; + optional string deconverter = 3; +} + +message SparseCommonSGDRuleParameter { + optional string name = 1; + optional SparseNaiveSGDRuleParameter naive = 2; + optional SparseAdagradSGDRuleParameter adagrad = 3; + optional SparseAdamSGDParameter adam = 4; +} + +message SparseNaiveSGDRuleParameter { // SparseNaiveSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + repeated float weight_bounds = 3; +} + +message + SparseAdagradSGDRuleParameter { // SparseAdaGradSGDRule|StdAdaGradSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_g2sum = 2 [ default = 3.0 ]; + optional double initial_range = 3 [ default = 0.0001 ]; + repeated float weight_bounds = 4; +} + +message SparseAdamSGDParameter { // SparseAdamSGDRule + optional double learning_rate = 1 [ default = 0.001 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + optional double beta1_decay_rate = 3 [ default = 0.9 ]; + optional double beta2_decay_rate = 4 [ default = 0.999 ]; + optional double ada_epsilon = 5 [ default = 1e-08 ]; + repeated float weight_bounds = 6; +} diff --git a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py index 100a6882b1b..00937dbe7a4 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py @@ -54,6 +54,7 @@ class ParameterServerOptimizer(MetaOptimizerBase): attrs['cloned_startup'] = 
attrs['origin_startup_program'].clone() attrs['user_defined_strategy'] = self.user_defined_strategy + attrs['valid_strategy'] = self.user_defined_strategy attrs['trainer'] = TrainerRuntimeConfig(self.user_defined_strategy) attrs['ps_mode'] = attrs['trainer'].mode logger.info("ps_mode: {}".format(attrs['ps_mode'])) diff --git a/python/paddle/distributed/ps/README.md b/python/paddle/distributed/ps/README.md deleted file mode 100755 index 8d28031794f..00000000000 --- a/python/paddle/distributed/ps/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# 目录说明 - -* 改完之后,上层目录中 fleet 中相关文件(夹)就可以删除 diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index 14a68ad9167..cc744bc9d9e 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -15,10 +15,11 @@ import warnings import os +from paddle.distributed.fleet.proto import ps_pb2 import paddle.fluid as fluid import paddle.distributed.fleet as fleet from paddle.fluid import core -from .utils.public import * +from paddle.distributed.ps.utils.public import * from paddle.fluid.framework import Program from paddle.fluid.compiler import CompiledProgram from paddle.fluid.executor import Executor @@ -29,14 +30,10 @@ from paddle.distributed.fleet.base.private_helper_function import wait_server_re from paddle.fluid.communicator import Communicator, HeterClient from google.protobuf import text_format -__all__ = [] - - -def conv_indent(indent): - return "".join([" "] * indent) - - -PSERVER_SAVE_SUFFIX = ".shard" +__all__ = [ + 'Table', 'SparseTable', 'GeoSparseTable', 'BarrierTable', 'TensorTable', + 'DenseTable' +] def get_program_by_id(context, program_id): @@ -62,129 +59,140 @@ def parse_table_class(varname, program_id, context): return "MemorySparseTable" -def get_default_accessor_proto(accessor, varname, program_id, context): +def check_embedding_dim(accessor_proto, varname, program_id, context): main_program, startup_program = get_program_by_id(context, program_id) embedding_dim = 0 for var in main_program.list_vars(): if var.name == varname: embedding_dim = var.shape[1] + print('new var: {}, {}, {}'.format(var, embedding_dim, + accessor_proto.fea_dim)) break - - if not accessor.HasField("accessor_class"): - accessor.accessor_class = "CtrCommonAccessor" - if not accessor.HasField("fea_dim"): - accessor.fea_dim = embedding_dim + 2 - if not accessor.HasField("embedx_dim"): - accessor.embedx_dim = embedding_dim - 1 - if not accessor.HasField("embedx_threshold"): - accessor.embedx_threshold = 0 - - ctr_accessor_param = accessor.ctr_accessor_param - if not ctr_accessor_param.HasField("nonclk_coeff"): - ctr_accessor_param.nonclk_coeff = 0.1 - if not ctr_accessor_param.HasField("click_coeff"): - ctr_accessor_param.click_coeff = 1.0 - if not ctr_accessor_param.HasField("base_threshold"): - ctr_accessor_param.base_threshold = 0 - if not ctr_accessor_param.HasField("delta_threshold"): - ctr_accessor_param.delta_threshold = 0 - if not ctr_accessor_param.HasField("delta_keep_days"): - ctr_accessor_param.delta_keep_days = 16 - if not ctr_accessor_param.HasField("show_click_decay_rate"): - ctr_accessor_param.show_click_decay_rate = 1 - if not ctr_accessor_param.HasField("delete_threshold"): - ctr_accessor_param.delete_threshold = 0 - if not ctr_accessor_param.HasField("delete_after_unseen_days"): - ctr_accessor_param.delete_after_unseen_days = 30 - if not ctr_accessor_param.HasField("ssd_unseenday_threshold"): - ctr_accessor_param.ssd_unseenday_threshold = 1 - - for sgd_param in 
[accessor.embed_sgd_param, accessor.embedx_sgd_param]: - if not sgd_param.HasField("name"): - sgd_param.name = "SparseAdaGradSGDRule" - if sgd_param.name == "SparseAdaGradSGDRule" or sgd_param.name == "StdAdaGradSGDRule": - if not sgd_param.adagrad.HasField("learning_rate"): - sgd_param.adagrad.learning_rate = 0.05 - if not sgd_param.adagrad.HasField("initial_g2sum"): - sgd_param.adagrad.initial_g2sum = 3.0 - if not sgd_param.adagrad.HasField("initial_range"): - sgd_param.adagrad.initial_range = 0.0001 - if len(sgd_param.adagrad.weight_bounds) == 0: - sgd_param.adagrad.weight_bounds.extend([-10.0, 10.0]) - if sgd_param.name == "SparseNaiveSGDRule": - if not sgd_param.naive.HasField("learning_rate"): - sgd_param.naive.learning_rate = 0.05 - if not sgd_param.naive.HasField("initial_range"): - sgd_param.naive.initial_range = 0.0001 - if len(sgd_param.naive.weight_bounds) == 0: - sgd_param.naive.weight_bounds.extend([-10.0, 10.0]) - if sgd_param.name == "SparseAdamSGDRule": - if not sgd_param.adam.HasField("learning_rate"): - sgd_param.adam.learning_rate = 0.001 - if not sgd_param.adam.HasField("initial_range"): - sgd_param.adam.initial_range = 0.0001 - if not sgd_param.adam.HasField("beta1_decay_rate"): - sgd_param.adam.beta1_decay_rate = 0.9 - if not sgd_param.adam.HasField("beta2_decay_rate"): - sgd_param.adam.beta2_decay_rate = 0.999 - if not sgd_param.adam.HasField("ada_epsilon"): - sgd_param.adam.ada_epsilon = 1e-08 - if len(sgd_param.adam.weight_bounds) == 0: - sgd_param.adam.weight_bounds.extend([-10.0, 10.0]) - - -def check_embedding_dim(accessor, varname, program_id, context): - main_program, startup_program = get_program_by_id(context, program_id) - embedding_dim = 0 - for var in main_program.list_vars(): - if var.name == varname: - embedding_dim = var.shape[1] - break - fea_dim = accessor.fea_dim + fea_dim = accessor_proto.fea_dim if fea_dim != embedding_dim + 2: raise ValueError( "The fea_dim is wrong, it will be sparse_embedding_dim + 2: {}, but got {}". format(embedding_dim + 2, fea_dim)) - embedx_dim = accessor.embedx_dim + embedx_dim = accessor_proto.embedx_dim if embedx_dim != embedding_dim - 1: raise ValueError( "The embedx_dim is wrong, it will be sparse_embedding_dim - 1: {}, but got {}". 
format(embedding_dim - 1, embedx_dim)) +class Service: + def __init__(self): + pass + + def _set(self, service_proto): + service_proto.server_class = "BrpcPsServer" + service_proto.client_class = "BrpcPsClient" + service_proto.service_class = "BrpcPsService" + service_proto.start_server_port = 0 + service_proto.server_thread_num = 12 + + +class GpuService(Service): + def __init__(self): + super(GpuService).__init__(self) + + def _set(self, service_proto): + super(GpuService)._set(service_proto) + service_proto.server_class = 'PsLocalServer' + service_proto.client_class = 'PsLocalClient' + + class Accessor: def __init__(self): self.accessor_class = "" self.optimizer = None - self.feature_dim = -1 - self.embedding_dim = -1 - self.optimizer = None - - def to_string(self, indent): - accessor_str = "{}accessor {{{}\n{}}}" - attrs = "" - attrs += "accessor_class: \"{}\" ".format(self.accessor_class) - attrs += "fea_dim: {} ".format(self.feature_dim) - attrs += "embedx_dim: {} ".format(self.embedding_dim) - attrs += "\n" - if self.optimizer is not None: - attrs += self.optimizer.to_string(indent) - return accessor_str.format( - conv_indent(indent), attrs, conv_indent(indent)) + self.feature_dim = 0 + self.embedding_dim = 0 + # TableAccessorParameter accessor + def _set(self, accessor_proto, varname, program_id, context): + main_program, startup_program = get_program_by_id(context, program_id) + embedding_dim = 0 + for var in main_program.list_vars(): + if var.name == varname: + embedding_dim = var.shape[1] + break -class CommonAccessor: + if not accessor_proto.HasField("accessor_class"): + accessor_proto.accessor_class = "CtrCommonAccessor" + if not accessor_proto.HasField("fea_dim"): + accessor_proto.fea_dim = embedding_dim + 2 + if not accessor_proto.HasField("embedx_dim"): + accessor_proto.embedx_dim = embedding_dim - 1 + if not accessor_proto.HasField("embedx_threshold"): + accessor_proto.embedx_threshold = 0 + + ctr_accessor_param = accessor_proto.ctr_accessor_param + if not ctr_accessor_param.HasField("nonclk_coeff"): + ctr_accessor_param.nonclk_coeff = 0.1 + if not ctr_accessor_param.HasField("click_coeff"): + ctr_accessor_param.click_coeff = 1.0 + if not ctr_accessor_param.HasField("base_threshold"): + ctr_accessor_param.base_threshold = 0 + if not ctr_accessor_param.HasField("delta_threshold"): + ctr_accessor_param.delta_threshold = 0 + if not ctr_accessor_param.HasField("delta_keep_days"): + ctr_accessor_param.delta_keep_days = 16 + if not ctr_accessor_param.HasField("show_click_decay_rate"): + ctr_accessor_param.show_click_decay_rate = 1 + if not ctr_accessor_param.HasField("delete_threshold"): + ctr_accessor_param.delete_threshold = 0 + if not ctr_accessor_param.HasField("delete_after_unseen_days"): + ctr_accessor_param.delete_after_unseen_days = 30 + if not ctr_accessor_param.HasField("ssd_unseenday_threshold"): + ctr_accessor_param.ssd_unseenday_threshold = 1 + + for sgd_param in [ + accessor_proto.embed_sgd_param, accessor_proto.embedx_sgd_param + ]: + if not sgd_param.HasField("name"): + sgd_param.name = "SparseAdaGradSGDRule" + if sgd_param.name == "SparseAdaGradSGDRule" or sgd_param.name == "StdAdaGradSGDRule": + if not sgd_param.adagrad.HasField("learning_rate"): + sgd_param.adagrad.learning_rate = 0.05 + if not sgd_param.adagrad.HasField("initial_g2sum"): + sgd_param.adagrad.initial_g2sum = 3.0 + if not sgd_param.adagrad.HasField("initial_range"): + sgd_param.adagrad.initial_range = 0.0001 + if len(sgd_param.adagrad.weight_bounds) == 0: + 
sgd_param.adagrad.weight_bounds.extend([-10.0, 10.0]) + if sgd_param.name == "SparseNaiveSGDRule": + if not sgd_param.naive.HasField("learning_rate"): + sgd_param.naive.learning_rate = 0.05 + if not sgd_param.naive.HasField("initial_range"): + sgd_param.naive.initial_range = 0.0001 + if len(sgd_param.naive.weight_bounds) == 0: + sgd_param.naive.weight_bounds.extend([-10.0, 10.0]) + if sgd_param.name == "SparseAdamSGDRule": + if not sgd_param.adam.HasField("learning_rate"): + sgd_param.adam.learning_rate = 0.001 + if not sgd_param.adam.HasField("initial_range"): + sgd_param.adam.initial_range = 0.0001 + if not sgd_param.adam.HasField("beta1_decay_rate"): + sgd_param.adam.beta1_decay_rate = 0.9 + if not sgd_param.adam.HasField("beta2_decay_rate"): + sgd_param.adam.beta2_decay_rate = 0.999 + if not sgd_param.adam.HasField("ada_epsilon"): + sgd_param.adam.ada_epsilon = 1e-08 + if len(sgd_param.adam.weight_bounds) == 0: + sgd_param.adam.weight_bounds.extend([-10.0, 10.0]) + + +class CommonAccessor(Accessor): def __init__(self): - self.accessor_class = "" - self.table_name = None - self.entry = None + super(CommonAccessor, self).__init__() + self.table_name = '' + self.entry = 'none' self.attrs = [] self.params = [] self.dims = [] self.trainer_num = 0 - self.sync = "false" - self.table_num = None - self.table_dim = None + self.sync = False self.initializers = [] self.opt_input_map = {} self.opt_attr_map = {} @@ -422,233 +430,361 @@ class CommonAccessor: self.initializers = initializers self.attrs = attrs - def to_string(self, indent): - accessor_str = "{}common {{{}\n{}}}" - attrs = "" - attrs += "name: \"{}\" ".format(self.accessor_class) - - if self.table_name: - attrs += "table_name: \"{}\" ".format(self.table_name) - - if self.entry: - attrs += "entry: \"{}\" ".format(self.entry) - attrs += "trainer_num: {} ".format(self.trainer_num) - attrs += "sync: {} ".format(self.sync) - if self.table_num: - attrs += "table_num: {} ".format(self.table_num) - if self.table_dim: - attrs += "table_dim: {} ".format(self.table_dim) - - for param in self.params: - attrs += "params: \"{}\" ".format(param) - - for dim in self.dims: - attrs += "dims: {} ".format(dim) - - for initializer in self.initializers: - attrs += "initializers: \"{}\" ".format(initializer) - - attrs += "\n" - return accessor_str.format( - conv_indent(indent), attrs, conv_indent(indent)) + # CommonAccessorParameter common + def _set(self, proto): + proto.name = self.accessor_class + proto.table_name = self.table_name + proto.params.extend(self.params) + proto.dims.extend(self.dims) + proto.initializers.extend(self.initializers) + proto.entry = self.entry + proto.trainer_num = self.trainer_num + proto.sync = self.sync + proto.table_num = self.table_num + proto.table_dim = self.table_dim class Tensor: - def __init__(self): - self.main_program_id = None - self.startup_program_id = None - self.feed_var_name = None - self.fetch_var_name = None - self.tensor_table_class = False - - def to_string(self, indent): - program_str = "{}tensor {{{}\n{}}}" - attrs = "" - attrs += "feed_var_name: \"{}\" ".format(str(self.feed_var_name)) - attrs += "fetch_var_name: \"{}\" ".format(str(self.fetch_var_name)) - attrs += "startup_program_id: {} ".format(str(self.startup_program_id)) - attrs += "main_program_id: {} ".format(str(self.main_program_id)) - attrs += "tensor_table_class: \"{}\" ".format( - str(self.tensor_table_class)) - attrs += "\n" - return program_str.format( - conv_indent(indent), attrs, conv_indent(indent)) + def __init__(self, tesnor_dcit): + 
self.tensor_dict = tesnor_dcit + + def _set(self, tensor_proto): + tensor_proto.main_program_id = self.tensor_dict.get("main_program_id", + 0) + tensor_proto.startup_program_id = self.tensor_dict.get( + "startup_program_id", 0) + tensor_proto.feed_var_name = self.tensor_dict.get("feed_var_name", '') + tensor_proto.fetch_var_name = self.tensor_dict.get("fetch_var_name", '') + tensor_proto.tensor_table_class = self.tensor_dict.get( + "tensor_table_class", '') class Table: def __init__(self): - self.id = -1 self.table_class = None self.shard_num = -1 self.type = None - self.accessor = None - self.common = None + self.accessor = Accessor() + self.shard_num = 256 + self.common = CommonAccessor() self.tensor = None - self.accessor_proto = None - - def to_string(self, indent): - # if self.id == 1: - # proto_txt = '' - # with open('./sparse_table.prototxt') as f: - # proto_txt = f.read() - # return proto_txt - table_str = "{}downpour_table_param {{{}\n{}}}" - - attrs = "" - attrs += "table_id: {} ".format(self.id) - attrs += "table_class: \"{}\" ".format(self.table_class) - attrs += "shard_num: {} ".format(self.shard_num) - attrs += "type: {}".format(self.type) - attrs += "\n" - indent += 2 - - if self.accessor_proto is not None: - accessor_str = "{}accessor {{{}\n{}}}" - accessor_str = accessor_str.format( - conv_indent(indent), self.accessor_proto, conv_indent(indent)) - attrs += accessor_str + "\n" - elif self.accessor is not None: - attrs += self.accessor.to_string(indent) - attrs += "\n" - - if self.tensor is not None: - attrs += self.tensor.to_string(indent) - attrs += "\n" - - if self.common is not None: - attrs += self.common.to_string(indent) - attrs += "\n" - - return table_str.format(conv_indent(indent), attrs, conv_indent(indent)) + def _set(self, table_proto): + pass -class Service: - def __init__(self): - self.server_class = "BrpcPsServer" - self.client_class = "BrpcPsClient" - self.service_class = "BrpcPsService" - self.start_server_port = 0 - self.server_thread_num = 12 - def to_string(self, indent): - service_str = "{}service_param {{{}\n{}}}" +class BarrierTable(Table): + def __init__(self, context, idx): + super(BarrierTable, self).__init__() + self.type = None + self.shard_num = 256 + self.accessor.accessor_class = 'CommMergeAccessor' + self.common.attrs = "" + self.common.dims = [] + self.common.params = [] + self.is_heter_ps_mode = context['is_heter_ps_mode'] + self.role_maker = context['role_maker'] + self.idx = idx + self.is_sync = context['is_sync'] + + def _set(self, table_proto): + table_proto.table_id = self.idx + table_proto.table_class = 'BarrierTable' + table_proto.shard_num = 256 + table_proto.type = ps_pb2.PS_OTHER_TABLE + + table_proto.accessor.accessor_class = "CommMergeAccessor" + table_proto.accessor.fea_dim = 0 + table_proto.accessor.embedx_dim = 0 + + table_proto.common.name = "" + table_proto.common.table_name = "barrier_table" + table_proto.common.sync = self.is_sync + table_proto.common.entry = 'none' + + trainer_num = get_trainers(self.role_maker) + if self.is_heter_ps_mode: + trainer_num += len(self.role_maker._get_heter_worker_endpoints()) + table_proto.common.trainer_num = trainer_num - attrs = "" - attrs += "server_class: \"{}\" ".format(self.server_class) - attrs += "client_class: \"{}\" ".format(self.client_class) - attrs += "service_class: \"{}\" ".format(self.service_class) - attrs += "start_server_port: {} ".format(self.start_server_port) - attrs += "server_thread_num: {} ".format(self.server_thread_num) - return service_str.format( - 
conv_indent(indent), attrs, conv_indent(indent)) +class TensorTable(Table): + def __init__(self, idx, tensor_dict, role_maker): + super(TensorTable, self).__init__() + self.idx = idx + self.tensor_dict = tensor_dict + self.role_maker = role_maker + def _set(self, table_proto): + table_proto.table_id = self.idx + table_proto.type = ps_pb2.PS_OTHER_TABLE + table_proto.table_class = self.tensor_dict.get("tensor_table_class", '') -class DownpourServer: - def __init__(self): - self.service = None - self.tables = [] + table_proto.accessor.accessor_class = "CommMergeAccessor" - def set_service_param(self, service): - self.service = service + table_proto.common.table_name = self.tensor_dict.get("feed_var_name", + '') + table_proto.common.trainer_num = get_trainers(self.role_maker) - def append_tables(self, table): - if not isinstance(table, Table): - raise ValueError("only support instance Table") - self.tables.append(table) + tensor = Tensor(self.tensor_dict) + tensor._set(table_proto.tensor) - def to_string(self, indent): - server_str = "{}downpour_server_param {{{}\n{}}}" - table_strs = "" - indent += 2 +class SparseTable(Table): + def __init__(self, context, send_ctx): + super(SparseTable, self).__init__() + self.context = context + self.ctx = send_ctx + self.type = None + self.table_class = 'MemorySparseTable' + self.accessor = Accessor() - table_strs += "\n" - table_strs += self.service.to_string(indent) + def _set(self, table_proto): + ctx = self.ctx + if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1 or ( + ctx.is_sparse() == False): + return + table_proto.table_id = ctx.table_id() + table_proto.table_class = self.table_class + table_proto.type = ps_pb2.PS_SPARSE_TABLE + table_proto.shard_num = self.shard_num + + self.common.table_name = self.context['grad_name_to_param_name'][ + ctx.origin_varnames()[0]] + + print('new table_name: {}'.format(self.common.table_name)) + all_table_proto = self.context[ + "user_defined_strategy"].sparse_table_configs + usr_table_proto = all_table_proto.add() + for proto in all_table_proto: + if proto.table_name == self.common.table_name: + usr_table_proto = proto + break + table_proto.table_class = 'MemorySparseTable' + warnings.warn("The PS mode must use MemorySparseTable.") + if usr_table_proto.HasField("shard_num"): + table_proto.shard_num = usr_table_proto.shard_num + else: + table_proto.shard_num = 1000 + warnings.warn( + "The shard_num of sparse table is not set, use default value 1000." 
+ ) - for table in self.tables: - table_strs += "\n" - table_strs += table.to_string(indent) - return server_str.format( - conv_indent(indent), table_strs, conv_indent(indent)) + if usr_table_proto.accessor.ByteSize() == 0: + warnings.warn( + "The accessor of sparse table is not set, use default value.") + table_proto.accessor.ParseFromString( + usr_table_proto.accessor.SerializeToString()) + self.accessor._set(table_proto.accessor, self.common.table_name, + ctx.program_id(), self.context) -class Server: - def __init__(self): - self.servers = [] + check_embedding_dim(table_proto.accessor, self.common.table_name, + ctx.program_id(), self.context) - def add_server(self, server): - if not isinstance(server, DownpourServer): - raise ValueError("only support instance DownpourServer") - self.servers.append(server) + adam_d2sum = self.context["user_defined_strategy"].adam_d2sum + self.common.parse_by_optimizer(ctx, self.context) + self.common.parse_entry(self.common.table_name, + ctx.program_id(), self.context) + self.common.sync = True if self.context['is_sync'] else False - def __str__(self): - server_str = "server_param {{{}\n}}" - indent = 2 - servers_str = "" - for server in self.servers: - servers_str += "\n" - servers_str += server.to_string(indent) + self.common._set(table_proto.common) - return server_str.format(servers_str) +class GeoSparseTable(SparseTable): + def __init__(self, context, send_ctx): + super(GeoSparseTable, self).__init__(context, send_ctx) + self.table_class = "SparseGeoTable" + if self.context['ps_mode'] != DistributedMode.GEO: + raise ValueError("not geo sparse table!") + + def _set(self, table_proto): + ctx = self.ctx + if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1 or ( + ctx.is_sparse() == False): + return + table_proto.table_id = ctx.table_id() + table_proto.table_class = self.table_class + table_proto.type = ps_pb2.PS_SPARSE_TABLE + table_proto.shard_num = self.shard_num + + table_proto.accessor.accessor_class = 'CommMergeAccessor' + table_proto.accessor.fea_dim = ctx.sections()[0] + table_proto.accessor.embedx_dim = ctx.sections()[1] + + self.common.table_name = self.context['grad_name_to_param_name'][ + ctx.origin_varnames()[0]] + adam_d2sum = self.context["user_defined_strategy"].adam_d2sum + self.common.parse_by_optimizer(ctx, self.context) + self.common.parse_entry(self.common.table_name, + ctx.program_id(), self.context) + self.common.sync = False + self.common._set(table_proto.common) + + +class DenseTable(Table): + def __init__(self, context, send_ctx): + super(DenseTable, self).__init__() + self.context = context + self.ctx = send_ctx + self.accessor = Accessor() -class DownpourWorker: + def _set(self, table_proto): + ctx = self.ctx + if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1 or ( + ctx.is_sparse() == True): + return + + table_proto.table_id = ctx.table_id() + + table_proto.type = ps_pb2.PS_DENSE_TABLE + table_proto.table_class = "CommonDenseTable" + table_proto.shard_num = 256 + + table_proto.accessor.accessor_class = 'CommMergeAccessor' + table_proto.accessor.fea_dim = ctx.sections()[0] + table_proto.accessor.embedx_dim = 1 + + self.common.table_name = "MergedDense" + adam_d2sum = self.context["user_defined_strategy"].adam_d2sum + self.common.parse_by_optimizer(ctx, self.context) + self.common.parse_entry(self.common.table_name, + ctx.program_id(), self.context) + self.common.sync = True if self.context['is_sync'] else False + + self.common._set(table_proto.common) + + +class Server: def __init__(self): - self.tables = [] + 
pass - def append_tables(self, table): - if not isinstance(table, Table): - raise ValueError("only support instance Table") - self.tables.append(table) + def _set(self): + pass - def to_string(self, indent): - worker_str = "{}downpour_worker_param {{{}\n{}}}" - table_strs = "" - indent += 2 - for table in self.tables: - table_strs += "\n" - table_strs += table.to_string(indent) - return worker_str.format( - conv_indent(indent), table_strs, conv_indent(indent)) +class DownpourServer(Server): + def __init__(self): + super(DownpourServer, self).__init__() + + def _set(self): + pass class Worker: def __init__(self): - self.workers = [] + pass - def add_worker(self, worker): - if not isinstance(worker, DownpourWorker): - raise ValueError("only support instance DownpourWorker") - self.workers.append(worker) + def _set(self): + pass - def __str__(self): - worker_str = "worker_param {{{}\n}}" - indent = 2 - workers_str = "" - for worker in self.workers: - workers_str += "\n" - workers_str += worker.to_string(indent) - return worker_str.format(workers_str) +class DownpourWorker(Worker): + def __init__(self): + super(DownpourWorker, self).__init__() + + def _set(self): + pass class fsClient: - def __init__(self, proto): - self.proto = proto - self.uri = proto.uri - self.user = proto.user - self.passwd = proto.passwd - self.hadoop_bin = proto.hadoop_bin - - def to_string(self): - proto_txt = text_format.MessageToString(self.proto) - if proto_txt: - fs_str = "fs_client_param {{\n{}}}" - return fs_str.format(proto_txt) + def __init__(self, fs_client_param): + self.fs_client_param = fs_client_param + + def _set(self, proto): + if not text_format.MessageToString(self.fs_client_param): + return + proto.uri = self.fs_client_param.uri + proto.user = self.fs_client_param.user + proto.passwd = self.fs_client_param.passwd + proto.hadoop_bin = self.fs_client_param.hadoop_bin + + +class PsDescBuilder(object): + def __init__(self, context): + self.context = context + self.is_sync = context['is_sync'] + self.ps_mode = context['ps_mode'] + self.is_heter_ps_mode = context['is_heter_ps_mode'] + self.use_ps_gpu = context['use_ps_gpu'] + self.send_ctx = get_the_one_send_context( + self.context, + use_origin_program=True, + split_dense_table=self.is_heter_ps_mode) + + self.tensor_table_dict = {} # TODO + self._server_sub_program = [] + + self.tables = self._get_tables() + + self.service = self._get_service() + self.fs_client = self._get_fs_client() + + self.ps_desc = ps_pb2.PSParameter() + + def _get_tensor_tables(self): + program_idx = 0 + if not self.tensor_table_dict: + self._server_sub_program.append(Program().desc) + tables = [] + for table_name in self.tensor_table_dict: + tables.append(globals()['TensorTable'](len(tables), tensor_dict, + self.context['role_maker'])) + program_idx += 1 + return tables + + def _get_tables(self): + tables = [] + for idx, (name, ctx) in enumerate(self.send_ctx.items()): + print('####### {}\n'.format(ctx.is_sparse())) + if ctx.is_sparse(): + if self.ps_mode == DistributedMode.GEO: + tables.append(globals()['GeoSparseTable'](self.context, + ctx)) + else: + tables.append(globals()['SparseTable'](self.context, ctx)) + else: + tables.append(globals()['DenseTable'](self.context, ctx)) + self.tensor_tables = self._get_tensor_tables() + tables.extend(self.tensor_tables) + tables.append(globals()['BarrierTable'](self.context, len(tables))) + return tables + + def _get_service(self): + if self.use_ps_gpu: + return GpuService() else: - return "" + return Service() + + def _get_fs_client(self): 
+ return fsClient(self.context["user_defined_strategy"].fs_client_param) + + def build_worker_desc(self): + for table in self.tables: + table_proto = self.ps_desc.worker_param.downpour_worker_param.downpour_table_param.add( + ) + table._set(table_proto) + table_proto = self.ps_desc.server_param.downpour_server_param.downpour_table_param.add( + ) + table._set(table_proto) + self.service._set( + self.ps_desc.server_param.downpour_server_param.service_param) + return text_format.MessageToString(self.ps_desc) + + def build_server_desc(self): + for table in self.tables: + table_proto = self.ps_desc.server_param.downpour_server_param.downpour_table_param.add( + ) + table._set(table_proto) + self.sparse_table_maps = {} + if table_proto.type == ps_pb2.PS_SPARSE_TABLE and table_proto.common is not None: + self.sparse_table_maps[ + table_proto.common.table_name] = table_proto.table_id + + self.service._set( + self.ps_desc.server_param.downpour_server_param.service_param) + self.fs_client._set(self.ps_desc.fs_client_param) + return text_format.MessageToString(self.ps_desc) class TheOnePSRuntime(RuntimeBase): @@ -665,8 +801,11 @@ class TheOnePSRuntime(RuntimeBase): self.role_maker = context["role_maker"] self.origin_main_program = context["origin_main_program"] - self.origin_main_programs = context["origin_main_programs"] - + self.origin_main_programs = context.get("origin_main_programs", + [self.origin_main_program]) + self.context["origin_main_programs"] = self.origin_main_programs + self.context["origin_startup_programs"] = context.get( + 'origin_startup_programs', [context['origin_startup_program']]) self.context[ 'is_heter_ps_mode'] = self.role_maker._is_heter_parameter_server_mode self.is_heter_ps_mode = self.context['is_heter_ps_mode'] @@ -675,15 +814,23 @@ class TheOnePSRuntime(RuntimeBase): self.context['ps_mode'] = self.context['trainer'].mode self.context['use_ps_gpu'] = context['valid_strategy'].a_sync_configs[ 'use_ps_gpu'] - self.is_sync = True if self.context[ + self.context['is_sync'] = True if self.context[ 'ps_mode'] == DistributedMode.SYNC else False self.context['grad_name_to_param_name'] = {} self.context['tensor_table'] = {} build_var_distributed(self.context) + endpoints = get_ps_endpoints(self.role_maker) + self.string_hosts = [] + for idx, ep in enumerate(endpoints): + host, port = ep.split(":") + pshost = fluid.core.PSHost(host, int(port), idx) + self.string_hosts.append(pshost.serialize_to_string()) + + self.ps_desc_builder = PsDescBuilder(self.context) + def _init_worker(self): - worker = self._get_fleet_proto(is_server=False, is_sync=self.is_sync) - server = self._get_fleet_proto(is_server=True, is_sync=self.is_sync) + worker_desc = self.ps_desc_builder.build_worker_desc() if self.context['use_ps_gpu']: main_program = self.context['loss'].block.program @@ -701,23 +848,11 @@ class TheOnePSRuntime(RuntimeBase): kwargs["trainer_id"] = self.role_maker._worker_index() return kwargs - proto_txt = str(worker) + "\n" + str(server) - with open('proto_txt', 'w') as f: - f.write(proto_txt) - + proto_txt = worker_desc + "\n" + server_desc debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) - if debug: print("worker: \n{}".format(proto_txt)) - endpoints = get_ps_endpoints(self.role_maker) - - string_hosts = [] - for idx, ep in enumerate(endpoints): - host, port = ep.split(":") - pshost = fluid.core.PSHost(host, int(port), idx) - string_hosts.append(pshost.serialize_to_string()) - dense_map = get_the_one_recv_context( self.context, split_dense_table=self.is_heter_ps_mode) send_ctx = 
get_the_one_send_context( @@ -741,7 +876,7 @@ class TheOnePSRuntime(RuntimeBase): kwargs["trainer_id"] = self.role_maker._role_id() kwargs["trainers"] = self.role_maker._worker_num() - for table in server.servers[0].tables: + for table in server.servers[0].tables: #TODO if table.table_class == "BarrierTable": kwargs["barrier_table_id"] = table.id break @@ -755,7 +890,8 @@ class TheOnePSRuntime(RuntimeBase): trainer_config.mode, kwargs, trainer_config.get_communicator_flags()) self._communicator.init_with_ctx(send_ctx, dense_map, proto_txt, - string_hosts, fluid.global_scope()) + self.string_hosts, + fluid.global_scope()) fleet.util.barrier() info = self._communicator.get_client_info() @@ -812,275 +948,16 @@ class TheOnePSRuntime(RuntimeBase): previous_trainers, self.role_maker._role_id()) - def _push_sparse_param(self, - var_name, - table_id=-1, - scope=fluid.global_scope()): - self._communicator.push_sparse_param(var_name, table_id, scope) - - def _get_executor(self): - executor = fluid.Executor(fluid.CPUPlace()) - if self.is_heter_ps_mode: - if self.role_maker._is_heter_worker(): - heter_device_type = self.role_maker._heter_device_type().upper() - if heter_device_type not in ["GPU", "XPU", "CPU"]: - raise ValueError("Heter Worker Not Support Device {}". - format(device_type)) - if heter_device_type == "GPU": - executor = Executor( - fluid.CUDAPlace( - int(os.getenv("FLAGS_selected_gpus", "0")))) - elif heter_device_type == "XPU": - executor = Executor( - fluid.XPUPlace( - int(os.getenv("FLAGS_selected_xpus", "0")))) - return executor - - def _get_fleet_proto(self, is_server, is_sync, **kwargs): - def _build_merge_accessor(ctx): - accessor = Accessor() - accessor.accessor_class = "CommMergeAccessor" - accessor.optimizer = None - - if ctx.is_sparse(): - accessor.feature_dim = ctx.sections()[0] - accessor.embedding_dim = ctx.sections()[1] - else: - accessor.feature_dim = ctx.sections()[0] - accessor.embedding_dim = 1 - - return accessor - - def _build_barrier_table(idx): - table = Table() - table.id = idx - table.type = "PS_OTHER_TABLE" - table.table_class = "BarrierTable" - table.shard_num = 256 - - accessor = Accessor() - accessor.accessor_class = "CommMergeAccessor" - accessor.optimizer = None - accessor.feature_dim = 0 - accessor.embedding_dim = 0 - table.accessor = accessor - - common = CommonAccessor() - common.table_name = "barrier_table" - trainer_num = get_trainers(self.context['role_maker']) - if self.is_heter_ps_mode: - trainer_num += len(self.role_maker._get_heter_worker_endpoints( - )) - common.trainer_num = trainer_num - common.attrs = "" - common.dims = [] - common.params = [] - table.common = common - return table - - def _build_tensor_table(idx, tensor_dict): - table = Table() - table.id = idx - table.type = "PS_OTHER_TABLE" - table.table_class = tensor_dict["tensor_table_class"] - table.shard_num = 256 - - accessor = Accessor() - accessor.accessor_class = "CommMergeAccessor" - accessor.optimizer = None - accessor.feature_dim = 0 - accessor.embedding_dim = 0 - table.accessor = accessor - - common = CommonAccessor() - common.table_name = tensor_dict["feed_var_name"] - common.trainer_num = get_trainers(self.role_maker) - common.attrs = "" - common.dims = [] - common.params = [] - table.common = common - - tensor = Tensor() - tensor.main_program_id = tensor_dict["main_program_id"] - tensor.startup_program_id = tensor_dict["startup_program_id"] - tensor.feed_var_name = tensor_dict["feed_var_name"] - tensor.fetch_var_name = tensor_dict["fetch_var_name"] - 
tensor.tensor_table_class = tensor_dict["tensor_table_class"] - table.tensor = tensor - - return table - - def _add_tensor_table(tables): - tensor_table_dict = {} - program_idx = 0 - for table_name in tensor_table_dict: - if tensor_table_dict[table_name]["startup_program"] != None: - tensor_table_dict[table_name][ - "startup_program_id"] = program_idx - self._server_sub_program.append(tensor_table_dict[ - table_name]["startup_program"].desc) - program_idx += 1 - if tensor_table_dict[table_name]["main_program"] != None: - tensor_table_dict[table_name][ - "main_program_id"] = program_idx - self._server_sub_program.append(tensor_table_dict[ - table_name]["main_program"].desc) - program_idx += 1 - # Todo: Hard code for lr_decay table apply table id - new_table = _build_tensor_table( - len(tables), tensor_table_dict[table_name]) - tables.append(new_table) - return tables - - def _get_tables(): - send_ctx = get_the_one_send_context( - self.context, - use_origin_program=True, - split_dense_table=self.is_heter_ps_mode) - - tables = [] - for idx, (name, ctx) in enumerate(send_ctx.items()): - print(" wxm python test send_ctx.items-->", idx, (name, ctx)) - if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1: - continue - - table = Table() - table.id = ctx.table_id() - common = CommonAccessor() - - if ctx.is_sparse(): - table.type = "PS_SPARSE_TABLE" - table.shard_num = 256 - - common.table_name = self.context['grad_name_to_param_name'][ - ctx.origin_varnames()[0]] - - if self.context['ps_mode'] == DistributedMode.GEO: - table.table_class = "SparseGeoTable" - else: - all_table_proto = self.context[ - "user_defined_strategy"].sparse_table_configs - table_proto = all_table_proto.add() - for proto in all_table_proto: - if proto.table_name == common.table_name: - table_proto = proto - break - if table_proto.HasField("table_class"): - table.table_class = table_proto.table_class - else: - table.table_class = parse_table_class( - common.table_name, - ctx.program_id(), self.context) - if table.table_class != 'MemorySparseTable': - table.table_class = 'MemorySparseTable' - warnings.warn( - "The PS mode must use MemorySparseTable.") - - if table_proto.HasField("shard_num"): - table.shard_num = table_proto.shard_num - else: - table.shard_num = 1000 - warnings.warn( - "The shard_num of sparse table is not set, use default value 1000." - ) - - if table_proto.accessor.ByteSize() == 0: - warnings.warn( - "The accessor of sparse table is not set, use default value." 
- ) - get_default_accessor_proto( - table_proto.accessor, common.table_name, - ctx.program_id(), self.context) - check_embedding_dim(table_proto.accessor, - common.table_name, - ctx.program_id(), self.context) - table.accessor_proto = text_format.MessageToString( - table_proto.accessor) - else: - table.type = "PS_DENSE_TABLE" - table.table_class = "CommonDenseTable" - table.shard_num = 256 - common.table_name = "MergedDense" - - adam_d2sum = self.context["user_defined_strategy"].adam_d2sum - common.parse_by_optimizer(ctx, self.context) - - if ctx.is_sparse(): - common.parse_entry(common.table_name, - ctx.program_id(), self.context) - - if is_sync: - common.sync = "true" - else: - common.sync = "false" - table.common = common - - if table.table_class != 'MemorySparseTable': - accessor = _build_merge_accessor(ctx) - table.accessor = accessor - tables.append(table) - - tensor_table_dict = {} - if len(tensor_table_dict) > 0: - tables = _add_tensor_table(tables) - else: - empty_porgram = Program() - self._server_sub_program.append(empty_porgram.desc) - - barrier_table = _build_barrier_table(len(tables)) - tables.append(barrier_table) - return tables - - if is_server: - server = Server() - downpour_server = DownpourServer() - - service = Service() - dist_strategy = self.context["valid_strategy"] - use_ps_gpu = dist_strategy.a_sync_configs["use_ps_gpu"] - if use_ps_gpu: - service.server_class = "PsLocalServer" - service.client_class = "PsLocalClient" - downpour_server.set_service_param(service) - - tables = _get_tables() - downpour_server.tables = tables - server.add_server(downpour_server) - return server - else: - worker = Worker() - downpour_worker = DownpourWorker() - - tables = _get_tables() - downpour_worker.tables = tables - worker.add_worker(downpour_worker) - return worker - def _init_server(self, dirname=None, var_names=None, **kwargs): + server_desc = self.ps_desc_builder.build_server_desc() role_id = get_role_id(self.role_maker) - endpoints = get_ps_endpoints(self.role_maker) trainers = get_trainers(self.role_maker) if self.is_heter_ps_mode: trainers += len(self.role_maker._get_heter_worker_endpoints()) - server = self._get_fleet_proto(is_server=True, is_sync=self.is_sync) - proto_txt = str(server) - fs_client = fsClient(self.context["user_defined_strategy"] - .fs_client_param) - proto_txt = proto_txt + "\n" + fs_client.to_string() - - debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) - if debug: - print("server: \n{}".format(proto_txt)) - - string_hosts = [] - for idx, ep in enumerate(endpoints): - host, port = ep.split(":") - pshost = fluid.core.PSHost(host, int(port), idx) - string_hosts.append(pshost.serialize_to_string()) self._server = fluid.core.DistFleetWrapper() - self._server.init_server(proto_txt, string_hosts, role_id, trainers, - self._server_sub_program) + self._server.init_server(server_desc, self.string_hosts, role_id, + trainers, self._server_sub_program) dist_varnames = get_sparse_tablenames(self.origin_main_programs, True) sparse_varnames = get_sparse_tablenames(self.origin_main_programs, @@ -1101,10 +978,7 @@ class TheOnePSRuntime(RuntimeBase): if dirname is None or not load_varnames: return - sparse_table_maps = {} - for table in server.servers[0].tables: - if table.type == "PS_SPARSE_TABLE" and table.common is not None: - sparse_table_maps[table.common.table_name] = table.id + sparse_table_maps = self.ps_desc_builder.sparse_table_maps dirname = os.path.normpath(dirname) pserver_id = self.role_maker._role_id() @@ -1186,7 +1060,7 @@ class 
TheOnePSRuntime(RuntimeBase): sparses = get_the_one_recv_context( self.context, is_dense=False, - split_dense_table=self.is_heter_ps_mod, + split_dense_table=self.is_heter_ps_mode, use_origin_program=True) sparse_varnames = self._save_sparse_params(executor, dirname, sparses, @@ -1413,7 +1287,7 @@ class TheOnePSRuntime(RuntimeBase): fleet.util.barrier() if self.role_maker._is_first_worker(): - sparses = sget_the_one_recv_context( + sparses = get_the_one_recv_context( self.context, is_dense=False, split_dense_table=self.role_maker. diff --git a/python/paddle/distributed/ps/utils/ps_factory.py b/python/paddle/distributed/ps/utils/ps_factory.py index 1a426f3ad6c..701ae8be6cb 100755 --- a/python/paddle/distributed/ps/utils/ps_factory.py +++ b/python/paddle/distributed/ps/utils/ps_factory.py @@ -38,5 +38,7 @@ class PsProgramBuilderFactory(object): elif 'is_fl_ps_mode' in attrs and attrs[ 'is_fl_ps_mode'] == DistributedMode.FL: return globals()['FlPsProgramBuilder'](pass_ctx) - else: + elif attrs['ps_mode'] == DistributedMode.SYNC: return globals()['CpuSyncPsProgramBuilder'](pass_ctx) + else: + return globals()['CpuAsyncPsProgramBuilder'](pass_ctx) diff --git a/python/paddle/distributed/ps/utils/ps_program_builder.py b/python/paddle/distributed/ps/utils/ps_program_builder.py index 25e4dc28bdc..d737542f323 100755 --- a/python/paddle/distributed/ps/utils/ps_program_builder.py +++ b/python/paddle/distributed/ps/utils/ps_program_builder.py @@ -95,11 +95,12 @@ class GeoPsProgramBuilder(PsProgramBuilder): # 仅 CPU 模式 class CpuSyncPsProgramBuilder(PsProgramBuilder): def __init__(self, pass_ctx): - logger.info("start building cpu-sync-ps program") super(CpuSyncPsProgramBuilder, self).__init__(pass_ctx) + if self.ps_mode == DistributedMode.SYNC: + logger.info("start building cpu-sync-ps program") if self.ps_mode != DistributedMode.SYNC and self.ps_mode != DistributedMode.ASYNC: raise ValueError("ps mode: {} not matched {}", - format(self.ps_mode, "CpuSyncPsProgramBuilder")) + format(self.ps_mode, "PsProgramBuilder")) def _build_trainer_programs(self): add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass", diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py index ebec6900e38..ab5bd7da09d 100755 --- a/python/paddle/distributed/ps/utils/public.py +++ b/python/paddle/distributed/ps/utils/public.py @@ -73,7 +73,9 @@ def logger_config(log_path, logging_name): return logger -logger = logger_config(log_path='/ps_log', logging_name='ps_log') +ps_log_root_dir = '/ps_log/' +logger = logger_config( + log_path='/ps_usr_print_log', logging_name='ps_usr_print_log') class DistributedMode: diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt old mode 100644 new mode 100755 index 2f6df075478..1443eebf293 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -627,7 +627,7 @@ set_tests_properties(test_norm_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") if(WITH_DISTRIBUTE) add_subdirectory(distributed_passes) - + add_subdirectory(ps) add_subdirectory(auto_parallel) # FIXME(typhoonzero): add these tests back diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py b/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py index 63dd4b8e21e..93a0044a5e4 100755 --- 
a/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py @@ -23,13 +23,24 @@ import unittest import numpy as np from collections import OrderedDict from paddle.distributed.ps.utils.public import logger -from dist_pass_test_base import prepare_python_path_and_return_module, remove_path_if_exists +from paddle.fluid.tests.unittests.distributed_passes.dist_pass_test_base import prepare_python_path_and_return_module, remove_path_if_exists import paddle.distributed.fleet as fleet class PsPassTestBase(unittest.TestCase): def init(self): - raise NotImplementedError + self.config = {} + self.config['ps_mode_config'] = "" + self.config['worker_num'] = "1" + self.config['server_num'] = "1" + self.config['run_minimize'] = "0" + self.config['run_single_pass'] = "0" + self.config['run_the_one_ps'] = '0' + self.config['debug_new_minimize'] = "0" + self.config['debug_new_pass'] = "0" + self.config['debug_the_one_ps'] = '0' + self.config['log_dir'] = "" + self.config['applied_pass_name'] = "" def setUp(self): print('Ps setUp...') @@ -37,7 +48,7 @@ class PsPassTestBase(unittest.TestCase): def tearDown(self): print('Ps tearDown...') - def ps_launch(self, config, ps_mode="cpu-ps"): + def ps_launch(self, ps_mode="cpu-ps"): if ps_mode == "cpu-ps" or ps_mode == 'heter-ps': os.environ['WITH_DISTRIBUTE'] = 'ON' @@ -45,23 +56,26 @@ class PsPassTestBase(unittest.TestCase): sys.executable, "-u", ] + [ - "-m", "launch", "--log_dir", config['log_dir'], "--worker_num", - config['worker_num'], "--server_num", config['server_num'] + "-m", "launch", "--log_dir", self.config['log_dir'], + "--worker_num", self.config['worker_num'], "--server_num", + self.config['server_num'] ] if ps_mode == 'heter-ps': os.environ['FLAGS_START_PORT'] = '12004' cmd += [ - '--heter_worker_num', config['heter_worker_num'], - '--heter_devices', config['heter_devices'] + '--heter_worker_num', self.config['heter_worker_num'], + '--heter_devices', self.config['heter_devices'] ] cmd += [ - "../ps/ps_dnn_trainer.py", "-m", config['ps_mode_config'], - "--run_minimize", config['run_minimize'], "--run_single_pass", - config['run_single_pass'], "--debug_new_pass", - config['debug_new_pass'], "--debug_new_minimize", - config['debug_new_minimize'], "--applied_pass_name", - config['applied_pass_name'] + "../ps/ps_dnn_trainer.py", "-m", self.config['ps_mode_config'], + "--run_minimize", self.config['run_minimize'], + "--run_single_pass", self.config['run_single_pass'], + "--run_the_one_ps", self.config['run_the_one_ps'], + "--debug_new_pass", self.config['debug_new_pass'], + "--debug_new_minimize", self.config['debug_new_minimize'], + "--applied_pass_name", self.config['applied_pass_name'], + "--debug_the_one_ps", self.config['debug_the_one_ps'] ] elif ps_mode == "gpu-ps": os.environ['FLAGS_LAUNCH_BARRIER'] = '0' @@ -80,12 +94,14 @@ class PsPassTestBase(unittest.TestCase): cmd = [ sys.executable, "-u", "../ps/ps_dnn_trainer.py", "-m", - config['ps_mode_config'], "--run_minimize", - config['run_minimize'], "--run_single_pass", - config['run_single_pass'], "--debug_new_pass", - config['debug_new_pass'], "--debug_new_minimize", - config['debug_new_minimize'], "--applied_pass_name", - config['applied_pass_name'] + self.config['ps_mode_config'], "--run_minimize", + self.config['run_minimize'], "--run_single_pass", + self.config['run_single_pass'], "--run_the_one_ps", + self.config['run_the_one_ps'], "--debug_new_pass", + self.config['debug_new_pass'], 
"--debug_new_minimize", + self.config['debug_new_minimize'], "--applied_pass_name", + self.config['applied_pass_name'], "--debug_the_one_ps", + self.config['debug_the_one_ps'] ] cmd = [shlex.quote(c) for c in cmd] diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py index b186869ee97..fd558ef0403 100755 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py @@ -21,31 +21,26 @@ import numpy as np import paddle from ps_pass_test_base import * -from paddle.distributed.ps.utils.public import logger +from paddle.distributed.ps.utils.public import logger, ps_log_root_dir from paddle.fluid.tests.unittests.ps.ps_dnn_trainer import DnnTrainer class TestPsTrainerPass(PsPassTestBase): - def init(self): - self.config = {} - self.config['ps_mode_config'] = "" - self.config['worker_num'] = "1" - self.config['server_num'] = "1" - self.config['run_minimize'] = "0" - self.config['run_single_pass'] = "0" - self.config['debug_new_minimize'] = "0" - self.config['debug_new_pass'] = "0" - self.config['log_dir'] = "" - self.config['applied_pass_name'] = "" - def setUp(self): pass def tearDown(self): pass - def check(self): - pass + def check(self, file1, file2): + with open(file1, 'r', encoding='utf-8') as f: + text1 = f.read() + with open(file2, 'r', encoding='utf-8') as f: + text2 = f.read() + if text1 == text2: + return True + else: + return False def test_ps_optimizer_minimize_cpu_async(self): self.init() @@ -53,16 +48,21 @@ class TestPsTrainerPass(PsPassTestBase): self.config['run_minimize'] = '1' self.config['debug_new_minimize'] = '0' - self.config['log_dir'] = "/async_cpu_log_old_minimize" + self.config['log_dir'] = ps_log_root_dir + "async_cpu_log_old_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() self.config['debug_new_minimize'] = '1' - self.config['log_dir'] = "/async_cpu_log_new_minimize" + self.config['log_dir'] = ps_log_root_dir + "async_cpu_log_new_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() - self.check() + file1 = '/ps_log/async_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/async_run_minimize_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_ps_optimizer_minimize_cpu_async passed!') + else: + logger.error('test_ps_optimizer_minimize_cpu_async failed!') def test_ps_optimizer_minimize_cpu_sync(self): self.init() @@ -70,16 +70,22 @@ class TestPsTrainerPass(PsPassTestBase): self.config['run_minimize'] = '1' self.config['debug_new_minimize'] = '0' - self.config['log_dir'] = "/sync_cpu_log_old_minimize" + self.config['log_dir'] = ps_log_root_dir + "sync_cpu_log_old_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() self.config['debug_new_minimize'] = '1' - self.config['log_dir'] = "/sync_cpu_log_new_minimize" + self.config['log_dir'] = ps_log_root_dir + "sync_cpu_log_new_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) - - self.check() + self.ps_launch() + ''' + file1 = '/ps_log/sync_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/sync_run_minimize_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_ps_optimizer_minimize_cpu_sync passed!') + else: + 
logger.error('test_ps_optimizer_minimize_cpu_sync failed!') + ''' def test_ps_optimizer_minimize_cpu_geo(self): self.init() @@ -87,16 +93,21 @@ class TestPsTrainerPass(PsPassTestBase): self.config['run_minimize'] = '1' self.config['debug_new_minimize'] = '0' - self.config['log_dir'] = "/geo_cpu_log_old_minimize" + self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_old_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() self.config['debug_new_minimize'] = '1' - self.config['log_dir'] = "/geo_cpu_log_new_minimize" + self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_new_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() - self.check() + file1 = '/ps_log/geo_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/geo_run_minimize_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_ps_optimizer_minimize_cpu_geo passed!') + else: + logger.error('test_ps_optimizer_minimize_cpu_geo failed!') # heter ps 二阶段 def test_ps_optimizer_minimize_heter(self): @@ -110,14 +121,24 @@ class TestPsTrainerPass(PsPassTestBase): self.config['ps_mode_config'] = "../ps/heter_ps_config.yaml" self.config['debug_new_minimize'] = '0' - self.config['log_dir'] = "/heter_log_old_minimize" + self.config['log_dir'] = ps_log_root_dir + "heter_log_old_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config, 'heter-ps') + self.ps_launch('heter-ps') self.config['debug_new_minimize'] = '1' - self.config['log_dir'] = "/heter_log_new_minimize" + self.config['log_dir'] = ps_log_root_dir + "heter_log_new_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config, 'heter-ps') + self.ps_launch('heter-ps') + ''' + file1 = '/ps_log/heter_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/heter_run_minimize_debug:_1_worker_main.prototxt' + file3 = '/ps_log/heter_run_minimize_debug:_0_heter_worker_main.prototxt' + file4 = '/ps_log/heter_run_minimize_debug:_1_heter_worker_main.prototxt' + if self.check(file1, file2) and self.check(file3, file4): + logger.info('test_ps_optimizer_minimize_heter passed!') + else: + logger.error('test_ps_optimizer_minimize_heter failed!') + ''' def test_ps_optimizer_minimize_gpu(self): self.init() @@ -125,29 +146,42 @@ class TestPsTrainerPass(PsPassTestBase): self.config['ps_mode_config'] = "../ps/gpu_ps_config.yaml" self.config['debug_new_minimize'] = '0' - self.ps_launch(self.config, "gpu-ps") + self.ps_launch("gpu-ps") self.config['debug_new_minimize'] = '1' - self.ps_launch(self.config, "gpu-ps") + self.ps_launch("gpu-ps") - self.check() + file1 = '/ps_log/gpubox_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/gpubox_run_minimize_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_ps_optimizer_minimize_gpu passed!') + else: + logger.error('test_ps_optimizer_minimize_gpu failed!') def test_append_send_ops_pass(self): self.init() self.config['run_single_pass'] = '1' + self.config['ps_mode_config'] = "../ps/cpu_async_ps_config.yaml" self.config['applied_pass_name'] = "append_send_ops_pass" self.config['debug_new_pass'] = '0' - self.config['log_dir'] = "/log_old_" + self.config['applied_pass_name'] + self.config['log_dir'] = ps_log_root_dir + "log_old_" + self.config[ + 'applied_pass_name'] remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config, "cpu-ps") + self.ps_launch("cpu-ps") self.config['debug_new_pass'] = '1' - 
self.config['log_dir'] = "/log_new_" + self.config['applied_pass_name'] + self.config['log_dir'] = ps_log_root_dir + "log_new_" + self.config[ + 'applied_pass_name'] remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config, "cpu-ps") - - self.check() + self.ps_launch("cpu-ps") + + file1 = '/ps_log/async_append_send_ops_pass_debug:_0_worker_main.prototxt' + file2 = '/ps_log/async_append_send_ops_pass_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_append_send_ops_pass passed!') + else: + logger.info('test_append_send_ops_pass failed!') def test_distributed_ops_pass(self): pass diff --git a/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt old mode 100644 new mode 100755 index 3aef3283b82..9af32a8aca7 --- a/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt @@ -3,6 +3,6 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) + list(APPEND TEST_OPS ${TEST_OP}) + set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 50) endforeach(TEST_OP) - -set_tests_properties(test_the_one_ps PROPERTIES TIMEOUT 50) diff --git a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py index d08c1d41c89..bc87fc255a5 100755 --- a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py +++ b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py @@ -264,12 +264,16 @@ def parse_args(): '--run_minimize', type=int, default=0, help="test single pass") parser.add_argument( '--run_single_pass', type=int, default=0, help="test single pass") + parser.add_argument( + '--run_the_one_ps', type=int, default=0, help="test the_one_ps") parser.add_argument( '--debug_new_minimize', type=int, default=0, help="test single pass") parser.add_argument( '--debug_new_pass', type=int, default=0, help="test single pass") parser.add_argument( '--applied_pass_name', type=str, default="", help="test single pass") + parser.add_argument( + '--debug_the_one_ps', type=int, default=0, help="test the_one_ps") args = parser.parse_args() args.abs_dir = os.path.dirname(os.path.abspath(args.config_yaml)) @@ -280,9 +284,11 @@ def parse_args(): config["pure_bf16"] = args.pure_bf16 config['run_minimize'] = args.run_minimize config['run_single_pass'] = args.run_single_pass + config['run_the_one_ps'] = args.run_the_one_ps config['debug_new_minimize'] = args.debug_new_minimize config['debug_new_pass'] = args.debug_new_pass config['applied_pass_name'] = args.applied_pass_name + config['debug_the_one_ps'] = args.debug_the_one_ps yaml_helper.print_yaml(config) return config @@ -344,15 +350,15 @@ class DnnTrainer(object): fleet_obj.minimize(loss) if fleet.is_server(): - _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str( + _main_file = ps_log_root_dir + sync_mode + '_run_minimize' + '_debug:_' + str( self.config['debug_new_minimize']) + '_server_main.prototxt' debug_program(_main_file, loss.block.program) elif fleet.is_worker(): - _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str( + _main_file = ps_log_root_dir + sync_mode + '_run_minimize' + '_debug:_' + str( self.config['debug_new_minimize']) + '_worker_main.prototxt' debug_program(_main_file, loss.block.program) elif self.role_maker._is_heter_worker(): - _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str( + _main_file = ps_log_root_dir + sync_mode + 
'_run_minimize' + '_debug:_' + str( self.config[ 'debug_new_minimize']) + '_heter_worker_main.prototxt' debug_program(_main_file, loss.block.program) @@ -397,16 +403,84 @@ class DnnTrainer(object): _main = worker.append_send_ops_pass(_main, compiled_config) if fleet.is_server(): - _main_file = '/' + sync_mode + "_" + str(config[ + _main_file = ps_log_root_dir + sync_mode + "_" + str(config[ "applied_pass_name"]) + '_debug:_' + str(self.config[ 'debug_new_pass']) + '_server_main.prototxt' debug_program(_main_file, _main) elif fleet.is_worker(): - _main_file = '/' + sync_mode + "_" + str(config[ + _main_file = ps_log_root_dir + sync_mode + "_" + str(config[ "applied_pass_name"]) + '_debug:_' + str(self.config[ 'debug_new_pass']) + '_worker_main.prototxt' debug_program(_main_file, _main) + def run_the_one_ps(self): + self.init_fleet_with_gloo() + self.model = get_model(self.config) + self.input_data = self.model.create_feeds() + self.metrics = self.model.net(self.input_data) + loss = self.model._cost + user_defined_strategy = get_user_defined_strategy(self.config) + learning_rate = self.config.get( + "hyper_parameters.optimizer.learning_rate") + sync_mode = self.config.get("runner.sync_mode") + inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True) + + self.role_maker._generate_role() # 必要 + if self.config['debug_the_one_ps'] == 1: + logger.info("entering run_the_one_ps -- new") + + from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer + ps_optimizer = ParameterServerOptimizer(inner_optimizer) + ps_optimizer._set_basic_info(loss, self.role_maker, inner_optimizer, + user_defined_strategy) + ps_optimizer.minimize_impl(loss) + + from paddle.distributed.ps.the_one_ps import TheOnePSRuntime + _runtime_handle = TheOnePSRuntime() # ps 目录下重构版的 TheOnePSRuntime + _runtime_handle._set_basic_info(ps_optimizer.pass_ctx._attrs) + if fleet.is_worker(): + worker_desc = _runtime_handle.ps_desc_builder.build_worker_desc( + ) + with open(ps_log_root_dir + sync_mode + '_' + + 'new_worker_ps_desc', 'w') as f: + f.write(worker_desc) + if fleet.is_server(): + server_desc = _runtime_handle.ps_desc_builder.build_server_desc( + ) + with open(ps_log_root_dir + sync_mode + '_' + + 'new_server_ps_desc', 'w') as f: + f.write(server_desc) + + else: + pass + ''' + logger.info("entering run_the_one_ps -- old") + fleet_obj = fleet.distributed_optimizer( + inner_optimizer, user_defined_strategy) + fleet_obj.minimize(loss) + if fleet.is_worker(): + worker_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=False, is_sync=False) + server_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=True, is_sync=False) + with open(ps_log_root_dir + sync_mode + '_' + 'worker_ps_desc', 'w') as f: + f.write(str(worker_desc) + str(server_desc)) + if fleet.is_server(): + server_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=True, is_sync=False) + with open(ps_log_root_dir + sync_mode + '_' + 'server_ps_desc', 'w') as f: + f.write(str(server_desc) + str(fleet_obj._runtime_handle._get_fs_client_desc().to_string())) + ''' + if fleet.is_server(): + _main_file = ps_log_root_dir + sync_mode + '_run_the_one_ps' + '_debug:_' + str( + self.config['debug_the_one_ps']) + '_server_main.prototxt' + debug_program(_main_file, loss.block.program) + elif fleet.is_worker(): + _main_file = ps_log_root_dir + sync_mode + '_run_the_one_ps' + '_debug:_' + str( + self.config['debug_the_one_ps']) + '_worker_main.prototxt' + debug_program(_main_file, loss.block.program) + elif 
self.role_maker._is_heter_worker(): + _main_file = ps_log_root_dir + sync_mode + '_run_the_one_ps' + '_debug:_' + str( + self.config['debug_the_one_ps']) + '_heter_worker_main.prototxt' + debug_program(_main_file, loss.block.program) + if __name__ == "__main__": paddle.enable_static() @@ -418,3 +492,5 @@ if __name__ == "__main__": benchmark_main.run_single_pass() elif config['run_minimize'] == 1: benchmark_main.run_minimize() + elif config['run_the_one_ps'] == 1: + benchmark_main.run_the_one_ps() diff --git a/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py b/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py old mode 100644 new mode 100755 index 78bae0e50c5..8dddc6abd4c --- a/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py +++ b/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py @@ -22,16 +22,100 @@ import numpy as np import paddle import paddle.fluid as fluid +import paddle +from paddle.fluid.tests.unittests.distributed_passes.ps_pass_test_base import * +from paddle.distributed.ps.utils.public import logger, ps_log_root_dir +from ps_dnn_trainer import DnnTrainer +from paddle.distributed.fleet.proto import ps_pb2 +from google.protobuf import text_format + -class TestTheOnePs(unittest.TestCase): +class TestTheOnePs(PsPassTestBase): def setUp(self): - print('setUp...') + pass def tearDown(self): - print('tearDown...') + pass - def test_main(self): + def check(self, file1, file2): pass + ''' + f = open(file1, "rb") + ps_desc_1 = ps_pb2.PSParameter() + text_format.Parse(f.read(), ps_desc_1) + f.close() + + f = open(file2, "rb") + ps_desc_2 = ps_pb2.PSParameter() + text_format.Parse(f.read(), ps_desc_2) + f.close() + str1 = text_format.MessageToString(ps_desc_1) + str2 = text_format.MessageToString(ps_desc_2) + #logger.info('### msg10: {}'.format(str1)) + #logger.info('### msg20: {}'.format(str2)) + if str1 == str2: + return True + else: + return False + ''' + + def test_ps_cpu_async(self): + self.init() + self.config['ps_mode_config'] = "../ps/cpu_async_ps_config.yaml" + self.config['run_the_one_ps'] = '1' + + self.config['debug_the_one_ps'] = '0' + self.config[ + 'log_dir'] = ps_log_root_dir + "async_cpu_log_old_the_one_ps" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch() + + self.config['debug_the_one_ps'] = '1' + self.config[ + 'log_dir'] = ps_log_root_dir + "async_cpu_log_new_the_one_ps" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch() + + desc1 = '/ps_desc_baseline/async_worker_ps_desc' + desc2 = '/ps_log/async_new_worker_ps_desc' + desc3 = '/ps_desc_baseline/async_server_ps_desc' + desc4 = '/ps_log/async_new_server_ps_desc' + if self.check(desc1, desc2): + logger.info('test_ps_cpu_async ps_desc: worker passed!') + else: + logger.info('test_ps_cpu_async ps_desc: worker failed!') + if self.check(desc3, desc4): + logger.info('test_ps_cpu_async ps_desc: server passed!') + else: + logger.info('test_ps_cpu_async ps_desc: server failed!') + + def test_ps_cpu_geo(self): + self.init() + self.config['ps_mode_config'] = "../ps/cpu_geo_ps_config.yaml" + self.config['run_the_one_ps'] = '1' + + self.config['debug_the_one_ps'] = '0' + self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_old_the_one_ps" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch() + + self.config['debug_the_one_ps'] = '1' + self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_new_the_one_ps" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch() + + desc1 = '/ps_desc_baseline/geo_worker_ps_desc' + desc2 = 
'/ps_log/geo_new_worker_ps_desc' + desc3 = '/ps_desc_baseline/geo_server_ps_desc' + desc4 = '/ps_log/geo_new_server_ps_desc' + if self.check(desc1, desc2): + logger.info('test_ps_cpu_geo ps_desc: worker passed!') + else: + logger.info('test_ps_cpu_geo ps_desc: worker failed!') + if self.check(desc3, desc4): + logger.info('test_ps_cpu_geo ps_desc: server passed!') + else: + logger.info('test_ps_cpu_geo ps_desc: server failed!') if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/ps_dnn_model.py b/python/paddle/fluid/tests/unittests/ps_dnn_model.py index 0a147334dab..8d91e0f4678 100755 --- a/python/paddle/fluid/tests/unittests/ps_dnn_model.py +++ b/python/paddle/fluid/tests/unittests/ps_dnn_model.py @@ -74,6 +74,7 @@ class DNNLayer(nn.Layer): else: emb = self.embedding(s_input) emb = paddle.reshape(emb, shape=[-1, self.sparse_feature_dim]) + # emb.stop_gradient = True sparse_embs.append(emb) y_dnn = paddle.concat(x=sparse_embs + [dense_inputs], axis=1) -- GitLab From 28795771408a6dcd757ed367d348fb0ead5ab507 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 2 Mar 2022 16:40:05 +0800 Subject: [PATCH 058/272] run recompute's real backward with amp disabled (#40042) --- python/paddle/distributed/fleet/utils/recompute.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py index dccd7f62053..4ccb48ef72e 100755 --- a/python/paddle/distributed/fleet/utils/recompute.py +++ b/python/paddle/distributed/fleet/utils/recompute.py @@ -182,9 +182,10 @@ class RecomputeFunction(PyLayer): "none of output has requires_grad=True, this recompute() is not necessary" ) - # actually backward - paddle.autograd.backward(forward_outputs_with_grad, - backward_inputs_with_grad) + # actually backward + with paddle.amp.auto_cast(enable=False): + paddle.autograd.backward(forward_outputs_with_grad, + backward_inputs_with_grad) grads = list(inp._grad_ivar() for inp in detached_inputs if isinstance(inp, core.VarBase)) -- GitLab From 8492d3bbf6f01e98d6674b57b27913fe537584dd Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Wed, 2 Mar 2022 16:43:52 +0800 Subject: [PATCH 059/272] The backward code of Sparse Conv3d (#40054) Sparse Conv3d backward code --- .../kernels/sparse/convolution_grad_kernel.h | 66 +++++++ paddle/phi/kernels/sparse/cpu/convolution.h | 1 + .../sparse/cpu/convolution_grad_kernel.cc | 166 ++++++++++++++++++ .../kernels/test_sparse_conv3d_dev_api.cc | 112 +++++++++++- 4 files changed, 337 insertions(+), 8 deletions(-) create mode 100644 paddle/phi/kernels/sparse/convolution_grad_kernel.h create mode 100644 paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h new file mode 100644 index 00000000000..1a6ac852448 --- /dev/null +++ b/paddle/phi/kernels/sparse/convolution_grad_kernel.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void Conv3dGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& kernel, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + DenseTensor* x_grad, + DenseTensor* kernel_grad); + +template +std::vector Conv3dGrad(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& kernel, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups) { + DenseTensor x_grad = phi::Empty(dev_ctx); + DenseTensor kernel_grad = phi::Empty(dev_ctx); + Conv3dGradKernel(dev_ctx, + x, + rulebook, + kernel, + out_grad, + paddings, + dilations, + strides, + groups, + &x_grad, + &kernel_grad); + std::vector out(2); + out[0] = x_grad; + out[1] = kernel_grad; + return out; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index 5803069d927..ab2fef5320f 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/sparse/convolution_kernel.h" namespace phi { namespace sparse { diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc new file mode 100644 index 00000000000..d4f770ce871 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -0,0 +1,166 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/sparse/cpu/convolution.h" + +namespace phi { +namespace sparse { + +// rulebook: +//[ +// [kernel_index], +// [in_i], +// [out_i], +//] +// x_grad = out_grad * transpose(kenrel) +// kernel_grad = transpose(x) * out_grad +template +void Conv3dGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& kernel, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + DenseTensor* x_grad, + DenseTensor* kernel_grad) { + const auto& kernel_dims = kernel.dims(); + const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; + const int in_channels = kernel_dims[3]; + const int out_channels = kernel_dims[4]; + const int* rulebook_ptr = rulebook.data(); + + const int rulebook_len = rulebook.dims()[1]; + + DenseTensorMeta in_features_meta( + x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); + DenseTensorMeta d_x_features_meta( + x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); + DenseTensorMeta out_grad_features_meta( + x.dtype(), {rulebook_len, out_channels}, DataLayout::NCHW); + phi::DenseTensor in_features = + phi::Empty(dev_ctx, std::move(in_features_meta)); + phi::DenseTensor d_x_features = + phi::Empty(dev_ctx, std::move(d_x_features_meta)); + phi::DenseTensor out_grad_features = + phi::Empty(dev_ctx, std::move(out_grad_features_meta)); + + dev_ctx.Alloc( + &in_features, in_features.dtype(), sizeof(T) * in_features.numel()); + T* in_features_ptr = in_features.data(); + dev_ctx.Alloc( + &d_x_features, d_x_features.dtype(), sizeof(T) * d_x_features.numel()); + T* d_x_features_ptr = d_x_features.data(); + dev_ctx.Alloc(&out_grad_features, + out_grad_features.dtype(), + sizeof(T) * out_grad_features.numel()); + T* out_grad_features_ptr = out_grad_features.data(); + kernel_grad->Resize(kernel_dims); + dev_ctx.Alloc( + kernel_grad, kernel_grad->dtype(), kernel_grad->numel() * sizeof(T)); + T* d_kernel_ptr = kernel_grad->data(); + + Gather(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + rulebook_len, + in_channels, + in_features_ptr); + Gather(out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + rulebook_len, + out_channels, + out_grad_features_ptr); + + auto blas = phi::funcs::GetBlas(dev_ctx); + std::vector offsets(kernel_size + 1), counter(kernel_size, 0); + for (int i = 0; i < rulebook_len; i++) { + counter[rulebook_ptr[i]] += 1; + } + int offset = 0; + for (int i = 0; i < kernel_size; i++) { + offsets[i] = offset; + offset += counter[i]; + } + offsets[kernel_size] = offset; + + const T* kernel_ptr = kernel.data(); + for (int i = 0; i < kernel_size; i++) { + if (counter[i] <= 0) { + continue; + } + + const int M = counter[i]; + const int K = in_channels; + const int N = out_channels; + T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; + T* tmp_out_grad_ptr = out_grad_features_ptr + offsets[i] * out_channels; + const T* tmp_kernel_ptr = kernel_ptr + i * in_channels * out_channels; + T* tmp_d_x_ptr = d_x_features_ptr + offsets[i] * out_channels; + T* tmp_d_kernel_ptr = d_kernel_ptr + i * in_channels * out_channels; + + // call gemm: d_kernel = transpose(x) * out_grad + // (in_channels, n) * (n, out_channels) + blas.GEMM(CblasTrans, + CblasNoTrans, + M, + N, + K, + static_cast(1), + tmp_in_ptr, + tmp_out_grad_ptr, + static_cast(0), 
+ tmp_d_kernel_ptr); + + // call gemm: d_x = out_grad * transpose(kernel) + // (n, out_channels) * (out_channels, in_channels) + blas.GEMM(CblasNoTrans, + CblasTrans, + M, + K, + N, + static_cast(1), + tmp_out_grad_ptr, + tmp_kernel_ptr, + static_cast(0), + tmp_d_x_ptr); + } + + // 4. scatter + x_grad->Resize(x.non_zero_elements().dims()); + dev_ctx.Alloc(x_grad, x_grad->dtype(), sizeof(T) * x_grad->numel()); + T* x_grad_values_ptr = x_grad->data(); + memset(x_grad_values_ptr, 0, sizeof(T) * x_grad->numel()); + Scatter(d_x_features_ptr, + rulebook.data() + rulebook_len, + rulebook_len, + in_channels, + x_grad_values_ptr); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_conv_grad, + CPU, + ALL_LAYOUT, + phi::sparse::Conv3dGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); + kernel->InputAt(3).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 57601514370..00b2a256a95 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" @@ -59,7 +60,10 @@ void TestConv3dBase(const std::vector& indices, const std::vector& paddings, const std::vector& strides, const std::vector& dilations, - const float diff = 1e-3) { + const float diff = 1e-3, + const bool backward = false, + const std::vector features_grad = {}, + const std::vector kernel_grad = {}) { phi::CPUContext dev_ctx_cpu; dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() @@ -122,10 +126,29 @@ void TestConv3dBase(const std::vector& indices, correct_out_indices.size() * sizeof(int)); ASSERT_EQ(cmp_indices, 0); - for (uint64_t i = 0; i < correct_out_features.size(); i++) { - float tmp = std::fabs(static_cast( - correct_out_features[i] - out.non_zero_elements().data()[i])); - ASSERT_LT(tmp, diff); + auto f_verify = [&](const T* real_data, + const std::vector& correct_data) { + for (uint64_t i = 0; i < correct_data.size(); i++) { + float tmp = + std::fabs(static_cast(correct_data[i] - real_data[i])); + ASSERT_LT(tmp, diff); + } + }; + + f_verify(out.non_zero_elements().data(), correct_out_features); + + if (backward) { + std::vector grads = sparse::Conv3dGrad(dev_ctx_cpu, + x_tensor, + rulebook, + kernel_tensor, + out, + paddings, + dilations, + strides, + 1); + f_verify(grads[0].data(), features_grad); + f_verify(grads[1].data(), kernel_grad); } } } @@ -141,7 +164,11 @@ void TestConv3d(const std::vector& indices, const int non_zero_num, const std::vector& paddings, const std::vector& strides, - const std::vector& dilations) { + const std::vector& dilations, + const float diff = 1e-3, + const bool backward = false, + const std::vector features_grad = {}, + const std::vector kernel_grad = {}) { // test float TestConv3dBase(indices, features, @@ -154,7 +181,11 @@ void TestConv3d(const std::vector& indices, non_zero_num, paddings, strides, - dilations); + dilations, + diff, + backward, + features_grad, + kernel_grad); // test double TestConv3dBase(indices, cast(features), @@ -167,7 +198,11 @@ void TestConv3d(const std::vector& indices, 
non_zero_num, paddings, strides, - dilations); + dilations, + diff, + backward, + cast(features_grad), + cast(kernel_grad)); } TEST(DEV_API, sparse_conv3d) { @@ -467,5 +502,66 @@ TEST(DEV_API, sparse_conv2d) { dilations); } +TEST(DEV_API, sparse_conv3d_backward) { + const int in_channels = 1; + const int out_channels = 1; + DDim x_dims = {1, 4, 4, 4, in_channels}; + DDim kernel_dims = {3, 3, 3, in_channels, out_channels}; + DDim out_dims = {1, 2, 2, 2, out_channels}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 2; + std::vector indices_flatten = {0, 0, 0, 2, 3, 2, 3, 2}; + + std::vector features = {-0.28833008, 0.0287323}; + // 3*3*3=27 + std::vector kernel = { + 0.64306641, 0.45043945, 0.47216797, 0.22924805, 0.97509766, 0.86181641, + 0.57861328, 0.91796875, 0.87255859, 0.16589355, 0.44555664, 0.01889038, + 0.46459961, 0.44726562, 0.19909668, 0.89697266, 0.37158203, 0.00513077, + 0.69628906, 0.26904297, 0.74707031, 0.54003906, 0.5390625, 0.07958984, + 0.47338867, 0.90966797, 0.17126465}; + + std::vector out_indices_flatten = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, + 1, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + + std::vector out_features = {4.9200e-03, + 2.6140e-02, + 2.2900e-03, + -2.3596e-01, + 1.5000e-04, + 1.0670e-02, + 5.7200e-03, + 1.2850e-02}; + + std::vector features_grad = {-0.20593, -0.09149}; + std::vector kernel_grad = { + 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, + 0.000e+00, 0.000e+00, 6.805e-02, 0.000e+00, 0.000e+00, 0.000e+00, + 0.000e+00, 3.700e-04, 1.600e-04, 0.000e+00, 3.100e-04, 0.000e+00, + 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, -6.780e-03, 7.000e-05, + 0.000e+00, 7.500e-04, 1.400e-04}; + + TestConv3d(indices_flatten, + features, + x_dims, + kernel, + kernel_dims, + out_indices_flatten, + out_features, + out_dims, + non_zero_num, + paddings, + strides, + dilations, + 1e-3, + true, + features_grad, + kernel_grad); +} + } // namespace tests } // namespace phi -- GitLab From 2a5590a18e3dd90f815f20a82f6dcc722bc17892 Mon Sep 17 00:00:00 2001 From: From00 Date: Wed, 2 Mar 2022 16:55:19 +0800 Subject: [PATCH 060/272] Move BroadcastTensors OP to phi (#40047) * Move BroadcastTensors OP to phi * Remove mutable_data in impl * Move BilinearTensorProductInferMeta to multiary.h/cc --- .../fluid/operators/broadcast_tensors_op.cc | 99 +----- .../fluid/operators/broadcast_tensors_op.cu | 122 -------- paddle/fluid/operators/broadcast_tensors_op.h | 282 ------------------ paddle/phi/infermeta/multiary.cc | 66 +++- paddle/phi/infermeta/multiary.h | 5 + .../kernels/broadcast_tensors_grad_kernel.h | 27 ++ paddle/phi/kernels/broadcast_tensors_kernel.h | 27 ++ paddle/phi/kernels/complex_grad_kernel.h | 2 +- paddle/phi/kernels/complex_kernel.h | 14 +- .../cpu/broadcast_tensors_grad_kernel.cc | 201 +++++++++++++ .../kernels/cpu/broadcast_tensors_kernel.cc | 30 ++ .../gpu/broadcast_tensors_grad_kernel.cu | 111 +++++++ .../kernels/gpu/broadcast_tensors_kernel.cu | 30 ++ .../impl/broadcast_tensors_kernel_impl.h | 118 ++++++++ .../phi/ops/compat/broadcast_tensors_sig.cc | 28 ++ 15 files changed, 658 insertions(+), 504 deletions(-) delete mode 100644 paddle/fluid/operators/broadcast_tensors_op.cu delete mode 100644 paddle/fluid/operators/broadcast_tensors_op.h create mode 100644 paddle/phi/kernels/broadcast_tensors_grad_kernel.h create mode 100644 paddle/phi/kernels/broadcast_tensors_kernel.h create mode 100644 
paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc create mode 100644 paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu create mode 100644 paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h create mode 100644 paddle/phi/ops/compat/broadcast_tensors_sig.cc diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc index 27b1107675d..c3917fad555 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cc +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -12,15 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/broadcast_tensors_op.h" - -#include -#include -#include -#include -#include - +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_type_inference.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -31,64 +27,6 @@ class BroadcastTensorsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors"); - OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", - "broadcast_tensors"); - - int target_rank = 0; - const auto& input_dims = ctx->GetInputsDim("X"); - - // 1. Find Output rank = max(Inputs rank) - for (const auto& input_ddim : input_dims) { - target_rank = std::max(target_rank, input_ddim.size()); - } - - PADDLE_ENFORCE_GT( - target_rank, 0, - platform::errors::InvalidArgument( - "BroadcastTensorsOp requires at least one input tensor" - "to have rank greater than zero")); - - std::vector target_dims(target_rank, 0); - // 2. Output dim(axis=x) = max(Inputs dim(axis=x)) - for (int index = 0; index < target_rank; index++) { - // Loop axes in reverse order, - // For each axis, take the maximum as target size - // Fill size = 1 if shape vector exhausts - int target_dim_size = 1; - for (const auto& input_ddim : input_dims) { - // Reversed order - int axis = static_cast(input_ddim.size()) - index - 1; - int dim_size = 1; - if (axis >= 0) { - dim_size = input_ddim[axis]; - } - - if (target_dim_size != 1 && dim_size != 1 && - target_dim_size != dim_size) { - PADDLE_THROW(platform::errors::InvalidArgument( - "BroadcastTensorsOp inputs does not satisfy bcast semantics," - "Please check axis = %d in reverse order", - index)); - } - - // We performed bcast semantics check at python level - // So input tensors should all have legal shape - target_dim_size = std::max(target_dim_size, dim_size); - } - target_dims[target_rank - index - 1] = target_dim_size; - } - - // 3. 
Set Output Dim - std::vector output_ddims; - for (size_t i = 0; i < input_dims.size(); i++) { - output_ddims.emplace_back(phi::make_ddim(target_dims)); - } - ctx->SetOutputsDim("Out", output_ddims); - ctx->ShareAllLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -229,34 +167,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; +DELCARE_INFER_SHAPE_FUNCTOR(broadcast_tensors, + BroadcastTensorsInferShapeFunctor, + PT_INFER_META(phi::BroadcastTensorsInferMeta)); + REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp, ops::BroadcastTensorsOpMaker, ops::BroadcastTensorsGradOpMaker, ops::BroadcastTensorsGradOpMaker, - ops::BroadcastTensorsOpVarTypeInference); + ops::BroadcastTensorsOpVarTypeInference, + BroadcastTensorsInferShapeFunctor); REGISTER_OPERATOR(broadcast_tensors_grad, ops::BroadcastTensorsGradOp, ops::BroadcastTensorsGradOpVarTypeInference, ops::BroadcastTensorsGradNoNeedBufVarsInferer); - -REGISTER_OP_CPU_KERNEL( - broadcast_tensors, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel); - -REGISTER_OP_CPU_KERNEL( - broadcast_tensors_grad, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel); diff --git a/paddle/fluid/operators/broadcast_tensors_op.cu b/paddle/fluid/operators/broadcast_tensors_op.cu deleted file mode 100644 index 5882258317d..00000000000 --- a/paddle/fluid/operators/broadcast_tensors_op.cu +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/broadcast_tensors_op.h" - -#include -#include -#include -#include -#include - -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" - -namespace paddle { -namespace operators { - -using framework::Tensor; -using framework::DDim; - -template -class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // Find reduce dimensions - const auto& in_tensors = - context.MultiInput(framework::GradVarName("Out")); - auto out_tensors = context.MultiOutput(framework::GradVarName("X")); - - size_t num_ins = in_tensors.size(); - - PADDLE_ENFORCE_GT( - num_ins, 1, - platform::errors::InvalidArgument( - "Expected at least 2 input tensors, but only received d%.", - in_tensors.size())); - - PADDLE_ENFORCE_EQ( - num_ins, out_tensors.size(), - platform::errors::InvalidArgument( - "BroadcastTensorsOp expects equal number of inputs and outputs," - "but received: %d inputs v.s %d outputs", - num_ins, out_tensors.size())); - - // For each In-Out tensor pair, - // Prepare and apply broadcast dims array - for (size_t i = 0; i < num_ins; i++) { - auto* input_tensor = in_tensors[i]; - auto* output_tensor = out_tensors[i]; - - const DDim& input_dims = input_tensor->dims(); - const DDim& output_dims = output_tensor->dims(); - - int in_rank = input_dims.size(); - int out_rank = output_dims.size(); - - // Collect reduce_dims - // Example: - // dX = [1,1,1,1] - // dOut = [1,1,1,4] - // - // reduce_dims = [3] // reduce along the broadcasted axis - std::vector reduce_dims_vec; - for (int j = 0; j < in_rank; j++) { - int out_axis = out_rank - j - 1; - int in_axis = in_rank - j - 1; - - if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { - reduce_dims_vec.push_back(in_axis); - } - } - - bool just_copy = (reduce_dims_vec.size() == 0); - output_tensor->mutable_data(context.GetPlace()); - if (just_copy) { - // Turns out to be a No-Op, simply copy tensors - framework::TensorCopy(*input_tensor, context.GetPlace(), - context.device_context(), output_tensor); - } else { - // reduce_sum implementation on CUDA - auto stream = context.cuda_device_context().stream(); - TensorReduceImpl>( - context.cuda_device_context(), *input_tensor, output_tensor, - kps::IdentityFunctor(), reduce_dims_vec, stream); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - broadcast_tensors, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel); - -REGISTER_OP_CUDA_KERNEL(broadcast_tensors_grad, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel); diff --git a/paddle/fluid/operators/broadcast_tensors_op.h b/paddle/fluid/operators/broadcast_tensors_op.h deleted file mode 100644 index 682f2e24769..00000000000 --- a/paddle/fluid/operators/broadcast_tensors_op.h +++ /dev/null @@ -1,282 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#define SWITCH_OUT_RANK_CASE(n) \ - case n: { \ - ApplyBroadcast(context, in_tensors[i], out_tensors[i]); \ - break; \ - } - -namespace paddle { -namespace operators { - -using framework::Tensor; -using framework::DDim; -using framework::EigenTensor; - -template -class BroadcastTensorsOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto& in_tensors = context.MultiInput("X"); - auto out_tensors = context.MultiOutput("Out"); - - size_t num_ins = in_tensors.size(); - - PADDLE_ENFORCE_GT( - num_ins, 1, - platform::errors::InvalidArgument( - "Expected at least 2 input tensors, but only received d%.", - in_tensors.size())); - - PADDLE_ENFORCE_EQ( - num_ins, out_tensors.size(), - platform::errors::InvalidArgument( - "BroadcastTensorsOp expects equal number of inputs and outputs," - "but received: %d inputs v.s %d outputs", - num_ins, out_tensors.size())); - - // Eigen has no support for dynamic ranked tensor - // Thus we perform static expansion for each possible ranks - for (size_t i = 0; i < num_ins; i++) { - int out_rank = out_tensors[i]->dims().size(); - switch (out_rank) { - SWITCH_OUT_RANK_CASE(1) - SWITCH_OUT_RANK_CASE(2) - SWITCH_OUT_RANK_CASE(3) - SWITCH_OUT_RANK_CASE(4) - SWITCH_OUT_RANK_CASE(5) - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Target tensor rank out of range" - "Maximum supported rank for broadcast is: 5")); - } - } - } - } - - template - void ApplyBroadcast(const framework::ExecutionContext& context, - const Tensor* input_tensor, Tensor* output_tensor) const { - const auto& input_dims = input_tensor->dims(); - const auto& output_dims = output_tensor->dims(); - - int in_rank = input_dims.size(); - int out_rank = output_dims.size(); - - // 1. Collect bcast_dims, each element of which indicates how many - // times we need to replicate along the corresponding dimension - // 2. Collect new_input_dims_vec. 
Eigen::broadcast requires same rank for - // both input and output tensors, so we need to initialize input X with - // expanded dims: "new_input_dims_vec" - Eigen::DSizes bcast_dims; - std::vector new_input_dims_vec(out_rank); - for (int j = 0; j < out_rank; j++) { - int out_axis = out_rank - j - 1; - int in_axis = in_rank - j - 1; - - bcast_dims[out_axis] = output_dims[out_axis]; - new_input_dims_vec[out_axis] = 1; - if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) { - bcast_dims[out_axis] = 1; - new_input_dims_vec[out_axis] = input_dims[in_axis]; - } - } - auto new_input_dims = phi::make_ddim(new_input_dims_vec); - - // Initialize input X with new_input_dims_vec, so it's rank-aligned with the - // output - auto x = EigenTensor::From(*input_tensor, new_input_dims); - - output_tensor->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*output_tensor, output_dims); - - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcast, T, OutRank>::Eval(place, y, x, - bcast_dims); - } -}; - -#define SWITCH_RESHAPE_DIMS(n) \ - case n: { \ - Eigen::DSizes reshape_dims; \ - for (size_t i = 0; i < reshape_dims_vec.size(); ++i) { \ - reshape_dims[i] = reshape_dims_vec[i]; \ - } \ - dX.device(place) = \ - dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \ - break; \ - } - -#define UPPER_SWITCH_REDUCE_DIMS(m) \ - case m: { \ - Eigen::DSizes reduce_dims; \ - for (size_t i = 0; i < reduce_dims_vec.size(); ++i) { \ - reduce_dims[i] = reduce_dims_vec[i]; \ - } \ - switch (reshape_size) { -#define LOWER_SWITCH_REDUCE_DIMS \ - default: { \ - PADDLE_THROW(platform::errors::InvalidArgument( \ - "Detected reshape size: %d out of range" \ - "Minimum value should be larger than reduce size %d" \ - "While maximum supported is: 5", \ - reshape_size, reduce_size)); \ - } \ - } \ - break; \ - } - -/* ----- GradOpKernel ----- */ -template -class BroadcastTensorsGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // Find reduce dimensions - const auto& in_tensors = - context.MultiInput(framework::GradVarName("Out")); - auto out_tensors = context.MultiOutput(framework::GradVarName("X")); - - size_t num_ins = in_tensors.size(); - - PADDLE_ENFORCE_GT( - num_ins, 1, - platform::errors::InvalidArgument( - "Expected at least 2 input tensors, but only received d%.", - in_tensors.size())); - - PADDLE_ENFORCE_EQ( - num_ins, out_tensors.size(), - platform::errors::InvalidArgument( - "BroadcastTensorsOp expects equal number of inputs and outputs," - "but received: %d inputs v.s %d outputs", - num_ins, out_tensors.size())); - - // For each In-Out tensor pair, - // Prepare and apply broadcast dims array - for (size_t i = 0; i < num_ins; i++) { - const auto* input_tensor = in_tensors[i]; - auto* output_tensor = out_tensors[i]; - - const auto& input_dims = input_tensor->dims(); - const auto& output_dims = output_tensor->dims(); - - int in_rank = input_dims.size(); - int out_rank = output_dims.size(); - - // BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes - // Here we perform the following Eigen operations: - // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> - // reshape(dX_shape) -> dX - // Note the last "reshape(dX_shape)" will be performed implicitly, - // and we only need to collect reduce_dims and reshape_dims - std::vector reduce_dims_vec; - std::vector reshape_dims_vec; - for (int j = 0; j < in_rank; j++) { - int out_axis = out_rank - j - 
1; - int in_axis = in_rank - j - 1; - - reshape_dims_vec.push_back(input_dims[j]); - if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { - reduce_dims_vec.push_back(in_axis); - } - } - - size_t reduce_size = reduce_dims_vec.size(); - size_t reshape_size = reshape_dims_vec.size(); - bool just_copy = (reduce_dims_vec.size() == 0); - output_tensor->mutable_data(context.GetPlace()); - if (just_copy) { - // If this turns out to be a No-Op, simply perform a tensor copy - framework::TensorCopy(*input_tensor, context.GetPlace(), - context.device_context(), output_tensor); - } else { - PADDLE_ENFORCE_GE(reduce_dims_vec.size(), 1, - platform::errors::InvalidArgument( - "The number of dimensions of the input " - "'Out@GRAD' for Op(broadcast_tensors)" - " must be greater than or equal to 1, but " - "the value received is %d.", - reduce_dims_vec.size())); - PADDLE_ENFORCE_LE( - reduce_dims_vec.size(), 5, - platform::errors::InvalidArgument( - "The number of dimensions of the input 'Out@GRAD' " - "for Op(broadcast_tensors) must be less than or equal " - "to 5, but the value received is %d.", - reduce_dims_vec.size())); - - // Overall: - // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> - // reshape(dX_shape) -> dX - auto dX = framework::EigenVector::Flatten(*output_tensor); - auto dOut = framework::EigenVector::Flatten(*input_tensor); - auto& place = - *context.template device_context().eigen_device(); - - // Expand ReduceSize and ReshapeSize into static values - switch (reduce_size) { - UPPER_SWITCH_REDUCE_DIMS(1) - SWITCH_RESHAPE_DIMS(1) - SWITCH_RESHAPE_DIMS(2) - SWITCH_RESHAPE_DIMS(3) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(2) - SWITCH_RESHAPE_DIMS(2) - SWITCH_RESHAPE_DIMS(3) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(3) - SWITCH_RESHAPE_DIMS(3) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(4) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(5) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Detected reduce size: %d out of range" - "While maximum supported is: 5", - reduce_size)); - } - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 7634e5e01ac..dc5478e8afb 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -13,11 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/infermeta/multiary.h" - +#include #include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" namespace phi { +std::vector GetMetaTensorsDim(const std::vector& tensors) { + std::vector dims; + dims.reserve(tensors.size()); + for (const MetaTensor* tensor : tensors) { + dims.emplace_back(tensor->dims()); + } + return dims; +} + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, @@ -84,6 +94,60 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void BroadcastTensorsInferMeta(const std::vector& x, + std::vector out) { + int target_rank = 0; + const auto& input_dims = GetMetaTensorsDim(x); + + // 1. 
Find Output rank = max(Inputs rank) + for (const auto& input_ddim : input_dims) { + target_rank = std::max(target_rank, input_ddim.size()); + } + + PADDLE_ENFORCE_GT(target_rank, + 0, + errors::InvalidArgument("BroadcastTensorsOp requires at " + "least one input tensor to have " + "rank greater than zero")); + + std::vector target_dims(target_rank, 0); + // 2. Output dim(axis=x) = max(Inputs dim(axis=x)) + for (int index = 0; index < target_rank; index++) { + // Loop axes in reverse order, + // For each axis, take the maximum as target size + // Fill size = 1 if shape vector exhausts + int target_dim_size = 1; + for (const auto& input_ddim : input_dims) { + // Reversed order + int axis = static_cast(input_ddim.size()) - index - 1; + int dim_size = 1; + if (axis >= 0) { + dim_size = input_ddim[axis]; + } + + if (target_dim_size != 1 && dim_size != 1 && + target_dim_size != dim_size) { + PADDLE_THROW(errors::InvalidArgument( + "BroadcastTensorsOp inputs does not satisfy bcast semantics, " + "please check axis = %d in reverse order", + index)); + } + + // We performed bcast semantics check at python level + // So input tensors should all have legal shape + target_dim_size = std::max(target_dim_size, dim_size); + } + target_dims[target_rank - index - 1] = target_dim_size; + } + + // 3. Set Output Dim + for (size_t i = 0; i < out.size(); i++) { + out[i]->set_dims(phi::make_ddim(target_dims)); + out[i]->share_lod(*(x[i])); + out[i]->set_dtype(x[i]->dtype()); + } +} + void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 2afb79daa35..51738c5e08e 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -18,6 +18,8 @@ limitations under the License. */ #include "paddle/phi/core/meta_tensor.h" namespace phi { +std::vector GetMetaTensorsDim(const std::vector& tensors); + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, @@ -25,6 +27,9 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void BroadcastTensorsInferMeta(const std::vector& x, + std::vector out); + void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, diff --git a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h new file mode 100644 index 00000000000..5ec2e35cc9b --- /dev/null +++ b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
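+//
+// BroadcastTensorsGradKernel maps each incoming dout back onto the shape of
+// the corresponding forward input by summing over every axis that was
+// broadcast in the forward pass (the CPU/GPU kernels added in this patch
+// implement this as a reduce_sum over those axes).
+//
+// Shape sketch (sizes chosen purely for illustration):
+//   forward:  x[i] dims [1, 1, 4]    ->  out[i] dims [2, 3, 4]
+//   backward: dout[i] dims [2, 3, 4] ->  dx[i] dims [1, 1, 4],
+//             i.e. dx[i] sums dout[i] over the broadcast axes 0 and 1.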
+ +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BroadcastTensorsGradKernel(const Context& ctx, + const std::vector& dout, + std::vector dx); + +} // namespace phi diff --git a/paddle/phi/kernels/broadcast_tensors_kernel.h b/paddle/phi/kernels/broadcast_tensors_kernel.h new file mode 100644 index 00000000000..fb2a6f1136c --- /dev/null +++ b/paddle/phi/kernels/broadcast_tensors_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BroadcastTensorsKernel(const Context& ctx, + const std::vector& x, + std::vector out); + +} // namespace phi diff --git a/paddle/phi/kernels/complex_grad_kernel.h b/paddle/phi/kernels/complex_grad_kernel.h index 505d4d37442..be13e2826ea 100644 --- a/paddle/phi/kernels/complex_grad_kernel.h +++ b/paddle/phi/kernels/complex_grad_kernel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h index 44bfae9820a..3b3003392d3 100644 --- a/paddle/phi/kernels/complex_kernel.h +++ b/paddle/phi/kernels/complex_kernel.h @@ -50,14 +50,10 @@ DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { return x; } -template -void RealKernel(const DeviceContext& dev_ctx, - const DenseTensor& x, - DenseTensor* out); - -template -void ImagKernel(const DeviceContext& dev_ctx, - const DenseTensor& x, - DenseTensor* out); +template +void RealKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); + +template +void ImagKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc new file mode 100644 index 00000000000..7a97f8c2189 --- /dev/null +++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc @@ -0,0 +1,201 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
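+//
+// The CPU implementation below expresses the backward pass as
+//   dOut(flattened) -> reshape(reshape_dims) -> sum(reduce_dims) -> dX,
+// and the SWITCH_RESHAPE_DIMS / *_SWITCH_REDUCE_DIMS macros merely expand
+// that expression for each static rank combination Eigen requires.
+//
+// Worked example (sizes chosen purely for illustration):
+//   dout dims = [2, 3, 4], dx dims = [1, 4]
+//   -> reshape_dims_vec = [2, 3, 4], reduce_dims_vec = [1, 0]
+//   -> dx = dout.reshape([2, 3, 4]) summed over axes 0 and 1, written back
+//      into dx's shape [1, 4]; this instantiates the reduce_size == 2 /
+//      reshape_size == 3 branch of the switch below.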
+ +#include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h" + +#include +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#define SWITCH_RESHAPE_DIMS(n) \ + case n: { \ + Eigen::DSizes reshape_dims; \ + for (size_t i = 0; i < reshape_dims_vec.size(); ++i) { \ + reshape_dims[i] = reshape_dims_vec[i]; \ + } \ + dX.device(place) = \ + dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \ + break; \ + } + +#define UPPER_SWITCH_REDUCE_DIMS(m) \ + case m: { \ + Eigen::DSizes reduce_dims; \ + for (size_t i = 0; i < reduce_dims_vec.size(); ++i) { \ + reduce_dims[i] = reduce_dims_vec[i]; \ + } \ + switch (reshape_size) { +#define LOWER_SWITCH_REDUCE_DIMS \ + default: { \ + PADDLE_THROW(errors::InvalidArgument( \ + "Detected reshape size: %d out of range" \ + "Minimum value should be larger than reduce size %d" \ + "While maximum supported is: 5", \ + reshape_size, \ + reduce_size)); \ + } \ + } \ + break; \ + } + +namespace phi { + +template +void BroadcastTensorsGradKernel(const Context& ctx, + const std::vector& dout, + std::vector dx) { + // Find reduce dimensions + const auto& in_tensors = dout; + auto& out_tensors = dx; + + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, + 1, + errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ(num_ins, + out_tensors.size(), + errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and " + "outputs, but received: %d inputs v.s %d outputs", + num_ins, + out_tensors.size())); + + // For each In-Out tensor pair, + // Prepare and apply broadcast dims array + for (size_t i = 0; i < num_ins; i++) { + const auto* input_tensor = &in_tensors[i]; + auto* output_tensor = out_tensors[i]; + + const auto& input_dims = input_tensor->dims(); + const auto& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes + // Here we perform the following Eigen operations: + // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> + // reshape(dX_shape) -> dX + // Note the last "reshape(dX_shape)" will be performed implicitly, + // and we only need to collect reduce_dims and reshape_dims + std::vector reduce_dims_vec; + std::vector reshape_dims_vec; + for (int j = 0; j < in_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + reshape_dims_vec.push_back(input_dims[j]); + if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { + reduce_dims_vec.push_back(in_axis); + } + } + + size_t reduce_size = reduce_dims_vec.size(); + size_t reshape_size = reshape_dims_vec.size(); + bool just_copy = (reduce_dims_vec.size() == 0); + ctx.template Alloc(output_tensor); + if (just_copy) { + // If this turns out to be a No-Op, simply perform a tensor copy + paddle::framework::TensorCopy( + *input_tensor, ctx.GetPlace(), ctx, output_tensor); + } else { + PADDLE_ENFORCE_GE( + reduce_dims_vec.size(), + 1, + errors::InvalidArgument("The number of dimensions of the input " + "'Out@GRAD' for Op(broadcast_tensors)" + " must be greater than or equal to 1, but 
" + "the value received is %d.", + reduce_dims_vec.size())); + PADDLE_ENFORCE_LE( + reduce_dims_vec.size(), + 5, + errors::InvalidArgument( + "The number of dimensions of the input 'Out@GRAD' " + "for Op(broadcast_tensors) must be less than or equal " + "to 5, but the value received is %d.", + reduce_dims_vec.size())); + + // Overall: + // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> + // reshape(dX_shape) -> dX + auto dX = EigenVector::Flatten(*output_tensor); + auto dOut = EigenVector::Flatten(*input_tensor); + auto& place = *ctx.eigen_device(); + + // Expand ReduceSize and ReshapeSize into static values + switch (reduce_size) { + UPPER_SWITCH_REDUCE_DIMS(1) + SWITCH_RESHAPE_DIMS(1) + SWITCH_RESHAPE_DIMS(2) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(2) + SWITCH_RESHAPE_DIMS(2) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(3) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(4) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(5) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + default: { + PADDLE_THROW( + errors::InvalidArgument("Detected reduce size: %d out of range" + "While maximum supported is: 5", + reduce_size)); + } + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(broadcast_tensors_grad, + CPU, + ALL_LAYOUT, + phi::BroadcastTensorsGradKernel, + int, + int64_t, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc new file mode 100644 index 00000000000..4cb6db87692 --- /dev/null +++ b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/broadcast_tensors_kernel.h" +#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h" + +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(broadcast_tensors, + CPU, + ALL_LAYOUT, + phi::BroadcastTensorsKernel, + bool, + int, + int64_t, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu new file mode 100644 index 00000000000..6fb24d72145 --- /dev/null +++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -0,0 +1,111 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h" + +#include +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" + +namespace phi { + +template +void BroadcastTensorsGradKernel(const Context& ctx, + const std::vector& dout, + std::vector dx) { + // Find reduce dimensions + const auto& in_tensors = dout; + auto& out_tensors = dx; + + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, + 1, + errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ( + num_ins, + out_tensors.size(), + errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and outputs," + "but received: %d inputs v.s %d outputs", + num_ins, + out_tensors.size())); + + // For each In-Out tensor pair, + // Prepare and apply broadcast dims array + for (size_t i = 0; i < num_ins; i++) { + auto* input_tensor = &in_tensors[i]; + auto* output_tensor = out_tensors[i]; + + const DDim& input_dims = input_tensor->dims(); + const DDim& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // Collect reduce_dims + // Example: + // dX = [1,1,1,1] + // dOut = [1,1,1,4] + // + // reduce_dims = [3] // reduce along the broadcasted axis + std::vector reduce_dims_vec; + for (int j = 0; j < in_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { + reduce_dims_vec.push_back(in_axis); + } + } + + bool just_copy = (reduce_dims_vec.size() == 0); + ctx.template Alloc(output_tensor); + if (just_copy) { + // Turns out to be a No-Op, simply copy tensors + paddle::framework::TensorCopy( + *input_tensor, ctx.GetPlace(), ctx, output_tensor); + } else { + // reduce_sum implementation on CUDA + kernels::TensorReduceImpl>( + ctx, + *input_tensor, + output_tensor, + kps::IdentityFunctor(), + reduce_dims_vec, + ctx.stream()); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(broadcast_tensors_grad, + GPU, + ALL_LAYOUT, + phi::BroadcastTensorsGradKernel, + int, + int64_t, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu new file mode 100644 index 00000000000..aa45bd3c438 --- /dev/null +++ b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/broadcast_tensors_kernel.h" +#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h" + +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(broadcast_tensors, + GPU, + ALL_LAYOUT, + phi::BroadcastTensorsKernel, + bool, + int, + int64_t, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h new file mode 100644 index 00000000000..eb01b83377c --- /dev/null +++ b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h @@ -0,0 +1,118 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/broadcast_tensors_kernel.h" + +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#define SWITCH_OUT_RANK_CASE(n) \ + case n: { \ + ApplyBroadcast(ctx, &in_tensors[i], out_tensors[i]); \ + break; \ + } + +namespace phi { + +template +void ApplyBroadcast(const Context& ctx, + const DenseTensor* input_tensor, + DenseTensor* output_tensor) { + const auto& input_dims = input_tensor->dims(); + const auto& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // 1. Collect bcast_dims, each element of which indicates how many + // times we need to replicate along the corresponding dimension + // 2. Collect new_input_dims_vec. 
Eigen::broadcast requires same rank for + // both input and output tensors, so we need to initialize input X with + // expanded dims: "new_input_dims_vec" + Eigen::DSizes bcast_dims; + std::vector new_input_dims_vec(out_rank); + for (int j = 0; j < out_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + bcast_dims[out_axis] = output_dims[out_axis]; + new_input_dims_vec[out_axis] = 1; + if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) { + bcast_dims[out_axis] = 1; + new_input_dims_vec[out_axis] = input_dims[in_axis]; + } + } + auto new_input_dims = phi::make_ddim(new_input_dims_vec); + + // Initialize input X with new_input_dims_vec, so it's rank-aligned with the + // output + auto x = EigenTensor::From(*input_tensor, new_input_dims); + + ctx.template Alloc(output_tensor); + auto y = EigenTensor::From(*output_tensor, output_dims); + + auto& place = *ctx.eigen_device(); + funcs::EigenBroadcast, T, OutRank>::Eval( + place, y, x, bcast_dims); +} + +template +void BroadcastTensorsKernel(const Context& ctx, + const std::vector& x, + std::vector out) { + const auto& in_tensors = x; + auto out_tensors = out; + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, + 1, + errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ(num_ins, + out_tensors.size(), + errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and " + "outputs,but received: %d inputs v.s %d outputs", + num_ins, + out_tensors.size())); + + // Eigen has no support for dynamic ranked tensor + // Thus we perform static expansion for each possible ranks + for (size_t i = 0; i < num_ins; i++) { + int out_rank = out_tensors[i]->dims().size(); + switch (out_rank) { + SWITCH_OUT_RANK_CASE(1) + SWITCH_OUT_RANK_CASE(2) + SWITCH_OUT_RANK_CASE(3) + SWITCH_OUT_RANK_CASE(4) + SWITCH_OUT_RANK_CASE(5) + default: { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Target tensor rank out of range" + "Maximum supported rank for broadcast is: 5")); + } + } + } +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/broadcast_tensors_sig.cc b/paddle/phi/ops/compat/broadcast_tensors_sig.cc new file mode 100644 index 00000000000..2c979c4aedc --- /dev/null +++ b/paddle/phi/ops/compat/broadcast_tensors_sig.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
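+//
+// Only the grad op needs a custom signature in this file: it binds the fluid
+// argument names Out@GRAD and X@GRAD to the phi kernel's dout and dx
+// parameters, i.e.
+//   broadcast_tensors_grad(Out@GRAD) -> phi broadcast_tensors_grad(dout),
+//   with the result written to X@GRAD (dx).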
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature BroadcastTensorsGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "broadcast_tensors_grad", {GradVarName("Out")}, {}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(broadcast_tensors_grad, + phi::BroadcastTensorsGradOpArgumentMapping); -- GitLab From 7a857924570084851be8b6094f181f217d58fb7c Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Wed, 2 Mar 2022 17:18:53 +0800 Subject: [PATCH 061/272] Move transpose to pten (#39327) * immigrate_transpose_to_pten cpu kernel only; test=develop * fix bug; test=develop * add transpose cuda api * bug fix; * fix bugs * fix bugs; test=develop * bug fix; * move transepose to pten; test=develop * fix bug; test=develop * fix bugs; test=develop * add transpose grad fp16 support; test=develop * fix bug; test=develop * fix npu bug; test=develop * fix nemul = 0 bug; test=develop * add fp16 support; test=develop * fix data type register bug; test=develop * fix transpose bug; test=develop * update transpose * fix transpose bug; test=develop * remove useless code; test=develop * remove useless code; test=develop * fix transpose alias bug; test=develop * polish code; test=develop * resolve confict; test=develop * resolve confilct; test=develop * recover prepared operator; test=develop * fix bug; test=develop * polish code; test=develop * fix bug; test=develop * fix bug; test=develop --- .../operators/mkldnn/test_mkldnn_op_nhwc.cc | 2 +- paddle/fluid/operators/transpose_op.cc | 60 ++------ paddle/fluid/operators/transpose_op.cu | 139 ------------------ paddle/fluid/operators/transpose_op.cu.h | 42 +++--- paddle/fluid/operators/transpose_op.h | 58 -------- .../fluid/operators/transpose_op_npu_test.cc | 2 +- .../phi/kernels/cpu/transpose_grad_kernel.cc | 32 ++++ paddle/phi/kernels/cpu/transpose_kernel.cc | 80 ++++++++++ paddle/phi/kernels/funcs/math_function.cu | 51 +++++++ .../phi/kernels/gpu/transpose_grad_kernel.cu | 34 +++++ paddle/phi/kernels/gpu/transpose_kernel.cu | 57 +++++++ .../kernels/impl/transpose_grad_kernel_impl.h | 38 +++++ paddle/phi/kernels/transpose_grad_kernel.h | 28 ++++ paddle/phi/kernels/transpose_kernel.h | 28 ++++ paddle/phi/ops/compat/transpose_sig.cc | 38 +++++ .../unittests/parallel_executor_test_base.py | 2 +- ..._imperative_lod_tensor_to_selected_rows.py | 1 + .../test_parallel_executor_transformer.py | 1 + ...test_partial_eager_deletion_transformer.py | 2 + .../tests/unittests/test_transpose_op.py | 1 + 20 files changed, 426 insertions(+), 270 deletions(-) delete mode 100644 paddle/fluid/operators/transpose_op.cu create mode 100644 paddle/phi/kernels/cpu/transpose_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/transpose_kernel.cc create mode 100644 paddle/phi/kernels/gpu/transpose_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/transpose_kernel.cu create mode 100644 paddle/phi/kernels/impl/transpose_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/transpose_grad_kernel.h create mode 100644 paddle/phi/kernels/transpose_kernel.h create mode 100644 paddle/phi/ops/compat/transpose_sig.cc diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 52e2caaeb6e..3791fed23a8 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -29,7 +29,7 @@ USE_OP(pool2d); USE_OP_DEVICE_KERNEL(pool2d, 
MKLDNN); USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); -USE_OP(transpose); +USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); namespace paddle { diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 768ab21936f..1a297e7238c 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -339,6 +339,14 @@ class Transpose2OpGrad : public framework::OperatorWithKernel { } }; +class TransposeGradInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + ctx->SyncTypeAndDataType(framework::GradVarName("Out"), + framework::GradVarName("X")); + } +}; + } // namespace operators } // namespace paddle @@ -347,59 +355,13 @@ REGISTER_OPERATOR( transpose, ops::TransposeOp, ops::TransposeOpMaker, paddle::framework::DefaultGradOpMaker, paddle::framework::DefaultGradOpMaker); -REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad); - -REGISTER_OP_CPU_KERNEL( - transpose, ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel>, - ops::TransposeKernel>, - ops::TransposeKernel); -REGISTER_OP_CPU_KERNEL( - transpose_grad, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel>, - ops::TransposeGradKernel>, - ops::TransposeGradKernel); +REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad, + ops::TransposeGradInferVarType); REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker, ops::Transpose2GradMaker, ops::Transpose2GradMaker); REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad, + ops::TransposeGradInferVarType, ops::Transpose2DoubleGradMaker, ops::Transpose2DoubleGradMaker); - -REGISTER_OP_CPU_KERNEL( - transpose2, ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel>, - ops::TransposeKernel>, - ops::TransposeKernel); -REGISTER_OP_CPU_KERNEL( - transpose2_grad, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel>, - ops::TransposeGradKernel>, - ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/transpose_op.cu b/paddle/fluid/operators/transpose_op.cu deleted file mode 100644 index 02e224549a5..00000000000 --- a/paddle/fluid/operators/transpose_op.cu +++ /dev/null @@ -1,139 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/transpose_op.cu.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -class TransposeGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.InputVar("X"); - auto* out = context.OutputVar("Out"); - - const framework::Tensor* x_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*x); - framework::Tensor* out_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(out); - - out_tensor->mutable_data(context.GetPlace()); - if (out_tensor->numel() == 0) { - return; - } - - std::vector axis = context.Attr>("axis"); - int ndims = axis.size(); - const auto& dev_ctx = context.template device_context(); - TransposeGPUKernelDriver(dev_ctx, ndims, *x_tensor, axis, out_tensor); - } -}; -template -class TransposeGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out_grad = context.InputVar(framework::GradVarName("Out")); - auto* x_grad = context.OutputVar(framework::GradVarName("X")); - if (!x_grad) { - return; - } - - const framework::Tensor* out_grad_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*out_grad); - framework::Tensor* x_grad_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(x_grad); - - x_grad_tensor->mutable_data(context.GetPlace()); - if (x_grad_tensor->numel() == 0) { - return; - } - std::vector axis = context.Attr>("axis"); - std::vector reversed_axis(axis); - - for (size_t i = 0; i < axis.size(); i++) { - reversed_axis[axis[i]] = i; - } - - int ndims = axis.size(); - const auto& dev_ctx = context.template device_context(); - TransposeGPUKernelDriver(dev_ctx, ndims, *out_grad_tensor, reversed_axis, - x_grad_tensor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - transpose, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel>, - ops::TransposeGPUKernel>); -REGISTER_OP_CUDA_KERNEL( - transpose_grad, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel>, - ops::TransposeGradGPUKernel>); - -REGISTER_OP_CUDA_KERNEL( - transpose2, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel>, - ops::TransposeGPUKernel>); -REGISTER_OP_CUDA_KERNEL( - transpose2_grad, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel>, - ops::TransposeGradGPUKernel>); diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index b542fa37f88..a31ac28c991 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -16,8 +16,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" namespace paddle { namespace operators { @@ -258,10 +259,10 @@ struct SystemElemType<16> { }; template -void LaunchNarrowDims2TransposeKernel(const platform::CUDADeviceContext& d, - int tile_size_i, int tile_size_j, - int total_tiles_count, const T* input, - const Dim3& input_dims, T* output) { +void LaunchNarrowDims2TransposeKernel(const phi::GPUContext& d, int tile_size_i, + int tile_size_j, int total_tiles_count, + const T* input, const Dim3& input_dims, + T* output) { constexpr int NumThreads = tile_long; if (tile_size_i <= tile_long && tile_size_j <= tile_short) { TilingSwapDim1And2< @@ -278,7 +279,7 @@ void LaunchNarrowDims2TransposeKernel(const platform::CUDADeviceContext& d, template struct NarrowDims2TransposeDispatch { - static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i, + static void DoTranspose(const phi::GPUContext& d, int tile_size_i, int tile_size_j, int total_tiles_count, const T* input, const Dim3& input_dims, T* output) { PADDLE_ENFORCE_EQ( @@ -319,7 +320,7 @@ struct NarrowDims2TransposeDispatch< T, tile_long, tile_short, typename std::enable_if< CheckNonLongTileSize(tile_long, tile_short, sizeof(T)), void>::type> { - static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i, + static void DoTranspose(const phi::GPUContext& d, int tile_size_i, int tile_size_j, int total_tiles_count, const T* input, const Dim3& input_dims, T* output) { PADDLE_ENFORCE_EQ( @@ -351,7 +352,7 @@ struct NarrowDims2TransposeDispatch< T, tile_long, tile_short, typename std::enable_if::type> { - static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i, + static void DoTranspose(const phi::GPUContext& d, int tile_size_i, int tile_size_j, int total_tiles_count, const T* input, const Dim3& input_dims, T* output) { PADDLE_ENFORCE_EQ( @@ -368,7 +369,7 @@ struct NarrowDims2TransposeDispatch< }; template -void SwapDim1And2InNarrow(const platform::CUDADeviceContext& d, const T* input, +void SwapDim1And2InNarrow(const phi::GPUContext& d, const T* input, const Dim3& input_dims, T* output, const int kMinTileSize) { // First get available tile sizes for the data type requested as backups @@ -473,9 +474,8 @@ __global__ void TransposeSimpleKernel(int nthreads, const T* __restrict__ input, // Here suppose convert all tensor to dim3, so just change dim1 and 2. 
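 // For example, a tensor viewed as [batch, d1, d2] with permutation [0, 2, 1]
 // only swaps its last two dimensions; the dispatch below then picks a tiled
 // shared-memory kernel, the narrow-dims variant, or the plain element-wise
 // kernel depending on how large d1 and d2 are.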
template -void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d, - const T* input, const Dim3& input_dims, - T* output) { +void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input, + const Dim3& input_dims, T* output) { // Suppose tile size > 16 static const int kMinTileSize = 16; static const int kMinNarrowTileSize = 96; @@ -512,7 +512,7 @@ void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d, } else { // If input shape is small, such as 8X8, just do simple copy int total_elements = input_dims[0] * input_dims[1] * input_dims[2]; - auto config = GetGpuLaunchConfig1D(d, total_elements); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_elements); TransposeSimpleKernel<<< config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>( total_elements, input, input_dims, output); @@ -521,7 +521,7 @@ void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d, template struct SwapDim1And2InTranspose { - typedef platform::CUDADeviceContext Device; + typedef phi::GPUContext Device; void operator()(const Device& d, const T* in, const std::vector& combined_dims, T* out) { Dim3 input_dims = {static_cast(combined_dims[0]), @@ -533,7 +533,7 @@ struct SwapDim1And2InTranspose { template struct SwapDim0And2InTranspose { - typedef platform::CUDADeviceContext Device; + typedef phi::GPUContext Device; void operator()(const Device& d, const T* in, const std::vector& combined_dims, T* out) { Dim3 input_dims = {static_cast(combined_dims[0]), @@ -541,7 +541,7 @@ struct SwapDim0And2InTranspose { static_cast(combined_dims[2])}; size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2]; - auto config = GetGpuLaunchConfig1D(d, total_size); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_size); TransposeSimpleKernel<<< config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>( @@ -607,7 +607,7 @@ inline void CombineTransposeDim3(const framework::DDim& shape, template struct TransposeSimple { - static bool run(const platform::CUDADeviceContext& ctx, const Tensor& in, + static bool run(const phi::GPUContext& ctx, const Tensor& in, const std::vector perm, Tensor* out) { // First reduce the dimensions of the input tensor if possible. 
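    // For example (an added illustration, not in the original source): a
    // [2, 3, 4, 5] tensor transposed with perm = [0, 3, 1, 2] keeps input
    // axes 1 and 2 adjacent and in order, so they can be fused into one axis
    // of size 12; the problem becomes shape [2, 12, 5] with perm = [0, 2, 1],
    // which is exactly the rank-3 dim1/dim2 swap handled by the tiled
    // kernels above.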
std::vector new_perm; @@ -654,12 +654,12 @@ struct TransposeSimple { }; template -void TransposeGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, - const int ndims, const Tensor& in, - const std::vector perm, Tensor* out) { +void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx, const int ndims, + const Tensor& in, + const std::vector& perm, Tensor* out) { auto ret = TransposeSimple::run(dev_ctx, in, perm, out); if (!ret) { - TransCompute(ndims, dev_ctx, in, out, perm); + TransCompute(ndims, dev_ctx, in, out, perm); } } diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index ec05a534c0e..a9e4876cc82 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -59,63 +59,5 @@ inline void TransCompute(const int dim, const DeviceContext& dev_ctx, } } -template -class TransposeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.InputVar("X"); - auto* out = context.OutputVar("Out"); - - const framework::Tensor* x_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*x); - framework::Tensor* out_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(out); - - out_tensor->mutable_data(context.GetPlace()); - if (out_tensor->numel() == 0) { - return; - } - - std::vector axis = context.Attr>("axis"); - int ndims = axis.size(); - auto& dev_ctx = context.template device_context(); - TransCompute(ndims, dev_ctx, *x_tensor, out_tensor, axis); - } -}; - -template -class TransposeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out_grad = context.InputVar(framework::GradVarName("Out")); - auto* x_grad = context.OutputVar(framework::GradVarName("X")); - - if (!x_grad) { - return; - } - const framework::Tensor* out_grad_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*out_grad); - framework::Tensor* x_grad_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(x_grad); - - x_grad_tensor->mutable_data(context.GetPlace()); - if (x_grad_tensor->numel() == 0) { - return; - } - - std::vector axis = context.Attr>("axis"); - std::vector reversed_axis(axis); - - for (size_t i = 0; i < axis.size(); i++) { - reversed_axis[axis[i]] = i; - } - - int ndims = axis.size(); - auto& dev_ctx = context.template device_context(); - TransCompute(ndims, dev_ctx, *out_grad_tensor, - x_grad_tensor, reversed_axis); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc index cce3f188c8b..5617d728a51 100644 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ b/paddle/fluid/operators/transpose_op_npu_test.cc @@ -31,7 +31,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(transpose2); +USE_OP_ITSELF(transpose2); USE_OP_DEVICE_KERNEL(transpose2, NPU); template diff --git a/paddle/phi/kernels/cpu/transpose_grad_kernel.cc b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc new file mode 100644 index 00000000000..9dbcf575f33 --- /dev/null +++ b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/transpose_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(transpose_grad, + CPU, + ALL_LAYOUT, + phi::TransposeGradKernel, + bool, + float, + double, + int32_t, + int64_t, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/transpose_kernel.cc b/paddle/phi/kernels/cpu/transpose_kernel.cc new file mode 100644 index 00000000000..a80196e7f80 --- /dev/null +++ b/paddle/phi/kernels/cpu/transpose_kernel.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/transpose_kernel.h" +#include +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" + +namespace phi { + +template +void TransposeKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out) { + ctx.template Alloc(out); + if (out->numel() == 0) { + return; + } + int rank = axis.size(); + switch (rank) { + case 1: + funcs::Transpose trans1; + trans1(ctx, x, out, axis); + break; + case 2: + funcs::Transpose trans2; + trans2(ctx, x, out, axis); + break; + case 3: + funcs::Transpose trans3; + trans3(ctx, x, out, axis); + break; + case 4: + funcs::Transpose trans4; + trans4(ctx, x, out, axis); + break; + case 5: + funcs::Transpose trans5; + trans5(ctx, x, out, axis); + break; + case 6: + funcs::Transpose trans6; + trans6(ctx, x, out, axis); + break; + default: + // for rank >= 7 situation + funcs::TransposeNormal trans_normal; + trans_normal(ctx, x, out, axis); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(transpose, + CPU, + ALL_LAYOUT, + phi::TransposeKernel, + bool, + float, + double, + int32_t, + int64_t, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu index ae368a005f0..df2af82d551 100644 --- a/paddle/phi/kernels/funcs/math_function.cu +++ b/paddle/phi/kernels/funcs/math_function.cu @@ -187,6 +187,57 @@ void TransposeNormal::operator()( in_ptr, out_ptr, elements, in_stride_ptr, out_stride_ptr, axis_ptr, rank); } +template +struct 
TransposeNormal { + void operator()(const phi::GPUContext& context, + const DenseTensor& in, + DenseTensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride = stride(in.dims()); + auto out_stride = stride(out->dims()); + auto* in_ptr = in.data(); + auto* out_ptr = out->data(); + + // copy in_stride, out_stride, axis to gpu device + const phi::GPUPlace& cuda_place = context.GetPlace(); + phi::CPUPlace cpu_place = paddle::platform::CPUPlace(); + size_t size = 3 * rank * sizeof(int64_t); + auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size); + auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size); + REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr()); + REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr()); + for (int i = 0; i < rank; ++i) { + cpu_buf[i] = in_stride[i]; + cpu_buf[rank + i] = out_stride[i]; + cpu_buf[2 * rank + i] = axis[i]; + } + paddle::memory::Copy( + cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream()); + REINTERPRET(const int64_t, in_stride_ptr, cuda_buf); + REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank); + REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank); + + const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock(); + const int MAX_GRID_DIM = + context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; + int64_t elements = in.numel(); + int block_size = (elements >= MAX_BLOCK_DIM) + ? MAX_BLOCK_DIM + : (1 << static_cast(std::log2(elements))); + int grid_size = elements / block_size; + grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size; + TransposeNormalKernel<<>>( + in_ptr, + out_ptr, + elements, + in_stride_ptr, + out_stride_ptr, + axis_ptr, + rank); + } +}; + // define transpose normal #define DEFINE_GPU_TRANS_NORMAL(TYPE) \ template struct TransposeNormal; \ diff --git a/paddle/phi/kernels/gpu/transpose_grad_kernel.cu b/paddle/phi/kernels/gpu/transpose_grad_kernel.cu new file mode 100644 index 00000000000..0687dc0c200 --- /dev/null +++ b/paddle/phi/kernels/gpu/transpose_grad_kernel.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" +#include "paddle/phi/kernels/transpose_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(transpose_grad, + GPU, + ALL_LAYOUT, + phi::TransposeGradKernel, + bool, + float, + double, + int32_t, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/transpose_kernel.cu b/paddle/phi/kernels/gpu/transpose_kernel.cu new file mode 100644 index 00000000000..9ea2af292cc --- /dev/null +++ b/paddle/phi/kernels/gpu/transpose_kernel.cu @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +#include "paddle/fluid/framework/gpu_utils.h" +#include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" + +namespace phi { +template +void TransposeKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out) { + int rank = axis.size(); + ctx.template Alloc(out); + if (out->numel() == 0) { + return; + } + paddle::operators::TransposeGPUKernelDriver(ctx, rank, x, axis, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(transpose, + GPU, + ALL_LAYOUT, + phi::TransposeKernel, + bool, + float, + double, + int32_t, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/impl/transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/transpose_grad_kernel_impl.h new file mode 100644 index 00000000000..6bb555fe28f --- /dev/null +++ b/paddle/phi/kernels/impl/transpose_grad_kernel_impl.h @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/transpose_grad_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +namespace phi { + +template +void TransposeGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const std::vector& axis, + DenseTensor* x_grad) { + std::vector reversed_axis(axis); + + dev_ctx.template Alloc(x_grad); + for (size_t i = 0; i < axis.size(); i++) { + reversed_axis[axis[i]] = i; + } + + TransposeKernel(dev_ctx, out_grad, reversed_axis, x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/transpose_grad_kernel.h b/paddle/phi/kernels/transpose_grad_kernel.h new file mode 100644 index 00000000000..33d4ca7e3c6 --- /dev/null +++ b/paddle/phi/kernels/transpose_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TransposeGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const std::vector& axis, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/transpose_kernel.h b/paddle/phi/kernels/transpose_kernel.h new file mode 100644 index 00000000000..303b4a9a8f0 --- /dev/null +++ b/paddle/phi/kernels/transpose_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TransposeKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/transpose_sig.cc b/paddle/phi/ops/compat/transpose_sig.cc new file mode 100644 index 00000000000..90961760cfc --- /dev/null +++ b/paddle/phi/ops/compat/transpose_sig.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature TransposeOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("transpose", {"X"}, {"axis"}, {"Out"}); +} + +KernelSignature TransposeGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "transpose_grad", {GradVarName("Out")}, {"axis"}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_BASE_KERNEL_NAME(transpose2, transpose); +PD_REGISTER_BASE_KERNEL_NAME(transpose2_grad, transpose_grad); + +PD_REGISTER_ARG_MAPPING_FN(transpose2, phi::TransposeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(transpose2_grad, + phi::TransposeGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(transpose, phi::TransposeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(transpose_grad, phi::TransposeGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 2a8f72c2170..2633a599256 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -43,7 +43,7 @@ class TestParallelExecutorBase(unittest.TestCase): get_data_from_feeder=None, use_parallel_executor=True, use_reduce=False, - use_ir_memory_optimize=True, + use_ir_memory_optimize=False, enable_inplace=True, fuse_elewise_add_act_ops=False, fuse_all_optimizer_ops=False, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py index d54194164a5..110bb961bbe 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py @@ -207,4 +207,5 @@ class TestDygraphSimpleNet(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index 1cb39eb131b..b87e8d4e3c2 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -206,4 +206,5 @@ class TestTransformer(TestParallelExecutorBase): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py index 1661f753a84..15d9e0e2daa 100644 --- a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py @@ -14,10 +14,12 @@ import unittest import paddle.fluid as fluid +import paddle fluid.core._set_eager_deletion_mode(0.0, 0.55, True) from test_parallel_executor_transformer import TestTransformer if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index 13b880b28bf..1e6b4354dd9 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -463,4 +463,5 @@ class TestMoveAxis(unittest.TestCase): if __name__ == '__main__': + 
    paddle.enable_static()
    unittest.main()
-- 
GitLab


From 66196573ffe73bd3e02a4f713e2b2578bbf601aa Mon Sep 17 00:00:00 2001
From: Aurelius84
Date: Wed, 2 Mar 2022 17:50:32 +0800
Subject: [PATCH 062/272] [XPU] Fix Phi Kernel cache problem in operator.cc
 (#40044)

* [XPU] Fix Phi Kernel cache problem in operator.cc

* fix typo
---
 paddle/fluid/framework/operator.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index b91ee3c2d63..ffdc3e6d3c2 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1210,6 +1210,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
       VLOG(6) << "Static mode ChoosePhiKernel - kernel `" << pt_kernel_name
               << "` not found.";
     }
+  } else {
+    pt_kernel_name = pt_kernel_signature_->name;
+    pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get());
   }
 #ifdef PADDLE_WITH_XPU
   bool is_xpu_unsupport =
-- 
GitLab


From 5898e9abecc05bc039e29838ec4b8fb49ae2d3f0 Mon Sep 17 00:00:00 2001
From: YuanRisheng
Date: Wed, 2 Mar 2022 18:25:54 +0800
Subject: [PATCH 063/272] [Phi]Move elementwise function to funcs directory
 (#39986)

* move elementwise function to funcs directory

* fix compile bugs

* modify according to comment
---
 .../elementwise/elementwise_add_op.kps        |   2 +-
 .../elementwise/elementwise_op_broadcast.cu.h |   3 -
 .../elementwise/elementwise_op_function.h     |  29 +-
 .../elementwise/elementwise_op_impl.cu.h      |   2 +-
 paddle/fluid/operators/viterbi_decode_op.h    |  12 +-
 paddle/phi/kernels/cpu/elementwise.h          | 619 +----------------
 paddle/phi/kernels/cpu/elementwise_grad.h     | 146 ++++
 .../kernels/cpu/elementwise_grad_kernel.cc    |  27 +-
 paddle/phi/kernels/cpu/logical_kernel.cc      |  20 +-
 paddle/phi/kernels/cpu/math_kernel.cc         |   9 +-
 paddle/phi/kernels/funcs/broadcast_function.h |  18 +-
 paddle/phi/kernels/funcs/elementwise_base.h   | 285 ++++----
 .../elementwise_grad_base.h}                  | 655 +++++++++++-------
 paddle/phi/kernels/funcs/elementwise_utils.h  | 121 ++++
 paddle/phi/kernels/gpu/elementwise_grad.h     | 246 +++++++
 .../kernels/gpu/elementwise_grad_kernel.cu    |  27 +-
 paddle/phi/kernels/gpu/logical_kernel.cu      |   3 +-
 paddle/phi/kernels/gpu/math_kernel.cu         |   2 +-
 .../impl/elementwise_grad_kernel_impl.h       |  33 +-
 19 files changed, 1149 insertions(+), 1110 deletions(-)

diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.kps b/paddle/fluid/operators/elementwise/elementwise_add_op.kps
index d6e0749318e..3b7457d72e1 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.kps
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.kps
@@ -39,7 +39,7 @@ limitations under the License.
*/ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #else #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/phi/kernels/gpu/elementwise.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" #endif namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 418779c32e8..102127e6ffe 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -16,9 +16,6 @@ #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -// only can include the headers in paddle/top/api dirs -#include "paddle/phi/kernels/gpu/elementwise.h" - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index a1a7f831098..61862aa9f87 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -31,6 +31,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" #if defined(__NVCC__) || defined(__HIPCC__) #ifdef __NVCC__ @@ -133,7 +134,7 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, inline framework::DDim trim_trailing_singular_dims( const framework::DDim &dims) { - return phi::funcs::trim_trailing_singular_dims(dims); + return phi::funcs::TrimTrailingSingularDims(dims); } template ( dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } else { - phi::ElemwiseGradComputeWithBroadcast( + phi::funcs::ElemwiseGradComputeWithBroadcast( dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } } @@ -173,19 +174,9 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx, const framework::Tensor *y, int axis, Functor func, framework::Tensor *z) { z->mutable_data(ctx.GetPlace()); - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) - const auto &dev_ctx = - ctx.template device_context(); - phi::ElementwiseCompute(dev_ctx, *x, *y, axis, func, - z); - -#endif - return; - } - const auto &dev_ctx = - ctx.template device_context(); - phi::ElementwiseCompute(dev_ctx, *x, *y, axis, func, z); + const auto &dev_ctx = ctx.template device_context(); + phi::funcs::ElementwiseCompute(dev_ctx, *x, *y, axis, + func, z); } // FusedElemwiseAndAct @@ -443,8 +434,8 @@ void FusedElemwiseAndActComputeWithBroadcast( axis = (y_dim.size() == 0) ? x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - phi::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + phi::funcs::GetMidDims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); if (post == 1) { int h = pre; int w = n; @@ -991,8 +982,8 @@ void FusedElemwiseAndActGradComputeWithBroadcast( axis = (y_dim.size() == 0) ? 
x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - phi::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + phi::funcs::GetMidDims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); const T *x_data = nullptr; const T *y_data = nullptr; if (x->IsInitialized()) x_data = x->data(); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 7d7bb4f26fc..f49e2ab4e17 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -19,7 +19,7 @@ limitations under the License. */ // only can include the headers in paddle/top/api dirs #include "paddle/phi/api/lib/utils/tensor_utils.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/viterbi_decode_op.h b/paddle/fluid/operators/viterbi_decode_op.h index 8f01a0c3604..bf12a03e7b4 100644 --- a/paddle/fluid/operators/viterbi_decode_op.h +++ b/paddle/fluid/operators/viterbi_decode_op.h @@ -151,12 +151,12 @@ struct GetInputIndex { const std::vector& output_strides, int output_idx, int* index_array, int* lhs_idx, int* rhs_idx) { int out_dims_size = output_strides.size(); - *lhs_idx = - phi::GetElementwiseIndex(lhs_dims.data(), out_dims_size, index_array); - *rhs_idx = - phi::GetElementwiseIndex(rhs_dims.data(), out_dims_size, index_array); - phi::UpdateElementwiseIndexArray(output_dims.data(), out_dims_size, - index_array); + *lhs_idx = phi::funcs::GetElementwiseIndex(lhs_dims.data(), out_dims_size, + index_array); + *rhs_idx = phi::funcs::GetElementwiseIndex(rhs_dims.data(), out_dims_size, + index_array); + phi::funcs::UpdateElementwiseIndexArray(output_dims.data(), out_dims_size, + index_array); } }; diff --git a/paddle/phi/kernels/cpu/elementwise.h b/paddle/phi/kernels/cpu/elementwise.h index 28bf5ab743f..0f67df66113 100644 --- a/paddle/phi/kernels/cpu/elementwise.h +++ b/paddle/phi/kernels/cpu/elementwise.h @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/common_shape.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" @@ -189,250 +189,6 @@ struct SameDimsMultiplyFunctor< } }; -inline void UpdateElementwiseIndexArray(const int* out_dims_array, - const int max_dim, - int* index_array) { - for (int i = max_dim - 1; i >= 0; --i) { - ++index_array[i]; - if (index_array[i] >= out_dims_array[i]) { - index_array[i] -= out_dims_array[i]; - } else { - break; - } - } -} - -inline int GetElementwiseIndex(const int* x_dims_array, - const int max_dim, - const int* index_array) { - int index_ = 0; - for (int i = 0; i < max_dim; i++) { - if (x_dims_array[i] > 1) { - index_ = index_ * x_dims_array[i] + index_array[i]; - } - } - return index_; -} - -template -void CommonGradBroadcastCPU(const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int* x_dims_array, - int* y_dims_array, - int* out_dims_array, - int max_dim, - const CPUContext& ctx, - DX_OP dx_op, - DY_OP dy_op) { - std::vector index_array(max_dim, 0); - const T* x_data = x.data(); - const T* y_data = y.data(); - const Tout* out_data = out.data(); - const Tout* dout_data = dout.data(); - T* dx_data = dx == nullptr ? nullptr : ctx.Alloc(dx); - T* dy_data = dy == nullptr ? nullptr : ctx.Alloc(dy); - if (dx_data != nullptr) { - memset(dx_data, 0, dx->numel() * sizeof(T)); - } - if (dy_data != nullptr) { - memset(dy_data, 0, dy->numel() * sizeof(T)); - } - const int out_size = std::accumulate( - out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); - int x_index, y_index; - for (int out_index = 0; out_index < out_size; ++out_index) { - x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); - y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); - if (dx_data != nullptr) { - dx_data[x_index] += dx_op(x_data[x_index], - y_data[y_index], - out_data[out_index], - dout_data[out_index]); - } - if (dy_data != nullptr) { - dy_data[y_index] += dy_op(x_data[x_index], - y_data[y_index], - out_data[out_index], - dout_data[out_index]); - } - - UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); - } -} - -template -void CommonForwardBroadcastCPU(const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z, - int* x_dims_array, - int* y_dims_array, - int* out_dims_array, - int max_dim, - const CPUContext& ctx, - Functor func, - const bool is_xsize_larger = true) { - std::vector index_array(max_dim, 0); - const T* x_data = x.data(); - const T* y_data = y.data(); - PADDLE_ENFORCE_NOT_NULL( - x_data, phi::errors::InvalidArgument("The input X should not be empty.")); - PADDLE_ENFORCE_NOT_NULL( - y_data, phi::errors::InvalidArgument("The input Y should not be empty.")); - OutType* out_data = ctx.Alloc(z); - - const int out_size = std::accumulate( - out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); - int x_index, y_index; - for (int out_index = 0; out_index < out_size; ++out_index) { - x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); - y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); - if (is_xsize_larger) { - out_data[out_index] = func(x_data[x_index], y_data[y_index]); - } else { - out_data[out_index] = 
func(y_data[y_index], x_data[x_index]); - } - - UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); - } -} - -template -void CommonElementwiseBroadcastForward(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z, - const DDim& x_dims, - const DDim& y_dims, - Functor func, - int axis, - const bool is_xsize_larger = true) { - int max_dim = (std::max)(x_dims.size(), y_dims.size()); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - phi::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - phi::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - funcs::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - - CommonForwardBroadcastCPU(x, - y, - z, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - dev_ctx, - func, - is_xsize_larger); -} - -// It is a common CPU implementation to compute binary calculation with the -// support of broadcast. Note: -// 1. CPU implementation cannot support the case when x needs broadcast, thus -// this function need to be called with XxxFunctor and XxxInverseFunctor, -// like AddFunctor and InverseAddFunctor. -// 2. The corresponding GPU implementation supports all the broadcast cases, -// thus there is no need to define and call with XxxInverseFunctor. -// TODO(liuyiqun): optimize the CPU implementation to support all broadcast -// cases and avoid the need of XxxInverseFunctor. -template -void ElementwiseCompute(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - Functor func, - DenseTensor* z) { - dev_ctx.Alloc(z); - auto x_dims = x.dims(); - auto y_dims = y.dims(); - bool is_xsize_larger = true; - int max_dim = x_dims.size(); - if (x_dims.size() < y_dims.size()) { - is_xsize_larger = false; - max_dim = y_dims.size(); - } - funcs::TransformFunctor functor( - x, y, z, dev_ctx, func, is_xsize_larger); - if (x_dims == y_dims) { - functor.Run(); - return; - } - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - phi::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - phi::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - - int pre, n, post, is_run_common_broadcast, axis_trim = 0; - if (is_xsize_larger) { - auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); - axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - funcs::get_mid_dims(x_dims, - y_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } else { - auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); - axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - funcs::get_mid_dims(y_dims, - x_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } - // special case for common implementation. 
- // case 1: x=[2,3,1,5], y=[2,1,4,1] - // case 2: x=[2,3,4], y=[1,1,4] - if (is_run_common_broadcast == 1) { - CommonElementwiseBroadcastForward( - dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); - return; - } - - if (post == 1) { - functor.RunRowWise(n, pre); - return; - } else { - functor.RunMidWise(n, pre, post); - return; - } -} - template struct SameDimsElementwiseCompute { void operator()(const CPUContext& dev_ctx, @@ -443,377 +199,4 @@ struct SameDimsElementwiseCompute { } }; -// BACKWARD CODE - -template -static void ElemwiseGradBroadcast1CPU(const T* x, - const T* y, - const Tout* out, - const Tout* dout, - int h, - int w, - bool is_xsize_larger, - DX_OP dx_op, - DY_OP dy_op, - T* dx, - T* dy) { - if (is_xsize_larger) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int x_offset = i * w + j; - if (dx != nullptr) { - dx[x_offset] = - dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - } - if (dy != nullptr) { - T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - if (i == 0) { - dy[j] = tmp; - } else { - dy[j] += tmp; - } - } - } - } - } else { // x.dims < y.dims, broadcast for x. - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int y_offset = i * w + j; - if (dy != nullptr) { - dy[y_offset] = - dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - } - if (dx != nullptr) { - T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - if (i == 0) { - dx[j] = tmp; - } else { - dx[j] += tmp; - } - } - } - } - } -} - -template -static void ElemwiseGradBroadcast2CPU(const T* x, - const T* y, - const Tout* out, - const Tout* dout, - int pre, - int n, - int post, - bool is_xsize_larger, - DX_OP dx_op, - DY_OP dy_op, - T* dx, - T* dy) { - if (is_xsize_larger) { - for (int i = 0; i < pre; ++i) { - for (int j = 0; j < n; ++j) { - for (int k = 0; k < post; ++k) { - int x_offset = i * n * post + j * post + k; - if (dx != nullptr) { - dx[x_offset] = - dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - } - if (dy != nullptr) { - T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - if (i == 0 && k == 0) { - dy[j] = tmp; - } else { - dy[j] += tmp; - } - } - } - } - } - } else { // x.dims < y.dims, broadcast for x. - for (int i = 0; i < pre; ++i) { - for (int j = 0; j < n; ++j) { - for (int k = 0; k < post; ++k) { - int y_offset = i * n * post + j * post + k; - if (dy != nullptr) { - dy[y_offset] = - dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - } - if (dx != nullptr) { - T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - if (i == 0 && k == 0) { - dx[j] = tmp; - } else { - dx[j] += tmp; - } - } - } - } - } - } -} - -template -void CommonElementwiseBroadcastBackward(const CPUContext& ctx, - const DDim& x_dims, - const DDim& y_dims, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy, - DX_OP dx_op, - DY_OP dy_op) { - int max_dim = std::max(x_dims.size(), y_dims.size()); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - funcs::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - // for inplace strategy. memset will make dx and dout clear and get wrong - // result. 
- if (dx && dx->IsSharedBufferWith(dout)) { - dx->clear(); - dx->mutable_data(x_dims, ctx.GetPlace()); - } - - VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" - << phi::make_ddim(x_dims_array) - << " ydim:" << phi::make_ddim(y_dims_array); - - CommonGradBroadcastCPU(x, - y, - out, - dout, - dx, - dy, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - ctx, - dx_op, - dy_op); -} - -template -void ElemwiseGradComputeWithBroadcast(const CPUContext& ctx, - const DDim& x_dims, - const DDim& y_dims, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy, - DX_OP dx_op, - DY_OP dy_op) { - bool is_xsize_larger = true; - - int max_dim = x_dims.size(); - if (x_dims.size() < y_dims.size()) { - is_xsize_larger = false; - max_dim = y_dims.size(); - } - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - phi::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - phi::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - - int pre, n, post, is_run_common_broadcast, axis_trim = 0; - if (is_xsize_larger) { - auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); - axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - funcs::get_mid_dims(x_dims, - y_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } else { - auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); - axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - funcs::get_mid_dims(y_dims, - x_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } - // special case for common backward implementation. - if (is_run_common_broadcast) { - CommonElementwiseBroadcastBackward( - ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - return; - } - if (post == 1) { - ElemwiseGradBroadcast1CPU(x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : ctx.Alloc(dx), - dy == nullptr ? nullptr : ctx.Alloc(dy)); - } else { - ElemwiseGradBroadcast2CPU(x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - post, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : ctx.Alloc(dx), - dy == nullptr ? nullptr : ctx.Alloc(dy)); - } -} - -// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub. -// explicit gradient can cut off X, Y, Out from gradient op -// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse -// elementwise code. 
-template -void ElemwiseExplicitGradCompute(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy, - DX_OP dx_op, - DY_OP dy_op) { - const DDim& x_dim = x.dims(); - const DDim& y_dim = y.dims(); - if (x.dims() == y.dims()) { - phi::funcs::ElemwiseGradComputeNoBroadcast( - dev_ctx, - x_dim, - y_dim, - dout, - dout, - out, - dout, - axis, - dx, - dy, - dx_op, - dy_op); - } else { - ElemwiseGradComputeWithBroadcast(dev_ctx, - x_dim, - y_dim, - dout, - dout, - out, - dout, - axis, - dx, - dy, - dx_op, - dy_op); - } -} - -/* -****************************** - Add Grad -****************************** -*/ -template -struct IdentityGrad { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } -}; - -template -typename std::enable_if::value>::type -elementwise_add_grad(const CPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int axis = -1) { - auto blas = phi::funcs::GetBlas(ctx); - if (dx) { - blas.VCOPY( - dout.numel(), dout.data(), dx->mutable_data(ctx.GetPlace())); - } - - if (dy) { - blas.VCOPY( - dout.numel(), dout.data(), dy->mutable_data(ctx.GetPlace())); - } -} - -template -typename std::enable_if::value>::type -elementwise_add_grad(const CPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int axis = -1) { - ElemwiseExplicitGradCompute, IdentityGrad>( - ctx, x, y, out, dout, axis, dx, dy, IdentityGrad(), IdentityGrad()); -} - -/* -****************************** - Sub Grad -****************************** -*/ - -template -struct SubGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } -}; - -template -struct SubGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; } -}; - -template -void elementwise_sub_grad(const CPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int axis = -1) { - ElemwiseExplicitGradCompute, SubGradDY>( - ctx, x, y, out, dout, axis, dx, dy, SubGradDX(), SubGradDY()); -} - } // namespace phi diff --git a/paddle/phi/kernels/cpu/elementwise_grad.h b/paddle/phi/kernels/cpu/elementwise_grad.h new file mode 100644 index 00000000000..92587566eb8 --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_grad.h @@ -0,0 +1,146 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/elementwise_grad_base.h" + +namespace phi { + +// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub. 
+// explicit gradient can cut off X, Y, Out from gradient op +// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse +// elementwise code. +template +void ElemwiseExplicitGradCompute(const CPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy, + DX_OP dx_op, + DY_OP dy_op) { + const DDim& x_dim = x.dims(); + const DDim& y_dim = y.dims(); + if (x.dims() == y.dims()) { + funcs::ElemwiseGradComputeNoBroadcast(dev_ctx, + x_dim, + y_dim, + dout, + dout, + out, + dout, + axis, + dx, + dy, + dx_op, + dy_op); + } else { + funcs::ElemwiseGradComputeWithBroadcast(dev_ctx, + x_dim, + y_dim, + dout, + dout, + out, + dout, + axis, + dx, + dy, + dx_op, + dy_op); + } +} + +/* +****************************** + Add Grad +****************************** +*/ +template +struct IdentityGrad { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } +}; + +template +typename std::enable_if::value>::type +ElementwiseAddGrad(const CPUContext& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + auto blas = phi::funcs::GetBlas(ctx); + if (dx) { + blas.VCOPY( + dout.numel(), dout.data(), dx->mutable_data(ctx.GetPlace())); + } + + if (dy) { + blas.VCOPY( + dout.numel(), dout.data(), dy->mutable_data(ctx.GetPlace())); + } +} + +template +typename std::enable_if::value>::type +ElementwiseAddGrad(const CPUContext& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + ElemwiseExplicitGradCompute, IdentityGrad>( + ctx, x, y, out, dout, axis, dx, dy, IdentityGrad(), IdentityGrad()); +} + +/* +****************************** + Sub Grad +****************************** +*/ + +template +struct SubGradDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } +}; + +template +struct SubGradDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; } +}; + +template +void ElementwiseSubGrad(const CPUContext& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + ElemwiseExplicitGradCompute, SubGradDY>( + ctx, x, y, out, dout, axis, dx, dy, SubGradDX(), SubGradDY()); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index c878e8133ff..e48ee805959 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -17,7 +17,8 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" @@ -33,7 +34,7 @@ void AddGradFunc(const CPUContext& dev_ctx, DenseTensor* dy, int axis = -1) { if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - elementwise_add_grad(dev_ctx, x, y, out, dout, dx, dy); + ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); } else { ElemwiseExplicitGradCompute, IdentityGrad>( 
dev_ctx, @@ -68,15 +69,7 @@ void AddDoubleGradKernel(const Context& dev_ctx, const DenseTensor& dout, int axis, DenseTensor* ddout) { - phi::AddDoubleGradImpl(dev_ctx, - y, - ddx, - ddy, - dout, - axis, - ddout, - ElementwiseCompute, T>, - ElementwiseCompute, T>); + phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } template @@ -101,7 +94,7 @@ void SubtractGradKernel(const Context& dev_ctx, DenseTensor* dy) { // skip out auto* out = &dout; - elementwise_sub_grad(dev_ctx, x, y, *out, dout, dx, dy, axis); + ElementwiseSubGrad(dev_ctx, x, y, *out, dout, dx, dy, axis); } template @@ -112,15 +105,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& dout, int axis, DenseTensor* ddout) { - phi::SubtractDoubleGradImpl( - dev_ctx, - y, - ddx, - ddy, - dout, - axis, - ddout, - ElementwiseCompute, T>); + phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } } // namespace phi diff --git a/paddle/phi/kernels/cpu/logical_kernel.cc b/paddle/phi/kernels/cpu/logical_kernel.cc index 3d179e1e75f..a0747b128e5 100644 --- a/paddle/phi/kernels/cpu/logical_kernel.cc +++ b/paddle/phi/kernels/cpu/logical_kernel.cc @@ -16,7 +16,7 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/logical_functor.h" // See Note [ Why still include the fluid headers? ] @@ -24,15 +24,15 @@ namespace phi { -#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ - template \ - void Logical##type##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - DenseTensor* out) { \ - funcs::Logical##type##Functor binary_func; \ - ElementwiseCompute, T, bool>( \ - dev_ctx, x, y, -1, binary_func, out); \ +#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ + template \ + void Logical##type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + funcs::Logical##type##Functor binary_func; \ + funcs::ElementwiseCompute, T, bool>( \ + dev_ctx, x, y, -1, binary_func, out); \ } DEFINE_LOGICAL_BINARY_KERNEL(And) diff --git a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/math_kernel.cc index 5cfcfe62c78..250f656926c 100644 --- a/paddle/phi/kernels/cpu/math_kernel.cc +++ b/paddle/phi/kernels/cpu/math_kernel.cc @@ -20,6 +20,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/reduce_functor.h" @@ -45,10 +46,10 @@ namespace phi { auto x_dims = x.dims(); \ auto y_dims = y.dims(); \ if (x_dims.size() >= y_dims.size()) { \ - ElementwiseCompute, T>( \ + funcs::ElementwiseCompute, T>( \ dev_ctx, x, y, axis, funcs::name##Functor(), out); \ } else { \ - ElementwiseCompute, T>( \ + funcs::ElementwiseCompute, T>( \ dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ } \ } \ @@ -93,10 +94,10 @@ void DivideRawKernel(const Context& dev_ctx, auto x_dims = x.dims(); auto y_dims = y.dims(); if (x_dims.size() >= y_dims.size()) { - ElementwiseCompute, T>( + funcs::ElementwiseCompute, T>( dev_ctx, x, y, axis, funcs::DivideFunctor(), out); } else { - ElementwiseCompute, T>( + funcs::ElementwiseCompute, T>( dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); } } diff --git 
a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 84a36b849af..e9fd4cf47b8 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -25,6 +25,8 @@ namespace kps = phi::kps; namespace phi { namespace funcs { +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) + struct DimensionsTransform { using DimVector = std::vector; typedef void (*MergeFunctor)( @@ -183,8 +185,6 @@ struct DimensionsTransform { } }; -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) - template __device__ __forceinline__ void LoadData( T *dst, @@ -578,6 +578,20 @@ void BroadcastKernel(const KPDevice &ctx, } } +template +void ElementwiseCompute(const GPUContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + int axis, + Functor func, + DenseTensor *z) { + std::vector ins = {&x, &y}; + std::vector outs = {z}; + z->mutable_data(dev_ctx.GetPlace()); + BroadcastKernel( + dev_ctx, ins, &outs, axis, func); +} + #endif } // namespace funcs diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index d369781f845..235dbdd40f6 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -18,7 +18,8 @@ limitations under the License. */ #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/common_shape.h" +#include "paddle/phi/kernels/funcs/elementwise_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) @@ -44,28 +45,6 @@ using ConditionalT = namespace funcs { using DDim = phi::DDim; -template -struct ElemwiseGradNoBroadcast { - const T *x_; - const T *y_; - const Tout *out_; - const Tout *dout_; - - HOSTDEVICE void operator()(size_t i) { - if (dx_ != nullptr) { - dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]); - } - if (dy_ != nullptr) { - dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]); - } - } - - DX_OP dx_op_; - DY_OP dy_op_; - T *dx_; - T *dy_; -}; - template class RowwiseTransformIterator; @@ -293,73 +272,172 @@ class TransformFunctor { bool is_xsize_larger_; }; -inline DDim trim_trailing_singular_dims(const DDim &dims) { - // Remove trailing dimensions of size 1 for y - auto actual_dims_size = dims.size(); - for (; actual_dims_size != 0; --actual_dims_size) { - if (dims[actual_dims_size - 1] != 1) break; - } - if (actual_dims_size == dims.size()) return dims; - std::vector trim_dims; - trim_dims.resize(actual_dims_size); - for (int i = 0; i < actual_dims_size; ++i) { - trim_dims[i] = dims[i]; - } - if (trim_dims.size() == 0) { - return DDim(phi::make_dim()); +template +void CommonForwardBroadcastCPU(const DenseTensor &x, + const DenseTensor &y, + DenseTensor *z, + int *x_dims_array, + int *y_dims_array, + int *out_dims_array, + int max_dim, + const CPUContext &ctx, + Functor func, + const bool is_xsize_larger = true) { + std::vector index_array(max_dim, 0); + const T *x_data = x.data(); + const T *y_data = y.data(); + PADDLE_ENFORCE_NOT_NULL( + x_data, errors::InvalidArgument("The input X should not be empty.")); + PADDLE_ENFORCE_NOT_NULL( + y_data, errors::InvalidArgument("The input Y should not be empty.")); + OutType *out_data = ctx.Alloc(z); + + const int out_size = std::accumulate( + out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); 
+ int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); + y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); + if (is_xsize_larger) { + out_data[out_index] = func(x_data[x_index], y_data[y_index]); + } else { + out_data[out_index] = func(y_data[y_index], x_data[x_index]); + } + + UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); } - DDim actual_dims = phi::make_ddim(trim_dims); - return actual_dims; } -/* - * Out = X ⊙ Y - * If Y's shape does not match X' shape, they will be reshaped. - * For example: - * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 - * pre=2, n=3*4, post=5 - * x.shape(2, 12, 5) * y.shape(1, 12, 1).broadcast(2, 12, 5) - * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) - * pre=2*3, n=4*5, post=1 - * x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1) - * - * New parameter: *is_run_common_broadcast* is a flag to record whether to run - * common broadcast code. - */ -inline void get_mid_dims(const DDim &x_dims, - const DDim &y_dims, - const int axis, - int *pre, - int *n, - int *post, - int *is_run_common_broadcast) { - *pre = 1; - *n = 1; - *post = 1; - *is_run_common_broadcast = 0; - for (int i = 0; i < axis; ++i) { - (*pre) *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - if (x_dims[i + axis] != y_dims[i]) { - PADDLE_ENFORCE_EQ(y_dims[i] == 1 || x_dims[i + axis] == 1, - true, - phi::errors::InvalidArgument( - "Broadcast dimension mismatch. Operands " - "could not be broadcast together with the shape of " - "X = [%s] and the shape of Y = [%s]. Received [%d] " - "in X is not equal to [%d] in Y.", - x_dims, - y_dims, - x_dims[i + axis], - y_dims[i])); - *is_run_common_broadcast = 1; - return; - } - (*n) *= y_dims[i]; - } - for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { - (*post) *= x_dims[i]; +template +void CommonElementwiseBroadcastForward(const CPUContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + DenseTensor *z, + const DDim &x_dims, + const DDim &y_dims, + Functor func, + int axis, + const bool is_xsize_larger = true) { + int max_dim = (std::max)(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + phi::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + phi::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + + CommonForwardBroadcastCPU(x, + y, + z, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + dev_ctx, + func, + is_xsize_larger); +} + +// It is a common CPU implementation to compute binary calculation with the +// support of broadcast. Note: +// 1. CPU implementation cannot support the case when x needs broadcast, thus +// this function need to be called with XxxFunctor and XxxInverseFunctor, +// like AddFunctor and InverseAddFunctor. +// 2. The corresponding GPU implementation supports all the broadcast cases, +// thus there is no need to define and call with XxxInverseFunctor. 
+// TODO(liuyiqun): optimize the CPU implementation to support all broadcast +// cases and avoid the need of XxxInverseFunctor. +template +void ElementwiseCompute(const CPUContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + int axis, + Functor func, + DenseTensor *z) { + dev_ctx.Alloc(z); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + bool is_xsize_larger = true; + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + TransformFunctor functor( + x, y, z, dev_ctx, func, is_xsize_larger); + if (x_dims == y_dims) { + functor.Run(); + return; + } + + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = TrimTrailingSingularDims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + GetMidDims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = TrimTrailingSingularDims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + GetMidDims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common implementation. + // case 1: x=[2,3,1,5], y=[2,1,4,1] + // case 2: x=[2,3,4], y=[1,1,4] + if (is_run_common_broadcast == 1) { + CommonElementwiseBroadcastForward( + dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); + return; + } + + if (post == 1) { + functor.RunRowWise(n, pre); + return; + } else { + functor.RunMidWise(n, pre, post); + return; } } @@ -395,41 +473,11 @@ static inline void GetDoubleGradSafeTensor(const DeviceContext &dev_ctx, auto meta = phi::DenseTensorMeta(x.dtype(), x.dims(), x.layout()); *ddx_safe = phi::Empty(dev_ctx, std::move(meta)); ddx_safe->mutable_data(dev_ctx.GetPlace()); - phi::funcs::SetConstant set_zero; + SetConstant set_zero; set_zero(dev_ctx, ddx_safe, static_cast(0)); } } -template -void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx, - const DDim &x_dim, - const DDim &y_dim, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - int axis, - DenseTensor *dx, - DenseTensor *dy, - DX_OP dx_op, - DY_OP dy_op) { - size_t N = static_cast(phi::product(x_dim)); - phi::funcs::ForRange for_range(dev_ctx, N); - for_range(ElemwiseGradNoBroadcast{ - x.data(), - y.data(), - out.data(), - dout.data(), - dx_op, - dy_op, - dx == nullptr ? nullptr : dev_ctx.template Alloc(dx), - dy == nullptr ? 
nullptr : dev_ctx.template Alloc(dy)}); -} - inline void ElementwiseGradPreProcess(const DenseTensor &dout, DenseTensor *dx) { if (dx != nullptr) { @@ -806,6 +854,7 @@ void ElementwiseKernel(const KPDevice &ctx, } } } + #endif } // namespace funcs diff --git a/paddle/phi/kernels/gpu/elementwise.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h similarity index 78% rename from paddle/phi/kernels/gpu/elementwise.h rename to paddle/phi/kernels/funcs/elementwise_grad_base.h index 12cafc7023b..dff0cfe5b8b 100644 --- a/paddle/phi/kernels/gpu/elementwise.h +++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h @@ -14,16 +14,25 @@ limitations under the License. */ #pragma once -#include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/common_shape.h" -#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/funcs/elementwise_utils.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +#if defined(__NVCC__) || defined(__HIPCC__) +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" + +#endif #ifdef __HIPCC__ constexpr int ELEMWISE_MAX_BLOCK_DIM = 256; #else constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; #endif + #define BLOCK_X 32 #define BLOCK_Y 32 @@ -36,21 +45,361 @@ constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; namespace phi { -// General binary elementwise comutaion with the support of broadcast. -template -void ElementwiseCompute(const GPUContext &dev_ctx, - const DenseTensor &x, - const DenseTensor &y, - int axis, - Functor func, - DenseTensor *z) { - std::vector ins = {&x, &y}; - std::vector outs = {z}; - z->mutable_data(dev_ctx.GetPlace()); - phi::funcs::BroadcastKernel( - dev_ctx, ins, &outs, axis, func); +namespace funcs { +using DDim = phi::DDim; + +template +void CommonGradBroadcastCPU(const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int *x_dims_array, + int *y_dims_array, + int *out_dims_array, + int max_dim, + const CPUContext &ctx, + DX_OP dx_op, + DY_OP dy_op) { + std::vector index_array(max_dim, 0); + const T *x_data = x.data(); + const T *y_data = y.data(); + const Tout *out_data = out.data(); + const Tout *dout_data = dout.data(); + T *dx_data = dx == nullptr ? nullptr : ctx.Alloc(dx); + T *dy_data = dy == nullptr ? 
nullptr : ctx.Alloc(dy); + if (dx_data != nullptr) { + memset(dx_data, 0, dx->numel() * sizeof(T)); + } + if (dy_data != nullptr) { + memset(dy_data, 0, dy->numel() * sizeof(T)); + } + const int out_size = std::accumulate( + out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); + int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); + y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); + if (dx_data != nullptr) { + dx_data[x_index] += dx_op(x_data[x_index], + y_data[y_index], + out_data[out_index], + dout_data[out_index]); + } + if (dy_data != nullptr) { + dy_data[y_index] += dy_op(x_data[x_index], + y_data[y_index], + out_data[out_index], + dout_data[out_index]); + } + + UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); + } +} + +template +static void ElemwiseGradBroadcast1CPU(const T *x, + const T *y, + const Tout *out, + const Tout *dout, + int h, + int w, + bool is_xsize_larger, + DX_OP dx_op, + DY_OP dy_op, + T *dx, + T *dy) { + if (is_xsize_larger) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int x_offset = i * w + j; + if (dx != nullptr) { + dx[x_offset] = + dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + if (dy != nullptr) { + T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + if (i == 0) { + dy[j] = tmp; + } else { + dy[j] += tmp; + } + } + } + } + } else { // x.dims < y.dims, broadcast for x. + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int y_offset = i * w + j; + if (dy != nullptr) { + dy[y_offset] = + dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + if (dx != nullptr) { + T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + if (i == 0) { + dx[j] = tmp; + } else { + dx[j] += tmp; + } + } + } + } + } +} + +template +static void ElemwiseGradBroadcast2CPU(const T *x, + const T *y, + const Tout *out, + const Tout *dout, + int pre, + int n, + int post, + bool is_xsize_larger, + DX_OP dx_op, + DY_OP dy_op, + T *dx, + T *dy) { + if (is_xsize_larger) { + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + for (int k = 0; k < post; ++k) { + int x_offset = i * n * post + j * post + k; + if (dx != nullptr) { + dx[x_offset] = + dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + if (dy != nullptr) { + T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + if (i == 0 && k == 0) { + dy[j] = tmp; + } else { + dy[j] += tmp; + } + } + } + } + } + } else { // x.dims < y.dims, broadcast for x. + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + for (int k = 0; k < post; ++k) { + int y_offset = i * n * post + j * post + k; + if (dy != nullptr) { + dy[y_offset] = + dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + if (dx != nullptr) { + T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + if (i == 0 && k == 0) { + dx[j] = tmp; + } else { + dx[j] += tmp; + } + } + } + } + } + } +} + +template +void CommonElementwiseBroadcastBackward(const CPUContext &ctx, + const DDim &x_dims, + const DDim &y_dims, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + int max_dim = std::max(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + // for inplace strategy. memset will make dx and dout clear and get wrong + // result. + if (dx && dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x_dims, ctx.GetPlace()); + } + + VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" + << phi::make_ddim(x_dims_array) + << " ydim:" << phi::make_ddim(y_dims_array); + + CommonGradBroadcastCPU(x, + y, + out, + dout, + dx, + dy, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + ctx, + dx_op, + dy_op); +} + +template +void ElemwiseGradComputeWithBroadcast(const CPUContext &ctx, + const DDim &x_dims, + const DDim &y_dims, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + bool is_xsize_larger = true; + + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = TrimTrailingSingularDims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + GetMidDims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = TrimTrailingSingularDims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + GetMidDims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common backward implementation. + if (is_run_common_broadcast) { + CommonElementwiseBroadcastBackward( + ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + return; + } + if (post == 1) { + ElemwiseGradBroadcast1CPU(x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? nullptr : ctx.Alloc(dy)); + } else { + ElemwiseGradBroadcast2CPU(x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + post, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? 
nullptr : ctx.Alloc(dy)); + } +} + +template +struct ElemwiseGradNoBroadcast { + const T *x_; + const T *y_; + const Tout *out_; + const Tout *dout_; + + HOSTDEVICE void operator()(size_t i) { + if (dx_ != nullptr) { + dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]); + } + if (dy_ != nullptr) { + dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]); + } + } + + DX_OP dx_op_; + DY_OP dy_op_; + T *dx_; + T *dy_; +}; + +template +void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx, + const DDim &x_dim, + const DDim &y_dim, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + size_t N = static_cast(phi::product(x_dim)); + phi::funcs::ForRange for_range(dev_ctx, N); + for_range(ElemwiseGradNoBroadcast{ + x.data(), + y.data(), + out.data(), + dout.data(), + dx_op, + dy_op, + dx == nullptr ? nullptr : dev_ctx.template Alloc(dx), + dy == nullptr ? nullptr : dev_ctx.template Alloc(dy)}); } +#if defined(__NVCC__) || defined(__HIPCC__) // Suppose only has contiguous dims static inline bool CheckContiguousDims(const std::vector &broadcast_pos) { for (int i = 1; i < broadcast_pos.size(); ++i) { @@ -114,7 +463,6 @@ inline void ComputeBroadcastKernelSize(int *x_dims_array, } } -#ifndef __xpu__ template static __global__ void FastCommonGradBroadcastOneCUDAKernel(const T *x, const T *y, @@ -1282,13 +1630,13 @@ void CommonElementwiseBroadcastBackward(const GPUContext &ctx, std::vector x_dims_array(max_dim); std::vector y_dims_array(max_dim); std::vector out_dims_array(max_dim); - funcs::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); + GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); // for inplace strategy. memset will make dx and dout clear and get wrong // result. if (dx && dx->IsSharedBufferWith(dout)) { @@ -1340,37 +1688,37 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, PADDLE_ENFORCE_GE( axis, 0, - phi::errors::InvalidArgument( + errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); PADDLE_ENFORCE_LT(axis, max_dim, - phi::errors::InvalidArgument( + errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", max_dim, axis)); int pre, n, post, is_run_common_broadcast, axis_trim = 0; if (is_xsize_larger) { - auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); + auto y_dims_trimed = TrimTrailingSingularDims(y_dims); axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - funcs::get_mid_dims(x_dims, - y_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); + GetMidDims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); } else { - auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); + auto x_dims_trimed = TrimTrailingSingularDims(x_dims); axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - funcs::get_mid_dims(y_dims, - x_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); + GetMidDims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); } // special case for common backward implementation. 
if (is_run_common_broadcast) { @@ -1408,228 +1756,7 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, } } -/* -****************************** - Add Grad -****************************** -*/ - -template -static __global__ void SimpleElemwiseAddGradCUDAKernel( - const T *__restrict__ dout, int size, int vec_size, T *dx, T *dy) { - int tid = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; - int stride = GRID_NUM_X * BLOCK_NUM_X; - int loop = size / vec_size; - int remainder = size % vec_size; - const float4 *dout_vec = reinterpret_cast(dout); - float4 *dx_vec = reinterpret_cast(dx); - float4 *dy_vec = reinterpret_cast(dy); - float4 tmp_loop; - - for (int i = tid; i < loop; i += stride) { - tmp_loop = dout_vec[i]; - dx_vec[i] = tmp_loop; - dy_vec[i] = tmp_loop; - } - - if (tid == loop && remainder != 0) { - T tmp_rem; - while (remainder) { - int idx = size - remainder; - remainder--; - tmp_rem = dout[idx]; - dx[idx] = tmp_rem; - dy[idx] = tmp_rem; - } - } -} - -template -void default_elementwise_add_grad(const GPUContext &ctx, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - DenseTensor *dx, - DenseTensor *dy, - int axis = -1) { - auto *dout_data = dout.data(); - - // dx - if (dx != nullptr) { - auto *dx_data = dx->mutable_data(ctx.GetPlace()); - if (dx->dims() == dout.dims()) { - if (dx_data != dout_data) { - phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); - } - } else { - // For inplace strategy, dx will be stored in addr of dout, which makes - // the result of dy wrong. - if (dx->IsSharedBufferWith(dout)) { - dx->clear(); - dx->mutable_data(x.dims(), ctx.GetPlace()); - } - std::vector reduce_dims = - funcs::GetReduceDim(x.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( - ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); - } - } - // dy - if (dy != nullptr) { - auto *dy_data = dy->mutable_data(ctx.GetPlace()); - if (dy->dims() == dout.dims()) { - if (dy_data != dout_data) { - phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); - } - } else { - std::vector reduce_dims = - funcs::GetReduceDim(y.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( - ctx, dout, dy, kps::IdentityFunctor(), reduce_dims, stream); - } - } -} - -template -void elementwise_add_grad(const GPUContext &ctx, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - DenseTensor *dx, - DenseTensor *dy) { - auto *dx_data = dx->mutable_data(ctx.GetPlace()); - auto *dy_data = dy->mutable_data(ctx.GetPlace()); - auto *dout_data = dout.data(); - if (dx_data == dout_data && dy_data != dout_data) { - VLOG(4) << "Special case when dx_data is the same as dout_data, " - "only need copy dout to dy"; - phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); - } else if (dx_data != dout_data && dy_data == dout_data) { - VLOG(4) << "Special case when dy_data is the same as dout_data, " - "only need copy dout to dx"; - phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); - } else if (dx_data != dout_data && dy_data != dout_data) { - auto size = x.numel(); - int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); - dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); - dim3 grid_size = - dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) / - PREDEFINED_BLOCK_SIZE, - 1); - SimpleElemwiseAddGradCUDAKernel< - T><<>>( - dout.data(), - size, - vec_size, - dx->mutable_data(ctx.GetPlace()), - dy->mutable_data(ctx.GetPlace())); - } else { - 
VLOG(4) << "Special case when dy_data is the same as dout_data, " - "and dx_data is the same as dout_data, do not need " - "any operator"; - } -} - -/* -****************************** - Sub Grad -****************************** -*/ - -template -static __global__ void SimpleElemwiseSubGradCUDAKernel(const T *dout, - int64_t size, - T *dx, - T *dy) { - int col = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; - - while (col < size) { - if (dx != nullptr) { - dx[col] = dout[col]; - } - dy[col] = -dout[col]; - col += BLOCK_NUM_X * GRID_NUM_X; - } -} - -template -void default_elementwise_sub_grad(const GPUContext &ctx, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - DenseTensor *dx, - DenseTensor *dy, - int axis = -1) { - auto *dout_data = dout.data(); - // dx - if (dx != nullptr) { - auto *dx_data = dx->mutable_data(ctx.GetPlace()); - if (dx->dims() == dout.dims()) { - if (dx_data != dout_data) { - phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); - } - } else { - // For inplace strategy, dx will be stored in addr of dout, which makes - // the result of dy wrong. - if (dx->IsSharedBufferWith(dout)) { - dx->clear(); - dx->mutable_data(x.dims(), ctx.GetPlace()); - } - std::vector reduce_dims = - funcs::GetReduceDim(x.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( - ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); - } - } - // dy - if (dy != nullptr) { - auto *dy_data = dy->mutable_data(ctx.GetPlace()); - if (dy->dims() == dout.dims()) { - if (dy_data != dout_data) { - dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); - auto size = dy->numel(); - dim3 grid_size = - dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); - SimpleElemwiseSubGradCUDAKernel< - T><<>>( - dout.data(), size, nullptr, dy->mutable_data(ctx.GetPlace())); - } - } else { - std::vector reduce_dims = - funcs::GetReduceDim(y.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( - ctx, dout, dy, kps::InverseFunctor(), reduce_dims, stream); - } - } -} - -template -void elementwise_sub_grad(const GPUContext &ctx, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - DenseTensor *dx, - DenseTensor *dy) { - dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); - auto size = x.numel(); - dim3 grid_size = - dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); - SimpleElemwiseSubGradCUDAKernel< - T><<>>( - dout.data(), - size, - dx->mutable_data(ctx.GetPlace()), - dy->mutable_data(ctx.GetPlace())); -} - #endif +} // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/elementwise_utils.h b/paddle/phi/kernels/funcs/elementwise_utils.h new file mode 100644 index 00000000000..3790044346d --- /dev/null +++ b/paddle/phi/kernels/funcs/elementwise_utils.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" + +namespace phi { + +namespace funcs { + +using DDim = phi::DDim; + +/* + * Out = X ⊙ Y + * If Y's shape does not match X' shape, they will be reshaped. + * For example: + * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + * pre=2, n=3*4, post=5 + * x.shape(2, 12, 5) * y.shape(1, 12, 1).broadcast(2, 12, 5) + * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) + * pre=2*3, n=4*5, post=1 + * x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1) + * + * New parameter: *is_run_common_broadcast* is a flag to record whether to run + * common broadcast code. + */ +inline void GetMidDims(const DDim &x_dims, + const DDim &y_dims, + const int axis, + int *pre, + int *n, + int *post, + int *is_run_common_broadcast) { + *pre = 1; + *n = 1; + *post = 1; + *is_run_common_broadcast = 0; + for (int i = 0; i < axis; ++i) { + (*pre) *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + if (x_dims[i + axis] != y_dims[i]) { + PADDLE_ENFORCE_EQ(y_dims[i] == 1 || x_dims[i + axis] == 1, + true, + phi::errors::InvalidArgument( + "Broadcast dimension mismatch. Operands " + "could not be broadcast together with the shape of " + "X = [%s] and the shape of Y = [%s]. Received [%d] " + "in X is not equal to [%d] in Y.", + x_dims, + y_dims, + x_dims[i + axis], + y_dims[i])); + *is_run_common_broadcast = 1; + return; + } + (*n) *= y_dims[i]; + } + for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { + (*post) *= x_dims[i]; + } +} + +inline DDim TrimTrailingSingularDims(const DDim &dims) { + // Remove trailing dimensions of size 1 for y + auto actual_dims_size = dims.size(); + for (; actual_dims_size != 0; --actual_dims_size) { + if (dims[actual_dims_size - 1] != 1) break; + } + if (actual_dims_size == dims.size()) return dims; + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; + } + if (trim_dims.size() == 0) { + return DDim(phi::make_dim()); + } + DDim actual_dims = phi::make_ddim(trim_dims); + return actual_dims; +} + +inline int GetElementwiseIndex(const int *x_dims_array, + const int max_dim, + const int *index_array) { + int index_ = 0; + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] > 1) { + index_ = index_ * x_dims_array[i] + index_array[i]; + } + } + return index_; +} + +inline void UpdateElementwiseIndexArray(const int *out_dims_array, + const int max_dim, + int *index_array) { + for (int i = max_dim - 1; i >= 0; --i) { + ++index_array[i]; + if (index_array[i] >= out_dims_array[i]) { + index_array[i] -= out_dims_array[i]; + } else { + break; + } + } +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h new file mode 100644 index 00000000000..b17196b6b11 --- /dev/null +++ b/paddle/phi/kernels/gpu/elementwise_grad.h @@ -0,0 +1,246 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_grad_base.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +/* +****************************** + Add Grad +****************************** +*/ + +template +static __global__ void SimpleElemwiseAddGradCUDAKernel( + const T *__restrict__ dout, int size, int vec_size, T *dx, T *dy) { + int tid = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; + int stride = GRID_NUM_X * BLOCK_NUM_X; + int loop = size / vec_size; + int remainder = size % vec_size; + const float4 *dout_vec = reinterpret_cast(dout); + float4 *dx_vec = reinterpret_cast(dx); + float4 *dy_vec = reinterpret_cast(dy); + float4 tmp_loop; + + for (int i = tid; i < loop; i += stride) { + tmp_loop = dout_vec[i]; + dx_vec[i] = tmp_loop; + dy_vec[i] = tmp_loop; + } + + if (tid == loop && remainder != 0) { + T tmp_rem; + while (remainder) { + int idx = size - remainder; + remainder--; + tmp_rem = dout[idx]; + dx[idx] = tmp_rem; + dy[idx] = tmp_rem; + } + } +} + +template +void DefaultElementwiseAddGrad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int axis = -1) { + auto *dout_data = dout.data(); + + // dx + if (dx != nullptr) { + auto *dx_data = dx->mutable_data(ctx.GetPlace()); + if (dx->dims() == dout.dims()) { + if (dx_data != dout_data) { + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); + } + } else { + // For inplace strategy, dx will be stored in addr of dout, which makes + // the result of dy wrong. + if (dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x.dims(), ctx.GetPlace()); + } + std::vector reduce_dims = + funcs::GetReduceDim(x.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceImpl>( + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + } + } + // dy + if (dy != nullptr) { + auto *dy_data = dy->mutable_data(ctx.GetPlace()); + if (dy->dims() == dout.dims()) { + if (dy_data != dout_data) { + phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); + } + } else { + std::vector reduce_dims = + funcs::GetReduceDim(y.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceImpl>( + ctx, dout, dy, kps::IdentityFunctor(), reduce_dims, stream); + } + } +} + +template +void ElementwiseAddGrad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy) { + ctx.template Alloc(dx); + ctx.template Alloc(dy); + auto *dx_data = dx->data(); + auto *dy_data = dy->data(); + auto *dout_data = dout.data(); + if (dx_data == dout_data && dy_data != dout_data) { + VLOG(4) << "Special case when dx_data is the same as dout_data, " + "only need copy dout to dy"; + phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); + } else if (dx_data != dout_data && dy_data == dout_data) { + VLOG(4) << "Special case when dy_data is the same as dout_data, " + "only need copy dout to dx"; + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); + } else if (dx_data != dout_data && dy_data != dout_data) { + auto size = x.numel(); + int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); + dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); + dim3 grid_size = + dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) / + 
PREDEFINED_BLOCK_SIZE, + 1); + SimpleElemwiseAddGradCUDAKernel< + T><<>>( + dout.data(), + size, + vec_size, + dx->mutable_data(ctx.GetPlace()), + dy->mutable_data(ctx.GetPlace())); + } else { + VLOG(4) << "Special case when dy_data is the same as dout_data, " + "and dx_data is the same as dout_data, do not need " + "any operator"; + } +} + +/* +****************************** + Sub Grad +****************************** +*/ + +template +static __global__ void SimpleElemwiseSubGradCUDAKernel(const T *dout, + int64_t size, + T *dx, + T *dy) { + int col = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; + + while (col < size) { + if (dx != nullptr) { + dx[col] = dout[col]; + } + dy[col] = -dout[col]; + col += BLOCK_NUM_X * GRID_NUM_X; + } +} + +template +void default_elementwise_sub_grad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int axis = -1) { + auto *dout_data = dout.data(); + // dx + if (dx != nullptr) { + auto *dx_data = dx->mutable_data(ctx.GetPlace()); + if (dx->dims() == dout.dims()) { + if (dx_data != dout_data) { + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); + } + } else { + // For inplace strategy, dx will be stored in addr of dout, which makes + // the result of dy wrong. + if (dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x.dims(), ctx.GetPlace()); + } + std::vector reduce_dims = + funcs::GetReduceDim(x.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceImpl>( + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + } + } + // dy + if (dy != nullptr) { + auto *dy_data = dy->mutable_data(ctx.GetPlace()); + if (dy->dims() == dout.dims()) { + if (dy_data != dout_data) { + dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); + auto size = dy->numel(); + dim3 grid_size = + dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); + SimpleElemwiseSubGradCUDAKernel< + T><<>>( + dout.data(), size, nullptr, dy->mutable_data(ctx.GetPlace())); + } + } else { + std::vector reduce_dims = + funcs::GetReduceDim(y.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceImpl>( + ctx, dout, dy, kps::InverseFunctor(), reduce_dims, stream); + } + } +} + +template +void elementwise_sub_grad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy) { + dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); + auto size = x.numel(); + dim3 grid_size = + dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); + SimpleElemwiseSubGradCUDAKernel< + T><<>>( + dout.data(), + size, + dx->mutable_data(ctx.GetPlace()), + dy->mutable_data(ctx.GetPlace())); +} +} // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index 3c4c01b1dc8..d00888aee67 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -17,8 +17,9 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/gpu/elementwise.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" #include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" namespace phi { @@ 
-33,9 +34,9 @@ void AddGradFunc(const GPUContext& dev_ctx, DenseTensor* dy, int axis = -1) { if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - elementwise_add_grad(dev_ctx, x, y, out, dout, dx, dy); + ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); } else { - default_elementwise_add_grad(dev_ctx, x, y, out, dout, dx, dy, axis); + DefaultElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy, axis); } } @@ -58,15 +59,7 @@ void AddDoubleGradKernel(const Context& dev_ctx, const DenseTensor& dout, int axis, DenseTensor* ddout) { - phi::AddDoubleGradImpl(dev_ctx, - y, - ddx, - ddy, - dout, - axis, - ddout, - ElementwiseCompute, T>, - ElementwiseCompute, T>); + phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } template @@ -106,15 +99,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& dout, int axis, DenseTensor* ddout) { - phi::SubtractDoubleGradImpl( - dev_ctx, - y, - ddx, - ddy, - dout, - axis, - ddout, - ElementwiseCompute, T>); + phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/logical_kernel.cu b/paddle/phi/kernels/gpu/logical_kernel.cu index f32d4c77d40..1c0bafc932e 100644 --- a/paddle/phi/kernels/gpu/logical_kernel.cu +++ b/paddle/phi/kernels/gpu/logical_kernel.cu @@ -16,9 +16,8 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/logical_functor.h" -#include "paddle/phi/kernels/gpu/elementwise.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu index fc73ccca6de..af9d5574aa9 100644 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ b/paddle/phi/kernels/gpu/math_kernel.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/gpu/elementwise.h" #include "paddle/phi/kernels/gpu/reduce.h" #ifdef __NVCC__ diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 460e74b5816..ac7d6fd1a0e 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" namespace phi { @@ -47,19 +47,14 @@ void AddGradImpl(const Context& dev_ctx, } } -template +template void AddDoubleGradImpl(const Context& dev_ctx, const DenseTensor& y, const paddle::optional& ddx, const paddle::optional& ddy, const DenseTensor& dout, int axis, - DenseTensor* ddout, - GradFunc grad_func, - GradInverseFunc grad_inverse_func) { + DenseTensor* ddout) { // ddOut = ddx + ddy if (ddout) { DenseTensor ddx_safe, ddy_safe; @@ -72,28 +67,28 @@ void AddDoubleGradImpl(const Context& dev_ctx, auto ddx_dims = ddx_safe.dims(); auto ddy_dims = ddy_safe.dims(); if (ddx_dims.size() >= ddy_dims.size()) { - grad_func( + funcs::ElementwiseCompute, T>( dev_ctx, ddx_safe, ddy_safe, axis, funcs::AddFunctor(), ddout); } else { - grad_inverse_func(dev_ctx, - ddx_safe, - ddy_safe, - axis, - funcs::InverseAddFunctor(), - ddout); + funcs::ElementwiseCompute, T>( + dev_ctx, + ddx_safe, + ddy_safe, + axis, + funcs::InverseAddFunctor(), + ddout); } } } -template +template void SubtractDoubleGradImpl(const Context& dev_ctx, const DenseTensor& y, const paddle::optional& ddx, const paddle::optional& ddy, const DenseTensor& dout, int axis, - DenseTensor* ddout, - GradFunc grad_func) { + DenseTensor* ddout) { // DDOut = ddx - ddy if (ddout) { DenseTensor ddx_safe, ddy_safe; @@ -103,7 +98,7 @@ void SubtractDoubleGradImpl(const Context& dev_ctx, dev_ctx, y, ddy.get_ptr(), &ddy_safe); ddout->mutable_data(dev_ctx.GetPlace()); - grad_func( + funcs::ElementwiseCompute, T>( dev_ctx, ddx_safe, ddy_safe, axis, funcs::SubtractFunctor(), ddout); } } -- GitLab From 2e6548a9cd2224e1a4b89c1351f1078273f98328 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Wed, 2 Mar 2022 18:40:00 +0800 Subject: [PATCH 064/272] vec scale kernel (#40011) --- .../optimizers/distributed_fused_lamb_op.cu | 49 +++++++++++++++---- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index ca0828a6f6a..8bb4606ffff 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -304,14 +304,30 @@ struct AndFunctor { HOSTDEVICE bool operator()(bool x, bool y) const { return x && y; } }; -template +template static __global__ void ScaleCUDAKernel(const T1 *__restrict__ x, const T2 *__restrict__ scale, T1 *__restrict__ y, int num) { static_assert(sizeof(T1) <= sizeof(T2), "sizeof(T1) must be not greater than sizeof(T2)."); T2 s = scale[0]; - CUDA_KERNEL_LOOP(i, num) { + + int i = (threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + int stride = blockDim.x * gridDim.x * VecSize; + + for (; i + VecSize <= num; i += stride) { + platform::AlignedVector x_vec; + platform::AlignedVector y_vec; + + platform::Load(x + i, &x_vec); +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + y_vec[j] = static_cast(static_cast(x_vec[j]) * s); + } + platform::Store(y_vec, y + i); + } + + for (; i < num; ++i) { y[i] = static_cast(static_cast(x[i]) * s); } } @@ -396,7 +412,6 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( for (; i + VecSize <= num; i += stride) { platform::AlignedVector param_vec; platform::AlignedVector grad_vec; - platform::AlignedVector 
weight_decay_vec; platform::AlignedVector mom1_vec; platform::AlignedVector mom2_vec; platform::AlignedVector trust_ratio_div_vec; @@ -760,6 +775,24 @@ static bool CreatePreMulScaleOpIfSupported(ncclDataType_t dtype, return false; } +template +static void LaunchScaleKernel(const platform::CUDADeviceContext &dev_ctx, + const T1 *x, const T2 *scale, T1 *y, int n, + gpuStream_t stream) { + int vec_size = std::min(GetChunkedVecSize(x, 0), GetChunkedVecSize(y, 0)); + auto config = platform::GetGpuLaunchConfig1D(dev_ctx, n, vec_size); + +#define PD_LAMB_VEC_SCALE_KERNEL_CASE \ + do { \ + ScaleCUDAKernel<<>>( \ + x, scale, y, n); \ + } while (0) + + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAMB_VEC_SCALE_KERNEL_CASE); +#undef PD_LAMB_VEC_SCALE_KERNEL_CASE +} + template static void NCCLReduceScatterWithScale( const T *sendbuff, T *recvbuff, size_t recvcount, size_t nranks, @@ -775,10 +808,8 @@ static void NCCLReduceScatterWithScale( PADDLE_ENFORCE_EQ(nranks, 1, platform::errors::InvalidArgument( "nranks must be 1 when scale != nullptr.")); - auto numel = recvcount * nranks; - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel); - ScaleCUDAKernel<<>>(sendbuff, scale, recvbuff, numel); + LaunchScaleKernel(dev_ctx, sendbuff, scale, recvbuff, recvcount * nranks, + stream); } return; } @@ -792,9 +823,7 @@ static void NCCLReduceScatterWithScale( if (scale && !should_destroy_op) { size_t numel = recvcount * nranks; T *new_sendbuff = buffer.Alloc(numel); - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel); - ScaleCUDAKernel<<>>(sendbuff, scale, new_sendbuff, numel); + LaunchScaleKernel(dev_ctx, sendbuff, scale, new_sendbuff, numel, stream); sendbuff = new_sendbuff; } -- GitLab From 09258040e2584f4afd9114b994710232e6769970 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Wed, 2 Mar 2022 18:50:26 +0800 Subject: [PATCH 065/272] Move gather.h/gather.cu.h/scatter.h/scatter.cu.h to the phi library (#40043) * move gather.h gather.cu.h scatter.h scatter.cu.h to phi library * fix CI * fix rocm ci --- .../fluid/operators/detection/bbox_util.cu.h | 1 - .../detection/collect_fpn_proposals_op.cu | 10 +- .../detection/collect_fpn_proposals_op.h | 6 +- .../detection/distribute_fpn_proposals_op.cu | 5 +- .../detection/distribute_fpn_proposals_op.h | 15 +- .../detection/generate_mask_labels_op.cc | 1 - .../detection/generate_proposal_labels_op.cc | 16 +- .../detection/generate_proposals_op.cc | 18 +- .../detection/generate_proposals_op.cu | 9 +- .../detection/generate_proposals_v2_op.cc | 18 +- .../detection/generate_proposals_v2_op.cu | 9 +- paddle/fluid/operators/gather_nd_op.cu | 94 +++++----- paddle/fluid/operators/gather_nd_op.h | 66 ++++--- paddle/fluid/operators/gather_op.cu | 32 ++-- paddle/fluid/operators/gather_op.h | 68 +++---- paddle/fluid/operators/gather_test.cc | 4 +- paddle/fluid/operators/grid_sampler_op.h | 1 - .../fluid/operators/math/segment_pooling.cu | 6 +- paddle/fluid/operators/scatter_nd_add_op.cu | 41 ++-- paddle/fluid/operators/scatter_nd_add_op.h | 41 ++-- paddle/fluid/operators/scatter_op.cu | 50 +++-- paddle/fluid/operators/scatter_op.h | 63 +++---- paddle/fluid/operators/scatter_test.cc | 4 +- paddle/fluid/operators/segment_pool_op.cu | 1 - .../sequence_ops/sequence_scatter_op.cc | 2 - .../sequence_ops/sequence_scatter_op.h | 3 +- paddle/fluid/operators/viterbi_decode_op.cu | 38 ++-- paddle/fluid/operators/viterbi_decode_op.h | 128 +++++++------ .../kernels/funcs}/gather.cu.h | 176 +++++++++++------- .../operators => 
phi/kernels/funcs}/gather.h | 114 +++++++----- .../kernels/funcs}/scatter.cu.h | 124 ++++++------ .../operators => phi/kernels/funcs}/scatter.h | 165 ++++++++-------- 32 files changed, 702 insertions(+), 627 deletions(-) rename paddle/{fluid/operators => phi/kernels/funcs}/gather.cu.h (62%) rename paddle/{fluid/operators => phi/kernels/funcs}/gather.h (72%) rename paddle/{fluid/operators => phi/kernels/funcs}/scatter.cu.h (67%) rename paddle/{fluid/operators => phi/kernels/funcs}/scatter.h (65%) diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index b361bc3ab75..f170fbbe4b5 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -23,7 +23,6 @@ limitations under the License. */ #include namespace cub = hipcub; #endif -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index ce9ac3de4e7..860fdd01794 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -23,11 +23,11 @@ namespace cub = hipcub; #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" namespace paddle { namespace operators { @@ -160,9 +160,9 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { sorted_rois.mutable_data({real_post_num, kBBoxSize}, dev_ctx.GetPlace()); Tensor sorted_batch_id; sorted_batch_id.mutable_data({real_post_num}, dev_ctx.GetPlace()); - GPUGather(dev_ctx, concat_rois, index_out_t, &sorted_rois); - GPUGather(dev_ctx, roi_batch_id_list_gpu, index_out_t, - &sorted_batch_id); + phi::funcs::GPUGather(dev_ctx, concat_rois, index_out_t, &sorted_rois); + phi::funcs::GPUGather(dev_ctx, roi_batch_id_list_gpu, index_out_t, + &sorted_batch_id); Tensor batch_index_t; int* batch_idx_in = @@ -190,7 +190,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { out_id_data, batch_idx_in, index_out_t.data(), real_post_num, 0, sizeof(int) * 8, dev_ctx.stream()); - GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); + phi::funcs::GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); Tensor length_lod; int* length_lod_data = diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h index a60f881ebf3..e5ae9a6ccbd 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h @@ -21,7 +21,6 @@ limitations under the License.*/ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -66,7 +65,8 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { auto multi_layer_scores = 
context.MultiInput("MultiLevelScores"); - auto multi_rois_num = context.MultiInput("MultiLevelRoIsNum"); + auto multi_rois_num = + context.MultiInput("MultiLevelRoIsNum"); int num_size = multi_rois_num.size(); auto* fpn_rois = context.Output("FpnRois"); @@ -176,7 +176,7 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { } num_per_batch.emplace_back(post_nms_topN - pre_idx); if (context.HasOutput("RoisNum")) { - auto* rois_num = context.Output("RoisNum"); + auto* rois_num = context.Output("RoisNum"); int* rois_num_data = rois_num->mutable_data({batch_size}, context.GetPlace()); for (int i = 0; i < batch_size; i++) { diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index c117fbd70f5..7ad25e003b4 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -24,9 +24,9 @@ namespace cub = hipcub; #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -193,7 +193,8 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { start = end; multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, dev_ctx.GetPlace()); - GPUGather(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]); + phi::funcs::GPUGather(dev_ctx, *fpn_rois, sub_idx, + multi_fpn_rois[i]); } else { multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h index 628cbcd7611..5479e08c2a5 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -28,10 +27,11 @@ namespace operators { const int kBoxDim = 4; -inline std::vector GetLodFromRoisNum(const Tensor* rois_num) { +inline std::vector GetLodFromRoisNum( + const framework::Tensor* rois_num) { std::vector rois_lod; auto* rois_num_data = rois_num->data(); - Tensor cpu_tensor; + framework::Tensor cpu_tensor; if (platform::is_gpu_place(rois_num->place())) { paddle::framework::TensorCopySync(*rois_num, platform::CPUPlace(), &cpu_tensor); @@ -93,7 +93,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { std::vector fpn_rois_lod; int fpn_rois_num; if (context.HasInput("RoisNum")) { - auto* rois_num = context.Input("RoisNum"); + auto* rois_num = context.Input("RoisNum"); fpn_rois_lod = GetLodFromRoisNum(rois_num); } else { fpn_rois_lod = fpn_rois->lod().back(); @@ -105,7 +105,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { std::vector num_rois_level(num_level, 0); std::vector num_rois_level_integral(num_level + 1, 0); for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) { - Tensor fpn_rois_slice = + auto fpn_rois_slice = fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); const T* rois_data = fpn_rois_slice.data(); for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) { @@ -140,7 +140,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { std::vector restore_index_inter(fpn_rois_num, -1); // distribute the rois into different fpn level by target level for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) { - Tensor fpn_rois_slice = + auto fpn_rois_slice = fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); const T* rois_data = fpn_rois_slice.data(); size_t cur_offset = fpn_rois_lod[i]; @@ -163,7 +163,8 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { for (int i = 0; i < fpn_rois_num; ++i) { restore_index_data[restore_index_inter[i]] = i; } - auto multi_rois_num = context.MultiOutput("MultiLevelRoIsNum"); + auto multi_rois_num = + context.MultiOutput("MultiLevelRoIsNum"); if (multi_rois_num.size() > 0) { int batch_size = fpn_rois_lod.size() - 1; for (int i = 0; i < num_level; ++i) { diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index e6af1a5bbf7..c9cc4e72207 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/mask_util.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index 424aa071440..cbf17048400 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -281,22 +281,22 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, Tensor fg_boxes, bg_boxes, fg_labels, bg_labels; fg_boxes.mutable_data({fg_num, kBoxDim}, context.GetPlace()); - CPUGather(context, boxes, fg_inds_t, &fg_boxes); + phi::funcs::CPUGather(context, boxes, fg_inds_t, &fg_boxes); bg_boxes.mutable_data({bg_num, kBoxDim}, context.GetPlace()); - CPUGather(context, boxes, bg_inds_t, &bg_boxes); + phi::funcs::CPUGather(context, boxes, bg_inds_t, &bg_boxes); Concat(context, fg_boxes, bg_boxes, sampled_boxes); - CPUGather(context, gt_boxes, gt_box_inds_t, sampled_gts); + phi::funcs::CPUGather(context, gt_boxes, gt_box_inds_t, sampled_gts); fg_labels.mutable_data({fg_num}, context.GetPlace()); - CPUGather(context, gt_classes, gt_label_inds_t, &fg_labels); + phi::funcs::CPUGather(context, gt_classes, gt_label_inds_t, &fg_labels); bg_labels.mutable_data({bg_num}, context.GetPlace()); phi::funcs::set_constant(context, &bg_labels, 0); Concat(context, fg_labels, bg_labels, sampled_labels); Tensor fg_max_overlap, bg_max_overlap; fg_max_overlap.mutable_data({fg_num}, context.GetPlace()); - CPUGather(context, max_overlap, fg_inds_t, &fg_max_overlap); + phi::funcs::CPUGather(context, max_overlap, fg_inds_t, &fg_max_overlap); bg_max_overlap.mutable_data({bg_num}, context.GetPlace()); - CPUGather(context, max_overlap, bg_inds_t, &bg_max_overlap); + phi::funcs::CPUGather(context, max_overlap, bg_inds_t, &bg_max_overlap); Concat(context, fg_max_overlap, bg_max_overlap, sampled_max_overlap); } @@ -334,7 +334,7 @@ std::vector SampleRoisForOneImage( } else { proposals_num = keep.numel(); roi_filter.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); - CPUGather(context, rpn_rois, keep, &roi_filter); + phi::funcs::CPUGather(context, rpn_rois, keep, &roi_filter); } T* roi_filter_dt = roi_filter.data(); memcpy(rpn_rois_dt, roi_filter_dt, roi_filter.numel() * sizeof(T)); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 8c4bd4ac613..d6130823271 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/nms_util.h" -#include "paddle/fluid/operators/gather.h" +#include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -196,10 +196,10 @@ class GenerateProposalsKernel : public framework::OpKernel { anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); - CPUGather(ctx, scores_slice, index_t, &scores_sel); - CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); - CPUGather(ctx, anchors, index_t, &anchor_sel); - CPUGather(ctx, variances, index_t, &var_sel); + phi::funcs::CPUGather(ctx, scores_slice, index_t, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + phi::funcs::CPUGather(ctx, anchors, index_t, &anchor_sel); + phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); Tensor proposals; proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); @@ -223,8 +223,8 @@ class GenerateProposalsKernel : public framework::OpKernel { Tensor scores_filter; bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); scores_filter.mutable_data({keep.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, proposals, keep, &bbox_sel); - CPUGather(ctx, scores_sel, keep, &scores_filter); + phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); + phi::funcs::CPUGather(ctx, scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(bbox_sel, scores_filter); } @@ -237,8 +237,8 @@ class GenerateProposalsKernel : public framework::OpKernel { proposals.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_sel.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, bbox_sel, keep_nms, &proposals); - CPUGather(ctx, scores_filter, keep_nms, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_sel, keep_nms, &proposals); + phi::funcs::CPUGather(ctx, scores_filter, keep_nms, &scores_sel); return std::make_pair(proposals, scores_sel); } diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 6e3c322c174..5fb7973fd89 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/bbox_util.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -85,8 +86,8 @@ static std::pair ProposalForOneImage( } proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); - GPUGather(ctx, proposals, keep_index, &proposals_filter); - GPUGather(ctx, scores_sort, keep_index, &scores_filter); + phi::funcs::GPUGather(ctx, proposals, keep_index, &proposals_filter); + phi::funcs::GPUGather(ctx, scores_sort, keep_index, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(proposals_filter, scores_filter); @@ -102,8 +103,8 @@ static std::pair ProposalForOneImage( Tensor scores_nms, proposals_nms; proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); - GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + phi::funcs::GPUGather(ctx, scores_filter, keep_nms, &scores_nms); return std::make_pair(proposals_nms, scores_nms); } diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index 6351ea865cd..1f1802574c5 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/nms_util.h" -#include "paddle/fluid/operators/gather.h" +#include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -197,10 +197,10 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); - CPUGather(ctx, scores_slice, index_t, &scores_sel); - CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); - CPUGather(ctx, anchors, index_t, &anchor_sel); - CPUGather(ctx, variances, index_t, &var_sel); + phi::funcs::CPUGather(ctx, scores_slice, index_t, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + phi::funcs::CPUGather(ctx, anchors, index_t, &anchor_sel); + phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); Tensor proposals; proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); @@ -227,8 +227,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { Tensor scores_filter; bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); scores_filter.mutable_data({keep.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, proposals, keep, &bbox_sel); - CPUGather(ctx, scores_sel, keep, &scores_filter); + phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); + phi::funcs::CPUGather(ctx, scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(bbox_sel, scores_filter); } @@ -242,8 +242,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { proposals.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_sel.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, bbox_sel, keep_nms, &proposals); - 
CPUGather(ctx, scores_filter, keep_nms, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_sel, keep_nms, &proposals); + phi::funcs::CPUGather(ctx, scores_filter, keep_nms, &scores_sel); return std::make_pair(proposals, scores_sel); } diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu index 93ba3deca5f..005309e8ee5 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/bbox_util.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -86,8 +87,8 @@ static std::pair ProposalForOneImage( } proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); - GPUGather(ctx, proposals, keep_index, &proposals_filter); - GPUGather(ctx, scores_sort, keep_index, &scores_filter); + phi::funcs::GPUGather(ctx, proposals, keep_index, &proposals_filter); + phi::funcs::GPUGather(ctx, scores_sort, keep_index, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(proposals_filter, scores_filter); @@ -104,8 +105,8 @@ static std::pair ProposalForOneImage( Tensor scores_nms, proposals_nms; proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); - GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + phi::funcs::GPUGather(ctx, scores_filter, keep_nms, &scores_nms); return std::make_pair(proposals_nms, scores_nms); } diff --git a/paddle/fluid/operators/gather_nd_op.cu b/paddle/fluid/operators/gather_nd_op.cu index 0de2798bf75..338c4411618 100644 --- a/paddle/fluid/operators/gather_nd_op.cu +++ b/paddle/fluid/operators/gather_nd_op.cu @@ -13,14 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. 
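The CUDA variants above make the same swap to phi::funcs::GPUGather, now fed the CUDA device context taken from the execution context. A hypothetical .cu-side sketch of the filter step (the function name and shapes are illustrative; T and the int index dtype are assumptions):

    #include "paddle/fluid/framework/tensor.h"
    #include "paddle/fluid/platform/device_context.h"
    #include "paddle/phi/kernels/funcs/gather.cu.h"

    namespace {
    // Keep only the proposals selected by keep_index, as ProposalForOneImage
    // does after its top-k and NMS stages.
    template <typename T>
    void FilterProposalsByIndex(
        const paddle::platform::CUDADeviceContext& dev_ctx,
        const paddle::framework::Tensor& proposals,   // [N, 4], dtype T
        const paddle::framework::Tensor& keep_index,  // [K], int
        paddle::framework::Tensor* filtered) {        // [K, 4], dtype T
      filtered->mutable_data<T>({keep_index.numel(), 4}, dev_ctx.GetPlace());
      phi::funcs::GPUGather<T>(dev_ctx, proposals, keep_index, filtered);
    }
    }  // namespace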
*/ #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_nd_op.h" -#include "paddle/fluid/operators/scatter.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" namespace paddle { namespace operators { -template +template class GatherNdOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -33,27 +33,25 @@ class GatherNdOpCUDAKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - GPUGatherNd(ctx, *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - GPUGatherNd(ctx, *x, *index, output); + const auto &index_type = index->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s], but " + "desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.cuda_device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUGatherNd(dev_ctx, *x, *index, output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GPUGatherNd(dev_ctx, *x, *index, output); } } }; -template +template class GatherNdGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -71,24 +69,22 @@ class GatherNdGradOpCUDAKernel : public framework::OpKernel { dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; + const auto &index_type = index->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterNdAdd(ctx, *dO, *index, dX); - } else if (index_type == framework::proto::VarType::INT64) { - GPUScatterNdAdd(ctx, *dO, *index, dX); + auto &dev_ctx = ctx.cuda_device_context(); + if 
(index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterNdAdd(dev_ctx, *dO, *index, dX); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GPUScatterNdAdd(dev_ctx, *dO, *index, dX); } } }; @@ -98,18 +94,16 @@ class GatherNdGradOpCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(gather_nd_grad, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather_nd_grad, ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_nd_op.h b/paddle/fluid/operators/gather_nd_op.h index f458c0e1801..d54261008e4 100644 --- a/paddle/fluid/operators/gather_nd_op.h +++ b/paddle/fluid/operators/gather_nd_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { @@ -38,22 +38,20 @@ class GatherNdOpKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - CPUGatherNd(ctx.device_context(), *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - CPUGatherNd(ctx.device_context(), *x, *index, output); + auto index_type = index->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s]", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.template device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGatherNd(dev_ctx, *x, *index, output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::CPUGatherNd(dev_ctx, *x, *index, output); } } }; @@ -65,6 +63,7 @@ class GatherNdGradOpKernel : public framework::OpKernel { 
PADDLE_ENFORCE_EQ( platform::is_cpu_place(ctx.GetPlace()), true, platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); + auto *index = ctx.Input("Index"); auto *dX = ctx.Output(framework::GradVarName("X")); auto *dO = ctx.Input(framework::GradVarName("Out")); @@ -75,22 +74,21 @@ class GatherNdGradOpKernel : public framework::OpKernel { dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - ScatterNdAdd(ctx, *dO, *index, dX); - } else if (index_type == framework::proto::VarType::INT64) { - ScatterNdAdd(ctx, *dO, *index, dX); + auto index_type = index->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s]", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + + auto &dev_ctx = ctx.template device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterNdAdd(dev_ctx, *dO, *index, dX); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::ScatterNdAdd(dev_ctx, *dO, *index, dX); } } }; diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index a502a130409..8f1d9284c50 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -14,9 +14,9 @@ limitations under the License. 
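Every gather_nd hunk above follows the same shape: read index->dtype() as a phi::DataType, enforce int32/int64, then dispatch the templated phi::funcs helper on the concrete index type. A condensed CPU-side sketch of that dispatch (the wrapper name is illustrative; the <T, int32_t>/<T, int64_t> arguments are assumptions since the rendering drops them):

    #include "paddle/fluid/framework/tensor.h"
    #include "paddle/fluid/platform/device_context.h"
    #include "paddle/fluid/platform/enforce.h"
    #include "paddle/phi/common/data_type.h"
    #include "paddle/phi/kernels/funcs/gather.h"

    namespace {
    // Validate the index dtype and dispatch CPUGatherNd on int32_t / int64_t,
    // mirroring GatherNdOpKernel above. T is the value dtype.
    template <typename T>
    void GatherNdDispatch(const paddle::platform::CPUDeviceContext& dev_ctx,
                          const paddle::framework::Tensor& x,
                          const paddle::framework::Tensor& index,
                          paddle::framework::Tensor* output) {
      const auto index_type = index.dtype();
      PADDLE_ENFORCE_EQ(
          index_type == phi::DataType::INT32 ||
              index_type == phi::DataType::INT64,
          true,
          paddle::platform::errors::InvalidArgument(
              "Index holds the wrong type, it holds [%s], but desires to be "
              "[%s] or [%s].",
              index_type, phi::DataType::INT32, phi::DataType::INT64));
      if (index_type == phi::DataType::INT32) {
        phi::funcs::CPUGatherNd<T, int32_t>(dev_ctx, x, index, output);
      } else {
        phi::funcs::CPUGatherNd<T, int64_t>(dev_ctx, x, index, output);
      }
    }
    }  // namespace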
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" namespace paddle { namespace operators { @@ -49,11 +49,14 @@ class GatherOpCUDAKernel : public framework::OpKernel { } const auto &place = ctx.GetPlace(); const auto &index_type = framework::TransToProtoVarType(index->dtype()); + const auto &dev_ctx = ctx.cuda_device_context(); if (axis != 0) { if (index_type == framework::proto::VarType::INT32) { - GatherV2CUDAFunction(x, index, axis, output, place, ctx); + phi::funcs::GatherV2CUDAFunction(x, index, axis, output, + dev_ctx); } else if (index_type == framework::proto::VarType::INT64) { - GatherV2CUDAFunction(x, index, axis, output, place, ctx); + phi::funcs::GatherV2CUDAFunction(x, index, axis, output, + dev_ctx); } return; } @@ -61,9 +64,9 @@ class GatherOpCUDAKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; if (index_type == framework::proto::VarType::INT32) { - GPUGather(ctx.device_context(), *x, *index, output); + phi::funcs::GPUGather(dev_ctx, *x, *index, output); } else if (index_type == framework::proto::VarType::INT64) { - GPUGather(ctx.device_context(), *x, *index, output); + phi::funcs::GPUGather(dev_ctx, *x, *index, output); } } }; @@ -93,14 +96,15 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { } } + const auto &dev_ctx = ctx.cuda_device_context(); const auto &index_type = framework::TransToProtoVarType(index->dtype()); if (axis != 0) { if (index_type == framework::proto::VarType::INT32) { - GatherV2GradCUDAFunction(dO, index, axis, dX, - ctx.GetPlace(), ctx); + phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, + dev_ctx); } else if (index_type == framework::proto::VarType::INT64) { - GatherV2GradCUDAFunction(dO, index, axis, dX, - ctx.GetPlace(), ctx); + phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, + dev_ctx); } return; } @@ -112,11 +116,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; if (index_type == framework::proto::VarType::INT32) { - GPUScatterAssign(ctx, *dO, *index, dX, - ctx.Attr("overwrite")); + phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, + ctx.Attr("overwrite")); } else if (index_type == framework::proto::VarType::INT64) { - GPUScatterAssign(ctx, *dO, *index, dX, - ctx.Attr("overwrite")); + phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, + ctx.Attr("overwrite")); } } }; diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h index 016c2b398da..94de694b2f9 100644 --- a/paddle/fluid/operators/gather_op.h +++ b/paddle/fluid/operators/gather_op.h @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { @@ -40,31 +40,32 @@ class GatherOpKernel : public framework::OpKernel { // get axis from tensor if (ctx.HasInput("Axis")) { const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { + const auto &axis_type = axis_tensor->dtype(); + if (axis_type == phi::DataType::INT32) { axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { + } else if (axis_type == phi::DataType::INT64) { axis = static_cast(axis_tensor->data()[0]); } } - const auto &place = ctx.GetPlace(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); + const auto &index_type = index->dtype(); + auto &dev_ctx = ctx.template device_context(); if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - GatherV2Function(x, index, axis, output, place); - } else if (index_type == framework::proto::VarType::INT64) { - GatherV2Function(x, index, axis, output, place); + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2Function(dev_ctx, x, index, axis, + output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2Function(dev_ctx, x, index, axis, + output); } return; } output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - if (index_type == framework::proto::VarType::INT32) { - CPUGather(ctx.device_context(), *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - CPUGather(ctx.device_context(), *x, *index, output); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGather(dev_ctx, *x, *index, output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::CPUGather(dev_ctx, *x, *index, output); } } }; @@ -84,44 +85,45 @@ class GatherGradientOpKernel : public framework::OpKernel { int axis = ctx.Attr("axis"); if (ctx.HasInput("Axis")) { const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { + const auto &axis_type = axis_tensor->dtype(); + if (axis_type == phi::DataType::INT32) { axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { + } else if (axis_type == phi::DataType::INT64) { axis = static_cast(axis_tensor->data()[0]); } } - const auto &index_type = framework::TransToProtoVarType(index->dtype()); + const auto &index_type = index->dtype(); + auto &dev_ctx = ctx.template device_context(); if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - GatherV2GradFunction(dO, index, axis, dX, ctx.GetPlace()); - } else if (index_type == framework::proto::VarType::INT64) { - GatherV2GradFunction(dO, index, axis, dX, ctx.GetPlace()); + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, + dX); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, + dX); } return; } dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = 
*ctx.template device_context() - .eigen_device(); + auto &place = *dev_ctx.eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; bool overwrite = ctx.Attr("overwrite"); - if (index_type == framework::proto::VarType::INT32) { + if (index_type == phi::DataType::INT32) { if (overwrite) { - ScatterAssign(ctx.device_context(), *dO, *index, dX); + phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); } else { - ScatterAssignAdd(ctx, *dO, *index, dX); + phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); } - } else if (index_type == framework::proto::VarType::INT64) { + } else if (index_type == phi::DataType::INT64) { if (overwrite) { - ScatterAssign(ctx.device_context(), *dO, *index, dX); + phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); } else { - ScatterAssignAdd(ctx, *dO, *index, dX); + phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); } } } diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc index 0f3dcdadcf8..c962dd06523 100644 --- a/paddle/fluid/operators/gather_test.cc +++ b/paddle/fluid/operators/gather_test.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/kernels/funcs/gather.h" TEST(Gather, GatherData) { paddle::framework::Tensor* src = new paddle::framework::Tensor(); @@ -39,7 +39,7 @@ TEST(Gather, GatherData) { auto* cpu_place = new paddle::platform::CPUPlace(); paddle::platform::CPUDeviceContext ctx(*cpu_place); - paddle::operators::CPUGather(ctx, *src, *index, output); + phi::funcs::CPUGather(ctx, *src, *index, output); delete cpu_place; cpu_place = NULL; for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4); diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index 8f3c6660f51..93e96694270 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu index bb6d8756bd0..fbdcb99c02a 100644 --- a/paddle/fluid/operators/math/segment_pooling.cu +++ b/paddle/fluid/operators/math/segment_pooling.cu @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
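The gather kernels above also move the Axis side-input check onto phi::DataType. A standalone sketch of that small helper (the function name and the fallback parameter are illustrative; reading the first int32/int64 element is the behaviour the hunks show):

    #include <cstdint>

    #include "paddle/fluid/framework/tensor.h"
    #include "paddle/phi/common/data_type.h"

    namespace {
    // Read the gather axis from an auxiliary CPU tensor whose dtype may be
    // int32 or int64, as GatherOpKernel / GatherGradientOpKernel above do.
    // Returns fallback (the "axis" attribute value) for any other dtype,
    // matching the kernels, which only overwrite axis for these two types.
    inline int AxisFromTensorOr(const paddle::framework::Tensor& axis_tensor,
                                int fallback) {
      const auto axis_type = axis_tensor.dtype();
      if (axis_type == phi::DataType::INT32) {
        return static_cast<int>(axis_tensor.data<int32_t>()[0]);
      }
      if (axis_type == phi::DataType::INT64) {
        return static_cast<int>(axis_tensor.data<int64_t>()[0]);
      }
      return fallback;
    }
    }  // namespace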
*/ #include -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/segment_pooling.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -379,9 +379,9 @@ class SegmentPoolGradFunctor { SimpleDiv<<>>(mean_grad.data(), summed_ids->data(), len, dim); - GPUGather(context, mean_grad, segments, in_grad); + phi::funcs::GPUGather(context, mean_grad, segments, in_grad); } else if (pooltype == "SUM") { - GPUGather(context, out_grad, segments, in_grad); + phi::funcs::GPUGather(context, out_grad, segments, in_grad); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported segment pooling operation, Only MEAN, SUM, MAX, MIN " diff --git a/paddle/fluid/operators/scatter_nd_add_op.cu b/paddle/fluid/operators/scatter_nd_add_op.cu index 6448f8cc405..2fe3fcb759d 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.cu +++ b/paddle/fluid/operators/scatter_nd_add_op.cu @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter.cu.h" #include "paddle/fluid/operators/scatter_nd_add_op.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" namespace paddle { namespace operators { @@ -33,22 +33,20 @@ class ScatterNdAddOpCUDAKernel : public framework::OpKernel { auto *Out = ctx.Output("Out"); framework::TensorCopySync(*X, ctx.GetPlace(), Out); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterNdAdd(ctx, *Updates, *Ids, Out); + const auto &index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s], but " + "desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.cuda_device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterNdAdd(dev_ctx, *Updates, *Ids, Out); } else { - GPUScatterNdAdd(ctx, *Updates, *Ids, Out); + phi::funcs::GPUScatterNdAdd(dev_ctx, *Updates, *Ids, Out); } } }; @@ -69,12 +67,13 @@ class ScatterNdAddGradOpCUDAKernel : public framework::OpKernel { } if (dUpdates) { dUpdates->mutable_data(ctx.GetPlace()); + auto &dev_ctx = ctx.cuda_device_context(); // Gradient by Gather - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - if (index_type == framework::proto::VarType::INT32) { - GPUGatherNd(ctx, *dOut, *Ids, dUpdates); + const auto &index_type = Ids->dtype(); + if (index_type 
== phi::DataType::INT32) { + phi::funcs::GPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); } else { - GPUGatherNd(ctx, *dOut, *Ids, dUpdates); + phi::funcs::GPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); } } } diff --git a/paddle/fluid/operators/scatter_nd_add_op.h b/paddle/fluid/operators/scatter_nd_add_op.h index 2bdf9ec58a8..81c95fe55ab 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.h +++ b/paddle/fluid/operators/scatter_nd_add_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { @@ -37,23 +37,21 @@ class ScatterNdAddOpKernel : public framework::OpKernel { // In place output: Out = X framework::TensorCopySync(*X, ctx.GetPlace(), Out); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + const auto &index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s], but " + "desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); - if (index_type == framework::proto::VarType::INT32) { - ScatterNdAdd(ctx, *Updates, *Ids, Out); + auto &dev_ctx = ctx.template device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterNdAdd(dev_ctx, *Updates, *Ids, Out); } else { - ScatterNdAdd(ctx, *Updates, *Ids, Out); + phi::funcs::ScatterNdAdd(dev_ctx, *Updates, *Ids, Out); } } }; @@ -76,11 +74,12 @@ class ScatterNdAddGradientOpKernel : public framework::OpKernel { if (dUpdates) { dUpdates->mutable_data(ctx.GetPlace()); // Gradient by Gather: dUpdates = dO[Ids] - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - if (index_type == framework::proto::VarType::INT32) { - CPUGatherNd(ctx.device_context(), *dOut, *Ids, dUpdates); + const auto &index_type = Ids->dtype(); + auto &dev_ctx = ctx.template device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); } else { - CPUGatherNd(ctx.device_context(), *dOut, *Ids, dUpdates); + phi::funcs::CPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); } } } diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu index 549e30803b4..7755e376bc1 100644 --- a/paddle/fluid/operators/scatter_op.cu +++ b/paddle/fluid/operators/scatter_op.cu @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
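scatter_nd_add keeps its structure through the migration: the forward copies X into Out and accumulates Updates at Ids, and the gradient w.r.t. Updates is simply a gather_nd of dOut. A condensed CPU sketch with illustrative names and int32 indices assumed (the op kernels dispatch on int64 the same way):

    #include "paddle/fluid/framework/tensor.h"
    #include "paddle/fluid/framework/tensor_util.h"
    #include "paddle/fluid/platform/device_context.h"
    #include "paddle/phi/kernels/funcs/gather.h"
    #include "paddle/phi/kernels/funcs/scatter.h"

    namespace {
    // Forward: Out = X, then Out[Ids] += Updates.
    template <typename T>
    void ScatterNdAddForward(const paddle::platform::CPUDeviceContext& dev_ctx,
                             const paddle::framework::Tensor& x,
                             const paddle::framework::Tensor& ids,
                             const paddle::framework::Tensor& updates,
                             paddle::framework::Tensor* out) {
      paddle::framework::TensorCopySync(x, dev_ctx.GetPlace(), out);
      phi::funcs::ScatterNdAdd<T, int32_t>(dev_ctx, updates, ids, out);
    }

    // Backward w.r.t. Updates: dUpdates = dOut[Ids]. dupdates is expected to
    // already carry the Updates shape; the op kernel sets it via InferShape
    // and allocates with mutable_data before gathering.
    template <typename T>
    void ScatterNdAddGradForUpdates(
        const paddle::platform::CPUDeviceContext& dev_ctx,
        const paddle::framework::Tensor& dout,
        const paddle::framework::Tensor& ids,
        paddle::framework::Tensor* dupdates) {
      dupdates->mutable_data<T>(dev_ctx.GetPlace());
      phi::funcs::CPUGatherNd<T, int32_t>(dev_ctx, dout, ids, dupdates);
    }
    }  // namespace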
*/ -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter.cu.h" #include "paddle/fluid/operators/scatter_op.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" namespace paddle { namespace operators { @@ -35,23 +35,22 @@ class ScatterOpCUDAKernel : public framework::OpKernel { framework::TensorCopy(*X, ctx.GetPlace(), Out); // use template class to support int32_t and int64_t - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; + auto index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; PADDLE_ENFORCE_EQ( index_type_match, true, platform::errors::InvalidArgument( "scatter_op Index holds the wrong type, it holds [%s]," "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterAssign(ctx, *Updates, *Ids, Out, overwrite); + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.cuda_device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterAssign(dev_ctx, *Updates, *Ids, Out, + overwrite); } else { - GPUScatterAssign(ctx, *Updates, *Ids, Out, overwrite); + phi::funcs::GPUScatterAssign(dev_ctx, *Updates, *Ids, Out, + overwrite); } } }; @@ -68,36 +67,33 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel { auto *Ids = ctx.Input("Ids"); auto *dOut = ctx.Input(framework::GradVarName("Out")); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; + auto index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; PADDLE_ENFORCE_EQ( index_type_match, true, platform::errors::InvalidArgument( "scatter_op index holds the wrong type, it holds [%s]," "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.cuda_device_context(); if (dX) { framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterGradForX(ctx.device_context(), *Ids, dX); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterGradForX(dev_ctx, *Ids, dX); } else { - GPUScatterGradForX(ctx.device_context(), *Ids, dX); + phi::funcs::GPUScatterGradForX(dev_ctx, *Ids, dX); } } if (dUpdates) { dUpdates->mutable_data(ctx.GetPlace()); // Gradient by Gather: dUpdates = dO[Ids] - if (index_type == framework::proto::VarType::INT32) { - GPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUGather(dev_ctx, *dOut, *Ids, dUpdates); } else { - GPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + phi::funcs::GPUGather(dev_ctx, *dOut, *Ids, dUpdates); } } } diff --git 
a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h index 69ab6c7135c..7733181a93f 100644 --- a/paddle/fluid/operators/scatter_op.h +++ b/paddle/fluid/operators/scatter_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { @@ -39,29 +39,27 @@ class ScatterOpKernel : public framework::OpKernel { // In place output: Out = X, Out[Ids] = Updates framework::TensorCopy(*X, ctx.GetPlace(), Out); // Apply ScatterUpdate: Out[index] = Updates[:] - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + const auto &index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.template device_context(); if (overwrite) { - if (index_type == framework::proto::VarType::INT32) { - ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterAssign(dev_ctx, *Updates, *Ids, Out); } else { - ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); + phi::funcs::ScatterAssign(dev_ctx, *Updates, *Ids, Out); } } else { - if (index_type == framework::proto::VarType::INT32) { - ScatterAssignAdd(ctx, *Updates, *Ids, Out); + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterAssignAdd(dev_ctx, *Updates, *Ids, Out); } else { - ScatterAssignAdd(ctx, *Updates, *Ids, Out); + phi::funcs::ScatterAssignAdd(dev_ctx, *Updates, *Ids, Out); } } } @@ -79,36 +77,33 @@ class ScatterGradientOpKernel : public framework::OpKernel { auto *Ids = ctx.Input("Ids"); auto *dOut = ctx.Input(framework::GradVarName("Out")); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; + const auto &index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; PADDLE_ENFORCE_EQ( index_type_match, true, platform::errors::InvalidArgument( "scatter_op index holds the wrong type, it holds [%s]," "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.template device_context(); if (dX) { framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - if (index_type 
== framework::proto::VarType::INT32) { - CPUScatterGradForX(ctx.device_context(), *Ids, dX); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUScatterGradForX(dev_ctx, *Ids, dX); } else { - CPUScatterGradForX(ctx.device_context(), *Ids, dX); + phi::funcs::CPUScatterGradForX(dev_ctx, *Ids, dX); } } if (dUpdates) { dUpdates->mutable_data(ctx.GetPlace()); // Gradient by Gather: dUpdates = dO[Ids] - if (index_type == framework::proto::VarType::INT32) { - CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGather(dev_ctx, *dOut, *Ids, dUpdates); } else { - CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + phi::funcs::CPUGather(dev_ctx, *dOut, *Ids, dUpdates); } } } diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc index 0a4cab5fac1..93f2d60e5f2 100644 --- a/paddle/fluid/operators/scatter_test.cc +++ b/paddle/fluid/operators/scatter_test.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/scatter.h" #include @@ -43,7 +43,7 @@ TEST(scatter, ScatterUpdate) { auto* cpu_place = new paddle::platform::CPUPlace(); paddle::platform::CPUDeviceContext ctx(*cpu_place); - paddle::operators::ScatterAssign(ctx, src, index, &output); + phi::funcs::ScatterAssign(ctx, src, index, &output); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output.data()[i], 0.0f); diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu index 4e20844dc32..e147e62a983 100644 --- a/paddle/fluid/operators/segment_pool_op.cu +++ b/paddle/fluid/operators/segment_pool_op.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/segment_pool_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc index 2d4730635fd..25c12ab565a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc @@ -16,8 +16,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h index 365381abc46..2960b77d5ac 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h @@ -15,8 +15,7 @@ limitations under the License. 
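The scatter kernels above preserve the overwrite switch: assignment semantics go through ScatterAssign and accumulation through ScatterAssignAdd, both now taken from phi::funcs and handed the concrete device context. A minimal CPU sketch modeled on ScatterOpKernel and scatter_test.cc, with an illustrative wrapper name and int32 indices assumed:

    #include "paddle/fluid/framework/tensor.h"
    #include "paddle/fluid/framework/tensor_util.h"
    #include "paddle/fluid/platform/device_context.h"
    #include "paddle/phi/kernels/funcs/scatter.h"

    namespace {
    // Out = X, then either Out[Ids] = Updates (overwrite) or Out[Ids] += Updates.
    template <typename T>
    void ScatterForward(const paddle::platform::CPUDeviceContext& dev_ctx,
                        const paddle::framework::Tensor& x,
                        const paddle::framework::Tensor& ids,  // [K], int32 rows
                        const paddle::framework::Tensor& updates,
                        bool overwrite,
                        paddle::framework::Tensor* out) {
      paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), out);
      if (overwrite) {
        phi::funcs::ScatterAssign<T, int32_t>(dev_ctx, updates, ids, out);
      } else {
        phi::funcs::ScatterAssignAdd<T, int32_t>(dev_ctx, updates, ids, out);
      }
    }
    }  // namespace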
*/ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu index 3c546dd8156..68628fb2748 100644 --- a/paddle/fluid/operators/viterbi_decode_op.cu +++ b/paddle/fluid/operators/viterbi_decode_op.cu @@ -11,8 +11,8 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_functor.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/viterbi_decode_op.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #ifdef __NVCC__ #include "cub/cub.cuh" @@ -62,10 +62,11 @@ int64_t ComputeBlockSize(int64_t col) { template