From c0e29233b4b9e54bccab478bbf2f7efcb80756af Mon Sep 17 00:00:00 2001
From: From00
Date: Fri, 11 Mar 2022 17:25:52 +0800
Subject: [PATCH 001/176] Move psroi_pool OP to phi (#40353)

* Move psroi_pool OP to phi
* Replace platform::TensorCopy with phi::Copy
---
 paddle/fluid/operators/psroi_pool_op.cc       | 107 +-----
 paddle/fluid/operators/psroi_pool_op.cu       | 350 ------------------
 paddle/fluid/operators/psroi_pool_op.h        | 295 ---------------
 paddle/phi/infermeta/backward.cc              |  12 +
 paddle/phi/infermeta/backward.h               |  10 +
 paddle/phi/infermeta/multiary.cc              | 259 ++++++++-----
 paddle/phi/infermeta/multiary.h               |  55 +--
 .../phi/kernels/cpu/psroi_pool_grad_kernel.cc | 140 +++++++
 paddle/phi/kernels/cpu/psroi_pool_kernel.cc   | 174 +++++++++
 .../phi/kernels/gpu/psroi_pool_grad_kernel.cu | 193 ++++++++++
 paddle/phi/kernels/gpu/psroi_pool_kernel.cu   | 231 ++++++++++++
 paddle/phi/kernels/psroi_pool_grad_kernel.h   |  34 ++
 paddle/phi/kernels/psroi_pool_kernel.h        |  33 ++
 paddle/phi/ops/compat/psroi_pool_sig.cc       |  40 ++
 14 files changed, 1079 insertions(+), 854 deletions(-)
 delete mode 100644 paddle/fluid/operators/psroi_pool_op.cu
 delete mode 100644 paddle/fluid/operators/psroi_pool_op.h
 create mode 100644 paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/psroi_pool_kernel.cc
 create mode 100644 paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu
 create mode 100644 paddle/phi/kernels/gpu/psroi_pool_kernel.cu
 create mode 100644 paddle/phi/kernels/psroi_pool_grad_kernel.h
 create mode 100644 paddle/phi/kernels/psroi_pool_kernel.h
 create mode 100644 paddle/phi/ops/compat/psroi_pool_sig.cc

diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc
index da637dfeb23..cfacffff234 100644
--- a/paddle/fluid/operators/psroi_pool_op.cc
+++ b/paddle/fluid/operators/psroi_pool_op.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
*/ -#include "paddle/fluid/operators/psroi_pool_op.h" -#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -82,75 +82,6 @@ class PSROIPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of PSROIPoolOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true, - platform::errors::InvalidArgument( - "Input(ROIs) of PSROIPoolOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of PSROIPoolOp should not be null.")); - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - PADDLE_ENFORCE_EQ(input_dims.size(), 4, - platform::errors::InvalidArgument( - "The format of input tensor is NCHW")); - PADDLE_ENFORCE_EQ( - rois_dims.size(), 2, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - PADDLE_ENFORCE_EQ( - rois_dims[1], 4, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, - platform::errors::InvalidArgument( - "The second dimension of RoisNum should " - "be 1, but received dimension is %d", - rois_num_dims.size())); - } - int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - int output_channels = ctx->Attrs().Get("output_channels"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_EQ( - input_dims[1], output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "the channel of X(%d) " - "should be equal to the product of " - "output_channels(%d), pooled_height(%d) and pooled_width(%d)", - input_dims[1], output_channels, pooled_height, pooled_width)); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::InvalidArgument( - "The pooled output height must be greater than 0")); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::InvalidArgument( - "The pooled output width must be greater than 0")); - PADDLE_ENFORCE_GT(output_channels, 1, - platform::errors::InvalidArgument( - "The pooled output channels must greater than 1")); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::InvalidArgument( - "The spatial scale must greater than 0.")); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = - output_channels; // input_dims[1] / (pooled_height * pooled_width); - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - ctx->SetOutputDim("Out", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -164,16 +95,6 @@ class PSROIPoolGradOp : public framework::OperatorWithKernel { public: using 
framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument( - "The gradient of Out should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::InvalidArgument( - "The gradient of X should not be null.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -204,15 +125,13 @@ class PSROIPoolGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(psroi_pool, PsroiPoolInferShapeFunctor, + PD_INFER_META(phi::PsroiPoolInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(psroi_pool_grad, PsroiPoolGradInferShapeFunctor, + PD_INFER_META(phi::PsroiPoolGradInferMeta)); REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker, ops::PSROIPoolGradMaker, - ops::PSROIPoolGradMaker); -REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp); -REGISTER_OP_CPU_KERNEL( - psroi_pool, - ops::CPUPSROIPoolOpKernel, - ops::CPUPSROIPoolOpKernel); -REGISTER_OP_CPU_KERNEL( - psroi_pool_grad, - ops::CPUPSROIPoolGradOpKernel, - ops::CPUPSROIPoolGradOpKernel); + ops::PSROIPoolGradMaker, + PsroiPoolInferShapeFunctor); +REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp, + PsroiPoolGradInferShapeFunctor); diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu deleted file mode 100644 index c1917501db8..00000000000 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ /dev/null @@ -1,350 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/psroi_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaximumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaximumNumBlocks); -} - -template -__global__ void GPUPSROIPoolForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int input_channels, const int height, - const int width, const int output_channels, const int pooled_height, - const int pooled_width, const int* rois_batch_id_data, T* output_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (size_t i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = min(max(hstart, 0), height); - hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0), width); - wend = min(max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - const T* offset_input_data = - input_data + - (roi_batch_id * input_channels + input_channel) * height * width; - T outsum = 0; - - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - outsum += offset_input_data[input_index]; - } - } - - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - output_data[i] = is_empty ? 0. 
: outsum / bin_area; - } -} - -template -__global__ void GPUPSROIPoolBackward( - const int nthreads, const T* input_rois, const T* output_grad_data, - const float spatial_scale, const int input_channels, const int height, - const int width, const int output_channels, const int pooled_height, - const int pooled_width, const int* rois_batch_id_data, T* input_grad_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = min(max(hstart, 0), height); - hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0), width); - wend = min(max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - // Accumulate diff_val into input data - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - T diff_val = is_empty ? 0. 
: output_grad_data[i] / bin_area; - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - platform::CudaAtomicAdd(offset_input_grad_data + input_index, diff_val); - } - } - } -} - -template -class GPUPSROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - PADDLE_ENFORCE_EQ( - input_channels, output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "The channels %d of input X should equal the product of " - "output_channels %d x pooled_height %d x pooled_width %d.", - input_channels, output_channels, pooled_height, pooled_width)); - - int rois_num = rois->dims()[0]; - if (rois_num == 0) return; - int rois_batch_size; - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be " - "the same but received batch size of input(ROIs) and " - "input(X) is %d and %d respectively.", - rois_batch_size, batch_size)); - std::vector rois_num_list(rois_batch_size); - memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(), - rois_num_data, sizeof(int) * rois_batch_size, 0); - int rois_num_count = 0; - for (int i = 0; i < rois_batch_size; ++i) { - rois_num_count += rois_num_list[i]; - } - PADDLE_ENFORCE_EQ( - rois_num_count, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and RoisNum must be the same")); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be " - "the same but received batch size of input(ROIs) and " - "input(X) is %d and %d respectively.", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The number of rois from input(ROIs) and its LOD " - "must be the same. 
Received rois %d of input(ROIs) " - "but the number of rois %d from its LOD is %d", - rois_num, rois_num_with_lod)); - - // set rois batch id - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - framework::Tensor rois_batch_id_list_gpu; - framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &rois_batch_id_list_gpu); - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; - - // call cuda kernel function - GPUPSROIPoolForward< - T><<>>( - output_size, in->data(), rois->data(), spatial_scale, - input_channels, height, width, output_channels, pooled_height, - pooled_width, rois_batch_id_list_gpu.data(), - out->mutable_data(ctx.GetPlace())); - } -}; - -template -class GPUPSROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - int rois_num = rois->dims()[0]; - int input_channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (input_grad) { - // set roi batch id - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(), - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - framework::Tensor rois_batch_id_list_gpu; - framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &rois_batch_id_list_gpu); - - input_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); - - int output_grad_size = output_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - GPUPSROIPoolBackward< - T><<>>( - output_grad_size, rois->data(), output_grad->data(), - spatial_scale, input_channels, height, width, output_channels, - pooled_height, pooled_width, rois_batch_id_list_gpu.data(), - input_grad->mutable_data(ctx.GetPlace())); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - psroi_pool, - ops::GPUPSROIPoolOpKernel, - ops::GPUPSROIPoolOpKernel); -REGISTER_OP_CUDA_KERNEL( - psroi_pool_grad, - ops::GPUPSROIPoolGradOpKernel, - ops::GPUPSROIPoolGradOpKernel); diff --git 
a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h deleted file mode 100644 index 3f020d93391..00000000000 --- a/paddle/fluid/operators/psroi_pool_op.h +++ /dev/null @@ -1,295 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class CPUPSROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto output_channels = ctx.Attr("output_channels"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - PADDLE_ENFORCE_EQ(input_channels, - output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "the channels of input " - "X should equal the product of " - "output_channels x pooled_height x pooled_width")); - - auto in_stride = phi::stride(in_dims); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and the batch size of images " - " must be the same. 
But received the batch size of rois is %d, " - "and the batch size of images is %d", - rois_batch_size, batch_size)); - int rois_num_count = 0; - for (int i = 0; i < rois_batch_size; ++i) { - rois_num_count += rois_num_data[i]; - } - PADDLE_ENFORCE_EQ( - rois_num_count, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and RoisNum must be the same")); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("the rois_batch_size and input(X) " - "batch_size should be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num_with_lod, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and lod must be the same")); - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* input_rois = rois->data(); - - // calculate psroipooling, parallel processing can be implemented per ROI - for (int n = 0; n < rois_num; ++n) { - // set roi batch id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = - static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = - static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - // Force too small rois to be 1 x 1 - T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); - - // Compute bin size w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - // calculate each pixel of the output feature map. 
- int out_roi_offset = n * out_stride[0]; - for (int c = 0; c < output_channels; ++c) { - // per category - int out_plane_offset = out_roi_offset + c * out_stride[1]; - for (int ph = 0; ph < pooled_height; ++ph) { - int out_row_offset = out_plane_offset + ph * out_stride[2]; - for (int pw = 0; pw < pooled_width; ++pw) { - // calculate w and h at input feature map - int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); - int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); - int hend = ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); - int wend = ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - wstart = std::min(std::max(wstart, 0), width); - hend = std::min(std::max(hend, 0), height); - wend = std::min(std::max(wend, 0), width); - - int output_index = out_row_offset + pw; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_plane_offset = - roi_batch_id * in_stride[0] + input_channel * in_stride[1]; - const T* offset_input_data = input_data + input_plane_offset; - T out_sum = 0.; - bool is_empty = (hend <= hstart) || (wend <= wstart); - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * in_stride[2] + iw; - out_sum += offset_input_data[input_index]; - } - } - T bin_area = (hend - hstart) * (wend - wstart); - output_data[output_index] = is_empty ? 0. : out_sum / bin_area; - } - } - } - } - return; - } -}; - -template -class CPUPSROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - if (input_grad) { - auto in_dims = in->dims(); - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - // set roi batch id - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - const T* input_rois = rois->data(); - const T* output_grad_data = output_grad->data(); - T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - - // set gradient of X to be 0. before backpropagate. 
- phi::funcs::SetConstant set_zero; - set_zero(ctx.template device_context(), input_grad, - static_cast(0)); - - // backpropagate gradient per output pixel - int output_grad_size = output_grad->numel(); - for (int i = 0; i < output_grad_size; ++i) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = - static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = - static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - hend = std::min(std::max(hend, 0), height); - wstart = std::min(std::max(wstart, 0), width); - wend = std::min(std::max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - // Accumulate diff_val into input data - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - T diff_val = is_empty ? 0. 
: output_grad_data[i] / bin_area; - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - offset_input_grad_data[input_index] += diff_val; - } - } - } - } - return; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 4ddef5b0002..0a2b4dcae58 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -115,6 +115,18 @@ void GatherNdGradInferMeta(const MetaTensor& x, x_grad->set_dtype(dtype); } +void PsroiPoolGradInferMeta(const MetaTensor& x, + const MetaTensor& rois, + paddle::optional rois_num, + const MetaTensor& dout, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + MetaTensor* dx) { + dx->share_meta(x); +} + void ScatterGradInferMeta(const MetaTensor& index, const MetaTensor& updates, const MetaTensor& out_grad, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index f7b0eed5dd9..c4003ca1fe7 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -47,6 +47,16 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, int axis, MetaTensor* dx); +void PsroiPoolGradInferMeta(const MetaTensor& x, + const MetaTensor& rois, + paddle::optional rois_num, + const MetaTensor& dout, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + MetaTensor* dx); + void ScatterGradInferMeta(const MetaTensor& index, const MetaTensor& updates, const MetaTensor& out_grad, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index acce40713b8..84441ed8b74 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -28,6 +28,98 @@ std::vector GetMetaTensorsDim(const std::vector& tensors) { return dims; } +void AdadeltaInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& avg_squared_grad, + const MetaTensor& avg_squared_update, + float rho, + float epsilon, + MetaTensor* param_out, + MetaTensor* avg_squared_grad_out, + MetaTensor* avg_squared_update_out) { + auto param_dims = param.dims(); + PADDLE_ENFORCE_EQ( + param_dims, + grad.dims(), + errors::InvalidArgument( + "Param and grad input of AdadeltaOp should have same dimension.")); + PADDLE_ENFORCE_EQ( + param_dims, + avg_squared_grad.dims(), + errors::InvalidArgument("Param and AvgSquaredGrad input of AdadeltaOp " + "should have same dimension")); + PADDLE_ENFORCE_EQ( + param_dims, + avg_squared_update.dims(), + errors::InvalidArgument("Param and AvgSquaredUpdate input of AdadeltaOp " + "should have same dimension")); + + param_out->set_dims(param_dims); + param_out->set_dtype(param.dtype()); + + avg_squared_grad_out->set_dims(param_dims); + avg_squared_grad_out->set_dtype(avg_squared_grad.dtype()); + + avg_squared_update_out->set_dims(param_dims); + avg_squared_update_out->set_dtype(avg_squared_update.dtype()); +} + +void AdamaxInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& learning_rate, + const MetaTensor& moment, + const MetaTensor& inf_norm, + const MetaTensor& beta1_pow, + float beta1, + float beta2, + float epsilon, + MetaTensor* param_out, + MetaTensor* moment_out, + MetaTensor* inf_norm_out) { + auto lr_dims = learning_rate.dims(); + PADDLE_ENFORCE_NE( + product(lr_dims), + 0, + errors::InvalidArgument("Maybe the Input variable LearningRate has not " + "been initialized. 
You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + PADDLE_ENFORCE_EQ( + product(lr_dims), + 1, + errors::InvalidArgument("Learning rate should have 1 dimension")); + auto beta1_pow_dims = beta1_pow.dims(); + PADDLE_ENFORCE_EQ(product(beta1_pow_dims), + 1, + errors::InvalidArgument( + "Beta1 power accumulator should have 1 dimension")); + auto param_dims = param.dims(); + PADDLE_ENFORCE_EQ( + param_dims, + grad.dims(), + errors::InvalidArgument( + "Param and Grad input of AdamaxOp should have same dimension")); + PADDLE_ENFORCE_EQ( + param_dims, + moment.dims(), + errors::InvalidArgument( + "Param and Moment input of AdamaxOp should have same dimension")); + PADDLE_ENFORCE_EQ( + param_dims, + inf_norm.dims(), + errors::InvalidArgument( + "Param and InfNorm input of AdamaxOp should have same dimension")); + + param_out->set_dims(param_dims); + param_out->set_dtype(param.dtype()); + + moment_out->set_dims(param_dims); + moment_out->set_dtype(moment.dtype()); + + inf_norm_out->set_dims(param_dims); + inf_norm_out->set_dtype(inf_norm.dtype()); +} + void AucInferMeta(const MetaTensor& input, const MetaTensor& label, const MetaTensor& stat_pos, @@ -108,98 +200,6 @@ void AucInferMeta(const MetaTensor& input, } } -void AdamaxInferMeta(const MetaTensor& param, - const MetaTensor& grad, - const MetaTensor& learning_rate, - const MetaTensor& moment, - const MetaTensor& inf_norm, - const MetaTensor& beta1_pow, - float beta1, - float beta2, - float epsilon, - MetaTensor* param_out, - MetaTensor* moment_out, - MetaTensor* inf_norm_out) { - auto lr_dims = learning_rate.dims(); - PADDLE_ENFORCE_NE( - product(lr_dims), - 0, - errors::InvalidArgument("Maybe the Input variable LearningRate has not " - "been initialized. 
You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.")); - PADDLE_ENFORCE_EQ( - product(lr_dims), - 1, - errors::InvalidArgument("Learning rate should have 1 dimension")); - auto beta1_pow_dims = beta1_pow.dims(); - PADDLE_ENFORCE_EQ(product(beta1_pow_dims), - 1, - errors::InvalidArgument( - "Beta1 power accumulator should have 1 dimension")); - auto param_dims = param.dims(); - PADDLE_ENFORCE_EQ( - param_dims, - grad.dims(), - errors::InvalidArgument( - "Param and Grad input of AdamaxOp should have same dimension")); - PADDLE_ENFORCE_EQ( - param_dims, - moment.dims(), - errors::InvalidArgument( - "Param and Moment input of AdamaxOp should have same dimension")); - PADDLE_ENFORCE_EQ( - param_dims, - inf_norm.dims(), - errors::InvalidArgument( - "Param and InfNorm input of AdamaxOp should have same dimension")); - - param_out->set_dims(param_dims); - param_out->set_dtype(param.dtype()); - - moment_out->set_dims(param_dims); - moment_out->set_dtype(moment.dtype()); - - inf_norm_out->set_dims(param_dims); - inf_norm_out->set_dtype(inf_norm.dtype()); -} - -void AdadeltaInferMeta(const MetaTensor& param, - const MetaTensor& grad, - const MetaTensor& avg_squared_grad, - const MetaTensor& avg_squared_update, - float rho, - float epsilon, - MetaTensor* param_out, - MetaTensor* avg_squared_grad_out, - MetaTensor* avg_squared_update_out) { - auto param_dims = param.dims(); - PADDLE_ENFORCE_EQ( - param_dims, - grad.dims(), - errors::InvalidArgument( - "Param and grad input of AdadeltaOp should have same dimension.")); - PADDLE_ENFORCE_EQ( - param_dims, - avg_squared_grad.dims(), - errors::InvalidArgument("Param and AvgSquaredGrad input of AdadeltaOp " - "should have same dimension")); - PADDLE_ENFORCE_EQ( - param_dims, - avg_squared_update.dims(), - errors::InvalidArgument("Param and AvgSquaredUpdate input of AdadeltaOp " - "should have same dimension")); - - param_out->set_dims(param_dims); - param_out->set_dtype(param.dtype()); - - avg_squared_grad_out->set_dims(param_dims); - avg_squared_grad_out->set_dtype(avg_squared_grad.dtype()); - - avg_squared_update_out->set_dims(param_dims); - avg_squared_update_out->set_dtype(avg_squared_update.dtype()); -} - void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, @@ -369,6 +369,81 @@ void ConcatInferMeta(const std::vector& x, out->share_lod(*x.at(0)); } +void PsroiPoolInferMeta(const MetaTensor& x, + const MetaTensor& rois, + paddle::optional rois_num, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + MetaTensor* out) { + auto input_dims = x.dims(); + auto rois_dims = rois.dims(); + + PADDLE_ENFORCE_EQ( + input_dims.size(), + 4, + errors::InvalidArgument("The format of input tensor is NCHW")); + PADDLE_ENFORCE_EQ(rois_dims.size(), + 2, + errors::InvalidArgument( + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]")); + PADDLE_ENFORCE_EQ(rois_dims[1], + 4, + errors::InvalidArgument( + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]")); + if (rois_num.get_ptr()) { + auto rois_num_dims = rois_num->dims(); + PADDLE_ENFORCE_EQ( + rois_num_dims.size(), + 1, + errors::InvalidArgument("The second dimension of RoisNum should " + "be 1, but received dimension is %d", + rois_num_dims.size())); + } + + PADDLE_ENFORCE_EQ( + input_dims[1], + output_channels * pooled_height * pooled_width, + errors::InvalidArgument( + "the channel of X(%d) " + 
"should be equal to the product of " + "output_channels(%d), pooled_height(%d) and pooled_width(%d)", + input_dims[1], + output_channels, + pooled_height, + pooled_width)); + + PADDLE_ENFORCE_GT(pooled_height, + 0, + errors::InvalidArgument( + "The pooled output height must be greater than 0")); + PADDLE_ENFORCE_GT(pooled_width, + 0, + errors::InvalidArgument( + "The pooled output width must be greater than 0")); + PADDLE_ENFORCE_GT(output_channels, + 1, + errors::InvalidArgument( + "The pooled output channels must greater than 1")); + PADDLE_ENFORCE_GT( + spatial_scale, + 0.0f, + errors::InvalidArgument("The spatial scale must greater than 0.")); + + auto out_dims = input_dims; + out_dims[0] = rois_dims[0]; + out_dims[1] = + output_channels; // input_dims[1] / (pooled_height * pooled_width); + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + + out->set_dims(out_dims); + out->set_dtype(x.dtype()); +} + void WhereInferMeta(const MetaTensor& condition, const MetaTensor& x, const MetaTensor& y, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 26bdc62302f..c11843212ed 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -20,6 +20,29 @@ namespace phi { std::vector GetMetaTensorsDim(const std::vector& tensors); +void AdadeltaInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& avg_squared_grad, + const MetaTensor& avg_squared_update, + float rho, + float epsilon, + MetaTensor* param_out, + MetaTensor* avg_squared_grad_out, + MetaTensor* avg_squared_update_out); + +void AdamaxInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& learning_rate, + const MetaTensor& moment, + const MetaTensor& inf_norm, + const MetaTensor& beta1_pow, + float beta1, + float beta2, + float epsilon, + MetaTensor* param_out, + MetaTensor* moment_out, + MetaTensor* inf_norm_out); + void AucInferMeta(const MetaTensor& input, const MetaTensor& label, const MetaTensor& stat_pos, @@ -47,32 +70,18 @@ void ConcatInferMeta(const std::vector& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void PsroiPoolInferMeta(const MetaTensor& x, + const MetaTensor& rois, + paddle::optional rois_num, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + MetaTensor* out); + void WhereInferMeta(const MetaTensor& condition, const MetaTensor& x, const MetaTensor& y, MetaTensor* out); -void AdamaxInferMeta(const MetaTensor& param, - const MetaTensor& grad, - const MetaTensor& learning_rate, - const MetaTensor& moment, - const MetaTensor& inf_norm, - const MetaTensor& beta1_pow, - float beta1, - float beta2, - float epsilon, - MetaTensor* param_out, - MetaTensor* moment_out, - MetaTensor* inf_norm_out); - -void AdadeltaInferMeta(const MetaTensor& param, - const MetaTensor& grad, - const MetaTensor& avg_squared_grad, - const MetaTensor& avg_squared_update, - float rho, - float epsilon, - MetaTensor* param_out, - MetaTensor* avg_squared_grad_out, - MetaTensor* avg_squared_update_out); - } // namespace phi diff --git a/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc new file mode 100644 index 00000000000..fbed3f1cb13 --- /dev/null +++ b/paddle/phi/kernels/cpu/psroi_pool_grad_kernel.cc @@ -0,0 +1,140 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/psroi_pool_grad_kernel.h" + +#include +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void PsroiPoolGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& rois, + paddle::optional rois_num, + const DenseTensor& dout, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + DenseTensor* dx) { + if (dx) { + auto in_dims = x.dims(); + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num_t = rois.dims()[0]; + + // set roi batch id + DenseTensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num_t}); + int* rois_batch_id_data = ctx.template Alloc(&rois_batch_id_list); + int rois_batch_size; + if (rois_num.get_ptr()) { + rois_batch_size = rois_num->numel(); + auto* rois_num_t_data = rois_num->data(); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_t_data[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_t_data[n]; + } + } else { + auto rois_lod = rois.lod().back(); + rois_batch_size = rois_lod.size() - 1; + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + } + const T* input_rois = rois.data(); + const T* dout_data = dout.data(); + T* dx_data = ctx.template Alloc(dx); + + // set gradient of X to be 0. before backpropagate. + funcs::SetConstant set_zero; + set_zero(ctx, dx, static_cast(0)); + + // backpropagate gradient per output pixel + int dout_size = dout.numel(); + for (int i = 0; i < dout_size; ++i) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_offset = + (roi_batch_id * input_channels + input_channel) * height * width; + T* offset_dx_data = dx_data + input_offset; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = + static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = + static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) 
* spatial_scale; + + // Force too small ROIs to be 1x1 + T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); + int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); + int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); + int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + hend = std::min(std::max(hend, 0), height); + wstart = std::min(std::max(wstart, 0), width); + wend = std::min(std::max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Accumulate diff_val into input data + T bin_area = static_cast((hend - hstart) * (wend - wstart)); + T diff_val = is_empty ? 0. : dout_data[i] / bin_area; + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * width + iw; + offset_dx_data[input_index] += diff_val; + } + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + psroi_pool_grad, CPU, ALL_LAYOUT, phi::PsroiPoolGradKernel, float, double) { + kernel->InputAt(2).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/cpu/psroi_pool_kernel.cc b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc new file mode 100644 index 00000000000..06cd03395d9 --- /dev/null +++ b/paddle/phi/kernels/cpu/psroi_pool_kernel.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/psroi_pool_kernel.h" + +#include +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void PsroiPoolKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& rois, + paddle::optional rois_num, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + DenseTensor* out) { + auto in_dims = x.dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num_t = rois.dims()[0]; + + PADDLE_ENFORCE_EQ(input_channels, + output_channels * pooled_height * pooled_width, + errors::InvalidArgument( + "the channels of input " + "X should equal the product of " + "output_channels x pooled_height x pooled_width")); + + auto in_stride = stride(in_dims); + auto out_stride = stride(out->dims()); + + const T* input_data = x.data(); + + DenseTensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num_t}); + int* rois_batch_id_data = ctx.template Alloc(&rois_batch_id_list); + + int rois_batch_size; + if (rois_num.get_ptr()) { + rois_batch_size = rois_num->numel(); + auto* rois_num_data = rois_num->data(); + PADDLE_ENFORCE_EQ( + rois_batch_size, + batch_size, + errors::InvalidArgument( + "The batch size of rois and the batch size of images " + " must be the same. But received the batch size of rois is %d, " + "and the batch size of images is %d", + rois_batch_size, + batch_size)); + int rois_num_count = 0; + for (int i = 0; i < rois_batch_size; ++i) { + rois_num_count += rois_num_data[i]; + } + PADDLE_ENFORCE_EQ( + rois_num_count, + rois_num_t, + errors::InvalidArgument( + "the rois_num from input and RoisNum must be the same")); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_data[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_data[n]; + } + } else { + auto rois_lod = rois.lod().back(); + rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, + batch_size, + errors::InvalidArgument("the rois_batch_size and input(X) " + "batch_size should be the same.")); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num_with_lod, + rois_num_t, + errors::InvalidArgument( + "the rois_num from input and lod must be the same")); + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + } + T* output_data = ctx.template Alloc(out); + const T* input_rois = rois.data(); + + // calculate psroipooling, parallel processing can be implemented per ROI + for (int n = 0; n < rois_num_t; ++n) { + // set roi batch id + int roi_batch_id = rois_batch_id_data[n]; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) 
* spatial_scale; + // Force too small rois to be 1 x 1 + T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); + + // Compute bin size w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + // calculate each pixel of the output feature map. + int out_roi_offset = n * out_stride[0]; + for (int c = 0; c < output_channels; ++c) { + // per category + int out_plane_offset = out_roi_offset + c * out_stride[1]; + for (int ph = 0; ph < pooled_height; ++ph) { + int out_row_offset = out_plane_offset + ph * out_stride[2]; + for (int pw = 0; pw < pooled_width; ++pw) { + // calculate w and h at input feature map + int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); + int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); + int hend = ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); + int wend = ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + wstart = std::min(std::max(wstart, 0), width); + hend = std::min(std::max(hend, 0), height); + wend = std::min(std::max(wend, 0), width); + + int output_index = out_row_offset + pw; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_plane_offset = + roi_batch_id * in_stride[0] + input_channel * in_stride[1]; + const T* offset_input_data = input_data + input_plane_offset; + T out_sum = 0.; + bool is_empty = (hend <= hstart) || (wend <= wstart); + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * in_stride[2] + iw; + out_sum += offset_input_data[input_index]; + } + } + T bin_area = (hend - hstart) * (wend - wstart); + output_data[output_index] = is_empty ? 0. : out_sum / bin_area; + } + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + psroi_pool, CPU, ALL_LAYOUT, phi::PsroiPoolKernel, float, double) { + kernel->InputAt(2).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu new file mode 100644 index 00000000000..6745653eba7 --- /dev/null +++ b/paddle/phi/kernels/gpu/psroi_pool_grad_kernel.cu @@ -0,0 +1,193 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/psroi_pool_kernel.h" + +#include +#include +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +template +__global__ void GPUPSROIPoolBackward(const int nthreads, + const T* input_rois, + const T* dout_data, + const float spatial_scale, + const int input_channels, + const int height, + const int width, + const int output_channels, + const int pooled_height, + const int pooled_width, + const int* rois_batch_id_data, + T* dx_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_offset = + (roi_batch_id * input_channels + input_channel) * height * width; + T* offset_dx_data = dx_data + input_offset; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = max(roi_end_w - roi_start_w, (T)0.1); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); + int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); + int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); + int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Accumulate diff_val into input data + T bin_area = static_cast((hend - hstart) * (wend - wstart)); + T diff_val = is_empty ? 0. 
: dout_data[i] / bin_area; + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * width + iw; + paddle::platform::CudaAtomicAdd(offset_dx_data + input_index, diff_val); + } + } + } +} + +template +void PsroiPoolGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& rois, + paddle::optional rois_num, + const DenseTensor& dout, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + DenseTensor* dx) { + int rois_num_t = rois.dims()[0]; + int input_channels = x.dims()[1]; + int height = x.dims()[2]; + int width = x.dims()[3]; + + if (dx) { + // set roi batch id + DenseTensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num_t}); + int* rois_batch_id_data = ctx.template HostAlloc(&rois_batch_id_list); + int rois_batch_size; + if (rois_num.get_ptr()) { + rois_batch_size = rois_num->numel(); + std::vector rois_num_list(rois_batch_size); + paddle::memory::Copy(CPUPlace(), + rois_num_list.data(), + ctx.GetPlace(), + rois_num->data(), + sizeof(int) * rois_batch_size, + 0); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_list[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_list[n]; + } + } else { + auto rois_lod = rois.lod().back(); + rois_batch_size = rois_lod.size() - 1; + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + } + + DenseTensor rois_batch_id_list_gpu; + Copy(ctx, + rois_batch_id_list, + ctx.GetPlace(), + false, + &rois_batch_id_list_gpu); + + ctx.template Alloc(dx); + funcs::SetConstant set_zero; + set_zero(ctx, dx, static_cast(0)); + + int dout_size = dout.numel(); + int blocks = NumBlocks(dout_size); + int threads = kNumCUDAThreads; + + if (dout_size > 0) { + GPUPSROIPoolBackward<<>>( + dout_size, + rois.data(), + dout.data(), + spatial_scale, + input_channels, + height, + width, + output_channels, + pooled_height, + pooled_width, + rois_batch_id_list_gpu.data(), + ctx.template Alloc(dx)); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + psroi_pool_grad, GPU, ALL_LAYOUT, phi::PsroiPoolGradKernel, float, double) { + kernel->InputAt(2).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/gpu/psroi_pool_kernel.cu b/paddle/phi/kernels/gpu/psroi_pool_kernel.cu new file mode 100644 index 00000000000..8f9be001ba7 --- /dev/null +++ b/paddle/phi/kernels/gpu/psroi_pool_kernel.cu @@ -0,0 +1,231 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
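// Editor's note: illustrative sketch only, not part of this patch. The GPU
// forward kernel below and the backward kernel above use the same launch-size
// arithmetic: one thread per output element, the block count capped at 4096,
// and a grid-stride loop inside the kernel to cover any remainder. A minimal
// standalone restatement (constants copied from the kernels, the helper name
// is hypothetical):

#include <algorithm>

constexpr int kThreadsPerBlock = 512;  // kNumCUDAThreads in the kernels
constexpr int kMaxBlocks = 4096;       // kNumMaximumNumBlocks in the kernels

inline int LaunchBlocks(int n) {
  // Round up to whole blocks, but never exceed the block cap.
  return std::min((n + kThreadsPerBlock - 1) / kThreadsPerBlock, kMaxBlocks);
}

// A kernel launched as <<<LaunchBlocks(n), kThreadsPerBlock>>> then iterates:
//   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
//        i += blockDim.x * gridDim.x) { ... }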
+
+#include "paddle/phi/kernels/psroi_pool_kernel.h"
+
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/memory/memory.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+
+namespace phi {
+
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaximumNumBlocks = 4096;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaximumNumBlocks);
+}
+
+template <typename T>
+__global__ void GPUPSROIPoolForward(const int nthreads,
+                                    const T* input_data,
+                                    const T* input_rois,
+                                    const float spatial_scale,
+                                    const int input_channels,
+                                    const int height,
+                                    const int width,
+                                    const int output_channels,
+                                    const int pooled_height,
+                                    const int pooled_width,
+                                    const int* rois_batch_id_data,
+                                    T* output_data) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (size_t i = index; i < nthreads; i += offset) {
+    // The output is in order (n, c, ph, pw)
+    int pw = i % pooled_width;
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % output_channels;
+    int n = i / pooled_width / pooled_height / output_channels;
+
+    // set roi_batch_id
+    int roi_batch_id = rois_batch_id_data[n];
+
+    // [start, end) interval for spatial sampling
+    const T* offset_input_rois = input_rois + n * 4;
+    T roi_start_w = static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
+    T roi_start_h = static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
+    T roi_end_w =
+        static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
+    T roi_end_h =
+        static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;
+
+    // Force too small ROIs to be 1x1
+    T roi_height = max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
+    T roi_width = max(roi_end_w - roi_start_w, (T)0.1);
+
+    // Compute w and h at input feature map
+    T bin_size_h = roi_height / static_cast<T>(pooled_height);
+    T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+    int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
+    int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
+    int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
+    int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);
+
+    // Add roi offsets and clip to input boundaries
+    hstart = min(max(hstart, 0), height);
+    hend = min(max(hend, 0), height);
+    wstart = min(max(wstart, 0), width);
+    wend = min(max(wend, 0), width);
+    bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+    int input_channel = (c * pooled_height + ph) * pooled_width + pw;
+    const T* offset_input_data =
+        input_data +
+        (roi_batch_id * input_channels + input_channel) * height * width;
+    T outsum = 0;
+
+    for (int ih = hstart; ih < hend; ++ih) {
+      for (int iw = wstart; iw < wend; ++iw) {
+        int input_index = ih * width + iw;
+        outsum += offset_input_data[input_index];
+      }
+    }
+
+    T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
+    output_data[i] = is_empty ? 0.
: outsum / bin_area; + } +} + +template +void PsroiPoolKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& rois, + paddle::optional rois_num, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + DenseTensor* out) { + auto in_dims = x.dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + + PADDLE_ENFORCE_EQ( + input_channels, + output_channels * pooled_height * pooled_width, + errors::InvalidArgument( + "The channels %d of input X should equal the product of " + "output_channels %d x pooled_height %d x pooled_width %d.", + input_channels, + output_channels, + pooled_height, + pooled_width)); + + int rois_num_t = rois.dims()[0]; + if (rois_num_t == 0) return; + int rois_batch_size; + DenseTensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num_t}); + int* rois_batch_id_data = ctx.template HostAlloc(&rois_batch_id_list); + + if (rois_num.get_ptr()) { + rois_batch_size = rois_num->numel(); + auto* rois_num_data = rois_num->data(); + PADDLE_ENFORCE_EQ(rois_batch_size, + batch_size, + errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be " + "the same but received batch size of input(ROIs) and " + "input(X) is %d and %d respectively.", + rois_batch_size, + batch_size)); + std::vector rois_num_list(rois_batch_size); + paddle::memory::Copy(CPUPlace(), + rois_num_list.data(), + ctx.GetPlace(), + rois_num_data, + sizeof(int) * rois_batch_size, + 0); + int rois_num_count = 0; + for (int i = 0; i < rois_batch_size; ++i) { + rois_num_count += rois_num_list[i]; + } + PADDLE_ENFORCE_EQ( + rois_num_count, + rois_num_t, + errors::InvalidArgument( + "the rois_num from input and RoisNum must be the same")); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_list[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_list[n]; + } + } else { + auto rois_lod = rois.lod().back(); + rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ(rois_batch_size, + batch_size, + errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be " + "the same but received batch size of input(ROIs) and " + "input(X) is %d and %d respectively.", + rois_batch_size, + batch_size)); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num_t, + rois_num_with_lod, + errors::InvalidArgument( + "The number of rois from input(ROIs) and its LOD " + "must be the same. 
Received rois %d of input(ROIs) " + "but the number of rois %d from its LOD is %d", + rois_num, + rois_num_with_lod)); + + // set rois batch id + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + } + DenseTensor rois_batch_id_list_gpu; + Copy(ctx, rois_batch_id_list, ctx.GetPlace(), false, &rois_batch_id_list_gpu); + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; + + // call cuda kernel function + GPUPSROIPoolForward<<>>( + output_size, + x.data(), + rois.data(), + spatial_scale, + input_channels, + height, + width, + output_channels, + pooled_height, + pooled_width, + rois_batch_id_list_gpu.data(), + ctx.template Alloc(out)); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + psroi_pool, GPU, ALL_LAYOUT, phi::PsroiPoolKernel, float, double) { + kernel->InputAt(2).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/psroi_pool_grad_kernel.h b/paddle/phi/kernels/psroi_pool_grad_kernel.h new file mode 100644 index 00000000000..87163eb8e07 --- /dev/null +++ b/paddle/phi/kernels/psroi_pool_grad_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void PsroiPoolGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& rois, + paddle::optional rois_num, + const DenseTensor& dout, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/psroi_pool_kernel.h b/paddle/phi/kernels/psroi_pool_kernel.h new file mode 100644 index 00000000000..341037af2ca --- /dev/null +++ b/paddle/phi/kernels/psroi_pool_kernel.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
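// Editor's note: illustrative sketch only, not part of this patch. Before
// launching, the GPU kernels above expand the per-image RoI counts (RoisNum)
// into a flat per-RoI batch-id table on the host. A hypothetical standalone
// version of that expansion (names invented for illustration):

#include <vector>

std::vector<int> ExpandRoisNumToBatchIds(const std::vector<int>& rois_per_image) {
  std::vector<int> batch_ids;
  for (size_t n = 0; n < rois_per_image.size(); ++n) {
    for (int i = 0; i < rois_per_image[n]; ++i) {
      batch_ids.push_back(static_cast<int>(n));  // this RoI belongs to image n
    }
  }
  return batch_ids;
}

// With RoisNum = {2, 3}, batch_ids becomes {0, 0, 1, 1, 1}; when RoisNum is
// absent, the kernels above derive the same table from the LoD of the ROIs
// tensor instead.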
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void PsroiPoolKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& rois, + paddle::optional rois_num, + int pooled_height, + int pooled_width, + int output_channels, + float spatial_scale, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/psroi_pool_sig.cc b/paddle/phi/ops/compat/psroi_pool_sig.cc new file mode 100644 index 00000000000..4d694d9a775 --- /dev/null +++ b/paddle/phi/ops/compat/psroi_pool_sig.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature PsroiPoolOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "psroi_pool", + {"X", "ROIs", "RoisNum"}, + {"pooled_height", "pooled_width", "output_channels", "spatial_scale"}, + {"Out"}); +} + +KernelSignature PsroiPoolGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "psroi_pool_grad", + {"X", "ROIs", "RoisNum", GradVarName("Out")}, + {"pooled_height", "pooled_width", "output_channels", "spatial_scale"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(psroi_pool, phi::PsroiPoolOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(psroi_pool_grad, + phi::PsroiPoolGradOpArgumentMapping); -- GitLab From 47459e989b7607d5dc7c230fff7870c4c95e9141 Mon Sep 17 00:00:00 2001 From: Sylwester Fraczek Date: Fri, 11 Mar 2022 10:40:41 +0100 Subject: [PATCH 002/176] refactor conv+relementwise_add (residual) (#40005) --- .../conv_elementwise_add_mkldnn_fuse_pass.cc | 392 ++++++++---------- .../conv_elementwise_add_mkldnn_fuse_pass.h | 92 +--- 2 files changed, 177 insertions(+), 307 deletions(-) diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index c537d057385..2403e60df39 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace framework { @@ -135,157 +136,9 @@ ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { .End(); } -ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::IdentityFuseHandle( - const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func, - const ResidualConnectionMKLDNNFusePass::IdentityConvFunc& - get_node_from_conv_op, - const ResidualConnectionMKLDNNFusePass::IdentityElementwiseAddFunc& - get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass) - : fusion_stats{std::make_shared(0)}, - can_fuse_func{can_fuse_func}, - 
get_node_from_conv_op{get_node_from_conv_op}, - get_node_from_elementwise_add_op{get_node_from_elementwise_add_op}, - pass_{pass} {} - -void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* conv_op; - Node* conv_input; - Node* conv_filter; - Node* conv_output; - - Node* elementwise_add_op; - Node* elementwise_add_identity; - Node* elementwise_add_out; - - std::tie(conv_op, conv_input, conv_filter, conv_output) = - get_node_from_conv_op(subgraph); - std::tie(elementwise_add_op, elementwise_add_identity, elementwise_add_out) = - get_node_from_elementwise_add_op(subgraph); - - if (!can_fuse_func(conv_op, elementwise_add_op)) return; - - if (!IsReachable(graph, elementwise_add_identity, conv_output)) return; - - if (HasFusedActivation(conv_op)) return; - - if (!pass_->IsCompat(subgraph, graph)) { - LOG(WARNING) - << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; - return; - } - - conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); - conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); - conv_op->Op()->SetAttr("fuse_residual_connection", true); - - GraphSafeRemoveNodes(graph, {conv_output, elementwise_add_op}); - - IR_NODE_LINK_TO(elementwise_add_identity, conv_op); - IR_NODE_LINK_TO(conv_op, elementwise_add_out); - - (*fusion_stats)++; -} - -ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::ProjectionFuseHandle( - const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func, - const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc& - get_node_from_conv_x_op, - const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc& - get_node_from_conv_y_op, - const ResidualConnectionMKLDNNFusePass::ProjectionElementwiseAddFunc& - get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass) - : fusion_stats{std::make_shared(0)}, - can_fuse_func{can_fuse_func}, - get_node_from_conv_x_op{get_node_from_conv_x_op}, - get_node_from_conv_y_op{get_node_from_conv_y_op}, - get_node_from_elementwise_add_op{get_node_from_elementwise_add_op}, - pass_{pass} {} - -void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* conv_x_op; - Node* conv_x_input; - Node* conv_x_filter; - Node* conv_x_output; - - Node* conv_y_op; - Node* conv_y_input; - Node* conv_y_filter; - Node* conv_y_output; - - Node* elementwise_add_op; - Node* elementwise_add_out; - - if (!pass_->IsCompat(subgraph, graph)) { - LOG(WARNING) - << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; - return; - } - - std::tie(conv_x_op, conv_x_input, conv_x_filter, conv_x_output) = - get_node_from_conv_x_op(subgraph); - std::tie(conv_y_op, conv_y_input, conv_y_filter, conv_y_output) = - get_node_from_conv_y_op(subgraph); - std::tie(elementwise_add_op, elementwise_add_out) = - get_node_from_elementwise_add_op(subgraph); - - if (!can_fuse_func(conv_x_op, elementwise_add_op)) return; - if (!can_fuse_func(conv_y_op, elementwise_add_op)) return; - - Node* projection_node; - Node* residual_conv_op; - Node* residual_conv_output; - - if (IsReachable(graph, conv_x_input, conv_y_output)) { - projection_node = conv_x_output; - residual_conv_op = conv_y_op; - residual_conv_output = conv_y_output; - } else if (IsReachable(graph, conv_y_input, conv_x_output)) { - projection_node = conv_y_output; - residual_conv_op = conv_x_op; - residual_conv_output = conv_x_output; - } else { - return; - } - - 
if (HasFusedActivation(residual_conv_op)) return; - - residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); - residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); - - residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); - - GraphSafeRemoveNodes(graph, {residual_conv_output, elementwise_add_op}); - - IR_NODE_LINK_TO(projection_node, residual_conv_op); - IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out); - - (*fusion_stats)++; -} - -std::tuple -ResidualConnectionMKLDNNFusePass::GetNodesFromConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const { - GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); - - return std::make_tuple(conv_op, conv_input, conv_filter, conv_output); -} - GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( const std::string& name_scope, const GraphWithStats& graph_with_stats) const { - ir::Graph* graph; - int stats; - - std::tie(graph, stats) = graph_with_stats; - GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); @@ -298,26 +151,56 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); conv_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_y, - elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_pattern, subgraph); - }, - get_node_from_elementwise_add, this); + int found_conv_as_x_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_identity, elementwise_add_y, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); + + if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + + if (!IsReachable(g, elementwise_add_identity, conv_output)) return; + + if (HasFusedActivation(conv_op)) return; + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + + conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + conv_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op}); + + 
IR_NODE_LINK_TO(elementwise_add_identity, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_add_out); + + found_conv_as_x_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_conv_as_x_count + << " conv (as x) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } + + return std::make_pair(graph_with_stats.first, + found_conv_as_x_count + graph_with_stats.second); } GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( @@ -335,26 +218,56 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( conv_output); conv_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_x, - elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_pattern, subgraph); - }, - get_node_from_elementwise_add, this); + int found_conv_as_y_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); + + if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + + if (!IsReachable(g, elementwise_add_x, conv_output)) return; + + if (HasFusedActivation(conv_op)) return; + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + + conv_op->Op()->SetInput("ResidualData", {elementwise_add_x->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + conv_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op}); + + IR_NODE_LINK_TO(elementwise_add_x, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_add_out); + + found_conv_as_y_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_conv_as_y_count + << " conv (as y) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } + + return std::make_pair(graph_with_stats.first, + found_conv_as_y_count + graph_with_stats.second); } GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( @@ -374,39 +287,84 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( conv_x_output->AsIntermediate(); conv_y_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const 
GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, - &conv_x_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_x_pattern, subgraph); - }, - [this, - &conv_y_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_y_pattern, subgraph); - }, - get_node_from_elementwise_add, this); + int found_projection_conv_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_x_op, conv_op, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_input, conv_input, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_filter, conv_filter, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_output, conv_output, conv_x_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(conv_y_op, conv_op, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_input, conv_input, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_filter, conv_filter, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_output, conv_output, conv_y_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + + if (FindFuseOption(*conv_x_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_y_op, *elementwise_add_op) != FUSE_MKLDNN) return; + + Node* projection_node; + Node* residual_conv_op; + Node* residual_conv_output; + if (IsReachable(g, conv_x_input, conv_y_output)) { + projection_node = conv_x_output; + residual_conv_op = conv_y_op; + residual_conv_output = conv_y_output; + } else if (IsReachable(g, conv_y_input, conv_x_output)) { + projection_node = conv_y_output; + residual_conv_op = conv_x_op; + residual_conv_output = conv_x_output; + } else { + return; + } + + if (HasFusedActivation(residual_conv_op)) return; + + residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); + residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + + residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_add_op}); + + IR_NODE_LINK_TO(projection_node, residual_conv_op); + IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out); + + found_projection_conv_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_projection_conv_count + << " projection conv (as y) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } + + return std::make_pair(graph_with_stats.first, + found_projection_conv_count + graph_with_stats.second); } -void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { +void ResidualConnectionMKLDNNFusePass::ApplyImpl(ir::Graph* graph) const { FusePassBase::Init(name_scope_, graph); - auto fused_graph_with_stats = FuseConvAsY( - name_scope_, - FuseConvAsX(name_scope_, - FuseProjectionConv(name_scope_, 
std::make_pair(graph, 0)))); + auto graph_with_stats = + FuseProjectionConv(name_scope_, std::make_pair(graph, 0)); + graph_with_stats = FuseConvAsX(name_scope_, graph_with_stats); + graph_with_stats = FuseConvAsY(name_scope_, graph_with_stats); - LOG(INFO) << "Fused graph " << fused_graph_with_stats.second << "\n"; - AddStatis(fused_graph_with_stats.second); + AddStatis(graph_with_stats.second); } } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h index c83335da2f6..c4351b38218 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h @@ -28,19 +28,9 @@ namespace paddle { namespace framework { namespace ir { -class Graph; -class GraphPatternDetector; -class Node; -namespace patterns { -struct Conv; -} // namespace patterns - -using graph_ptr = ir::Graph*; using GraphWithStats = std::pair; -void CorrectGraphEdges(Graph* graph, Node* from, Node* to); bool IsReachable(ir::Graph* graph, Node* from, Node* to); -paddle::optional HasBias(const Node& op, const std::string& bias_name); class ResidualConnectionMKLDNNFusePass : public FusePassBase { private: @@ -52,91 +42,13 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase { const std::string& name_scope, const GraphWithStats& graph_with_stats) const; - template - using GetNodeFunc = - std::function; - using IdentityConvFunc = GetNodeFunc>; - using IdentityElementwiseAddFunc = - GetNodeFunc>; - - using ProjectionConvFunc = IdentityConvFunc; - using ProjectionElementwiseAddFunc = GetNodeFunc>; - - using CanFuseFunc = std::function; - - std::tuple GetNodesFromConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const; - - std::tuple GetNodesFromProjectionConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const; - - template - GraphWithStats ExecuteHandleOnGraph(GraphPatternDetector* gpd, - const GraphWithStats& graph_with_stats, - OpFuncs&&... 
op_funcs) const { - ir::Graph* graph; - int stats; - - std::tie(graph, stats) = graph_with_stats; - - auto can_fuse = [this](Node* op1, Node* op2) -> bool { - return this->FindFuseOption(*op1, *op2) == FUSE_MKLDNN; - }; - auto fuse_handle = HandleType{can_fuse, std::forward(op_funcs)...}; - - (*gpd)(graph, fuse_handle); - - return std::make_pair(graph, stats + fuse_handle.get_stats()); - } - - struct IdentityFuseHandle { - IdentityFuseHandle( - const CanFuseFunc& can_fuse_func, - const IdentityConvFunc& get_node_from_conv_op, - const IdentityElementwiseAddFunc& get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass); - - void operator()(const GraphPatternDetector::subgraph_t& subgraph, - Graph* graph); - int get_stats() const { return *fusion_stats; } - - private: - std::shared_ptr fusion_stats; - CanFuseFunc can_fuse_func; - IdentityConvFunc get_node_from_conv_op; - IdentityElementwiseAddFunc get_node_from_elementwise_add_op; - const ResidualConnectionMKLDNNFusePass* pass_; - }; - - struct ProjectionFuseHandle { - ProjectionFuseHandle( - const CanFuseFunc& can_fuse_func, - const ProjectionConvFunc& get_node_from_conv_x_op, - const ProjectionConvFunc& get_node_from_conv_y_op, - const ProjectionElementwiseAddFunc& get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass); - - void operator()(const GraphPatternDetector::subgraph_t& subgraph, - Graph* graph); - int get_stats() const { return *fusion_stats; } - - private: - std::shared_ptr fusion_stats; - CanFuseFunc can_fuse_func; - ProjectionConvFunc get_node_from_conv_x_op; - ProjectionConvFunc get_node_from_conv_y_op; - ProjectionElementwiseAddFunc get_node_from_elementwise_add_op; - const ResidualConnectionMKLDNNFusePass* pass_; - }; - public: ResidualConnectionMKLDNNFusePass(); virtual ~ResidualConnectionMKLDNNFusePass() {} protected: - void ApplyImpl(graph_ptr graph) const; + void ApplyImpl(ir::Graph* graph) const; + static bool HasFusedActivation(Node* conv_node) { return !(conv_node->Op() ->GetAttrIfExists("fuse_activation") -- GitLab From 6d830f6cf009d2e912ff9e9b102162e1b565dd7e Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Fri, 11 Mar 2022 17:51:51 +0800 Subject: [PATCH 003/176] Added Final State Matmul_v2 to C++ performance test (#40391) --- paddle/fluid/eager/CMakeLists.txt | 2 +- paddle/fluid/eager/autograd_meta.h | 3 +- .../performance_tests/benchmark_eager_cpu.cc | 41 +++++++++++++++++ .../performance_tests/benchmark_eager_cuda.cc | 44 +++++++++++++++++++ .../performance_tests/benchmark_utils.cc | 24 ++++++++++ .../tests/performance_tests/benchmark_utils.h | 11 ++--- 6 files changed, 114 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 698a698fc6d..f9d1b705390 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,6 +1,6 @@ set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) -set(generated_deps dygraph_function dygraph_node) +set(generated_deps final_dygraph_function final_dygraph_node dygraph_function dygraph_node) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) message("Performing Eager Dygraph Auto Code Generation") diff --git a/paddle/fluid/eager/autograd_meta.h b/paddle/fluid/eager/autograd_meta.h index 9e1dc4f2c8c..dca76d3b8a0 100644 --- a/paddle/fluid/eager/autograd_meta.h 
+++ b/paddle/fluid/eager/autograd_meta.h @@ -145,8 +145,7 @@ class AutogradMeta : public AbstractAutogradMeta { private: // TODO(jiabin) :Should we use pointer instead of object? std::shared_ptr grad_{ - std::make_shared( - egr::Controller::Instance().GenerateUniqueName("@grad"))}; + std::make_shared()}; // GradNodeBase is base class of all grad op which is a // wrapper for grad op. This class will make grad op easy diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index af365322e60..adb3246ee8c 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -80,6 +80,47 @@ TEST(Benchmark, EagerScaleCPU) { } } +TEST(Benchmark, EagerMatmulCPU) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + for (const std::string& mode : {"Accuracy", "Performance"}) { + paddle::framework::DDim ddimX = phi::make_ddim({2, 2}); + paddle::experimental::Tensor X = CreateTensorWithValue( + ddimX, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0, true); + RetainGradForTensor(X); + + paddle::framework::DDim ddimY = phi::make_ddim({2, 2}); + paddle::experimental::Tensor Y = CreateTensorWithValue( + ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 2.0, true); + RetainGradForTensor(Y); + + if (mode == "Accuracy") { + benchmark_eager_matmul(X, Y, true /* accuracy_check */); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_matmul_cpu.out"); +#endif + benchmark_eager_matmul(X, Y); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + TEST(Benchmark, EagerIntermediateMatmulCPU) { // Prepare Device Contexts eager_test::InitEnv(paddle::platform::CPUPlace()); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index 5b75f1242e6..bd70e84d9b4 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -82,6 +82,50 @@ TEST(Benchmark, EagerScaleCUDA) { } } +TEST(Benchmark, EagerMatmulCUDA) { + paddle::platform::CUDAPlace place; + eager_test::InitEnv(place); + + for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { + paddle::framework::DDim ddimX = phi::make_ddim({2, 2}); + paddle::experimental::Tensor X = CreateTensorWithValue( + ddimX, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0, true); + RetainGradForTensor(X); + + paddle::framework::DDim ddimY = phi::make_ddim({2, 2}); + paddle::experimental::Tensor Y = CreateTensorWithValue( + ddimY, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 2.0, true); + RetainGradForTensor(Y); + + if (mode == "Accuracy") { + benchmark_eager_matmul(X, Y, true /* accuracy_check */); + + } else if (mode == "WarmUp") { + benchmark_eager_matmul(X, Y); + + } else if (mode == "Performance") { + auto t_start = 
std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_matmul_cuda.out"); +#endif + benchmark_eager_matmul(X, Y); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + TEST(Benchmark, EagerIntermediateMatmulCUDA) { paddle::platform::CUDAPlace place; eager_test::InitEnv(place); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index 96126fa5466..769bd7f687f 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -28,6 +28,7 @@ #include "paddle/fluid/eager/utils.h" // Eager Generated +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" // Fluid @@ -67,6 +68,29 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, } } +void benchmark_eager_matmul(const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Y, + bool accuracy_check) { + paddle::experimental::Tensor input_tensor0 = X; + + size_t max_num_runs = accuracy_check ? 2 : max_num_benchmark_runs; + for (size_t i = 0; i < max_num_runs; i++) { + input_tensor0 = + matmul_final_state_dygraph_function(input_tensor0, Y, false, false); + } + + std::vector target_tensors = {input_tensor0}; + RunBackward(target_tensors, {}); + + if (accuracy_check) { + // Examine Forward Grad (w.r.t max_num_runs = 2) + eager_test::CompareTensorWithValue(input_tensor0, 16); + // Examine Backward Grad (w.r.t max_num_runs = 2) + eager_test::CompareGradTensorWithValue(X, 16); + eager_test::CompareGradTensorWithValue(Y, 16); + } +} + /* ----------------------------------- */ /* ---- Eager Intermediate Matmul ---- */ /* ----------------------------------- */ diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h index 0086b51b57e..86bf13707ed 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h @@ -51,15 +51,10 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, bool accuracy_check = false); /* ---- Eager MatMul ---- */ -/* -void benchmark_eager_matmul(const paddle::experimental::Tensor& X, const -paddle::experimental::Tensor& Y, +void benchmark_eager_matmul(const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Y, bool accuracy_check = false); -void benchmark_eager_mlp(const paddle::experimental::Tensor& X, - const std::vector& Ws, - const std::vector& Bs, - bool accuracy_check = false); -*/ + void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X, const paddle::experimental::Tensor& Y, bool accuracy_check = false); -- GitLab From 88c03071cda368844f84305b25c28c5071d91965 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 11 Mar 2022 18:10:52 +0800 Subject: [PATCH 004/176] polish trace op detail (#40425) --- paddle/fluid/operators/trace_op.cc | 4 ++-- paddle/phi/infermeta/unary.cc | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git 
a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc
index 0590b66f6f8..c6c0fa3c001 100644
--- a/paddle/fluid/operators/trace_op.cc
+++ b/paddle/fluid/operators/trace_op.cc
@@ -61,7 +61,7 @@ the 2-D planes specified by dim1 and dim2.
 )DOC");
   }
 };
 
-class TraceOpGrad : public framework::OperatorWithKernel {
+class TraceGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -114,7 +114,7 @@ REGISTER_OPERATOR(trace, ops::TraceOp, ops::TraceOpMaker,
                   ops::TraceGradOpMaker,
                   TraceInferShapeFunctor);
 
-REGISTER_OPERATOR(trace_grad, ops::TraceOpGrad,
+REGISTER_OPERATOR(trace_grad, ops::TraceGradOp,
                   ops::TraceGradNoNeedBufferVarsInferer);
 
 /* ========================== register checkpoint ===========================*/
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index d6d4efad9fa..9daad7d6aaa 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -837,6 +837,7 @@ void TraceInferMeta(
     sizes.erase(sizes.begin() + std::min(dim1_, dim2_));
   }
   out->set_dims(phi::make_ddim(sizes));
+  out->set_dtype(x.dtype());
 }
 
 void DiagonalInferMeta(const MetaTensor& input,
-- 
GitLab


From 17d8a5e0c270206218891d6f41ffda3271f26c4a Mon Sep 17 00:00:00 2001
From: Feng Xing <79969986+xingfeng01@users.noreply.github.com>
Date: Fri, 11 Mar 2022 18:20:46 +0800
Subject: [PATCH 005/176] Separate include and macro in kp top level file
 (#40202)

* format softmax forward

* separate includes and macros into two if-else blocks
---
 .../phi/kernels/primitive/kernel_primitives.h | 31 +++++++++++++------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/paddle/phi/kernels/primitive/kernel_primitives.h b/paddle/phi/kernels/primitive/kernel_primitives.h
index 830bc1972c4..b5a1e88acc3 100644
--- a/paddle/phi/kernels/primitive/kernel_primitives.h
+++ b/paddle/phi/kernels/primitive/kernel_primitives.h
@@ -13,7 +13,10 @@
 // limitations under the License.
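// Editor's note: illustrative sketch only, not part of this patch. The aliases
// defined in the header below (THREAD_ID_X and GRID_NUM_X are visible in this
// hunk; the matching BLOCK_ID_X / BLOCK_NUM_X aliases are assumed to come from
// the same family) let one kernel body build for both the XPU KP and GPU
// paths. A hypothetical element-wise kernel written against them:
//
//   template <typename T>
//   __global__ void ScaleKernel(const T* in, T* out, int n, T a) {
//     for (int i = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; i < n;
//          i += BLOCK_NUM_X * GRID_NUM_X) {
//       out[i] = a * in[i];
//     }
//   }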
#pragma once + #include "paddle/phi/kernels/primitive/helper_primitives.h" + +// macro #ifdef PADDLE_WITH_XPU_KP #define KPStream XPUStream @@ -22,11 +25,6 @@ #define __forceinline__ __inline__ #define __restrict__ -#include "paddle/phi/backends/xpu/xpu_context.h" -#include "paddle/phi/kernels/primitive/compute_primitives_xpu2.h" -#include "paddle/phi/kernels/primitive/datamover_primitives_xpu2.h" -#include "paddle/phi/kernels/primitive/functor_primitives_xpu2.h" - #define THREAD_ID_X core_id() #define THREAD_ID_Y 0 #define THREAD_ID_Z 0 @@ -42,11 +40,8 @@ #define GRID_NUM_X cluster_num() #define GRID_NUM_Y 0 #define GRID_NUM_Z 0 + #else -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/primitive/compute_primitives.h" -#include "paddle/phi/kernels/primitive/datamover_primitives.h" -#include "paddle/phi/kernels/primitive/functor_primitives.h" #define KPStream gpuStream_t #define KPDevice phi::GPUContext @@ -67,4 +62,22 @@ #define GRID_NUM_X gridDim.x #define GRID_NUM_Y gridDim.y #define GRID_NUM_Z gridDim.z + +#endif + +// include file +#ifdef PADDLE_WITH_XPU_KP + +#include "paddle/phi/backends/xpu/xpu_context.h" +#include "paddle/phi/kernels/primitive/compute_primitives_xpu2.h" +#include "paddle/phi/kernels/primitive/datamover_primitives_xpu2.h" +#include "paddle/phi/kernels/primitive/functor_primitives_xpu2.h" + +#else + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/primitive/compute_primitives.h" +#include "paddle/phi/kernels/primitive/datamover_primitives.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" + #endif -- GitLab From 0d78e491a77d922102e8493ca68c638b33ebdaed Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Fri, 11 Mar 2022 18:49:02 +0800 Subject: [PATCH 006/176] Submanifold convolution (#40363) submanifold convolution --- .../kernels/sparse/convolution_grad_kernel.h | 5 +- .../phi/kernels/sparse/convolution_kernel.h | 14 +- paddle/phi/kernels/sparse/cpu/convolution.h | 27 ++- .../sparse/cpu/convolution_grad_kernel.cc | 71 ++++-- .../kernels/sparse/cpu/convolution_kernel.cc | 2 + .../cpu/submanifold_convolution_kernel.cu | 30 +++ .../phi/kernels/sparse/gpu/convolution.cu.h | 6 +- .../sparse/gpu/convolution_grad_kernel.cu | 115 +++++---- .../kernels/sparse/gpu/convolution_kernel.cu | 226 ++++++++++++++++-- paddle/phi/tests/api/test_sparse_conv_api.cc | 2 +- .../kernels/test_sparse_conv3d_dev_api.cc | 116 ++++++++- python/paddle/utils/code_gen/sparse_api.yaml | 2 +- .../paddle/utils/code_gen/sparse_bw_api.yaml | 6 +- 13 files changed, 521 insertions(+), 101 deletions(-) create mode 100644 paddle/phi/kernels/sparse/cpu/submanifold_convolution_kernel.cu diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h index f4265d303d7..42bde442e1e 100644 --- a/paddle/phi/kernels/sparse/convolution_grad_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_grad_kernel.h @@ -32,6 +32,7 @@ void Conv3dGradKernel(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, const int groups, + const bool subm, DenseTensor* x_grad, DenseTensor* kernel_grad); @@ -44,7 +45,8 @@ std::vector Conv3dGrad(const Context& dev_ctx, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, - const int groups) { + const int groups, + const bool subm) { DenseTensor x_grad = phi::Empty(dev_ctx, DenseTensorMeta(x.dtype(), {1}, x.layout())); DenseTensor kernel_grad = phi::Empty( @@ -59,6 +61,7 @@ std::vector Conv3dGrad(const 
Context& dev_ctx, dilations, strides, groups, + subm, &x_grad, &kernel_grad); std::vector out(2); diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h index cfb451afdcb..778600a2285 100644 --- a/paddle/phi/kernels/sparse/convolution_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_kernel.h @@ -125,6 +125,7 @@ void Conv3dKernel(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, const int groups, + const bool subm, SparseCooTensor* out, DenseTensor* rulebook); @@ -136,14 +137,23 @@ SparseCooTensor Conv3d(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, const int groups, + const bool subm, DenseTensor* rulebook) { DenseTensor indices = phi::Empty( dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); DenseTensor values = phi::Empty(dev_ctx, DenseTensorMeta(x.dtype(), {1}, x.layout())); SparseCooTensor coo(indices, values, x.dims()); - Conv3dKernel( - dev_ctx, x, kernel, paddings, dilations, strides, groups, &coo, rulebook); + Conv3dKernel(dev_ctx, + x, + kernel, + paddings, + dilations, + strides, + groups, + subm, + &coo, + rulebook); return coo; } diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index bcb6db40788..a5a946dce79 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -39,6 +39,7 @@ void ProductRuleBook(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, const DDim& out_dims, + const bool subm, DenseTensor* rulebook, DenseTensor* counter_per_kernel) { const auto& kernel_dims = kernel.dims(); @@ -59,11 +60,24 @@ void ProductRuleBook(const Context& dev_ctx, const Dims4D c_strides(1, strides[2], strides[1], strides[0]); const Dims4D c_dilations(1, dilations[2], dilations[1], dilations[0]); + std::set hash_in; + if (subm) { + for (int i = 0; i < non_zero_num; i++) { + int batch = indices_ptr[i]; + int in_z = indices_ptr[i + non_zero_num]; + int in_y = indices_ptr[i + 2 * non_zero_num]; + int in_x = indices_ptr[i + 3 * non_zero_num]; + int index = PointToIndex(batch, in_x, in_y, in_z, x_dims); + hash_in.insert(index); + } + } + auto f_calc_rulebook = [&](int* rulebook_ptr) { int kernel_index = 0, rulebook_index = 0; for (int kz = 0; kz < kernel_dims[0]; kz++) { for (int ky = 0; ky < kernel_dims[1]; ky++) { for (int kx = 0; kx < kernel_dims[2]; kx++) { + ++kernel_index; for (int64_t i = 0; i < non_zero_num; i++) { int batch = indices_ptr[i]; int in_z = indices_ptr[i + non_zero_num]; @@ -83,11 +97,19 @@ void ProductRuleBook(const Context& dev_ctx, kx, ky, kz)) { + if (subm) { + int out_index = + PointToIndex(batch, out_x, out_y, out_z, out_dims); + if (hash_in.find(out_index) == hash_in.end()) { + continue; + } + } + if (rulebook_ptr == nullptr) { - counter_ptr[kernel_index] += 1; + counter_ptr[kernel_index - 1] += 1; ++rulebook_len; } else { - rulebook_ptr[rulebook_index] = kernel_index; + rulebook_ptr[rulebook_index] = kernel_index - 1; rulebook_ptr[rulebook_index + rulebook_len] = i; // in_i rulebook_ptr[rulebook_index + rulebook_len * 2] = PointToIndex( @@ -96,7 +118,6 @@ void ProductRuleBook(const Context& dev_ctx, } } } - ++kernel_index; } } } diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc index 6ee265a3296..bb414faef67 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ 
b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -38,6 +38,7 @@ void Conv3dGradKernel(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, const int groups, + const bool subm, DenseTensor* x_grad, DenseTensor* kernel_grad) { const auto& kernel_dims = kernel.dims(); @@ -70,32 +71,72 @@ void Conv3dGradKernel(const Context& dev_ctx, T* d_kernel_ptr = kernel_grad->data(); memset(d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel()); - Gather(x.non_zero_elements().data(), - rulebook_ptr + rulebook_len, - rulebook_len, - in_channels, - in_features_ptr); - Gather(out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, - rulebook_len, - out_channels, - out_grad_features_ptr); - + int half_kernel_size = kernel_size / 2; auto blas = phi::funcs::GetBlas(dev_ctx); + x_grad->Resize(x.non_zero_elements().dims()); + dev_ctx.Alloc(x_grad, x_grad->dtype(), sizeof(T) * x_grad->numel()); + T* x_grad_values_ptr = x_grad->data(); + memset(x_grad_values_ptr, 0, sizeof(T) * x_grad->numel()); + memset(d_x_features_ptr, 0, sizeof(T) * d_x_features.numel()); + std::vector offsets(kernel_size + 1), counter(kernel_size, 0); for (int i = 0; i < rulebook_len; i++) { counter[rulebook_ptr[i]] += 1; } - int offset = 0; + int offset = 0, max_count = 0; for (int i = 0; i < kernel_size; i++) { offsets[i] = offset; offset += counter[i]; + if (i < half_kernel_size) { + max_count = std::max(max_count, counter[i]); + } } offsets[kernel_size] = offset; + if (subm) { + blas.GEMM(CblasTrans, + CblasNoTrans, + x.non_zero_elements().dims()[1], + out_grad.non_zero_elements().dims()[1], + x.non_zero_elements().dims()[0], + static_cast(1), + x.non_zero_elements().data(), + out_grad.non_zero_elements().data(), + static_cast(0), + d_kernel_ptr + half_kernel_size * in_channels * out_channels); + + // call gemm: d_x = out_grad * transpose(kernel) + // (n, out_channels) * (out_channels, in_channels) + T* x_grad_ptr = x_grad->data(); + blas.GEMM(CblasNoTrans, + CblasTrans, + out_grad.non_zero_elements().dims()[0], + in_channels, + out_grad.non_zero_elements().dims()[1], + static_cast(1), + out_grad.non_zero_elements().data(), + kernel.data() + half_kernel_size * in_channels * out_channels, + static_cast(0), + x_grad_ptr); + if (max_count == 0) { + return; + } + } + + Gather(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + rulebook_len, + in_channels, + in_features_ptr); + Gather(out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + rulebook_len, + out_channels, + out_grad_features_ptr); + const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { - if (counter[i] <= 0) { + if (counter[i] <= 0 || (subm && i == half_kernel_size)) { continue; } @@ -136,10 +177,6 @@ void Conv3dGradKernel(const Context& dev_ctx, } // 4. 
scatter - x_grad->Resize(x.non_zero_elements().dims()); - dev_ctx.Alloc(x_grad, x_grad->dtype(), sizeof(T) * x_grad->numel()); - T* x_grad_values_ptr = x_grad->data(); - memset(x_grad_values_ptr, 0, sizeof(T) * x_grad->numel()); Scatter(d_x_features_ptr, rulebook.data() + rulebook_len, rulebook_len, diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index 64ef068e03a..f65e1cf579a 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -35,6 +35,7 @@ void Conv3dKernel(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, const int groups, + const bool subm, SparseCooTensor* out, DenseTensor* rulebook) { // update padding and dilation @@ -63,6 +64,7 @@ void Conv3dKernel(const Context& dev_ctx, dilations, strides, out_dims, + subm, rulebook, &counter_per_kernel); diff --git a/paddle/phi/kernels/sparse/cpu/submanifold_convolution_kernel.cu b/paddle/phi/kernels/sparse/cpu/submanifold_convolution_kernel.cu new file mode 100644 index 00000000000..5f6d24093a4 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/submanifold_convolution_kernel.cu @@ -0,0 +1,30 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/sparse/submanifold_convolution_kernel.h" + +namespace phi { +namespace sparse {} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 03a6aaa6894..8826fd7cf87 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -71,7 +71,8 @@ __global__ void ScatterKernel(const T* input, const int non_zero_num, const int rulebook_len, const int channels, - T* out) { + T* out, + const bool subm = false) { int tid = threadIdx.x + blockIdx.x * blockDim.x; for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) { int indices_i = i / channels; @@ -82,6 +83,9 @@ __global__ void ScatterKernel(const T* input, : unique_value[indices_i + 1]; // max(end-start) = kernel_size T sum = static_cast(0); + if (subm) { + sum = out[indices_i * channels + channels_i]; + } for (int j = start; j < end; j++) { const int out_feature_i = out_index[j]; sum += input[out_feature_i * channels + channels_i]; diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 861f18f36e6..a307ab0f546 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -43,6 +43,7 @@ void Conv3dGradKernel(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, const int groups, + const bool subm, DenseTensor* x_grad, DenseTensor* kernel_grad) { const auto& kernel_dims = kernel.dims(); @@ -69,37 +70,18 @@ void Conv3dGradKernel(const Context& dev_ctx, T* in_features_ptr = in_features.data(); T* d_x_features_ptr = d_x_features.data(); T* out_grad_features_ptr = out_grad_features.data(); - kernel_grad->Resize(kernel_dims); - dev_ctx.Alloc( - kernel_grad, kernel_grad->dtype(), kernel_grad->numel() * sizeof(T)); + kernel_grad->ResizeAndAllocate(kernel_dims); T* d_kernel_ptr = kernel_grad->data(); phi::funcs::SetConstant set_zero; set_zero(dev_ctx, kernel_grad, static_cast(0.0f)); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * in_channels, 1); - GatherKernel<<>>(x.non_zero_elements().data(), - rulebook_ptr + rulebook_len, - in_features_ptr, - rulebook_len, - in_channels); - - config = phi::backends::gpu::GetGpuLaunchConfig1D( - dev_ctx, rulebook_len * out_channels, 1); - GatherKernel<<>>( - out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, - out_grad_features_ptr, - rulebook_len, - out_channels); - + int half_kernel_size = kernel_size / 2; auto blas = phi::funcs::GetBlas(dev_ctx); + x_grad->ResizeAndAllocate(x.non_zero_elements().dims()); + T* x_grad_values_ptr = x_grad->data(); + set_zero(dev_ctx, x_grad, static_cast(0.0f)); + set_zero(dev_ctx, &d_x_features, static_cast(0.0f)); + std::vector offsets(kernel_size + 1), counter(kernel_size, 0), h_counter(rulebook_len, 0); phi::backends::gpu::GpuMemcpyAsync(&h_counter[0], @@ -117,16 +99,72 @@ void Conv3dGradKernel(const Context& dev_ctx, for (int i = 0; i < rulebook_len; i++) { counter[h_counter[i]] += 1; } - int 
offset = 0; + int offset = 0, max_count = 0; for (int i = 0; i < kernel_size; i++) { offsets[i] = offset; offset += counter[i]; + if (i < half_kernel_size) { + max_count = std::max(max_count, counter[i]); + } } offsets[kernel_size] = offset; + if (subm) { + blas.GEMM(CblasTrans, + CblasNoTrans, + x.non_zero_elements().dims()[1], + out_grad.non_zero_elements().dims()[1], + x.non_zero_elements().dims()[0], + static_cast(1), + x.non_zero_elements().data(), + out_grad.non_zero_elements().data(), + static_cast(0), + d_kernel_ptr + half_kernel_size * in_channels * out_channels); + + // call gemm: d_x = out_grad * transpose(kernel) + // (n, out_channels) * (out_channels, in_channels) + T* x_grad_ptr = x_grad->data(); + blas.GEMM(CblasNoTrans, + CblasTrans, + out_grad.non_zero_elements().dims()[0], + in_channels, + out_grad.non_zero_elements().dims()[1], + static_cast(1), + out_grad.non_zero_elements().data(), + kernel.data() + half_kernel_size * in_channels * out_channels, + static_cast(0), + x_grad_ptr); + if (max_count == 0) { + return; + } + } + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * in_channels, 1); + GatherKernel<<>>(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + in_features_ptr, + rulebook_len, + in_channels); + + config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, rulebook_len * out_channels, 1); + GatherKernel<<>>( + out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + out_grad_features_ptr, + rulebook_len, + out_channels); + const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { - if (counter[i] <= 0) { + if (counter[i] <= 0 || (subm && i == half_kernel_size)) { continue; } @@ -167,19 +205,11 @@ void Conv3dGradKernel(const Context& dev_ctx, } // 4. scatter - x_grad->Resize(x.non_zero_elements().dims()); - dev_ctx.Alloc(x_grad, x_grad->dtype(), sizeof(T) * x_grad->numel()); - T* x_grad_values_ptr = x_grad->data(); - - DenseTensor out_index = phi::Empty( - dev_ctx, - DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); - DenseTensor unique_key = phi::Empty( - dev_ctx, - DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); - DenseTensor unique_value = phi::Empty( - dev_ctx, - DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); + x_grad->ResizeAndAllocate(x.non_zero_elements().dims()); + DenseTensorMeta index_meta(DataType::INT32, {rulebook_len}, DataLayout::NCHW); + DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); + DenseTensor unique_key = phi::Empty(dev_ctx, std::move(index_meta)); + DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); SortedAndUniqueIndex(dev_ctx, rulebook_ptr + rulebook_len, @@ -200,7 +230,8 @@ void Conv3dGradKernel(const Context& dev_ctx, x.nnz(), rulebook_len, in_channels, - x_grad_values_ptr); + x_grad_values_ptr, + subm); } } // namespace sparse diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 4a533d9d1d5..94186600f1e 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -24,6 +24,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" @@ -32,6 +33,34 @@ limitations under the License. */ namespace phi { namespace sparse { +__global__ void SetFlagAndUpdateCounterKernel(const int* indexs, + const int n, + const int rulebook_len, + const int kernel_size, + int* rulebook_ptr, + int* counter_ptr) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ int cache_count[]; // kernel_size + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + cache_count[i] = 0; + } + __syncthreads(); + + for (int i = tid; i < n; i += gridDim.x * blockDim.x) { + int index = indexs[i]; + int kernel_index = rulebook_ptr[index]; + rulebook_ptr[index + rulebook_len] = -1; + rulebook_ptr[index + 2 * rulebook_len] = -1; + rulebook_ptr[index] = -1; + atomicAdd(&cache_count[kernel_index], 1); + } + __syncthreads(); + + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + atomicSub(&counter_ptr[i], cache_count[i]); + } +} + /** * @brief: update the out index and indices * unique_keys: save the index of the output feature list @@ -95,8 +124,10 @@ __global__ void ProductRuleBookKernel(const int* x_indices, const Dims4D paddings, const Dims4D dilations, const Dims4D strides, + const bool subm, int* rulebook, - int* counter) { + int* counter, + int* in_indexs) { int tid = threadIdx.x + blockIdx.x * blockDim.x; extern __shared__ int counter_buf[]; // kernel_size const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; @@ -108,13 +139,16 @@ __global__ void ProductRuleBookKernel(const int* x_indices, for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { int kernel_index = 0; + int batch = x_indices[i]; + int in_z = x_indices[i + non_zero_num]; + int in_y = x_indices[i + 2 * non_zero_num]; + int in_x = x_indices[i + 3 * non_zero_num]; + if (subm) { + in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); + } for (int kz = 0; kz < kernel_dims[1]; kz++) { for (int ky = 0; ky < kernel_dims[2]; ky++) { for (int kx = 0; kx < kernel_dims[3]; kx++) { - int batch = x_indices[i]; - int in_z = x_indices[i + non_zero_num]; - int in_y = x_indices[i + 2 * non_zero_num]; - int in_x = x_indices[i + 3 * non_zero_num]; int in_i = -1, out_index = -1, kernel_i = -1; if (Check(x_dims, kernel_dims, @@ -182,6 +216,7 @@ int ProductRuleBook(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, const DDim& out_dims, + const bool subm, DenseTensor* rulebook, DenseTensor* counter_per_kernel, DenseTensor* offsets_per_kernel, @@ -195,13 +230,14 @@ int ProductRuleBook(const Context& dev_ctx, const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); const int* indices_ptr = non_zero_indices.data(); + DenseTensor in_indexs = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); int* counter_ptr = counter_per_kernel->data(); int* offsets_ptr = offsets_per_kernel->data(); int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; const int rulebook_rows = 3; const int rulebook_cols = kernel_size * non_zero_num; rulebook->ResizeAndAllocate({rulebook_rows, rulebook_cols}); - dev_ctx.Alloc(rulebook, rulebook->dtype(), sizeof(int) * rulebook->numel()); int* rulebook_ptr 
= rulebook->data(); const auto x_dims = x.dims(); @@ -229,8 +265,10 @@ int ProductRuleBook(const Context& dev_ctx, d_paddings, d_dilations, d_strides, + subm, rulebook_ptr, - counter_ptr); + counter_ptr, + in_indexs.data()); // 2. remove -1 #ifdef PADDLE_WITH_HIP @@ -242,6 +280,144 @@ int ProductRuleBook(const Context& dev_ctx, rulebook_ptr + rulebook_rows * rulebook_cols, -1); + DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); + int rulebook_len = 0; + phi::backends::gpu::GpuMemcpyAsync( + &rulebook_len, + rulebook_ptr + 3 * kernel_size * non_zero_num - 1, + sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + rulebook_len /= 3; + dev_ctx.Wait(); + + if (subm) { + // At present, hashtable is not used to map the input and output indexes. + // At present, the intermediate output index is generated by normal + // convolution, + // and then the intermediate output index is subtracted from the input index + // to obain the rulebook. + // get difference + int32_t* A_key_ptr = rulebook_ptr + 2 * rulebook_len; + int32_t* B_key_ptr = in_indexs.data(); + DenseTensor A_val = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); + DenseTensor B_val = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); + phi::IndexKernel>( + dev_ctx, &A_val, kps::IdentityFunctor()); + phi::IndexKernel>( + dev_ctx, &B_val, kps::IdentityFunctor()); + DenseTensor key_result = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT32, {rulebook_len + 1}, DataLayout::NCHW)); + DenseTensor val_result = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); + +#ifdef PADDLE_WITH_HIP + thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), +#endif + counter_ptr, + counter_ptr + kernel_size, + offsets_ptr); + std::vector offsets(kernel_size, 0); + // TODO(zhangkaihuo): used unified memcpy interface + phi::backends::gpu::GpuMemcpyAsync(offsets.data(), + offsets_ptr, + kernel_size * sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + dev_ctx.Wait(); + + thrust::pair end; + // Because set_diff does not support duplicate data, set_diff is performed + // separately for each segment of data. + // TODO(zhangkaihuo): Using hashtable here may get better performance, + // further tests ared needed. + for (int i = 0; i < kernel_size; i++) { + int start = offsets[i]; + int stop = i == kernel_size - 1 ? rulebook_len : offsets[i + 1]; + int* key_result_start = (i == 0 ? key_result.data() : end.first); + int* val_result_start = i == 0 ? 
val_result.data() : end.second; + end = +#ifdef PADDLE_WITH_HIP + thrust::set_difference_by_key(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::set_difference_by_key(thrust::cuda::par.on(dev_ctx.stream()), +#endif + A_key_ptr + start, + A_key_ptr + stop, + B_key_ptr, + B_key_ptr + x.nnz(), + A_val.data() + start, + B_val.data(), + key_result_start, + val_result_start); + } + + DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + key_result.data(), + end.first, + key_result.data() + rulebook_len); + int len = 0; + phi::backends::gpu::GpuMemcpyAsync(&len, + key_result.data() + rulebook_len, + sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + dev_ctx.Wait(); + // set the diff value = -1, and update counter + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len, 1); + SetFlagAndUpdateCounterKernel<<>>(val_result.data(), + len, + rulebook_len, + kernel_size, + rulebook_ptr, + counter_ptr); +// remove -1 +#ifdef PADDLE_WITH_HIP + int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#else + int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), +#endif + rulebook_ptr, + rulebook_ptr + 3 * rulebook_len, + -1); + DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + rulebook_ptr, last, key_result.data() + rulebook_len); + phi::backends::gpu::GpuMemcpyAsync(&rulebook_len, + key_result.data() + rulebook_len, + sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + dev_ctx.Wait(); + rulebook_len /= 3; + } + #ifdef PADDLE_WITH_HIP thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), #else @@ -274,23 +450,14 @@ int ProductRuleBook(const Context& dev_ctx, cudaMemcpyDeviceToHost, dev_ctx.stream()); #endif - dev_ctx.Wait(); - int rulebook_len = - (*h_counter)[kernel_size - 1] + (*h_offsets)[kernel_size - 1]; rulebook->Resize({rulebook_rows, rulebook_len}); // 3. 
sorted or merge the out index out_index->ResizeAndAllocate({rulebook_len}); unique_value->ResizeAndAllocate({rulebook_len}); unique_key->ResizeAndAllocate({rulebook_len}); - dev_ctx.Alloc( - out_index, out_index->dtype(), sizeof(int) * out_index->numel()); int* out_index_ptr = out_index->data(); - dev_ctx.Alloc( - unique_value, unique_value->dtype(), sizeof(int) * unique_value->numel()); int* unique_value_ptr = unique_value->data(); - dev_ctx.Alloc( - unique_key, unique_key->dtype(), sizeof(int) * unique_key->numel()); int* unique_key_ptr = unique_key->data(); int* new_end = SortedAndUniqueIndex(dev_ctx, @@ -364,6 +531,7 @@ void Conv3dKernel(const Context& dev_ctx, const std::vector& dilations, const std::vector& strides, const int groups, + const bool subm, SparseCooTensor* out, DenseTensor* rulebook) { // update padding and dilation @@ -389,20 +557,28 @@ void Conv3dKernel(const Context& dev_ctx, DataType::INT32, {kernel_size}, DataLayout::NCHW); DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(offsets_meta)); - DenseTensor out_index = phi::Empty( - dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); - DenseTensor unique_key = phi::Empty( - dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); - DenseTensor unique_value = phi::Empty( - dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); + DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW); + DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); + DenseTensor unique_key = phi::Empty(dev_ctx, std::move(index_meta)); + DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); + + std::vector subm_paddings(paddings), subm_strides(strides); + if (subm) { + auto kernel_dims = kernel.dims(); + for (int i = 0; i < paddings.size(); i++) { + subm_paddings[i] = kernel_dims[i] / 2; + subm_strides[i] = 1; + } + } int n = ProductRuleBook(dev_ctx, x, kernel, - paddings, + subm_paddings, dilations, - strides, + subm_strides, out_dims, + subm, rulebook, &counter_per_kernel, &offsets_per_kernel, @@ -428,6 +604,8 @@ void Conv3dKernel(const Context& dev_ctx, phi::Empty(dev_ctx, std::move(out_features_meta)); T* in_features_ptr = in_features.data(); T* out_features_ptr = out_features.data(); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, &out_features, static_cast(0.0f)); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); diff --git a/paddle/phi/tests/api/test_sparse_conv_api.cc b/paddle/phi/tests/api/test_sparse_conv_api.cc index 76cb01d8a8b..7c4aa164259 100644 --- a/paddle/phi/tests/api/test_sparse_conv_api.cc +++ b/paddle/phi/tests/api/test_sparse_conv_api.cc @@ -78,7 +78,7 @@ void TestConv3dBase(const std::vector& indices, if (!std::is_same::value) { auto outs = paddle::experimental::sparse::conv3d( - x, weight, paddings, dilations, strides, 1); + x, weight, paddings, dilations, strides, 1, false); auto out = std::dynamic_pointer_cast( std::get<0>(outs).impl()); diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index c1a8b853b32..37a69a176c6 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -64,7 +64,8 @@ void TestConv3dBase(const std::vector& indices, const float diff = 1e-3, const bool backward = false, const std::vector features_grad = {}, - const std::vector kernel_grad = {}) { + 
const std::vector kernel_grad = {}, + const bool subm = false) { phi::CPUContext dev_ctx_cpu; dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() @@ -114,6 +115,7 @@ void TestConv3dBase(const std::vector& indices, dilations, strides, 1, + subm, &rulebook); ASSERT_EQ(correct_out_dims.size(), out.dims().size()); @@ -138,7 +140,8 @@ void TestConv3dBase(const std::vector& indices, paddings, dilations, strides, - 1); + 1, + subm); f_verify(grads[0].data(), features_grad); f_verify(grads[1].data(), kernel_grad); } @@ -191,6 +194,7 @@ void TestConv3dBase(const std::vector& indices, dilations, strides, 1, + subm, &d_rulebook); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); @@ -235,7 +239,8 @@ void TestConv3dBase(const std::vector& indices, paddings, dilations, strides, - 1); + 1, + subm); DenseTensor h_features_grad = phi::Empty( dev_ctx_cpu, DenseTensorMeta(grads[0].dtype(), grads[0].dims(), grads[0].layout())); @@ -266,7 +271,8 @@ void TestConv3d(const std::vector& indices, const float diff = 1e-3, const bool backward = false, const std::vector features_grad = {}, - const std::vector kernel_grad = {}) { + const std::vector kernel_grad = {}, + const bool subm = false) { // test float TestConv3dBase(indices, features, @@ -283,7 +289,8 @@ void TestConv3d(const std::vector& indices, diff, backward, features_grad, - kernel_grad); + kernel_grad, + subm); // test double TestConv3dBase(indices, cast(features), @@ -300,7 +307,8 @@ void TestConv3d(const std::vector& indices, diff, backward, cast(features_grad), - cast(kernel_grad)); + cast(kernel_grad), + subm); } TEST(DEV_API, sparse_conv3d) { @@ -661,5 +669,101 @@ TEST(DEV_API, sparse_conv3d_backward) { kernel_grad); } +TEST(DEV_API, sparse_conv2d_subm) { + const int in_channels = 1; + const int out_channels = 1; + DDim x_dims = {1, 1, 4, 5, in_channels}; + DDim kernel_dims = {1, 3, 3, in_channels, out_channels}; + DDim out_dims = {1, 1, 4, 5, out_channels}; + std::vector paddings = {0, 1, 1}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 4; + std::vector indices_flatten = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 3, 2, 2, 3}; + + std::vector features = {0.8854, 0.6505, -0.1999, 0.3583}; + // 3*3*3=27 + std::vector kernel = { + 0.9364, 0.9460, 0.6564, 0.7999, 0.2013, 0.3812, 0.5474, 0.1016, 0.3368}; + + std::vector out_indices_flatten = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 3, 2, 2, 3}; + + std::vector out_features = {0.1782, 0.2313, 0.7117, 0.5214}; + + std::vector features_grad = {0.0359, 1.2080, 0.5838, 0.4541}; + std::vector kernel_grad = { + 0.3391, 0.4630, 0.0000, -0.1042, 0.3528, 0.2550, 0.0000, -0.0462, 0.0829}; + + TestConv3d(indices_flatten, + features, + x_dims, + kernel, + kernel_dims, + out_indices_flatten, + out_features, + out_dims, + non_zero_num, + paddings, + strides, + dilations, + 1e-3, + true, + features_grad, + kernel_grad, + true); +} + +TEST(DEV_API, sparse_conv3d_subm) { + const int in_channels = 1; + const int out_channels = 1; + DDim x_dims = {1, 4, 4, 5, in_channels}; + DDim kernel_dims = {3, 3, 3, in_channels, out_channels}; + DDim out_dims = {1, 4, 4, 5, out_channels}; + std::vector paddings = {1, 1, 1}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 3; + std::vector indices_flatten = {0, 0, 0, 1, 3, 3, 2, 0, 2, 0, 3, 1}; + + std::vector features = {-0.9578, 0.1572, 0.1036}; + // 3*3*3=27 + std::vector kernel = { + 0.1367, 0.4534, 0.2138, 0.8264, 0.7534, 0.3270, 0.2880, 0.1562, 
0.7770, + 0.6902, 0.1981, 0.1369, 0.6582, 0.7582, 0.5640, 0.8894, 0.7350, 0.1845, + 0.6892, 0.3654, 0.6076, 0.0326, 0.8412, 0.5289, 0.9824, 0.8235, 0.9802}; + + std::vector out_indices_flatten = {0, 0, 0, 1, 3, 3, 2, 0, 2, 0, 3, 1}; + + std::vector out_features = {-0.7262, 0.1192, 0.0785}; + + std::vector features_grad = {-0.5506, 0.0904, 0.0595}; + std::vector kernel_grad = { + 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, + 0.0000, 0.0000, 0.0000, 0.0000, 0.7224, 0.0000, 0.0000, 0.0000, 0.0000, + 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000}; + + TestConv3d(indices_flatten, + features, + x_dims, + kernel, + kernel_dims, + out_indices_flatten, + out_features, + out_dims, + non_zero_num, + paddings, + strides, + dilations, + 1e-3, + true, + features_grad, + kernel_grad, + true); +} + } // namespace tests } // namespace phi diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index 2f233a2df35..9c859022e8a 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -1,5 +1,5 @@ - api : conv3d - args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups) + args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) output : Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) kernel : func : sparse_conv3d diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/python/paddle/utils/code_gen/sparse_bw_api.yaml index 8c9f02ebb31..6532f103cbf 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api.yaml +++ b/python/paddle/utils/code_gen/sparse_bw_api.yaml @@ -1,6 +1,6 @@ - backward_api : conv3d_grad - forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) - args : (Tensor x, Tensor kernel, Tensor rulebook, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups) + forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) + args : (Tensor x, Tensor kernel, Tensor rulebook, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) output : Tensor(x_grad@DenseTensor), Tensor(kernel_grad@DenseTensor) kernel : - func : sparse_conv_grad + func : sparse_conv3d_grad -- GitLab From f70f5e4fdafdf31276d9adee02a3d41e0600b778 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Fri, 11 Mar 2022 18:58:28 +0800 Subject: [PATCH 007/176] fix the bug for processgroup_hccl compiling (#40437) --- .../collective/ProcessGroupHCCL.cc | 8 +++---- .../distributed/collective/ProcessGroupHCCL.h | 23 ------------------- paddle/fluid/pybind/CMakeLists.txt | 2 +- 3 files changed, 4 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc index 84f5ca48d25..2deeb7ca030 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc @@ -139,11 +139,9 @@ bool ProcessGroupHCCL::HCCLTask::IsCompleted() { // TODO(sandyhouse): Add timeout for wait, now timeout unused bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) { SynchronizeStreams(); - if (FLAGS_hccl_blocking_wait) { - // NOTE(sandyhouse): It will block host for sync - while 
(!IsCompleted()) { - std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); - } + // NOTE(sandyhouse): It will block host for sync + while (!IsCompleted()) { + std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); } return true; } diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h index f2376b4eed7..83d509be2cd 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h @@ -84,29 +84,6 @@ class ProcessGroupHCCL : public ProcessGroup { std::vector& tensors, const BroadcastOptions& = BroadcastOptions()) override; - std::shared_ptr Barrier( - const BarrierOptions& = BarrierOptions()) override; - - std::shared_ptr Send(std::vector& tensors, - int dst_rank) override; - - std::shared_ptr Recv(std::vector& tensors, - int src_rank) override; - - std::shared_ptr AllGather( - std::vector& in_tensors, - std::vector& out_tensors) override; - - std::shared_ptr AllToAll( - std::vector& in, std::vector& out) override; - - std::shared_ptr Reduce( - std::vector& tensors, const ReduceOptions& opts) override; - - std::shared_ptr Scatter(std::vector& in_tensors, - std::vector& out_tensors, - const ScatterOptions&) override; - protected: virtual std::shared_ptr CreateTask( std::vector places, int rank, CommType opType, diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 6c8fc450cd4..8ee22590b6d 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -92,7 +92,7 @@ if(NOT ON_INFER) if (WITH_GLOO) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo) endif() - if(WITH_ASCEND) + if(WITH_ASCEND_CL) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_hccl) endif() set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc) -- GitLab From d1811010bbe0f2666696d3403be2a45a0bfdd7fb Mon Sep 17 00:00:00 2001 From: Tomasz Socha Date: Fri, 11 Mar 2022 14:35:51 +0100 Subject: [PATCH 008/176] Use OneDNN's LayerNorm kernel (#40418) --- paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index 90e6a36220a..812c55cdd50 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -150,4 +150,5 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { // TODO(jczaja): Enable FP32 when performance is good namespace ops = paddle::operators; REGISTER_OP_KERNEL(layer_norm, MKLDNN, ::paddle::platform::CPUPlace, + ops::LayerNormMKLDNNOpKernel, ops::LayerNormMKLDNNOpKernel); -- GitLab From 70f83f1d401bcb98afc1caeb1df8d0d154280ec6 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sat, 12 Mar 2022 08:23:30 +0800 Subject: [PATCH 009/176] Fix eager benchmark test failed (#40468) * fix eager benchmark test failed * fix test_tracer failed --- .../fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc | 2 ++ .../fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc | 2 ++ .../fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc | 2 ++ .../fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc | 2 ++ paddle/fluid/imperative/tests/test_tracer.cc | 1 + 5 files changed, 9 insertions(+) diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index 
adb3246ee8c..056c7102f66 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -40,6 +40,8 @@ PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT); using namespace egr; // NOLINT using namespace egr_utils_api; // NOLINT diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index bd70e84d9b4..5e790389819 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -44,6 +44,8 @@ PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); TEST(Benchmark, EagerScaleCUDA) { eager_test::InitEnv(paddle::platform::CUDAPlace()); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index a9d297c1c64..b4b47a85f66 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -41,6 +41,8 @@ PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT); namespace paddle { namespace imperative { diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index bd9eaa09ca9..a3e393b0394 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -43,6 +43,8 @@ PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); namespace paddle { namespace imperative { diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index 2e38bd77cf6..f754c6fdd0e 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -34,6 +34,7 @@ PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); #endif namespace imperative = paddle::imperative; -- GitLab From 69a01c47339919d9c286a0685410c81517690ed3 Mon Sep 17 00:00:00 2001 From: jiangcheng Date: Sat, 12 Mar 2022 11:06:40 +0800 Subject: [PATCH 010/176] fix NetBuilder API Name bug in cinn_lib_test (#40392) * fix NetBuilder API Name bug in cinn_lib_test * update cinn version to newest --- cmake/external/cinn.cmake | 2 +- paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc | 8 ++++---- 2 files changed, 5 insertions(+), 5 
deletions(-) diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake index 41b90345c8c..d3f330ba9dd 100644 --- a/cmake/external/cinn.cmake +++ b/cmake/external/cinn.cmake @@ -26,7 +26,7 @@ add_definitions(-w) ###################################### include(ExternalProject) set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN) -set(CINN_GIT_TAG release/v0.1) +set(CINN_GIT_TAG 56879b637e2c4db19091eedad03d7cc674e092a2) set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION} -DWITH_CUDA=${WITH_GPU} -DWITH_CUDNN=${WITH_GPU} diff --git a/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc index 23cb653fef2..7a7a7b2798f 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc @@ -45,8 +45,8 @@ Program CreateAddProgram() { NetBuilder builder("net_builder"); auto a = builder.CreateInput(Float(32), {M, N}); auto b = builder.CreateInput(Float(32), {M, N}); - auto c = builder.add(a, b); - auto d = builder.add(a, c); + auto c = builder.Add(a, b); + auto d = builder.Add(a, c); auto program = builder.Build(); return program; @@ -116,8 +116,8 @@ TEST(net_build, program_execute_fc) { auto w = builder.CreateInput(Float(32), {N, K}, "W"); // weight auto b = builder.CreateInput(Float(32), {N}, "B"); // bias - auto mul_out = builder.mul(a, w, 2, 1); - auto add_out = builder.add(mul_out, b); + auto mul_out = builder.Mul(a, w, 2, 1); + auto add_out = builder.Add(mul_out, b); auto program = builder.Build(); #ifdef PADDLE_WITH_CUDA -- GitLab From 573ca984cb17eb6f42112e9808a53cb094bca36c Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Sat, 12 Mar 2022 11:39:18 +0800 Subject: [PATCH 011/176] [custom kernel] fix static object de-initialize bug (#40414) * [custom kernel] fix static object de-initialize bug * fix text * fix text * refine log info --- paddle/fluid/pybind/pybind.cc | 7 +++++++ paddle/phi/core/custom_kernel.cc | 9 +++++++-- python/paddle/fluid/__init__.py | 2 ++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 1c5b30fe087..98880294a27 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -729,6 +729,13 @@ PYBIND11_MODULE(core_noavx, m) { lib[string]: the libarary, could be 'phi', 'fluid' and 'all'. )DOC"); + // NOTE(Aganlengzi): KernelFactory static instance is initialized BEFORE + // plugins are loaded for custom kernels, but de-initialized AFTER they are + // unloaded. We need manually clear symbols(may contain plugins' symbols) + // stored in this static instance to avoid illegal memory access. + m.def("clear_kernel_factory", + []() { phi::KernelFactory::Instance().kernels().clear(); }); + // NOTE(zjl): ctest would load environment variables at the beginning even // though we have not `import paddle.fluid as fluid`. So we add this API // to enable eager deletion mode in unittest. 
diff --git a/paddle/phi/core/custom_kernel.cc b/paddle/phi/core/custom_kernel.cc index bc317da8d98..48778bb38e5 100644 --- a/paddle/phi/core/custom_kernel.cc +++ b/paddle/phi/core/custom_kernel.cc @@ -33,6 +33,10 @@ void CustomKernelMap::RegisterCustomKernel(const std::string& name, void CustomKernelMap::RegisterCustomKernels() { VLOG(3) << "Size of custom_kernel_map: " << kernels_.size(); + if (kernels_.size() <= 0) { + LOG(INFO) << "No custom kernel info found in loaded lib(s)."; + return; + } auto& kernels = KernelFactory::Instance().kernels(); for (auto& pair : kernels_) { PADDLE_ENFORCE_NE( @@ -60,9 +64,10 @@ void CustomKernelMap::RegisterCustomKernels() { << info_pair.first << "] to Paddle. It will be used like native ones."; } - kernels_[pair.first].clear(); } - LOG(INFO) << "Successed in loading custom kernels."; + LOG(INFO) << "Successed in loading " << kernels_.size() + << " custom kernel(s) from loaded lib(s), will be " + << "used like native ones."; kernels_.clear(); } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 997075590e5..7480909a2d8 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -226,3 +226,5 @@ if core.is_compiled_with_npu(): atexit.register(core.npu_finalize) # NOTE(Aurelius84): clean up ExecutorCacheInfo in advance manually. atexit.register(core.clear_executor_cache) +# NOTE(Aganlengzi): clean up KernelFactory in advance manually. +atexit.register(core.clear_kernel_factory) -- GitLab From 39de9b8ab638ac2902b082ece3b58d215eb4f7d9 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Sat, 12 Mar 2022 11:55:53 +0800 Subject: [PATCH 012/176] [PHI] Move forward kernel of roi_align into phi (#40382) * move roi_align kernel to phi * fix bug of roi_align xpu --- paddle/fluid/operators/roi_align_op.cc | 6 +- paddle/fluid/operators/roi_align_op.cu | 199 ------------- paddle/fluid/operators/roi_align_op.h | 269 ----------------- paddle/fluid/operators/roi_align_op_npu.cc | 2 +- paddle/fluid/operators/roi_align_op_xpu.cc | 5 +- paddle/phi/kernels/cpu/roi_align_kernel.cc | 318 +++++++++++++++++++++ paddle/phi/kernels/gpu/roi_align_kernel.cu | 255 +++++++++++++++++ paddle/phi/kernels/gpu/scale_kernel.cu | 3 +- paddle/phi/kernels/roi_align_kernel.h | 34 +++ paddle/phi/ops/compat/roi_align_sig.cc | 32 +++ 10 files changed, 646 insertions(+), 477 deletions(-) create mode 100644 paddle/phi/kernels/cpu/roi_align_kernel.cc create mode 100644 paddle/phi/kernels/gpu/roi_align_kernel.cu create mode 100644 paddle/phi/kernels/roi_align_kernel.h create mode 100644 paddle/phi/ops/compat/roi_align_sig.cc diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index 5627b4f229e..ac0cd75237b 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -226,11 +226,7 @@ REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker, ops::ROIAlignGradMaker); REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp, ops::RoiAlignGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - roi_align, - ops::CPUROIAlignOpKernel, - ops::CPUROIAlignOpKernel, - ops::CPUROIAlignOpKernel); + REGISTER_OP_CPU_KERNEL( roi_align_grad, ops::CPUROIAlignGradOpKernel, diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index 18941d10e93..1a2e64cd45c 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -33,43 +33,6 @@ static inline int NumBlocks(const int N) { kNumMaxinumNumBlocks); } -template 
-__device__ T BilinearInterpolate(const T* input_data, const int height, - const int width, T y, T x) { - if (y < -1.0 || y > height || x < -1.0 || x > width) { - return 0; - } - y = y <= 0 ? 0 : y; - x = x <= 0 ? 0 : x; - int y_low = static_cast(y); - int x_low = static_cast(x); - int y_high; - int x_high; - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = static_cast(y_low); - } else { - y_high = y_low + 1; - } - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = static_cast(x_low); - } else { - x_high = x_low + 1; - } - T ly = y - y_low, lx = x - x_low; - T hy = 1. - ly, hx = 1. - lx; - - T v1 = input_data[y_low * width + x_low]; - T v2 = input_data[y_low * width + x_high]; - T v3 = input_data[y_high * width + x_low]; - T v4 = input_data[y_high * width + x_high]; - T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return val; -} - template __device__ void BilinearInterpolateGradient(const int height, const int width, T y, T x, T* w1, T* w2, T* w3, @@ -102,65 +65,6 @@ __device__ void BilinearInterpolateGradient(const int height, const int width, return; } -template -__global__ void GPUROIAlignForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int sampling_ratio, int* roi_batch_id_data, T* output_data, - const bool continuous_coordinate) { - CUDA_KERNEL_LOOP(i, nthreads) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - - const T* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = roi_batch_id_data[n]; - - T roi_offset = continuous_coordinate ? static_cast(0.5) : 0; - T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; - T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; - T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; - T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - if (!continuous_coordinate) { - roi_width = max(roi_width, static_cast(1.)); - roi_height = max(roi_height, static_cast(1.)); - } - - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - const T* offset_input_data = - input_data + (roi_batch_ind * channels + c) * height * width; - - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = - (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); - const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); - T output_val = 0; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_ymin + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_xmin + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - T val = BilinearInterpolate(offset_input_data, height, width, y, x); - output_val += val; - } - } - output_val /= count; - output_data[i] = output_val; - } -} - template __global__ void GPUROIAlignBackward( const int nthreads, const T* input_rois, const T* out_grad, @@ -236,105 +140,6 @@ __global__ void GPUROIAlignBackward( } } -template -class GPUROIAlignOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - int rois_num = rois->dims()[0]; - - if (rois_num == 0) return; - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; -#ifdef WITH_NV_JETSON - platform::ChangeThreadNum(ctx.cuda_device_context(), &threads, 256); -#endif - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and imgs " - "batch_size must be the same. But received rois_batch_size = %d, " - "batch_size = %d", - rois_batch_size, batch_size)); - - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto lod = rois->lod(); - PADDLE_ENFORCE_EQ( - lod.empty(), false, - platform::errors::InvalidArgument("Input(ROIs) in ROIAlignOp does " - "not contain LoD information.")); - auto rois_lod = lod.back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and batch size " - "of images must be the same. But received rois batch size = %d, " - "and images batch size = %d", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The actual number of rois and the number of rois " - "provided from Input(RoIsLoD) in RoIAlign must be the same." 
- " But received actual number of rois is %d, and the number " - "of rois from RoIsLoD is %d", - rois_num, rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - GPUROIAlignForward<<>>( - output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, sampling_ratio, roi_id_data, - out->mutable_data(ctx.GetPlace()), aligned); - } -}; - template class GPUROIAlignGradOpKernel : public framework::OpKernel { public: @@ -416,10 +221,6 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roi_align, - ops::GPUROIAlignOpKernel, - ops::GPUROIAlignOpKernel); REGISTER_OP_CUDA_KERNEL( roi_align_grad, ops::GPUROIAlignGradOpKernel, diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h index e71099ed99f..589e35e4ab7 100644 --- a/paddle/fluid/operators/roi_align_op.h +++ b/paddle/fluid/operators/roi_align_op.h @@ -23,152 +23,6 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -namespace { // NOLINT -constexpr size_t get_offset(size_t x, size_t y, size_t width) { - return y * width + x; -} - -template -struct offsets_and_ratios { - offsets_and_ratios() = default; - offsets_and_ratios(std::size_t xy, std::size_t xY, std::size_t Xy, - std::size_t XY, T xy_ratio, T xY_ratio, T Xy_ratio, - T XY_ratio) - : xy(xy), - xY(xY), - Xy(Xy), - XY(XY), - xy_ratio(xy_ratio), - xY_ratio(xY_ratio), - Xy_ratio(Xy_ratio), - XY_ratio(XY_ratio) {} - - std::size_t xy = 0; - std::size_t xY = 0; - std::size_t Xy = 0; - std::size_t XY = 0; - T xy_ratio = 0.0f; - T xY_ratio = 0.0f; - T Xy_ratio = 0.0f; - T XY_ratio = 0.0f; -}; - -template -std::vector> get_indexes_and_ratios( - std::size_t width, std::size_t height, const T roi_width, - const T roi_height, const T roi_xmin, const T roi_ymin, - std::size_t pooled_width, std::size_t roi_bin_grid_w, - std::size_t pooled_height, std::size_t roi_bin_grid_h) { - const auto ind_num = - pooled_width * roi_bin_grid_w * pooled_height * roi_bin_grid_h; - - std::vector> interpolation_cords; - interpolation_cords.reserve(ind_num); - - const auto bin_w = roi_width / pooled_width; - const auto bin_h = roi_height / pooled_height; - - for (std::size_t py = 0; py < pooled_height; py++) { - for (std::size_t px = 0; px < pooled_width; px++) { - for (std::size_t iy = 0; iy < roi_bin_grid_h; iy++) { - // calculate x of sample points - auto y = - roi_ymin + - bin_h * (py + - static_cast(iy + .5f) / static_cast(roi_bin_grid_h)); - for (std::size_t ix = 0; ix < roi_bin_grid_w; ix++) { - // calculate x of sample points - auto x = roi_xmin + - bin_w * (px + - static_cast(ix + .5f) / - static_cast(roi_bin_grid_w)); - - // deal with elements out of map - if (y < -1.0 || y > height || x < -1.0 || x > width) { - interpolation_cords.emplace_back(); - continue; - } - y = y <= 0 ? 0 : y; - x = x <= 0 ? 
0 : x; - - std::size_t x_low_index = static_cast(x); - std::size_t x_high_index; - if (x_low_index >= width - 1) { - x_high_index = x_low_index = width - 1; - x = static_cast(x_low_index); - } else { - x_high_index = x_low_index + 1; - } - T x_ratio = x_high_index - x; - - std::size_t y_low_index = static_cast(y); - std::size_t y_high_index; - if (y_low_index >= height - 1) { - y_high_index = y_low_index = height - 1; - y = static_cast(y_low_index); - } else { - y_high_index = y_low_index + 1; - } - T y_ratio = y_high_index - y; - - auto xy = get_offset(x_low_index, y_low_index, width); - auto xY = get_offset(x_low_index, y_high_index, width); - auto Xy = get_offset(x_high_index, y_low_index, width); - auto XY = get_offset(x_high_index, y_high_index, width); - - auto xy_ratio = x_ratio * y_ratio; - auto xY_ratio = x_ratio * (1 - y_ratio); - auto Xy_ratio = (1 - x_ratio) * y_ratio; - auto XY_ratio = (1 - x_ratio) * (1 - y_ratio); - - interpolation_cords.emplace_back(xy, xY, Xy, XY, xy_ratio, xY_ratio, - Xy_ratio, XY_ratio); - } - } - } - } - return interpolation_cords; -} // namespace - -template -void interpolate(std::vector& interpolated_values, // NOLINT - const std::vector>& interpolation_cords, - const T* data) { - for (auto& ic : interpolation_cords) { - auto xlyl_offset = ic.xy; - auto xhyl_offset = ic.Xy; - auto xlyh_offset = ic.xY; - auto xhyh_offset = ic.XY; - - auto xlyl_ratio = ic.xy_ratio; - auto xhyl_ratio = ic.Xy_ratio; - auto xlyh_ratio = ic.xY_ratio; - auto xhyh_ratio = ic.XY_ratio; - - interpolated_values.emplace_back( - xlyl_ratio * data[xlyl_offset] + xhyl_ratio * data[xhyl_offset] + - xlyh_ratio * data[xlyh_offset] + xhyh_ratio * data[xhyh_offset]); - } -} - -template -void avg_pool(const std::vector& interpolated_values, T* output_data, - int roi_bin_grid_w, int roi_bin_grid_h, int pooled_width, - int pooled_height) { - const auto data_amount = pooled_width * pooled_height; - const auto grid_points = roi_bin_grid_w * roi_bin_grid_h; - const T count = 1.0 / grid_points; - auto val_begin = interpolated_values.cbegin(); - for (auto i = 0; i < data_amount; ++i) { - T sum = 0.0; - auto val_end = val_begin + grid_points; - sum = std::accumulate(val_begin, val_end, sum); - val_begin = val_end; - output_data[i] = sum * count; - } -} -} // NOLINT - template void bilinear_interpolate_gradient(const int height, const int width, T y, T x, const T out_grad_this_bin, const T count, @@ -213,129 +67,6 @@ void bilinear_interpolate_gradient(const int height, const int width, T y, T x, } } -template -class CPUROIAlignOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - auto in_stride = phi::stride(in_dims); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - 
if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and the batch size of images " - " must be the same. But received the batch size of rois is %d, " - "and the batch size of images is %d", - rois_batch_size, batch_size)); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto lod = rois->lod(); - PADDLE_ENFORCE_EQ(lod.empty(), false, - platform::errors::InvalidArgument( - "Input(ROIs) Tensor of ROIAlignOp " - "does not contain LoD information.")); - auto rois_lod = lod.back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and imgs " - "batch_size must be the same. But received rois_batch_size = %d, " - "batch_size = %d", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The actual number of rois and the number of rois " - "provided from Input(RoIsLoD) in RoIAlign must be the same." - " But received actual number of rois is %d, and the number " - "of rois from RoIsLoD is %d", - rois_num, rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* rois_data = rois->data(); - T roi_offset = aligned ? T(0.5) : 0; - for (int n = 0; n < rois_num; ++n) { - int roi_batch_id = roi_batch_id_data[n]; - T roi_xmin = rois_data[0] * spatial_scale - roi_offset; - T roi_ymin = rois_data[1] * spatial_scale - roi_offset; - T roi_xmax = rois_data[2] * spatial_scale - roi_offset; - T roi_ymax = rois_data[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - if (!aligned) { - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); - } - - const T* batch_data = input_data + roi_batch_id * in_stride[0]; - - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = (sampling_ratio > 0) - ? 
sampling_ratio - : ceil(roi_width / pooled_width); - - auto interpolation_cords = get_indexes_and_ratios( - width, height, roi_width, roi_height, roi_xmin, roi_ymin, - pooled_width, roi_bin_grid_w, pooled_height, roi_bin_grid_h); - - std::vector interpolated_values; - interpolated_values.reserve(interpolation_cords.size()); - for (auto channel = 0; channel < channels; ++channel) { - interpolate(interpolated_values, interpolation_cords, batch_data); - avg_pool(interpolated_values, output_data, roi_bin_grid_w, - roi_bin_grid_h, pooled_width, pooled_height); - batch_data += in_stride[1]; - output_data += out_stride[1]; - interpolated_values.clear(); - } - rois_data += roi_stride[0]; - } - } -}; - template class CPUROIAlignGradOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc index d5b63854d99..78509e4299b 100644 --- a/paddle/fluid/operators/roi_align_op_npu.cc +++ b/paddle/fluid/operators/roi_align_op_npu.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/roi_align_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc index 09d2d906653..13490d6fcde 100644 --- a/paddle/fluid/operators/roi_align_op_xpu.cc +++ b/paddle/fluid/operators/roi_align_op_xpu.cc @@ -13,13 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/roi_align_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + template class XPUROIAlignOpKernel : public framework::OpKernel { public: diff --git a/paddle/phi/kernels/cpu/roi_align_kernel.cc b/paddle/phi/kernels/cpu/roi_align_kernel.cc new file mode 100644 index 00000000000..35ab99a98eb --- /dev/null +++ b/paddle/phi/kernels/cpu/roi_align_kernel.cc @@ -0,0 +1,318 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/roi_align_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { + +constexpr size_t GetOffset(size_t x, size_t y, size_t width) { + return y * width + x; +} + +template +struct OffsetsAndRatios { + OffsetsAndRatios() = default; + OffsetsAndRatios(std::size_t xy, + std::size_t xY, + std::size_t Xy, + std::size_t XY, + T xy_ratio, + T xY_ratio, + T Xy_ratio, + T XY_ratio) + : xy(xy), + xY(xY), + Xy(Xy), + XY(XY), + xy_ratio(xy_ratio), + xY_ratio(xY_ratio), + Xy_ratio(Xy_ratio), + XY_ratio(XY_ratio) {} + + std::size_t xy = 0; + std::size_t xY = 0; + std::size_t Xy = 0; + std::size_t XY = 0; + T xy_ratio = 0.0f; + T xY_ratio = 0.0f; + T Xy_ratio = 0.0f; + T XY_ratio = 0.0f; +}; + +template +std::vector> GetIndexesAndRatios( + std::size_t width, + std::size_t height, + const T roi_width, + const T roi_height, + const T roi_xmin, + const T roi_ymin, + std::size_t pooled_width, + std::size_t roi_bin_grid_w, + std::size_t pooled_height, + std::size_t roi_bin_grid_h) { + const auto ind_num = + pooled_width * roi_bin_grid_w * pooled_height * roi_bin_grid_h; + + std::vector> interpolation_cords; + interpolation_cords.reserve(ind_num); + + const auto bin_w = roi_width / pooled_width; + const auto bin_h = roi_height / pooled_height; + + for (std::size_t py = 0; py < pooled_height; py++) { + for (std::size_t px = 0; px < pooled_width; px++) { + for (std::size_t iy = 0; iy < roi_bin_grid_h; iy++) { + // calculate x of sample points + auto y = + roi_ymin + + bin_h * (py + + static_cast(iy + .5f) / static_cast(roi_bin_grid_h)); + for (std::size_t ix = 0; ix < roi_bin_grid_w; ix++) { + // calculate x of sample points + auto x = roi_xmin + + bin_w * (px + + static_cast(ix + .5f) / + static_cast(roi_bin_grid_w)); + + // deal with elements out of map + if (y < -1.0 || y > height || x < -1.0 || x > width) { + interpolation_cords.emplace_back(); + continue; + } + y = y <= 0 ? 0 : y; + x = x <= 0 ? 
0 : x; + + std::size_t x_low_index = static_cast(x); + std::size_t x_high_index; + if (x_low_index >= width - 1) { + x_high_index = x_low_index = width - 1; + x = static_cast(x_low_index); + } else { + x_high_index = x_low_index + 1; + } + T x_ratio = x_high_index - x; + + std::size_t y_low_index = static_cast(y); + std::size_t y_high_index; + if (y_low_index >= height - 1) { + y_high_index = y_low_index = height - 1; + y = static_cast(y_low_index); + } else { + y_high_index = y_low_index + 1; + } + T y_ratio = y_high_index - y; + + auto xy = GetOffset(x_low_index, y_low_index, width); + auto xY = GetOffset(x_low_index, y_high_index, width); + auto Xy = GetOffset(x_high_index, y_low_index, width); + auto XY = GetOffset(x_high_index, y_high_index, width); + + auto xy_ratio = x_ratio * y_ratio; + auto xY_ratio = x_ratio * (1 - y_ratio); + auto Xy_ratio = (1 - x_ratio) * y_ratio; + auto XY_ratio = (1 - x_ratio) * (1 - y_ratio); + + interpolation_cords.emplace_back( + xy, xY, Xy, XY, xy_ratio, xY_ratio, Xy_ratio, XY_ratio); + } + } + } + } + return interpolation_cords; +} + +template +void Interpolate(std::vector& interpolated_values, // NOLINT + const std::vector>& interpolation_cords, + const T* data) { + for (auto& ic : interpolation_cords) { + auto xlyl_offset = ic.xy; + auto xhyl_offset = ic.Xy; + auto xlyh_offset = ic.xY; + auto xhyh_offset = ic.XY; + + auto xlyl_ratio = ic.xy_ratio; + auto xhyl_ratio = ic.Xy_ratio; + auto xlyh_ratio = ic.xY_ratio; + auto xhyh_ratio = ic.XY_ratio; + + interpolated_values.emplace_back( + xlyl_ratio * data[xlyl_offset] + xhyl_ratio * data[xhyl_offset] + + xlyh_ratio * data[xlyh_offset] + xhyh_ratio * data[xhyh_offset]); + } +} + +template +void AvgPool(const std::vector& interpolated_values, + T* output_data, + int roi_bin_grid_w, + int roi_bin_grid_h, + int pooled_width, + int pooled_height) { + const auto data_amount = pooled_width * pooled_height; + const auto grid_points = roi_bin_grid_w * roi_bin_grid_h; + const T count = 1.0 / grid_points; + auto val_begin = interpolated_values.cbegin(); + for (auto i = 0; i < data_amount; ++i) { + T sum = 0.0; + auto val_end = val_begin + grid_points; + sum = std::accumulate(val_begin, val_end, sum); + val_begin = val_end; + output_data[i] = sum * count; + } +} + +template +void ROIAlignKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* out) { + auto in_dims = x.dims(); + int batch_size = in_dims[0]; + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = boxes.dims()[0]; + + auto in_stride = phi::stride(in_dims); + auto roi_stride = phi::stride(boxes.dims()); + auto out_stride = phi::stride(out->dims()); + + const T* input_data = x.data(); + DenseTensor roi_batch_id_list = Empty(dev_ctx, {rois_num}); + int* roi_batch_id_data = roi_batch_id_list.data(); + int boxes_batch_size; + if (boxes_num) { + boxes_batch_size = boxes_num->numel(); + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + errors::InvalidArgument( + "The batch size of rois and the batch size of images " + " must be the same. 
But received the batch size of rois is %d, " + "and the batch size of images is %d", + boxes_batch_size, + batch_size)); + auto* boxes_num_data = boxes_num->data(); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_data[n]; ++i) { + roi_batch_id_data[i] = n; + } + start += boxes_num_data[n]; + } + } else { + auto lod = boxes.lod(); + PADDLE_ENFORCE_EQ( + lod.empty(), + false, + errors::InvalidArgument("Input(ROIs) Tensor of ROIAlignOp " + "does not contain LoD information.")); + auto boxes_lod = lod.back(); + int boxes_batch_size = boxes_lod.size() - 1; + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + errors::InvalidArgument( + "The boxes_batch_size and imgs " + "batch_size must be the same. But received boxes_batch_size = %d, " + "batch_size = %d", + boxes_batch_size, + batch_size)); + int boxes_num_with_lod = boxes_lod[boxes_batch_size]; + PADDLE_ENFORCE_EQ( + rois_num, + boxes_num_with_lod, + errors::InvalidArgument( + "The actual number of rois and the number of rois " + "provided from Input(RoIsLoD) in RoIAlign must be the same." + " But received actual number of rois is %d, and the number " + "of rois from RoIsLoD is %d", + rois_num, + boxes_num_with_lod)); + for (int n = 0; n < boxes_batch_size; ++n) { + for (std::size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + } + T* output_data = dev_ctx.template Alloc(out); + const T* boxes_data = boxes.data(); + T roi_offset = aligned ? T(0.5) : 0; + for (int n = 0; n < rois_num; ++n) { + int roi_batch_id = roi_batch_id_data[n]; + T roi_xmin = boxes_data[0] * spatial_scale - roi_offset; + T roi_ymin = boxes_data[1] * spatial_scale - roi_offset; + T roi_xmax = boxes_data[2] * spatial_scale - roi_offset; + T roi_ymax = boxes_data[3] * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + if (!aligned) { + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + } + + const T* batch_data = input_data + roi_batch_id * in_stride[0]; + + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + auto interpolation_cords = GetIndexesAndRatios(width, + height, + roi_width, + roi_height, + roi_xmin, + roi_ymin, + pooled_width, + roi_bin_grid_w, + pooled_height, + roi_bin_grid_h); + + std::vector interpolated_values; + interpolated_values.reserve(interpolation_cords.size()); + for (auto channel = 0; channel < channels; ++channel) { + Interpolate(interpolated_values, interpolation_cords, batch_data); + AvgPool(interpolated_values, + output_data, + roi_bin_grid_w, + roi_bin_grid_h, + pooled_width, + pooled_height); + batch_data += in_stride[1]; + output_data += out_stride[1]; + interpolated_values.clear(); + } + boxes_data += roi_stride[0]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_align, CPU, ALL_LAYOUT, phi::ROIAlignKernel, float, double, int) {} diff --git a/paddle/phi/kernels/gpu/roi_align_kernel.cu b/paddle/phi/kernels/gpu/roi_align_kernel.cu new file mode 100644 index 00000000000..2f906fa4f66 --- /dev/null +++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu @@ -0,0 +1,255 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_align_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" + +#include "paddle/fluid/memory/memory.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; +static constexpr int kROISize = 4; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__device__ T BilinearInterpolate( + const T* input_data, const int height, const int width, T y, T x) { + if (y < -1.0 || y > height || x < -1.0 || x > width) { + return 0; + } + y = y <= 0 ? 0 : y; + x = x <= 0 ? 0 : x; + int y_low = static_cast(y); + int x_low = static_cast(x); + int y_high; + int x_high; + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + T ly = y - y_low, lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + + T v1 = input_data[y_low * width + x_low]; + T v2 = input_data[y_low * width + x_high]; + T v3 = input_data[y_high * width + x_low]; + T v4 = input_data[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__global__ void GPUROIAlignForward(const int nthreads, + const T* input_data, + const T* input_rois, + const float spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + int* roi_batch_id_data, + T* output_data, + const bool continuous_coordinate) { + CUDA_KERNEL_LOOP(i, nthreads) { + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + + const T* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = roi_batch_id_data[n]; + + T roi_offset = continuous_coordinate ? static_cast(0.5) : 0; + T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; + T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; + T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; + T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + if (!continuous_coordinate) { + roi_width = max(roi_width, static_cast(1.)); + roi_height = max(roi_height, static_cast(1.)); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T* offset_input_data = + input_data + (roi_batch_ind * channels + c) * height * width; + + int roi_bin_grid_h = (sampling_ratio > 0) + ? 
sampling_ratio + : ceil(roi_height / pooled_height); + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); + T output_val = 0; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T val = BilinearInterpolate(offset_input_data, height, width, y, x); + output_val += val; + } + } + output_val /= count; + output_data[i] = output_val; + } +} + +template +void ROIAlignKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* out) { + auto in_dims = x.dims(); + int batch_size = in_dims[0]; + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + + int rois_num = boxes.dims()[0]; + + if (rois_num == 0) return; + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; +#ifdef WITH_NV_JETSON + backends::gpu::ChangeThreadNum(dev_ctx, &threads, 256); +#endif + DenseTensor roi_batch_id_list; + roi_batch_id_list.Resize({rois_num}); + int* roi_batch_id_data = dev_ctx.template HostAlloc(&roi_batch_id_list); + auto cplace = phi::CPUPlace(); + auto gplace = dev_ctx.GetPlace(); + if (boxes_num) { + int boxes_batch_size = boxes_num->numel(); + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + errors::InvalidArgument( + "The boxes_batch_size and imgs " + "batch_size must be the same. But received boxes_batch_size = %d, " + "batch_size = %d", + boxes_batch_size, + batch_size)); + + std::vector boxes_num_list(boxes_batch_size); + paddle::memory::Copy(cplace, + boxes_num_list.data(), + gplace, + boxes_num->data(), + sizeof(int) * boxes_batch_size, + 0); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_list[n]; ++i) { + roi_batch_id_data[i] = n; + } + start += boxes_num_list[n]; + } + } else { + auto lod = boxes.lod(); + PADDLE_ENFORCE_EQ(lod.empty(), + false, + errors::InvalidArgument("Input(ROIs) in ROIAlignOp does " + "not contain LoD information.")); + auto boxes_lod = lod.back(); + int boxes_batch_size = boxes_lod.size() - 1; + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + errors::InvalidArgument( + "The batch size of rois and batch size " + "of images must be the same. But received rois batch size = %d, " + "and images batch size = %d", + boxes_batch_size, + batch_size)); + int boxes_num_with_lod = boxes_lod[boxes_batch_size]; + PADDLE_ENFORCE_EQ( + rois_num, + boxes_num_with_lod, + errors::InvalidArgument( + "The actual number of rois and the number of rois " + "provided from Input(RoIsLoD) in RoIAlign must be the same." 
+ " But received actual number of rois is %d, and the number " + "of rois from RoIsLoD is %d", + rois_num, + boxes_num_with_lod)); + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + } + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = paddle::memory::Alloc(dev_ctx, bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + paddle::memory::Copy( + gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream()); + GPUROIAlignForward<<>>( + output_size, + x.data(), + boxes.data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + roi_id_data, + dev_ctx.template Alloc(out), + aligned); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_align, GPU, ALL_LAYOUT, phi::ROIAlignKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/scale_kernel.cu b/paddle/phi/kernels/gpu/scale_kernel.cu index 930c50a24be..6f96a697b2f 100644 --- a/paddle/phi/kernels/gpu/scale_kernel.cu +++ b/paddle/phi/kernels/gpu/scale_kernel.cu @@ -15,10 +15,9 @@ limitations under the License. */ #include "paddle/phi/kernels/scale_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/phi/common/float16.h" namespace phi { diff --git a/paddle/phi/kernels/roi_align_kernel.h b/paddle/phi/kernels/roi_align_kernel.h new file mode 100644 index 00000000000..16b52c563a5 --- /dev/null +++ b/paddle/phi/kernels/roi_align_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void ROIAlignKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/roi_align_sig.cc b/paddle/phi/ops/compat/roi_align_sig.cc new file mode 100644 index 00000000000..0549103b6fb --- /dev/null +++ b/paddle/phi/ops/compat/roi_align_sig.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("roi_align", + {"X", "ROIs", "RoisNum"}, + {"pooled_height", + "pooled_width", + "spatial_scale", + "sampling_ratio", + "aligned"}, + {"Out"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::ROIAlignOpArgumentMapping); -- GitLab From 76f8703445b269334035a891466a284148c26734 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sat, 12 Mar 2022 19:39:32 +0800 Subject: [PATCH 013/176] [Phi] Move allclose op kernel into phi (#40469) * move allclose kernel * remove allclose op kernel * fix coverage failed --- paddle/fluid/operators/allclose_op.cc | 39 +--------- paddle/fluid/operators/allclose_op.cu | 84 -------------------- paddle/fluid/operators/allclose_op.h | 93 ----------------------- paddle/phi/api/lib/utils/tensor_utils.cc | 7 ++ paddle/phi/kernels/allclose_kernel.h | 31 ++++++++ paddle/phi/kernels/cpu/allclose_kernel.cc | 71 +++++++++++++++++ paddle/phi/kernels/gpu/allclose_kernel.cu | 89 ++++++++++++++++++++++ paddle/phi/ops/compat/allclose_sig.cc | 49 ++++++++++++ paddle/phi/tests/ops/test_op_signature.cc | 28 +++++++ 9 files changed, 276 insertions(+), 215 deletions(-) delete mode 100644 paddle/fluid/operators/allclose_op.cu delete mode 100644 paddle/fluid/operators/allclose_op.h create mode 100644 paddle/phi/kernels/allclose_kernel.h create mode 100644 paddle/phi/kernels/cpu/allclose_kernel.cc create mode 100644 paddle/phi/kernels/gpu/allclose_kernel.cu create mode 100644 paddle/phi/ops/compat/allclose_sig.cc diff --git a/paddle/fluid/operators/allclose_op.cc b/paddle/fluid/operators/allclose_op.cc index 8fb9929c39e..706a132878d 100644 --- a/paddle/fluid/operators/allclose_op.cc +++ b/paddle/fluid/operators/allclose_op.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/allclose_op.h" #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/operator.h" @@ -23,41 +23,6 @@ namespace paddle { namespace operators { -template -struct GetTensorValue { - T operator()(const platform::CPUDeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - return *(tensor.data()); - } -}; - -template -struct AllcloseFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - auto* in_a = in.data(); - auto* in_b = other.data(); - auto* out_data = output->mutable_data(ctx.GetPlace()); - auto num = in.numel(); - *out_data = true; - for (int i = 0; i < num; i++) { - const T a = in_a[i], b = in_b[i]; - bool val; - if (std::isnan(a) || std::isnan(b)) { - val = equal_nan && std::isnan(a) == std::isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? 
left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - *out_data &= val; - } - } -}; - class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -157,8 +122,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, ops::AllcloseOpVarTypeInference); -REGISTER_OP_CPU_KERNEL(allclose, ops::AllcloseKernel, - ops::AllcloseKernel); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(allclose) diff --git a/paddle/fluid/operators/allclose_op.cu b/paddle/fluid/operators/allclose_op.cu deleted file mode 100644 index 32c90ff8fdc..00000000000 --- a/paddle/fluid/operators/allclose_op.cu +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/allclose_op.h" - -namespace paddle { -namespace operators { - -template -struct GetTensorValue { - T operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - const T* data = tensor.data(); - T value; - const auto gpu_place = dev_ctx.GetPlace(); - memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T), - dev_ctx.stream()); - return value; - } -}; - -template -__global__ void AllcloseCUDAKernel(const T* in_data, const T* other_data, - const double rtol, const double atol, - bool equal_nan, int num, bool* out_data) { - unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; - bool val; - for (int i = idx; i < num; i += blockDim.x * gridDim.x) { - const T a = in_data[i], b = other_data[i]; - if (isnan(a) || isnan(b)) { - val = equal_nan && isnan(a) == isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - if (!val) *out_data = false; - } -} - -template -struct AllcloseFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - int num = in.numel(); - const T* in_data = in.data(); - const T* other_data = other.data(); - bool* out_data = output->mutable_data(dev_ctx.GetPlace()); - int block = 1024; - int grid = (block - 1 + num) / block; - grid = (grid > block) ? 
block : grid; -#ifdef PADDLE_WITH_HIP - hipMemset(out_data, true, sizeof(bool)); -#else - cudaMemset(out_data, true, sizeof(bool)); -#endif - AllcloseCUDAKernel<<>>( - in_data, other_data, rtol, atol, equal_nan, num, out_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(allclose, ops::AllcloseKernel, - ops::AllcloseKernel); diff --git a/paddle/fluid/operators/allclose_op.h b/paddle/fluid/operators/allclose_op.h deleted file mode 100644 index 7a36754194a..00000000000 --- a/paddle/fluid/operators/allclose_op.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -template -struct GetTensorValue { - T operator()(const platform::DeviceContext& ctx, - const framework::Tensor& tensor) const; -}; - -template -struct AllcloseFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in, - const framework::Tensor& other, const float rtol, - const float atol, bool equal_nan, framework::Tensor* output); -}; - -template -class AllcloseKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // get attrs - bool equal_nan = ctx.Attr("equal_nan"); - // get input/output - const auto* input = ctx.Input("Input"); - const auto* other = ctx.Input("Other"); - auto* out = ctx.Output("Out"); - - double rtol_v = std::stod(ctx.Attr("rtol")); - double atol_v = std::stod(ctx.Attr("atol")); - - auto& dev_ctx = ctx.template device_context(); - GetTensorValue get_tensor_value; - if (ctx.HasInput("Rtol")) { - const auto* rtol = ctx.Input("Rtol"); - PADDLE_ENFORCE_EQ( - rtol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Rtol) size must be 1, but get %d.", rtol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(rtol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Rtol) type must be double, but get %s.", - framework::DataTypeToString( - framework::TransToProtoVarType(rtol->dtype())))); - rtol_v = get_tensor_value(dev_ctx, *rtol); - } - if (ctx.HasInput("Atol")) { - const auto* atol = ctx.Input("Atol"); - PADDLE_ENFORCE_EQ( - atol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Atol) size must be 1, but get %d", atol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(atol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Atol) type must be double, but get %s", - framework::DataTypeToString( - framework::TransToProtoVarType(atol->dtype())))); - atol_v = get_tensor_value(dev_ctx, *atol); - } - - 
AllcloseFunctor()(dev_ctx, *input, *other, rtol_v, atol_v, - equal_nan, out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/api/lib/utils/tensor_utils.cc b/paddle/phi/api/lib/utils/tensor_utils.cc index 1c9f7c3a868..3d183ea7fee 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.cc +++ b/paddle/phi/api/lib/utils/tensor_utils.cc @@ -40,6 +40,13 @@ phi::Scalar MakePhiScalarFromVar(const framework::Variable& variable) { auto expected_place = phi::TransToPhiPlace(phi::Backend::CPU); if (variable.IsType()) { const auto& tensor = variable.Get(); + PADDLE_ENFORCE_EQ( + tensor.numel(), + 1UL, + platform::errors::InvalidArgument("The DenseTensor used to construct " + "the Scalar contains more than 1 " + "value, it contains `%d` values.", + tensor.numel())); if (!platform::is_same_place(tensor.place(), expected_place)) { framework::LoDTensor tmp_tensor; framework::TensorCopySync(tensor, expected_place, &tmp_tensor); diff --git a/paddle/phi/kernels/allclose_kernel.h b/paddle/phi/kernels/allclose_kernel.h new file mode 100644 index 00000000000..3f24078b86c --- /dev/null +++ b/paddle/phi/kernels/allclose_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void AllCloseKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const Scalar& rtol, + const Scalar& atol, + bool equal_nan, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/allclose_kernel.cc b/paddle/phi/kernels/cpu/allclose_kernel.cc new file mode 100644 index 00000000000..7ffeadfeed8 --- /dev/null +++ b/paddle/phi/kernels/cpu/allclose_kernel.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
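+// Closeness test used by this kernel: a and b are treated as close when
+// |a - b| <= atol + rtol * |b| (exact equality, or a tiny 1e-15 slack on the
+// difference, also passes); NaNs are treated as close only when equal_nan is
+// true and both sides are NaN.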
+ +#include "paddle/phi/kernels/allclose_kernel.h" + +#include + +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void AllCloseKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const Scalar& rtol, + const Scalar& atol, + bool equal_nan, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + rtol.dtype(), + DataType::FLOAT64, + phi::errors::InvalidArgument( + "Input (Rtol) type must be double, but get %s.", rtol.dtype())); + PADDLE_ENFORCE_EQ( + atol.dtype(), + DataType::FLOAT64, + phi::errors::InvalidArgument( + "Input (Atol) type must be double, but get %s.", atol.dtype())); + + auto* in_a = x.data(); + auto* in_b = y.data(); + auto rtol_v = rtol.to(); + auto atol_v = atol.to(); + auto* out_data = dev_ctx.template Alloc(out); + *out_data = true; + + auto num = x.numel(); + for (int64_t i = 0; i < num; ++i) { + const T a = in_a[i], b = in_b[i]; + bool val; + if (std::isnan(a) || std::isnan(b)) { + val = equal_nan && std::isnan(a) == std::isnan(b); + } else { + T left = (a > b ? a - b : b - a); + T right = atol_v + (b > 0 ? rtol_v * b : (-rtol_v) * b); + T diff = (left > right ? left - right : right - left); + val = a == b || left <= right || diff <= 1e-15; + } + *out_data &= val; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + allclose, CPU, ALL_LAYOUT, phi::AllCloseKernel, float, double) { + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); +} diff --git a/paddle/phi/kernels/gpu/allclose_kernel.cu b/paddle/phi/kernels/gpu/allclose_kernel.cu new file mode 100644 index 00000000000..af2612bb10c --- /dev/null +++ b/paddle/phi/kernels/gpu/allclose_kernel.cu @@ -0,0 +1,89 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/allclose_kernel.h" + +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +__global__ void AllcloseCUDAKernel(const T* in_data, + const T* other_data, + const double rtol, + const double atol, + bool equal_nan, + int num, + bool* out_data) { + unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; + bool val; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + const T a = in_data[i], b = other_data[i]; + if (isnan(a) || isnan(b)) { + val = equal_nan && isnan(a) == isnan(b); + } else { + T left = (a > b ? a - b : b - a); + T right = atol + (b > 0 ? rtol * b : (-rtol) * b); + T diff = (left > right ? 
left - right : right - left); + val = a == b || left <= right || diff <= 1e-15; + } + if (!val) *out_data = false; + } +} + +template +void AllCloseKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const Scalar& rtol, + const Scalar& atol, + bool equal_nan, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + rtol.dtype(), + DataType::FLOAT64, + phi::errors::InvalidArgument( + "Input (Rtol) type must be double, but get %s.", rtol.dtype())); + PADDLE_ENFORCE_EQ( + atol.dtype(), + DataType::FLOAT64, + phi::errors::InvalidArgument( + "Input (Atol) type must be double, but get %s.", atol.dtype())); + + const T* in_data = x.data(); + const T* other_data = y.data(); + auto rtol_v = rtol.to(); + auto atol_v = atol.to(); + bool* out_data = dev_ctx.template Alloc(out); + + int num = x.numel(); + int block = 1024; + int grid = (block - 1 + num) / block; + grid = (grid > block) ? block : grid; +#ifdef PADDLE_WITH_HIP + hipMemset(out_data, true, sizeof(bool)); +#else + cudaMemset(out_data, true, sizeof(bool)); +#endif + AllcloseCUDAKernel<<>>( + in_data, other_data, rtol_v, atol_v, equal_nan, num, out_data); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + allclose, GPU, ALL_LAYOUT, phi::AllCloseKernel, float, double) { + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); +} diff --git a/paddle/phi/ops/compat/allclose_sig.cc b/paddle/phi/ops/compat/allclose_sig.cc new file mode 100644 index 00000000000..e5c4fc027b5 --- /dev/null +++ b/paddle/phi/ops/compat/allclose_sig.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
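+// Argument mapping for the fluid "allclose" op: rtol/atol may arrive either
+// as the optional tensor inputs Rtol/Atol or as the string attributes
+// rtol/atol; each branch below prefers the tensor input when it is present.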
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature AllCloseOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("Rtol")) { + if (ctx.HasInput("Atol")) { + return KernelSignature("allclose", + {"Input", "Other"}, + {"Rtol", "Atol", "equal_nan"}, + {"Out"}); + } else { + return KernelSignature("allclose", + {"Input", "Other"}, + {"Rtol", "atol", "equal_nan"}, + {"Out"}); + } + } else { + if (ctx.HasInput("Atol")) { + return KernelSignature("allclose", + {"Input", "Other"}, + {"rtol", "Atol", "equal_nan"}, + {"Out"}); + } else { + return KernelSignature("allclose", + {"Input", "Other"}, + {"rtol", "atol", "equal_nan"}, + {"Out"}); + } + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(allclose, phi::AllCloseOpArgumentMapping); diff --git a/paddle/phi/tests/ops/test_op_signature.cc b/paddle/phi/tests/ops/test_op_signature.cc index 88c9193a8f8..c74049e0f04 100644 --- a/paddle/phi/tests/ops/test_op_signature.cc +++ b/paddle/phi/tests/ops/test_op_signature.cc @@ -484,5 +484,33 @@ TEST(ARG_MAP, set_value) { "set_value"); } +TEST(ARG_MAP, allclose) { + TestArgumentMappingContext arg_case1( + {"Input", "Other", "Rtol"}, + {}, + {{"atol", paddle::any(std::string{"1e-8"})}, + {"equal_nan", paddle::any(false)}}, + {"Out"}, + {}); + auto signature1 = + OpUtilsMap::Instance().GetArgumentMappingFn("allclose")(arg_case1); + ASSERT_EQ(signature1.name, "allclose"); + auto attr_names1 = std::get<1>(signature1.args); + ASSERT_EQ(attr_names1[0], "Rtol"); + + TestArgumentMappingContext arg_case2( + {"Input", "Other", "Atol"}, + {}, + {{"rtol", paddle::any(std::string{"1e-5"})}, + {"equal_nan", paddle::any(false)}}, + {"Out"}, + {}); + auto signature2 = + OpUtilsMap::Instance().GetArgumentMappingFn("allclose")(arg_case2); + ASSERT_EQ(signature2.name, "allclose"); + auto attr_names2 = std::get<1>(signature2.args); + ASSERT_EQ(attr_names2[1], "Atol"); +} + } // namespace tests } // namespace phi -- GitLab From ec09ef260f35f11de2436edc6f40839c810b7357 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sat, 12 Mar 2022 20:44:29 +0800 Subject: [PATCH 014/176] [Phi] Add softmax infermeta functions (#40471) * rename softmax kernel name * move softmax infershape * fix failed test --- .../mkldnn/test_mkldnn_op_inplace.cc | 3 + paddle/fluid/operators/softmax_op.cc | 55 ++++--------------- paddle/phi/infermeta/backward.cc | 6 ++ paddle/phi/infermeta/backward.h | 2 + paddle/phi/infermeta/unary.cc | 19 +++++++ paddle/phi/infermeta/unary.h | 2 + paddle/phi/kernels/cpu/softmax_kernel.cc | 2 +- paddle/phi/kernels/gpu/softmax_kernel.cu | 2 +- paddle/phi/kernels/gpudnn/softmax_kernel.cu | 14 ++--- paddle/phi/kernels/impl/softmax_kernel_impl.h | 8 +-- paddle/phi/kernels/softmax_kernel.h | 12 +--- 11 files changed, 58 insertions(+), 67 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index e9dadd5ec93..4090d5ffca8 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -24,6 +24,7 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); @@ -32,6 +33,8 @@ USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); +PD_DECLARE_KERNEL(softmax, CPU, ALL_LAYOUT); + 
namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 37499209660..af90baf27d3 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" @@ -23,6 +24,10 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -30,30 +35,6 @@ class SoftmaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of SoftmaxOp is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of SoftmaxOp is not found.")); - - auto dim_x = ctx->GetInputDim("X"); - auto rank_x = dim_x.size(); - auto axis = ctx->Attrs().Get("axis"); - PADDLE_ENFORCE_GE(axis, -rank_x, - platform::errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(X).")); - PADDLE_ENFORCE_LT(axis, rank_x, - platform::errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(X).")); - - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -168,23 +149,6 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Out"), true, - platform::errors::InvalidArgument("Input(Out) is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument("Input(Out@GRAD) is not found.")); - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Out"), - ctx->GetInputDim(framework::GradVarName("Out")), - platform::errors::InvalidArgument("Input(Out) and its gradients " - "should have a same shape.")); - - ctx->SetOutputDim(framework::GradVarName("X"), - ctx->GetInputDim(framework::GradVarName("Out"))); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -244,9 +208,14 @@ DECLARE_INPLACE_OP_INFERER(SoftmaxInplaceInferer, {"X", "Out"}); namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(softmax, SoftmaxInferShapeFunctor, + PD_INFER_META(phi::SoftmaxInferMeta)); REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, ops::SoftmaxOpInferVarType, ops::SoftmaxOpGradMaker, ops::SoftmaxOpGradMaker, - ops::SoftmaxInplaceInferer); -REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); + ops::SoftmaxInplaceInferer, SoftmaxInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(softmax_grad, SoftmaxGradnferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); +REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad, + SoftmaxGradnferShapeFunctor); diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc 
index 0a2b4dcae58..801bd98b504 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -64,6 +64,12 @@ void BilinearTensorProductGradInferMeta(const MetaTensor& x, } } +void GeneralUnaryGradInferMeta(const MetaTensor& x, MetaTensor* dx) { + if (dx) { + dx->share_meta(x); + } +} + void GeneralBinaryGradInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* dx, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index c4003ca1fe7..9ed24ef8646 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -30,6 +30,8 @@ void BilinearTensorProductGradInferMeta(const MetaTensor& x, MetaTensor* dweight, MetaTensor* dbias); +void GeneralUnaryGradInferMeta(const MetaTensor& x, MetaTensor* dx); + void GeneralBinaryGradInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* dx, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 9daad7d6aaa..1b820510470 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1409,6 +1409,25 @@ void ShardIndexInferMeta(const MetaTensor& in, out->set_dtype(in.dtype()); } +void SoftmaxInferMeta(const MetaTensor& x, int axis, MetaTensor* out) { + auto dim_x = x.dims(); + auto rank_x = dim_x.size(); + PADDLE_ENFORCE_GE(axis, + -rank_x, + phi::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X).")); + PADDLE_ENFORCE_LT(axis, + rank_x, + phi::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X).")); + + out->set_dims(x.dims()); + out->set_dtype(x.dtype()); + out->share_lod(x); +} + } // namespace phi PD_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index e8be73e943e..c7b7f8e3c13 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -203,4 +203,6 @@ void ShardIndexInferMeta(const MetaTensor& in, MetaTensor* out, MetaConfig config = MetaConfig()); +void SoftmaxInferMeta(const MetaTensor& x, int axis, MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/cpu/softmax_kernel.cc b/paddle/phi/kernels/cpu/softmax_kernel.cc index 537b4326681..1d28669571f 100644 --- a/paddle/phi/kernels/cpu/softmax_kernel.cc +++ b/paddle/phi/kernels/cpu/softmax_kernel.cc @@ -19,4 +19,4 @@ limitations under the License. */ #include "paddle/phi/kernels/impl/softmax_kernel_impl.h" PD_REGISTER_KERNEL( - softmax, CPU, ALL_LAYOUT, phi::SoftmaxRawKernel, float, double) {} + softmax, CPU, ALL_LAYOUT, phi::SoftmaxKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/softmax_kernel.cu b/paddle/phi/kernels/gpu/softmax_kernel.cu index 03c5714b967..4a02f438c7e 100644 --- a/paddle/phi/kernels/gpu/softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/softmax_kernel.cu @@ -23,7 +23,7 @@ limitations under the License. */ PD_REGISTER_KERNEL(softmax, GPU, ALL_LAYOUT, - phi::SoftmaxRawKernel, + phi::SoftmaxKernel, float, double, phi::dtype::float16, diff --git a/paddle/phi/kernels/gpudnn/softmax_kernel.cu b/paddle/phi/kernels/gpudnn/softmax_kernel.cu index 7685c7dbb68..37175c427ff 100644 --- a/paddle/phi/kernels/gpudnn/softmax_kernel.cu +++ b/paddle/phi/kernels/gpudnn/softmax_kernel.cu @@ -21,10 +21,10 @@ limitations under the License. 
*/ namespace phi { template -void SoftmaxRawGPUDNNKernel(const Context& dev_ctx, - const DenseTensor& x, - int axis, - DenseTensor* out) { +void SoftmaxGPUDNNKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { dev_ctx.template Alloc(out); SoftmaxForwardCUDAKernelDriver(dev_ctx, x, axis, out); } @@ -35,7 +35,7 @@ void SoftmaxRawGPUDNNKernel(const Context& dev_ctx, PD_REGISTER_KERNEL(softmax, GPUDNN, ALL_LAYOUT, - phi::SoftmaxRawGPUDNNKernel, + phi::SoftmaxGPUDNNKernel, float, phi::dtype::float16, phi::dtype::bfloat16) {} @@ -44,7 +44,7 @@ PD_REGISTER_KERNEL(softmax, PD_REGISTER_KERNEL(softmax, GPUDNN, ALL_LAYOUT, - phi::SoftmaxRawGPUDNNKernel, + phi::SoftmaxGPUDNNKernel, float, double, phi::dtype::float16, @@ -53,7 +53,7 @@ PD_REGISTER_KERNEL(softmax, PD_REGISTER_KERNEL(softmax, GPUDNN, ALL_LAYOUT, - phi::SoftmaxRawGPUDNNKernel, + phi::SoftmaxGPUDNNKernel, float, double, phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/softmax_kernel_impl.h b/paddle/phi/kernels/impl/softmax_kernel_impl.h index 6552f6ed581..7aa43fdb7f2 100644 --- a/paddle/phi/kernels/impl/softmax_kernel_impl.h +++ b/paddle/phi/kernels/impl/softmax_kernel_impl.h @@ -22,10 +22,10 @@ limitations under the License. */ namespace phi { template -void SoftmaxRawKernel(const Context& dev_ctx, - const DenseTensor& x, - int axis, - DenseTensor* out) { +void SoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { const int rank = x.dims().size(); const int calc_axis = phi::funcs::CanonicalAxis(axis, rank); int axis_dim = x.dims()[calc_axis]; diff --git a/paddle/phi/kernels/softmax_kernel.h b/paddle/phi/kernels/softmax_kernel.h index ca69d652770..4edd562ca88 100644 --- a/paddle/phi/kernels/softmax_kernel.h +++ b/paddle/phi/kernels/softmax_kernel.h @@ -19,20 +19,10 @@ limitations under the License. */ namespace phi { -template -void SoftmaxRawKernel(const Context& dev_ctx, - const DenseTensor& x, - int axis, - DenseTensor* out); - template void SoftmaxKernel(const Context& dev_ctx, const DenseTensor& x, int axis, - DataType dtype, - DenseTensor* out) { - auto cast_x = phi::Cast(dev_ctx, x, dtype); - phi::SoftmaxRawKernel(dev_ctx, axis, out); -} + DenseTensor* out); } // namespace phi -- GitLab From 080024f03b86bb480baf30e192fa00443286999d Mon Sep 17 00:00:00 2001 From: zyfncg Date: Sun, 13 Mar 2022 11:05:42 +0800 Subject: [PATCH 015/176] refactor unary infermeta (#40365) --- paddle/phi/infermeta/unary.cc | 1460 ++++++++++++++++----------------- paddle/phi/infermeta/unary.h | 193 ++--- 2 files changed, 827 insertions(+), 826 deletions(-) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 1b820510470..f7693c2f90a 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -26,6 +26,82 @@ limitations under the License. 
*/ namespace phi { +void ArgMinMaxInferMeta(const MetaTensor& x, + int64_t axis, + bool keepdims, + bool flatten, + int dtype, + MetaTensor* out, + MetaConfig config) { + const auto& x_dims = x.dims(); + + PADDLE_ENFORCE_GE( + axis, + -x_dims.size(), + phi::errors::InvalidArgument("'axis'(%d) must be greater than or equal to" + " -Rank(X)(%d).", + axis, + -x_dims.size())); + PADDLE_ENFORCE_LT(axis, + x_dims.size(), + phi::errors::InvalidArgument( + "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", + axis, + x_dims.size())); + + PADDLE_ENFORCE_EQ( + (dtype < 0 || dtype == 2 || dtype == 3), + true, + phi::errors::InvalidArgument( + "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " + "received [%s]", + paddle::framework::DataTypeToString( + paddle::framework::proto::VarType::INT32), + paddle::framework::DataTypeToString( + paddle::framework::proto::VarType::INT64), + paddle::framework::DataTypeToString( + static_cast(dtype)))); + + auto x_rank = x_dims.size(); + if (axis < 0) axis += x_rank; + if (config.is_runtime) { + if (dtype == paddle::framework::proto::VarType::INT32) { + int64_t all_element_num = 0; + if (flatten) { + all_element_num = phi::product(x_dims); + + } else { + all_element_num = x_dims[axis]; + } + PADDLE_ENFORCE_LE( + all_element_num, + INT_MAX, + phi::errors::InvalidArgument( + "The element num of the argmin/argmax input at axis is " + "%d, is larger than int32 maximum value:%d, you must " + "set the dtype of argmin/argmax to 'int64'.", + all_element_num, + INT_MAX)); + } + } + std::vector vec; + if (flatten) { + vec.emplace_back(static_cast(1)); + } else { + for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]); + if (keepdims) { + vec.emplace_back(static_cast(1)); + } + for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]); + } + out->set_dims(phi::make_ddim(vec)); + if (dtype == 2) { + out->set_dtype(DataType::INT32); + } else if (dtype == 3) { + out->set_dtype(DataType::INT64); + } +} + void ArgsortInferMeta(const MetaTensor& input, int axis, bool descending, @@ -54,96 +130,6 @@ void ArgsortInferMeta(const MetaTensor& input, indices->share_lod(input); } -void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) { - out->share_meta(x); -} - -// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1] -void UnchangedInferMetaCheckAxis(const MetaTensor& x, - int axis, - MetaTensor* out) { - auto rank = x.dims().size(); - PADDLE_ENFORCE_GE( - axis, - -rank, - errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(X). But received axis: %d, R: %d.", - axis, - rank)); - PADDLE_ENFORCE_LT( - axis, - rank, - phi::errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(X). 
But received axis: %d, R: %d.", - axis, - rank)); - out->share_meta(x); -} - -void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out) { - out->set_dims(x.dims()); - out->set_dtype(dtype::ToReal(x.dtype())); - out->set_layout(x.layout()); -} - -void FlattenInferMeta(const MetaTensor& x, - int start_axis, - int stop_axis, - MetaTensor* out) { - auto x_dims = x.dims(); - int in_dims_size = x_dims.size(); - if (start_axis < 0) { - start_axis = start_axis + in_dims_size; - } - if (stop_axis < 0) { - stop_axis = stop_axis + in_dims_size; - } - PADDLE_ENFORCE_GE( - stop_axis, - start_axis, - phi::errors::InvalidArgument("The stop_axis should be greater" - "than or equal to start_axis.")); - - int64_t outer = 1; - std::vector out_shape; - out_shape.reserve(in_dims_size - stop_axis + start_axis); - - for (int i = 0; i < start_axis; ++i) { - out_shape.push_back(x_dims[i]); - } - for (int i = start_axis; i <= stop_axis; i++) { - if (x_dims[i] == -1 || outer == -1) { - outer = -1; - } else { - outer *= x_dims[i]; - } - } - out_shape.push_back(outer); - for (int i = stop_axis + 1; i < in_dims_size; i++) { - out_shape.push_back(x_dims[i]); - } - const auto& out_dims = phi::make_ddim(out_shape); - out->set_dims(out_dims); - out->set_dtype(x.dtype()); - out->set_layout(x.layout()); - - if (x_dims[0] == out_dims[0]) { - // Only pass LoD when the first dimension of output and Input(X) - // are the same. - out->share_lod(x); - } -} - -void GumbelSoftmaxInferMeta(const MetaTensor& x, - float temperature, - bool hard, - int axis, - MetaTensor* out) { - UnchangedInferMetaCheckAxis(x, axis, out); -} - void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out) { out->set_dims(x.dims()); out->set_dtype(out_dtype); @@ -203,73 +189,275 @@ void CumsumInferMeta(const MetaTensor& x, out->share_lod(x); } -void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out) { - PADDLE_ENFORCE_EQ( - product(x.dims()), - 1UL, - errors::InvalidArgument("The number of elements in Input(X) should be 1." - "Now the number is %d.", - product(x.dims()))); - out->set_dims(x.dims()); - out->share_lod(x); - out->set_dtype(x.dtype()); -} - -static phi::DDim ValidateShape(const std::vector shape, - const phi::DDim& in_dims) { - const int64_t in_size = phi::product(in_dims); - auto in_dims_vec = phi::vectorize(in_dims); - bool all_positive = std::all_of(in_dims_vec.cbegin(), - in_dims_vec.cend(), - [](int64_t i) { return i > 0; }); - // only one dimension can be set to -1, whose size will be automatically - // infered. - const int64_t unk_dim_val = -1; - const int64_t copy_dim_val = 0; +void DiagInferMeta(const MetaTensor& x, + int offset, + float padding_value, + MetaTensor* out) { + auto x_dims = x.dims(); - std::vector output_shape(shape.size(), 0); - int64_t capacity = 1; - int unk_dim_idx = -1; - for (size_t i = 0; i < shape.size(); ++i) { - if (shape[i] == unk_dim_val) { - PADDLE_ENFORCE_EQ( - unk_dim_idx, - -1, - phi::errors::InvalidArgument( - "Only one dimension value of 'shape' in ReshapeOp can " - "be -1. But received shape = [%s], shape[%d] is also -1.", - phi::make_ddim(shape), - i)); - unk_dim_idx = i; - } else if (shape[i] == copy_dim_val) { - PADDLE_ENFORCE_LT( - static_cast(i), - in_dims.size(), - phi::errors::InvalidArgument( - "The index of 0 in `shape` must be less than " - "the input tensor X's dimensions. 
" - "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " - "X's dimensions = %d.", - phi::make_ddim(shape), - i, - in_dims, - in_dims.size())); + if (x_dims.size() == 1UL) { + int64_t size_ = x_dims[0] + std::abs(offset); + out->set_dims({size_, size_}); + out->set_dtype(x.dtype()); + } else if (x_dims.size() == 2UL) { + int64_t size_ = 0; + if (offset >= 0) { + // Note(LutaoChu): Do not use std::min here, otherwise the calculation + // of `size_` will have unexpected result on Windows Python3.8 + if (x_dims[0] < x_dims[1] - offset) { + size_ = x_dims[0]; + } else { + size_ = x_dims[1] - offset; + } } else { - PADDLE_ENFORCE_GT( - shape[i], - 0, - phi::errors::InvalidArgument( - "Each dimension value of 'shape' in ReshapeOp must not " - "be negative except one unknown dimension. " - "But received shape = [%s], shape[%d] = %d.", - phi::make_ddim(shape), - i, - shape[i])); + // Note(LutaoChu): Do not use std::min here, otherwise the calculation + // of `size_` will have unexpected result on Windows Python3.8 + if (x_dims[0] + offset < x_dims[1]) { + size_ = x_dims[0] + offset; + } else { + size_ = x_dims[1]; + } } - - // NOTE all non-zero values will be converted to True (include negative - // value) - capacity *= (shape[i] ? shape[i] : in_dims[i]); + out->set_dims({size_}); + out->set_dtype(x.dtype()); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "The input tensor X's dimensions of DiagV2Op should be either 1 or " + "2, but received %d.", + x_dims.size())); + } +} + +void DiagonalInferMeta(const MetaTensor& input, + int offset, + int axis1, + int axis2, + MetaTensor* out) { + auto x_dims = input.dims(); + int offset_ = offset; + int axis1_ = axis1 < 0 ? x_dims.size() + axis1 : axis1; + int axis2_ = axis2 < 0 ? x_dims.size() + axis2 : axis2; + + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::OutOfRange("Input's dim is out of range (expected at " + "least 2 dimensions, but got %ld).", + x_dims.size())); + PADDLE_ENFORCE_LT( + axis1_, + x_dims.size(), + phi::errors::OutOfRange( + "Attr(axis1) is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size()), + (x_dims.size() - 1), + axis1)); + PADDLE_ENFORCE_LT( + axis2_, + x_dims.size(), + phi::errors::OutOfRange( + "Attr(axis2) is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size()), + (x_dims.size() - 1), + axis2)); + PADDLE_ENFORCE_NE( + axis1_, + axis2_, + phi::errors::InvalidArgument("The dimensions should not be identical " + "%d vs %d.", + axis1, + axis2)); + + auto out_dims = vectorize(x_dims); + // from out_dims get the dim size of axis1_. + auto axis1_size = out_dims[axis1_]; + auto axis2_size = out_dims[axis2_]; + // delete two dims by attr axis1 and axis2 from out_dims. + /* example: + out_dim = [2, 3, 4]; + axis1 = 0; + axis2 = 1; + according to the attr of axis1 and axis2, we get: + out_dim = [4]. 
+ */ + out_dims.erase(out_dims.begin() + std::max(axis1_, axis2_)); + out_dims.erase(out_dims.begin() + std::min(axis1_, axis2_)); + + if (offset_ == 0) { + out_dims.push_back(std::min(axis1_size, axis2_size)); + } else if (offset_ > 0) { + if ((axis2_size - offset_) > 0) { + out_dims.push_back(std::min(axis1_size, axis2_size - offset_)); + } else { + out_dims.push_back(0); + } + } else { + if ((axis1_size + offset_) > 0) { + out_dims.push_back(std::min(axis1_size + offset_, axis2_size)); + } else { + out_dims.push_back(0); + } + } + out->set_dims(phi::make_ddim(out_dims)); +} + +void EighInferMeta(const MetaTensor& x, + const std::string& uplo, + MetaTensor* out_w, + MetaTensor* out_v) { + auto input_dim = x.dims(); + auto rank = input_dim.size(); + + PADDLE_ENFORCE_GE(rank, + 2, + phi::errors::InvalidArgument( + "The Input(X) should have at least 2 dimensions." + "But received a %d dimension tensor.", + rank)); + PADDLE_ENFORCE_EQ( + input_dim[rank - 2], + input_dim[rank - 1], + phi::errors::InvalidArgument( + "Eigh op is designed for square matrix, consequently" + "inner-most 2 dimensions of Input(X) should be symmetric." + "But received X's shape[-2] = %d and shape[-1] = %d.", + input_dim[rank - 2], + input_dim[rank - 1])); + + std::vector values_dim; + + for (auto i = 0; i < rank - 1; i++) { + values_dim.emplace_back(input_dim[i]); + } + out_w->set_dims(phi::make_ddim(values_dim)); + out_v->set_dims(input_dim); +} + +void FlattenInferMeta(const MetaTensor& x, + int start_axis, + int stop_axis, + MetaTensor* out) { + auto x_dims = x.dims(); + int in_dims_size = x_dims.size(); + if (start_axis < 0) { + start_axis = start_axis + in_dims_size; + } + if (stop_axis < 0) { + stop_axis = stop_axis + in_dims_size; + } + PADDLE_ENFORCE_GE( + stop_axis, + start_axis, + phi::errors::InvalidArgument("The stop_axis should be greater" + "than or equal to start_axis.")); + + int64_t outer = 1; + std::vector out_shape; + out_shape.reserve(in_dims_size - stop_axis + start_axis); + + for (int i = 0; i < start_axis; ++i) { + out_shape.push_back(x_dims[i]); + } + for (int i = start_axis; i <= stop_axis; i++) { + if (x_dims[i] == -1 || outer == -1) { + outer = -1; + } else { + outer *= x_dims[i]; + } + } + out_shape.push_back(outer); + for (int i = stop_axis + 1; i < in_dims_size; i++) { + out_shape.push_back(x_dims[i]); + } + const auto& out_dims = phi::make_ddim(out_shape); + out->set_dims(out_dims); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + + if (x_dims[0] == out_dims[0]) { + // Only pass LoD when the first dimension of output and Input(X) + // are the same. + out->share_lod(x); + } +} + +void GumbelSoftmaxInferMeta(const MetaTensor& x, + float temperature, + bool hard, + int axis, + MetaTensor* out) { + UnchangedInferMetaCheckAxis(x, axis, out); +} + +void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out) { + PADDLE_ENFORCE_EQ( + product(x.dims()), + 1UL, + errors::InvalidArgument("The number of elements in Input(X) should be 1." + "Now the number is %d.", + product(x.dims()))); + out->set_dims(x.dims()); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + +static phi::DDim ValidateShape(const std::vector shape, + const phi::DDim& in_dims) { + const int64_t in_size = phi::product(in_dims); + auto in_dims_vec = phi::vectorize(in_dims); + bool all_positive = std::all_of(in_dims_vec.cbegin(), + in_dims_vec.cend(), + [](int64_t i) { return i > 0; }); + // only one dimension can be set to -1, whose size will be automatically + // infered. 
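+  // A shape value of 0 keeps the corresponding dimension of the input, which
+  // is why capacity and output_shape fall back to in_dims[i] in that case.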
+ const int64_t unk_dim_val = -1; + const int64_t copy_dim_val = 0; + + std::vector output_shape(shape.size(), 0); + int64_t capacity = 1; + int unk_dim_idx = -1; + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] == unk_dim_val) { + PADDLE_ENFORCE_EQ( + unk_dim_idx, + -1, + phi::errors::InvalidArgument( + "Only one dimension value of 'shape' in ReshapeOp can " + "be -1. But received shape = [%s], shape[%d] is also -1.", + phi::make_ddim(shape), + i)); + unk_dim_idx = i; + } else if (shape[i] == copy_dim_val) { + PADDLE_ENFORCE_LT( + static_cast(i), + in_dims.size(), + phi::errors::InvalidArgument( + "The index of 0 in `shape` must be less than " + "the input tensor X's dimensions. " + "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " + "X's dimensions = %d.", + phi::make_ddim(shape), + i, + in_dims, + in_dims.size())); + } else { + PADDLE_ENFORCE_GT( + shape[i], + 0, + phi::errors::InvalidArgument( + "Each dimension value of 'shape' in ReshapeOp must not " + "be negative except one unknown dimension. " + "But received shape = [%s], shape[%d] = %d.", + phi::make_ddim(shape), + i, + shape[i])); + } + + // NOTE all non-zero values will be converted to True (include negative + // value) + capacity *= (shape[i] ? shape[i] : in_dims[i]); output_shape[i] = (shape[i] ? static_cast(shape[i]) : in_dims[i]); } @@ -360,6 +548,11 @@ void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out) { out->set_dtype(DataType::BOOL); } +void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) { + out->set_dims(x.dims()); + out->set_dtype(DataType::BOOL); +} + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, @@ -395,124 +588,97 @@ void MultinomialInferMeta(const MetaTensor& x, out->set_dtype(DataType::INT64); } -void TileInferMeta(const MetaTensor& x, - const ScalarArray& repeat_times, - MetaTensor* out, - MetaConfig config) { -#define MAX_RANK_SUPPORTED 6 - - auto repeat_times_data = repeat_times.GetData(); - auto x_dims = x.dims(); - if (repeat_times_data.size() == 0) { - repeat_times_data = std::vector(x_dims.size(), -1); - } - - PADDLE_ENFORCE_LE( - x_dims.size(), - MAX_RANK_SUPPORTED, - errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - x_dims.size())); - PADDLE_ENFORCE_LE( - repeat_times_data.size(), - MAX_RANK_SUPPORTED, - errors::InvalidArgument( - "The size of the shape of input 'repeat_times' for tile op " - "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - repeat_times_data.size())); - PADDLE_ENFORCE_GE( - repeat_times_data.size(), - 1, - errors::InvalidArgument( - "The size of the shape of input 'repeat_times' for tile op " - "must be positive integers, but the value received is %d.", - repeat_times_data.size())); - - auto out_rank = - std::max(static_cast(x_dims.size()), repeat_times_data.size()); - std::vector out_shape(out_rank); - auto x_dim_vec = phi::vectorize(x_dims); - if (x_dim_vec.size() > repeat_times_data.size()) { - auto diff = x_dim_vec.size() - repeat_times_data.size(); - repeat_times_data.insert(repeat_times_data.begin(), diff, -1); - } else { - auto diff = repeat_times_data.size() - x_dim_vec.size(); - x_dim_vec.insert(x_dim_vec.begin(), diff, -1); +void PadInferMeta(const MetaTensor& input, + const std::vector& paddings, + float pad_value, + MetaTensor* out, + MetaConfig config) { + auto x_dim = input.dims(); + PADDLE_ENFORCE_EQ( + static_cast(paddings.size()), + x_dim.size() * 2, + 
phi::errors::InvalidArgument( + "Size of 'paddings' dimension should be equal to 2 * size of " + "Input(X)'s dimension, but received (size of 'paddings' dimension " + "is) %d vs (2 * size of Input(X)'s dimension is) %d.", + static_cast(paddings.size()), + x_dim.size() * 2)); + for (size_t i = 0; i < paddings.size(); ++i) { + PADDLE_ENFORCE_GE(paddings[i], + 0, + phi::errors::InvalidArgument( + "The element of 'paddings' should >= 0, but " + "received %d for index %d.", + paddings[i], + static_cast(i))); } - for (size_t i = 0; i < repeat_times_data.size(); ++i) { - if (x_dim_vec[i] == -1 || repeat_times_data[i] == -1) { - out_shape[i] = -1; + std::vector out_dims(x_dim.size()); + for (int i = 0; i < x_dim.size(); ++i) { + if ((!config.is_runtime) && (x_dim[i] == -1)) { + out_dims[i] = -1; } else { - PADDLE_ENFORCE_GT( - repeat_times_data[i], - 0, - errors::InvalidArgument( - "Every element of the input 'repeat_times' for tile op must be " - "greater than 0, but the value given is %d.", - repeat_times_data[i])); - out_shape[i] = x_dim_vec[i] * repeat_times_data[i]; + out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1]; } } - - out->set_dims(phi::make_ddim(out_shape)); - if (out_shape[0] == x_dims[0]) { - out->share_lod(x); + out->set_dims(phi::make_ddim(out_dims)); + if (out_dims[0] == x_dim[0]) { + // Only pass LoD when the first dimension is equal between + // output and input. + out->share_lod(input); } + out->set_dtype(input.dtype()); } -void ReshapeInferMeta(const MetaTensor& x, - const ScalarArray& shape, - MetaTensor* out, - MetaConfig config) { - auto& shape_data = shape.GetData(); - PADDLE_ENFORCE_NOT_NULL(out, - phi::errors::InvalidArgument( - "Output(Out) of ReshapeOp should not be null.")); - if (!config.is_runtime && shape.FromTensor()) { - out->set_dims(phi::make_ddim(shape_data)); - out->share_lod(x); - return; - } - PADDLE_ENFORCE_GT(shape_data.size(), - 0, +void PixelShuffleInferMeta(const MetaTensor& x, + int upscale_factor, + const std::string& data_format, + MetaTensor* out) { + auto input_dims = x.dims(); + PADDLE_ENFORCE_EQ(input_dims.size(), + 4, phi::errors::InvalidArgument( - "The shape's size in ReshapeOp can't be zero.")); - InferMetaFromVecValue(x, shape_data, out); -} + "Input should be a 4-D tensor of format [N, C, H, W] " + "or [N, H, W, C], but got %u.", + input_dims.size())); -void ReshapeWithXShapeInferMeta(const MetaTensor& x, - const ScalarArray& shape, - MetaTensor* xshape, - MetaTensor* out, - MetaConfig config) { - PADDLE_ENFORCE_NOT_NULL( - xshape, - phi::errors::InvalidArgument( - "Output(XShape) of ReshapeOp should not be null.")); - const auto& x_dims = x.dims(); - std::vector xshape_dims(x_dims.size() + 1); - xshape_dims[0] = 0; - for (int i = 0; i < x_dims.size(); ++i) { - xshape_dims[i + 1] = x_dims[i]; + const bool channel_last = (data_format == "NHWC"); + + if (!channel_last) { + PADDLE_ENFORCE_EQ(input_dims[1] % (upscale_factor * upscale_factor), + 0, + phi::errors::InvalidArgument( + "The square of upscale_factor[%u] should divide the " + "number of channel[%u]", + upscale_factor * upscale_factor, + input_dims[1])); + } else { + PADDLE_ENFORCE_EQ(input_dims[3] % (upscale_factor * upscale_factor), + 0, + phi::errors::InvalidArgument( + "The square of upscale_factor[%u] should divide the " + "number of channel[%u]", + upscale_factor * upscale_factor, + input_dims[3])); } - xshape->set_dims(phi::make_ddim(xshape_dims)); - xshape->share_lod(x); - ReshapeInferMeta(x, shape, out, config); + auto output_dims = input_dims; + output_dims[0] 
= input_dims[0]; + if (!channel_last) { + output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor); + output_dims[2] = input_dims[2] * upscale_factor; + output_dims[3] = input_dims[3] * upscale_factor; + } else { + output_dims[1] = input_dims[1] * upscale_factor; + output_dims[2] = input_dims[2] * upscale_factor; + output_dims[3] = input_dims[3] / (upscale_factor * upscale_factor); + } + out->set_dtype(x.dtype()); + out->set_dims(output_dims); } -/* Why not use SumRawInferMeta directly? - Because we need make InferMetaFunction's args follow the design of api.yaml -*/ -void SumInferMeta(const MetaTensor& x, - const std::vector& axis, - DataType dtype, - bool keep_dim, - MetaTensor* out) { - bool reduce_all = false; - SumRawInferMeta(x, axis, keep_dim, reduce_all, dtype, out); +void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out) { + out->set_dims(x.dims()); + out->set_dtype(dtype::ToReal(x.dtype())); + out->set_layout(x.layout()); } DDim ReduceInferDim(const MetaTensor& x, @@ -584,29 +750,12 @@ DDim ReduceInferDim(const MetaTensor& x, return out_dim; } -void SumRawInferMeta(const MetaTensor& x, +void ReduceInferMeta(const MetaTensor& x, const std::vector& axis, bool keep_dim, - bool reduce_all, - DataType dtype, MetaTensor* out) { - DDim out_dim = ReduceInferDim(x, axis, keep_dim, reduce_all); - - DataType out_dtype; - if (dtype != DataType::UNDEFINED) { - out_dtype = dtype; - } else { - if (x.dtype() == DataType::BOOL || x.dtype() == DataType::INT32 || - x.dtype() == DataType::INT64) { - out_dtype = DataType::INT64; - } else { - out_dtype = x.dtype(); - } - } - - out->set_dims(out_dim); - out->set_dtype(out_dtype); - out->set_layout(x.layout()); + bool reduce_all = false; + ReduceInferMetaBase(x, axis, keep_dim, reduce_all, out); } void ReduceInferMetaBase(const MetaTensor& x, @@ -620,33 +769,109 @@ void ReduceInferMetaBase(const MetaTensor& x, out->set_layout(x.layout()); } -void ReduceInferMeta(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - MetaTensor* out) { - bool reduce_all = false; - ReduceInferMetaBase(x, axis, keep_dim, reduce_all, out); +void ReshapeInferMeta(const MetaTensor& x, + const ScalarArray& shape, + MetaTensor* out, + MetaConfig config) { + auto& shape_data = shape.GetData(); + PADDLE_ENFORCE_NOT_NULL(out, + phi::errors::InvalidArgument( + "Output(Out) of ReshapeOp should not be null.")); + if (!config.is_runtime && shape.FromTensor()) { + out->set_dims(phi::make_ddim(shape_data)); + out->share_lod(x); + return; + } + PADDLE_ENFORCE_GT(shape_data.size(), + 0, + phi::errors::InvalidArgument( + "The shape's size in ReshapeOp can't be zero.")); + InferMetaFromVecValue(x, shape_data, out); } -void TransferLayoutInferMeta(const MetaTensor& x, - DataLayout layout, - MetaTensor* out) { - out->set_dims(x.dims()); - out->set_dtype(x.dtype()); - out->set_layout(layout); +void ReshapeWithXShapeInferMeta(const MetaTensor& x, + const ScalarArray& shape, + MetaTensor* xshape, + MetaTensor* out, + MetaConfig config) { + PADDLE_ENFORCE_NOT_NULL( + xshape, + phi::errors::InvalidArgument( + "Output(XShape) of ReshapeOp should not be null.")); + const auto& x_dims = x.dims(); + std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; + } + xshape->set_dims(phi::make_ddim(xshape_dims)); + xshape->share_lod(x); + ReshapeInferMeta(x, shape, out, config); } -void SplitInferMeta(const MetaTensor& x, - const ScalarArray& num_or_sections, - const Scalar& axis, - 
std::vector out, - MetaConfig config) { - int axis_value = axis.to(); - int rank = x.dims().size(); - PADDLE_ENFORCE_EQ( - axis_value >= -rank && axis_value < rank, - true, - phi::errors::InvalidArgument( +void ShardIndexInferMeta(const MetaTensor& in, + int index_num, + int nshards, + int shard_id, + int ignore_value, + MetaTensor* out, + MetaConfig config) { + auto x_dims = in.dims(); + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::InvalidArgument("Rank of Input(X) should be at least 2, " + "but the value given is %d.", + x_dims.size())); + if (config.is_runtime || x_dims[x_dims.size() - 1] > 0) { + PADDLE_ENFORCE_EQ(x_dims[x_dims.size() - 1], + 1U, + phi::errors::InvalidArgument( + "The last dimension of Input(X) should be 1, " + "but the value given is %d.", + x_dims[x_dims.size() - 1])); + } + + out->set_dims(x_dims); + out->share_lod(in); + out->set_dtype(in.dtype()); +} + +void SizeInferMeta(const MetaTensor& input, MetaTensor* out) { + out->set_dtype(DataType::INT64); + out->set_dims({1}); +} + +void SoftmaxInferMeta(const MetaTensor& x, int axis, MetaTensor* out) { + auto dim_x = x.dims(); + auto rank_x = dim_x.size(); + PADDLE_ENFORCE_GE(axis, + -rank_x, + phi::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X).")); + PADDLE_ENFORCE_LT(axis, + rank_x, + phi::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X).")); + + out->set_dims(x.dims()); + out->set_dtype(x.dtype()); + out->share_lod(x); +} + +void SplitInferMeta(const MetaTensor& x, + const ScalarArray& num_or_sections, + const Scalar& axis, + std::vector out, + MetaConfig config) { + int axis_value = axis.to(); + int rank = x.dims().size(); + PADDLE_ENFORCE_EQ( + axis_value >= -rank && axis_value < rank, + true, + phi::errors::InvalidArgument( "The axis is expected to be in range of [%d, %d), but got %d", -rank, rank, @@ -767,22 +992,108 @@ void SplitInferMeta(const MetaTensor& x, } } -void UnbindInferMeta(const MetaTensor& x, - int axis, - std::vector* outs) { - auto in_dims = x.dims(); - std::vector out_dim; - axis = axis < 0 ? in_dims.size() + axis : axis; - for (int i = 0; i < in_dims.size(); ++i) { - if (i != axis) out_dim.push_back(in_dims[i]); +/* Why not use SumRawInferMeta directly? 
+ Because we need make InferMetaFunction's args follow the design of api.yaml +*/ +void SumInferMeta(const MetaTensor& x, + const std::vector& axis, + DataType dtype, + bool keep_dim, + MetaTensor* out) { + bool reduce_all = false; + SumRawInferMeta(x, axis, keep_dim, reduce_all, dtype, out); +} + +void SumRawInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + DataType dtype, + MetaTensor* out) { + DDim out_dim = ReduceInferDim(x, axis, keep_dim, reduce_all); + + DataType out_dtype; + if (dtype != DataType::UNDEFINED) { + out_dtype = dtype; + } else { + if (x.dtype() == DataType::BOOL || x.dtype() == DataType::INT32 || + x.dtype() == DataType::INT64) { + out_dtype = DataType::INT64; + } else { + out_dtype = x.dtype(); + } } - auto out_dims = phi::make_ddim(out_dim); - for (size_t i = 0; i < outs->size(); ++i) { - (*outs)[i].set_dtype(x.dtype()); - (*outs)[i].set_dims(out_dims); - (*outs)[i].set_layout(x.layout()); - (*outs)[i].share_lod(x); + out->set_dims(out_dim); + out->set_dtype(out_dtype); + out->set_layout(x.layout()); +} + +void TileInferMeta(const MetaTensor& x, + const ScalarArray& repeat_times, + MetaTensor* out, + MetaConfig config) { +#define MAX_RANK_SUPPORTED 6 + + auto repeat_times_data = repeat_times.GetData(); + auto x_dims = x.dims(); + if (repeat_times_data.size() == 0) { + repeat_times_data = std::vector(x_dims.size(), -1); + } + + PADDLE_ENFORCE_LE( + x_dims.size(), + MAX_RANK_SUPPORTED, + errors::InvalidArgument( + "The rank of the input 'x' for tile op " + "must not be greater than %d, but the value received is %d.", + MAX_RANK_SUPPORTED, + x_dims.size())); + PADDLE_ENFORCE_LE( + repeat_times_data.size(), + MAX_RANK_SUPPORTED, + errors::InvalidArgument( + "The size of the shape of input 'repeat_times' for tile op " + "must not be greater than %d, but the value received is %d.", + MAX_RANK_SUPPORTED, + repeat_times_data.size())); + PADDLE_ENFORCE_GE( + repeat_times_data.size(), + 1, + errors::InvalidArgument( + "The size of the shape of input 'repeat_times' for tile op " + "must be positive integers, but the value received is %d.", + repeat_times_data.size())); + + auto out_rank = + std::max(static_cast(x_dims.size()), repeat_times_data.size()); + std::vector out_shape(out_rank); + auto x_dim_vec = phi::vectorize(x_dims); + if (x_dim_vec.size() > repeat_times_data.size()) { + auto diff = x_dim_vec.size() - repeat_times_data.size(); + repeat_times_data.insert(repeat_times_data.begin(), diff, -1); + } else { + auto diff = repeat_times_data.size() - x_dim_vec.size(); + x_dim_vec.insert(x_dim_vec.begin(), diff, -1); + } + for (size_t i = 0; i < repeat_times_data.size(); ++i) { + if (x_dim_vec[i] == -1 || repeat_times_data[i] == -1) { + out_shape[i] = -1; + } else { + PADDLE_ENFORCE_GT( + repeat_times_data[i], + 0, + errors::InvalidArgument( + "Every element of the input 'repeat_times' for tile op must be " + "greater than 0, but the value given is %d.", + repeat_times_data[i])); + out_shape[i] = x_dim_vec[i] * repeat_times_data[i]; + } + } + + out->set_dims(phi::make_ddim(out_shape)); + if (out_shape[0] == x_dims[0]) { + out->share_lod(x); } } @@ -840,79 +1151,112 @@ void TraceInferMeta( out->set_dtype(x.dtype()); } -void DiagonalInferMeta(const MetaTensor& input, - int offset, - int axis1, - int axis2, - MetaTensor* out) { - auto x_dims = input.dims(); - int offset_ = offset; - int axis1_ = axis1 < 0 ? x_dims.size() + axis1 : axis1; - int axis2_ = axis2 < 0 ? 
x_dims.size() + axis2 : axis2; +void TransferLayoutInferMeta(const MetaTensor& x, + DataLayout layout, + MetaTensor* out) { + out->set_dims(x.dims()); + out->set_dtype(x.dtype()); + out->set_layout(layout); +} - PADDLE_ENFORCE_GE( - x_dims.size(), - 2, - phi::errors::OutOfRange("Input's dim is out of range (expected at " - "least 2 dimensions, but got %ld).", - x_dims.size())); - PADDLE_ENFORCE_LT( - axis1_, - x_dims.size(), - phi::errors::OutOfRange( - "Attr(axis1) is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size()), - (x_dims.size() - 1), - axis1)); - PADDLE_ENFORCE_LT( - axis2_, - x_dims.size(), - phi::errors::OutOfRange( - "Attr(axis2) is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size()), - (x_dims.size() - 1), - axis2)); - PADDLE_ENFORCE_NE( - axis1_, - axis2_, - phi::errors::InvalidArgument("The dimensions should not be identical " - "%d vs %d.", - axis1, - axis2)); +void TransposeInferMeta(const MetaTensor& x, + const std::vector& axis, + MetaTensor* out) { + auto x_dims = x.dims(); + size_t x_rank = x_dims.size(); + size_t axis_size = axis.size(); - auto out_dims = vectorize(x_dims); - // from out_dims get the dim size of axis1_. - auto axis1_size = out_dims[axis1_]; - auto axis2_size = out_dims[axis2_]; - // delete two dims by attr axis1 and axis2 from out_dims. - /* example: - out_dim = [2, 3, 4]; - axis1 = 0; - axis2 = 1; - according to the attr of axis1 and axis2, we get: - out_dim = [4]. - */ - out_dims.erase(out_dims.begin() + std::max(axis1_, axis2_)); - out_dims.erase(out_dims.begin() + std::min(axis1_, axis2_)); + PADDLE_ENFORCE_EQ( + x_rank, + axis_size, + errors::InvalidArgument("The input tensor's dimension " + "should be equal to the axis's size. " + "But received input tensor's dimension is %d, " + "axis's size is %d", + x_rank, + axis_size)); - if (offset_ == 0) { - out_dims.push_back(std::min(axis1_size, axis2_size)); - } else if (offset_ > 0) { - if ((axis2_size - offset_) > 0) { - out_dims.push_back(std::min(axis1_size, axis2_size - offset_)); - } else { - out_dims.push_back(0); - } - } else { - if ((axis1_size + offset_) > 0) { - out_dims.push_back(std::min(axis1_size + offset_, axis2_size)); - } else { - out_dims.push_back(0); - } + std::vector count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + PADDLE_ENFORCE_GE( + axis[i], + 0, + errors::InvalidArgument("The axis should be greater than or equal to 0." + "But received %d of axis[%d]", + axis[i], + i)); + + PADDLE_ENFORCE_EQ( + axis[i] < static_cast(axis_size) && ++count[axis[i]] == 1, + true, + errors::InvalidArgument( + "Each element of Attribute axis should " + "be a unique value range from 0 to (dims - 1), " + "where the dims is the axis's size, " + "unique value means this axis value can appear only once. " + "But received axis[%d] is %d, axis_size is %d, " + "count[axis[%d]] is %d", + i, + axis[i], + axis_size, + i, + count[axis[i]])); } - out->set_dims(phi::make_ddim(out_dims)); + + phi::DDim out_dims(x_dims); + for (size_t i = 0; i < axis_size; ++i) { + out_dims[i] = x_dims[axis[i]]; + } + + out->set_dims(out_dims); + out->set_dtype(x.dtype()); +} + +void UnbindInferMeta(const MetaTensor& x, + int axis, + std::vector* outs) { + auto in_dims = x.dims(); + std::vector out_dim; + axis = axis < 0 ? 
in_dims.size() + axis : axis; + for (int i = 0; i < in_dims.size(); ++i) { + if (i != axis) out_dim.push_back(in_dims[i]); + } + auto out_dims = phi::make_ddim(out_dim); + + for (size_t i = 0; i < outs->size(); ++i) { + (*outs)[i].set_dtype(x.dtype()); + (*outs)[i].set_dims(out_dims); + (*outs)[i].set_layout(x.layout()); + (*outs)[i].share_lod(x); + } +} + +void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) { + out->share_meta(x); +} + +// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1] +void UnchangedInferMetaCheckAxis(const MetaTensor& x, + int axis, + MetaTensor* out) { + auto rank = x.dims().size(); + PADDLE_ENFORCE_GE( + axis, + -rank, + errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X). But received axis: %d, R: %d.", + axis, + rank)); + PADDLE_ENFORCE_LT( + axis, + rank, + phi::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X). But received axis: %d, R: %d.", + axis, + rank)); + out->share_meta(x); } void UnfoldInferMeta(const MetaTensor& x, @@ -1073,303 +1417,6 @@ void UnfoldInferMeta(const MetaTensor& x, out->set_dims(phi::make_ddim(out_dims)); } -void DiagInferMeta(const MetaTensor& x, - int offset, - float padding_value, - MetaTensor* out) { - auto x_dims = x.dims(); - - if (x_dims.size() == 1UL) { - int64_t size_ = x_dims[0] + std::abs(offset); - out->set_dims({size_, size_}); - out->set_dtype(x.dtype()); - } else if (x_dims.size() == 2UL) { - int64_t size_ = 0; - if (offset >= 0) { - // Note(LutaoChu): Do not use std::min here, otherwise the calculation - // of `size_` will have unexpected result on Windows Python3.8 - if (x_dims[0] < x_dims[1] - offset) { - size_ = x_dims[0]; - } else { - size_ = x_dims[1] - offset; - } - } else { - // Note(LutaoChu): Do not use std::min here, otherwise the calculation - // of `size_` will have unexpected result on Windows Python3.8 - if (x_dims[0] + offset < x_dims[1]) { - size_ = x_dims[0] + offset; - } else { - size_ = x_dims[1]; - } - } - out->set_dims({size_}); - out->set_dtype(x.dtype()); - } else { - PADDLE_THROW(phi::errors::InvalidArgument( - "The input tensor X's dimensions of DiagV2Op should be either 1 or " - "2, but received %d.", - x_dims.size())); - } -} - -void ArgMinMaxInferMeta(const MetaTensor& x, - int64_t axis, - bool keepdims, - bool flatten, - int dtype, - MetaTensor* out, - MetaConfig config) { - const auto& x_dims = x.dims(); - - PADDLE_ENFORCE_GE( - axis, - -x_dims.size(), - phi::errors::InvalidArgument("'axis'(%d) must be greater than or equal to" - " -Rank(X)(%d).", - axis, - -x_dims.size())); - PADDLE_ENFORCE_LT(axis, - x_dims.size(), - phi::errors::InvalidArgument( - "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", - axis, - x_dims.size())); - - PADDLE_ENFORCE_EQ( - (dtype < 0 || dtype == 2 || dtype == 3), - true, - phi::errors::InvalidArgument( - "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " - "received [%s]", - paddle::framework::DataTypeToString( - paddle::framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - paddle::framework::proto::VarType::INT64), - paddle::framework::DataTypeToString( - static_cast(dtype)))); - - auto x_rank = x_dims.size(); - if (axis < 0) axis += x_rank; - if (config.is_runtime) { - if (dtype == paddle::framework::proto::VarType::INT32) { - int64_t all_element_num = 0; - if (flatten) { - all_element_num = phi::product(x_dims); - - } else { - all_element_num = x_dims[axis]; - } - 
PADDLE_ENFORCE_LE( - all_element_num, - INT_MAX, - phi::errors::InvalidArgument( - "The element num of the argmin/argmax input at axis is " - "%d, is larger than int32 maximum value:%d, you must " - "set the dtype of argmin/argmax to 'int64'.", - all_element_num, - INT_MAX)); - } - } - std::vector vec; - if (flatten) { - vec.emplace_back(static_cast(1)); - } else { - for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]); - if (keepdims) { - vec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]); - } - out->set_dims(phi::make_ddim(vec)); - if (dtype == 2) { - out->set_dtype(DataType::INT32); - } else if (dtype == 3) { - out->set_dtype(DataType::INT64); - } -} - -void SizeInferMeta(const MetaTensor& input, MetaTensor* out) { - out->set_dtype(DataType::INT64); - out->set_dims({1}); -} - -void PadInferMeta(const MetaTensor& input, - const std::vector& paddings, - float pad_value, - MetaTensor* out, - MetaConfig config) { - auto x_dim = input.dims(); - PADDLE_ENFORCE_EQ( - static_cast(paddings.size()), - x_dim.size() * 2, - phi::errors::InvalidArgument( - "Size of 'paddings' dimension should be equal to 2 * size of " - "Input(X)'s dimension, but received (size of 'paddings' dimension " - "is) %d vs (2 * size of Input(X)'s dimension is) %d.", - static_cast(paddings.size()), - x_dim.size() * 2)); - for (size_t i = 0; i < paddings.size(); ++i) { - PADDLE_ENFORCE_GE(paddings[i], - 0, - phi::errors::InvalidArgument( - "The element of 'paddings' should >= 0, but " - "received %d for index %d.", - paddings[i], - static_cast(i))); - } - std::vector out_dims(x_dim.size()); - for (int i = 0; i < x_dim.size(); ++i) { - if ((!config.is_runtime) && (x_dim[i] == -1)) { - out_dims[i] = -1; - } else { - out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1]; - } - } - out->set_dims(phi::make_ddim(out_dims)); - if (out_dims[0] == x_dim[0]) { - // Only pass LoD when the first dimension is equal between - // output and input. 
- out->share_lod(input); - } - out->set_dtype(input.dtype()); -} - -void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) { - out->set_dims(x.dims()); - out->set_dtype(DataType::BOOL); -} - -void PixelShuffleInferMeta(const MetaTensor& x, - int upscale_factor, - const std::string& data_format, - MetaTensor* out) { - auto input_dims = x.dims(); - PADDLE_ENFORCE_EQ(input_dims.size(), - 4, - phi::errors::InvalidArgument( - "Input should be a 4-D tensor of format [N, C, H, W] " - "or [N, H, W, C], but got %u.", - input_dims.size())); - - const bool channel_last = (data_format == "NHWC"); - - if (!channel_last) { - PADDLE_ENFORCE_EQ(input_dims[1] % (upscale_factor * upscale_factor), - 0, - phi::errors::InvalidArgument( - "The square of upscale_factor[%u] should divide the " - "number of channel[%u]", - upscale_factor * upscale_factor, - input_dims[1])); - } else { - PADDLE_ENFORCE_EQ(input_dims[3] % (upscale_factor * upscale_factor), - 0, - phi::errors::InvalidArgument( - "The square of upscale_factor[%u] should divide the " - "number of channel[%u]", - upscale_factor * upscale_factor, - input_dims[3])); - } - auto output_dims = input_dims; - output_dims[0] = input_dims[0]; - if (!channel_last) { - output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor); - output_dims[2] = input_dims[2] * upscale_factor; - output_dims[3] = input_dims[3] * upscale_factor; - } else { - output_dims[1] = input_dims[1] * upscale_factor; - output_dims[2] = input_dims[2] * upscale_factor; - output_dims[3] = input_dims[3] / (upscale_factor * upscale_factor); - } - out->set_dtype(x.dtype()); - out->set_dims(output_dims); -} - -void TransposeInferMeta(const MetaTensor& x, - const std::vector& axis, - MetaTensor* out) { - auto x_dims = x.dims(); - size_t x_rank = x_dims.size(); - size_t axis_size = axis.size(); - - PADDLE_ENFORCE_EQ( - x_rank, - axis_size, - errors::InvalidArgument("The input tensor's dimension " - "should be equal to the axis's size. " - "But received input tensor's dimension is %d, " - "axis's size is %d", - x_rank, - axis_size)); - - std::vector count(axis_size, 0); - for (size_t i = 0; i < axis_size; i++) { - PADDLE_ENFORCE_GE( - axis[i], - 0, - errors::InvalidArgument("The axis should be greater than or equal to 0." - "But received %d of axis[%d]", - axis[i], - i)); - - PADDLE_ENFORCE_EQ( - axis[i] < static_cast(axis_size) && ++count[axis[i]] == 1, - true, - errors::InvalidArgument( - "Each element of Attribute axis should " - "be a unique value range from 0 to (dims - 1), " - "where the dims is the axis's size, " - "unique value means this axis value can appear only once. " - "But received axis[%d] is %d, axis_size is %d, " - "count[axis[%d]] is %d", - i, - axis[i], - axis_size, - i, - count[axis[i]])); - } - - phi::DDim out_dims(x_dims); - for (size_t i = 0; i < axis_size; ++i) { - out_dims[i] = x_dims[axis[i]]; - } - - out->set_dims(out_dims); - out->set_dtype(x.dtype()); -} - -void EighInferMeta(const MetaTensor& x, - const std::string& uplo, - MetaTensor* out_w, - MetaTensor* out_v) { - auto input_dim = x.dims(); - auto rank = input_dim.size(); - - PADDLE_ENFORCE_GE(rank, - 2, - phi::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions." - "But received a %d dimension tensor.", - rank)); - PADDLE_ENFORCE_EQ( - input_dim[rank - 2], - input_dim[rank - 1], - phi::errors::InvalidArgument( - "Eigh op is designed for square matrix, consequently" - "inner-most 2 dimensions of Input(X) should be symmetric." 
- "But received X's shape[-2] = %d and shape[-1] = %d.", - input_dim[rank - 2], - input_dim[rank - 1])); - - std::vector values_dim; - - for (auto i = 0; i < rank - 1; i++) { - values_dim.emplace_back(input_dim[i]); - } - out_w->set_dims(phi::make_ddim(values_dim)); - out_v->set_dims(input_dim); -} - void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out) { auto rank = condition.dims().size(); PADDLE_ENFORCE_GE( @@ -1381,53 +1428,6 @@ void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out) { out->set_dtype(DataType::INT64); } -void ShardIndexInferMeta(const MetaTensor& in, - int index_num, - int nshards, - int shard_id, - int ignore_value, - MetaTensor* out, - MetaConfig config) { - auto x_dims = in.dims(); - PADDLE_ENFORCE_GE( - x_dims.size(), - 2, - phi::errors::InvalidArgument("Rank of Input(X) should be at least 2, " - "but the value given is %d.", - x_dims.size())); - if (config.is_runtime || x_dims[x_dims.size() - 1] > 0) { - PADDLE_ENFORCE_EQ(x_dims[x_dims.size() - 1], - 1U, - phi::errors::InvalidArgument( - "The last dimension of Input(X) should be 1, " - "but the value given is %d.", - x_dims[x_dims.size() - 1])); - } - - out->set_dims(x_dims); - out->share_lod(in); - out->set_dtype(in.dtype()); -} - -void SoftmaxInferMeta(const MetaTensor& x, int axis, MetaTensor* out) { - auto dim_x = x.dims(); - auto rank_x = dim_x.size(); - PADDLE_ENFORCE_GE(axis, - -rank_x, - phi::errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(X).")); - PADDLE_ENFORCE_LT(axis, - rank_x, - phi::errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(X).")); - - out->set_dims(x.dims()); - out->set_dtype(x.dtype()); - out->share_lod(x); -} - } // namespace phi PD_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index c7b7f8e3c13..539b6dcba42 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -32,32 +32,20 @@ class MetaConfig; // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. 
+void ArgMinMaxInferMeta(const MetaTensor& x, + int64_t axis, + bool keepdims, + bool flatten, + int dtype, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void ArgsortInferMeta(const MetaTensor& input, int axis, bool descending, MetaTensor* output, MetaTensor* indices); -void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out); - -// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1] -void UnchangedInferMetaCheckAxis(const MetaTensor& x, - int axis, - MetaTensor* out); - -void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out); - -void FlattenInferMeta(const MetaTensor& x, - int start_axis, - int stop_axis, - MetaTensor* out); - -void GumbelSoftmaxInferMeta(const MetaTensor& x, - float temperature, - bool hard, - int axis, - MetaTensor* out); - void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out); void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out); @@ -76,6 +64,30 @@ void CumsumInferMeta(const MetaTensor& x, bool reverse, MetaTensor* out); +void DiagInferMeta(const MetaTensor& x, + int offset, + float padding_value, + MetaTensor* out); + +void DiagonalInferMeta( + const MetaTensor& input, int offset, int axis1, int axis2, MetaTensor* out); + +void EighInferMeta(const MetaTensor& x, + const std::string& uplo, + MetaTensor* out_w, + MetaTensor* out_v); + +void FlattenInferMeta(const MetaTensor& x, + int start_axis, + int stop_axis, + MetaTensor* out); + +void GumbelSoftmaxInferMeta(const MetaTensor& x, + float temperature, + bool hard, + int axis, + MetaTensor* out); + void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out); void InferMetaFromVecValue(const MetaTensor& x, @@ -84,11 +96,37 @@ void InferMetaFromVecValue(const MetaTensor& x, void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out); +void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out); + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, MetaTensor* out); +void PadInferMeta(const MetaTensor& input, + const std::vector& paddings, + float pad_value, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void PixelShuffleInferMeta(const MetaTensor& x, + int upscale_factor, + const std::string& data_format, + MetaTensor* out); + +void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out); + +void ReduceInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + MetaTensor* out); + +void ReduceInferMetaBase(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + MetaTensor* out); + void ReshapeInferMeta(const MetaTensor& x, const ScalarArray& shape, MetaTensor* out, @@ -100,28 +138,23 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); -void TileInferMeta(const MetaTensor& x, - const ScalarArray& repeat_times, - MetaTensor* out, - MetaConfig config = MetaConfig()); +void ShardIndexInferMeta(const MetaTensor& in, + int index_num, + int nshards, + int shard_id, + int ignore_value, + MetaTensor* out, + MetaConfig config = MetaConfig()); -void SumRawInferMeta(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - bool reduce_all, - DataType dtype, - MetaTensor* out); +void SizeInferMeta(const MetaTensor& input, MetaTensor* out); -void ReduceInferMetaBase(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - bool reduce_all, - MetaTensor* out); +void SoftmaxInferMeta(const MetaTensor& x, int axis, MetaTensor* out); -void ReduceInferMeta(const 
MetaTensor& x, - const std::vector& axis, - bool keep_dim, - MetaTensor* out); +void SplitInferMeta(const MetaTensor& x_meta, + const ScalarArray& num_or_sections, + const Scalar& axis, + std::vector out, + MetaConfig config = MetaConfig()); void SumInferMeta(const MetaTensor& x, const std::vector& axis, @@ -129,21 +162,39 @@ void SumInferMeta(const MetaTensor& x, bool keep_dim, MetaTensor* out); +void SumRawInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + DataType dtype, + MetaTensor* out); + +void TileInferMeta(const MetaTensor& x, + const ScalarArray& repeat_times, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void TraceInferMeta( + const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out); + void TransferLayoutInferMeta(const MetaTensor& x, DataLayout layout, MetaTensor* out); -void SplitInferMeta(const MetaTensor& x_meta, - const ScalarArray& num_or_sections, - const Scalar& axis, - std::vector out, - MetaConfig config = MetaConfig()); +void TransposeInferMeta(const MetaTensor& x, + const std::vector& axis, + MetaTensor* out); void UnbindInferMeta(const MetaTensor& x, int axis, std::vector* outs); -void TraceInferMeta( - const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out); + +void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out); + +// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1] +void UnchangedInferMetaCheckAxis(const MetaTensor& x, + int axis, + MetaTensor* out); void UnfoldInferMeta(const MetaTensor& x, const std::vector& kernel_sizes, @@ -153,56 +204,6 @@ void UnfoldInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); -void DiagInferMeta(const MetaTensor& x, - int offset, - float padding_value, - MetaTensor* out); - -void ArgMinMaxInferMeta(const MetaTensor& x, - int64_t axis, - bool keepdims, - bool flatten, - int dtype, - MetaTensor* out, - MetaConfig config = MetaConfig()); - -void SizeInferMeta(const MetaTensor& input, MetaTensor* out); - -void PadInferMeta(const MetaTensor& input, - const std::vector& paddings, - float pad_value, - MetaTensor* out, - MetaConfig config = MetaConfig()); - -void DiagonalInferMeta( - const MetaTensor& input, int offset, int axis1, int axis2, MetaTensor* out); - -void PixelShuffleInferMeta(const MetaTensor& x, - int upscale_factor, - const std::string& data_format, - MetaTensor* out); - -void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out); - -void TransposeInferMeta(const MetaTensor& x, - const std::vector& axis, - MetaTensor* out); - -void EighInferMeta(const MetaTensor& x, - const std::string& uplo, - MetaTensor* out_w, - MetaTensor* out_v); - void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out); -void ShardIndexInferMeta(const MetaTensor& in, - int index_num, - int nshards, - int shard_id, - int ignore_value, - MetaTensor* out, - MetaConfig config = MetaConfig()); - -void SoftmaxInferMeta(const MetaTensor& x, int axis, MetaTensor* out); - } // namespace phi -- GitLab From f3f27d25274cc08297affdc7acf6816a1e5f9b94 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Sun, 13 Mar 2022 14:21:31 +0800 Subject: [PATCH 016/176] [PHI] Refactor infermeta files (Part2) (#40367) * refactor infermeta files * update --- paddle/fluid/operators/gather_nd_op.cc | 1 - paddle/phi/infermeta/backward.cc | 29 +- paddle/phi/infermeta/backward.h | 7 +- paddle/phi/infermeta/binary.cc | 903 +++++++++++++------------ paddle/phi/infermeta/binary.h | 114 ++-- paddle/phi/infermeta/nullary.cc | 
36 +- paddle/phi/infermeta/nullary.h | 18 +- paddle/phi/infermeta/ternary.cc | 305 ++++----- paddle/phi/infermeta/ternary.h | 46 +- 9 files changed, 731 insertions(+), 728 deletions(-) diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc index e5ca15a39ef..7d7d6ae81a0 100644 --- a/paddle/fluid/operators/gather_nd_op.cc +++ b/paddle/fluid/operators/gather_nd_op.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/backward.h" #include "paddle/phi/infermeta/binary.h" -#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 801bd98b504..a2bdf6b963b 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -64,10 +64,14 @@ void BilinearTensorProductGradInferMeta(const MetaTensor& x, } } -void GeneralUnaryGradInferMeta(const MetaTensor& x, MetaTensor* dx) { - if (dx) { - dx->share_meta(x); - } +void GatherNdGradInferMeta(const MetaTensor& x, + const MetaTensor& index, + const MetaTensor& out_grad, + MetaTensor* x_grad) { + const auto& dtype = out_grad.dtype(); + x_grad->set_dims(x.dims()); + x_grad->share_lod(x); + x_grad->set_dtype(dtype); } void GeneralBinaryGradInferMeta(const MetaTensor& x, @@ -99,6 +103,12 @@ void GeneralTernaryGradInferMeta(const MetaTensor& x, } } +void GeneralUnaryGradInferMeta(const MetaTensor& x, MetaTensor* dx) { + if (dx) { + dx->share_meta(x); + } +} + void GumbelSoftmaxGradInferMeta(const MetaTensor& out, const MetaTensor& dout, int axis, @@ -108,17 +118,8 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, dout.dims(), errors::InvalidArgument( "Input(Out) and its gradients should have the same shape.")); - dx->share_meta(dout); -} -void GatherNdGradInferMeta(const MetaTensor& x, - const MetaTensor& index, - const MetaTensor& out_grad, - MetaTensor* x_grad) { - const auto& dtype = out_grad.dtype(); - x_grad->set_dims(x.dims()); - x_grad->share_lod(x); - x_grad->set_dtype(dtype); + dx->share_meta(dout); } void PsroiPoolGradInferMeta(const MetaTensor& x, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 9ed24ef8646..921df460118 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -30,7 +30,10 @@ void BilinearTensorProductGradInferMeta(const MetaTensor& x, MetaTensor* dweight, MetaTensor* dbias); -void GeneralUnaryGradInferMeta(const MetaTensor& x, MetaTensor* dx); +void GatherNdGradInferMeta(const MetaTensor& x, + const MetaTensor& index, + const MetaTensor& out_grad, + MetaTensor* x_grad); void GeneralBinaryGradInferMeta(const MetaTensor& x, const MetaTensor& y, @@ -44,6 +47,8 @@ void GeneralTernaryGradInferMeta(const MetaTensor& x, MetaTensor* dy, MetaTensor* dz); +void GeneralUnaryGradInferMeta(const MetaTensor& x, MetaTensor* dx); + void GumbelSoftmaxGradInferMeta(const MetaTensor& out, const MetaTensor& dout, int axis, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 641956c4d9d..b9d43224456 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -22,6 +22,153 @@ limitations under the License. 
*/ namespace phi { +void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { + out->share_meta(x); +} + +void BCELossInferMeta(const MetaTensor& input, + const MetaTensor& label, + MetaTensor* out, + MetaConfig config) { + auto input_dims = input.dims(); + auto label_dims = label.dims(); + + int rank = input_dims.size(); + PADDLE_ENFORCE_EQ(rank, + label_dims.size(), + phi::errors::InvalidArgument( + "Input(X) and Input(Label) shall have the same rank." + "But received: the rank of Input(X) is [%d], " + "the rank of Input(Label) is [%d].", + rank, + label_dims.size())); + + bool check = true; + if ((!config.is_runtime) && + (phi::product(input_dims) <= 0 || phi::product(label_dims) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(input_dims, + label_dims, + phi::errors::InvalidArgument( + "Input(X) and Input(Label) shall have the same " + "shape. But received: the shape of Input(X) is " + "[%s], the shape of Input(Label) is [%s].", + input_dims, + label_dims)); + } + + out->set_dims(input_dims); + out->set_dtype(input.dtype()); + out->share_lod(input); +} + +void BincountInferMeta(const MetaTensor& x, + const paddle::optional weights, + int minlength, + MetaTensor* out) { + auto input_dim = x.dims(); + + PADDLE_ENFORCE_GE(minlength, + 0, + phi::errors::InvalidArgument( + "The minlength should be greater than or equal to 0." + "But received minlength is %d", + minlength)); + + PADDLE_ENFORCE_EQ( + input_dim.size(), + 1, + phi::errors::InvalidArgument("The 'shape' of Input(X) must be 1-D tensor." + "But the dimension of Input(X) is [%d]", + input_dim.size())); + + if (weights.is_initialized()) { + auto weights_dim = weights->dims(); + PADDLE_ENFORCE_EQ(weights_dim.size(), + 1, + phi::errors::InvalidArgument( + "The 'shape' of Input(Weights) must be 1-D tensor." + "But the dimension of Input(Weights) is [%d]", + weights_dim.size())); + + PADDLE_ENFORCE_EQ( + weights_dim[0], + input_dim[0], + phi::errors::InvalidArgument( + "The 'shape' of Input(Weights) must be equal to the 'shape' of " + "Input(X)." 
+ "But received: the 'shape' of Input(Weights) is [%s]," + "the 'shape' of Input(X) is [%s]", + weights_dim, + input_dim)); + } + out->set_dims(phi::make_ddim({-1})); + if (weights.is_initialized()) { + out->set_dtype(weights->dtype()); + } else { + out->set_dtype(x.dtype()); + } + + out->share_lod(x); +} + +void CholeskySolveInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool upper, + MetaTensor* out) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + auto x_dims_n = x_dims.size(); + auto y_dims_n = y_dims.size(); + + PADDLE_ENFORCE_GE(x_dims_n, + 2, + phi::errors::InvalidArgument( + "the rank of input Y must greater or equal to 2")); + PADDLE_ENFORCE_GE(y_dims_n, + 2, + phi::errors::InvalidArgument( + "the rank of input X must greater or equal to 2")); + PADDLE_ENFORCE_EQ( + y_dims[y_dims_n - 1], + y_dims[y_dims_n - 2], + phi::errors::InvalidArgument("input Matrix Y should be square matrix," + "But Got last shape of %ld x %ld", + y_dims[y_dims_n - 1], + y_dims[y_dims_n - 2])); + PADDLE_ENFORCE_EQ( + x_dims[x_dims_n - 2], + y_dims[y_dims_n - 2], + phi::errors::InvalidArgument("the first dim of Matrix X must be equal to " + "the fisrt dim of Matrix Y," + "But Got %ld and %ld", + x_dims[x_dims_n - 2], + y_dims[y_dims_n - 2])); + + std::vector x_dims_vec = phi::vectorize(x_dims); + std::vector y_dims_vec = phi::vectorize(y_dims); + + std::vector x_dims_vec_cut(x_dims_vec.begin(), x_dims_vec.end() - 2); + std::vector y_dims_vec_cut(y_dims_vec.begin(), y_dims_vec.end() - 2); + + std::vector expand_batch_portion = + funcs::MatrixGetBroadcastBatchPortion(x_dims_vec_cut, y_dims_vec_cut); + + std::vector x_broadcast_dims({expand_batch_portion}); + x_broadcast_dims.insert(x_broadcast_dims.end(), + {x_dims_vec[x_dims_n - 2], x_dims_vec[x_dims_n - 1]}); + + // dim of 'out' is the same with 'X' after broadcast + out->set_dims(phi::make_ddim(x_broadcast_dims)); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); +} + void CompareInferMeta(const MetaTensor& x, const MetaTensor& y, int axis, @@ -67,6 +214,74 @@ void CompareAllInferMeta(const MetaTensor& x, out->set_dtype(DataType::BOOL); } +void CrossInferMeta(const MetaTensor& x, + const MetaTensor& y, + int axis, + MetaTensor* out) { + auto x_dim = x.dims(); + auto y_dim = y.dims(); + auto dim = axis; + + bool dims_match = phi::funcs::CheckDims(x_dim, y_dim); + PADDLE_ENFORCE_EQ( + dims_match, + true, + phi::errors::InvalidArgument("The 'shape' of Input(X) should be equal to " + "the 'shape' of Input(Y). But received " + "Input(X).dimensions = [%s], " + "Input(Y).dimensions = [%s]", + x_dim, + y_dim)); + + if (dim != DDim::kMaxRank) { + PADDLE_ENFORCE_EQ( + dim < x_dim.size() && dim >= (0 - x_dim.size()), + true, + phi::errors::OutOfRange( + "Attr(dim) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(dim) = %d.", + x_dim.size(), + x_dim.size() - 1, + dim)); + if (dim < 0) { + dim += x_dim.size(); + } + PADDLE_ENFORCE_EQ(x_dim[dim] == 3 && y_dim[dim] == 3, + true, + phi::errors::InvalidArgument( + "Input(X/Y).dims()[dim] should be equal to 3." 
+ "But received Input(X/Y).dims()[dim] = %d.", + x_dim[dim])); + } + out->set_dims(x_dim); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); +} + +void DistInferMeta(const MetaTensor& x, + const MetaTensor& y, + float p, + MetaTensor* out) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + PADDLE_ENFORCE_NE(phi::product(x_dims), + 0, + phi::errors::InvalidArgument( + "The Input(X) has not been initialized properly. The " + "shape of Input(X) = [%s].", + x_dims)); + PADDLE_ENFORCE_NE(phi::product(y_dims), + 0, + phi::errors::InvalidArgument( + "The Input(Y) has not been initialized properly. The " + "shape of Input(Y) = [%s].", + y_dims)); + out->set_dims({1}); + out->set_dtype(x.dtype()); +} + void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { auto x_dims = x.dims(); auto x_rank = static_cast(x_dims.size()); @@ -109,84 +324,11 @@ void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { out->set_layout(x.layout()); } -void MatmulInferMeta(const MetaTensor& x, - const MetaTensor& y, - bool trans_x, - bool trans_y, - MetaTensor* out) { - std::vector dims_x = phi::vectorize(x.dims()); - std::vector dims_y = phi::vectorize(y.dims()); - auto ndims_x = dims_x.size(); - auto ndims_y = dims_y.size(); - PADDLE_ENFORCE_GT(ndims_x, - 0UL, - phi::errors::InvalidArgument( - "The Input(x) dims size must be greater than 0," - " but reviced dims size is 0. ")); - PADDLE_ENFORCE_GT(ndims_y, - 0UL, - phi::errors::InvalidArgument( - "The Input(y) dims size must be greater than 0," - " but reviced dims size is 0. ")); - - bool x_broadcasted = false, y_broadcasted = false; - if (ndims_x == 1) { - dims_x.insert(dims_x.begin(), 1); - ndims_x = 2; - x_broadcasted = true; - } - - if (ndims_y == 1) { - dims_y.push_back(1); - ndims_y = 2; - y_broadcasted = true; - } - - size_t M, N; - if (trans_x) { - M = dims_x[ndims_x - 1]; - } else { - M = dims_x[ndims_x - 2]; - } - if (trans_y) { - N = dims_y[ndims_y - 2]; - } else { - N = dims_y[ndims_y - 1]; - } - - std::vector new_dims; - if (ndims_x > ndims_y) { - new_dims.assign(dims_x.begin(), dims_x.end() - 2); - } else if (ndims_x < ndims_y) { - new_dims.assign(dims_y.begin(), dims_y.end() - 2); - } else { - new_dims.reserve(ndims_x); - for (size_t i = 0; i < ndims_x - 2; ++i) { - new_dims.push_back(std::max(dims_x[i], dims_y[i])); - } - } - if (!x_broadcasted) { - new_dims.push_back(M); - } - if (!y_broadcasted) { - new_dims.push_back(N); - } - if (x_broadcasted && y_broadcasted) { - new_dims.push_back(1); - } - - auto ddim_out = phi::make_ddim(new_dims); - - out->set_dims(ddim_out); - out->set_dtype(x.dtype()); - out->set_layout(x.layout()); -} - -void ElementwiseInferMeta(const MetaTensor& x, - const MetaTensor& y, - MetaTensor* out) { - return ElementwiseRawInferMeta(x, y, -1, std::move(out)); -} +void ElementwiseInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out) { + return ElementwiseRawInferMeta(x, y, -1, std::move(out)); +} void ElementwiseRawInferMeta(const MetaTensor& x, const MetaTensor& y, @@ -223,383 +365,19 @@ void ElementwiseRawInferMeta(const MetaTensor& x, funcs::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - auto out_dims = phi::make_ddim(out_dims_array); - out->set_dims(out_dims); - } else { - out->set_dims(x.dims()); - } - - out->set_dtype(x.dtype()); - out->set_layout(x.layout()); - out->share_lod(x); -} - -void HuberLossInferMeta(const MetaTensor& input, - const 
MetaTensor& label, - float delta, - MetaTensor* out, - MetaTensor* residual, - MetaConfig config) { - auto input_dims = input.dims(); - auto label_dims = label.dims(); - - PADDLE_ENFORCE_EQ(input_dims.size(), - label_dims.size(), - phi::errors::InvalidArgument( - "Input(input) rank and Input(label) rank should be " - "same, but received input rank(%d) != label rank(%d)", - input_dims.size(), - label_dims.size())); - - bool contain_unknown_dim = phi::contain_unknown_dim(input_dims) || - phi::contain_unknown_dim(label_dims); - if (config.is_runtime || !contain_unknown_dim) { - PADDLE_ENFORCE_EQ( - input_dims, - label_dims, - phi::errors::InvalidArgument( - "The Input(input) and Input(label) should have the same " - "shape, but received input shape [%s] != label shape [%s]", - input_dims, - label_dims)); - } - - auto out_dims = label_dims; - residual->set_dims(out_dims); - out->set_dims(out_dims); - out->share_lod(input); -} - -void CholeskySolveInferMeta(const MetaTensor& x, - const MetaTensor& y, - bool upper, - MetaTensor* out) { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - - auto x_dims_n = x_dims.size(); - auto y_dims_n = y_dims.size(); - - PADDLE_ENFORCE_GE(x_dims_n, - 2, - phi::errors::InvalidArgument( - "the rank of input Y must greater or equal to 2")); - PADDLE_ENFORCE_GE(y_dims_n, - 2, - phi::errors::InvalidArgument( - "the rank of input X must greater or equal to 2")); - PADDLE_ENFORCE_EQ( - y_dims[y_dims_n - 1], - y_dims[y_dims_n - 2], - phi::errors::InvalidArgument("input Matrix Y should be square matrix," - "But Got last shape of %ld x %ld", - y_dims[y_dims_n - 1], - y_dims[y_dims_n - 2])); - PADDLE_ENFORCE_EQ( - x_dims[x_dims_n - 2], - y_dims[y_dims_n - 2], - phi::errors::InvalidArgument("the first dim of Matrix X must be equal to " - "the fisrt dim of Matrix Y," - "But Got %ld and %ld", - x_dims[x_dims_n - 2], - y_dims[y_dims_n - 2])); - - std::vector x_dims_vec = phi::vectorize(x_dims); - std::vector y_dims_vec = phi::vectorize(y_dims); - - std::vector x_dims_vec_cut(x_dims_vec.begin(), x_dims_vec.end() - 2); - std::vector y_dims_vec_cut(y_dims_vec.begin(), y_dims_vec.end() - 2); - - std::vector expand_batch_portion = - funcs::MatrixGetBroadcastBatchPortion(x_dims_vec_cut, y_dims_vec_cut); - - std::vector x_broadcast_dims({expand_batch_portion}); - x_broadcast_dims.insert(x_broadcast_dims.end(), - {x_dims_vec[x_dims_n - 2], x_dims_vec[x_dims_n - 1]}); - - // dim of 'out' is the same with 'X' after broadcast - out->set_dims(phi::make_ddim(x_broadcast_dims)); - out->set_dtype(x.dtype()); - out->set_layout(x.layout()); - out->share_lod(x); -} - -void TriangularSolveInferMeta(const MetaTensor& x, - const MetaTensor& y, - bool upper, - bool transpose, - bool unitriangular, - MetaTensor* out) { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - - auto x_dims_n = x_dims.size(); - auto y_dims_n = y_dims.size(); - - PADDLE_ENFORCE_GE(x_dims_n, - 2, - phi::errors::InvalidArgument( - "The input tensor X's dimensions of TriangularSolveOp " - "should be >= 2. But received X's " - "dimensions = %d, X's shape = [%s]", - x_dims.size(), - x_dims)); - - PADDLE_ENFORCE_GE(y_dims_n, - 2, - phi::errors::InvalidArgument( - "The input tensor Y's dimensions of TriangularSolveOp " - "should be >=2. 
But received Y's " - "dimensions = %d, Y's shape = [%s]", - y_dims.size(), - y_dims)); - - PADDLE_ENFORCE_EQ(x_dims[x_dims_n - 2], - x_dims[x_dims_n - 1], - phi::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) all should " - "be square matrices " - "But received X's shape[-2] = %d and shape[-1] = %d.", - x_dims[x_dims_n - 2], - x_dims[x_dims_n - 1])); - - std::vector x_dims_vec = phi::vectorize(x_dims); - std::vector y_dims_vec = phi::vectorize(y_dims); - - std::vector x_dims_vec_cut(x_dims_vec.begin(), x_dims_vec.end() - 2); - std::vector y_dims_vec_cut(y_dims_vec.begin(), y_dims_vec.end() - 2); - - std::vector expand_batch_portion = - funcs::MatrixGetBroadcastBatchPortion(x_dims_vec_cut, y_dims_vec_cut); - - std::vector y_broadcast_dims({expand_batch_portion}); - y_broadcast_dims.insert(y_broadcast_dims.end(), - {y_dims_vec[y_dims_n - 2], y_dims_vec[y_dims_n - 1]}); - - // dim of 'out' is the same with 'Y' after broadcast - out->set_dims(phi::make_ddim(y_broadcast_dims)); - out->set_dtype(y.dtype()); - out->set_layout(y.layout()); - out->share_lod(y); -} - -void IndexSampleInferMeta(const MetaTensor& x, - const MetaTensor& y, - MetaTensor* out, - MetaConfig config) { - auto input_dims = x.dims(); - PADDLE_ENFORCE_EQ(input_dims.size(), - 2, - errors::InvalidArgument( - "Inputs(X) shape of IndexSample op should be 2-D, but " - "got X's shape = [%s], please check X shape.", - input_dims)); - - auto index_dims = y.dims(); - PADDLE_ENFORCE_EQ( - index_dims.size(), - 2, - errors::InvalidArgument( - "Inputs(Index) shape of IndexSample op should be 2-D, but " - "got Index's shape [%s] , please check index shape.", - input_dims)); - if (config.is_runtime) { - PADDLE_ENFORCE_EQ(input_dims[0], - index_dims[0], - errors::InvalidArgument( - "Inputs(X)'s value of dimension 0 must same with " - "Inputs(Index)'s value of dimension 0, but " - "got %d of Inputs(X), and got %d of Inputs(Index), " - "please check Inputs shape.", - input_dims[0], - index_dims[0])); - } - out->set_dtype(x.dtype()); - out->set_dims(index_dims); - out->share_lod(y); -} -void CrossInferMeta(const MetaTensor& x, - const MetaTensor& y, - int axis, - MetaTensor* out) { - auto x_dim = x.dims(); - auto y_dim = y.dims(); - auto dim = axis; - - bool dims_match = phi::funcs::CheckDims(x_dim, y_dim); - PADDLE_ENFORCE_EQ( - dims_match, - true, - phi::errors::InvalidArgument("The 'shape' of Input(X) should be equal to " - "the 'shape' of Input(Y). But received " - "Input(X).dimensions = [%s], " - "Input(Y).dimensions = [%s]", - x_dim, - y_dim)); - - if (dim != DDim::kMaxRank) { - PADDLE_ENFORCE_EQ( - dim < x_dim.size() && dim >= (0 - x_dim.size()), - true, - phi::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - x_dim.size(), - x_dim.size() - 1, - dim)); - if (dim < 0) { - dim += x_dim.size(); - } - PADDLE_ENFORCE_EQ(x_dim[dim] == 3 && y_dim[dim] == 3, - true, - phi::errors::InvalidArgument( - "Input(X/Y).dims()[dim] should be equal to 3." 
- "But received Input(X/Y).dims()[dim] = %d.", - x_dim[dim])); - } - out->set_dims(x_dim); - out->set_dtype(x.dtype()); - out->set_layout(x.layout()); - out->share_lod(x); -} - -void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { - out->share_meta(x); -} - -void SegmentPoolInferMeta(const MetaTensor& x, - const MetaTensor& segment_ids, - const std::string& pooltype, - MetaTensor* out, - MetaTensor* summed_ids, - MetaConfig config) { - auto dims = x.dims(); - dims[0] = -1; - out->set_dims(dims); - out->set_dtype(x.dtype()); - out->set_layout(x.layout()); - - if (pooltype == "MEAN") { - summed_ids->set_dims({-1, 1}); - summed_ids->set_dtype(x.dtype()); - summed_ids->set_layout(x.layout()); - } -} - -void BCELossInferMeta(const MetaTensor& input, - const MetaTensor& label, - MetaTensor* out, - MetaConfig config) { - auto input_dims = input.dims(); - auto label_dims = label.dims(); - - int rank = input_dims.size(); - PADDLE_ENFORCE_EQ(rank, - label_dims.size(), - phi::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same rank." - "But received: the rank of Input(X) is [%d], " - "the rank of Input(Label) is [%d].", - rank, - label_dims.size())); - - bool check = true; - if ((!config.is_runtime) && - (phi::product(input_dims) <= 0 || phi::product(label_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ(input_dims, - label_dims, - phi::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same " - "shape. But received: the shape of Input(X) is " - "[%s], the shape of Input(Label) is [%s].", - input_dims, - label_dims)); - } - - out->set_dims(input_dims); - out->set_dtype(input.dtype()); - out->share_lod(input); -} - -void BincountInferMeta(const MetaTensor& x, - const paddle::optional weights, - int minlength, - MetaTensor* out) { - auto input_dim = x.dims(); - - PADDLE_ENFORCE_GE(minlength, - 0, - phi::errors::InvalidArgument( - "The minlength should be greater than or equal to 0." - "But received minlength is %d", - minlength)); - - PADDLE_ENFORCE_EQ( - input_dim.size(), - 1, - phi::errors::InvalidArgument("The 'shape' of Input(X) must be 1-D tensor." - "But the dimension of Input(X) is [%d]", - input_dim.size())); - - if (weights.is_initialized()) { - auto weights_dim = weights->dims(); - PADDLE_ENFORCE_EQ(weights_dim.size(), - 1, - phi::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be 1-D tensor." - "But the dimension of Input(Weights) is [%d]", - weights_dim.size())); - - PADDLE_ENFORCE_EQ( - weights_dim[0], - input_dim[0], - phi::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be equal to the 'shape' of " - "Input(X)." - "But received: the 'shape' of Input(Weights) is [%s]," - "the 'shape' of Input(X) is [%s]", - weights_dim, - input_dim)); - } - out->set_dims(phi::make_ddim({-1})); - if (weights.is_initialized()) { - out->set_dtype(weights->dtype()); + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + auto out_dims = phi::make_ddim(out_dims_array); + out->set_dims(out_dims); } else { - out->set_dtype(x.dtype()); + out->set_dims(x.dims()); } - out->share_lod(x); -} - -void DistInferMeta(const MetaTensor& x, - const MetaTensor& y, - float p, - MetaTensor* out) { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - - PADDLE_ENFORCE_NE(phi::product(x_dims), - 0, - phi::errors::InvalidArgument( - "The Input(X) has not been initialized properly. 
The " - "shape of Input(X) = [%s].", - x_dims)); - PADDLE_ENFORCE_NE(phi::product(y_dims), - 0, - phi::errors::InvalidArgument( - "The Input(Y) has not been initialized properly. The " - "shape of Input(Y) = [%s].", - y_dims)); - out->set_dims({1}); out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); } void GatherNdInferMeta(const MetaTensor& x, @@ -648,6 +426,78 @@ void GatherTreeMeta(const MetaTensor& ids, out->set_dims(ids_dims); } +void HuberLossInferMeta(const MetaTensor& input, + const MetaTensor& label, + float delta, + MetaTensor* out, + MetaTensor* residual, + MetaConfig config) { + auto input_dims = input.dims(); + auto label_dims = label.dims(); + + PADDLE_ENFORCE_EQ(input_dims.size(), + label_dims.size(), + phi::errors::InvalidArgument( + "Input(input) rank and Input(label) rank should be " + "same, but received input rank(%d) != label rank(%d)", + input_dims.size(), + label_dims.size())); + + bool contain_unknown_dim = phi::contain_unknown_dim(input_dims) || + phi::contain_unknown_dim(label_dims); + if (config.is_runtime || !contain_unknown_dim) { + PADDLE_ENFORCE_EQ( + input_dims, + label_dims, + phi::errors::InvalidArgument( + "The Input(input) and Input(label) should have the same " + "shape, but received input shape [%s] != label shape [%s]", + input_dims, + label_dims)); + } + + auto out_dims = label_dims; + residual->set_dims(out_dims); + out->set_dims(out_dims); + out->share_lod(input); +} + +void IndexSampleInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config) { + auto input_dims = x.dims(); + PADDLE_ENFORCE_EQ(input_dims.size(), + 2, + errors::InvalidArgument( + "Inputs(X) shape of IndexSample op should be 2-D, but " + "got X's shape = [%s], please check X shape.", + input_dims)); + + auto index_dims = y.dims(); + PADDLE_ENFORCE_EQ( + index_dims.size(), + 2, + errors::InvalidArgument( + "Inputs(Index) shape of IndexSample op should be 2-D, but " + "got Index's shape [%s] , please check index shape.", + input_dims)); + if (config.is_runtime) { + PADDLE_ENFORCE_EQ(input_dims[0], + index_dims[0], + errors::InvalidArgument( + "Inputs(X)'s value of dimension 0 must same with " + "Inputs(Index)'s value of dimension 0, but " + "got %d of Inputs(X), and got %d of Inputs(Index), " + "please check Inputs shape.", + input_dims[0], + index_dims[0])); + } + out->set_dtype(x.dtype()); + out->set_dims(index_dims); + out->share_lod(y); +} + void LogLossInferMeta(const MetaTensor& input, const MetaTensor& label, float epsilon, @@ -690,6 +540,79 @@ void LogLossInferMeta(const MetaTensor& input, out->share_lod(input); } +void MatmulInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool trans_x, + bool trans_y, + MetaTensor* out) { + std::vector dims_x = phi::vectorize(x.dims()); + std::vector dims_y = phi::vectorize(y.dims()); + auto ndims_x = dims_x.size(); + auto ndims_y = dims_y.size(); + PADDLE_ENFORCE_GT(ndims_x, + 0UL, + phi::errors::InvalidArgument( + "The Input(x) dims size must be greater than 0," + " but reviced dims size is 0. ")); + PADDLE_ENFORCE_GT(ndims_y, + 0UL, + phi::errors::InvalidArgument( + "The Input(y) dims size must be greater than 0," + " but reviced dims size is 0. 
")); + + bool x_broadcasted = false, y_broadcasted = false; + if (ndims_x == 1) { + dims_x.insert(dims_x.begin(), 1); + ndims_x = 2; + x_broadcasted = true; + } + + if (ndims_y == 1) { + dims_y.push_back(1); + ndims_y = 2; + y_broadcasted = true; + } + + size_t M, N; + if (trans_x) { + M = dims_x[ndims_x - 1]; + } else { + M = dims_x[ndims_x - 2]; + } + if (trans_y) { + N = dims_y[ndims_y - 2]; + } else { + N = dims_y[ndims_y - 1]; + } + + std::vector new_dims; + if (ndims_x > ndims_y) { + new_dims.assign(dims_x.begin(), dims_x.end() - 2); + } else if (ndims_x < ndims_y) { + new_dims.assign(dims_y.begin(), dims_y.end() - 2); + } else { + new_dims.reserve(ndims_x); + for (size_t i = 0; i < ndims_x - 2; ++i) { + new_dims.push_back(std::max(dims_x[i], dims_y[i])); + } + } + if (!x_broadcasted) { + new_dims.push_back(M); + } + if (!y_broadcasted) { + new_dims.push_back(N); + } + if (x_broadcasted && y_broadcasted) { + new_dims.push_back(1); + } + + auto ddim_out = phi::make_ddim(new_dims); + + out->set_dims(ddim_out); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); +} + void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { auto dim_x = x.dims(); auto dim_vec = vec.dims(); @@ -720,6 +643,25 @@ void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { out->share_lod(x); } +void SegmentPoolInferMeta(const MetaTensor& x, + const MetaTensor& segment_ids, + const std::string& pooltype, + MetaTensor* out, + MetaTensor* summed_ids, + MetaConfig config) { + auto dims = x.dims(); + dims[0] = -1; + out->set_dims(dims); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + + if (pooltype == "MEAN") { + summed_ids->set_dims({-1, 1}); + summed_ids->set_dtype(x.dtype()); + summed_ids->set_layout(x.layout()); + } +} + void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x, const MetaTensor& label, bool normalize, @@ -761,4 +703,63 @@ void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x, out->share_lod(x); } +void TriangularSolveInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool upper, + bool transpose, + bool unitriangular, + MetaTensor* out) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + auto x_dims_n = x_dims.size(); + auto y_dims_n = y_dims.size(); + + PADDLE_ENFORCE_GE(x_dims_n, + 2, + phi::errors::InvalidArgument( + "The input tensor X's dimensions of TriangularSolveOp " + "should be >= 2. But received X's " + "dimensions = %d, X's shape = [%s]", + x_dims.size(), + x_dims)); + + PADDLE_ENFORCE_GE(y_dims_n, + 2, + phi::errors::InvalidArgument( + "The input tensor Y's dimensions of TriangularSolveOp " + "should be >=2. 
But received Y's " + "dimensions = %d, Y's shape = [%s]", + y_dims.size(), + y_dims)); + + PADDLE_ENFORCE_EQ(x_dims[x_dims_n - 2], + x_dims[x_dims_n - 1], + phi::errors::InvalidArgument( + "The inner-most 2 dimensions of Input(X) all should " + "be square matrices " + "But received X's shape[-2] = %d and shape[-1] = %d.", + x_dims[x_dims_n - 2], + x_dims[x_dims_n - 1])); + + std::vector x_dims_vec = phi::vectorize(x_dims); + std::vector y_dims_vec = phi::vectorize(y_dims); + + std::vector x_dims_vec_cut(x_dims_vec.begin(), x_dims_vec.end() - 2); + std::vector y_dims_vec_cut(y_dims_vec.begin(), y_dims_vec.end() - 2); + + std::vector expand_batch_portion = + funcs::MatrixGetBroadcastBatchPortion(x_dims_vec_cut, y_dims_vec_cut); + + std::vector y_broadcast_dims({expand_batch_portion}); + y_broadcast_dims.insert(y_broadcast_dims.end(), + {y_dims_vec[y_dims_n - 2], y_dims_vec[y_dims_n - 1]}); + + // dim of 'out' is the same with 'Y' after broadcast + out->set_dims(phi::make_ddim(y_broadcast_dims)); + out->set_dtype(y.dtype()); + out->set_layout(y.layout()); + out->share_lod(y); +} + } // namespace phi diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index d2b16e557b0..307ecc29cac 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -29,22 +29,43 @@ namespace phi { // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. +void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); + +void BCELossInferMeta(const MetaTensor& input, + const MetaTensor& label, + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void BincountInferMeta(const MetaTensor& x, + const paddle::optional weights, + int minlength, + MetaTensor* out); + +void CholeskySolveInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool upper, + MetaTensor* out); + +void CompareAllInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out); + void CompareInferMeta(const MetaTensor& x, const MetaTensor& y, int axis, MetaTensor* out); -void CompareAllInferMeta(const MetaTensor& x, - const MetaTensor& y, - MetaTensor* out); +void CrossInferMeta(const MetaTensor& x, + const MetaTensor& y, + int axis, + MetaTensor* out); -void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); +void DistInferMeta(const MetaTensor& x, + const MetaTensor& y, + float p, + MetaTensor* out); -void MatmulInferMeta(const MetaTensor& x, - const MetaTensor& y, - bool trans_x, - bool trans_y, - MetaTensor* out); +void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); void ElementwiseInferMeta(const MetaTensor& x, const MetaTensor& y, @@ -55,6 +76,14 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta, int axis, MetaTensor* out); +void GatherNdInferMeta(const MetaTensor& x, + const MetaTensor& index, + MetaTensor* out); + +void GatherTreeMeta(const MetaTensor& ids, + const MetaTensor& parents, + MetaTensor* out); + void HuberLossInferMeta(const MetaTensor& input_meta, const MetaTensor& label_meta, float delta, @@ -62,29 +91,24 @@ void HuberLossInferMeta(const MetaTensor& input_meta, MetaTensor* residual, MetaConfig config = MetaConfig()); -void CholeskySolveInferMeta(const MetaTensor& x, - const MetaTensor& y, - bool upper, - MetaTensor* out); - -void TriangularSolveInferMeta(const MetaTensor& x, - const MetaTensor& y, - bool upper, - bool transpose, - bool unitriangular, - MetaTensor* out); - void IndexSampleInferMeta(const MetaTensor& x, const MetaTensor& y, 
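For reference, a minimal standalone sketch of the output-shape rule that MatmulInferMeta above implements, using plain std::vector instead of phi types (the helper name and example shapes are illustrative, not part of the patch):

#include <algorithm>
#include <cstdint>
#include <vector>

// Mirrors MatmulInferMeta: 1-D operands are promoted to matrices and the
// promoted dim is squeezed from the result; batch dims are broadcast
// pairwise only when both operands have the same rank, otherwise the
// longer operand's batch prefix is kept as-is.
std::vector<int64_t> MatmulOutShape(std::vector<int64_t> x,
                                    std::vector<int64_t> y,
                                    bool trans_x, bool trans_y) {
  const bool x_vec = x.size() == 1, y_vec = y.size() == 1;
  if (x_vec) x.insert(x.begin(), 1);  // [K] -> [1, K]
  if (y_vec) y.push_back(1);          // [K] -> [K, 1]
  const int64_t M = trans_x ? x.back() : x[x.size() - 2];
  const int64_t N = trans_y ? y[y.size() - 2] : y.back();
  std::vector<int64_t> out;
  if (x.size() > y.size()) {
    out.assign(x.begin(), x.end() - 2);
  } else if (x.size() < y.size()) {
    out.assign(y.begin(), y.end() - 2);
  } else {
    for (size_t i = 0; i + 2 < x.size(); ++i)
      out.push_back(std::max(x[i], y[i]));
  }
  if (!x_vec) out.push_back(M);
  if (!y_vec) out.push_back(N);
  if (x_vec && y_vec) out.push_back(1);
  return out;
}

// e.g. MatmulOutShape({2, 1, 3, 4}, {1, 5, 4, 6}, false, false) -> {2, 5, 3, 6}
//      MatmulOutShape({4}, {4, 6}, false, false)                -> {6}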
MetaTensor* out, MetaConfig config = MetaConfig()); -void CrossInferMeta(const MetaTensor& x, - const MetaTensor& y, - int axis, - MetaTensor* out); +void LogLossInferMeta(const MetaTensor& input, + const MetaTensor& label, + float epsilon, + MetaTensor* out, + MetaConfig config = MetaConfig()); -void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); +void MatmulInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool trans_x, + bool trans_y, + MetaTensor* out); + +void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out); void SegmentPoolInferMeta(const MetaTensor& x, const MetaTensor& segment_ids, @@ -93,37 +117,6 @@ void SegmentPoolInferMeta(const MetaTensor& x, MetaTensor* summed_ids, MetaConfig config = MetaConfig()); -void BCELossInferMeta(const MetaTensor& input, - const MetaTensor& label, - MetaTensor* out, - MetaConfig config = MetaConfig()); - -void BincountInferMeta(const MetaTensor& x, - const paddle::optional weights, - int minlength, - MetaTensor* out); - -void DistInferMeta(const MetaTensor& x, - const MetaTensor& y, - float p, - MetaTensor* out); - -void GatherNdInferMeta(const MetaTensor& x, - const MetaTensor& index, - MetaTensor* out); - -void GatherTreeMeta(const MetaTensor& ids, - const MetaTensor& parents, - MetaTensor* out); - -void LogLossInferMeta(const MetaTensor& input, - const MetaTensor& label, - float epsilon, - MetaTensor* out, - MetaConfig config = MetaConfig()); - -void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out); - void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x, const MetaTensor& label, bool normalize, @@ -131,4 +124,11 @@ void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void TriangularSolveInferMeta(const MetaTensor& x, + const MetaTensor& y, + bool upper, + bool transpose, + bool unitriangular, + MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/infermeta/nullary.cc b/paddle/phi/infermeta/nullary.cc index 506d3fd14ea..081084567e8 100644 --- a/paddle/phi/infermeta/nullary.cc +++ b/paddle/phi/infermeta/nullary.cc @@ -16,6 +16,12 @@ limitations under the License. 
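One point worth keeping in mind about TriangularSolveInferMeta above: the output keeps Y's trailing matrix dimensions, and only the leading batch dimensions of X and Y are broadcast (via funcs::MatrixGetBroadcastBatchPortion, assumed here to follow the usual right-aligned broadcast rule). With illustrative shapes x = [5, 1, 3, 3] and y = [4, 3, 2], the batch portions [5, 1] and [4] broadcast to [5, 4], so out is set to [5, 4, 3, 2] and takes Y's dtype, layout and LoD.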
*/ namespace phi { +void CreateInferMeta(const ScalarArray& shape, + DataType dtype, + MetaTensor* out) { + CreateInferMetaBase(shape.GetData(), dtype, DataLayout::NCHW, out); +} + void CreateInferMetaBase(const std::vector& shape, DataType dtype, DataLayout layout, @@ -26,12 +32,6 @@ void CreateInferMetaBase(const std::vector& shape, out->set_layout(layout); } -void CreateInferMeta(const ScalarArray& shape, - DataType dtype, - MetaTensor* out) { - CreateInferMetaBase(shape.GetData(), dtype, DataLayout::NCHW, out); -} - void EyeInferMeta(int64_t num_rows, int64_t num_columns, DataType dtype, @@ -41,18 +41,6 @@ void EyeInferMeta(int64_t num_rows, out->set_dtype(dtype); } -void TruncatedGaussianRandomInferMeta(const std::vector& shape, - float mean, - float std, - int seed, - DataType dtype, - MetaTensor* out) { - auto out_dims = phi::make_ddim(shape); - out->set_dims(out_dims); - out->set_dtype(dtype); - out->set_layout(DataLayout::NCHW); -} - void GaussianRandomInferMeta(const ScalarArray& shape, float mean, float std, @@ -65,4 +53,16 @@ void GaussianRandomInferMeta(const ScalarArray& shape, out->set_layout(DataLayout::NCHW); } +void TruncatedGaussianRandomInferMeta(const std::vector& shape, + float mean, + float std, + int seed, + DataType dtype, + MetaTensor* out) { + auto out_dims = phi::make_ddim(shape); + out->set_dims(out_dims); + out->set_dtype(dtype); + out->set_layout(DataLayout::NCHW); +} + } // namespace phi diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h index bd0567486e4..38eaa636f8c 100644 --- a/paddle/phi/infermeta/nullary.h +++ b/paddle/phi/infermeta/nullary.h @@ -28,25 +28,18 @@ namespace phi { // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. +void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out); + void CreateInferMetaBase(const std::vector& shape, DataType dtype, DataLayout layout, MetaTensor* out); -void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out); - void EyeInferMeta(int64_t num_rows, int64_t num_columns, DataType dtype, MetaTensor* out); -void TruncatedGaussianRandomInferMeta(const std::vector& shape, - float mean, - float std, - int seed, - DataType dtype, - MetaTensor* out); - void GaussianRandomInferMeta(const ScalarArray& shape, float mean, float std, @@ -54,4 +47,11 @@ void GaussianRandomInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out); +void TruncatedGaussianRandomInferMeta(const std::vector& shape, + float mean, + float std, + int seed, + DataType dtype, + MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 88ac2cb0f8d..235cfe368c1 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -18,6 +18,58 @@ limitations under the License. */ namespace phi { +void AccuracyInferMeta(const MetaTensor& out, + const MetaTensor& indice, + const MetaTensor& label, + MetaTensor* accuracy, + MetaTensor* correct, + MetaTensor* total, + MetaConfig config) { + auto inference_dim = out.dims(); + auto label_dim = label.dims(); + // Assume indices has same shape as inference, because + // it's the output of topk. + PADDLE_ENFORCE_EQ( + label_dim.size(), + 2, + phi::errors::InvalidArgument( + "ShapeError: label's dimensions of AccuracyOp must be 2. 
" + "But received label's dimensions = %d, label's shape = [%s]", + label_dim.size(), + label_dim)); + if (config.is_runtime) { + PADDLE_ENFORCE_EQ(label_dim[1], + 1, + phi::errors::InvalidArgument( + "ShapeError: label's second dimension of " + "AccuracyOp must be 1. But received label's " + "second dimension is = %d, label's shape = [%s]", + label_dim[1], + label_dim)); + PADDLE_ENFORCE_EQ( + inference_dim[0], + label_dim[0], + phi::errors::InvalidArgument( + "ShapeError: the output's num_rows of AccuracyOp must be" + " the same as label's num_rows. But received output's " + "shape = [%s], label's shape = [%s], output's num_rows = %d, " + "label's " + "num_rows = %d", + inference_dim, + label_dim, + inference_dim[0], + label_dim[0])); + } + + accuracy->set_dims({1}); + accuracy->set_dtype(out.dtype()); + correct->set_dims({1}); + correct->set_dtype(out.dtype()); + total->set_dims({1}); + total->set_dtype(out.dtype()); + accuracy->share_lod(out); +} + void AddmmInferMeta(const MetaTensor& input, const MetaTensor& x, const MetaTensor& y, @@ -89,6 +141,107 @@ void AddmmInferMeta(const MetaTensor& input, out->set_dtype(input.dtype()); } +void GraphSendRecvInferMeta(const MetaTensor& x, + const MetaTensor& src_index, + const MetaTensor& dst_index, + const std::string& pool_type, + MetaTensor* out, + MetaTensor* dst_count) { + auto src_index_dims = src_index.dims(); + if (src_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(src_index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of Src_index should be 1 when it " + "is 2D, but we get %d", + src_index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + src_index_dims.size(), + 1, + phi::errors::InvalidArgument( + "The Src_index should be 1D, when it is not 2D, but we get %d", + src_index_dims.size())); + } + + auto dst_index_dims = dst_index.dims(); + if (dst_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(dst_index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of Dst_index should be 1 when it " + "is 2D, but we get %d", + dst_index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + dst_index_dims.size(), + 1, + phi::errors::InvalidArgument("The Dst_index should be 1D, " + "when it is not 2D, but we get %d", + dst_index_dims.size())); + } + + PADDLE_ENFORCE_EQ(src_index_dims[0], + dst_index_dims[0], + phi::errors::InvalidArgument( + "Src_index and Dst_index should have the same shape.")); + + auto dims = x.dims(); + out->set_dims(dims); + out->set_dtype(x.dtype()); + + if (pool_type == "MEAN") { + dst_count->set_dims({dims[0]}); + dst_count->set_dtype(DataType::INT32); + } +} + +void LerpInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + MetaTensor* out) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + auto w_dims = weight.dims(); + DDim out_dims; + out_dims = funcs::GetOutputDims(x_dims, y_dims); + if (w_dims.size() > 1 || w_dims[0] != 1) { + out_dims = funcs::GetOutputDims(out_dims, w_dims); + } + out->set_dims(out_dims); + out->set_dtype(x.dtype()); + out->share_lod(x); +} + +void LinspaceInferMeta(const MetaTensor& start, + const MetaTensor& stop, + const MetaTensor& number, + MetaTensor* out) { + auto s_dims = start.dims(); + PADDLE_ENFORCE_EQ( + (s_dims.size() == 1) && (s_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Start) must be [1]," + "but received input shape is [%s].", + s_dims)); + auto e_dims = stop.dims(); + PADDLE_ENFORCE_EQ( + (e_dims.size() == 1) && (e_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Stop) must be [1]," 
+ "but received input shape is [%s].", + e_dims)); + auto step_dims = number.dims(); + PADDLE_ENFORCE_EQ( + (step_dims.size() == 1) && (step_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Num) must be [1]," + "but received input shape is [%s].", + step_dims)); + out->set_dims(phi::make_ddim({-1})); + out->set_dtype(start.dtype()); +} + void NllLossRawInferMeta(const MetaTensor& input, const MetaTensor& label, paddle::optional weight, @@ -319,156 +472,4 @@ void ViterbiDecodeInferMeta(const MetaTensor& input, scores->set_dtype(length.dtype()); } -void LerpInferMeta(const MetaTensor& x, - const MetaTensor& y, - const MetaTensor& weight, - MetaTensor* out) { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - auto w_dims = weight.dims(); - DDim out_dims; - out_dims = funcs::GetOutputDims(x_dims, y_dims); - if (w_dims.size() > 1 || w_dims[0] != 1) { - out_dims = funcs::GetOutputDims(out_dims, w_dims); - } - out->set_dims(out_dims); - out->set_dtype(x.dtype()); - out->share_lod(x); -} - -void LinspaceInferMeta(const MetaTensor& start, - const MetaTensor& stop, - const MetaTensor& number, - MetaTensor* out) { - auto s_dims = start.dims(); - PADDLE_ENFORCE_EQ( - (s_dims.size() == 1) && (s_dims[0] == 1), - true, - phi::errors::InvalidArgument("The shape of Input(Start) must be [1]," - "but received input shape is [%s].", - s_dims)); - auto e_dims = stop.dims(); - PADDLE_ENFORCE_EQ( - (e_dims.size() == 1) && (e_dims[0] == 1), - true, - phi::errors::InvalidArgument("The shape of Input(Stop) must be [1]," - "but received input shape is [%s].", - e_dims)); - auto step_dims = number.dims(); - PADDLE_ENFORCE_EQ( - (step_dims.size() == 1) && (step_dims[0] == 1), - true, - phi::errors::InvalidArgument("The shape of Input(Num) must be [1]," - "but received input shape is [%s].", - step_dims)); - out->set_dims(phi::make_ddim({-1})); - out->set_dtype(start.dtype()); -} - -void AccuracyInferMeta(const MetaTensor& out, - const MetaTensor& indice, - const MetaTensor& label, - MetaTensor* accuracy, - MetaTensor* correct, - MetaTensor* total, - MetaConfig config) { - auto inference_dim = out.dims(); - auto label_dim = label.dims(); - // Assume indices has same shape as inference, because - // it's the output of topk. - PADDLE_ENFORCE_EQ( - label_dim.size(), - 2, - phi::errors::InvalidArgument( - "ShapeError: label's dimensions of AccuracyOp must be 2. " - "But received label's dimensions = %d, label's shape = [%s]", - label_dim.size(), - label_dim)); - if (config.is_runtime) { - PADDLE_ENFORCE_EQ(label_dim[1], - 1, - phi::errors::InvalidArgument( - "ShapeError: label's second dimension of " - "AccuracyOp must be 1. But received label's " - "second dimension is = %d, label's shape = [%s]", - label_dim[1], - label_dim)); - PADDLE_ENFORCE_EQ( - inference_dim[0], - label_dim[0], - phi::errors::InvalidArgument( - "ShapeError: the output's num_rows of AccuracyOp must be" - " the same as label's num_rows. 
But received output's " - "shape = [%s], label's shape = [%s], output's num_rows = %d, " - "label's " - "num_rows = %d", - inference_dim, - label_dim, - inference_dim[0], - label_dim[0])); - } - - accuracy->set_dims({1}); - accuracy->set_dtype(out.dtype()); - correct->set_dims({1}); - correct->set_dtype(out.dtype()); - total->set_dims({1}); - total->set_dtype(out.dtype()); - accuracy->share_lod(out); -} - -void GraphSendRecvInferMeta(const MetaTensor& x, - const MetaTensor& src_index, - const MetaTensor& dst_index, - const std::string& pool_type, - MetaTensor* out, - MetaTensor* dst_count) { - auto src_index_dims = src_index.dims(); - if (src_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(src_index_dims[1], - 1, - phi::errors::InvalidArgument( - "The last dim of Src_index should be 1 when it " - "is 2D, but we get %d", - src_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - src_index_dims.size(), - 1, - phi::errors::InvalidArgument( - "The Src_index should be 1D, when it is not 2D, but we get %d", - src_index_dims.size())); - } - - auto dst_index_dims = dst_index.dims(); - if (dst_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(dst_index_dims[1], - 1, - phi::errors::InvalidArgument( - "The last dim of Dst_index should be 1 when it " - "is 2D, but we get %d", - dst_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - dst_index_dims.size(), - 1, - phi::errors::InvalidArgument("The Dst_index should be 1D, " - "when it is not 2D, but we get %d", - dst_index_dims.size())); - } - - PADDLE_ENFORCE_EQ(src_index_dims[0], - dst_index_dims[0], - phi::errors::InvalidArgument( - "Src_index and Dst_index should have the same shape.")); - - auto dims = x.dims(); - out->set_dims(dims); - out->set_dtype(x.dtype()); - - if (pool_type == "MEAN") { - dst_count->set_dims({dims[0]}); - dst_count->set_dtype(DataType::INT32); - } -} } // namespace phi diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index c9a7e78db75..209a07db18b 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -45,16 +45,22 @@ void AddmmInferMeta(const MetaTensor& input, float beta, MetaTensor* out); -void GatherNdGradInferMeta(const MetaTensor& x, - const MetaTensor& index, - const MetaTensor& out_grad, - MetaTensor* x_grad); +void GraphSendRecvInferMeta(const MetaTensor& x, + const MetaTensor& src_index, + const MetaTensor& dst_index, + const std::string& pool_type, + MetaTensor* out, + MetaTensor* dst_count); -void ScatterInferMeta(const MetaTensor& x, - const MetaTensor& index, - const MetaTensor& updates, - bool overwrite, - MetaTensor* out); +void LerpInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + MetaTensor* out); + +void LinspaceInferMeta(const MetaTensor& start, + const MetaTensor& stop, + const MetaTensor& number, + MetaTensor* out); void NllLossRawInferMeta(const MetaTensor& input, const MetaTensor& label, @@ -65,6 +71,12 @@ void NllLossRawInferMeta(const MetaTensor& input, MetaTensor* total_weight, MetaConfig config = MetaConfig()); +void ScatterInferMeta(const MetaTensor& x, + const MetaTensor& index, + const MetaTensor& updates, + bool overwrite, + MetaTensor* out); + void ScatterNdAddInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& updates, @@ -78,20 +90,4 @@ void ViterbiDecodeInferMeta(const MetaTensor& input, MetaTensor* path, MetaConfig config = MetaConfig()); -void LerpInferMeta(const MetaTensor& x, - const MetaTensor& y, - const MetaTensor& weight, - MetaTensor* out); - -void LinspaceInferMeta(const 
MetaTensor& start, - const MetaTensor& stop, - const MetaTensor& number, - MetaTensor* out); - -void GraphSendRecvInferMeta(const MetaTensor& x, - const MetaTensor& src_index, - const MetaTensor& dst_index, - const std::string& pool_type, - MetaTensor* out, - MetaTensor* dst_count); } // namespace phi -- GitLab From 1b0cecb732d39673c120fdd919353323f165d87d Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sun, 13 Mar 2022 15:02:15 +0800 Subject: [PATCH 017/176] polish several details (#40485) --- paddle/fluid/framework/infershape_utils.cc | 4 ++-- paddle/fluid/operators/softmax_op.cc | 4 ++-- paddle/phi/core/meta_tensor.cc | 4 ++-- paddle/phi/core/meta_tensor.h | 2 +- paddle/phi/kernels/funcs/matrix_inverse.h | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 29c7f5d0ce7..f57674d5601 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -249,13 +249,13 @@ class CompatMetaTensor : public phi::MetaTensor { } void share_meta(const MetaTensor& meta_tensor) override { + share_dims(meta_tensor); set_dtype(meta_tensor.dtype()); // VarDesc doesn't contains layout, so we cannot share layout // set_layout(meta_tensor.layout()); - // special case 1: share lod of LoDTensor + // special case: share lod of LoDTensor share_lod(meta_tensor); - share_dims(meta_tensor); } private: diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index af90baf27d3..3840b99dd17 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -215,7 +215,7 @@ REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, ops::SoftmaxOpGradMaker, ops::SoftmaxOpGradMaker, ops::SoftmaxInplaceInferer, SoftmaxInferShapeFunctor); -DECLARE_INFER_SHAPE_FUNCTOR(softmax_grad, SoftmaxGradnferShapeFunctor, +DECLARE_INFER_SHAPE_FUNCTOR(softmax_grad, SoftmaxGradInferShapeFunctor, PD_INFER_META(phi::GeneralUnaryGradInferMeta)); REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad, - SoftmaxGradnferShapeFunctor); + SoftmaxGradInferShapeFunctor); diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc index eb114304f53..38a6e09a61e 100644 --- a/paddle/phi/core/meta_tensor.cc +++ b/paddle/phi/core/meta_tensor.cc @@ -110,7 +110,7 @@ void MetaTensor::share_meta(const MetaTensor& meta_tensor) { } } -TensorBase* MetaTensor::get_tensor() const { return tensor_; } +TensorBase* MetaTensor::tensor() const { return tensor_; } void MetaTensor::share_dims(const MetaTensor& meta_tensor) { bool is_dense_tensor = phi::DenseTensor::classof(tensor_); @@ -118,7 +118,7 @@ void MetaTensor::share_dims(const MetaTensor& meta_tensor) { if (is_dense_tensor || is_selected_rows) { set_dims(meta_tensor.dims()); if (is_selected_rows) { - const auto in_tensor_base = meta_tensor.get_tensor(); + const auto in_tensor_base = meta_tensor.tensor(); PADDLE_ENFORCE_EQ( phi::SelectedRows::classof(in_tensor_base), true, diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index 3971a9f7e99..79f8d1c057e 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -66,7 +66,7 @@ class MetaTensor { // Because the lod in compiletime and runtime is different, // so `LoD` cannot in public methods const LoD& lod() const; - TensorBase* get_tensor() const; + TensorBase* tensor() const; TensorBase* tensor_; }; diff --git a/paddle/phi/kernels/funcs/matrix_inverse.h 
b/paddle/phi/kernels/funcs/matrix_inverse.h index c5b04a81065..1c6756f1720 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.h +++ b/paddle/phi/kernels/funcs/matrix_inverse.h @@ -39,7 +39,7 @@ void ComputeInverseEigen(const Context& dev_ctx, int batch_size = rank > 2 ? a.numel() / (n * n) : 1; const T* a_ptr = a.data(); - T* a_inv_ptr = a_inv->mutable_data(dev_ctx.GetPlace()); + T* a_inv_ptr = dev_ctx.template Alloc(a_inv); for (int i = 0; i < batch_size; ++i) { ConstEigenMatrixMap mat(a_ptr + i * n * n, n, n); -- GitLab From afafb1c347cb2e3331a21a68fa9df9b322c56712 Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Mon, 14 Mar 2022 10:12:27 +0800 Subject: [PATCH 018/176] Refine partial_program for new run_program OP (#40355) * refine partial_program * fix code for test_mnist.py train * support quantify UT * make __fake_vars and _double_grads to lazy * fix comments --- .../eager/to_static/run_program_op_func.h | 1 + .../eager/to_static/run_program_op_node.h | 23 +++++++++--- .../fluid/pybind/custom_handwrite_op_funcs.h | 1 - paddle/fluid/pybind/eager_method.cc | 17 +++++++++ .../dygraph_to_static/function_spec.py | 2 +- .../dygraph_to_static/partial_program.py | 35 ++++++++++--------- 6 files changed, 55 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index 6f8bccd64e4..9967d8c3690 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -57,6 +57,7 @@ inline void run_program_dygraph_function( auto grad_node = std::make_shared(1, 2); grad_node->SetFwdOutNames(out_names); + grad_node->SetOut(out); // Set Attributes grad_node->SetAttrMap(attrs); // Set TensorWrappers diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index ae5d86664a3..d99624e4932 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -260,9 +260,9 @@ inline void RunProgramAPI( } VLOG(2) << "The number of sub scopes after forward: " << out_scope_vec->front()->kids().size(); - // #ifdef PADDLE_WITH_MKLDNN - // if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); - // #endif +#ifdef PADDLE_WITH_MKLDNN + if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); +#endif } inline void RunProgramGradAPI( @@ -357,7 +357,7 @@ inline void RunProgramGradAPI( details::ShareTensorsFromScope(params_grad, *global_block, &scope); // Step5. 
drop current scope - // global_inner_scope->DeleteScope(&scope); + global_inner_scope->DeleteScope(&scope); VLOG(2) << "The number of sub scopes after backward: " << global_inner_scope->kids().size(); } @@ -400,6 +400,10 @@ class GradNodeRunProgram : public egr::GradNodeBase { paddle::platform::errors::InvalidArgument( "The grads[0].size() and fwd_out_names_.size() should be equal.")); for (size_t i = 0; i < fwd_out_names_.size(); ++i) { + auto &out_grad = egr::EagerUtils::unsafe_autograd_meta(*out_[i])->Grad(); + const_cast(out_grad).set_impl( + grads[0][i].impl()); + const_cast(grads[0][i]) .set_name(fwd_out_names_[i] + "@GRAD"); } @@ -432,6 +436,10 @@ class GradNodeRunProgram : public egr::GradNodeBase { fwd_out_names_ = out_names; } + void SetOut(const std::vector &out) { + out_ = out; + } + protected: void ConstructGradTensors( const std::vector &fwd_tensors, @@ -440,7 +448,11 @@ class GradNodeRunProgram : public egr::GradNodeBase { // such as: name, tensor type(DenseTensor or SelectedRows). VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); for (auto &fwd_t : fwd_tensors) { - grad_tensors->emplace_back(fwd_t.impl()); + if (phi::DenseTensor::classof(fwd_t.impl().get())) { + grad_tensors->emplace_back(std::make_shared()); + } else if (phi::SelectedRows::classof(fwd_t.impl().get())) { + grad_tensors->emplace_back(std::make_shared()); + } auto &grad_t = grad_tensors->back(); grad_t.set_name(fwd_t.name() + "@GRAD"); } @@ -462,6 +474,7 @@ class GradNodeRunProgram : public egr::GradNodeBase { std::vector step_scope_; std::vector fwd_out_names_; + std::vector out_; // Attribute Map paddle::framework::AttributeMap attrs_; diff --git a/paddle/fluid/pybind/custom_handwrite_op_funcs.h b/paddle/fluid/pybind/custom_handwrite_op_funcs.h index 7a276df0d5b..3b898ce77ce 100644 --- a/paddle/fluid/pybind/custom_handwrite_op_funcs.h +++ b/paddle/fluid/pybind/custom_handwrite_op_funcs.h @@ -31,7 +31,6 @@ static PyObject *eager_api_run_program(PyObject *self, PyObject *args, tstate = PyEval_SaveThread(); run_program_dygraph_function(X, Params, Out, OutScope, DOut, attrs); - std::cout << "end run_program_dygraph_function" << std::endl; PyEval_RestoreThread(tstate); tstate = nullptr; } catch (...) 
{ diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index e5f22338dc6..b0dbce34d34 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -688,6 +688,21 @@ static PyObject* tensor_register_reduce_hook(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* set_grad_type(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto var_type = pybind::CastPyArg2ProtoType(PyTuple_GET_ITEM(args, 0), 0); + auto grad_tensor = + egr::EagerUtils::unsafe_autograd_meta(self->tensor)->Grad(); + if (var_type == framework::proto::VarType::LOD_TENSOR) { + grad_tensor.set_impl(std::make_shared()); + } else if (var_type == framework::proto::VarType::SELECTED_ROWS) { + grad_tensor.set_impl(std::make_shared()); + } + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_methods[] = { {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -734,6 +749,8 @@ PyMethodDef variable_methods[] = { {"_register_backward_hook", (PyCFunction)(void (*)(void))tensor_register_reduce_hook, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_set_grad_type", (PyCFunction)(void (*)(void))set_grad_type, + METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; } // namespace pybind diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py index 30012fb8666..900541459f6 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py @@ -104,7 +104,7 @@ class FunctionSpec(object): if isinstance(input_var, np.ndarray): input_var = paddle.static.InputSpec.from_numpy(input_var) _set_spec_stop_gradient(input_var, True) - elif isinstance(input_var, core.VarBase): + elif isinstance(input_var, (core.VarBase, core.eager.Tensor)): stop_gradient = input_var.stop_gradient input_var = paddle.static.InputSpec.from_tensor(input_var) _set_spec_stop_gradient(input_var, stop_gradient) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index a442a8b92b6..216f955b751 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -148,10 +148,7 @@ class PartialProgramLayer: self._origin_main_program = self._verify_program(main_program) self._tmp_scope_vec = self._create_scope_vec() - # A fake_var to handle empty input or output - self.__fake_vars = _create_fake_var() # Set default mode to train - self._double_grads = self._get_double_grads(self._origin_main_program) self.training = True custom_white_list, custom_black_list = None, None @@ -163,6 +160,14 @@ class PartialProgramLayer: custom_white_list=custom_white_list, custom_black_list=custom_black_list) + @LazyInitialized + def __fake_vars(self): + return _create_fake_var() + + @LazyInitialized + def _double_grads(self): + return self._get_double_grads(self._origin_main_program) + @LazyInitialized def _infer_program(self): """ @@ -356,8 +361,10 @@ class PartialProgramLayer: def drop_scope_if_no_grad(self): tracer = framework._dygraph_tracer() + scope = self._tmp_scope_vec.value().get_scope() if isinstance( + self._tmp_scope_vec, (core.VarBase)) else self._tmp_scope_vec[0] if self.training and not tracer._has_grad: - self._tmp_scope_vec.value().get_scope().drop_kids() + 
scope.drop_kids() @property def program(self): @@ -449,18 +456,14 @@ class PartialProgramLayer: def _create_scope_vec(self): # Hold forward variables tmp_scope_vec = None + inner_scope = core.Scope() if not core._in_eager_mode(): tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [], "program_out_scope", core.VarDesc.VarType.STEP_SCOPES, True) - # TODO(jiabin): Support this later. - # else: - # tmp_scope_vec = core.eager.Tensor(core.VarDesc.VarType.FP32, [], - # "program_out_scope", - # core.VarDesc.VarType.STEP_SCOPES, True) - - inner_scope = core.Scope() tmp_scope_vec.value().set_scope(inner_scope) + else: + tmp_scope_vec = [inner_scope] return tmp_scope_vec def _restore_out(self, out_vars): @@ -598,12 +601,10 @@ def _create_fake_var(): core.VarDesc.VarType.RAW, False) ] else: - return [] - # TODO(jiabin): Support this later - # return [ - # core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var", - # core.VarDesc.VarType.RAW, False) - # ] + return [ + core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var", + core.VarDesc.VarType.RAW, False) + ] def partial_program_from(concrete_program): -- GitLab From 67166caf048c173b36e9e92b9d12c9c1f5591343 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Mon, 14 Mar 2022 10:19:00 +0800 Subject: [PATCH 019/176] Static-Check CI Build develop (#40138) --- paddle/scripts/paddle_build.sh | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 84f7a57999f..d1db8feb217 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -948,8 +948,17 @@ function generate_upstream_develop_api_spec() { git checkout . git checkout -b develop_base_pr upstream/$BRANCH startTime_firstBuild=`date +%s` - cmake_gen $1 - build $2 + + dev_commit=`git log -1|head -1|awk '{print $2}'` + dev_url="https://xly-devops.bj.bcebos.com/PR/build_whl/0/${dev_commit}/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl" + url_return=`curl -s -m 5 -IL ${dev_url} |awk 'NR==1{print $2}'` + if [ "$url_return" == '200' ];then + mkdir -p ${PADDLE_ROOT}/build/python/dist && wget -q -P ${PADDLE_ROOT}/build/python/dist ${dev_url} + else + cmake_gen $1 + build $2 + fi + cp ${PADDLE_ROOT}/python/requirements.txt /tmp pr_whl_size=`du -m ${PADDLE_ROOT}/build/python/dist/*.whl|awk '{print $1}'` echo "pr_whl_size: ${pr_whl_size}" -- GitLab From 5ab2cec53328bf814540a693876ab517a53f8b52 Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Mon, 14 Mar 2022 10:20:44 +0800 Subject: [PATCH 020/176] Move the common function to kernel funcs (#40422) * move the common function to kernel/funcs/sparse/ * add namespace * rm unused file * move func * reuse code --- .../phi/kernels/funcs/sparse/common_shape.h | 45 +++++ paddle/phi/kernels/funcs/sparse/convolution.h | 170 ++++++++++++++++++ .../phi/kernels/sparse/convolution_kernel.h | 96 +--------- paddle/phi/kernels/sparse/cpu/convolution.h | 37 ++-- .../sparse/cpu/convolution_grad_kernel.cc | 33 +--- .../kernels/sparse/cpu/convolution_kernel.cc | 14 +- .../kernels/sparse/cpu/sparse_utils_kernel.cc | 5 +- .../cpu/submanifold_convolution_kernel.cu | 30 ---- .../sparse/gpu/convolution_grad_kernel.cu | 33 +--- .../kernels/sparse/gpu/convolution_kernel.cu | 41 ++--- .../kernels/sparse/gpu/sparse_utils_kernel.cu | 78 ++++---- .../phi/kernels/sparse/sparse_utils_kernel.h | 31 ---- 12 files changed, 333 insertions(+), 280 deletions(-) create mode 100644 paddle/phi/kernels/funcs/sparse/common_shape.h create mode 100644 
paddle/phi/kernels/funcs/sparse/convolution.h delete mode 100644 paddle/phi/kernels/sparse/cpu/submanifold_convolution_kernel.cu diff --git a/paddle/phi/kernels/funcs/sparse/common_shape.h b/paddle/phi/kernels/funcs/sparse/common_shape.h new file mode 100644 index 00000000000..3617e3cd2f4 --- /dev/null +++ b/paddle/phi/kernels/funcs/sparse/common_shape.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/phi/core/ddim.h" + +namespace phi { +namespace funcs { +namespace sparse { + +inline const DDim InferDenseDims(const DDim& x_dims, + const int64_t sparse_dim, + const int64_t non_zero_num) { + auto dense_dim = x_dims.size() - sparse_dim; + DDim values_dims; + if (dense_dim > 0) { + std::vector dense_dim_vec(dense_dim + 1); + dense_dim_vec[0] = non_zero_num; + memcpy(&dense_dim_vec[1], + x_dims.Get() + sparse_dim, + dense_dim * sizeof(x_dims[0])); + values_dims = phi::make_ddim(dense_dim_vec); + } else { + values_dims = phi::make_ddim({non_zero_num}); + } + return values_dims; +} + +} // namespace sparse +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h new file mode 100644 index 00000000000..68fe8880a97 --- /dev/null +++ b/paddle/phi/kernels/funcs/sparse/convolution.h @@ -0,0 +1,170 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
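A small usage sketch for InferDenseDims above (shapes are assumed for illustration): the first values dimension is always the non-zero count, and any dimensions of x beyond sparse_dim are kept as the dense tail.

// x is conceptually [batch, H, W, C] = [4, 16, 16, 3] stored as COO
auto values_dims = phi::funcs::sparse::InferDenseDims(
    phi::make_ddim({4, 16, 16, 3}), /*sparse_dim=*/3, /*non_zero_num=*/128);
// values_dims == [128, 3]; with sparse_dim == 4 it would be just [128]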
*/ + +#pragma once + +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { +namespace funcs { +namespace sparse { + +struct Dims4D { + int dims[4]; + Dims4D(const int batch, const int x, const int y, const int z) { + dims[0] = batch; + dims[1] = z; + dims[2] = y; + dims[3] = x; + } + HOSTDEVICE const int& operator[](int i) const { return dims[i]; } +}; + +// Judge whether the current position x is in (lower, upper) +inline HOSTDEVICE bool Check(const int& x, + const int& kx, + const int& pad, + const int& stride, + const int dilation, + const int kdim, + const int xdim) { + const int lower = x - dilation * kx + pad; + const int uper = x + (kdim - kx - 1) * dilation - pad; + return (lower >= 0 && lower % stride == 0 && uper < xdim); +} + +// Check whether the current position(x, y, z) is legal: +// Judge the minimum and maximum values at each latitude +inline HOSTDEVICE bool Check(const Dims4D& dims, + const Dims4D& kernel_dims, + const Dims4D& paddings, + const Dims4D& dilations, + const Dims4D& strides, + const int x, + const int y, + const int z, + const int kx, + const int ky, + const int kz) { + bool x_valid = Check( + x, kx, paddings[3], strides[3], dilations[3], kernel_dims[3], dims[3]); + bool y_valid = Check( + y, ky, paddings[2], strides[2], dilations[2], kernel_dims[2], dims[2]); + bool z_valid = Check( + z, kz, paddings[1], strides[1], dilations[1], kernel_dims[1], dims[1]); + return (x_valid && y_valid && z_valid); +} + +template +inline HOSTDEVICE int PointToIndex(const int& batch, + const int& x, + const int& y, + const int& z, + const Dim& dims) { + return batch * dims[1] * dims[2] * dims[3] + z * dims[2] * dims[3] + + y * dims[3] + x; +} + +// TODO(zhangkaihuo): use division and multiply to optimize +// modulo operation +template +inline HOSTDEVICE void IndexToPoint( + const int index, const Dim& dims, int* batch, int* x, int* y, int* z) { + int n = index; + *x = n % dims[3]; + n /= dims[3]; + *y = n % dims[2]; + n /= dims[2]; + *z = n % dims[1]; + n /= dims[1]; + *batch = n; +} + +inline void GetOutShape(const DDim& x_dims, + const DDim& kernel_dims, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + DDim* out_dims) { + PADDLE_ENFORCE_EQ( + x_dims.size(), + 5, + phi::errors::InvalidArgument("the shape of x should be (N, D, H, W, C)")); + PADDLE_ENFORCE_EQ(kernel_dims.size(), + 5, + phi::errors::InvalidArgument( + "the shape of kernel should be (D, H, W, C, OC)")); + + // infer out shape + (*out_dims)[0] = x_dims[0]; + (*out_dims)[4] = kernel_dims[4]; + for (int i = 1; i < 4; i++) { + (*out_dims)[i] = (x_dims[i] + 2 * paddings[i - 1] - + dilations[i - 1] * (kernel_dims[i - 1] - 1) - 1) / + strides[i - 1] + + 1; + } +} + +inline void ResetSubmKernelSizeAndStrides(const DDim& kernel_dims, + std::vector* paddings, + std::vector* strides) { + for (uint64_t i = 0; i < paddings->size(); i++) { + (*paddings)[i] = kernel_dims[i] / 2; + (*strides)[i] = 1; + } +} + +template +inline void SubmPreProcess(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const SparseCooTensor& out_grad, + const int in_channels, + const int out_channels, + const int half_kernel_size, + DenseTensor* kernel_grad, + DenseTensor* x_grad) { + auto blas = phi::funcs::GetBlas(dev_ctx); + T* d_kernel_ptr = kernel_grad->data(); + blas.GEMM(CblasTrans, + CblasNoTrans, + x.non_zero_elements().dims()[1], + out_grad.non_zero_elements().dims()[1], + x.non_zero_elements().dims()[0], + 
static_cast(1), + x.non_zero_elements().data(), + out_grad.non_zero_elements().data(), + static_cast(0), + d_kernel_ptr + half_kernel_size * in_channels * out_channels); + + // call gemm: d_x = out_grad * transpose(kernel) + // (n, out_channels) * (out_channels, in_channels) + T* x_grad_ptr = x_grad->data(); + blas.GEMM(CblasNoTrans, + CblasTrans, + out_grad.non_zero_elements().dims()[0], + in_channels, + out_grad.non_zero_elements().dims()[1], + static_cast(1), + out_grad.non_zero_elements().data(), + kernel.data() + half_kernel_size * in_channels * out_channels, + static_cast(0), + x_grad_ptr); +} + +} // namespace sparse +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h index 778600a2285..ff2cf94edb5 100644 --- a/paddle/phi/kernels/sparse/convolution_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_kernel.h @@ -18,105 +18,11 @@ limitations under the License. */ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" namespace phi { namespace sparse { -struct Dims4D { - int dims[4]; - Dims4D(const int batch, const int x, const int y, const int z) { - dims[0] = batch; - dims[1] = z; - dims[2] = y; - dims[3] = x; - } - HOSTDEVICE const int& operator[](int i) const { return dims[i]; } -}; - -// Judge whether the current position x is in (lower, upper) -inline HOSTDEVICE bool Check(const int& x, - const int& kx, - const int& pad, - const int& stride, - const int dilation, - const int kdim, - const int xdim) { - const int lower = x - dilation * kx + pad; - const int uper = x + (kdim - kx - 1) * dilation - pad; - return (lower >= 0 && lower % stride == 0 && uper < xdim); -} - -// Check whether the current position(x, y, z) is legal: -// Judge the minimum and maximum values at each latitude -inline HOSTDEVICE bool Check(const Dims4D& dims, - const Dims4D& kernel_dims, - const Dims4D& paddings, - const Dims4D& dilations, - const Dims4D& strides, - const int x, - const int y, - const int z, - const int kx, - const int ky, - const int kz) { - bool x_valid = Check( - x, kx, paddings[3], strides[3], dilations[3], kernel_dims[3], dims[3]); - bool y_valid = Check( - y, ky, paddings[2], strides[2], dilations[2], kernel_dims[2], dims[2]); - bool z_valid = Check( - z, kz, paddings[1], strides[1], dilations[1], kernel_dims[1], dims[1]); - return (x_valid && y_valid && z_valid); -} - -template -inline HOSTDEVICE int PointToIndex(const int& batch, - const int& x, - const int& y, - const int& z, - const Dim& dims) { - return batch * dims[1] * dims[2] * dims[3] + z * dims[2] * dims[3] + - y * dims[3] + x; -} - -template -inline HOSTDEVICE void IndexToPoint( - const int index, const Dim& dims, int* batch, int* x, int* y, int* z) { - int n = index; - *x = n % dims[3]; - n /= dims[3]; - *y = n % dims[2]; - n /= dims[2]; - *z = n % dims[1]; - n /= dims[1]; - *batch = n; -} - -inline void GetOutShape(const DDim& x_dims, - const DDim& kernel_dims, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - DDim* out_dims) { - PADDLE_ENFORCE_EQ( - x_dims.size(), - 5, - phi::errors::InvalidArgument("the shape of x should be (N, D, H, W, C)")); - PADDLE_ENFORCE_EQ(kernel_dims.size(), - 5, - phi::errors::InvalidArgument( - "the shape of kernel should be (D, H, W, C, OC)")); - - // infer out shape - (*out_dims)[0] = x_dims[0]; - (*out_dims)[4] = 
kernel_dims[4]; - for (int i = 1; i < 4; i++) { - (*out_dims)[i] = (x_dims[i] + 2 * paddings[i - 1] - - dilations[i - 1] * (kernel_dims[i - 1] - 1) - 1) / - strides[i - 1] + - 1; - } -} - template void Conv3dKernel(const Context& dev_ctx, const SparseCooTensor& x, diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index a5a946dce79..64c32df1897 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -16,8 +16,6 @@ limitations under the License. */ #include -#include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/sparse_coo_tensor.h" @@ -28,6 +26,8 @@ limitations under the License. */ namespace phi { namespace sparse { +using Dims4D = phi::funcs::sparse::Dims4D; + // such as: kernel(3, 3, 3), kernel_size = 27 // counter_per_weight: (kernel_size) // TODO(zhangkaihuo): optimize performance with multithreading @@ -67,7 +67,8 @@ void ProductRuleBook(const Context& dev_ctx, int in_z = indices_ptr[i + non_zero_num]; int in_y = indices_ptr[i + 2 * non_zero_num]; int in_x = indices_ptr[i + 3 * non_zero_num]; - int index = PointToIndex(batch, in_x, in_y, in_z, x_dims); + int index = phi::funcs::sparse::PointToIndex( + batch, in_x, in_y, in_z, x_dims); hash_in.insert(index); } } @@ -86,20 +87,20 @@ void ProductRuleBook(const Context& dev_ctx, int out_z = (in_z + paddings[0] - kz * dilations[0]) / strides[0]; int out_y = (in_y + paddings[1] - ky * dilations[1]) / strides[1]; int out_x = (in_x + paddings[2] - kx * dilations[2]) / strides[2]; - if (Check(c_x_dims, - c_kernel_dims, - c_paddings, - c_dilations, - c_strides, - in_x, - in_y, - in_z, - kx, - ky, - kz)) { + if (phi::funcs::sparse::Check(c_x_dims, + c_kernel_dims, + c_paddings, + c_dilations, + c_strides, + in_x, + in_y, + in_z, + kx, + ky, + kz)) { if (subm) { - int out_index = - PointToIndex(batch, out_x, out_y, out_z, out_dims); + int out_index = phi::funcs::sparse::PointToIndex( + batch, out_x, out_y, out_z, out_dims); if (hash_in.find(out_index) == hash_in.end()) { continue; } @@ -112,7 +113,7 @@ void ProductRuleBook(const Context& dev_ctx, rulebook_ptr[rulebook_index] = kernel_index - 1; rulebook_ptr[rulebook_index + rulebook_len] = i; // in_i rulebook_ptr[rulebook_index + rulebook_len * 2] = - PointToIndex( + phi::funcs::sparse::PointToIndex( batch, out_x, out_y, out_z, out_dims); // out_index ++rulebook_index; } @@ -161,7 +162,7 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx, for (auto it = out_indexs.begin(); it != out_indexs.end(); it++, i++) { const int index = *it; int batch, x, y, z; - IndexToPoint(index, out_dims, &batch, &x, &y, &z); + phi::funcs::sparse::IndexToPoint(index, out_dims, &batch, &x, &y, &z); out_indices_ptr[i] = batch; out_indices_ptr[i + out_non_zero_num] = z; out_indices_ptr[i + out_non_zero_num * 2] = y; diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc index bb414faef67..5d7b381b7cb 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -94,30 +94,15 @@ void Conv3dGradKernel(const Context& dev_ctx, offsets[kernel_size] = offset; if (subm) { - blas.GEMM(CblasTrans, - CblasNoTrans, - x.non_zero_elements().dims()[1], - out_grad.non_zero_elements().dims()[1], - 
x.non_zero_elements().dims()[0], - static_cast(1), - x.non_zero_elements().data(), - out_grad.non_zero_elements().data(), - static_cast(0), - d_kernel_ptr + half_kernel_size * in_channels * out_channels); - - // call gemm: d_x = out_grad * transpose(kernel) - // (n, out_channels) * (out_channels, in_channels) - T* x_grad_ptr = x_grad->data(); - blas.GEMM(CblasNoTrans, - CblasTrans, - out_grad.non_zero_elements().dims()[0], - in_channels, - out_grad.non_zero_elements().dims()[1], - static_cast(1), - out_grad.non_zero_elements().data(), - kernel.data() + half_kernel_size * in_channels * out_channels, - static_cast(0), - x_grad_ptr); + phi::funcs::sparse::SubmPreProcess(dev_ctx, + x, + kernel, + out_grad, + in_channels, + out_channels, + half_kernel_size, + kernel_grad, + x_grad); if (max_count == 0) { return; } diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index f65e1cf579a..746ca04a826 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/sparse/cpu/convolution.h" -#include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -46,10 +44,16 @@ void Conv3dKernel(const Context& dev_ctx, const auto& kernel_dims = kernel.dims(); int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; DDim out_dims = {1, 1, 1, 1, 1}; - GetOutShape(x_dims, kernel_dims, paddings, dilations, strides, &out_dims); + phi::funcs::sparse::GetOutShape( + x_dims, kernel_dims, paddings, dilations, strides, &out_dims); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; + std::vector subm_paddings(paddings), subm_strides(strides); + if (subm) { + phi::funcs::sparse::ResetSubmKernelSizeAndStrides( + kernel.dims(), &subm_paddings, &subm_strides); + } // Second algorithm: // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf // 1. product rulebook @@ -60,9 +64,9 @@ void Conv3dKernel(const Context& dev_ctx, ProductRuleBook(dev_ctx, x, kernel, - paddings, + subm_paddings, dilations, - strides, + subm_strides, out_dims, subm, rulebook, diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index ba89135641e..50e95ee0b8a 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -14,9 +14,9 @@ limitations under the License. 
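The SubmPreProcess helper used in the submanifold branch above folds the offset at half_kernel_size (the center element for an odd kernel size) into two dense GEMMs before the per-offset rulebook loop. With nnz non-zero input points, the dimension bookkeeping is:

kernel_grad[center] = transpose(X_values) * dOut_values, i.e. (in_channels x nnz) * (nnz x out_channels)
x_grad = dOut_values * transpose(kernel[center]), i.e. (nnz x out_channels) * (out_channels x in_channels)

where the center slice starts at offset half_kernel_size * in_channels * out_channels, matching the pointer arithmetic in the GEMM calls.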
*/ #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" #include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/sparse/common_shape.h" namespace phi { namespace sparse { @@ -71,7 +71,8 @@ void DenseToSparseCooKernel(const Context& dev_ctx, int64_t non_zero_num = GetNonZeroNum(x, sparse_dim); const auto place = dev_ctx.GetPlace(); - const auto values_dims = InferDenseDims(x_dims, sparse_dim, non_zero_num); + const auto values_dims = + phi::funcs::sparse::InferDenseDims(x_dims, sparse_dim, non_zero_num); DenseTensorMeta indices_meta(DataType::INT64, {sparse_dim, static_cast(non_zero_num)}, DataLayout::NCHW); diff --git a/paddle/phi/kernels/sparse/cpu/submanifold_convolution_kernel.cu b/paddle/phi/kernels/sparse/cpu/submanifold_convolution_kernel.cu deleted file mode 100644 index 5f6d24093a4..00000000000 --- a/paddle/phi/kernels/sparse/cpu/submanifold_convolution_kernel.cu +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/core/tensor_meta.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/sparse/submanifold_convolution_kernel.h" - -namespace phi { -namespace sparse {} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index a307ab0f546..d6d992d0f4b 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -110,30 +110,15 @@ void Conv3dGradKernel(const Context& dev_ctx, offsets[kernel_size] = offset; if (subm) { - blas.GEMM(CblasTrans, - CblasNoTrans, - x.non_zero_elements().dims()[1], - out_grad.non_zero_elements().dims()[1], - x.non_zero_elements().dims()[0], - static_cast(1), - x.non_zero_elements().data(), - out_grad.non_zero_elements().data(), - static_cast(0), - d_kernel_ptr + half_kernel_size * in_channels * out_channels); - - // call gemm: d_x = out_grad * transpose(kernel) - // (n, out_channels) * (out_channels, in_channels) - T* x_grad_ptr = x_grad->data(); - blas.GEMM(CblasNoTrans, - CblasTrans, - out_grad.non_zero_elements().dims()[0], - in_channels, - out_grad.non_zero_elements().dims()[1], - static_cast(1), - out_grad.non_zero_elements().data(), - kernel.data() + half_kernel_size * in_channels * out_channels, - static_cast(0), - x_grad_ptr); + phi::funcs::sparse::SubmPreProcess(dev_ctx, + x, + kernel, + out_grad, + in_channels, + out_channels, + half_kernel_size, + kernel_grad, + x_grad); if (max_count == 0) { return; } diff --git 
a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 94186600f1e..1a0c7e9b972 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -33,6 +33,8 @@ limitations under the License. */ namespace phi { namespace sparse { +using Dims4D = phi::funcs::sparse::Dims4D; + __global__ void SetFlagAndUpdateCounterKernel(const int* indexs, const int n, const int rulebook_len, @@ -83,7 +85,8 @@ __global__ void UpdateIndexKernel(const int* unique_keys, for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { const int index = unique_keys[i]; int batch, x, y, z; - IndexToPoint(index, out_dims, &batch, &x, &y, &z); + phi::funcs::sparse::IndexToPoint( + index, out_dims, &batch, &x, &y, &z); // get out indices out_indices[i] = batch; out_indices[i + non_zero_num] = z; @@ -150,23 +153,23 @@ __global__ void ProductRuleBookKernel(const int* x_indices, for (int ky = 0; ky < kernel_dims[2]; ky++) { for (int kx = 0; kx < kernel_dims[3]; kx++) { int in_i = -1, out_index = -1, kernel_i = -1; - if (Check(x_dims, - kernel_dims, - paddings, - dilations, - strides, - in_x, - in_y, - in_z, - kx, - ky, - kz)) { + if (phi::funcs::sparse::Check(x_dims, + kernel_dims, + paddings, + dilations, + strides, + in_x, + in_y, + in_z, + kx, + ky, + kz)) { int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1]; int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2]; int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3]; in_i = i; - out_index = - PointToIndex(batch, out_x, out_y, out_z, out_dims); + out_index = phi::funcs::sparse::PointToIndex( + batch, out_x, out_y, out_z, out_dims); atomicAdd(&counter_buf[kernel_index], 1); kernel_i = kernel_index; } @@ -542,7 +545,8 @@ void Conv3dKernel(const Context& dev_ctx, const auto& kernel_dims = kernel.dims(); int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; DDim out_dims = {1, 1, 1, 1, 1}; - GetOutShape(x_dims, kernel_dims, paddings, dilations, strides, &out_dims); + phi::funcs::sparse::GetOutShape( + x_dims, kernel_dims, paddings, dilations, strides, &out_dims); out->set_dims(out_dims); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; @@ -564,11 +568,8 @@ void Conv3dKernel(const Context& dev_ctx, std::vector subm_paddings(paddings), subm_strides(strides); if (subm) { - auto kernel_dims = kernel.dims(); - for (int i = 0; i < paddings.size(); i++) { - subm_paddings[i] = kernel_dims[i] / 2; - subm_strides[i] = 1; - } + phi::funcs::sparse::ResetSubmKernelSizeAndStrides( + kernel.dims(), &subm_paddings, &subm_strides); } int n = ProductRuleBook(dev_ctx, diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index 2e741111fb1..8048180e425 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -16,8 +16,10 @@ limitations under the License. 
*/ #include #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/sparse/common_shape.h" #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" namespace phi { @@ -115,14 +117,16 @@ void DenseToSparseCooKernel(const Context& dev_ctx, PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(nums_ptr, 0, sizeof(int), dev_ctx.stream())); #endif - int grid_size = 1, block_size = 1; - GetGpuLaunchConfig1D(dev_ctx, rows, &grid_size, &block_size); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); auto temp_indexs_meta = phi::DenseTensorMeta(DataType::INT32, {rows}, phi::DataLayout::NCHW); DenseTensor temp_indexs = phi::Empty(dev_ctx, std::move(temp_indexs_meta)); int* temp_indexs_ptr = temp_indexs.mutable_data(place); - GetNonZeroNums<<>>( + GetNonZeroNums<<>>( x_data, rows, cols, nums_ptr, temp_indexs_ptr); #ifdef PADDLE_WITH_HIP thrust::remove(thrust::hip::par.on(dev_ctx.stream()), @@ -167,7 +171,8 @@ void DenseToSparseCooKernel(const Context& dev_ctx, dev_ctx.Wait(); // wait the copy - const auto values_dims = InferDenseDims(x_dims, sparse_dim, non_zero_num); + const auto values_dims = + phi::funcs::sparse::InferDenseDims(x_dims, sparse_dim, non_zero_num); DenseTensorMeta indices_meta(DataType::INT64, {sparse_dim, static_cast(non_zero_num)}, DataLayout::NCHW); @@ -184,16 +189,18 @@ void DenseToSparseCooKernel(const Context& dev_ctx, T* sparse_data = values.mutable_data(place); // 3. calc indices by indexs and get values by indexs - GetGpuLaunchConfig1D(dev_ctx, non_zero_num, &grid_size, &block_size); - GetNonZeroElementsAndIndices<<>>( - x_data, - sparse_dim, - cols, - d_x_dims.data(), - non_zero_num, - temp_indexs_ptr, - indices_data, - sparse_data); + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + GetNonZeroElementsAndIndices<<>>(x_data, + sparse_dim, + cols, + d_x_dims.data(), + non_zero_num, + temp_indexs_ptr, + indices_data, + sparse_data); out->SetMember(indices, values, x_dims, true); } @@ -263,10 +270,9 @@ void SparseCsrToCooKernel(const Context& dev_ctx, int* offsets_ptr = batchs == 1 ? 
nullptr : offsets.mutable_data(place); T* coo_values_data = values.mutable_data(place); - int grid_size = 1, block_size = 1; if (batchs > 1) { - GetGpuLaunchConfig1D(dev_ctx, batchs, &grid_size, &block_size); - GetBatchSizes<<>>( + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batchs, 1); + GetBatchSizes<<>>( csr_crows_data, rows, batchs, offsets_ptr); #ifdef PADDLE_WITH_HIP @@ -279,9 +285,10 @@ void SparseCsrToCooKernel(const Context& dev_ctx, offsets_ptr); } - GetGpuLaunchConfig1D(dev_ctx, rows, &grid_size, &block_size); - dim3 grids(grid_size, batchs, 1); - ConvertCsrCrowsToCooRows<<>>( + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1); + config.block_per_grid.y = batchs; + ConvertCsrCrowsToCooRows<<>>( csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows); #ifdef PADDLE_WITH_HIP @@ -404,21 +411,29 @@ void SparseCooToCsrKernel(const Context& dev_ctx, // TODO(zhangkahuo): call coalesced() to distinct and sort the indices } - int grid_size = 1, block_size = 1; - GetGpuLaunchConfig1D(dev_ctx, batchs, &grid_size, &block_size); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batchs, 1); if (batchs > 1) { DenseTensorMeta batchs_meta(DataType::INT64, {batchs}, DataLayout::NCHW); phi::DenseTensor batchs_offset( phi::make_intrusive(place), std::move(batchs_meta)); int64_t* batchs_offset_ptr = batchs_offset.mutable_data(place); - GetBatchsOffset<<>>( + GetBatchsOffset<<>>( batchs_ptr, non_zero_num, batchs_offset_ptr); - dim3 grids(grid_size, batchs, 1); - ConvertCooRowsToCsrCrows<<>>( + config.block_per_grid.y = batchs; + ConvertCooRowsToCsrCrows<<>>( batchs_offset_ptr, coo_rows_data, csr_crows_data, rows, non_zero_num); } else { - ConvertCooRowsToCsrCrows<<>>( + ConvertCooRowsToCsrCrows<<>>( nullptr, coo_rows_data, csr_crows_data, rows, non_zero_num); } @@ -522,12 +537,13 @@ void SparseCooToDenseKernel(const Context& dev_ctx, PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(out_data, 0, sizeof(T) * out->numel(), dev_ctx.stream())); #endif - int grid_size = 1, block_size = 1; - GetGpuLaunchConfig1D(dev_ctx, non_zero_num, &grid_size, &block_size); + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); - KernelSparseCooToDense< - T, - int64_t><<>>( + KernelSparseCooToDense<<>>( indices.data(), d_sparse_offsets.data(), x_data, diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index c83b2130ed4..da05eb3d3cf 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -23,37 +23,6 @@ limitations under the License. */ namespace phi { namespace sparse { -inline const DDim InferDenseDims(const DDim& x_dims, - const int64_t sparse_dim, - const int64_t non_zero_num) { - auto dense_dim = x_dims.size() - sparse_dim; - DDim values_dims; - if (dense_dim) { - std::vector dense_dim_vec(dense_dim + 1); - dense_dim_vec[0] = non_zero_num; - memcpy(&dense_dim_vec[1], - x_dims.Get() + sparse_dim, - dense_dim * sizeof(x_dims[0])); - values_dims = phi::make_ddim(dense_dim_vec); - } else { - values_dims = phi::make_ddim({non_zero_num}); - } - return values_dims; -} - -template -inline void GetGpuLaunchConfig1D(const Context& dev_ctx, - const int64_t n, - int* grid_size, - int* block_size) { - const int MAX_BLOCK_DIM = dev_ctx.GetMaxThreadsPerBlock(); - const int MAX_GRID_DIM = dev_ctx.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; - *block_size = (n >= MAX_BLOCK_DIM) ? 
MAX_BLOCK_DIM - : (1 << static_cast(std::log2(n))); - *grid_size = n / *block_size; - *grid_size = (*grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : *grid_size; -} - template void DenseToSparseCooKernel(const Context& dev_ctx, const DenseTensor& x, -- GitLab From 88ec08a70e9a6501983cbc671812be3433f9f08a Mon Sep 17 00:00:00 2001 From: From00 Date: Mon, 14 Mar 2022 10:48:38 +0800 Subject: [PATCH 021/176] Move Pool OPs to phi (#40208) * Move Pool OPs to phi * Fix CI error * Fix conflicts --- paddle/fluid/framework/infershape_utils.cc | 3 +- paddle/fluid/imperative/prepared_operator.h | 23 +- .../inference/tensorrt/convert/pool2d_op.cc | 2 +- .../inference/tensorrt/convert/pool3d_op.cc | 2 +- .../tensorrt/convert/test_pool2d_op.cc | 2 +- .../tensorrt/plugin/pool3d_op_plugin.cu | 22 +- .../tensorrt/plugin/pool_op_plugin.cu | 22 +- paddle/fluid/operators/flatten_op.h | 1 - paddle/fluid/operators/math/CMakeLists.txt | 1 - paddle/fluid/operators/math/pooling.h | 315 ---- .../fluid/operators/mkldnn/pool_mkldnn_op.cc | 16 +- .../operators/mkldnn/test_mkldnn_op_nhwc.cc | 3 +- paddle/fluid/operators/pool_cudnn_op.cu.cc | 567 ------- paddle/fluid/operators/pool_op.cc | 182 +-- paddle/fluid/operators/pool_op.cu | 48 - paddle/fluid/operators/pool_op.h | 304 +--- paddle/fluid/operators/pool_op_mlu.cc | 15 +- paddle/fluid/operators/pool_op_npu.cc | 12 +- paddle/fluid/operators/pool_op_xpu.cc | 6 +- paddle/fluid/operators/pool_with_index_op.cc | 135 +- .../fluid/operators/pool_with_index_op.cu.cc | 43 - paddle/fluid/operators/pool_with_index_op.h | 121 -- paddle/fluid/operators/spp_op.h | 31 +- paddle/fluid/operators/squeeze_op.h | 1 - paddle/fluid/operators/unsqueeze_op.h | 1 - paddle/phi/core/meta_tensor.h | 6 +- paddle/phi/infermeta/backward.cc | 29 + paddle/phi/infermeta/backward.h | 25 + paddle/phi/infermeta/unary.cc | 185 +++ paddle/phi/infermeta/unary.h | 24 + paddle/phi/kernels/CMakeLists.txt | 23 +- paddle/phi/kernels/concat_kernel.h | 2 +- paddle/phi/kernels/cpu/pool_grad_kernel.cc | 49 + paddle/phi/kernels/cpu/pool_kernel.cc | 41 + paddle/phi/kernels/cpu/split_kernel.cc | 2 +- paddle/phi/kernels/funcs/CMakeLists.txt | 9 +- .../math => phi/kernels/funcs}/pooling.cc | 423 ++--- .../math => phi/kernels/funcs}/pooling.cu | 1421 +++++++++++------ paddle/phi/kernels/funcs/pooling.h | 469 ++++++ paddle/phi/kernels/gpu/pool_grad_kernel.cu | 60 + paddle/phi/kernels/gpu/pool_kernel.cu | 54 + paddle/phi/kernels/gpu/split_kernel.cu | 2 +- paddle/phi/kernels/gpudnn/pool_gpudnn.h | 43 + paddle/phi/kernels/gpudnn/pool_grad_kernel.cu | 448 ++++++ paddle/phi/kernels/gpudnn/pool_kernel.cu | 312 ++++ .../phi/kernels/impl/pool_grad_kernel_impl.h | 332 ++++ paddle/phi/kernels/impl/pool_kernel_impl.h | 321 ++++ paddle/phi/kernels/pool_grad_kernel.h | 145 ++ paddle/phi/kernels/pool_kernel.h | 105 ++ paddle/phi/ops/compat/pool_sig.cc | 154 ++ paddle/phi/tests/core/test_meta_fn_utils.cc | 6 +- 51 files changed, 4148 insertions(+), 2420 deletions(-) delete mode 100644 paddle/fluid/operators/math/pooling.h delete mode 100644 paddle/fluid/operators/pool_cudnn_op.cu.cc delete mode 100644 paddle/fluid/operators/pool_op.cu delete mode 100644 paddle/fluid/operators/pool_with_index_op.cu.cc delete mode 100644 paddle/fluid/operators/pool_with_index_op.h create mode 100644 paddle/phi/kernels/cpu/pool_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/pool_kernel.cc rename paddle/{fluid/operators/math => phi/kernels/funcs}/pooling.cc (83%) rename paddle/{fluid/operators/math => phi/kernels/funcs}/pooling.cu (54%) create mode 
100644 paddle/phi/kernels/funcs/pooling.h create mode 100644 paddle/phi/kernels/gpu/pool_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/pool_kernel.cu create mode 100644 paddle/phi/kernels/gpudnn/pool_gpudnn.h create mode 100644 paddle/phi/kernels/gpudnn/pool_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpudnn/pool_kernel.cu create mode 100644 paddle/phi/kernels/impl/pool_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/pool_kernel_impl.h create mode 100644 paddle/phi/kernels/pool_grad_kernel.h create mode 100644 paddle/phi/kernels/pool_kernel.h create mode 100644 paddle/phi/ops/compat/pool_sig.cc diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index f57674d5601..5119c306906 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -297,7 +297,8 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, VLOG(3) << "BuildInferMetaContext: op kernel signature - " << signature; // 2. build infermeta context - phi::InferMetaContext infer_meta_context(ctx->IsRuntime()); + phi::InferMetaContext infer_meta_context( + {ctx->IsRuntime(), ctx->IsRunMKLDNNKernel()}); auto& input_names = std::get<0>(signature.args); auto& attr_names = std::get<1>(signature.args); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index d7c0c8cc547..91e6974fa2e 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -264,14 +264,23 @@ void BuildDygraphPhiKernelContext( size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second); - if ((it == ins.end()) && - (input_defs[i].type_index == - std::type_index(typeid(paddle::optional)))) { - kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); - auto end_idx = start_idx + 1; - kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); - continue; + if (it == ins.end()) { + if (LIKELY(input_defs[i].type_index == + std::type_index( + typeid(paddle::optional)))) { + kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); + auto end_idx = start_idx + 1; + kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); + continue; + } else { + PADDLE_THROW(phi::errors::NotFound( + "Can not find input variable '%s' for %s OP, please check whether " + "the name setting in OpArgumentMapping is consistent with that in " + "OpMaker.", + input_names[i], pt_kernel_signature.name)); + } } + auto ins_vector = it->second; size_t end_idx = start_idx + ins_vector.size(); diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index fe04d552e40..7b65d2d7c97 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -328,5 +328,5 @@ class Pool2dOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc index b8e87a8d94d..5a306f622ad 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc @@ -224,5 +224,5 @@ class Pool3dOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(pool3d); +USE_OP_ITSELF(pool3d); REGISTER_TRT_OP_CONVERTER(pool3d, 
Pool3dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc index bded833505c..36f13262a73 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -71,4 +71,4 @@ TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); } } // namespace inference } // namespace paddle -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu index 861a9aa9d00..5596a89a083 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace inference { @@ -108,16 +108,14 @@ int Pool3DPlugin::enqueue(int batchSize, const void *const *inputs, output_shape.insert(output_shape.begin(), batchSize); if (pool3d_type_ == Pool3DType::max) { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, adaptive_, odatas[0], stream, pool_process); } else if (pool3d_type_ == Pool3DType::avg) { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, adaptive_, odatas[0], stream, pool_process); @@ -351,16 +349,14 @@ int Pool3DPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, } if (pool3d_type_ == "max") { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, adaptive_, output, stream, pool_process); } else if (pool3d_type_ == "avg") { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, adaptive_, output, stream, pool_process); diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index 6d711c26adc..9bfe98d759d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace inference { @@ -84,16 +84,14 @@ int PoolPlugin::enqueue(int batchSize, const void *const *inputs, output_shape.insert(output_shape.begin(), batchSize); if (pool_type_ == PoolType::max) { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, false, odatas[0], stream, pool_process); } else if (pool_type_ == PoolType::avg) { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, exclusive_, adaptive_, odatas[0], stream, @@ -292,16 +290,14 @@ int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, } if (pool_type_ == "max") { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, false, output, stream, pool_process); } else if (pool_type_ == "avg") { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, exclusive_, adaptive_, output, stream, pool_process); diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 5ef13b38c8a..feae954e355 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/phi_utils.h" -#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/flatten_grad_kernel.h" diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 31a98d9f630..af1069cb867 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -20,7 +20,6 @@ math_library(sampler DEPS generator) # math_library(math_function DEPS blas dense_tensor tensor) math_library(maxouting) -math_library(pooling) if(WITH_MKLDNN) math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler) diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h deleted file mode 100644 index dfd3dad3864..00000000000 --- a/paddle/fluid/operators/math/pooling.h +++ /dev/null @@ -1,315 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { -namespace math { - -/* - * \brief Extracting simple operations from pooling. - * Both MaxPool and AvgPool need "initial", "compute" and "finalize" - * operation. - * MaxPool initializes temp variable to the negative maximum to find the - * maximum value in the pooling field. - * AvgPool initializes temp variable to the zero to accumulate all values - * in pool pooling, and finally takes the average. - * MaxPoolGrad and AvgPoolGrad are gradient operations respectively. - */ -template -class MaxPool { - public: - DEVICE inline T initial() { return static_cast(-FLT_MAX); } - DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; } - DEVICE inline void finalize(const T& pool_field, T* y) {} -}; - -template -class AvgPool { - using MT = typename details::MPTypeTrait::Type; - MT intermediate_res; - - public: - DEVICE inline T initial() { - intermediate_res = static_cast(0.0f); - return static_cast(0); - } - - DEVICE inline void compute(const T& x, T* y) { - intermediate_res += static_cast(x); - } - - DEVICE inline void finalize(const T& pool_field, T* y) { - *y = static_cast(intermediate_res / (static_cast(pool_field))); - } -}; - -template -class MaxPoolGrad { - public: - static constexpr bool use_x = true; - HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, - T* dx) { - *dx += dy * static_cast(x == y); - } -}; - -template -class AvgPoolGrad { - public: - static constexpr bool use_x = false; - HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, - T* dx) { - *dx += (scale * dy); - } -}; - -/* used for adaptive pool to calculate start and end index of each divided grid - */ -HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) { - return static_cast( - floor(static_cast(ph * input_size) / output_size)); -} - -HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { - return static_cast( - ceil(static_cast((ph + 1) * input_size) / output_size)); -} - -/* - * \brief Getting pooling results, and calculating gradient. - * - * In pool2d, all Tensors are in NCHW or NHWC format. Where N is batch size, C - * is the number of channels, H and W is the height and width of feature. - * In pool3d, all Tensors are in NCDHW or NDHWC format. Where N is batch size, C - * is the number of channels, D, H and W is the depth, height and width of - * feature. - * - * In max pooling, it is possible that the pooling region has multiple maximum - * elements. In this case, we should compute the gradient of the first maximum - * element. - * This is different from average pooling. So we rewrite the max_pool_grad: - * MaxPool2dGradFunctor, MaxPool3dGradFunctor. 
- */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class Pool2dDirectCUDAFunctor { - public: - void operator()(const T* input, const std::vector& input_shape, - const std::vector& output_shape, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, T* output, gpuStream_t stream, - PoolProcess pool_compute); -}; -#endif - -template -class Pool2dFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, - PoolProcess pool_compute); - - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_compute); -}; - -template -class Pool2dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_compute); -}; - -template -class MaxPool2dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* input_grad); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class Pool3dDirectCUDAFunctor { - public: - void operator()(const T* input, const std::vector& input_shape, - const std::vector& output_shape, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, T* output, gpuStream_t stream, - PoolProcess pool_compute); -}; -#endif - -template -class Pool3dFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, 
bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_compute); -}; - -template -class Pool3dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_compute); -}; - -template -class MaxPool3dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* input_grad); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad); -}; - -/* - * \brief Getting max pooling results and corresponding max index, and - * calculating gradient. - * In up-sampling-pooling, it is necessary to know max element index. - * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in - * NCDHW format. - */ -template -class MaxPool2dWithIndexFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask); -}; - -template -class MaxPool2dWithIndexGradFunctor { - public: - void operator()(const DeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad); -}; - -template -class MaxPool3dWithIndexFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask); -}; - -template -class MaxPool3dWithIndexGradFunctor { - public: - void operator()(const DeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad); -}; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index ab02d4cfed9..1078b451c55 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { using framework::DataLayout; +using framework::Tensor; using dnnl::memory; using dnnl::pooling_backward; using dnnl::pooling_forward; @@ -83,11 +85,11 @@ class PoolingMKLDNNHandler phi::slice_ddim(input_dims, 2, input_dims.size()); if (global_pooling) { - operators::UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } - operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, + data_dims, strides, ksize); const auto src_tz = phi::vectorize(input->dims()); const auto dst_tz = phi::vectorize(output->dims()); @@ -173,11 +175,11 @@ class PoolingMKLDNNHandler framework::DDim data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); if (global_pooling) { - operators::UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } - operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, + data_dims, strides, ksize); auto src_tz = phi::vectorize(in_x->dims()); auto diff_src_tz = phi::vectorize(in_x_grad->dims()); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 9d0062e3138..717af61b858 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -26,13 +26,14 @@ #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/kernel_registry.h" -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); +PD_DECLARE_KERNEL(pool2d, CPU, ALL_LAYOUT); PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); namespace paddle { diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc deleted file mode 100644 index 6335004e69a..00000000000 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ /dev/null @@ -1,567 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/pool_op.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/operator.h" -#endif -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor; -using DataLayout = platform::DataLayout; -using PoolingMode = platform::PoolingMode; -template -using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; - -DataLayout getLayoutFromStr(std::string data_format) { - if (data_format == "NHWC") { - return DataLayout::kNHWC; - } else if (data_format == "NCHW") { - return DataLayout::kNCHW; - } else if (data_format == "NCDHW") { - return DataLayout::kNCDHW; - } else { - return DataLayout::kNCDHW; - } -} - -template -class PoolCUDNNOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("Pool operator CUDA kernel must use " - "CUDAPlace rather than CPUPlace.")); - - const Tensor *input = ctx.Input("X"); - Tensor *output = ctx.Output("Out"); - output->mutable_data(ctx.GetPlace()); - std::string pooling_type = ctx.Attr("pooling_type"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string data_format = ctx.Attr("data_format"); - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = input->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; - const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; - - // -----------------transformed tensor ------------------------ - - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - DataLayout layout; - - if (data_format == str_NDHWC) { - layout = DataLayout::kNCDHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 4, 1, 2, 3}; - - // input - transformed_input.Resize(input->dims()); - - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[4]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - in_dims_vec[4] = input->dims()[3]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans5; - trans5(dev_ctx, *input, &transformed_input, axis); - - // output - 
transformed_output.Resize(output->dims()); - - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[4]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - out_dims_vec[4] = output->dims()[3]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - } else if (data_format == str_NHWC) { - layout = DataLayout::kNCHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 3, 1, 2}; - - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[3]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans; - trans(dev_ctx, *input, &transformed_input, axis); - - transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[3]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); -#endif - } else { - layout = getLayoutFromStr(data_format); - transformed_input = *input; - transformed_output = *output; - } - - const T *tranformed_input_data = transformed_input.data(); - T *tranformed_output_data = transformed_output.mutable_data( - transformed_output.dims(), ctx.GetPlace()); - - // ------------------- cudnn descriptors --------------------- - ScopedTensorDescriptor input_desc; - ScopedTensorDescriptor output_desc; - ScopedPoolingDescriptor pool_desc; - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#else - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#endif - PoolingMode pooling_mode; - if (pooling_type == "max") { - pooling_mode = PoolingMode::kMaximum; - } else { - pooling_mode = exclusive ? 
PoolingMode::kAverageExclusive - : PoolingMode::kAverageInclusive; - } - -#ifdef PADDLE_WITH_HIP - miopenPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#else - cudnnPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#endif - - // ------------------- cudnn pool algorithm --------------------- - auto handle = ctx.cuda_device_context().cudnn_handle(); - ScalingParamType alpha = 1.0f, beta = 0.0f; - -#ifdef PADDLE_WITH_HIP - char *pool_workspace; - size_t pool_worksize = 0; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenPoolingGetWorkSpaceSizeV2( - cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingForward( - handle, cudnn_pool_desc, &alpha, cudnn_input_desc, - tranformed_input_data, &beta, cudnn_output_desc, tranformed_output_data, - false, pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingForward( - handle, cudnn_pool_desc, &alpha, cudnn_input_desc, - tranformed_input_data, &beta, cudnn_output_desc, - tranformed_output_data)); -#endif - // add - if (data_format == str_NDHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 4, 1}; - phi::funcs::Transpose - trans5_v2; - trans5_v2(dev_ctx, transformed_output, output, axis); - } -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - if (data_format == str_NHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 1}; - phi::funcs::Transpose trans; - trans(dev_ctx, transformed_output, output, axis); - } -#endif - } -}; - -template -class PoolCUDNNGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("Pool operator CUDA kernel must use " - "CUDAPlace rather than CPUPlace.")); - - const Tensor *input = ctx.Input("X"); - const Tensor *output = ctx.Input("Out"); - const Tensor *output_grad = - ctx.Input(framework::GradVarName("Out")); - Tensor *input_grad = ctx.Output(framework::GradVarName("X")); - - std::string pooling_type = ctx.Attr("pooling_type"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string data_format = ctx.Attr("data_format"); - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - -#ifdef PADDLE_WITH_HIP - if (pooling_type == "max") { - using OpKernelMap = paddle::framework::OperatorWithKernel::OpKernelMap; - using OpKernelFunc = paddle::framework::OperatorWithKernel::OpKernelFunc; - auto &all_op_kernels = - paddle::framework::OperatorWithKernel::AllOpKernels(); - std::string op_type = "pool2d_grad"; - auto kernels_iter = all_op_kernels.find(op_type); - PADDLE_ENFORCE_NE( - kernels_iter, all_op_kernels.end(), - platform::errors::Unavailable( - "There are no kernels which are registered in the %s operator.", - op_type)); - OpKernelMap &kernels = kernels_iter->second; - paddle::framework::OpKernelType expected_kernel_key( 
- paddle::framework::ToDataType(typeid(T)), ctx.GetPlace()); - auto kernel_iter = kernels.find(expected_kernel_key); - PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), - platform::errors::NotFound( - "Operator (%s) does not have kernel for %s.", - op_type, KernelTypeToString(expected_kernel_key))); - std::unique_ptr kernel_func_( - new OpKernelFunc(kernel_iter->second)); - (*kernel_func_)(ctx); - return; - } -#endif - - // update paddings - auto in_x_dims = input->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - // ------- tensor grad -------------- - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - Tensor transformed_output_grad(output_grad->type()); - - input_grad->mutable_data(ctx.GetPlace()); - Tensor transformed_input_grad(input_grad->type()); - DataLayout layout; - const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; - const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; - if (data_format == str_NDHWC) { - layout = DataLayout::kNCDHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 4, 1, 2, 3}; - - // input - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[4]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - in_dims_vec[4] = input->dims()[3]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans5; - trans5(dev_ctx, *input, &transformed_input, axis); - - // output - transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[4]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - out_dims_vec[4] = output->dims()[3]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); - - transformed_output.mutable_data(ctx.GetPlace(), output->type()); - - phi::funcs::Transpose - trans5_v2; - trans5_v2(dev_ctx, *output, &transformed_output, axis); - - // output grad - transformed_output_grad.Resize(phi::make_ddim(out_dims_vec)); - transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); - - phi::funcs::Transpose - trans5_v3; - trans5_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); - - // input grad - transformed_input_grad.Resize(phi::make_ddim(in_dims_vec)); - -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - } else if (data_format == str_NHWC) { - layout = DataLayout::kNCHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 3, 1, 2}; - - // input - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[3]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans4; - trans4(dev_ctx, *input, &transformed_input, axis); - - // output - 
transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[3]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); - - transformed_output.mutable_data(ctx.GetPlace(), output->type()); - - phi::funcs::Transpose - trans4_v2; - trans4_v2(dev_ctx, *output, &transformed_output, axis); - - // output grad - transformed_output_grad.Resize(phi::make_ddim(out_dims_vec)); - transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); - - phi::funcs::Transpose - trans4_v3; - trans4_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); - - // input grad - transformed_input_grad.Resize(phi::make_ddim(in_dims_vec)); -#endif - } else { - layout = getLayoutFromStr(data_format); - transformed_input = *input; - transformed_output = *output; - transformed_output_grad = *output_grad; - transformed_input_grad = *input_grad; - } - - const T *input_data = transformed_input.data(); - const T *output_data = transformed_output.data(); - const T *output_grad_data = transformed_output_grad.data(); - - // ------------------- cudnn descriptors --------------------- - ScopedTensorDescriptor input_desc; - ScopedTensorDescriptor output_desc; - ScopedPoolingDescriptor pool_desc; - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#else - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#endif - PoolingMode pooling_mode; - if (pooling_type == "max") { - if (FLAGS_cudnn_deterministic) { - pooling_mode = PoolingMode::kMaximumDeterministic; - } else { - pooling_mode = PoolingMode::kMaximum; - } - } else { - pooling_mode = exclusive ? PoolingMode::kAverageExclusive - : PoolingMode::kAverageInclusive; - } - -#ifdef PADDLE_WITH_HIP - miopenPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#else - cudnnPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#endif - - // ------------------- cudnn pool algorithm --------------------- - auto handle = ctx.cuda_device_context().cudnn_handle(); - ScalingParamType alpha = 1.0f, beta = 0.0f; - if (input_grad) { - T *input_grad_data = transformed_input_grad.mutable_data( - transformed_input_grad.dims(), ctx.GetPlace()); -// Because beta is zero, it is unnecessary to reset input_grad. 
-#ifdef PADDLE_WITH_HIP - char *pool_workspace; - size_t pool_worksize = 0; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenPoolingGetWorkSpaceSizeV2( - cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingBackward( - handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, - cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, - &beta, cudnn_input_desc, input_grad_data, pool_workspace)); - PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingBackward( - handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, - cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, - &beta, cudnn_input_desc, input_grad_data)); -#endif - - if (data_format == str_NDHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 4, 1}; - phi::funcs::Transpose - trans5_v4; - trans5_v4(dev_ctx, transformed_input_grad, input_grad, axis); - } -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - if (data_format == str_NHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 1}; - phi::funcs::Transpose - trans4_v4; - trans4_v4(dev_ctx, transformed_input_grad, input_grad, axis); - } -#endif - } - } -}; - -template -class PoolCUDNNGradGradOpKernel : public PoolCUDNNOpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - std::string pooling_type = ctx.Attr("pooling_type"); - if (pooling_type == "max") { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op grad grad only supports avgpool.")); - } else { - PoolCUDNNOpKernel::Compute(ctx); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); - -REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel); -#else -REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); -REGISTER_OP_KERNEL(pool2d_grad_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradGradOpKernel, - ops::PoolCUDNNGradGradOpKernel, - ops::PoolCUDNNGradGradOpKernel); - -REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); -#endif diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index ae095c2fa7a..44f3d8090e5 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -15,6 +15,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/pool_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" + #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -23,125 +29,6 @@ limitations under the License. */ namespace paddle { namespace operators { -int PoolOutputSize(int input_size, int filter_size, int padding_1, - int padding_2, int stride, bool ceil_mode) { - int output_size; - if (!ceil_mode) { - output_size = - (input_size - filter_size + padding_1 + padding_2) / stride + 1; - } else { - output_size = - (input_size - filter_size + padding_1 + padding_2 + stride - 1) / - stride + - 1; - } - PADDLE_ENFORCE_GT( - output_size, 0, - platform::errors::InvalidArgument( - "the output size must be greater than 0. But received: " - "output_size = %d due to the settings of input_size(%d), " - "padding(%d,%d), " - "k_size(%d) and stride(%d). Please check again!", - output_size, input_size, padding_1, padding_2, filter_size, stride)); - return output_size; -} - -void PoolOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of Pool operator is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of Pool operator is not found.")); - - std::string pooling_type = ctx->Attrs().Get("pooling_type"); - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - bool ceil_mode = ctx->Attrs().Get("ceil_mode"); - bool adaptive = ctx->Attrs().Get("adaptive"); - bool global_pooling = ctx->Attrs().Get("global_pooling"); - std::string data_format = ctx->Attrs().Get("data_format"); - std::string padding_algorithm = - ctx->Attrs().Get("padding_algorithm"); - - auto in_x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - in_x_dims.size() == 4 || in_x_dims.size() == 5, true, - platform::errors::InvalidArgument( - "the input of Op(pool) should be 4-D or 5-D Tensor. But " - "received: %u-D Tensor and it's shape is [%s].", - in_x_dims.size(), in_x_dims)); - - PADDLE_ENFORCE_EQ( - in_x_dims.size() - ksize.size(), 2U, - platform::errors::InvalidArgument( - "the dimension of input minus the size of " - "Attr(ksize) must be euqal to 2 in Op(pool). " - "But received: the dimension of input minus the size " - "of Attr(ksize) is %d, the " - "input's dimension is %d, the shape of input " - "is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].", - in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims, - ksize.size(), phi::make_ddim(ksize))); - - PADDLE_ENFORCE_EQ( - ksize.size(), strides.size(), - platform::errors::InvalidArgument( - "the size of Attr(ksize) and Attr(strides) in " - "Op(pool) must be equal. 
" - "But received: Attr(ksize)'s size is %d, Attr(strides)'s " - "size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].", - ksize.size(), strides.size(), phi::make_ddim(ksize), - phi::make_ddim(strides))); - - // MKL-DNN Kernels are using NCHW order of dims description - // so we ignore data_format consideration for MKL-DNN kernel - const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) && - (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings if "SAME" or global_pooling - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - std::vector output_shape; - if (adaptive) { - output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); - } else { - for (int i = 0; i < data_dims.size(); ++i) { - if ((!ctx->IsRuntime()) && (data_dims[i] < 0)) { - output_shape.push_back(data_dims[i]); - } else { - output_shape.push_back( - PoolOutputSize(data_dims[i], ksize[i], paddings[2 * i], - paddings[2 * i + 1], strides[i], ceil_mode)); - } - } - } - - // output_N = input_N - output_shape.insert(output_shape.begin(), in_x_dims[0]); - // output_C = input_C - if (channel_last) { - output_shape.push_back(in_x_dims[in_x_dims.size() - 1]); - } else { - output_shape.insert(output_shape.begin() + 1, in_x_dims[1]); - } - - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->ShareLoD("X", "Out"); -} - bool CanMKLDNNSupportPool(const framework::ExecutionContext& ctx) { if (ctx.Attr("adaptive") == false) return true; // (jczaja): oneDNN is supporting only unchangable in size pool window @@ -216,16 +103,6 @@ framework::OpKernelType PoolOp::GetKernelTypeForVar( tensor.place(), tensor.layout()); } -void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::NotFound( - "Input(X) of Pool Gradoperator is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::NotFound( - "Input(X@GRAD) of Pool Gradoperator is not found.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); -} - framework::OpKernelType PoolOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; @@ -471,7 +348,7 @@ class Pool2dOpGradGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("pool2d_grad_grad"); + grad_op->SetType("pool2d_double_grad"); grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); grad_op->SetAttrMap(this->Attrs()); @@ -692,35 +569,34 @@ Example: namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(pool2d, Pool2dInferShapeFunctor, + PD_INFER_META(phi::PoolInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pool2d_grad, Pool2dGradInferShapeFunctor, + PD_INFER_META(phi::PoolGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pool2d_double_grad, + Pool2dDoubleGradInferShapeFunctor, + PD_INFER_META(phi::PoolInferMeta)); + REGISTER_OPERATOR( pool2d, ops::PoolOp, ops::Pool2dOpMaker, ops::PoolOpInferVarType, paddle::framework::DefaultGradOpMaker, - paddle::framework::DefaultGradOpMaker); + 
paddle::framework::DefaultGradOpMaker, + Pool2dInferShapeFunctor); REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad, ops::Pool2dOpGradGradMaker, - ops::Pool2dOpGradGradMaker); -REGISTER_OPERATOR(pool2d_grad_grad, ops::PoolOp); - -REGISTER_OP_CPU_KERNEL( - pool2d, ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL( - pool2d_grad, ops::PoolGradKernel, - ops::PoolGradKernel); -REGISTER_OP_CPU_KERNEL( - pool2d_grad_grad, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel); + ops::Pool2dOpGradGradMaker, + Pool2dGradInferShapeFunctor); +REGISTER_OPERATOR(pool2d_double_grad, ops::PoolOp, + Pool2dDoubleGradInferShapeFunctor); + +DECLARE_INFER_SHAPE_FUNCTOR(pool3d, Pool3dInferShapeFunctor, + PD_INFER_META(phi::PoolInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pool3d_grad, Pool3dGradInferShapeFunctor, + PD_INFER_META(phi::PoolGradInferMeta)); REGISTER_OPERATOR( pool3d, ops::PoolOp, ops::Pool3dOpMaker, ops::PoolOpInferVarType, paddle::framework::DefaultGradOpMaker, - paddle::framework::DefaultGradOpMaker); -REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad); - -REGISTER_OP_CPU_KERNEL( - pool3d, ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL( - pool3d_grad, ops::PoolGradKernel, - ops::PoolGradKernel); + paddle::framework::DefaultGradOpMaker, + Pool3dInferShapeFunctor); +REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad, Pool3dGradInferShapeFunctor); diff --git a/paddle/fluid/operators/pool_op.cu b/paddle/fluid/operators/pool_op.cu deleted file mode 100644 index 069ce0c1fda..00000000000 --- a/paddle/fluid/operators/pool_op.cu +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/pool_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - pool2d, ops::PoolKernel, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CUDA_KERNEL( - pool2d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel, - ops::PoolGradKernel); - -REGISTER_OP_CUDA_KERNEL( - pool2d_grad_grad, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel); - -REGISTER_OP_CUDA_KERNEL( - pool3d, ops::PoolKernel, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CUDA_KERNEL( - pool3d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel, - ops::PoolGradKernel); diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index bea6506ee86..d48ac3bd358 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -12,19 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#pragma once +// NOTE(Ruibiao): Difficult to remove code from this header file because too +// many files rely on it through "mkldnn_reuse.h" -#include -#include -#include +#pragma once -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__HIPCC__) || defined(__NVCC__) -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#endif namespace paddle { namespace operators { @@ -35,8 +28,6 @@ class PoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; @@ -50,8 +41,6 @@ class PoolOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; @@ -71,292 +60,5 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override; }; -template -inline void UpdatePadding(std::vector* paddings, const bool global_pooling, - const bool adaptive, - const std::string padding_algorithm, - const framework::DDim data_dims, - const std::vector& strides, - const std::vector& ksize) { - // set padding size == data_dims.size() * 2 - auto data_shape = phi::vectorize(data_dims); - if (static_cast(paddings->size()) == data_dims.size()) { - for (int i = 0; i < data_dims.size(); ++i) { - T copy_pad = *(paddings->begin() + 2 * i); - paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); - } - } else { - PADDLE_ENFORCE_EQ(data_dims.size() * 2, paddings->size(), - platform::errors::InvalidArgument( - "Paddings size %d should be the same or twice as the " - "pooling size %d.", - paddings->size(), data_dims.size() * 2)); - } - - // when padding_algorithm is "VALID" or "SAME" - if (padding_algorithm == "SAME") { - for (int i = 0; i < data_dims.size(); ++i) { - T out_size = (data_dims[i] + strides[i] - 1) / strides[i]; - T pad_sum = - std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i], - static_cast(0)); - T pad_0 = pad_sum / 2; - T pad_1 = pad_sum - pad_0; - *(paddings->begin() + i * 2) = pad_0; - *(paddings->begin() + i * 2 + 1) = pad_1; - } - } else if (padding_algorithm == "VALID") { - for (auto it = paddings->begin(); it != paddings->end(); it++) { - *it = 0; - } - } - - // if global_pooling == true or adaptive == true, padding will be ignore - if (global_pooling || adaptive) { - for (auto it = paddings->begin(); it != paddings->end(); it++) { - *it = 0; - } - } -} - -template -inline void UpdateKsize(std::vector* ksize, - const framework::DDim data_dims) { - ksize->resize(static_cast(data_dims.size())); - for (size_t i = 0; i < ksize->size(); ++i) { - *(ksize->begin() + i) = static_cast(data_dims[i]); - } -} - -inline int getReduceNum(const framework::Tensor& input, - const framework::Tensor* output, - const std::string data_format, - std::vector* reduce_dim) { - // data_format only can be NCHW - bool channel_last = (data_format == "NHWC"); - if (channel_last) { - return 0; - } - int reduce_num = 0; - const int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; - if ((output_height == 1) && (output_width == 1)) { - 
reduce_dim->push_back(2); - reduce_dim->push_back(3); - reduce_num = input.dims()[2] * input.dims()[3]; - } - return reduce_num; -} - -template -class PoolKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - - std::string pooling_type = context.Attr("pooling_type"); - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::string data_format = context.Attr("data_format"); - bool exclusive = context.Attr("exclusive"); - bool adaptive = context.Attr("adaptive"); - bool global_pooling = context.Attr("global_pooling"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = in_x->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - auto& dev_ctx = context.template device_context(); - switch (ksize.size()) { - case 2: { - if (pooling_type == "max") { - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::MaxPool, T> - pool2d_forward; - paddle::operators::math::MaxPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, - true, false, out, pool_process); - - } else if (pooling_type == "avg") { - std::vector reduce_dim; - int reduce_num = getReduceNum(*in_x, out, data_format, &reduce_dim); - if (reduce_num > 0 && - adaptive) { // for adaptive_avg_pool2d && output_size == 1 -#if defined(__HIPCC__) || defined(__NVCC__) - auto stream = dev_ctx.stream(); - TensorReduceImpl>( - dev_ctx, *in_x, out, kps::DivideFunctor(reduce_num), - reduce_dim, stream); -#else // for cpu - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool2d_forward; - paddle::operators::math::AvgPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, - data_format, exclusive, adaptive, out, pool_process); -#endif - } else { // avgpool_2d or adaptive_avg_pool2d && output_size != 1 - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool2d_forward; - paddle::operators::math::AvgPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, - data_format, exclusive, adaptive, out, pool_process); - } - } - } break; - case 3: { - if (pooling_type == "max") { - paddle::operators::math::Pool3dFunctor< - DeviceContext, paddle::operators::math::MaxPool, T> - pool3d_forward; - paddle::operators::math::MaxPool pool_process; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, - true, false, out, pool_process); - - } else if (pooling_type == "avg") { - paddle::operators::math::Pool3dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool3d_forward; - paddle::operators::math::AvgPool pool_process; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, 
- exclusive, adaptive, out, pool_process); - } - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } -}; - -template -class PoolGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* out = context.Input("Out"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - - std::string pooling_type = context.Attr("pooling_type"); - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool exclusive = context.Attr("exclusive"); - bool adaptive = context.Attr("adaptive"); - std::string data_format = context.Attr("data_format"); - bool global_pooling = context.Attr("global_pooling"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = in_x->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - auto& dev_ctx = context.template device_context(); - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_constant; - set_constant(dev_ctx, in_x_grad, static_cast(0.0)); - - switch (ksize.size()) { - case 2: { - if (pooling_type == "max") { - paddle::operators::math::MaxPool2dGradFunctor - pool2d_backward; - pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, in_x_grad); - } else if (pooling_type == "avg") { - paddle::operators::math::Pool2dGradFunctor< - DeviceContext, paddle::operators::math::AvgPoolGrad, T> - pool2d_backward; - paddle::operators::math::AvgPoolGrad pool_process; - pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, exclusive, adaptive, - in_x_grad, pool_process); - } - } break; - case 3: { - if (pooling_type == "max") { - paddle::operators::math::MaxPool3dGradFunctor - pool3d_backward; - pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, in_x_grad); - } else if (pooling_type == "avg") { - paddle::operators::math::Pool3dGradFunctor< - DeviceContext, paddle::operators::math::AvgPoolGrad, T> - pool3d_backward; - paddle::operators::math::AvgPoolGrad pool_process; - pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, exclusive, adaptive, - in_x_grad, pool_process); - } - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } - } -}; - -template -class PoolGradGradKernel : public PoolKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::string pooling_type = context.Attr("pooling_type"); - if (pooling_type == "max") { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op grad grad 
only supports avgpool.")); - } else { - PoolKernel::Compute(context); - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc index 08656e64231..fa88d128a9a 100644 --- a/paddle/fluid/operators/pool_op_mlu.cc +++ b/paddle/fluid/operators/pool_op_mlu.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -80,10 +81,10 @@ class MLUPoolOpKernel : public framework::OpKernel { data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); if (global_pooling) { - UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } MLUCnnlTensorDesc in_x_desc(*in_x, cnnl_layout, ToCnnlDataType()); @@ -191,10 +192,10 @@ class MLUPoolGradOpKernel : public framework::OpKernel { data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); if (global_pooling) { - UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } // inputs need with NHWC layout diff --git a/paddle/fluid/operators/pool_op_npu.cc b/paddle/fluid/operators/pool_op_npu.cc index bd26d6350d9..0efcb8b7981 100644 --- a/paddle/fluid/operators/pool_op_npu.cc +++ b/paddle/fluid/operators/pool_op_npu.cc @@ -11,8 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pool_op.h" + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -68,8 +70,8 @@ class NPUPoolOpKernel : public framework::OpKernel { strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); PADDLE_ENFORCE_LT( std::max(paddings[0], paddings[1]), ksize[0], platform::errors::InvalidArgument( @@ -201,8 +203,8 @@ class NPUPoolGradOpKernel : public framework::OpKernel { strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); PADDLE_ENFORCE_LT( std::max(paddings[0], paddings[1]), ksize[0], diff --git a/paddle/fluid/operators/pool_op_xpu.cc b/paddle/fluid/operators/pool_op_xpu.cc index 402dd6c1080..87c437d8a78 100644 --- a/paddle/fluid/operators/pool_op_xpu.cc +++ b/paddle/fluid/operators/pool_op_xpu.cc @@ -8,13 +8,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pool_op.h" + #include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #ifdef PADDLE_WITH_XPU namespace paddle { namespace operators { +using framework::Tensor; + xpu::Pooling_t XPUPoolingType(const std::string& pooltype, bool exclusive, bool is_test) { if (pooltype == "max") { diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index d061f9ae056..e0341f4a4b4 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -12,8 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pool_with_index_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,71 +32,6 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of Pooling should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of Pooling should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Mask"), true, - platform::errors::InvalidArgument( - "Output(Mask) of Pooling should not be null.")); - - auto in_x_dims = ctx->GetInputDim("X"); - - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - bool adaptive = ctx->Attrs().Get("adaptive"); - - PADDLE_ENFORCE( - in_x_dims.size() == 4 || in_x_dims.size() == 5, - platform::errors::InvalidArgument("Pooling intput should be 4-D or 5-D " - "tensor but received %dD-Tensor", - in_x_dims.size())); - - if (ctx->Attrs().Get("global_pooling")) { - ksize.resize(static_cast(in_x_dims.size()) - 2); - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x_dims[i + 2]); - } - } - - PADDLE_ENFORCE_EQ( - in_x_dims.size() - ksize.size(), 2U, - platform::errors::InvalidArgument( - "The input size %d minus the kernel size %d should equal to 2.", - in_x_dims.size(), ksize.size())); - PADDLE_ENFORCE_EQ( - ksize.size(), strides.size(), - platform::errors::InvalidArgument( - "Strides size %d and pooling size %d should be the same.", - strides.size(), ksize.size())); - PADDLE_ENFORCE_EQ( - ksize.size(), paddings.size(), - platform::errors::InvalidArgument( - "Paddings size %d and pooling size %d should be the same.", - paddings.size(), ksize.size())); - - std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - if (adaptive) { - output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); - } else { - for (size_t i = 0; i < ksize.size(); ++i) { - if ((!ctx->IsRuntime()) && (in_x_dims[i + 2] < 0)) { - output_shape.push_back(in_x_dims[i + 2]); - } else { - output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i])); - } - } - } - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->SetOutputDim("Mask", phi::make_ddim(output_shape)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -106,22 +45,6 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Mask"), true, - platform::errors::InvalidArgument("Input(Mask) must not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::InvalidArgument("Input(X) must not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument( - "Input(Out@GRAD) should not be null.")); - 
PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::InvalidArgument( - "Output(X@GRAD) should not be null.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -335,40 +258,34 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER( namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(max_pool2d_with_index, + MaxPool2dWithIndexInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool2d_with_index_grad, + MaxPool2dWithIndexGradInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexGradInferMeta)); + REGISTER_OPERATOR(max_pool2d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool2dWithIndexOpMaker, ops::MaxPoolWithIndexGradOpMaker, - ops::MaxPoolWithIndexGradOpMaker); + ops::MaxPoolWithIndexGradOpMaker, + MaxPool2dWithIndexInferShapeFunctor); REGISTER_OPERATOR(max_pool2d_with_index_grad, ops::MaxPoolWithIndexOpGrad, - ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer); + ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer, + MaxPool2dWithIndexGradInferShapeFunctor); -REGISTER_OP_CPU_KERNEL( - max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CPU_KERNEL( - max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool3d_with_index, + MaxPool3dWithIndexInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool3d_with_index_grad, + MaxPool3dWithIndexGradInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexGradInferMeta)); REGISTER_OPERATOR(max_pool3d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool3dWithIndexOpMaker, ops::MaxPoolWithIndexGradOpMaker, - ops::MaxPoolWithIndexGradOpMaker); + ops::MaxPoolWithIndexGradOpMaker, + MaxPool3dWithIndexInferShapeFunctor); REGISTER_OPERATOR(max_pool3d_with_index_grad, ops::MaxPoolWithIndexOpGrad, - ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer); - -REGISTER_OP_CPU_KERNEL( - max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CPU_KERNEL( - max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); + ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer, + MaxPool3dWithIndexGradInferShapeFunctor); diff --git a/paddle/fluid/operators/pool_with_index_op.cu.cc b/paddle/fluid/operators/pool_with_index_op.cu.cc deleted file mode 100644 index 5497dcbd9ce..00000000000 --- a/paddle/fluid/operators/pool_with_index_op.cu.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/pool_with_index_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CUDA_KERNEL( - max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); - -REGISTER_OP_CUDA_KERNEL( - max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CUDA_KERNEL( - max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h deleted file mode 100644 index 6e51a833f5c..00000000000 --- a/paddle/fluid/operators/pool_with_index_op.h +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MaxPoolWithIndexKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - Tensor* mask = context.Output("Mask"); - - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool adaptive = context.Attr("adaptive"); - - auto& dev_ctx = context.template device_context(); - if (context.Attr("global_pooling")) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x->dims()[i + 2]); - } - } - - switch (ksize.size()) { - case 2: { - paddle::operators::math::MaxPool2dWithIndexFunctor - pool2d_forward; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, - mask); - } break; - case 3: { - paddle::operators::math::MaxPool3dWithIndexFunctor - pool3d_forward; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, - mask); - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } -}; - -template -class MaxPoolWithIndexGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* mask = context.Input("Mask"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool adaptive = context.Attr("adaptive"); - if (context.Attr("global_pooling")) { - for (size_t i = 0; i < 
ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x_grad->dims()[i + 2]); - } - } - - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - auto& device_ctx = context.template device_context(); - phi::funcs::set_constant(device_ctx, in_x_grad, 0); - - switch (ksize.size()) { - case 2: { - paddle::operators::math::MaxPool2dWithIndexGradFunctor - pool2d_backward; - pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, adaptive, in_x_grad); - } break; - case 3: { - paddle::operators::math::MaxPool3dWithIndexGradFunctor - pool3d_backward; - pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, adaptive, in_x_grad); - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index bff8061814a..aa944cfcfbb 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -16,9 +16,10 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -53,14 +54,20 @@ class SppKernel : public framework::OpKernel { out_level.mutable_data(output_shape, context.GetPlace()); // pooling if (pooling_type == "max") { - math::Pool2dFunctor, T> pool_forward; - math::MaxPool max_process; + phi::funcs::Pool2dFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::MaxPool, T> + pool_forward; + phi::funcs::MaxPool max_process; pool_forward(context.template device_context(), *in_x, kernel_size, strides, paddings, true, false, &out_level, max_process); } else if (pooling_type == "avg") { - math::Pool2dFunctor, T> pool_forward; - math::AvgPool avg_process; + phi::funcs::Pool2dFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::AvgPool, T> + pool_forward; + phi::funcs::AvgPool avg_process; pool_forward(context.template device_context(), *in_x, kernel_size, strides, paddings, true, false, &out_level, avg_process); @@ -95,7 +102,9 @@ class SppGradKernel : public framework::OpKernel { std::string pooling_type = context.template Attr("pooling_type"); auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; + phi::funcs::SetConstant< + typename framework::ConvertToPhiContext::TYPE, T> + zero; in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); auto out_stride = phi::stride(out->dims()); @@ -145,14 +154,18 @@ class SppGradKernel : public framework::OpKernel { outgrad_level.Resize(out_shape); // pooling backward if (pooling_type == "max") { - math::MaxPool2dGradFunctor pool2d_backward; + phi::funcs::MaxPool2dGradFunctor< + typename framework::ConvertToPhiContext::TYPE, T> + pool2d_backward; pool2d_backward(context.template device_context(), *in_x, *&out_level, *&outgrad_level, kernel_size, strides, paddings, in_x_grad); } else if (pooling_type == "avg") { - math::Pool2dGradFunctor, T> + phi::funcs::Pool2dGradFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::AvgPoolGrad, T> pool_backward; - math::AvgPoolGrad avg_process; + phi::funcs::AvgPoolGrad avg_process; 
pool_backward(context.template device_context(), *in_x, *&out_level, *&outgrad_level, kernel_size, strides, paddings, true, false, in_x_grad, avg_process); diff --git a/paddle/fluid/operators/squeeze_op.h b/paddle/fluid/operators/squeeze_op.h index 58e54406899..a776a78616b 100644 --- a/paddle/fluid/operators/squeeze_op.h +++ b/paddle/fluid/operators/squeeze_op.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h index 7f676cbb65e..f6112fb59c1 100644 --- a/paddle/fluid/operators/unsqueeze_op.h +++ b/paddle/fluid/operators/unsqueeze_op.h @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index 79f8d1c057e..10c3a7c1a3d 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -26,11 +26,13 @@ namespace phi { // TODO(chenweihang): add other flags if needed struct MetaConfig { bool is_runtime{true}; - + bool is_run_mkldnn_kernel{false}; MetaConfig() = default; // supporting implicit construction is easier to use - MetaConfig(bool is_runtime) : is_runtime(is_runtime) {} // NOLINT + MetaConfig(bool is_runtime, bool is_run_mkldnn_kernel) + : is_runtime(is_runtime), + is_run_mkldnn_kernel(is_run_mkldnn_kernel) {} // NOLINT }; class MetaTensor { diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index a2bdf6b963b..37d1a234b57 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -122,6 +122,35 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, dx->share_meta(dout); } +void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, + const MetaTensor& mask, + const MetaTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + MetaTensor* dx) { + dx->share_meta(x); +} + +void PoolGradInferMeta(const MetaTensor& x, + const MetaTensor& out, + const MetaTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + MetaTensor* dx) { + dx->share_meta(x); +} + void PsroiPoolGradInferMeta(const MetaTensor& x, const MetaTensor& rois, paddle::optional rois_num, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 921df460118..06ee5a205d7 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -54,6 +54,16 @@ void GumbelSoftmaxGradInferMeta(const MetaTensor& out, int axis, MetaTensor* dx); +void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, + const MetaTensor& mask, + const MetaTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + MetaTensor* dx); + void PsroiPoolGradInferMeta(const 
MetaTensor& x, const MetaTensor& rois, paddle::optional rois_num, @@ -64,6 +74,21 @@ void PsroiPoolGradInferMeta(const MetaTensor& x, float spatial_scale, MetaTensor* dx); +void PoolGradInferMeta(const MetaTensor& x, + const MetaTensor& out, + const MetaTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + MetaTensor* dx); + void ScatterGradInferMeta(const MetaTensor& index, const MetaTensor& updates, const MetaTensor& out_grad, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index f7693c2f90a..d09a2191fb2 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/unfold_functor.h" namespace phi { @@ -553,6 +554,78 @@ void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) { out->set_dtype(DataType::BOOL); } +void MaxPoolWithIndexInferMeta(const MetaTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + MetaTensor* out, + MetaTensor* mask, + MetaConfig config) { + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + auto x_dims = x.dims(); + + PADDLE_ENFORCE( + x_dims.size() == 4 || x_dims.size() == 5, + errors::InvalidArgument( + "Pooling intput should be 4-D or 5-D tensor but received %dD-Tensor", + x_dims.size())); + + if (global_pooling) { + kernel_size_.resize(static_cast(x_dims.size()) - 2); + for (size_t i = 0; i < kernel_size_.size(); ++i) { + paddings_[i] = 0; + kernel_size_[i] = static_cast(x_dims[i + 2]); + } + } + + PADDLE_ENFORCE_EQ( + x_dims.size() - kernel_size_.size(), + 2U, + errors::InvalidArgument( + "The input size %d minus the kernel size %d should equal to 2.", + x_dims.size(), + kernel_size_.size())); + PADDLE_ENFORCE_EQ( + kernel_size_.size(), + strides.size(), + errors::InvalidArgument( + "Strides size %d and pooling size %d should be the same.", + strides.size(), + kernel_size_.size())); + PADDLE_ENFORCE_EQ( + kernel_size_.size(), + paddings_.size(), + errors::InvalidArgument( + "Paddings size %d and pooling size %d should be the same.", + paddings_.size(), + kernel_size_.size())); + + std::vector output_shape({x_dims[0], x_dims[1]}); + if (adaptive) { + output_shape.insert( + output_shape.end(), kernel_size_.begin(), kernel_size_.end()); + } else { + for (size_t i = 0; i < kernel_size_.size(); ++i) { + if ((!config.is_runtime) && (x_dims[i + 2] < 0)) { + output_shape.push_back(x_dims[i + 2]); + } else { + output_shape.push_back(funcs::MaxPoolOutputSize( + x_dims[i + 2], kernel_size_[i], paddings_[i], strides[i])); + } + } + } + + out->set_dims(make_ddim(output_shape)); + out->set_dtype(x.dtype()); + + mask->set_dims(make_ddim(output_shape)); + mask->set_dtype(paddle::experimental::CppTypeToDataType::Type()); +} + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, @@ -675,6 +748,118 @@ void PixelShuffleInferMeta(const MetaTensor& x, out->set_dims(output_dims); } +void PoolInferMeta(const MetaTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const 
std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + MetaTensor* out, + MetaConfig config) { + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + auto x_dims = x.dims(); + PADDLE_ENFORCE_EQ( + x_dims.size() == 4 || x_dims.size() == 5, + true, + errors::InvalidArgument( + "the input of Op(pool) should be 4-D or 5-D Tensor. But " + "received: %u-D Tensor and it's shape is [%s].", + x_dims.size(), + x_dims)); + + PADDLE_ENFORCE_EQ(x_dims.size() - kernel_size_.size(), + 2U, + errors::InvalidArgument( + "the dimension of input minus the size of " + "Attr(kernel_size_) must be euqal to 2 in Op(pool). " + "But received: the dimension of input minus the size " + "of Attr(kernel_size_) is %d, the " + "input's dimension is %d, the shape of input " + "is [%s], the Attr(kernel_size_)'s size is %d, the " + "Attr(kernel_size_) is [%s].", + x_dims.size() - kernel_size_.size(), + x_dims.size(), + x_dims, + kernel_size_.size(), + make_ddim(kernel_size_))); + + PADDLE_ENFORCE_EQ( + kernel_size_.size(), + strides.size(), + errors::InvalidArgument( + "the size of Attr(kernel_size_) and Attr(strides) in " + "Op(pool) must be equal. " + "But received: Attr(kernel_size_)'s size is %d, Attr(strides)'s " + "size is %d, Attr(kernel_size_) is [%s], Attr(strides)is [%s].", + kernel_size_.size(), + strides.size(), + make_ddim(kernel_size_), + make_ddim(strides))); + + // MKL-DNN Kernels are using NCHW order of dims description + // so we ignore data_format consideration for MKL-DNN kernel + const bool channel_last = (config.is_run_mkldnn_kernel == false) && + (data_format == "NHWC" || data_format == "NDHWC"); + + // update paddings if "SAME" or global_pooling + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } else { + data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + std::vector output_shape; + if (adaptive) { + output_shape.insert( + output_shape.end(), kernel_size_.begin(), kernel_size_.end()); + } else { + for (int i = 0; i < data_dims.size(); ++i) { + if ((!config.is_runtime) && (data_dims[i] < 0)) { + output_shape.push_back(data_dims[i]); + } else { + output_shape.push_back(funcs::PoolOutputSize(data_dims[i], + kernel_size_[i], + paddings_[2 * i], + paddings_[2 * i + 1], + strides[i], + ceil_mode)); + } + } + } + + // output_N = input_N + output_shape.insert(output_shape.begin(), x_dims[0]); + // output_C = input_C + if (channel_last) { + output_shape.push_back(x_dims[x_dims.size() - 1]); + } else { + output_shape.insert(output_shape.begin() + 1, x_dims[1]); + } + + out->set_dims(make_ddim(output_shape)); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out) { out->set_dims(x.dims()); out->set_dtype(dtype::ToReal(x.dtype())); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 539b6dcba42..a1fc6fd4053 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -98,6 +98,16 @@ void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out); void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out); +void MaxPoolWithIndexInferMeta(const MetaTensor& x, + 
const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + MetaTensor* out, + MetaTensor* mask, + MetaConfig config = MetaConfig()); + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, @@ -114,6 +124,20 @@ void PixelShuffleInferMeta(const MetaTensor& x, const std::string& data_format, MetaTensor* out); +void PoolInferMeta(const MetaTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out); void ReduceInferMeta(const MetaTensor& x, diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 093cb654979..d443b7bb2a0 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -11,7 +11,7 @@ set_property(GLOBAL PROPERTY PHI_KERNELS "") # [ 1. Common kernel compilation dependencies ] set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor softmax) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) @@ -27,22 +27,25 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. 
-set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel triangular_solve_grad_kernel maxout_kernel maxout_grad_kernel put_along_axis_kernel put_along_axis_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel eigh_kernel segment_pool_kernel segment_pool_grad_kernel matrix_power_kernel matrix_power_grad_kernel) +set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel math_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel triangular_solve_grad_kernel) +kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) +kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) -kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) +kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) +kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) kernel_library(maxout_grad_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) +kernel_library(pool_kernel DEPS ${COMMON_KERNEL_DEPS} pooling) kernel_library(put_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) kernel_library(put_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) -kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) -kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) -kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) -kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) -kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) kernel_library(segment_pool_kernel DEPS ${COMMON_KERNEL_DEPS} segment_pooling) kernel_library(segment_pool_grad_kernel DEPS ${COMMON_KERNEL_DEPS} segment_pooling) +kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) +kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) +kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) # 4. 
auto parse and build kernel targets by cmake register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS} ) diff --git a/paddle/phi/kernels/concat_kernel.h b/paddle/phi/kernels/concat_kernel.h index 4e72159aeca..cf83ab9aaab 100644 --- a/paddle/phi/kernels/concat_kernel.h +++ b/paddle/phi/kernels/concat_kernel.h @@ -40,7 +40,7 @@ DenseTensor Concat(const Context& dev_ctx, DenseTensor dense_out; MetaTensor meta_out(&dense_out); - ConcatInferMeta(meta_x_ptr, axis.to(), &meta_out, /*is_runtime=*/true); + ConcatInferMeta(meta_x_ptr, axis.to(), &meta_out); ConcatKernel(dev_ctx, x, axis, &dense_out); return dense_out; } diff --git a/paddle/phi/kernels/cpu/pool_grad_kernel.cc b/paddle/phi/kernels/cpu/pool_grad_kernel.cc new file mode 100644 index 00000000000..bb97694d8fc --- /dev/null +++ b/paddle/phi/kernels/cpu/pool_grad_kernel.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pool_grad_kernel.h" + +#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + pool2d_grad, CPU, ALL_LAYOUT, phi::Pool2dGradKernel, float, double) {} +PD_REGISTER_KERNEL(pool2d_double_grad, + CPU, + ALL_LAYOUT, + phi::Pool2dDoubleGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(max_pool2d_with_index_grad, + CPU, + ALL_LAYOUT, + phi::MaxPool2dWithIndexGradKernel, + float, + double) { + kernel->InputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} + +PD_REGISTER_KERNEL( + pool3d_grad, CPU, ALL_LAYOUT, phi::Pool3dGradKernel, float, double) {} +PD_REGISTER_KERNEL(max_pool3d_with_index_grad, + CPU, + ALL_LAYOUT, + phi::MaxPool3dWithIndexGradKernel, + float, + double) { + kernel->InputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/cpu/pool_kernel.cc b/paddle/phi/kernels/cpu/pool_kernel.cc new file mode 100644 index 00000000000..1d57e282c3c --- /dev/null +++ b/paddle/phi/kernels/cpu/pool_kernel.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/pool_kernel.h" + +#include "paddle/phi/kernels/impl/pool_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pool2d, CPU, ALL_LAYOUT, phi::Pool2dKernel, float, double) {} +PD_REGISTER_KERNEL(max_pool2d_with_index, + CPU, + ALL_LAYOUT, + phi::MaxPool2dWithIndexKernel, + float, + double) { + kernel->OutputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} + +PD_REGISTER_KERNEL(pool3d, CPU, ALL_LAYOUT, phi::Pool3dKernel, float, double) {} +PD_REGISTER_KERNEL(max_pool3d_with_index, + CPU, + ALL_LAYOUT, + phi::MaxPool3dWithIndexKernel, + float, + double) { + kernel->OutputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc index 324798effbe..ea8e2702c19 100644 --- a/paddle/phi/kernels/cpu/split_kernel.cc +++ b/paddle/phi/kernels/cpu/split_kernel.cc @@ -38,7 +38,7 @@ void SplitKernel(const Context& dev_ctx, out_metas_ptr.push_back(&out_metas.back()); } - phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr, true); + phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr); for (size_t i = 0; i < out_metas.size(); ++i) { outs[i]->Resize(out_metas[i].dims()); diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index e0db7b51f8e..942eecae168 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -3,11 +3,12 @@ add_subdirectory(blas) add_subdirectory(lapack) add_subdirectory(detail) -math_library(math_function DEPS blas dense_tensor tensor) -math_library(segment_pooling) -math_library(sequence2batch) +math_library(concat_and_split_functor DEPS dense_tensor) math_library(gru_compute DEPS activation_functions math_function) math_library(lstm_compute DEPS activation_functions) -math_library(concat_and_split_functor DEPS dense_tensor) +math_library(math_function DEPS blas dense_tensor tensor) math_library(matrix_reduce DEPS dense_tensor) math_library(matrix_inverse DEPS dense_tensor eigen3 blas) +math_library(pooling DEPS dense_tensor) +math_library(segment_pooling) +math_library(sequence2batch) diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/phi/kernels/funcs/pooling.cc similarity index 83% rename from paddle/fluid/operators/math/pooling.cc rename to paddle/phi/kernels/funcs/pooling.cc index f2e5e955ec4..10c88b9798c 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/phi/kernels/funcs/pooling.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -11,11 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/pooling.h" -namespace paddle { -namespace operators { -namespace math { +#include "paddle/phi/kernels/funcs/pooling.h" + +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" + +namespace phi { +namespace funcs { /* * Tensors are in NCHW or NHWC format. @@ -25,13 +29,16 @@ namespace math { * height_down, width_left and width_right, respectively. 
*/ template -class Pool2dFunctor { +class Pool2dFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; @@ -50,7 +57,7 @@ class Pool2dFunctor { const int output_stride = output_height * output_width; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int hstart = 0, hend = 1; int wstart = 0, wend = 1; @@ -101,12 +108,16 @@ class Pool2dFunctor { } } - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_process) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -131,7 +142,7 @@ class Pool2dFunctor { const int padding_width = paddings[1]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int hstart = 0, hend = 1; int wstart = 0, wend = 1; @@ -244,14 +255,19 @@ class Pool2dFunctor { * height_down, width_left and width_right, respectively. 
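Much of the per-window bookkeeping inside Pool2dFunctor is lost in the flattened diff above, so here is a sketch of the standard bound computation the 2D CPU functors perform for a non-adaptive window; exclusive pooling divides by the number of valid (unpadded) elements, inclusive pooling by the full kernel area:

#include <algorithm>

// For output position (ph, pw), compute the input window
// [hstart, hend) x [wstart, wend) and the divisor used for average pooling.
inline void PoolWindow2D(int ph, int pw, int stride_h, int stride_w,
                         int ksize_h, int ksize_w, int pad_h, int pad_w,
                         int input_h, int input_w, bool exclusive,
                         int* hstart, int* hend, int* wstart, int* wend,
                         int* pool_size) {
  *hstart = ph * stride_h - pad_h;
  *wstart = pw * stride_w - pad_w;
  *hend = std::min(*hstart + ksize_h, input_h);
  *wend = std::min(*wstart + ksize_w, input_w);
  *hstart = std::max(*hstart, 0);
  *wstart = std::max(*wstart, 0);
  // exclusive: count only in-bounds elements; inclusive: full kernel area.
  *pool_size = exclusive ? (*hend - *hstart) * (*wend - *wstart)
                         : ksize_h * ksize_w;
}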
*/ template -class Pool2dGradFunctor { +class Pool2dGradFunctor { public: - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_grad_process) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_grad_process) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -270,7 +286,7 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int hstart = 0, hend = 1; int wstart = 0, wend = 1; @@ -324,13 +340,18 @@ class Pool2dGradFunctor { } } - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, const std::string data_format, - bool exclusive, bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_grad_process) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_grad_process) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -357,7 +378,7 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int hstart = 0, hend = 1; int wstart = 0, wend = 1; @@ -451,10 +472,11 @@ class Pool2dGradFunctor { h * input_width * input_channels + w * input_channels + c; auto output_idx = ph * output_width * output_channels + pw * output_channels + c; - pool_grad_process.compute( - input_data[input_idx], output_data[output_idx], - output_grad_data[output_idx], static_cast(scale), - input_grad_data + input_idx); + pool_grad_process.compute(input_data[input_idx], + output_data[output_idx], + output_grad_data[output_idx], + static_cast(scale), + input_grad_data + input_idx); } } } @@ -477,13 +499,16 @@ class Pool2dGradFunctor { * height_down, width_left and width_right, respectively. 
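pool_grad_process.compute() is what actually distributes the gradient; the flattened diff only shows the call site. As a rough sketch assuming the usual average/max pooling-gradient semantics (these structs are illustrative, not copied from the patch): average pooling adds scale * dout to every element of the window, while max pooling routes dout only to the element equal to the pooled output.

template <typename T>
struct AvgPoolGradSketch {
  static constexpr bool use_x = false;
  inline void compute(T x, T y, T dy, T scale, T* dx) {
    *dx += scale * dy;  // scale is 1 / pool_size
  }
};

template <typename T>
struct MaxPoolGradSketch {
  static constexpr bool use_x = true;
  inline void compute(T x, T y, T dy, T scale, T* dx) {
    *dx += dy * static_cast<T>(x == y);  // only the argmax gets the gradient
  }
};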
*/ template -class MaxPool2dGradFunctor { +class MaxPool2dGradFunctor { public: - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* input_grad) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -502,7 +527,7 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { @@ -536,12 +561,15 @@ class MaxPool2dGradFunctor { } } - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, const std::string data_format, - framework::Tensor* input_grad) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -568,7 +596,7 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); if (!channel_last) { const int input_stride = input_height * input_width; @@ -641,29 +669,17 @@ class MaxPool2dGradFunctor { } } }; -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; - -template class Pool2dFunctor, float>; -template class Pool2dFunctor, float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dFunctor, double>; -template class Pool2dFunctor, double>; -template class Pool2dGradFunctor, - double>; -template class Pool2dGradFunctor, - double>; +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; + +template class Pool2dFunctor, float>; +template class Pool2dFunctor, float>; +template class Pool2dGradFunctor, float>; +template class Pool2dGradFunctor, float>; +template class Pool2dFunctor, double>; +template class Pool2dFunctor, double>; +template class Pool2dGradFunctor, double>; +template class Pool2dGradFunctor, double>; /* * Tensors are in NCDHW or NDHWC format. 
@@ -674,13 +690,16 @@ template class Pool2dGradFunctor -class Pool3dFunctor { +class Pool3dFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; @@ -704,7 +723,7 @@ class Pool3dFunctor { const int output_stride = output_depth * output_height * output_width; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int dstart = 0, dend = 1; int hstart = 0, hend = 1; @@ -771,12 +790,16 @@ class Pool3dFunctor { } } } - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_process) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -807,7 +830,7 @@ class Pool3dFunctor { const int padding_width = paddings[2]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int dstart = 0, dend = 1; int hstart = 0, hend = 1; @@ -966,14 +989,19 @@ class Pool3dFunctor { * height_up, height_down, width_left and width_right, respectively. 
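The channel_last branches above differ only in how the flat element index is formed. For reference, the standard 3D layouts work out to (n = batch, c = channel, d/h/w = spatial; illustrative helpers, not part of the patch):

// NCDHW: channels vary slowest among the per-sample dims.
inline int64_t IndexNCDHW(int64_t n, int64_t c, int64_t d, int64_t h,
                          int64_t w, int64_t C, int64_t D, int64_t H,
                          int64_t W) {
  return (((n * C + c) * D + d) * H + h) * W + w;
}

// NDHWC: channels vary fastest.
inline int64_t IndexNDHWC(int64_t n, int64_t d, int64_t h, int64_t w,
                          int64_t c, int64_t D, int64_t H, int64_t W,
                          int64_t C) {
  return (((n * D + d) * H + h) * W + w) * C + c;
}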
*/ template -class Pool3dGradFunctor { +class Pool3dGradFunctor { public: - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_grad_process) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_grad_process) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -997,7 +1025,7 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int dstart = 0, dend = 1; int hstart = 0, hend = 1; @@ -1051,10 +1079,11 @@ class Pool3dGradFunctor { int input_idx = (d * input_height + h) * input_width + w; int output_idx = (pd * output_height + ph) * output_width + pw; - pool_grad_process.compute( - input_data[input_idx], output_data[output_idx], - output_grad_data[output_idx], static_cast(scale), - input_grad_data + input_idx); + pool_grad_process.compute(input_data[input_idx], + output_data[output_idx], + output_grad_data[output_idx], + static_cast(scale), + input_grad_data + input_idx); } } } @@ -1068,13 +1097,18 @@ class Pool3dGradFunctor { } } } - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, const std::string data_format, - bool exclusive, bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_grad_process) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_grad_process) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1105,7 +1139,7 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int dstart = 0, dend = 1; int hstart = 0, hend = 1; @@ -1164,10 +1198,11 @@ class Pool3dGradFunctor { int input_idx = (d * input_height + h) * input_width + w; int output_idx = (pd * output_height + ph) * output_width + pw; - pool_grad_process.compute( - input_data[input_idx], output_data[output_idx], - output_grad_data[output_idx], static_cast(scale), - input_grad_data + input_idx); + pool_grad_process.compute(input_data[input_idx], + output_data[output_idx], + output_grad_data[output_idx], + static_cast(scale), + input_grad_data + input_idx); } } } @@ -1241,10 +1276,11 @@ class Pool3dGradFunctor { ((pd * output_height + ph) * 
output_width + pw) * output_channels + c; - pool_grad_process.compute( - input_data[input_idx], output_data[output_idx], - output_grad_data[output_idx], static_cast(scale), - input_grad_data + input_idx); + pool_grad_process.compute(input_data[input_idx], + output_data[output_idx], + output_grad_data[output_idx], + static_cast(scale), + input_grad_data + input_idx); } } } @@ -1270,13 +1306,16 @@ class Pool3dGradFunctor { * height_up, height_down, width_left and width_right, respectively. */ template -class MaxPool3dGradFunctor { +class MaxPool3dGradFunctor { public: - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* input_grad) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -1300,7 +1339,7 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { @@ -1342,12 +1381,15 @@ class MaxPool3dGradFunctor { } } } - void operator()( - const platform::CPUDeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, const framework::Tensor& output_grad, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, const std::string data_format, - framework::Tensor* input_grad) { + void operator()(const CPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1378,7 +1420,7 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); if (!channel_last) { const int input_stride = input_depth * input_height * input_width; @@ -1475,29 +1517,17 @@ class MaxPool3dGradFunctor { } } }; -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; - -template class Pool3dFunctor, float>; -template class Pool3dFunctor, float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dFunctor, double>; -template class Pool3dFunctor, double>; -template class Pool3dGradFunctor, - double>; -template class Pool3dGradFunctor, - double>; +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; + +template class Pool3dFunctor, float>; +template class Pool3dFunctor, float>; +template class Pool3dGradFunctor, float>; +template class Pool3dGradFunctor, float>; +template class Pool3dFunctor, double>; +template class 
Pool3dFunctor, double>; +template class Pool3dGradFunctor, double>; +template class Pool3dGradFunctor, double>; /* * All tensors are in NCHW format. @@ -1505,13 +1535,16 @@ template class Pool3dGradFunctor -class MaxPool2dWithIndexFunctor { +class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask) { + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -1528,8 +1561,8 @@ class MaxPool2dWithIndexFunctor { const int output_stride = output_height * output_width; const T1* input_data = input.data(); - T1* output_data = output->mutable_data(context.GetPlace()); - T2* mask_data = mask->mutable_data(context.GetPlace()); + T1* output_data = context.template Alloc(output); + T2* mask_data = context.template Alloc(mask); int hstart, hend; int wstart, wend; @@ -1583,14 +1616,16 @@ class MaxPool2dWithIndexFunctor { * height and width, respectively. */ template -class MaxPool2dWithIndexGradFunctor { +class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad) { + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_height = input_grad->dims()[2]; const int input_width = input_grad->dims()[3]; @@ -1602,7 +1637,7 @@ class MaxPool2dWithIndexGradFunctor { const T2* mask_data = mask.data(); const T1* output_grad_data = output_grad.data(); - T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T1* input_grad_data = context.template Alloc(input_grad); for (int n = 0; n < batch_size; ++n) { for (int c = 0; c < output_channels; ++c) { @@ -1622,14 +1657,10 @@ class MaxPool2dWithIndexGradFunctor { } }; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; /* * All tensors are in NCDHW format. 
@@ -1637,13 +1668,16 @@ template class MaxPool2dWithIndexGradFunctor -class MaxPool3dWithIndexFunctor { +class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask) { + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -1665,8 +1699,8 @@ class MaxPool3dWithIndexFunctor { const int output_stride = output_depth * output_height * output_width; const T1* input_data = input.data(); - T1* output_data = output->mutable_data(context.GetPlace()); - T2* mask_data = mask->mutable_data(context.GetPlace()); + T1* output_data = context.template Alloc(output); + T2* mask_data = context.template Alloc(mask); int dstart, dend; int hstart, hend; @@ -1735,14 +1769,16 @@ class MaxPool3dWithIndexFunctor { * depth, height and width, respectively. */ template -class MaxPool3dWithIndexGradFunctor { +class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, + void operator()(const CPUContext& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad) { + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_depth = input_grad->dims()[2]; const int input_height = input_grad->dims()[3]; @@ -1756,7 +1792,7 @@ class MaxPool3dWithIndexGradFunctor { const T2* mask_data = mask.data(); const T1* output_grad_data = output_grad.data(); - T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T1* input_grad_data = context.template Alloc(input_grad); for (int n = 0; n < batch_size; ++n) { for (int c = 0; c < output_channels; ++c) { @@ -1779,14 +1815,9 @@ class MaxPool3dWithIndexGradFunctor { } }; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -} // namespace math -} // namespace operators -} // namespace paddle +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu similarity index 54% rename from paddle/fluid/operators/math/pooling.cu rename to paddle/phi/kernels/funcs/pooling.cu index 9d96345eb1f..4cf5e1c02c5 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 paddlepaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,63 +12,72 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
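The CUDA side of the rename (below) keeps using paddle::platform::FastDivMod for index decomposition, only fully qualifying the name now that the code lives in phi. Judging from the call sites, Divmod() returns a two-element result whose val[0] is the quotient and val[1] the remainder, which is how OffsetPreparationFor4Dimension splits a flat thread index into (w, h, c, n) offsets. A rough host-side analogue of that chain for NCHW, for orientation only:

// Illustrative only; the device code uses FastDivMod instead of % and /.
inline void DecomposeNCHW(int index, int W, int H, int C,
                          int* w, int* h, int* c, int* n) {
  *w = index % W;  index /= W;
  *h = index % H;  index /= H;
  *c = index % C;  index /= C;
  *n = index;
}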
See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/phi/kernels/funcs/pooling.h" + #include #include - -#include "paddle/fluid/operators/math/pooling.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/fast_divmod.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" -namespace paddle { -namespace operators { -namespace math { +namespace phi { +namespace funcs { struct FastDivModForPooling { public: - platform::FastDivMod channel; - platform::FastDivMod width; - platform::FastDivMod height; + paddle::platform::FastDivMod channel; + paddle::platform::FastDivMod width; + paddle::platform::FastDivMod height; explicit HOSTDEVICE FastDivModForPooling(const int channels, const int output_width, const int output_height) { - channel = platform::FastDivMod(channels); - width = platform::FastDivMod(output_width); - height = platform::FastDivMod(output_height); + channel = paddle::platform::FastDivMod(channels); + width = paddle::platform::FastDivMod(output_width); + height = paddle::platform::FastDivMod(output_height); } }; struct FastDivModForPoolingWithMoreStaff { public: - platform::FastDivMod channel; - platform::FastDivMod width; - platform::FastDivMod height; - platform::FastDivMod ksize_w; - platform::FastDivMod ksize_h; - platform::FastDivMod stride_w; - platform::FastDivMod stride_h; + paddle::platform::FastDivMod channel; + paddle::platform::FastDivMod width; + paddle::platform::FastDivMod height; + paddle::platform::FastDivMod ksize_w; + paddle::platform::FastDivMod ksize_h; + paddle::platform::FastDivMod stride_w; + paddle::platform::FastDivMod stride_h; explicit HOSTDEVICE FastDivModForPoolingWithMoreStaff( - const int channels, const int input_width, const int input_height, - const int ksize_width, const int ksize_height, const int stride_width, + const int channels, + const int input_width, + const int input_height, + const int ksize_width, + const int ksize_height, + const int stride_width, const int stride_height) { - channel = platform::FastDivMod(channels); - width = platform::FastDivMod(input_width); - height = platform::FastDivMod(input_height); - ksize_w = platform::FastDivMod(ksize_width); - ksize_h = platform::FastDivMod(ksize_height); - stride_w = platform::FastDivMod(stride_width); - stride_h = platform::FastDivMod(stride_height); + channel = paddle::platform::FastDivMod(channels); + width = paddle::platform::FastDivMod(input_width); + height = paddle::platform::FastDivMod(input_height); + ksize_w = paddle::platform::FastDivMod(ksize_width); + ksize_h = paddle::platform::FastDivMod(ksize_height); + stride_w = paddle::platform::FastDivMod(stride_width); + stride_h = paddle::platform::FastDivMod(stride_height); } }; template -__device__ void OffsetPreparationFor4Dimension( - int index, bool channel_last, FastDivModForPooling divmods, - const int pad_width, const int pad_height, const int aux_width, - const int aux_height, int* w_offset, int* h_offset, int* c_offset, - int* stride) { +__device__ void OffsetPreparationFor4Dimension(int index, + bool channel_last, + FastDivModForPooling divmods, + const int pad_width, + const int pad_height, + const int aux_width, + const int aux_height, + int* w_offset, + int* h_offset, + int* c_offset, + int* stride) { if (!channel_last) { /* NCHW */ auto input_width_divmod = divmods.width.Divmod(index); auto input_height_divmod = 
divmods.height.Divmod(input_width_divmod.val[0]); @@ -91,21 +100,40 @@ __device__ void OffsetPreparationFor4Dimension( } template -__global__ void KernelPool2D( - const int nthreads, const T* input_data, const int channels, - const int input_height, const int input_width, const int output_height, - const int output_width, const int ksize_height, const int ksize_width, - const int stride_height, const int stride_width, const int padding_height, - const int padding_width, FastDivModForPooling divmods, - PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data, - bool channel_last = false) { +__global__ void KernelPool2D(const int nthreads, + const T* input_data, + const int channels, + const int input_height, + const int input_width, + const int output_height, + const int output_width, + const int ksize_height, + const int ksize_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + FastDivModForPooling divmods, + PoolProcess pool_process, + bool exclusive, + bool adaptive, + T* output_data, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int hstart, hend, wstart, wend; int w_offset, h_offset, c_offset, input_offset; - OffsetPreparationFor4Dimension( - index, channel_last, divmods, 0, 0, input_width, input_height, - &w_offset, &h_offset, &c_offset, &input_offset); + OffsetPreparationFor4Dimension(index, + channel_last, + divmods, + 0, + 0, + input_width, + input_height, + &w_offset, + &h_offset, + &c_offset, + &input_offset); input_data += input_offset; if (adaptive) { @@ -139,25 +167,43 @@ __global__ void KernelPool2D( } template -__global__ void KernelPool2DGrad( - const int nthreads, const T* __restrict__ input_data, - const T* __restrict__ output_data, const const T* __restrict__ output_grad, - const int output_width, const int output_height, const int input_width, - const int input_height, const int ksize_width, const int ksize_height, - const int stride_width, const int stride_height, const int padding_width, - const int padding_height, FastDivModForPoolingWithMoreStaff divmods, - PoolProcess pool_process, bool exclusive, bool adaptive, - T* __restrict__ input_grad, bool channel_last = false) { +__global__ void KernelPool2DGrad(const int nthreads, + const T* __restrict__ input_data, + const T* __restrict__ output_data, + const const T* __restrict__ output_grad, + const int output_width, + const int output_height, + const int input_width, + const int input_height, + const int ksize_width, + const int ksize_height, + const int stride_width, + const int stride_height, + const int padding_width, + const int padding_height, + FastDivModForPoolingWithMoreStaff divmods, + PoolProcess pool_process, + bool exclusive, + bool adaptive, + T* __restrict__ input_grad, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { T input = static_cast(0); T input_grad_data = static_cast(0); int phstart, phend, pwstart, pwend; int w_offset, h_offset, c_offset, output_offset; - OffsetPreparationFor4Dimension<>(index, channel_last, divmods, - padding_width, padding_height, - output_width, output_height, &w_offset, - &h_offset, &c_offset, &output_offset); + OffsetPreparationFor4Dimension<>(index, + channel_last, + divmods, + padding_width, + padding_height, + output_width, + output_height, + &w_offset, + &h_offset, + &c_offset, + &output_offset); if 
(pool_process.use_x) { input = input_data[index]; output_data += output_offset; @@ -188,7 +234,9 @@ __global__ void KernelPool2DGrad( : tmp_idx; T ouput_value = pool_process.use_x ? output_data[output_sub_idx] : static_cast(0); - pool_process.compute(input, ouput_value, output_grad[output_sub_idx], + pool_process.compute(input, + ouput_value, + output_grad[output_sub_idx], static_cast(1.0 / pool_size), &input_grad_data); } @@ -217,9 +265,11 @@ __global__ void KernelPool2DGrad( : tmp_idx; T ouput_value = pool_process.use_x ? output_data[output_sub_idx] : static_cast(0); - pool_process.compute( - input, ouput_value, output_grad[output_sub_idx], - static_cast(1.0 / pool_size), &input_grad_data); + pool_process.compute(input, + ouput_value, + output_grad[output_sub_idx], + static_cast(1.0 / pool_size), + &input_grad_data); } } } else { @@ -232,9 +282,11 @@ __global__ void KernelPool2DGrad( : tmp_idx; T ouput_value = pool_process.use_x ? output_data[output_sub_idx] : static_cast(0); - pool_process.compute( - input, ouput_value, output_grad[output_sub_idx], - static_cast(1.0 / pool_size), &input_grad_data); + pool_process.compute(input, + ouput_value, + output_grad[output_sub_idx], + static_cast(1.0 / pool_size), + &input_grad_data); } } } @@ -244,19 +296,38 @@ __global__ void KernelPool2DGrad( } template -__global__ void KernelMaxPool2DGrad( - const int nthreads, const T* input_data, const T* output_data, - const T* output_grad, const int channels, const int input_height, - const int input_width, const int output_height, const int output_width, - const int ksize_height, const int ksize_width, const int stride_height, - const int stride_width, const int padding_height, const int padding_width, - T* input_grad, FastDivModForPooling divmods, bool channel_last = false) { +__global__ void KernelMaxPool2DGrad(const int nthreads, + const T* input_data, + const T* output_data, + const T* output_grad, + const int channels, + const int input_height, + const int input_width, + const int output_height, + const int output_width, + const int ksize_height, + const int ksize_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + T* input_grad, + FastDivModForPooling divmods, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int w_offset, h_offset, c_offset, input_offset; - OffsetPreparationFor4Dimension( - index, channel_last, divmods, 0, 0, input_width, input_height, - &w_offset, &h_offset, &c_offset, &input_offset); + OffsetPreparationFor4Dimension(index, + channel_last, + divmods, + 0, + 0, + input_width, + input_height, + &w_offset, + &h_offset, + &c_offset, + &input_offset); input_data += input_offset; input_grad += input_offset; @@ -285,17 +356,24 @@ __global__ void KernelMaxPool2DGrad( if (maxIndex != -1) { // atomic add - platform::CudaAtomicAdd(input_grad + maxIndex, output_grad[index]); + paddle::platform::CudaAtomicAdd(input_grad + maxIndex, + output_grad[index]); } } } template void Pool2dDirectCUDAFunctor::operator()( - const T* input, const std::vector& input_shape, - const std::vector& output_shape, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings, - bool exclusive, bool adaptive, T* output, gpuStream_t stream, + const T* input, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool 
adaptive, + T* output, + gpuStream_t stream, PoolProcess pool_compute) { const int batch_size = input_shape[0]; const int input_channels = input_shape[1]; @@ -314,7 +392,7 @@ void Pool2dDirectCUDAFunctor::operator()( int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - // platform::ChangeThreadNum(context, &thread_num); + // paddle::platform::ChangeThreadNum(context, &thread_num); thread_num = 512; #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -323,11 +401,24 @@ void Pool2dDirectCUDAFunctor::operator()( auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); - KernelPool2D<<>>( - nthreads, input, input_channels, input_height, input_width, output_height, - output_width, ksize_height, ksize_width, stride_height, stride_width, - padding_height, padding_width, pool_divmods, pool_compute, exclusive, - adaptive, output); + KernelPool2D<<>>(nthreads, + input, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + pool_divmods, + pool_compute, + exclusive, + adaptive, + output); } /* @@ -338,13 +429,16 @@ void Pool2dDirectCUDAFunctor::operator()( * height_down, width_left and width_right, respectively. */ template -class Pool2dFunctor { +class Pool2dFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -361,12 +455,12 @@ class Pool2dFunctor { const int padding_width = paddings[1]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + paddle::platform::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -375,17 +469,35 @@ class Pool2dFunctor { auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); KernelPool2D<<>>( - nthreads, input_data, input_channels, input_height, input_width, - output_height, output_width, ksize_height, ksize_width, stride_height, - stride_width, padding_height, padding_width, pool_divmods, pool_process, - exclusive, adaptive, output_data); + nthreads, + input_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + pool_divmods, + pool_process, + exclusive, + adaptive, + output_data); } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool 
adaptive, - framework::Tensor* output, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_process) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -410,12 +522,12 @@ class Pool2dFunctor { const int padding_width = paddings[1]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + paddle::platform::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -424,10 +536,25 @@ class Pool2dFunctor { auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); KernelPool2D<<>>( - nthreads, input_data, input_channels, input_height, input_width, - output_height, output_width, ksize_height, ksize_width, stride_height, - stride_width, padding_height, padding_width, pool_divmods, pool_process, - exclusive, adaptive, output_data, channel_last); + nthreads, + input_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + pool_divmods, + pool_process, + exclusive, + adaptive, + output_data, + channel_last); } }; /* @@ -438,16 +565,18 @@ class Pool2dFunctor { * height_down, width_left and width_right, respectively. */ template -class Pool2dGradFunctor { +class Pool2dGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -465,30 +594,53 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_height * input_width; - auto pool_divmods = FastDivModForPoolingWithMoreStaff( - input_channels, input_width, input_height, ksize_width, ksize_height, - stride_width, stride_height); - - auto config = GetGpuLaunchConfig1D(context, nthreads); - KernelPool2DGrad<<< - config.block_per_grid, config.thread_per_block, 0, context.stream()>>>( - nthreads, input_data, output_data, output_grad_data, output_width, - output_height, input_width, input_height, ksize_width, ksize_height, - stride_width, stride_height, padding_width, padding_height, - pool_divmods, pool_process, exclusive, adaptive, input_grad_data); + auto pool_divmods = FastDivModForPoolingWithMoreStaff(input_channels, + input_width, + input_height, + ksize_width, + ksize_height, + stride_width, + stride_height); + + auto config = 
phi::backends::gpu::GetGpuLaunchConfig1D(context, nthreads); + KernelPool2DGrad<<>>(nthreads, + input_data, + output_data, + output_grad_data, + output_width, + output_height, + input_width, + input_height, + ksize_width, + ksize_height, + stride_width, + stride_height, + padding_width, + padding_height, + pool_divmods, + pool_process, + exclusive, + adaptive, + input_grad_data); } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_process) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -514,21 +666,41 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_height * input_width; - auto pool_divmods = FastDivModForPoolingWithMoreStaff( - input_channels, input_width, input_height, ksize_width, ksize_height, - stride_width, stride_height); - - auto config = GetGpuLaunchConfig1D(context, nthreads); - KernelPool2DGrad<<< - config.block_per_grid, config.thread_per_block, 0, context.stream()>>>( - nthreads, input_data, output_data, output_grad_data, output_width, - output_height, input_width, input_height, ksize_width, ksize_height, - stride_width, stride_height, padding_width, padding_height, - pool_divmods, pool_process, exclusive, adaptive, input_grad_data, - channel_last); + auto pool_divmods = FastDivModForPoolingWithMoreStaff(input_channels, + input_width, + input_height, + ksize_width, + ksize_height, + stride_width, + stride_height); + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(context, nthreads); + KernelPool2DGrad<<>>(nthreads, + input_data, + output_data, + output_grad_data, + output_width, + output_height, + input_width, + input_height, + ksize_width, + ksize_height, + stride_width, + stride_height, + padding_width, + padding_height, + pool_divmods, + pool_process, + exclusive, + adaptive, + input_grad_data, + channel_last); } }; @@ -540,16 +712,16 @@ class Pool2dGradFunctor { * height_down, width_left and width_right, respectively. 
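The launch configuration logic is unchanged by the move: the forward functors still size the grid by hand, while the 2D grad functors now call phi::backends::gpu::GetGpuLaunchConfig1D instead of the fluid helper. The manual arithmetic, as a small sketch with a worked value (the 512-thread fallback comes from the WITH_NV_JETSON branch above):

#include <cuda_runtime.h>

// One thread per output (or input-gradient) element, 1024 threads per block
// (512 on Jetson), blocks obtained by ceiling division.
// e.g. nthreads = 1'000'000, thread_num = 1024  ->  blocks = 977.
inline dim3 GridFor1D(int nthreads, int thread_num = 1024) {
  return dim3((nthreads + thread_num - 1) / thread_num, 1);
}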
*/ template -class MaxPool2dGradFunctor { +class MaxPool2dGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - framework::Tensor* input_grad) { + DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -567,7 +739,7 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * output_channels * output_height * output_width; int blocks = (nthreads + 1024 - 1) / 1024; @@ -577,17 +749,33 @@ class MaxPool2dGradFunctor { auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); KernelMaxPool2DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_height, input_width, output_height, output_width, ksize_height, - ksize_width, stride_height, stride_width, padding_height, padding_width, - input_grad_data, pool_divmods); + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + input_grad_data, + pool_divmods); } - void operator()( - const platform::CUDADeviceContext& context, - const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad) { + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad) { bool channel_last = (data_format == "NHWC"); const int batch_size = input.dims()[0]; @@ -614,7 +802,7 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * output_channels * output_height * output_width; int blocks = (nthreads + 1024 - 1) / 1024; @@ -625,71 +813,80 @@ class MaxPool2dGradFunctor { FastDivModForPooling(input_channels, output_width, output_height); KernelMaxPool2DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_height, input_width, output_height, output_width, ksize_height, - ksize_width, stride_height, stride_width, padding_height, padding_width, - input_grad_data, pool_divmods, channel_last); + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + 
input_grad_data, + pool_divmods, + channel_last); } }; -template class Pool2dDirectCUDAFunctor, - float>; -template class Pool2dDirectCUDAFunctor, - float>; - -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; - -template class Pool2dFunctor, float>; -template class Pool2dFunctor, float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dGradFunctor, - float>; -template class Pool2dFunctor, double>; -template class Pool2dFunctor, double>; -template class Pool2dGradFunctor, - double>; -template class Pool2dGradFunctor, - double>; - -template class Pool2dFunctor< - platform::CUDADeviceContext, - paddle::operators::math::MaxPool, - paddle::platform::float16>; -template class Pool2dFunctor< - platform::CUDADeviceContext, - paddle::operators::math::AvgPool, - paddle::platform::float16>; -template class Pool2dGradFunctor< - platform::CUDADeviceContext, - paddle::operators::math::MaxPoolGrad, - paddle::platform::float16>; -template class Pool2dGradFunctor< - platform::CUDADeviceContext, - paddle::operators::math::AvgPoolGrad, - paddle::platform::float16>; +template class Pool2dDirectCUDAFunctor, float>; +template class Pool2dDirectCUDAFunctor, float>; + +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; + +template class Pool2dFunctor, float>; +template class Pool2dFunctor, float>; +template class Pool2dGradFunctor, float>; +template class Pool2dGradFunctor, float>; +template class Pool2dFunctor, double>; +template class Pool2dFunctor, double>; +template class Pool2dGradFunctor, double>; +template class Pool2dGradFunctor, double>; + +template class Pool2dFunctor, + dtype::float16>; +template class Pool2dFunctor, + dtype::float16>; +template class Pool2dGradFunctor, + dtype::float16>; +template class Pool2dGradFunctor, + dtype::float16>; template -__global__ void KernelPool3D( - const int nthreads, const T* input_data, const int channels, - const int input_depth, const int input_height, const int input_width, - const int output_depth, const int output_height, const int output_width, - const int ksize_depth, const int ksize_height, const int ksize_width, - const int stride_depth, const int stride_height, const int stride_width, - const int padding_depth, const int padding_height, const int padding_width, - PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data, - bool channel_last = false) { +__global__ void KernelPool3D(const int nthreads, + const T* input_data, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + PoolProcess pool_process, + bool exclusive, + bool adaptive, + T* output_data, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw, ph, pd, c, batch_idx; @@ -764,16 +961,31 @@ __global__ void KernelPool3D( } template -__global__ void KernelPool3DGrad( - const int nthreads, const T* __restrict__ input_data, - const T* __restrict__ output_data, const T* __restrict__ output_grad, - const int channels, const int input_depth, const int input_height, - const int input_width, const int 
output_depth, const int output_height, - const int output_width, const int ksize_depth, const int ksize_height, - const int ksize_width, const int stride_depth, const int stride_height, - const int stride_width, const int padding_depth, const int padding_height, - const int padding_width, PoolProcess pool_process, bool exclusive, - bool adaptive, T* input_grad, bool channel_last = false) { +__global__ void KernelPool3DGrad(const int nthreads, + const T* __restrict__ input_data, + const T* __restrict__ output_data, + const T* __restrict__ output_grad, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + PoolProcess pool_process, + bool exclusive, + bool adaptive, + T* input_grad, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int w_offset, h_offset, d_offset, c_offset, batch_idx, output_stride; @@ -867,7 +1079,9 @@ __global__ void KernelPool3DGrad( : (pd * output_height + ph) * output_width + pw; T ouput_value = pool_process.use_x ? output_data[output_sub_idx] : static_cast(0); - pool_process.compute(input, ouput_value, output_grad[output_sub_idx], + pool_process.compute(input, + ouput_value, + output_grad[output_sub_idx], static_cast(1.0 / pool_size), &input_grad_data); } @@ -878,15 +1092,28 @@ __global__ void KernelPool3DGrad( } template -__global__ void KernelMaxPool3DGrad( - const int nthreads, const T* input_data, const T* output_data, - const T* output_grad, const int channels, const int input_depth, - const int input_height, const int input_width, const int output_depth, - const int output_height, const int output_width, const int ksize_depth, - const int ksize_height, const int ksize_width, const int stride_depth, - const int stride_height, const int stride_width, const int padding_depth, - const int padding_height, const int padding_width, T* input_grad, - bool channel_last = false) { +__global__ void KernelMaxPool3DGrad(const int nthreads, + const T* input_data, + const T* output_data, + const T* output_grad, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + T* input_grad, + bool channel_last = false) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw, ph, pd, c, batch_idx; @@ -949,17 +1176,23 @@ __global__ void KernelMaxPool3DGrad( } if (maxIdx != -1) { // atomic add - platform::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]); + paddle::platform::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]); } } } template void Pool3dDirectCUDAFunctor::operator()( - const T* input, const std::vector& input_shape, - const std::vector& output_shape, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings, - bool exclusive, bool adaptive, T* output, gpuStream_t stream, + const T* 
input, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + T* output, + gpuStream_t stream, PoolProcess pool_compute) { const int batch_size = input_shape[0]; const int input_channels = input_shape[1]; @@ -990,11 +1223,28 @@ void Pool3dDirectCUDAFunctor::operator()( dim3 threads(thread_num, 1); dim3 grid(blocks, 1); - KernelPool3D<<>>( - nthreads, input, input_channels, input_depth, input_height, input_width, - output_depth, output_height, output_width, ksize_depth, ksize_height, - ksize_width, stride_depth, stride_height, stride_width, padding_depth, - padding_height, padding_width, pool_compute, exclusive, adaptive, output); + KernelPool3D<<>>(nthreads, + input, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_compute, + exclusive, + adaptive, + output); } /* @@ -1006,13 +1256,16 @@ void Pool3dDirectCUDAFunctor::operator()( * height_up, height_down, width_left and width_right, respectively. */ template -class Pool3dFunctor { +class Pool3dFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -1034,31 +1287,52 @@ class Pool3dFunctor { const int padding_width = paddings[2]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + paddle::platform::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); dim3 grid(blocks, 1); KernelPool3D<<>>( - nthreads, input_data, input_channels, input_depth, input_height, - input_width, output_depth, output_height, output_width, ksize_depth, - ksize_height, ksize_width, stride_depth, stride_height, stride_width, - padding_depth, padding_height, padding_width, pool_process, exclusive, - adaptive, output_data); + nthreads, + input_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_process, + exclusive, + adaptive, + output_data); } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_process) { + 
const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_process) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1089,24 +1363,42 @@ class Pool3dFunctor { const int padding_width = paddings[2]; const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + paddle::platform::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); dim3 grid(blocks, 1); KernelPool3D<<>>( - nthreads, input_data, input_channels, input_depth, input_height, - input_width, output_depth, output_height, output_width, ksize_depth, - ksize_height, ksize_width, stride_depth, stride_height, stride_width, - padding_depth, padding_height, padding_width, pool_process, exclusive, - adaptive, output_data, channel_last); + nthreads, + input_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_process, + exclusive, + adaptive, + output_data, + channel_last); } }; @@ -1119,16 +1411,18 @@ class Pool3dFunctor { * height_up, height_down, width_left and width_right, respectively. */ template -class Pool3dGradFunctor { +class Pool3dGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, PoolProcess pool_process) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -1152,7 +1446,7 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_depth * input_height * input_width; @@ -1161,21 +1455,43 @@ class Pool3dGradFunctor { dim3 grid(blocks, 1); KernelPool3DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_depth, input_height, input_width, output_depth, output_height, - output_width, ksize_depth, ksize_height, ksize_width, stride_depth, - stride_height, stride_width, padding_depth, padding_height, - padding_width, pool_process, exclusive, adaptive, input_grad_data); + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_process, + exclusive, + adaptive, + 
input_grad_data); } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_process) { + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_process) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1206,7 +1522,7 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_depth * input_height * input_width; @@ -1215,11 +1531,30 @@ class Pool3dGradFunctor { dim3 grid(blocks, 1); KernelPool3DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_depth, input_height, input_width, output_depth, output_height, - output_width, ksize_depth, ksize_height, ksize_width, stride_depth, - stride_height, stride_width, padding_depth, padding_height, - padding_width, pool_process, exclusive, adaptive, input_grad_data, + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + pool_process, + exclusive, + adaptive, + input_grad_data, channel_last); // add channel_last } }; @@ -1233,16 +1568,16 @@ class Pool3dGradFunctor { * height_up, height_down, width_left and width_right, respectively. 
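// A minimal host-side sketch (illustrative, not part of this patch) of the
// element addressing difference behind the channel_last flag used by the 3D
// pooling functors above: NCDHW keeps the channel stride just inside the batch
// dimension, while NDHWC (channel_last == true) makes the channel the
// innermost, stride-1 dimension. The helper names below are assumptions.
#include <cstddef>

inline std::size_t OffsetNCDHW(std::size_t n, std::size_t c, std::size_t d,
                               std::size_t h, std::size_t w, std::size_t C,
                               std::size_t D, std::size_t H, std::size_t W) {
  return (((n * C + c) * D + d) * H + h) * W + w;
}

inline std::size_t OffsetNDHWC(std::size_t n, std::size_t c, std::size_t d,
                               std::size_t h, std::size_t w, std::size_t C,
                               std::size_t D, std::size_t H, std::size_t W) {
  // Same logical element, channel index applied last.
  return (((n * D + d) * H + h) * W + w) * C + c;
}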
*/ template -class MaxPool3dGradFunctor { +class MaxPool3dGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - framework::Tensor* input_grad) { + DenseTensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -1265,7 +1600,7 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; @@ -1274,18 +1609,37 @@ class MaxPool3dGradFunctor { dim3 grid(blocks, 1); KernelMaxPool3DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_depth, input_height, input_width, output_depth, output_height, - output_width, ksize_depth, ksize_height, ksize_width, stride_depth, - stride_height, stride_width, padding_depth, padding_height, - padding_width, input_grad_data); + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + input_grad_data); } - void operator()( - const platform::CUDADeviceContext& context, - const framework::Tensor& input, const framework::Tensor& output, - const framework::Tensor& output_grad, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad) { + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad) { bool channel_last = (data_format == "NDHWC"); const int batch_size = input.dims()[0]; @@ -1316,7 +1670,7 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; @@ -1325,77 +1679,93 @@ class MaxPool3dGradFunctor { dim3 grid(blocks, 1); KernelMaxPool3DGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_channels, - input_depth, input_height, input_width, output_depth, output_height, - output_width, ksize_depth, ksize_height, ksize_width, stride_depth, - stride_height, stride_width, padding_depth, padding_height, - padding_width, input_grad_data, channel_last); // add channel_last + nthreads, + input_data, + output_data, + output_grad_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + 
ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + input_grad_data, + channel_last); // add channel_last } }; -template class Pool3dDirectCUDAFunctor, - float>; -template class Pool3dDirectCUDAFunctor, - float>; - -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; - -template class Pool3dFunctor, float>; -template class Pool3dFunctor, float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dGradFunctor, - float>; -template class Pool3dFunctor, double>; -template class Pool3dFunctor, double>; -template class Pool3dGradFunctor, - double>; -template class Pool3dGradFunctor, - double>; - -template class Pool3dFunctor< - platform::CUDADeviceContext, - paddle::operators::math::MaxPool, - paddle::platform::float16>; -template class Pool3dFunctor< - platform::CUDADeviceContext, - paddle::operators::math::AvgPool, - paddle::platform::float16>; -template class Pool3dGradFunctor< - platform::CUDADeviceContext, - paddle::operators::math::MaxPoolGrad, - paddle::platform::float16>; -template class Pool3dGradFunctor< - platform::CUDADeviceContext, - paddle::operators::math::AvgPoolGrad, - paddle::platform::float16>; +template class Pool3dDirectCUDAFunctor, float>; +template class Pool3dDirectCUDAFunctor, float>; + +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; + +template class Pool3dFunctor, float>; +template class Pool3dFunctor, float>; +template class Pool3dGradFunctor, float>; +template class Pool3dGradFunctor, float>; +template class Pool3dFunctor, double>; +template class Pool3dFunctor, double>; +template class Pool3dGradFunctor, double>; +template class Pool3dGradFunctor, double>; + +template class Pool3dFunctor, + dtype::float16>; +template class Pool3dFunctor, + dtype::float16>; +template class Pool3dGradFunctor, + dtype::float16>; +template class Pool3dGradFunctor, + dtype::float16>; template -__global__ void KernelMaxPool2dWithIdx( - const int nthreads, const T1* input_data, const int channels, - const int input_height, const int input_width, const int output_height, - const int output_width, const int ksize_height, const int ksize_width, - const int stride_height, const int stride_width, const int padding_height, - const int padding_width, bool adaptive, T1* output_data, T2* mask_data, - FastDivModForPooling divmods) { +__global__ void KernelMaxPool2dWithIdx(const int nthreads, + const T1* input_data, + const int channels, + const int input_height, + const int input_width, + const int output_height, + const int output_width, + const int ksize_height, + const int ksize_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + bool adaptive, + T1* output_data, + T2* mask_data, + FastDivModForPooling divmods) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int hstart, hend, wstart, wend; int w_offset, h_offset, c_offset, input_offset; - OffsetPreparationFor4Dimension( - index, false, divmods, 0, 0, input_width, input_height, &w_offset, - &h_offset, &c_offset, &input_offset); + OffsetPreparationFor4Dimension(index, + false, + divmods, + 0, + 0, + input_width, + input_height, + &w_offset, + &h_offset, + &c_offset, + &input_offset); input_data += input_offset; if (adaptive) { @@ -1431,20 +1801,38 @@ __global__ void KernelMaxPool2dWithIdx( } template -__global__ void 
KernelMaxPool2DWithIdxGrad( - const int nthreads, const T1* output_grad, const T2* mask_data, - const int channels, const int input_height, const int input_width, - const int output_height, const int output_width, const int ksize_height, - const int ksize_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, bool adaptive, - T1* input_grad, FastDivModForPooling divmods) { +__global__ void KernelMaxPool2DWithIdxGrad(const int nthreads, + const T1* output_grad, + const T2* mask_data, + const int channels, + const int input_height, + const int input_width, + const int output_height, + const int output_width, + const int ksize_height, + const int ksize_width, + const int stride_height, + const int stride_width, + const int padding_height, + const int padding_width, + bool adaptive, + T1* input_grad, + FastDivModForPooling divmods) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int phstart, phend, pwstart, pwend; int w_offset, h_offset, c_offset, output_offset; - OffsetPreparationFor4Dimension( - index, false, divmods, 0, 0, output_width, output_height, &w_offset, - &h_offset, &c_offset, &output_offset); + OffsetPreparationFor4Dimension(index, + false, + divmods, + 0, + 0, + output_width, + output_height, + &w_offset, + &h_offset, + &c_offset, + &output_offset); mask_data += output_offset; output_grad += output_offset; @@ -1487,13 +1875,16 @@ __global__ void KernelMaxPool2DWithIdxGrad( * height and width, respectively. */ template -class MaxPool2dWithIndexFunctor { +class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask) { + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -1509,13 +1900,13 @@ class MaxPool2dWithIndexFunctor { const int padding_width = paddings[1]; const T1* input_data = input.data(); - T1* output_data = output->mutable_data(context.GetPlace()); - T2* mask_data = mask->mutable_data(context.GetPlace()); + T1* output_data = context.template Alloc(output); + T2* mask_data = context.template Alloc(mask); int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + paddle::platform::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -1525,10 +1916,23 @@ class MaxPool2dWithIndexFunctor { auto pool_divmods = FastDivModForPooling(input_channels, output_width, output_height); KernelMaxPool2dWithIdx<<>>( - nthreads, input_data, input_channels, input_height, input_width, - output_height, output_width, ksize_height, ksize_width, stride_height, - stride_width, padding_height, padding_width, adaptive, output_data, - mask_data, pool_divmods); + nthreads, + input_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + adaptive, + output_data, + mask_data, + pool_divmods); } }; @@ 
-1538,14 +1942,16 @@ class MaxPool2dWithIndexFunctor { * height and width, respectively. */ template -class MaxPool2dWithIndexGradFunctor { +class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad) { + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; const int input_height = input_grad->dims()[2]; @@ -1561,7 +1967,7 @@ class MaxPool2dWithIndexGradFunctor { const T2* mask_data = mask.data(); const T1* output_grad_data = output_grad.data(); - T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T1* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_height * input_width; int blocks = (nthreads + 1024 - 1) / 1024; @@ -1571,31 +1977,53 @@ class MaxPool2dWithIndexGradFunctor { auto pool_divmods = FastDivModForPooling(input_channels, input_width, input_height); KernelMaxPool2DWithIdxGrad<<>>( - nthreads, output_grad_data, mask_data, input_channels, input_height, - input_width, output_height, output_width, ksize_height, ksize_width, - stride_height, stride_width, padding_height, padding_width, adaptive, - input_grad_data, pool_divmods); + nthreads, + output_grad_data, + mask_data, + input_channels, + input_height, + input_width, + output_height, + output_width, + ksize_height, + ksize_width, + stride_height, + stride_width, + padding_height, + padding_width, + adaptive, + input_grad_data, + pool_divmods); } }; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; template -__global__ void KernelMaxPool3DWithIdx( - const int nthreads, const T1* input_data, const int channels, - const int input_depth, const int input_height, const int input_width, - const int output_depth, const int output_height, const int output_width, - const int ksize_depth, const int ksize_height, const int ksize_width, - const int stride_depth, const int stride_height, const int stride_width, - const int padding_depth, const int padding_height, const int padding_width, - bool adaptive, T1* output_data, T2* mask_data) { +__global__ void KernelMaxPool3DWithIdx(const int nthreads, + const T1* input_data, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + bool adaptive, + T1* output_data, + T2* mask_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -1650,14 +2078,27 @@ __global__ void 
KernelMaxPool3DWithIdx( } template -__global__ void KernelMaxPool3DWithIdxGrad( - const int nthreads, const T1* output_grad, const T2* mask, - const int channels, const int input_depth, const int input_height, - const int input_width, const int output_depth, const int output_height, - const int output_width, const int ksize_depth, const int ksize_height, - const int ksize_width, const int stride_depth, const int stride_height, - const int stride_width, const int padding_depth, const int padding_height, - const int padding_width, bool adaptive, T1* input_grad) { +__global__ void KernelMaxPool3DWithIdxGrad(const int nthreads, + const T1* output_grad, + const T2* mask, + const int channels, + const int input_depth, + const int input_height, + const int input_width, + const int output_depth, + const int output_height, + const int output_width, + const int ksize_depth, + const int ksize_height, + const int ksize_width, + const int stride_depth, + const int stride_height, + const int stride_width, + const int padding_depth, + const int padding_height, + const int padding_width, + bool adaptive, + T1* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int w_offset = index % input_width; @@ -1727,13 +2168,16 @@ __global__ void KernelMaxPool3DWithIdxGrad( * depth, height and width, respectively. */ template -class MaxPool3dWithIndexFunctor { +class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask) { + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -1754,14 +2198,14 @@ class MaxPool3dWithIndexFunctor { const int padding_width = paddings[2]; const T1* input_data = input.data(); - T1* output_data = output->mutable_data(context.GetPlace()); - T2* mask_data = mask->mutable_data(context.GetPlace()); + T1* output_data = context.template Alloc(output); + T2* mask_data = context.template Alloc(mask); int nthreads = batch_size * output_channels * output_depth * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - platform::ChangeThreadNum(context, &thread_num); + paddle::platform::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -1769,10 +2213,26 @@ class MaxPool3dWithIndexFunctor { dim3 grid(blocks, 1); KernelMaxPool3DWithIdx<<>>( - nthreads, input_data, input_channels, input_depth, input_height, - input_width, output_depth, output_height, output_width, ksize_depth, - ksize_height, ksize_width, stride_depth, stride_height, stride_width, - padding_depth, padding_height, padding_width, adaptive, output_data, + nthreads, + input_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + adaptive, + output_data, mask_data); } }; @@ -1783,14 +2243,16 @@ class MaxPool3dWithIndexFunctor { * depth, height and width, respectively. 
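// Host-side reference sketch (assumed, not from this patch) of how the argmax
// mask recorded by the with-index forward kernels is consumed by the backward
// kernel above: every output gradient is accumulated onto the input element
// whose flat index the forward pass stored in `mask`.
#include <cstddef>
#include <vector>

void MaxPoolWithIndexBackwardRef(const std::vector<float>& output_grad,
                                 const std::vector<int>& mask,  // argmax per output element
                                 std::vector<float>* input_grad) {
  for (std::size_t i = 0; i < output_grad.size(); ++i) {
    (*input_grad)[static_cast<std::size_t>(mask[i])] += output_grad[i];
  }
}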
*/ template -class MaxPool3dWithIndexGradFunctor { +class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, + void operator()(const phi::GPUContext& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad) { + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; const int input_depth = input_grad->dims()[2]; @@ -1811,7 +2273,7 @@ class MaxPool3dWithIndexGradFunctor { const T1* output_grad_data = output_grad.data(); const T2* mask_data = mask.data(); - T1* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T1* input_grad_data = context.template Alloc(input_grad); int nthreads = batch_size * input_channels * input_depth * input_height * input_width; @@ -1820,23 +2282,34 @@ class MaxPool3dWithIndexGradFunctor { dim3 grid(blocks, 1); KernelMaxPool3DWithIdxGrad<<>>( - nthreads, output_grad_data, mask_data, input_channels, input_depth, - input_height, input_width, output_depth, output_height, output_width, - ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, - stride_width, padding_depth, padding_height, padding_width, adaptive, + nthreads, + output_grad_data, + mask_data, + input_channels, + input_depth, + input_height, + input_width, + output_depth, + output_height, + output_width, + ksize_depth, + ksize_height, + ksize_width, + stride_depth, + stride_height, + stride_width, + padding_depth, + padding_height, + padding_width, + adaptive, input_grad_data); } }; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/pooling.h b/paddle/phi/kernels/funcs/pooling.h new file mode 100644 index 00000000000..19c6d52c4c9 --- /dev/null +++ b/paddle/phi/kernels/funcs/pooling.h @@ -0,0 +1,469 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
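// Generic C++ sketch (not from this patch) of the explicit-instantiation
// pattern used by the functor lists above: the templates are defined in the
// .cu file and concrete instantiations are emitted there, so other translation
// units only need the declarations from pooling.h. `Pool` is a stand-in name.
#include <cstdio>

template <typename T>
struct Pool {
  T run(T x) const { return x; }
};

// Emit the symbols for the types the library actually ships, as pool.cu does
// for float, double and float16 above.
template struct Pool<float>;
template struct Pool<double>;

int main() {
  Pool<float> p;
  std::printf("%f\n", static_cast<double>(p.run(1.5f)));
  return 0;
}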
*/ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/platform/macros.h" // import FLT_MAX +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/phi/backends/gpu/gpu_decls.h" +#endif + +namespace phi { +namespace funcs { + +/* + * \brief Extracting simple operations from pooling. + * Both MaxPool and AvgPool need "initial", "compute" and "finalize" + * operation. + * MaxPool initializes temp variable to the negative maximum to find the + * maximum value in the pooling field. + * AvgPool initializes temp variable to the zero to accumulate all values + * in pool pooling, and finally takes the average. + * MaxPoolGrad and AvgPoolGrad are gradient operations respectively. + */ +template +class MaxPool { + public: + DEVICE inline T initial() { return static_cast(-FLT_MAX); } + DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; } + DEVICE inline void finalize(const T& pool_field, T* y) {} +}; + +template +class AvgPool { + using MT = typename dtype::MPTypeTrait::Type; + MT intermediate_res; + + public: + DEVICE inline T initial() { + intermediate_res = static_cast(0.0f); + return static_cast(0); + } + + DEVICE inline void compute(const T& x, T* y) { + intermediate_res += static_cast(x); + } + + DEVICE inline void finalize(const T& pool_field, T* y) { + *y = static_cast(intermediate_res / (static_cast(pool_field))); + } +}; + +template +class MaxPoolGrad { + public: + static constexpr bool use_x = true; + HOSTDEVICE inline void compute( + const T& x, const T& y, const T& dy, T scale, T* dx) { + *dx += dy * static_cast(x == y); + } +}; + +template +class AvgPoolGrad { + public: + static constexpr bool use_x = false; + HOSTDEVICE inline void compute( + const T& x, const T& y, const T& dy, T scale, T* dx) { + *dx += (scale * dy); + } +}; + +/* used for adaptive pool to calculate start and end index of each divided grid + */ +HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) { + return static_cast( + floor(static_cast(ph * input_size) / output_size)); +} + +HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { + return static_cast( + ceil(static_cast((ph + 1) * input_size) / output_size)); +} + +/* + * \brief Getting pooling results, and calculating gradient. + * + * In pool2d, all Tensors are in NCHW or NHWC format. Where N is batch size, C + * is the number of channels, H and W is the height and width of feature. + * In pool3d, all Tensors are in NCDHW or NDHWC format. Where N is batch size, C + * is the number of channels, D, H and W is the depth, height and width of + * feature. + * + * In max pooling, it is possible that the pooling region has multiple maximum + * elements. In this case, we should compute the gradient of the first maximum + * element. + * This is different from average pooling. So we rewrite the max_pool_grad: + * MaxPool2dGradFunctor, MaxPool3dGradFunctor. 
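// Quick numeric check (illustrative only) of the AdaptStartIndex/AdaptEndIndex
// helpers above for adaptive pooling: an input axis of size 5 pooled to 3
// outputs is split into the near-equal windows [0,2), [1,4), [3,5).
// The local helpers below mirror their floor/ceil arithmetic.
#include <cmath>
#include <cstdio>

static int AdaptStart(int ph, int input_size, int output_size) {
  return static_cast<int>(
      std::floor(static_cast<double>(ph * input_size) / output_size));
}

static int AdaptEnd(int ph, int input_size, int output_size) {
  return static_cast<int>(
      std::ceil(static_cast<double>((ph + 1) * input_size) / output_size));
}

int main() {
  for (int ph = 0; ph < 3; ++ph) {
    std::printf("window %d: [%d, %d)\n", ph, AdaptStart(ph, 5, 3),
                AdaptEnd(ph, 5, 3));
  }
  return 0;
}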
+ */ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class Pool2dDirectCUDAFunctor { + public: + void operator()(const T* input, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + T* output, + gpuStream_t stream, + PoolProcess pool_compute); +}; +#endif + +template +class Pool2dFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_compute); + + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_compute); +}; + +template +class Pool2dGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_compute); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_compute); +}; + +template +class MaxPool2dGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + DenseTensor* input_grad); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad); +}; + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class Pool3dDirectCUDAFunctor { + public: + void operator()(const T* input, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + T* output, + gpuStream_t stream, + PoolProcess pool_compute); +}; +#endif + +template +class Pool3dFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_compute); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* output, + PoolProcess pool_compute); +}; + +template +class Pool3dGradFunctor { 
+ public: + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_compute); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + bool exclusive, + bool adaptive, + DenseTensor* input_grad, + PoolProcess pool_compute); +}; + +template +class MaxPool3dGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + DenseTensor* input_grad); + // overload operator() to support argument data_format + void operator()(const Context& context, + const DenseTensor& input, + const DenseTensor& output, + const DenseTensor& output_grad, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + const std::string data_format, + DenseTensor* input_grad); +}; + +/* + * \brief Getting max pooling results and corresponding max index, and + * calculating gradient. + * In up-sampling-pooling, it is necessary to know max element index. + * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in + * NCDHW format. + */ +template +class MaxPool2dWithIndexFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask); +}; + +template +class MaxPool2dWithIndexGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad); +}; + +template +class MaxPool3dWithIndexFunctor { + public: + void operator()(const Context& context, + const DenseTensor& input, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + DenseTensor* output, + DenseTensor* mask); +}; + +template +class MaxPool3dWithIndexGradFunctor { + public: + void operator()(const Context& context, + const DenseTensor& output_grad, + const DenseTensor& mask, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, + bool adaptive, + DenseTensor* input_grad); +}; + +inline int PoolOutputSize(int input_size, + int filter_size, + int padding_1, + int padding_2, + int stride, + bool ceil_mode) { + int output_size; + if (!ceil_mode) { + output_size = + (input_size - filter_size + padding_1 + padding_2) / stride + 1; + } else { + output_size = + (input_size - filter_size + padding_1 + padding_2 + stride - 1) / + stride + + 1; + } + PADDLE_ENFORCE_GT( + output_size, + 0, + errors::InvalidArgument( + "the output size must be greater than 0. But received: " + "output_size = %d due to the settings of input_size(%d), " + "padding(%d,%d), " + "k_size(%d) and stride(%d). 
Please check again!", + output_size, + input_size, + padding_1, + padding_2, + filter_size, + stride)); + return output_size; +} + +inline int MaxPoolOutputSize(int input_size, + int filter_size, + int padding, + int stride) { + int output_size = (input_size - filter_size + 2 * padding) / stride + 1; + return output_size; +} + +template +inline void UpdatePadding(std::vector* paddings, + const bool global_pooling, + const bool adaptive, + const std::string padding_algorithm, + const DDim data_dims, + const std::vector& strides, + const std::vector& kernel_size) { + // set padding size == data_dims.size() * 2 + auto data_shape = vectorize(data_dims); + if (static_cast(paddings->size()) == data_dims.size()) { + for (int i = 0; i < data_dims.size(); ++i) { + T copy_pad = *(paddings->begin() + 2 * i); + paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); + } + } else { + PADDLE_ENFORCE_EQ(data_dims.size() * 2, + paddings->size(), + errors::InvalidArgument( + "Paddings size %d should be the same or twice as the " + "pooling size %d.", + paddings->size(), + data_dims.size() * 2)); + } + + // when padding_algorithm is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (int i = 0; i < data_dims.size(); ++i) { + T out_size = (data_dims[i] + strides[i] - 1) / strides[i]; + T pad_sum = + std::max((out_size - 1) * strides[i] + kernel_size[i] - data_shape[i], + static_cast(0)); + T pad_0 = pad_sum / 2; + T pad_1 = pad_sum - pad_0; + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + } + } else if (padding_algorithm == "VALID") { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } + + // if global_pooling == true or adaptive == true, padding will be ignore + if (global_pooling || adaptive) { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } +} + +template +inline void UpdateKernelSize(std::vector* kernel_size, + const DDim data_dims) { + kernel_size->resize(static_cast(data_dims.size())); + for (size_t i = 0; i < kernel_size->size(); ++i) { + *(kernel_size->begin() + i) = static_cast(data_dims[i]); + } +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/pool_grad_kernel.cu b/paddle/phi/kernels/gpu/pool_grad_kernel.cu new file mode 100644 index 00000000000..a5ab6a1ccd4 --- /dev/null +++ b/paddle/phi/kernels/gpu/pool_grad_kernel.cu @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
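// Small worked example (assumed numbers, not from this patch) tying together
// the shape helpers defined in pooling.h above: PoolOutputSize with
// ceil_mode == false and the "SAME" branch of UpdatePadding, for an axis of
// size 7 pooled with kernel 3 and stride 2.
#include <algorithm>
#include <cstdio>

int main() {
  const int input = 7, ksize = 3, stride = 2;
  // "SAME": out = ceil(input / stride); pad_sum split into before/after parts.
  const int out_same = (input + stride - 1) / stride;                        // 4
  const int pad_sum = std::max((out_same - 1) * stride + ksize - input, 0);  // 2
  const int pad_0 = pad_sum / 2;                                             // 1
  const int pad_1 = pad_sum - pad_0;                                         // 1
  // PoolOutputSize with ceil_mode == false:
  const int out = (input - ksize + pad_0 + pad_1) / stride + 1;              // 4
  std::printf("pad = (%d, %d), output = %d\n", pad_0, pad_1, out);
  return 0;
}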
+ +#include "paddle/phi/kernels/pool_grad_kernel.h" + +#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h" + +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pool2d_grad, + GPU, + ALL_LAYOUT, + phi::Pool2dGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(pool2d_double_grad, + GPU, + ALL_LAYOUT, + phi::Pool2dDoubleGradKernel, + float, + double) {} +PD_REGISTER_KERNEL(max_pool2d_with_index_grad, + GPU, + ALL_LAYOUT, + phi::MaxPool2dWithIndexGradKernel, + float, + double) { + kernel->InputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} + +PD_REGISTER_KERNEL(pool3d_grad, + GPU, + ALL_LAYOUT, + phi::Pool3dGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(max_pool3d_with_index_grad, + GPU, + ALL_LAYOUT, + phi::MaxPool3dWithIndexGradKernel, + float, + double) { + kernel->InputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/gpu/pool_kernel.cu b/paddle/phi/kernels/gpu/pool_kernel.cu new file mode 100644 index 00000000000..e8641395bef --- /dev/null +++ b/paddle/phi/kernels/gpu/pool_kernel.cu @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pool_kernel.h" + +#include "paddle/phi/kernels/impl/pool_kernel_impl.h" + +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pool2d, + GPU, + ALL_LAYOUT, + phi::Pool2dKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(max_pool2d_with_index, + GPU, + ALL_LAYOUT, + phi::MaxPool2dWithIndexKernel, + float, + double) { + kernel->OutputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} + +PD_REGISTER_KERNEL(pool3d, + GPU, + ALL_LAYOUT, + phi::Pool3dKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(max_pool3d_with_index, + GPU, + ALL_LAYOUT, + phi::MaxPool3dWithIndexKernel, + float, + double) { + kernel->OutputAt(1).SetDataType( + paddle::experimental::CppTypeToDataType::Type()); +} diff --git a/paddle/phi/kernels/gpu/split_kernel.cu b/paddle/phi/kernels/gpu/split_kernel.cu index c28fc3794f0..83c2ec4b6e9 100644 --- a/paddle/phi/kernels/gpu/split_kernel.cu +++ b/paddle/phi/kernels/gpu/split_kernel.cu @@ -37,7 +37,7 @@ void SplitKernel(const Context& dev_ctx, out_metas_ptr.push_back(&out_metas.back()); } - phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr, true); + phi::SplitInferMeta(x, num_or_sections, axis_scalar, out_metas_ptr); for (size_t i = 0; i < out_metas.size(); ++i) { outs[i]->Resize(out_metas[i].dims()); diff --git a/paddle/phi/kernels/gpudnn/pool_gpudnn.h b/paddle/phi/kernels/gpudnn/pool_gpudnn.h new file mode 100644 index 00000000000..0cf2c991464 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/pool_gpudnn.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + +namespace phi { + +using GPUDNNDataLayout = paddle::platform::DataLayout; +using PoolingMode = paddle::platform::PoolingMode; +using ScopedPoolingDescriptor = paddle::platform::ScopedPoolingDescriptor; +using ScopedTensorDescriptor = paddle::platform::ScopedTensorDescriptor; + +template +using ScalingParamType = + typename paddle::platform::CudnnDataType::ScalingParamType; + +inline GPUDNNDataLayout GetLayoutFromStr(std::string data_format) { + if (data_format == "NHWC") { + return GPUDNNDataLayout::kNHWC; + } else if (data_format == "NCHW") { + return GPUDNNDataLayout::kNCHW; + } else if (data_format == "NCDHW") { + return GPUDNNDataLayout::kNCDHW; + } else { + return GPUDNNDataLayout::kNCDHW; + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu new file mode 100644 index 00000000000..b731d033470 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/pool_grad_kernel.cu @@ -0,0 +1,448 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
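// Host-side sketch (illustrative only) of the scaling convention behind the
// ScalingParamType alpha/beta pair passed to the cuDNN/MIOpen pooling calls in
// the kernels below: the library blends dst = alpha * result + beta * dst, so
// alpha = 1 and beta = 0 simply overwrite the destination, which is why the
// output and gradient buffers are never cleared first.
#include <cstdio>

int main() {
  const float alpha = 1.0f, beta = 0.0f;
  const float result = 4.2f;  // value the pooling op produced
  float dst = 123.0f;         // stale destination memory
  dst = alpha * result + beta * dst;  // beta == 0 -> stale value ignored
  std::printf("dst = %f\n", dst);
  return 0;
}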
*/ + +#include "paddle/phi/kernels/pool_grad_kernel.h" + +#include "paddle/phi/kernels/gpudnn/pool_gpudnn.h" + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/pool_kernel.h" + +#ifdef PADDLE_WITH_HIP +#include "paddle/phi/kernels/impl/pool_grad_kernel_impl.h" // PoolGradRawGPUDNNKernel will call PoolGradRawKernel for pooling type "max" in ROCm +#endif + +namespace phi { + +template +void PoolGradRawGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PADDLE_ENFORCE_EQ( + paddle::platform::is_gpu_place(ctx.GetPlace()), + true, + errors::InvalidArgument("Pool operator CUDA kernel must use CUDAPlace " + "rather than CPUPlace.")); + + const DenseTensor* input = &x; + const DenseTensor* output = &out; + const DenseTensor* output_grad = &dout; + DenseTensor* input_grad = dx; + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + +#ifdef PADDLE_WITH_HIP + if (pooling_type == "max") { + PoolGradRawKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings_, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); + return; + } +#endif + + // update paddings + auto in_x_dims = input->dims(); + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); + } else { + data_dims = slice_ddim(in_x_dims, 2, in_x_dims.size()); + } + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + if (data_dims.size() * 2 == static_cast(paddings_.size())) { + for (int i = 0; i < data_dims.size(); ++i) { + paddings_.erase(paddings_.begin() + i + 1); + } + } + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + // ------- tensor grad -------------- + DenseTensor transformed_input(input->type()); + DenseTensor transformed_output(output->type()); + DenseTensor transformed_output_grad(output_grad->type()); + + ctx.template Alloc(input_grad); + DenseTensor transformed_input_grad(input_grad->type()); + GPUDNNDataLayout layout; + const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; + const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; + if (data_format == str_NDHWC) { + layout = GPUDNNDataLayout::kNCDHW; + std::vector axis{0, 4, 1, 2, 3}; + + // input + transformed_input.Resize(input->dims()); + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[4]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + in_dims_vec[4] = input->dims()[3]; + transformed_input.Resize(make_ddim(in_dims_vec)); + ctx.Alloc(&transformed_input, input->type()); + + funcs::Transpose trans5; + trans5(ctx, *input, &transformed_input, axis); + + // output + transformed_output.Resize(output->dims()); + auto out_dims_vec = vectorize(output->dims()); + out_dims_vec[1] = output->dims()[4]; + out_dims_vec[2] = output->dims()[1]; + 
out_dims_vec[3] = output->dims()[2]; + out_dims_vec[4] = output->dims()[3]; + transformed_output.Resize(make_ddim(out_dims_vec)); + + ctx.Alloc(&transformed_output, output->type()); + + funcs::Transpose trans5_v2; + trans5_v2(ctx, *output, &transformed_output, axis); + + // output grad + transformed_output_grad.Resize(make_ddim(out_dims_vec)); + ctx.Alloc(&transformed_output_grad, output_grad->type()); + + funcs::Transpose trans5_v3; + trans5_v3(ctx, *output_grad, &transformed_output_grad, axis); + + // input grad + transformed_input_grad.Resize(make_ddim(in_dims_vec)); + +#ifdef PADDLE_WITH_HIP + // MIOPEN not support NHWC data layout + } else if (data_format == str_NHWC) { + layout = GPUDNNDataLayout::kNCHW; + + std::vector axis{0, 3, 1, 2}; + + // input + transformed_input.Resize(input->dims()); + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[3]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + transformed_input.Resize(make_ddim(in_dims_vec)); + ctx.Alloc(&transformed_input, input->type()); + + funcs::Transpose trans4; + trans4(ctx, *input, &transformed_input, axis); + + // output + transformed_output.Resize(output->dims()); + auto out_dims_vec = vectorize(output->dims()); + out_dims_vec[1] = output->dims()[3]; + out_dims_vec[2] = output->dims()[1]; + out_dims_vec[3] = output->dims()[2]; + transformed_output.Resize(make_ddim(out_dims_vec)); + ctx.Alloc(&transformed_output, output->type()); + + funcs::Transpose trans4_v2; + trans4_v2(ctx, *output, &transformed_output, axis); + + // output grad + transformed_output_grad.Resize(make_ddim(out_dims_vec)); + ctx.Alloc(&transformed_output_grad, output_grad->type()); + + funcs::Transpose trans4_v3; + trans4_v3(ctx, *output_grad, &transformed_output_grad, axis); + + // input grad + transformed_input_grad.Resize(make_ddim(in_dims_vec)); +#endif + } else { + layout = GetLayoutFromStr(data_format); + transformed_input = *input; + transformed_output = *output; + transformed_output_grad = *output_grad; + transformed_input_grad = *input_grad; + } + + const T* input_data = transformed_input.data(); + const T* output_data = transformed_output.data(); + const T* output_grad_data = transformed_output_grad.data(); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedPoolingDescriptor pool_desc; + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); +#else + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); +#endif + PoolingMode pooling_mode; + if (pooling_type == "max") { + if (FLAGS_cudnn_deterministic) { + pooling_mode = PoolingMode::kMaximumDeterministic; + } else { + pooling_mode = PoolingMode::kMaximum; + } + } else { + pooling_mode = exclusive ? 
PoolingMode::kAverageExclusive + : PoolingMode::kAverageInclusive; + } + +#ifdef PADDLE_WITH_HIP + miopenPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#else + cudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#endif + + // ------------------- cudnn pool algorithm --------------------- + auto handle = ctx.cudnn_handle(); + ScalingParamType alpha = 1.0f, beta = 0.0f; + if (input_grad) { + T* input_grad_data = ctx.template Alloc(&transformed_input_grad); +// Because beta is zero, it is unnecessary to reset input_grad. +#ifdef PADDLE_WITH_HIP + char* pool_workspace; + size_t pool_worksize = 0; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenPoolingGetWorkSpaceSizeV2( + cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); + PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenPoolingBackward(handle, + cudnn_pool_desc, + &alpha, + cudnn_output_desc, + output_data, + cudnn_output_desc, + output_grad_data, + cudnn_input_desc, + input_data, + &beta, + cudnn_input_desc, + input_grad_data, + pool_workspace)); + PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cudnnPoolingBackward(handle, + cudnn_pool_desc, + &alpha, + cudnn_output_desc, + output_data, + cudnn_output_desc, + output_grad_data, + cudnn_input_desc, + input_data, + &beta, + cudnn_input_desc, + input_grad_data)); +#endif + + if (data_format == str_NDHWC) { + std::vector axis{0, 2, 3, 4, 1}; + funcs::Transpose trans5_v4; + trans5_v4(ctx, transformed_input_grad, input_grad, axis); + } +#ifdef PADDLE_WITH_HIP + // MIOPEN not support NHWC data layout + if (data_format == str_NHWC) { + std::vector axis{0, 2, 3, 1}; + funcs::Transpose trans4_v4; + trans4_v4(ctx, transformed_input_grad, input_grad, axis); + } +#endif + } +} + +template +void Pool2dGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PoolGradRawGPUDNNKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); +} + +template +void Pool2dDoubleGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + if (pooling_type == "max") { + PADDLE_THROW( + errors::InvalidArgument("Pool op grad grad only supports avgpool.")); + } else { + Pool2dGPUDNNKernel(ctx, + x, + kernel_size, + strides, + paddings, + ceil_mode, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); + } +} + +template +void Pool3dGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + 
const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PoolGradRawGPUDNNKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); +} + +} // namespace phi + +using phi::dtype::float16; + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL(pool2d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Pool2dGradGPUDNNKernel, + float, + float16) {} +PD_REGISTER_KERNEL(pool2d_double_grad, + GPUDNN, + ALL_LAYOUT, + phi::Pool2dDoubleGradGPUDNNKernel, + float, + float16) {} +PD_REGISTER_KERNEL(pool3d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Pool3dGradGPUDNNKernel, + float, + float16) {} +#else +PD_REGISTER_KERNEL(pool2d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Pool2dGradGPUDNNKernel, + float, + double, + float16) {} +PD_REGISTER_KERNEL(pool2d_double_grad, + GPUDNN, + ALL_LAYOUT, + phi::Pool2dDoubleGradGPUDNNKernel, + float, + double, + float16) {} +PD_REGISTER_KERNEL(pool3d_grad, + GPUDNN, + ALL_LAYOUT, + phi::Pool3dGradGPUDNNKernel, + float, + double, + float16) {} +#endif diff --git a/paddle/phi/kernels/gpudnn/pool_kernel.cu b/paddle/phi/kernels/gpudnn/pool_kernel.cu new file mode 100644 index 00000000000..d8f96566775 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/pool_kernel.cu @@ -0,0 +1,312 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
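// Numeric illustration (toy values, not from this patch) of the exclusive flag
// that chooses PoolingMode::kAverageExclusive vs kAverageInclusive in the
// gpudnn pooling kernels of this patch: for a 3-wide window that hangs one
// element past the border, exclusive divides by the count of valid elements,
// inclusive by the full window size.
#include <cstdio>

int main() {
  const float valid[2] = {2.0f, 4.0f};     // elements inside the border
  const float sum = valid[0] + valid[1];
  const float avg_exclusive = sum / 2.0f;  // divide by valid count  -> 3.0
  const float avg_inclusive = sum / 3.0f;  // divide by window size  -> 2.0
  std::printf("exclusive = %f, inclusive = %f\n", avg_exclusive, avg_inclusive);
  return 0;
}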
*/ + +#include "paddle/phi/kernels/pool_kernel.h" + +#include "paddle/phi/kernels/gpudnn/pool_gpudnn.h" + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" + +namespace phi { + +template +void PoolRawGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + paddle::platform::is_gpu_place(ctx.GetPlace()), + true, + errors::InvalidArgument("Pool operator CUDA kernel must use CUDAPlace " + "rather than CPUPlace.")); + + const DenseTensor* input = &x; + DenseTensor* output = out; + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + ctx.template Alloc(output); + + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // update paddings_ + auto x_dims = input->dims(); + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } else { + data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + if (data_dims.size() * 2 == static_cast(paddings_.size())) { + for (int i = 0; i < data_dims.size(); ++i) { + paddings_.erase(paddings_.begin() + i + 1); + } + } + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; + const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; + + // -----------------transformed tensor ------------------------ + + DenseTensor transformed_input(input->type()); + DenseTensor transformed_output(output->type()); + GPUDNNDataLayout layout; + + if (data_format == str_NDHWC) { + layout = GPUDNNDataLayout::kNCDHW; + std::vector axis{0, 4, 1, 2, 3}; + + // input + transformed_input.Resize(input->dims()); + + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[4]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + in_dims_vec[4] = input->dims()[3]; + transformed_input.Resize(make_ddim(in_dims_vec)); + ctx.Alloc(&transformed_input, input->type()); + + funcs::Transpose trans5; + trans5(ctx, *input, &transformed_input, axis); + + // output + transformed_output.Resize(output->dims()); + + auto out_dims_vec = vectorize(output->dims()); + out_dims_vec[1] = output->dims()[4]; + out_dims_vec[2] = output->dims()[1]; + out_dims_vec[3] = output->dims()[2]; + out_dims_vec[4] = output->dims()[3]; + transformed_output.Resize(make_ddim(out_dims_vec)); +#ifdef PADDLE_WITH_HIP + // MIOPEN not support NHWC data layout + } else if (data_format == str_NHWC) { + layout = GPUDNNDataLayout::kNCHW; + + std::vector axis{0, 3, 1, 2}; + + transformed_input.Resize(input->dims()); + auto in_dims_vec = vectorize(input->dims()); + in_dims_vec[1] = input->dims()[3]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + transformed_input.Resize(make_ddim(in_dims_vec)); + ctx.Alloc(&transformed_input, input->type()); + + funcs::Transpose trans; + trans(ctx, *input, &transformed_input, axis); + + transformed_output.Resize(output->dims()); + auto out_dims_vec = vectorize(output->dims()); + 
out_dims_vec[1] = output->dims()[3]; + out_dims_vec[2] = output->dims()[1]; + out_dims_vec[3] = output->dims()[2]; + transformed_output.Resize(make_ddim(out_dims_vec)); +#endif + } else { + layout = GetLayoutFromStr(data_format); + transformed_input = *input; + transformed_output = *output; + } + + const T* tranformed_input_data = transformed_input.data(); + T* tranformed_output_data = ctx.template Alloc(&transformed_output); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedPoolingDescriptor pool_desc; + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); +#else + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, vectorize(transformed_input.dims())); + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, vectorize(transformed_output.dims())); +#endif + PoolingMode pooling_mode; + if (pooling_type == "max") { + pooling_mode = PoolingMode::kMaximum; + } else { + pooling_mode = exclusive ? PoolingMode::kAverageExclusive + : PoolingMode::kAverageInclusive; + } + +#ifdef PADDLE_WITH_HIP + miopenPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#else + cudnnPoolingDescriptor_t cudnn_pool_desc = + pool_desc.descriptor(pooling_mode, kernel_size_, paddings_, strides); +#endif + + // ------------------- cudnn pool algorithm --------------------- + auto handle = ctx.cudnn_handle(); + ScalingParamType alpha = 1.0f, beta = 0.0f; + +#ifdef PADDLE_WITH_HIP + char* pool_workspace; + size_t pool_workernel_size_ = 0; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenPoolingGetWorkSpaceSizeV2( + cudnn_pool_desc, cudnn_output_desc, &pool_workernel_size_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_workernel_size_)); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenPoolingForward(handle, + cudnn_pool_desc, + &alpha, + cudnn_input_desc, + tranformed_input_data, + &beta, + cudnn_output_desc, + tranformed_output_data, + false, + pool_workspace, + pool_workernel_size_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cudnnPoolingForward(handle, + cudnn_pool_desc, + &alpha, + cudnn_input_desc, + tranformed_input_data, + &beta, + cudnn_output_desc, + tranformed_output_data)); +#endif + // add + if (data_format == str_NDHWC) { + std::vector axis{0, 2, 3, 4, 1}; + funcs::Transpose trans5_v2; + trans5_v2(ctx, transformed_output, output, axis); + } +#ifdef PADDLE_WITH_HIP + // MIOPEN not support NHWC data layout + if (data_format == str_NHWC) { + std::vector axis{0, 2, 3, 1}; + funcs::Transpose trans; + trans(ctx, transformed_output, output, axis); + } +#endif +} + +template +void Pool2dGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PoolRawGPUDNNKernel(ctx, + x, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); +} + +template +void 
Pool3dGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PoolRawGPUDNNKernel(ctx, + x, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); +} + +} // namespace phi + +using phi::dtype::float16; + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL( + pool2d, GPUDNN, ALL_LAYOUT, phi::Pool2dGPUDNNKernel, float, float16) {} +PD_REGISTER_KERNEL( + pool3d, GPUDNN, ALL_LAYOUT, phi::Pool3dGPUDNNKernel, float, float16) {} +#else +PD_REGISTER_KERNEL(pool2d, + GPUDNN, + ALL_LAYOUT, + phi::Pool2dGPUDNNKernel, + float, + double, + float16) {} +PD_REGISTER_KERNEL(pool3d, + GPUDNN, + ALL_LAYOUT, + phi::Pool3dGPUDNNKernel, + float, + double, + float16) {} +#endif diff --git a/paddle/phi/kernels/impl/pool_grad_kernel_impl.h b/paddle/phi/kernels/impl/pool_grad_kernel_impl.h new file mode 100644 index 00000000000..7fe89ce34c8 --- /dev/null +++ b/paddle/phi/kernels/impl/pool_grad_kernel_impl.h @@ -0,0 +1,332 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/kernels/pool_grad_kernel.h" + +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/pool_kernel.h" + +namespace phi { + +template +void PoolGradRawKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + // update paddings + auto x_dims = x.dims(); + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } else { + data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + if (data_dims.size() * 2 == static_cast(paddings_.size())) { + for (int i = 0; i < data_dims.size(); ++i) { + paddings_.erase(paddings_.begin() + i + 1); + } + } + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + if (dx) { + ctx.template Alloc(dx); + funcs::SetConstant set_constant; + set_constant(ctx, dx, static_cast(0.0)); + + switch (kernel_size_.size()) { + case 2: { + if (pooling_type == "max") { + funcs::MaxPool2dGradFunctor pool2d_backward; + pool2d_backward(ctx, + x, + out, + dout, + kernel_size_, + strides, + paddings_, + data_format, + dx); + } else if (pooling_type == "avg") { + funcs::Pool2dGradFunctor, T> + pool2d_backward; + funcs::AvgPoolGrad pool_process; + pool2d_backward(ctx, + x, + out, + dout, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + adaptive, + dx, + pool_process); + } + } break; + case 3: { + if (pooling_type == "max") { + funcs::MaxPool3dGradFunctor pool3d_backward; + pool3d_backward(ctx, + x, + out, + dout, + kernel_size_, + strides, + paddings_, + data_format, + dx); + } else if (pooling_type == "avg") { + funcs::Pool3dGradFunctor, T> + pool3d_backward; + funcs::AvgPoolGrad pool_process; + pool3d_backward(ctx, + x, + out, + dout, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + adaptive, + dx, + pool_process); + } + } break; + default: { + PADDLE_THROW( + errors::InvalidArgument("Pool op only supports 2D and 3D input.")); + } + } + } +} + +template +void MaxPoolWithIndexGradRawKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx) { + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + if (global_pooling) { + for (size_t i = 0; i < kernel_size_.size(); ++i) { + paddings_[i] = 0; + kernel_size_[i] = static_cast(dx->dims()[i + 2]); + } + } + + if (dx) { + ctx.template Alloc(dx); + funcs::set_constant(ctx, dx, 0); + + switch (kernel_size_.size()) { + case 2: { + funcs::MaxPool2dWithIndexGradFunctor pool2d_backward; + pool2d_backward( + ctx, dout, mask, kernel_size_, strides, paddings_, adaptive, dx); + } break; + case 3: { + funcs::MaxPool3dWithIndexGradFunctor pool3d_backward; + 
pool3d_backward( + ctx, dout, mask, kernel_size_, strides, paddings_, adaptive, dx); + } break; + default: { + PADDLE_THROW( + errors::InvalidArgument("Pool op only supports 2D and 3D input.")); + } + } + } +} + +template +void Pool2dGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PoolGradRawKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); +} + +template +void Pool2dDoubleGradKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + if (pooling_type == "max") { + PADDLE_THROW( + errors::InvalidArgument("Pool op grad grad only supports avgpool.")); + } else { + Pool2dKernel(ctx, + x, + kernel_size, + strides, + paddings, + ceil_mode, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); + } +} + +template +void MaxPool2dWithIndexGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx) { + MaxPoolWithIndexGradRawKernel(ctx, + x, + mask, + dout, + kernel_size, + strides, + paddings, + global_pooling, + adaptive, + dx); +} + +template +void Pool3dGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx) { + PoolGradRawKernel(ctx, + x, + out, + dout, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + dx); +} + +template +void MaxPool3dWithIndexGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx) { + MaxPoolWithIndexGradRawKernel(ctx, + x, + mask, + dout, + kernel_size, + strides, + paddings, + global_pooling, + adaptive, + dx); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/pool_kernel_impl.h b/paddle/phi/kernels/impl/pool_kernel_impl.h new file mode 100644 index 00000000000..665d02fd017 --- /dev/null +++ b/paddle/phi/kernels/impl/pool_kernel_impl.h @@ -0,0 +1,321 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/kernels/pool_kernel.h" + +#include +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/pooling.h" + +#if defined(__HIPCC__) || defined(__NVCC__) +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" +#endif + +namespace phi { + +inline int GetReduceNum(const DenseTensor& input, + const DenseTensor* output, + const std::string data_format, + std::vector* reduce_dim) { + // data_format only can be NCHW + bool channel_last = (data_format == "NHWC"); + if (channel_last) { + return 0; + } + int reduce_num = 0; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + if ((output_height == 1) && (output_width == 1)) { + reduce_dim->push_back(2); + reduce_dim->push_back(3); + reduce_num = input.dims()[2] * input.dims()[3]; + } + return reduce_num; +} + +template +void PoolRawKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + // update paddings + auto x_dims = x.dims(); + DDim data_dims; + if (channel_last) { + data_dims = slice_ddim(x_dims, 1, x_dims.size() - 1); + } else { + data_dims = slice_ddim(x_dims, 2, x_dims.size()); + } + + funcs::UpdatePadding(&paddings_, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + kernel_size_); + + if (data_dims.size() * 2 == static_cast(paddings_.size())) { + for (int i = 0; i < data_dims.size(); ++i) { + paddings_.erase(paddings_.begin() + i + 1); + } + } + + if (global_pooling) { + funcs::UpdateKernelSize(&kernel_size_, data_dims); + } + + switch (kernel_size_.size()) { + case 2: { + if (pooling_type == "max") { + funcs::Pool2dFunctor, T> pool2d_forward; + funcs::MaxPool pool_process; + pool2d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + true, + false, + out, + pool_process); + + } else if (pooling_type == "avg") { + std::vector reduce_dim; + int reduce_num = GetReduceNum(x, out, data_format, &reduce_dim); + if (reduce_num > 0 && + adaptive) { // for adaptive_avg_pool2d && output_size == 1 +#if defined(__HIPCC__) || defined(__NVCC__) + auto stream = ctx.stream(); + funcs::ReduceKernel>( + ctx, x, out, kps::DivideFunctor(reduce_num), reduce_dim); +#else // for cpu + funcs::Pool2dFunctor, T> pool2d_forward; + funcs::AvgPool pool_process; + pool2d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + adaptive, + out, + pool_process); +#endif + } else { // avgpool_2d or adaptive_avg_pool2d && output_size != 1 + funcs::Pool2dFunctor, T> pool2d_forward; + funcs::AvgPool pool_process; + pool2d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + 
adaptive, + out, + pool_process); + } + } + } break; + case 3: { + if (pooling_type == "max") { + funcs::Pool3dFunctor, T> pool3d_forward; + funcs::MaxPool pool_process; + pool3d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + true, + false, + out, + pool_process); + } else if (pooling_type == "avg") { + funcs::Pool3dFunctor, T> pool3d_forward; + funcs::AvgPool pool_process; + pool3d_forward(ctx, + x, + kernel_size_, + strides, + paddings_, + data_format, + exclusive, + adaptive, + out, + pool_process); + } + } break; + default: { + PADDLE_THROW( + errors::InvalidArgument("Pool op only supports 2D and 3D input.")); + } + } +} + +template +void MaxPoolWithIndexRawKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask) { + std::vector paddings_ = paddings; + std::vector kernel_size_ = kernel_size; + + if (global_pooling) { + for (size_t i = 0; i < kernel_size_.size(); ++i) { + paddings_[i] = 0; + kernel_size_[i] = static_cast(x.dims()[i + 2]); + } + } + + switch (kernel_size_.size()) { + case 2: { + funcs::MaxPool2dWithIndexFunctor pool2d_forward; + pool2d_forward( + ctx, x, kernel_size_, strides, paddings_, adaptive, out, mask); + } break; + case 3: { + funcs::MaxPool3dWithIndexFunctor pool3d_forward; + pool3d_forward( + ctx, x, kernel_size_, strides, paddings_, adaptive, out, mask); + } break; + default: { + PADDLE_THROW( + errors::InvalidArgument("Pool op only supports 2D and 3D input.")); + } + } +} + +template +void Pool2dKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PoolRawKernel(ctx, + x, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); +} + +template +void MaxPool2dWithIndexKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask) { + MaxPoolWithIndexRawKernel(ctx, + x, + kernel_size, + strides, + paddings, + global_pooling, + adaptive, + out, + mask); +} + +template +void Pool3dKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out) { + PoolRawKernel(ctx, + x, + kernel_size, + strides, + paddings, + exclusive, + data_format, + pooling_type, + global_pooling, + adaptive, + padding_algorithm, + out); +} + +template +void MaxPool3dWithIndexKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask) { + MaxPoolWithIndexRawKernel(ctx, + x, + kernel_size, + strides, + paddings, + global_pooling, + adaptive, + out, + mask); +} + +} // namespace phi diff --git 
a/paddle/phi/kernels/pool_grad_kernel.h b/paddle/phi/kernels/pool_grad_kernel.h new file mode 100644 index 00000000000..0658dc22c82 --- /dev/null +++ b/paddle/phi/kernels/pool_grad_kernel.h @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void Pool2dGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx); + +template +void Pool2dGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx); + +template +void Pool2dDoubleGradKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void Pool2dDoubleGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void MaxPool2dWithIndexGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx); + +template +void Pool3dGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx); + +template +void Pool3dGradGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + 
bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* dx); + +template +void MaxPool3dWithIndexGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mask, + const DenseTensor& dout, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/pool_kernel.h b/paddle/phi/kernels/pool_kernel.h new file mode 100644 index 00000000000..348af021815 --- /dev/null +++ b/paddle/phi/kernels/pool_kernel.h @@ -0,0 +1,105 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void Pool2dKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void Pool2dGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void MaxPool2dWithIndexKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask); + +template +void Pool3dKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void Pool3dGPUDNNKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool ceil_mode, + bool exclusive, + const std::string& data_format, + const std::string& pooling_type, + bool global_pooling, + bool adaptive, + const std::string& padding_algorithm, + DenseTensor* out); + +template +void MaxPool3dWithIndexKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& kernel_size, + const std::vector& strides, + const std::vector& paddings, + bool global_pooling, + bool adaptive, + DenseTensor* out, + DenseTensor* mask); + +} // namespace phi diff --git 
a/paddle/phi/ops/compat/pool_sig.cc b/paddle/phi/ops/compat/pool_sig.cc new file mode 100644 index 00000000000..390d3db5e78 --- /dev/null +++ b/paddle/phi/ops/compat/pool_sig.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature Pool2dOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("pool2d", + {"X"}, + {"ksize", + "strides", + "paddings", + "ceil_mode", + "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {"Out"}); +} + +KernelSignature Pool2dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("pool2d_grad", + {"X", "Out", GradVarName("Out")}, + {"ksize", + "strides", + "paddings", + "ceil_mode", + "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {GradVarName("X")}); +} + +KernelSignature Pool2dDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("pool2d_double_grad", + {"X"}, + {"ksize", + "strides", + "paddings", + "ceil_mode", + "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {"Out"}); +} + +KernelSignature MaxPool2dWithIndexOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_pool2d_with_index", + {"X"}, + {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, + {"Out", "Mask"}); +} + +KernelSignature MaxPool2dWithIndexGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_pool2d_with_index_grad", + {"X", "Mask", GradVarName("Out")}, + {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, + {GradVarName("X")}); +} + +KernelSignature Pool3dOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("pool3d", + {"X"}, + {"ksize", + "strides", + "paddings", + "ceil_mode", + "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {"Out"}); +} + +KernelSignature Pool3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("pool3d_grad", + {"X", "Out", GradVarName("Out")}, + {"ksize", + "strides", + "paddings", + "ceil_mode", + "exclusive", + "data_format", + "pooling_type", + "global_pooling", + "adaptive", + "padding_algorithm"}, + {GradVarName("X")}); +} + +KernelSignature MaxPool3dWithIndexOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_pool3d_with_index", + {"X"}, + {"ksize", "strides", "paddings", "global_pooling", "adaptive"}, + {"Out", "Mask"}); +} + +KernelSignature MaxPool3dWithIndexGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_pool3d_with_index_grad", + {"X", "Mask", GradVarName("Out")}, + {"ksize", "strides", "paddings", "global_pooling", 
"adaptive"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(pool2d, phi::Pool2dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pool2d_grad, phi::Pool2dGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pool2d_double_grad, + phi::Pool2dDoubleGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(max_pool2d_with_index, + phi::MaxPool2dWithIndexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(max_pool2d_with_index_grad, + phi::MaxPool2dWithIndexGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(pool3d, phi::Pool3dOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pool3d_grad, phi::Pool3dGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(max_pool3d_with_index, + phi::MaxPool3dWithIndexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(max_pool3d_with_index_grad, + phi::MaxPool3dWithIndexGradOpArgumentMapping); diff --git a/paddle/phi/tests/core/test_meta_fn_utils.cc b/paddle/phi/tests/core/test_meta_fn_utils.cc index f4288c2aa2f..399112d09c2 100644 --- a/paddle/phi/tests/core/test_meta_fn_utils.cc +++ b/paddle/phi/tests/core/test_meta_fn_utils.cc @@ -52,7 +52,7 @@ TEST(MetaFnFactory, InferMetaFnExists) { phi::InferMetaContext ctx; ctx.EmplaceBackInput(shared_meat_x); ctx.EmplaceBackOutput(shared_meta_out); - ctx.SetMetaConfig(/*is_runtime=*/true); + ctx.SetMetaConfig({/*is_runtime =*/true, /*is_run_mkldnn_kernel=*/false}); phi::MetaFnFactory::Instance().Get("sign")(&ctx); EXPECT_EQ(dense_out1.dims().size(), dense_out2.dims().size()); @@ -78,7 +78,7 @@ TEST(MetaFnFactory, CopyInferMetaFn) { ctx.EmplaceBackAttr(Backend::CPU); ctx.EmplaceBackAttr(false); ctx.EmplaceBackOutput(shared_meta_out); - ctx.SetMetaConfig(/*is_runtime=*/true); + ctx.SetMetaConfig({/*is_runtime =*/true, /*is_run_mkldnn_kernel=*/false}); phi::MetaFnFactory::Instance().Get("copy_to")(&ctx); EXPECT_EQ(dense_out1.dims().size(), dense_out2.dims().size()); @@ -105,7 +105,7 @@ TEST(MetaFnFactory, SplitInferMetaFn) { ctx.EmplaceBackAttr(num_or_sections); ctx.EmplaceBackAttr(axis); ctx.EmplaceBackOutputs(out); - ctx.SetMetaConfig(/*is_runtime=*/true); + ctx.SetMetaConfig({/*is_runtime =*/true, /*is_run_mkldnn_kernel=*/false}); phi::MetaFnFactory::Instance().Get("split")(&ctx); ASSERT_EQ(dense_out1.dims().size(), 2); -- GitLab From 29453da112c8530b64bda8fbb86ec458226977bf Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Mon, 14 Mar 2022 10:48:54 +0800 Subject: [PATCH 022/176] Fix bug when eigen_device() is nullptr in top_k (#40459) --- paddle/phi/kernels/gpu/top_k_kernel.cu | 38 ++++++++++++++------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/paddle/phi/kernels/gpu/top_k_kernel.cu b/paddle/phi/kernels/gpu/top_k_kernel.cu index 4e9aa88c6cb..7f06af7de43 100644 --- a/paddle/phi/kernels/gpu/top_k_kernel.cu +++ b/paddle/phi/kernels/gpu/top_k_kernel.cu @@ -78,15 +78,16 @@ void TopkKernel(const Context& dev_ctx, // The conclusion is drawn from the data through multiple sets of // statistics if (input_width >= 128 && k >= input_width * 0.75) { - if (ops::SortTopk( - paddle::platform::CUDADeviceContext(dev_ctx.GetPlace()), - input, - input_width, - input_height, - k, - out, - indices, - largest)) { + auto* ctx = reinterpret_cast( + &dev_ctx); + if (ops::SortTopk(*ctx, + input, + input_width, + input_height, + k, + out, + indices, + largest)) { // Successed, return. 
return; } else { @@ -181,15 +182,16 @@ void TopkKernel(const Context& dev_ctx, // The conclusion is drawn from the data through multiple sets of // statistics if (input_width >= 128 && k >= input_width * 0.75) { - if (ops::SortTopk( - paddle::platform::CUDADeviceContext(dev_ctx.GetPlace()), - &trans_input, - input_width, - input_height, - k, - &trans_out, - &trans_ind, - largest)) { + auto* ctx = reinterpret_cast( + &dev_ctx); + if (ops::SortTopk(*ctx, + &trans_input, + input_width, + input_height, + k, + &trans_out, + &trans_ind, + largest)) { // last step, tranpose back the indices and output funcs::TransCompute( ndims, dev_ctx, trans_ind, indices, trans); -- GitLab From 95a526b23c4a3de9c3b01ccf1ba281dca9bc5922 Mon Sep 17 00:00:00 2001 From: huzhiqiang <912790387@qq.com> Date: Sun, 13 Mar 2022 22:00:29 -0500 Subject: [PATCH 023/176] [infrt] add skip list (#40450) --- tools/infrt/get_compat_kernel_signature.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/infrt/get_compat_kernel_signature.py b/tools/infrt/get_compat_kernel_signature.py index 78d59c2aef1..b8c4232076c 100644 --- a/tools/infrt/get_compat_kernel_signature.py +++ b/tools/infrt/get_compat_kernel_signature.py @@ -16,6 +16,8 @@ import os import re import json +skip_list = [] + def parse_compat_registry(kernel_info): name, inputs_str, attrs_str, outputs_str = kernel_info.split(",{") @@ -42,6 +44,8 @@ def get_compat_kernels_info(): compat_files.remove(file_) for file_ in compat_files: + if file_ in skip_list: + continue with open("../../paddle/phi/ops/compat/" + file_) as in_file: txt = in_file.readlines() content = "" -- GitLab From e553f758163e61dbea7acca5c9c78b2fa1ee701c Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Mon, 14 Mar 2022 12:05:30 +0800 Subject: [PATCH 024/176] [multiprocessing] Add paddle.incubate.multiprocessing for sharing tensors between python processes. (#37302) * Add support for paddle.multiprocessing * move multiprocessing to incubate. 
--- paddle/fluid/memory/allocation/CMakeLists.txt | 3 + .../memory/allocation/cuda_ipc_allocator.cc | 80 +++++ .../memory/allocation/cuda_ipc_allocator.h | 56 ++++ .../fluid/memory/allocation/mmap_allocator.cc | 187 ++++++++++-- .../fluid/memory/allocation/mmap_allocator.h | 69 ++++- paddle/fluid/pybind/CMakeLists.txt | 3 + paddle/fluid/pybind/pybind.cc | 284 ++++++++++++++++++ .../fluid/tests/unittests/CMakeLists.txt | 2 + .../unittests/test_paddle_multiprocessing.py | 199 ++++++++++++ .../incubate/multiprocessing/__init__.py | 27 ++ .../incubate/multiprocessing/reductions.py | 189 ++++++++++++ 11 files changed, 1080 insertions(+), 19 deletions(-) create mode 100644 paddle/fluid/memory/allocation/cuda_ipc_allocator.cc create mode 100644 paddle/fluid/memory/allocation/cuda_ipc_allocator.h create mode 100644 python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py create mode 100644 python/paddle/incubate/multiprocessing/__init__.py create mode 100644 python/paddle/incubate/multiprocessing/reductions.py diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index a7a417c29a7..f296ce96d4e 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -131,4 +131,7 @@ cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_aut if(NOT WIN32) cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator) cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator) + if (WITH_GPU) + cc_library(cuda_ipc_allocator SRCS cuda_ipc_allocator.cc DEPS allocator) + endif() endif(NOT WIN32) diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc new file mode 100644 index 00000000000..b2f24d5aed1 --- /dev/null +++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef _WIN32 + +#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" +#include "paddle/fluid/platform/cuda_device_guard.h" + +#include +#include +#include +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace memory { +namespace allocation { + +namespace { +std::mutex ipc_mutex_; +std::unordered_map> ipc_handle_to_baseptr_; +} // namespace + +std::shared_ptr GetIpcBasePtr(std::string handle) { + std::lock_guard lock(ipc_mutex_); + + auto iter = ipc_handle_to_baseptr_.find(handle); + if (iter != ipc_handle_to_baseptr_.end()) { + auto baseptr = iter->second.lock(); + if (baseptr) return baseptr; + } + // The IpcMemHandle can only open once for the same handle, + // so here we cache it here. 
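+  // A process-wide map from handle string to a weak_ptr of the opened base
+  // pointer keeps at most one cudaIpcOpenMemHandle mapping per exported
+  // allocation; the deleter installed below closes the handle and erases the
+  // entry once the last CudaIpcAllocation referencing it is destroyed.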
+ void *baseptr = nullptr; + auto ipc_handle = + reinterpret_cast(handle.c_str()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcOpenMemHandle( + &baseptr, *ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + // Close ipc handle on the same device. + int device_id = platform::GetCurrentDeviceId(); + // Add deleter to close ipc handle. + auto sp = std::shared_ptr(baseptr, [handle, device_id](void *ptr) { + platform::CUDADeviceGuard guard(device_id); + std::lock_guard lock(ipc_mutex_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcCloseMemHandle(ptr)); + ipc_handle_to_baseptr_.erase(handle); + VLOG(6) << "cudaIpcCloseMemHandle for ptr:" + << "\t" << ptr; + }); + std::weak_ptr wp = sp; + ipc_handle_to_baseptr_.insert(iter, {handle, wp}); + + return sp; +} + +CudaIpcAllocation::~CudaIpcAllocation() { + shared_ptr_.reset(); + VLOG(6) << "tensor deleted cudaIpcCloseMemHandle for ptr:" + << "\t" << this->ptr(); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle + +#endif diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.h b/paddle/fluid/memory/allocation/cuda_ipc_allocator.h new file mode 100644 index 00000000000..52e3cf10ea7 --- /dev/null +++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.h @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
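+
+// CudaIpcAllocation wraps memory that was opened from another process's
+// cudaIpcMemHandle: it holds a shared_ptr to the base pointer returned by
+// GetIpcBasePtr so the underlying mapping stays valid until the last tensor
+// referencing it is destroyed.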
+ +#ifndef _WIN32 +#pragma once + +#include +#include // NOLINT +#include +#include +#include + +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace memory { +namespace allocation { + +std::shared_ptr GetIpcBasePtr(std::string handle); + +class CudaIpcAllocation : public Allocation { + public: + explicit CudaIpcAllocation(void *ptr, size_t size, int device_id, + std::shared_ptr shared_ptr) + : Allocation(ptr, size, platform::CUDAPlace(device_id)), + device_id_(std::move(device_id)), + shared_ptr_(std::move(shared_ptr)) {} + + inline const int &device_id() const { return device_id_; } + + ~CudaIpcAllocation() override; + + private: + int device_id_; + std::shared_ptr shared_ptr_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle + +#endif diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc index acaf5d54855..25c2235cce8 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.cc +++ b/paddle/fluid/memory/allocation/mmap_allocator.cc @@ -29,6 +29,155 @@ namespace paddle { namespace memory { namespace allocation { +std::string GetIPCName() { + static std::random_device rd; + std::string handle = "/paddle_"; +#ifdef _WIN32 + handle += std::to_string(GetCurrentProcessId()); +#else + handle += std::to_string(getpid()); +#endif + handle += "_"; + handle += std::to_string(rd()); + return handle; +} + +struct CountInfo { + std::atomic refcount; +}; + +void AllocateMemoryMap(std::string filename, int flags, size_t size, + void **map_ptr_, int *fd_) { + // TODO(@ZHUI): support win32 + int file_flags = 0; + int fd = -1; + if (flags & MAPPED_SHAREDMEM) { + file_flags = O_RDWR | O_CREAT; + } else { + file_flags = O_RDONLY; + } + if (flags & MAPPED_EXCLUSIVE) { + file_flags |= O_EXCL; + } + if (flags & MAPPED_NOCREATE) { + file_flags &= ~O_CREAT; + } + + if (!(flags & MAPPED_FROMFD)) { + if (flags & MAPPED_SHAREDMEM) { + fd = shm_open(filename.c_str(), file_flags, (mode_t)0600); + PADDLE_ENFORCE_NE( + fd, -1, + platform::errors::Unavailable( + "File descriptor %s open failed, unable in read-write mode", + filename.c_str())); + VLOG(6) << "shm_open: " << filename; + } + } else { + fd = -1; + } + + PADDLE_ENFORCE_EQ(ftruncate(fd, size), 0, + platform::errors::Unavailable( + "Fruncate a file to a specified length failed!")); + + if (flags & MAPPED_SHAREDMEM) { + *map_ptr_ = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + } else { + *map_ptr_ = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + } + + PADDLE_ENFORCE_NE(*map_ptr_, MAP_FAILED, + platform::errors::Unavailable( + "Memory map failed when create shared memory.")); + + if (flags & MAPPED_KEEPFD) { + *fd_ = fd; + } else { + PADDLE_ENFORCE_NE(::close(fd), -1, + platform::errors::Unavailable( + "Error closing memory maped file <", filename, ">")); + + *fd_ = -1; + } +} + +std::shared_ptr +AllocateRefcountedMemoryMapAllocation(std::string filename, int flags, + size_t size) { + int fd = -1; + void *base_ptr = nullptr; + AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd); + void *aliged_base_ptr = + static_cast(static_cast(base_ptr) + mmap_alignment); + return std::make_shared(aliged_base_ptr, size, + filename, flags, fd); +} + +RefcountedMemoryMapAllocation::RefcountedMemoryMapAllocation( + void *ptr, size_t size, 
std::string ipc_name, int fd, int flags) + : MemoryMapAllocation(ptr, size, ipc_name, fd, flags) { + // must reset base ptr first. + resetBaseptr(); + initializeRefercount(); +} + +void MemoryMapAllocation::close() { + if (closed_) { + return; + } + closed_ = true; +} + +MemoryMapAllocation::~MemoryMapAllocation() { close(); } + +void RefcountedMemoryMapAllocation::incref() { + CountInfo *info = static_cast(map_ptr_); + ++info->refcount; +} + +int RefcountedMemoryMapAllocation::decref() { + CountInfo *info = static_cast(map_ptr_); + return --info->refcount == 0; +} + +void RefcountedMemoryMapAllocation::resetBaseptr() { + map_ptr_ = + static_cast(static_cast(map_ptr_) - mmap_alignment); + map_size_ = map_size_ + mmap_alignment; +} + +void RefcountedMemoryMapAllocation::initializeRefercount() { + CountInfo *info = reinterpret_cast(map_ptr_); + + if (flags_ & MAPPED_EXCLUSIVE) { + new (&info->refcount) std::atomic(1); + } else { + info->refcount++; + } +} + +void RefcountedMemoryMapAllocation::close() { + if (closed_) { + return; + } + closed_ = true; + void *data = map_ptr_; + CountInfo *info = reinterpret_cast(data); + if (--info->refcount == 0) { + PADDLE_ENFORCE_NE( + shm_unlink(ipc_name_.c_str()), -1, + platform::errors::Unavailable( + "could not unlink the shared memory file ", ipc_name_)); + VLOG(6) << "shm_unlink file: " << ipc_name_; + } + + PADDLE_ENFORCE_NE( + munmap(map_ptr_, map_size_), -1, + platform::errors::Unavailable("could not unmap the shared memory file: ", + strerror(errno), " (", errno, ")")); +} + MemoryMapWriterAllocation::~MemoryMapWriterAllocation() { PADDLE_ENFORCE_NE( munmap(this->ptr(), this->size()), -1, @@ -44,30 +193,30 @@ MemoryMapReaderAllocation::~MemoryMapReaderAllocation() { /* Here we do not pay attention to the result of shm_unlink, because the memory mapped file may have been cleared due to the MemoryMapFdSet::Clear() */ + + // Code of DataLoader subprocess: + // + // core._array_to_share_memory_tensor(b) + // out_queue.put((idx, tensor_list, structure)) + // core._remove_tensor_list_mmap_fds(tensor_list) + + /* If the tensor in already in the send queue, the tensor will be + * deconstructed by the function. If the tensor not send yet, it + * will be cleared by MemoryMapFdSet::Clear(). + * If the `_remove_tensor_list_mmap_fds` have be interrupted, the + * tensor will be cleared by both methods. 
+ * */ + shm_unlink(this->ipc_name().c_str()); MemoryMapFdSet::Instance().Remove(this->ipc_name()); VLOG(3) << "~MemoryMapReaderAllocation: " << this->ipc_name(); } -std::string GetIPCName() { - static std::random_device rd; - std::string handle = "/paddle_"; -#ifdef _WIN32 - handle += std::to_string(GetCurrentProcessId()); -#else - handle += std::to_string(getpid()); -#endif - handle += "_"; - handle += std::to_string(rd()); - return handle; -} - std::shared_ptr AllocateMemoryMapWriterAllocation( size_t size) { const std::string &ipc_name = GetIPCName(); int flags = O_RDWR | O_CREAT; - - int fd = shm_open(ipc_name.c_str(), flags, 0644); + int fd = shm_open(ipc_name.c_str(), flags, 0600); PADDLE_ENFORCE_NE( fd, -1, platform::errors::Unavailable("File descriptor %s open failed", ipc_name.c_str())); @@ -86,12 +235,14 @@ std::shared_ptr AllocateMemoryMapWriterAllocation( std::shared_ptr RebuildMemoryMapReaderAllocation( const std::string &ipc_name, size_t size) { - int fd = shm_open(ipc_name.c_str(), O_RDONLY, 0644); + int flags = O_RDWR | O_CREAT; + flags &= ~O_CREAT; + + int fd = shm_open(ipc_name.c_str(), flags, 0600); PADDLE_ENFORCE_NE( fd, -1, platform::errors::Unavailable("File descriptor %s open failed", ipc_name.c_str())); - - void *ptr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + void *ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); PADDLE_ENFORCE_NE(ptr, MAP_FAILED, platform::errors::Unavailable( "Memory map failed when rebuild shared memory.")); diff --git a/paddle/fluid/memory/allocation/mmap_allocator.h b/paddle/fluid/memory/allocation/mmap_allocator.h index 3f91e5c4278..4f8dbfbb51e 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.h +++ b/paddle/fluid/memory/allocation/mmap_allocator.h @@ -16,8 +16,9 @@ #ifndef _WIN32 +#include #include -#include // NOLINT +#include #include #include #include @@ -28,6 +29,72 @@ namespace paddle { namespace memory { namespace allocation { +std::string GetIPCName(); + +static constexpr int64_t mmap_alignment = 64; + +enum MappedModes { + MAPPED_SHAREDMEM = 1, + MAPPED_EXCLUSIVE = 2, + MAPPED_NOCREATE = 4, + MAPPED_KEEPFD = 8, + MAPPED_FROMFD = 16, + MAPPED_UNLINK = 32 +}; + +class MemoryMapAllocation : public Allocation { + public: + explicit MemoryMapAllocation(void *ptr, size_t size, std::string ipc_name) + : Allocation(ptr, size, platform::CPUPlace()), + ipc_name_(std::move(ipc_name)), + map_ptr_(ptr), + map_size_(size) {} + explicit MemoryMapAllocation(void *ptr, size_t size, std::string ipc_name, + int flags, int fd) + : Allocation(ptr, size, platform::CPUPlace()), + ipc_name_(std::move(ipc_name)), + fd_(fd), + flags_(flags), + map_ptr_(ptr), + map_size_(size) {} + + inline const std::string &ipc_name() const { return ipc_name_; } + + virtual void close(); + + ~MemoryMapAllocation() override; + + protected: + std::string ipc_name_; + int fd_ = -1; + int flags_ = 0; + void *map_ptr_ = nullptr; + size_t map_size_ = 0; + bool closed_ = false; +}; + +class RefcountedMemoryMapAllocation : public MemoryMapAllocation { + public: + RefcountedMemoryMapAllocation(void *ptr, size_t size, std::string ipc_name, + int flags, int fd); + + void incref(); + int decref(); + void close() override; + virtual ~RefcountedMemoryMapAllocation() { close(); } + + protected: + void initializeRefercount(); + void resetBaseptr(); +}; + +void AllocateMemoryMap(std::string filename, int flags, size_t size, + void **base_ptr_, int *fd_); + +std::shared_ptr +AllocateRefcountedMemoryMapAllocation(std::string filename, int flags, + size_t 
size); + class MemoryMapWriterAllocation : public Allocation { public: explicit MemoryMapWriterAllocation(void *ptr, size_t size, diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 8ee22590b6d..2e901f3bffd 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -44,6 +44,9 @@ endif() if(NOT WIN32) set(PYBIND_DEPS ${PYBIND_DEPS} data_loader) set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator) + if (WITH_GPU) + set(PYBIND_DEPS ${PYBIND_DEPS} cuda_ipc_allocator) + endif() if (WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context) set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 98880294a27..ee6dce5dc23 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -64,6 +64,9 @@ limitations under the License. */ #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" +#endif #include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" @@ -1187,6 +1190,287 @@ PYBIND11_MODULE(core_noavx, m) { }); #else }) +#ifdef PADDLE_WITH_CUDA + .def("_share_buffer_with", + [](framework::Tensor &self, const framework::Tensor src, + py::tuple t) { + auto *cuda_ipc_allocation = + dynamic_cast( + src.Holder().get()); + + PADDLE_ENFORCE_NOT_NULL( + cuda_ipc_allocation, + platform::errors::PreconditionNotMet( + "Tensor is not Cuda IPC shared tensor. " + "Now only Tensor shared by cuda ipc could use this " + "api.")); + + size_t size = t[0].cast(); + auto dtype = + static_cast(t[1].cast()); + auto dims = phi::make_ddim(t[2].cast>()); + auto lod_info = t[3].cast(); + auto device_id = t[4].cast(); + + auto shared_reader_holder = + std::make_shared( + cuda_ipc_allocation->ptr(), + cuda_ipc_allocation->base_ptr(), size, + platform::CUDAPlace(device_id)); + + self.ResetHolderWithType(shared_reader_holder, dtype); + self.Resize(dims); + self.set_lod(lod_info); + + VLOG(6) << "Reconstructed tensor with buffer shared!"; + }, + R"DOC( + Deserialize GPU Tensor for existed shared Cuda IPC tensor. + + Params: + tensor: Shared Cuda IPC tensor. + tuple: contrains data size, data type, + tensor dims, lod information, device index. + + )DOC") + .def("_share_cuda", + [](framework::Tensor self) { + if (!self.IsInitialized() || self.numel() == 0) + throw std::runtime_error( + "Tensor not initialized or numel is 0. could not pass " + "to shared memory. "); + + auto *holder = dynamic_cast( + self.Holder().get()); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(holder->place()), true, + platform::errors::InvalidArgument( + "Tensor is not on GPU. share_cuda only support GPU " + "Tensor, share_filename is for CPU tensor.")); + + void *base_ptr = holder->base_ptr(); + ptrdiff_t offset_bytes = reinterpret_cast(holder->ptr()) - + reinterpret_cast(base_ptr); + + cudaIpcMemHandle_t handle; + PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcGetMemHandle(&handle, base_ptr)); + + auto _handle = py::bytes(reinterpret_cast(&handle), + (py::ssize_t)CUDA_IPC_HANDLE_SIZE); + + // TODO(ZHUI): use cuda event, to avoid sync. 
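+          // Synchronizing the current stream below makes sure any writes
+          // pending on this tensor are visible before the IPC handle is
+          // handed to another process.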
+ const auto &device_id = paddle::platform::GetCurrentDeviceId(); + auto stream = + paddle::platform::stream::get_current_stream(device_id); + stream->Synchronize(); + + int type_idx = static_cast(self.type()); + size_t data_size = + self.numel() * + framework::SizeOfType( + framework::TransToProtoVarType(self.type())); + + return py::make_tuple(_handle, (py::size_t)offset_bytes, data_size, + type_idx, vectorize(self.dims()), self.lod(), + device_id); + }, + R"DOC( + Serialize GPU Tensor by cudaIpcMemHandle. + + Returns: + tuple: contrains handle, data size, data type, + tensor dims, lod information, device index. + + Examples: + .. code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_cuda() + + )DOC") + .def("_new_shared_cuda", + [](py::tuple t) { + if (t.size() != 7) + throw std::runtime_error( + "Invalid Tensor meta info for shared cuda tensor!"); + + // 1. Create a new C++ instance + framework::Tensor tensor; + + // 2. Rebuild Allocation from handle + const std::string &handle = t[0].cast(); + ptrdiff_t offset_bytes = (ptrdiff_t)t[1].cast(); + auto device_id = t[6].cast(); + auto base_ptr = memory::allocation::GetIpcBasePtr(handle); + size_t size = t[2].cast(); + void *dev = base_ptr.get(); + dev = reinterpret_cast(dev) + offset_bytes; + + auto shared_reader_holder = + std::make_shared( + dev, size, device_id, std::move(base_ptr)); + + // 3. Rebuild Tensor + tensor.ResetHolderWithType( + shared_reader_holder, + static_cast(t[3].cast())); + tensor.Resize(phi::make_ddim(t[4].cast>())); + tensor.set_lod(t[5].cast()); + + return tensor; + }, + R"DOC( + Deserialize GPU lod tensor from cudaIpcMemHandle. + + Params: + tuple: contrains handle, data size, data type, + tensor dims, lod information, device index. + + Examples: + .. code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_cuda() + tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_cuda(metainfo)) + + )DOC") +#endif + .def("_share_filename", + [](framework::Tensor &self) { + if (!self.IsInitialized() || self.numel() == 0) + throw std::runtime_error( + "Tensor not initialized or numel is 0. could not pass to " + "shared memory. "); + + auto holder = self.Holder(); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(holder->place()) || + platform::is_cuda_pinned_place(holder->place()), + true, platform::errors::InvalidArgument( + "Tensor is not on CPU. share_filename only " + "support CPU Tensor.")); + + auto *mmap_allocation = dynamic_cast< + memory::allocation::RefcountedMemoryMapAllocation *>( + holder.get()); + // If the tensor is not shared, allocate memory map allocation. 
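Note: on the CPU path below, the first export copies the data into a freshly created /dev/shm mapping and resets the tensor's holder to it. A round-trip sketch that follows the two docstring examples in this hunk; it assumes Linux, uses the private `_share_filename`/`_new_shared_filename` bindings added here, and the rebuild would normally happen in a different process.

    import paddle
    from paddle.fluid import core

    paddle.set_device("cpu")
    src = paddle.ones([3, 3])

    # Producer: returns (ipc_name, size, dtype_idx, dims, lod); also resets
    # src's holder so src itself now lives in the shared mapping.
    meta = src.value().get_tensor()._share_filename()

    # Consumer: reopen the existing mapping (MAPPED_NOCREATE) and wrap it.
    dst = paddle.to_tensor(core.LoDTensor._new_shared_filename(meta))
    print(dst)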
+ if (mmap_allocation == nullptr) { + void *data_ptr = self.data(); + size_t data_size = + self.numel() * + framework::SizeOfType( + framework::TransToProtoVarType(self.type())); + + int flags = memory::allocation::MAPPED_SHAREDMEM | + memory::allocation::MAPPED_EXCLUSIVE; + std::string handle = memory::allocation::GetIPCName(); + auto shared_holder = + memory::allocation::AllocateRefcountedMemoryMapAllocation( + handle, flags, data_size); + + // copy data & reset holder + if (platform::is_cuda_pinned_place(holder->place())) { +#ifdef PADDLE_WITH_CUDA + memory::Copy(platform::CPUPlace(), shared_holder->ptr(), + platform::CUDAPinnedPlace(), data_ptr, data_size); +#endif + } else { + memory::Copy(platform::CPUPlace(), shared_holder->ptr(), + platform::CPUPlace(), data_ptr, data_size); + } + self.ResetHolder(shared_holder); + mmap_allocation = shared_holder.get(); + } + int type_idx = static_cast(self.type()); + + return py::make_tuple(mmap_allocation->ipc_name(), + mmap_allocation->size(), type_idx, + vectorize(self.dims()), self.lod()); + }, + R"DOC( + Serialize CPU lod tensor in shared memory to tuple. + If the tensor is not in shared memory, we will copy it first. + + Returns: + tuple: contrains ipc name, data size, data type, + tensor dims and lod imformation. + + Examples: + .. code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_filename() + + )DOC") + .def("_new_shared_filename", + [](py::tuple t) { // __setstate__ + if (t.size() != 5) + throw std::runtime_error("Invalid Tensor meta info state!"); + + framework::Tensor tensor; + + // 2. Rebuild Allocation + const std::string &ipc_name = t[0].cast(); + size_t size = t[1].cast(); + int flags = memory::allocation::MAPPED_SHAREDMEM | + memory::allocation::MAPPED_NOCREATE; + + auto shared_holder = + memory::allocation::AllocateRefcountedMemoryMapAllocation( + ipc_name, flags, size); + + // 3. Rebuild Tensor + tensor.ResetHolderWithType( + shared_holder, + static_cast(t[2].cast())); + tensor.Resize(phi::make_ddim(t[3].cast>())); + tensor.set_lod(t[4].cast()); + + return tensor; + }, + R"DOC( + Deserialize CPU lod tensor from shared memory. + + Params: + tuple: contrains ipc file name, data size, data type, + tensor dims and lod information. + + Examples: + .. code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_filename() + tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_filename(metainfo)) + + )DOC") + .def("_shared_incref", + [](framework::Tensor &self) { + auto *mmap_allocation = dynamic_cast< + memory::allocation::RefcountedMemoryMapAllocation *>( + self.Holder().get()); + if (mmap_allocation) { + mmap_allocation->incref(); + } + }, + R"DOC( + Increase reference count of share_filename tensor. + )DOC") + .def("_shared_decref", + [](framework::Tensor &self) { + auto *mmap_allocation = dynamic_cast< + memory::allocation::RefcountedMemoryMapAllocation *>( + self.Holder().get()); + if (mmap_allocation) { + mmap_allocation->decref(); + } + }, + R"DOC( + Decrease reference count of share_filename tensor. 
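Note: the `_shared_incref`/`_shared_decref` pair bound here is how the reducer added later in this patch keeps the shared file alive across the hand-off: the sender increfs right after exporting, the receiver decrefs once it has rebuilt its view. A compressed single-process sketch of that protocol, assuming Linux and the private bindings from this diff.

    import paddle
    from paddle.fluid import core

    paddle.set_device("cpu")
    t = paddle.ones([2, 2])
    lodtensor = t.value().get_tensor()

    meta = lodtensor._share_filename()   # export (first call copies to shm)
    lodtensor._shared_incref()           # sender-side extra reference

    rebuilt = core.LoDTensor._new_shared_filename(meta)
    rebuilt._shared_decref()             # receiver drops the extra reference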
+ )DOC") .def(py::pickle( [](const framework::Tensor &t) { // __getstate__ auto holder = t.Holder(); diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index e75b8d1f60b..b05f16a0606 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -557,6 +557,7 @@ if (APPLE OR WIN32) list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_exception) list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_iterable_dataset) list(REMOVE_ITEM TEST_OPS test_multiprocess_dataloader_dataset) + list(REMOVE_ITEM TEST_OPS test_paddle_multiprocessing) endif() if (NOT WITH_GLOO) @@ -1174,6 +1175,7 @@ if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) test_collective_global_scatter PROPERTIES LABELS "RUN_TYPE=DIST") endif() + set_tests_properties(test_paddle_multiprocessing PROPERTIES TIMEOUT 120) set_tests_properties(test_reducescatter_api PROPERTIES TIMEOUT 120) set_tests_properties(test_broadcast PROPERTIES TIMEOUT 120) set_tests_properties(test_reducescatter PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py b/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py new file mode 100644 index 00000000000..1e31356a6bc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_paddle_multiprocessing.py @@ -0,0 +1,199 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import gc +import sys +import unittest +import time +import paddle +import paddle.incubate.multiprocessing as mp + +REPEAT = 20 +HAS_SHM_FILES = os.path.isdir('/dev/shm') + + +def fill_tensor(queue, event): + data = queue.get() + with paddle.no_grad(): + data[0][:] = 5 + data[1][:] = 5 + + event.set() + + +def send_tensor(queue, event, device, dtype): + tensor = paddle.ones([5, 5], dtype=dtype) + queue.put(tensor) + queue.put(tensor) + event.wait() + + +def send_parambase(queue, event, device, dtype): + tensor = paddle.nn.Layer().create_parameter( + [5, 5], + dtype=dtype, + default_initializer=paddle.nn.initializer.Constant(value=1.0)) + queue.put(tensor) + queue.put(tensor) + event.wait() + + +class leak_checker(object): + def __init__(self, test_case): + self.checked_pids = [os.getpid()] + self.test_case = test_case + + def __enter__(self): + self.next_fds = self._get_next_fds(10) + return self + + def __exit__(self, *args): + if args[0] is None: + self.test_case.assertFalse(self.has_shm_files()) + return False + + def check_pid(self, pid): + self.checked_pids.append(pid) + + def _get_next_fds(self, n=1): + fds = [os.dup(0) for i in range(n)] + for fd in fds: + os.close(fd) + return fds + + def has_shm_files(self, wait=True): + if not HAS_SHM_FILES: + return False + result = self._has_shm_files() + if result and wait: + time.sleep(0.5) + return self._has_shm_files() + return result + + def _has_shm_files(self): + gc.collect() + names = ['paddle_' + str(pid) for pid in self.checked_pids] + for filename in os.listdir('/dev/shm'): + for name in names: + if filename.startswith(name): + print("have", filename) + return True + return False + + +class TestMultiprocessingBase(unittest.TestCase): + def get_tensor(self, device="cpu"): + self.device = device.lower() + place = None + tensor = paddle.zeros([5, 5], dtype="float32") + return tensor + + def get_parameter(self): + w = paddle.nn.Layer().create_parameter( + [10, 10], + default_initializer=paddle.nn.initializer.Constant(value=0.0)) + return w + + def _test_empty(self, dtype="float32"): + q = mp.Queue() + empty = paddle.to_tensor([], dtype=dtype) + q.put(empty) + out = q.get(timeout=1) + self.assertEqual(str(out), str(empty)) + + def _test_sharing(self, + ctx=mp, + device='cpu', + dtype="float32", + repeat=1, + param=False): + def test_fill(): + if param: + x = self.get_parameter() + y = (x[:, 1]).detach() + else: + x = self.get_tensor() + y = x[:, 1] + + data = [x, y] + + queue = ctx.Queue() + event = ctx.Event() + queue.put(data) + + process = ctx.Process(target=fill_tensor, args=(queue, event)) + process.daemon = True + lc.check_pid(process.pid) + process.start() + + event.wait(30) + + self.assertTrue(event.is_set()) + self.assertTrue(data[0].equal(5).all()) + self.assertTrue(data[1].equal(5).all()) + + process.join(1 if device != "gpu" else 10) + self.assertFalse(process.is_alive()) + + def test_receive(): + queue = ctx.Queue() + event = ctx.Event() + + process = ctx.Process( + target=send_parambase if param else send_tensor, + args=(queue, event, device, dtype)) + process.daemon = True + lc.check_pid(process.pid) + process.start() + + t1 = queue.get() + t2 = queue.get() + self.assertTrue(t1.equal(1).all()) + del t1, t2 + + event.set() + process.join(1 if device != "gpu" else 10) + self.assertFalse(process.is_alive()) + + with leak_checker(self) as lc: + for _ in range(repeat): + test_fill() + test_receive() + + +class TestMultiprocessingCpu(TestMultiprocessingBase): + def test_pass_tensor(self): + paddle.set_device("cpu") 
+ self._test_sharing(repeat=REPEAT) + + def test_pass_parambase(self): + paddle.set_device("cpu") + self._test_sharing(repeat=1, param=True) + + def test_pass_empty(self): + paddle.set_device("cpu") + self._test_empty() + + +class TestMultiprocessingGpu(TestMultiprocessingBase): + @unittest.skipIf(not paddle.fluid.core.is_compiled_with_cuda(), + "core is not compiled with CUDA") + def test_pass_tensor(self): + paddle.set_device("gpu") + self._test_sharing(mp.get_context("spawn"), "gpu") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/incubate/multiprocessing/__init__.py b/python/paddle/incubate/multiprocessing/__init__.py new file mode 100644 index 00000000000..27c23be3a89 --- /dev/null +++ b/python/paddle/incubate/multiprocessing/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .reductions import init_reductions +import multiprocessing + +__all__ = [] + +from multiprocessing import * # noqa: F403 + +__all__ += multiprocessing.__all__ # type: ignore[attr-defined] + +# Only support linux for now +# Only support file_system sharing strategy. + +init_reductions() diff --git a/python/paddle/incubate/multiprocessing/reductions.py b/python/paddle/incubate/multiprocessing/reductions.py new file mode 100644 index 00000000000..cfbc55afd3b --- /dev/null +++ b/python/paddle/incubate/multiprocessing/reductions.py @@ -0,0 +1,189 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + +# TODO: check the hooks of tensor +# TODO: check serializing named tensor +# TODO: check influence on autograd +import os +import sys +import warnings +import math +import copy +import threading +import multiprocessing +from multiprocessing.util import register_after_fork +from multiprocessing.reduction import ForkingPickler + +from collections import OrderedDict + + +def _supported_check(): + if sys.platform != "linux": + # warnings.warn("`paddle.multiprocessing` only support linux for now, " + # " import this will not take any effect !") + + return False + + if not sys.version_info >= (3, 4): + warnings.warn("Use `paddle.multiprocessing` to share paddle tensor " + "requires python version greater than 3.4 ." 
+ " `paddle.multiprocessing` will not take any effect !!!") + return False + + return True + + +class LRUSharedCache(OrderedDict): + def __init__(self): + self.limit = 128 + self._after_fork() + register_after_fork(self, LRUSharedCache._after_fork) + + def _after_fork(self): + self.lock = threading.Lock() + + def get(self, key): + with self.lock: + try: + value = super().pop(key) + super().__setitem__(key, value) + return value + except KeyError: + return None + + def __setitem__(self, key, value): + with self.lock: + try: + super().__delitem__(key) + except KeyError: + if len(self) >= self.limit: + super().popitem(last=False) + super().__setitem__(key, value) + + +shared_cache = LRUSharedCache() + + +def cuda_from_cache(key): + lodtensor = shared_cache.get(key) + if lodtensor is None: + return None + return lodtensor + + +def rebuild_tensor(cls, lodtensor, metadata): + if cls == paddle.fluid.framework.ParamBase: + tensor = paddle.fluid.framework.ParamBase(lodtensor.shape(), + lodtensor._dtype(), + **metadata) + tensor.value().get_tensor()._share_data_with(lodtensor) + else: + size, stop_gradient = metadata + tensor = paddle.fluid.core.VarBase() + if lodtensor._is_initialized(): + tensor.value().get_tensor()._share_data_with(lodtensor) + else: + tensor = paddle.to_tensor([], dtype=lodtensor._dtype()) + tensor.stop_gradient = stop_gradient + return tensor + + +def reduce_tensor(tensor): + lodtensor = tensor.value().get_tensor() + + if not tensor.stop_gradient and not tensor.is_leaf: + raise RuntimeError( + "Refusing to serialize non-leaf tensor which not stop_gradient, you can detach it!" + ) + # TODO: add serializing name and hooks check + if tensor.place.is_cpu_place() or tensor.place.is_gpu_place( + ) or tensor.place.is_cuda_pinned_place(): + if type(tensor) == paddle.fluid.framework.ParamBase: + metadata = copy.deepcopy(tensor.__dict__) + else: + metadata = (tensor.size, tensor.stop_gradient) + + return (rebuild_tensor, (type(tensor), lodtensor, metadata)) + else: + raise ValueError( + "Only support tensors of CPU/CUDA/CUDAPinned Place, Not support %s for now!" + % tensor.place) + + +def rebuild_lodtensor_filename(cls, ipc_name, size, type_idx, dims, lod): + lodtensor = cls._new_shared_filename((ipc_name, size, type_idx, dims, lod)) + lodtensor._shared_decref() + return lodtensor + + +def rebuild_cuda_tensor(cls, handle, offset_bytes, size, type_idx, dims, lod, + device_idx): + cache_tensor = cuda_from_cache((handle, offset_bytes)) + if cache_tensor is None: + lodtensor = cls._new_shared_cuda( + (handle, offset_bytes, size, type_idx, dims, lod, device_idx)) + # We only cache cuda shared tensor here. + # The opening cost of cudaIpcMemoryHandle is very high. + # Since we cache the recived tensor directly, + # The sender may reallocate the tensor space, + # you should manualy maintian the lifecycle of ipc tensor + shared_cache[(handle, offset_bytes)] = lodtensor + else: + lodtensor = paddle.fluid.core.LoDTensor() + lodtensor._share_buffer_with(cache_tensor, + (size, type_idx, dims, lod, device_idx)) + + return lodtensor + + +def rebuild_lodtensor_empty(cls): + #TODO: check if tensor initialized + #TODO: handle the dtype of empty tensor + return cls() + + +def reduce_lodtensor(lodtensor): + if lodtensor._place().is_cpu_place() or lodtensor._place( + ).is_cuda_pinned_place(): + for dim in lodtensor.shape(): + if dim == 0: + # Empty tensors have nothing be mmapped. 
+ return (rebuild_lodtensor_empty, (type(lodtensor), )) + + # Default use share filename stratege + metadata = lodtensor._share_filename( + ) # ipc_name, size, type_idx, dims, lod + rebuild = rebuild_lodtensor_filename + lodtensor._shared_incref() + # TODO, maintain reference for lodtensor + # TODO: support file_discriptor stratege + elif lodtensor._place().is_gpu_place(): + metadata = lodtensor._share_cuda() + rebuild = rebuild_cuda_tensor + else: + raise RuntimeError("We only support pass cpu/gpu lodtensor for now!") + + return (rebuild, (type(lodtensor), ) + metadata) + + +def init_reductions(): + if not _supported_check(): + return + + ForkingPickler.register(paddle.Tensor, reduce_tensor) + ForkingPickler.register(paddle.fluid.core.VarBase, reduce_tensor) + ForkingPickler.register(paddle.fluid.framework.ParamBase, reduce_tensor) + ForkingPickler.register(paddle.fluid.core.LoDTensor, reduce_lodtensor) -- GitLab From e5c59fc9c2921fe153ca9dc62dda48bb63191cea Mon Sep 17 00:00:00 2001 From: zmxdream Date: Mon, 14 Mar 2022 12:10:53 +0800 Subject: [PATCH 025/176] [GPUPS]fix instag lod information (#40483) --- paddle/fluid/operators/filter_by_instag_op.cu | 62 +------------------ 1 file changed, 2 insertions(+), 60 deletions(-) diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu index 508730c3c73..7870efba4e7 100644 --- a/paddle/fluid/operators/filter_by_instag_op.cu +++ b/paddle/fluid/operators/filter_by_instag_op.cu @@ -96,30 +96,6 @@ __global__ void filter_copy_fuse_kernel( if (N < ins_end) ins_end = N; - /* - if (!x1_lods_filled) { - for (int p = ins_start; p < ins_end; p++) { - x1_lods_data[p] = p; - } - if (idx == 0) { - x1_lods_data[N] = N; - } - } - - if (!x2_lods_filled) { - for (int p = ins_start; p < ins_end; p++) { - x2_lods_data[p] = p; - } - if (idx == 0) { - x2_lods_data[N] = N; - } - } - - if (!x1_lods_filled || !x2_lods_filled) { - b.sync(); - } - */ - int flag_data[5]; int prefix_sum_data[5]; int prefix_sum_data2[5]; @@ -173,8 +149,6 @@ __global__ void filter_copy_fuse_kernel( local_addr = prefix_sum_data[ins_end - 1 - ins_start]; sum_addr = local_addr; - // flag - // local_flag = 0; for (int p = ins_start; p < ins_end; p++) { local_flag += flag_data[p - ins_start]; } @@ -188,7 +162,6 @@ __global__ void filter_copy_fuse_kernel( sum_out_lods = local_out_lods; } - // 32 threads for (int i = 1; i < warp_thread_num; i *= 2) { int temp_addr = g.shfl_up(sum_addr, i); int temp_flag = g.shfl_up(sum_flag, i); @@ -266,27 +239,16 @@ __global__ void filter_copy_fuse_kernel( if (ins_start < ins_end) { int out_lods_idx = p_flag + 1; - - // ins_start = 1 - // BUG fix for (int p = ins_start; p < ins_end; p++) { if (flag_data[p - ins_start] == 1) { - // batch_len = 2 - // batch_len = 4 size_t batch_len = x1_lods_data[p + 1] - x1_lods_data[p]; - // t = 0 - // t = 1 int t = out_lods_idx - 1; - // out_lods_data[0] = 0; int previous; - if (out_lods_idx == p_flag + 1) { - // out_lods_data[t] = p_out_lods; previous = p_out_lods; } else { previous = out_lods_data[t]; } - map_data[t * 3] = (int64_t)previous; map_data[t * 3 + 1] = x1_lods_data[p]; map_lods_data[t] = t; @@ -300,7 +262,6 @@ __global__ void filter_copy_fuse_kernel( if (sum_out_lods4 > 1) { int out_data_num = sum_out_lods4 - 1; int out_start = ins_start; - if (out_start < out_data_num) { int out_end = ins_end >= out_data_num ? 
out_data_num : ins_end; for (int p = out_start; p < out_end; p++) { @@ -314,11 +275,8 @@ __global__ void filter_copy_fuse_kernel( if (flag_data[p - ins_start] == 1) { auto output_start_idx = prefix_sum_data2[p - ins_start]; T* dst = out_data + output_start_idx * x1_embed_size; - const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size; const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size; - - // optimized for (const T *j = src_start; j != src_end; dst++, j++) { *dst = *j; } @@ -338,12 +296,10 @@ __global__ void copy_grad_kernel(const size_t N, const int ins_per_thread, int idx = blockIdx.x * blockDim.x + threadIdx.x; int ins_start = idx * ins_per_thread; int ins_end = (idx + 1) * ins_per_thread; - if (ins_start >= N) { return; } if (ins_end > N) ins_end = N; - for (int p = ins_start; p < ins_end; p++) { T* dst = x1_grad_data + map_data[p * 3 + 1] * x1_embed_size; const T* src_start = out_grad_data + map_data[p * 3] * x1_embed_size; @@ -394,21 +350,17 @@ class FilterByInstagGPUKernel : public framework::OpKernel { const Tensor* x3 = context.Input("Filter_tag"); const int64_t* x3_data = x3->data(); - // int x2_lods_filled = 1; - Vector x2_lods; - // Vector, in GPU if (x2->lod().size() != 0) { // lod_level = 1 x2_lods = x2->lod()[0]; - // x2_lods_filled = 1; - } else { // lod_level = 0 const size_t x2_lods_size = x2->dims()[0]; + const size_t instag_per_num = x2->dims()[1]; // x2_lods.resize(x2->dims()[0] + 1); // move to cuda x2_lods.push_back(0); for (size_t i = 0; i < x2_lods_size; i++) { - x2_lods.push_back(i + 1); + x2_lods.push_back(x2_lods.back() + instag_per_num); } } @@ -417,13 +369,8 @@ class FilterByInstagGPUKernel : public framework::OpKernel { size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place); - // Vector, in GPU - // int x1_lods_filled = 1; Vector x1_lods; - if (!is_x1_lod) { - // move to cuda - // x1_lods.resize(x1->dims()[0] + 1); x1_lods.push_back(0); for (int i = 0; i < x1->dims()[0]; i++) { x1_lods.push_back(i + 1); @@ -432,7 +379,6 @@ class FilterByInstagGPUKernel : public framework::OpKernel { // x1_lods = context.Input("Ins")->lod()[0]; // new: lod_level=0 => lod() return {} if (x1->lod().size() != 0) { // lod_level = 1 - // x1_lods_filled = 1; x1_lods = x1->lod()[0]; } else { // lod_level = 0 // x1_lods.resize(x1->dims()[0] + 1); @@ -458,10 +404,6 @@ class FilterByInstagGPUKernel : public framework::OpKernel { LoDTensor* loss_weight = context.Output("LossWeight"); int out_first = x1_lods.back(); - // int out_first = x1->dims()[0]; - // if (x1_lods_filled) { - // out_first = x1_lods.back(); - // } out->Resize(phi::make_ddim({(int64_t)out_first, (int64_t)x1_embed_size})); map->Resize(phi::make_ddim({(int64_t)x2_lods_size, 3})); -- GitLab From 481db5e97dc6bb6cc595adfe2b7b5c9e13c1442c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Mon, 14 Mar 2022 12:49:38 +0800 Subject: [PATCH 026/176] [infrt] unify the infrt dialect. 
test=develop (#40451) --- cmake/external/llvm.cmake | 3 +- paddle/infrt/CMakeLists.txt | 1 - paddle/infrt/api/infrt_api.cc | 4 +- paddle/infrt/dialect/CMakeLists.txt | 10 +-- paddle/infrt/dialect/dense_tensor.h | 2 +- paddle/infrt/dialect/dense_tensor.td | 2 +- paddle/infrt/dialect/infrt/CMakeLists.txt | 18 +--- .../infrt/dialect/infrt/common/CMakeLists.txt | 6 ++ .../infrt/{common_type.cc => common/types.cc} | 2 +- .../infrt/{common_type.h => common/types.h} | 0 paddle/infrt/dialect/infrt/common/utils.cc | 28 +++++++ .../{pd_types.cc => infrt/common/utils.h} | 18 +++- paddle/infrt/dialect/infrt/ir/CMakeLists.txt | 18 ++++ .../dialect/{ => infrt/ir}/basic_kernels.cc | 63 +------------- .../dialect/{ => infrt/ir}/basic_kernels.h | 2 +- .../dialect/{ => infrt/ir}/basic_kernels.td | 39 +-------- .../{infrt_ops_base.td => ir/infrt_base.td} | 17 ++++ .../dialect/infrt/{ => ir}/infrt_dialect.cc | 30 +++++-- .../dialect/infrt/{ => ir}/infrt_dialect.h | 10 +-- .../infrt/dialect/infrt/{ => ir}/infrt_ops.td | 22 ++++- .../dialect/{ => infrt/ir}/test_kernels.cc | 6 +- .../dialect/{ => infrt/ir}/test_kernels.h | 2 +- .../dialect/{ => infrt/ir}/test_kernels.td | 6 +- .../infrt/dialect/infrt/pass/infrt_op_fuse.td | 2 +- .../dialect/infrt/pass/infrt_op_fuse_pass.cc | 2 +- paddle/infrt/dialect/infrt_base.cc | 56 ------------- paddle/infrt/dialect/infrt_base.h | 83 ------------------- paddle/infrt/dialect/infrt_base.td | 33 -------- ...nit_infrt_dialects.cc => init_dialects.cc} | 11 ++- ...{init_infrt_dialects.h => init_dialects.h} | 0 paddle/infrt/dialect/mlir_loader.cc | 2 +- paddle/infrt/dialect/mlir_loader_test.cc | 12 +-- paddle/infrt/dialect/opt.cc | 2 +- paddle/infrt/dialect/pd_op_base.td | 2 +- paddle/infrt/dialect/pd_ops.cc | 1 - paddle/infrt/dialect/pd_ops.h | 2 +- paddle/infrt/dialect/pd_types.h | 56 ------------- paddle/infrt/dialect/phi/data_type.h | 2 +- paddle/infrt/dialect/phi/ir/infrt_phi_base.td | 2 +- .../infrt/dialect/phi/ir/infrt_phi_kernel.td | 2 +- .../infrt/dialect/phi/ir/infrt_phi_tensor.td | 2 +- paddle/infrt/dialect/phi/ir/phi_base.h | 2 +- paddle/infrt/dialect/phi/ir/phi_kernels.h | 2 +- .../infrt/dialect/phi/pass/kernel_op_desc.h | 2 +- .../infrt/dialect/phi/pass/phi_op_cvt_pass.cc | 80 +++++++++++------- .../infrt/dialect/phi/pass/phi_op_cvt_pass.h | 38 +-------- paddle/infrt/dialect/phi/phi_ir_exec.cc | 2 +- paddle/infrt/dialect/print_ir.cc | 2 +- paddle/infrt/dialect/rewrite.td | 2 +- paddle/infrt/dialect/tensor_shape.td | 2 +- .../infrt/dialect/tensorrt/pd_lower_to_trt.td | 2 +- .../dialect/tensorrt/trt_graph_fuse_pass.h | 13 ++- .../dialect/tensorrt/trt_graph_split_pass.h | 7 +- .../dialect/tensorrt/trt_op_converter_pass.cc | 3 +- .../dialect/tensorrt/trt_op_converter_pass.h | 10 +-- .../dialect/tensorrt/trt_op_teller_pass.cc | 4 +- .../dialect/tensorrt/trt_op_teller_pass.h | 11 ++- paddle/infrt/dialect/tensorrt/trt_ops.h | 4 +- paddle/infrt/external_kernels/basic.mlir | 6 +- paddle/infrt/external_kernels/fc.mlir | 50 +++++------ paddle/infrt/external_kernels/paddle.mlir | 64 +++++++------- paddle/infrt/host_context/mlir_exec.cc | 2 +- .../infrt/host_context/mlir_tests/basic.mlir | 24 +++--- .../host_context/mlir_tests/dense_tensor.mlir | 8 +- .../infrt/host_context/mlir_tests/shape.mlir | 4 +- .../host_context/mlir_to_runtime_translate.cc | 8 +- .../host_context/mlir_to_runtime_translate.h | 2 +- .../mlir_to_runtime_translate_test.cc | 26 +++--- paddle/infrt/host_context/paddle_mlir.cc | 1 - paddle/infrt/host_context/paddle_mlir.h | 6 +- paddle/infrt/host_context/value.h 
| 2 +- paddle/infrt/kernel/basic_kernels.cc | 24 +++--- paddle/infrt/kernel/control_flow_kernels.cc | 2 +- .../infrt/kernel/phi/dense_tensor_kernels.h | 2 +- paddle/infrt/kernel/test_kernels.cc | 2 +- paddle/infrt/tests/dialect/basic.mlir | 28 +++---- paddle/infrt/tests/dialect/benchmark.mlir | 14 ++-- paddle/infrt/tests/dialect/dense_tensor.mlir | 8 +- .../tests/dialect/disabled_tensor_map.mlir | 28 +++---- paddle/infrt/tests/dialect/paddle_ops.mlir | 2 +- .../infrt/tests/dialect/phi/dense_tensor.mlir | 2 +- paddle/infrt/tests/dialect/phi/phi_test.mlir | 6 +- .../tests/dialect/tensor/dense_tensor.mlir | 8 +- .../tests/dialect/tensor/naive_kernels.mlir | 4 +- .../tests/dialect/tensor/tensor_map.mlir.in | 4 +- .../tests/dialect/tensor/tensor_shape.mlir | 2 +- .../tests/dialect/tensor/tensor_type.mlir | 2 +- paddle/infrt/tests/dialect/tensor_shape.mlir | 2 +- paddle/infrt/tests/dialect/tensor_type.mlir | 2 +- paddle/infrt/tests/dialect/trt_ops.mlir | 2 +- 90 files changed, 439 insertions(+), 675 deletions(-) create mode 100644 paddle/infrt/dialect/infrt/common/CMakeLists.txt rename paddle/infrt/dialect/infrt/{common_type.cc => common/types.cc} (97%) rename paddle/infrt/dialect/infrt/{common_type.h => common/types.h} (100%) create mode 100644 paddle/infrt/dialect/infrt/common/utils.cc rename paddle/infrt/dialect/{pd_types.cc => infrt/common/utils.h} (57%) create mode 100644 paddle/infrt/dialect/infrt/ir/CMakeLists.txt rename paddle/infrt/dialect/{ => infrt/ir}/basic_kernels.cc (63%) rename paddle/infrt/dialect/{ => infrt/ir}/basic_kernels.h (92%) rename paddle/infrt/dialect/{ => infrt/ir}/basic_kernels.td (69%) rename paddle/infrt/dialect/infrt/{infrt_ops_base.td => ir/infrt_base.td} (85%) rename paddle/infrt/dialect/infrt/{ => ir}/infrt_dialect.cc (84%) rename paddle/infrt/dialect/infrt/{ => ir}/infrt_dialect.h (77%) rename paddle/infrt/dialect/infrt/{ => ir}/infrt_ops.td (64%) rename paddle/infrt/dialect/{ => infrt/ir}/test_kernels.cc (96%) rename paddle/infrt/dialect/{ => infrt/ir}/test_kernels.h (92%) rename paddle/infrt/dialect/{ => infrt/ir}/test_kernels.td (93%) delete mode 100644 paddle/infrt/dialect/infrt_base.cc delete mode 100644 paddle/infrt/dialect/infrt_base.h delete mode 100644 paddle/infrt/dialect/infrt_base.td rename paddle/infrt/dialect/{init_infrt_dialects.cc => init_dialects.cc} (83%) rename paddle/infrt/dialect/{init_infrt_dialects.h => init_dialects.h} (100%) delete mode 100644 paddle/infrt/dialect/pd_types.h diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index 9f6fd32ad98..5c48afa2806 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -99,7 +99,8 @@ endfunction() function(mlir_add_rewriter td_base) set(LLVM_TARGET_DEFINITIONS ${td_base}.td) - mlir_tablegen(${td_base}.cpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") + set(LLVM_TARGET_DEPENDS ${LLVM_TARGET_DEPENDS} ${CMAKE_SOURCE_DIR}/paddle/infrt/dialect/infrt/ir/infrt_base.td) + mlir_tablegen(${td_base}.cpp.inc -gen-rewriters) add_public_tablegen_target(MLIR${td_base}IncGen) add_dependencies(mlir-headers MLIR${td_base}IncGen) endfunction() diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index ed29b5b44c7..4e273f6d551 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -90,7 +90,6 @@ add_subdirectory(tests) set(infrt_mlir_incs basic_kernels_inc test_kernels_inc - infrt_base_inc tensor_shape_inc dense_tensor_inc pd_ops_inc diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc index 
e0488117783..0500a812304 100644 --- a/paddle/infrt/api/infrt_api.cc +++ b/paddle/infrt/api/infrt_api.cc @@ -24,7 +24,7 @@ #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/mlir_loader.h" #include "paddle/infrt/host_context/core_runtime.h" #include "paddle/infrt/host_context/kernel_registry.h" @@ -144,7 +144,7 @@ class PredictExecutor : public MlirToRuntimeTranslator { // process results auto& last_op = predict_func.front().back(); - if (last_op.getName().getStringRef() == "Infrt.return") { + if (last_op.getName().getStringRef() == "infrt.return") { for (size_t i = 0; i < last_op.getNumOperands(); ++i) { auto* value = AddValue(mlir::Value(last_op.getOperand(i))); results_.push_back(ValueRef(value)); diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index e35989da208..a3f2d0afafc 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -2,26 +2,20 @@ core_gather_headers() gather_srcs(infrt_src SRCS dialect.cc - basic_kernels.cc - test_kernels.cc - infrt_base.cc - init_infrt_dialects.cc + init_dialects.cc tensor_shape.cc dense_tensor.cc mlir_loader.cc diagnostic_utils.cc - pd_types.cc pd_ops.cc ) -mlir_tablegen_on(basic_kernels) -mlir_tablegen_on(test_kernels) -mlir_tablegen_on(infrt_base DIALECT Infrt) mlir_tablegen_on(tensor_shape DIALECT ts) mlir_tablegen_on(dense_tensor DIALECT dt) mlir_tablegen_on(pd_op_base DIALECT pd) mlir_tablegen_on(pd_ops) mlir_tablegen_on(pd_extra_ops) + mlir_add_rewriter(rewrite) # TODO(Superjomn) add a cmake function cc_executable to ecapsulate the following code diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h index 27febffe815..7fbd1e8a4ef 100644 --- a/paddle/infrt/dialect/dense_tensor.h +++ b/paddle/infrt/dialect/dense_tensor.h @@ -19,7 +19,7 @@ #include -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/dense_tensor_dialect.hpp.inc" diff --git a/paddle/infrt/dialect/dense_tensor.td b/paddle/infrt/dialect/dense_tensor.td index f5db90648ee..666c7b300af 100644 --- a/paddle/infrt/dialect/dense_tensor.td +++ b/paddle/infrt/dialect/dense_tensor.td @@ -2,7 +2,7 @@ #else #define DT_OPS -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "paddle/infrt/dialect/tensor_shape_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" diff --git a/paddle/infrt/dialect/infrt/CMakeLists.txt b/paddle/infrt/dialect/infrt/CMakeLists.txt index 08ce2d4707b..5f65336453f 100644 --- a/paddle/infrt/dialect/infrt/CMakeLists.txt +++ b/paddle/infrt/dialect/infrt/CMakeLists.txt @@ -1,17 +1,3 @@ -core_gather_headers() - -gather_srcs(infrt_src SRCS - common_type.cc - infrt_dialect.cc - ) - - -add_mlir_dialect(infrt_ops infrt) - -set(LLVM_TARGET_DEFINITIONS infrt_ops.td) -mlir_tablegen(infrt_opsAttributes.h.inc -gen-attrdef-decls -dialect=infrt) -mlir_tablegen(infrt_opsAttributes.cpp.inc -gen-attrdef-defs -dialect=infrt) -add_public_tablegen_target(MLIRinfrt_opsAttributesIncGen) -add_dependencies(mlir-headers MLIRinfrt_opsAttributesIncGen) - +add_subdirectory(common) +add_subdirectory(ir) add_subdirectory(pass) diff --git a/paddle/infrt/dialect/infrt/common/CMakeLists.txt b/paddle/infrt/dialect/infrt/common/CMakeLists.txt new file mode 100644 index 
00000000000..f693c82b506 --- /dev/null +++ b/paddle/infrt/dialect/infrt/common/CMakeLists.txt @@ -0,0 +1,6 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + types.cc + utils.cc + ) diff --git a/paddle/infrt/dialect/infrt/common_type.cc b/paddle/infrt/dialect/infrt/common/types.cc similarity index 97% rename from paddle/infrt/dialect/infrt/common_type.cc rename to paddle/infrt/dialect/infrt/common/types.cc index 00684c50526..62419a19628 100644 --- a/paddle/infrt/dialect/infrt/common_type.cc +++ b/paddle/infrt/dialect/infrt/common/types.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" namespace infrt { diff --git a/paddle/infrt/dialect/infrt/common_type.h b/paddle/infrt/dialect/infrt/common/types.h similarity index 100% rename from paddle/infrt/dialect/infrt/common_type.h rename to paddle/infrt/dialect/infrt/common/types.h diff --git a/paddle/infrt/dialect/infrt/common/utils.cc b/paddle/infrt/dialect/infrt/common/utils.cc new file mode 100644 index 00000000000..0ffb23c490f --- /dev/null +++ b/paddle/infrt/dialect/infrt/common/utils.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/infrt/common/utils.h" + +mlir::SmallVector infrt::cvtValueToValueRange( + const mlir::Value &operand) { + return mlir::SmallVector(1, operand); +} + +mlir::SmallVector infrt::concatTwoValueRange( + mlir::ValueRange operand_0, mlir::ValueRange operand_1) { + mlir::SmallVector operands; + operands.append(operand_0.begin(), operand_0.end()); + operands.append(operand_1.begin(), operand_1.end()); + return operands; +} diff --git a/paddle/infrt/dialect/pd_types.cc b/paddle/infrt/dialect/infrt/common/utils.h similarity index 57% rename from paddle/infrt/dialect/pd_types.cc rename to paddle/infrt/dialect/infrt/common/utils.h index 94856e362d3..886407b5664 100644 --- a/paddle/infrt/dialect/pd_types.cc +++ b/paddle/infrt/dialect/infrt/common/utils.h @@ -12,4 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/infrt/dialect/pd_types.h" +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace infrt { + +mlir::SmallVector cvtValueToValueRange( + const mlir::Value &operand); + +mlir::SmallVector concatTwoValueRange( + mlir::ValueRange operand_0, mlir::ValueRange operand_1); +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt/ir/CMakeLists.txt b/paddle/infrt/dialect/infrt/ir/CMakeLists.txt new file mode 100644 index 00000000000..7c009bdb267 --- /dev/null +++ b/paddle/infrt/dialect/infrt/ir/CMakeLists.txt @@ -0,0 +1,18 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + infrt_dialect.cc + basic_kernels.cc + test_kernels.cc + ) + +add_mlir_dialect(infrt_ops infrt) + +set(LLVM_TARGET_DEFINITIONS infrt_ops.td) +mlir_tablegen(infrt_opsAttributes.h.inc -gen-attrdef-decls -dialect=infrt) +mlir_tablegen(infrt_opsAttributes.cpp.inc -gen-attrdef-defs -dialect=infrt) +add_public_tablegen_target(MLIRinfrt_opsAttributesIncGen) +add_dependencies(mlir-headers MLIRinfrt_opsAttributesIncGen) + +mlir_tablegen_on(basic_kernels) +mlir_tablegen_on(test_kernels) diff --git a/paddle/infrt/dialect/basic_kernels.cc b/paddle/infrt/dialect/infrt/ir/basic_kernels.cc similarity index 63% rename from paddle/infrt/dialect/basic_kernels.cc rename to paddle/infrt/dialect/infrt/ir/basic_kernels.cc index c1aa75fb246..ba83f3e36c9 100644 --- a/paddle/infrt/dialect/basic_kernels.cc +++ b/paddle/infrt/dialect/infrt/ir/basic_kernels.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include #include @@ -30,23 +30,6 @@ namespace infrt { namespace dialect { using namespace mlir; // NOLINT -static ParseResult parseCallOp(OpAsmParser &parser, // NOLINT - OperationState &result) { // NOLINT - SymbolRefAttr callee_attr; - FunctionType callee_type; - SmallVector operands; - auto callee_loc = parser.getNameLoc(); - if (parser.parseAttribute(callee_attr, "callee", result.attributes) || - parser.parseOperandList(operands, OpAsmParser::Delimiter::Paren) || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(callee_type) || - parser.addTypesToList(callee_type.getResults(), result.types) || - parser.resolveOperands( - operands, callee_type.getInputs(), callee_loc, result.operands)) - return failure(); - return success(); -} - static ParseResult parseConstantOp(Type attrType, OpAsmParser &parser, // NOLINT OperationState &result) { // NOLINT @@ -79,24 +62,6 @@ static ParseResult parseConstantI64Op(OpAsmParser &parser, // NOLINT IntegerType::get(result.getContext(), 64), parser, result); } -static ParseResult parseReturnOp(OpAsmParser &parser, // NOLINT - OperationState &result) { // NOLINT - SmallVector opInfo; - SmallVector types; - llvm::SMLoc loc = parser.getCurrentLocation(); - return failure(parser.parseOperandList(opInfo) || - (!opInfo.empty() && parser.parseColonTypeList(types)) || - parser.resolveOperands(opInfo, types, loc, result.operands)); -} - -static void print(OpAsmPrinter &p, CallOp op) { // NOLINT - p << op->getAttr("callee") << "("; - p.printOperands(op.getOperands()); - p << ")"; - p.printOptionalAttrDict(op->getAttrs(), {"callee"}); - p << " : "; -} - static void printConstant(OpAsmPrinter &p, mlir::Operation *op) { // NOLINT p << " "; p.printOptionalAttrDict(op->getAttrs(), /*elidedAttrs=*/{"value"}); @@ -127,37 +92,13 @@ static void print(OpAsmPrinter &p, 
ConstantI64Op op) { // NOLINT printConstant(p, op); } -static void print(OpAsmPrinter &p, ReturnOp op) { // NOLINT - if (op.getNumOperands() > 0) { - p << ' '; - p.printOperands(op.getOperands()); - p << " : "; - llvm::interleaveComma(op.getOperands(), p); - } -} - -static LogicalResult verify(CallOp op) { return success(); } - static LogicalResult verify(ConstantF32Op op) { return success(); } static LogicalResult verify(ConstantI32Op op) { return success(); } static LogicalResult verify(ConstantF64Op op) { return success(); } static LogicalResult verify(ConstantI64Op op) { return success(); } -static LogicalResult verify(ReturnOp op) { - auto function = dyn_cast(op->getParentOp()); - - if (!function) return success(); - - auto results = function.getType().getResults(); - if (op.getNumOperands() != results.size()) - return op.emitOpError("has ") - << op.getNumOperands() - << " operands, but enclosing function returns " << results.size(); - - return success(); -} } // namespace dialect } // namespace infrt #define GET_OP_CLASSES -#include "paddle/infrt/dialect/basic_kernels.cpp.inc" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.cpp.inc" diff --git a/paddle/infrt/dialect/basic_kernels.h b/paddle/infrt/dialect/infrt/ir/basic_kernels.h similarity index 92% rename from paddle/infrt/dialect/basic_kernels.h rename to paddle/infrt/dialect/infrt/ir/basic_kernels.h index b82abcd52d2..a36f55691b7 100644 --- a/paddle/infrt/dialect/basic_kernels.h +++ b/paddle/infrt/dialect/infrt/ir/basic_kernels.h @@ -18,4 +18,4 @@ #include #define GET_OP_CLASSES -#include "paddle/infrt/dialect/basic_kernels.hpp.inc" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.hpp.inc" diff --git a/paddle/infrt/dialect/basic_kernels.td b/paddle/infrt/dialect/infrt/ir/basic_kernels.td similarity index 69% rename from paddle/infrt/dialect/basic_kernels.td rename to paddle/infrt/dialect/infrt/ir/basic_kernels.td index 89d8cd65b85..60315b45dd0 100644 --- a/paddle/infrt/dialect/basic_kernels.td +++ b/paddle/infrt/dialect/infrt/ir/basic_kernels.td @@ -4,10 +4,10 @@ #else #define BASIC_OPS -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" -class INFRT_Op traits = []> : Op { +class INFRT_Op traits = []> : Op { // Each registered op needs to provide all of a printer, parser and verifier. let printer = [{ return infrt::dialect::print(p, *this); }]; @@ -15,23 +15,6 @@ class INFRT_Op traits = []> : Op { - let summary = "call a host operation"; - let description = [{ - The "infrt.call" operation represents a direct call to a function. The operands and result types of the call must match the specified function type. - - %2 = infrt.call @add(%0, %1) : (f32, f32) -> f32 - }]; - - let arguments = (ins FlatSymbolRefAttr:$callee, Variadic:$operands); - let results = (outs Variadic); - - let extraClassDeclaration = [{ - mlir::StringRef getCallee() { return callee(); } - mlir::FunctionType getCalleeType(); - }]; -} - class ConstantOp : INFRT_Op<"constant." # suffix, [NoSideEffect]> { let summary = "constant value constructor in host"; @@ -45,22 +28,6 @@ def ConstantI64Op : ConstantOp<"i64", I64, I64Attr>; def ConstantF32Op : ConstantOp<"f32", F32, F32Attr>; def ConstantF64Op : ConstantOp<"f64", F64, F64Attr>; -def ReturnOp : INFRT_Op<"return", [Terminator]> { - let summary = "host executor return operation"; - let description = [{ - The "Infrt.return" operation represents a return operation within a function. 
- - func @foo() : (i32, f8) { - Infrt.return %0, %1 : i32, f8 - } - }]; - - let arguments = (ins Variadic:$operands); - - let builders = [OpBuilder<(ins), - [{ build($_builder, $_state, llvm::None); }]>]; -} - class AddOp : INFRT_Op<"add." # suffix, [NoSideEffect]> { let summary = "infrt.add operation"; let description = [{ @@ -112,7 +79,7 @@ def PrintF32Op : PrintOp<"f32", F32>; def PrintF64Op : PrintOp<"f64", F64>; def PrintStringOp : INFRT_Op<"print_string"> { - let summary = "Infrt.print_string"; + let summary = "infrt.print_string"; let description = [{ An operation that prints a string. }]; diff --git a/paddle/infrt/dialect/infrt/infrt_ops_base.td b/paddle/infrt/dialect/infrt/ir/infrt_base.td similarity index 85% rename from paddle/infrt/dialect/infrt/infrt_ops_base.td rename to paddle/infrt/dialect/infrt/ir/infrt_base.td index 3190c1c84b8..c5130e89bb1 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops_base.td +++ b/paddle/infrt/dialect/infrt/ir/infrt_base.td @@ -101,4 +101,21 @@ class Infrt_Attr traits = [], : AttrDef { let mnemonic = ?; } + +// tools function. used for pattern rewriter +class INFRT_createI32Attr : NativeCodeCall< + "$_builder.getI32IntegerAttr(" # value # ")">; + +class INFRT_createSI32Attr : NativeCodeCall< + "$_builder.getSI32IntegerAttr(" # value # ")">; + +class INFRT_createF32Attr : NativeCodeCall< + "$_builder.getF32FloatAttr(" # value # ")">; + +def INFRT_cvtValueToValueRange : NativeCodeCall< + "infrt::cvtValueToValueRange($0)">; + +def INFRT_concatTwoValueRange : NativeCodeCall< + "infrt::concatTwoValueRange($0, $1)">; + #endif // INFRT_OPS_BASE diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc similarity index 84% rename from paddle/infrt/dialect/infrt/infrt_dialect.cc rename to paddle/infrt/dialect/infrt/ir/infrt_dialect.cc index 400e4921c94..42de08ebc41 100644 --- a/paddle/infrt/dialect/infrt/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc @@ -12,40 +12,52 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include #include #include #include #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/infrt/infrt_opsDialect.cpp.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_opsDialect.cpp.inc" #define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/infrt/infrt_opsTypes.cpp.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_opsTypes.cpp.inc" #define GET_ATTRDEF_CLASSES -#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.cpp.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_opsAttributes.cpp.inc" #define GET_OP_CLASSES -#include "paddle/infrt/dialect/infrt/infrt_ops.cpp.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_ops.cpp.inc" + +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" + +#include "paddle/infrt/dialect/infrt/ir/test_kernels.h" namespace infrt { void InfrtDialect::initialize() { addTypes< #define GET_TYPEDEF_LIST -#include "paddle/infrt/dialect/infrt/infrt_opsTypes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/infrt/ir/infrt_opsTypes.cpp.inc" // NOLINT >(); addAttributes< #define GET_ATTRDEF_LIST -#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/infrt/ir/infrt_opsAttributes.cpp.inc" // NOLINT >(); addOperations< #define GET_OP_LIST -#include "paddle/infrt/dialect/infrt/infrt_ops.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/infrt/ir/infrt_ops.cpp.inc" // NOLINT + >(); + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.cpp.inc" + >(); + addOperations< +#define GET_OP_LIST +#include "paddle/infrt/dialect/infrt/ir/test_kernels.cpp.inc" >(); } @@ -128,7 +140,7 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { void InfrtDialect::printType(::mlir::Type type, ::mlir::DialectAsmPrinter &os) const { - // print LoDTensorType, for example: !Infrt.lod_tensor<3x64x3x3xf32,5> + // print LoDTensorType, for example: !infrt.lod_tensor<3x64x3x3xf32,5> if (type.isa()) { auto lod_tensor_type = type.cast(); os << "lod_tensor<"; diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.h b/paddle/infrt/dialect/infrt/ir/infrt_dialect.h similarity index 77% rename from paddle/infrt/dialect/infrt/infrt_dialect.h rename to paddle/infrt/dialect/infrt/ir/infrt_dialect.h index ed5b36e5561..3e6ea2a74c7 100644 --- a/paddle/infrt/dialect/infrt/infrt_dialect.h +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.h @@ -22,14 +22,14 @@ #include #include #include -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" -#include "paddle/infrt/dialect/infrt/infrt_opsDialect.h.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_opsDialect.h.inc" #define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/infrt/infrt_opsTypes.h.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_opsTypes.h.inc" #define GET_ATTRDEF_CLASSES -#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.h.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_opsAttributes.h.inc" #define GET_OP_CLASSES -#include "paddle/infrt/dialect/infrt/infrt_ops.h.inc" +#include "paddle/infrt/dialect/infrt/ir/infrt_ops.h.inc" diff --git a/paddle/infrt/dialect/infrt/infrt_ops.td b/paddle/infrt/dialect/infrt/ir/infrt_ops.td similarity index 64% rename from paddle/infrt/dialect/infrt/infrt_ops.td rename to paddle/infrt/dialect/infrt/ir/infrt_ops.td index 16ade66d47b..f5430b03d0d 100644 --- 
a/paddle/infrt/dialect/infrt/infrt_ops.td +++ b/paddle/infrt/dialect/infrt/ir/infrt_ops.td @@ -1,4 +1,4 @@ -include "paddle/infrt/dialect/infrt/infrt_ops_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" // Op definition class Infrt_Op traits = []> : Op { @@ -33,6 +33,26 @@ def Infrt_ReturnOp : Infrt_Op<"return", [Terminator]> { let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?"; } +def Infrt_CallOp : Infrt_Op<"call"> { + let summary = "call a host operation"; + let description = [{ + The "infrt.call" operation represents a direct call to a function. The operands and result types of the call must match the specified function type. + + %2 = infrt.call @add(%0, %1) : (f32, f32) -> f32 + }]; + + let arguments = (ins FlatSymbolRefAttr:$callee, Variadic:$operands); + let results = (outs Variadic); + + //let extraClassDeclaration = [{ + // mlir::StringRef getCallee() { return callee(); } + // mlir::FunctionType getCalleeType(); + // }]; + let assemblyFormat = [{ + $callee `(` $operands `)` attr-dict `:` functional-type($operands, results) + }]; +} + def Infrt_CvtTensorOp : Infrt_Op<"cvt_tensor", [NoSideEffect]> { let summary = "convert tensor type op"; let description = [{convert tensor type op!}]; diff --git a/paddle/infrt/dialect/test_kernels.cc b/paddle/infrt/dialect/infrt/ir/test_kernels.cc similarity index 96% rename from paddle/infrt/dialect/test_kernels.cc rename to paddle/infrt/dialect/infrt/ir/test_kernels.cc index f0c4723b49a..5f7f83a9dfa 100644 --- a/paddle/infrt/dialect/test_kernels.cc +++ b/paddle/infrt/dialect/infrt/ir/test_kernels.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/test_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/test_kernels.h" #include #include @@ -147,7 +147,7 @@ static mlir::LogicalResult verify(BenchmarkOp op) { // Verify that the target benchmark region has exactly one return value. 
auto ®ion = op.region(); auto &last_op = region.front().back(); - if (last_op.getName().getStringRef() != "Infrt.return") { + if (last_op.getName().getStringRef() != "infrt.return") { return op.emitOpError("missing return statement"); } if (last_op.getNumOperands() != 1) { @@ -161,4 +161,4 @@ static mlir::LogicalResult verify(BenchmarkOp op) { } // namespace infrt #define GET_OP_CLASSES -#include "paddle/infrt/dialect/test_kernels.cpp.inc" +#include "paddle/infrt/dialect/infrt/ir/test_kernels.cpp.inc" diff --git a/paddle/infrt/dialect/test_kernels.h b/paddle/infrt/dialect/infrt/ir/test_kernels.h similarity index 92% rename from paddle/infrt/dialect/test_kernels.h rename to paddle/infrt/dialect/infrt/ir/test_kernels.h index 73c8a6fb387..1fe5020b240 100644 --- a/paddle/infrt/dialect/test_kernels.h +++ b/paddle/infrt/dialect/infrt/ir/test_kernels.h @@ -17,4 +17,4 @@ #include #define GET_OP_CLASSES -#include "paddle/infrt/dialect/test_kernels.hpp.inc" +#include "paddle/infrt/dialect/infrt/ir/test_kernels.hpp.inc" diff --git a/paddle/infrt/dialect/test_kernels.td b/paddle/infrt/dialect/infrt/ir/test_kernels.td similarity index 93% rename from paddle/infrt/dialect/test_kernels.td rename to paddle/infrt/dialect/infrt/ir/test_kernels.td index 6e4bc26aa14..0ce1f3f65e8 100644 --- a/paddle/infrt/dialect/test_kernels.td +++ b/paddle/infrt/dialect/infrt/ir/test_kernels.td @@ -4,12 +4,12 @@ #else #define TEST_OPS -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" // Base class for Test dialect ops. class Test_Op traits = []> : - Op { + Op { // Each registered op in the Test namespace needs to provide all of a printer, // parser and verifier. @@ -45,7 +45,7 @@ def BenchmarkOp : Test_Op<"benchmark"> { // The following code benchmarks the infrt.add.i32 kernel. %x = infrt.add.i32 %c, %c // The benchmarked function needs to return exactly one value. - Infrt.return %x : i32 + infrt.return %x : i32 } }]; diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td index ef702650b6f..51addb4deb4 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td @@ -2,7 +2,7 @@ #define INFRT_OP_FUSE include "mlir/Interfaces/SideEffectInterfaces.td" -include "paddle/infrt/dialect/infrt/infrt_ops.td" +include "paddle/infrt/dialect/infrt/ir/infrt_ops.td" include "paddle/infrt/dialect/pd_ops.td" def FuseCvtTensorPattern : Pat< diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc index cb16e054418..25ecf2ae99d 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc @@ -15,7 +15,7 @@ #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" #include -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/pd_ops.h" namespace { #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/infrt_base.cc b/paddle/infrt/dialect/infrt_base.cc deleted file mode 100644 index e951762abb2..00000000000 --- a/paddle/infrt/dialect/infrt_base.cc +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/infrt/dialect/infrt_base.h" - -#include "paddle/infrt/dialect/basic_kernels.h" -#include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/test_kernels.h" - -namespace infrt { -namespace dialect { - -// ----INFRTDialect definition begin---- -void INFRTDialect::initialize() { - allowUnknownTypes(); - allowUnknownOperations(); - addOperations< -#define GET_OP_LIST -#include "paddle/infrt/dialect/basic_kernels.cpp.inc" - >(); - addOperations< -#define GET_OP_LIST -#include "paddle/infrt/dialect/test_kernels.cpp.inc" - >(); -} - -mlir::Type INFRTDialect::parseType(mlir::DialectAsmParser &parser) const { - llvm::StringRef keyword; - if (parser.parseKeyword(&keyword)) return mlir::Type(); - // parse TensorMapType, for example: !infrt.tensor_map - parser.emitError(parser.getCurrentLocation(), "unknown infrt type: ") - << keyword; - return mlir::Type(); -} - -void INFRTDialect::printType(mlir::Type type, - mlir::DialectAsmPrinter &printer) const { - // print TensorMapType, for example: !infrt.tensor_map - llvm_unreachable("unknown infrt type."); -} - -// ----INFRTDialect definition end---- - -} // namespace dialect -} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.h b/paddle/infrt/dialect/infrt_base.h deleted file mode 100644 index 3ef73171dcd..00000000000 --- a/paddle/infrt/dialect/infrt_base.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/infrt/dialect/infrt_base.hpp.inc" - -namespace infrt { -namespace dialect { - -class INFRTDialect : public mlir::Dialect { - explicit INFRTDialect(mlir::MLIRContext *context) - : mlir::Dialect( - getDialectNamespace(), context, mlir::TypeID::get()) { - initialize(); - } - - // parse types registered to the dialect. - mlir::Type parseType(mlir::DialectAsmParser &parser) const override; - // print types registered to the dialect. 
- void printType(mlir::Type type, - mlir::DialectAsmPrinter &printer) const override; - - void initialize(); - friend class mlir::MLIRContext; - - public: - static ::llvm::StringRef getDialectNamespace() { return "Infrt"; } -}; -} // namespace dialect - -template -static mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b, // NOLINT - mlir::Location loc, - T constant) { - return b.getIntegerAttr(b.getI32Type(), constant); -} - -template -static mlir::IntegerAttr createSI32Attr(mlir::OpBuilder &b, // NOLINT - mlir::Location loc, - T constant) { - return b.getSI32IntegerAttr(constant); -} - -template -static mlir::FloatAttr createF32Attr(mlir::OpBuilder &b, // NOLINT - mlir::Location loc, - T constant) { - return b.getF32FloatAttr(constant); -} - -static mlir::SmallVector cvtValueToValueRange( - const mlir::Value &operand) { - return mlir::SmallVector(1, operand); -} - -static mlir::SmallVector concatTwoValueRange( - mlir::ValueRange operand_0, mlir::ValueRange operand_1) { - mlir::SmallVector operands; - operands.append(operand_0.begin(), operand_0.end()); - operands.append(operand_1.begin(), operand_1.end()); - return operands; -} -} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.td b/paddle/infrt/dialect/infrt_base.td deleted file mode 100644 index 45e6b116f48..00000000000 --- a/paddle/infrt/dialect/infrt_base.td +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef INFRT_BASE -#define INFRT_BASE - -include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/infrt/infrt_ops_base.td" - -def INFRT_Dialect : Dialect { - let name = "Infrt"; - - let description = [{ - The INFRT host dialect. - }]; - - let cppNamespace = "::infrt::dialect"; -} - -def BufferType : OpaqueType<"b", "buffer", "buffer">; - -class INFRT_createI32Attr : NativeCodeCall< - "infrt::createI32Attr($_builder, $_loc, " # value # ")">; - -class INFRT_createSI32Attr : NativeCodeCall< - "infrt::createSI32Attr($_builder, $_loc, " # value # ")">; - -class INFRT_createF32Attr : NativeCodeCall< - "infrt::createF32Attr($_builder, $_loc, " # value # ")">; - -def INFRT_cvtValueToValueRange : NativeCodeCall< - "infrt::cvtValueToValueRange($0)">; - -def INFRT_concatTwoValueRange : NativeCodeCall< - "infrt::concatTwoValueRange($0, $1)">; -#endif // INFRT_BASE diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_dialects.cc similarity index 83% rename from paddle/infrt/dialect/init_infrt_dialects.cc rename to paddle/infrt/dialect/init_dialects.cc index 5eae0171936..0c5944ebf84 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_dialects.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/init_dialects.h" #include -#include "paddle/infrt/dialect/basic_kernels.h" #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" -#include "paddle/infrt/dialect/infrt_base.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" + #include "paddle/infrt/dialect/pd_ops.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" @@ -30,8 +30,7 @@ namespace infrt { void registerCinnDialects(mlir::DialectRegistry &registry) { // NOLINT registry.insert #include "paddle/infrt/dialect/diagnostic_utils.h" -#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/init_dialects.h" namespace infrt { namespace dialect { diff --git a/paddle/infrt/dialect/mlir_loader_test.cc b/paddle/infrt/dialect/mlir_loader_test.cc index 2f721e49a63..8ccb07161d3 100644 --- a/paddle/infrt/dialect/mlir_loader_test.cc +++ b/paddle/infrt/dialect/mlir_loader_test.cc @@ -22,7 +22,7 @@ #include -#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/init_dialects.h" namespace infrt { namespace dialect { @@ -32,13 +32,13 @@ TEST(MlirLoader, basic) { auto source = R"ROC( func @main() -> f32 { - %v0 = Infrt.constant.f32 1.0 - %v1 = Infrt.constant.f32 2.0 - %value = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 - "Infrt.print.f32"(%v0) : (f32) -> () + "infrt.print.f32"(%v0) : (f32) -> () - Infrt.return %value : f32 + infrt.return %value : f32 } )ROC"; diff --git a/paddle/infrt/dialect/opt.cc b/paddle/infrt/dialect/opt.cc index 5bcf5a23f4c..2006530958f 100644 --- a/paddle/infrt/dialect/opt.cc +++ b/paddle/infrt/dialect/opt.cc @@ -14,7 +14,7 @@ #include #include -#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/init_dialects.h" int main(int argc, char **argv) { mlir::DialectRegistry registry; diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd_op_base.td index 26425e3945c..f6af4c83aed 100644 --- a/paddle/infrt/dialect/pd_op_base.td +++ b/paddle/infrt/dialect/pd_op_base.td @@ -6,7 +6,7 @@ include "mlir/IR/OpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" -include "paddle/infrt/dialect/infrt/infrt_ops_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" def PD_Dialect : Dialect { let name = "pd"; diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc index 55ab174fcaf..96e9e307f2f 100644 --- a/paddle/infrt/dialect/pd_ops.cc +++ b/paddle/infrt/dialect/pd_ops.cc @@ -16,7 +16,6 @@ #include #include -#include "paddle/infrt/dialect/infrt_base.h" #define GET_OP_CLASSES #include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/pd_ops.h b/paddle/infrt/dialect/pd_ops.h index 41dd2ddd94e..e6b0f30c059 100644 --- a/paddle/infrt/dialect/pd_ops.h +++ b/paddle/infrt/dialect/pd_ops.h @@ -28,7 +28,7 @@ #include #include #include -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" namespace mlir { namespace pd { diff --git a/paddle/infrt/dialect/pd_types.h b/paddle/infrt/dialect/pd_types.h deleted file mode 100644 index 0da888a9c07..00000000000 --- a/paddle/infrt/dialect/pd_types.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2021
PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This file defines the types used in PaddlePaddle MLIR dialect. -// We borrowed much ideas from tensorflow mlir dialect (tf_types.h in -// tensorflow). - -#pragma once - -#include -#include -#include -#include -#include - -namespace mlir { -namespace PD { - -class PaddleType : public Type { - public: - using Type::Type; - - static bool classof(Type type); -}; - -namespace detail { - -template -class PaddleTypeImpl : public Type::TypeBase { - public: - using Base = typename Type::TypeBase; - using PDBase = PaddleTypeImpl; - using Base::Base; -}; - -} // namespace detail - -#define HANDLE_PD_TYPE(pdtype, enumerant, name) \ - class pdtype##Type : public detail::PaddleTypeImpl { \ - public: \ - using PDBase::PDBase; \ - }; - -} // namespace PD -} // namespace mlir diff --git a/paddle/infrt/dialect/phi/data_type.h b/paddle/infrt/dialect/phi/data_type.h index b618ef38613..f2a76507b85 100644 --- a/paddle/infrt/dialect/phi/data_type.h +++ b/paddle/infrt/dialect/phi/data_type.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_base.td b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td index 671646b9259..5d7338ec429 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_base.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td @@ -2,7 +2,7 @@ #define PHI_BASE include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "mlir/Interfaces/InferTypeOpInterface.td" def PHI_Dialect : Dialect { diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td index ee23470fc75..d2ff7acfba8 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td @@ -3,7 +3,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" def PHI_CPUKernelDialect : Dialect { diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index 21c4669b645..8c3a79498d7 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -5,7 +5,7 @@ include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" def PHI_DenseTensorDialect : Dialect { let name = "phi_dt"; diff --git a/paddle/infrt/dialect/phi/ir/phi_base.h b/paddle/infrt/dialect/phi/ir/phi_base.h index 
0ea1973a733..64cd08cc05e 100644 --- a/paddle/infrt/dialect/phi/ir/phi_base.h +++ b/paddle/infrt/dialect/phi/ir/phi_base.h @@ -18,7 +18,7 @@ #include #include -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.h.inc" diff --git a/paddle/infrt/dialect/phi/ir/phi_kernels.h b/paddle/infrt/dialect/phi/ir/phi_kernels.h index b84d1b2b729..4f8b41852cc 100644 --- a/paddle/infrt/dialect/phi/ir/phi_kernels.h +++ b/paddle/infrt/dialect/phi/ir/phi_kernels.h @@ -30,7 +30,7 @@ #include #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/phi/ir/phi_cpu_kernelsDialect.h.inc" diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h index 34fd2f0f62d..b1f7c6c0811 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h @@ -16,7 +16,7 @@ #include #include -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" namespace infrt { diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc index fb00a3de3fc..485bf2a75d8 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc +++ b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc @@ -24,13 +24,29 @@ #include #include -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" #include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" #include "paddle/phi/core/compat/op_utils.h" #include "paddle/phi/ops/compat/signatures.h" -namespace infrt { + +namespace { +class phiOpCvtPass + : public mlir::PassWrapper { + public: + ::llvm::StringRef getName() const override { return "phiOpCvtPass"; } + void runOnFunction() override; + explicit phiOpCvtPass( + std::vector valid_places = std::vector()) + : valid_places_(valid_places) {} + + private: + void convertStage(); + void diapatchStage(); + std::vector valid_places_; +}; + // Implementation of the phiOpCvtPass. 
void phiOpCvtPass::runOnFunction() { convertStage(); @@ -63,7 +79,7 @@ void phiOpCvtPass::convertStage() { ::phi::KernelSignature kernel_sign = ::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( - ProtoArgumentMappingContext(op)); + infrt::ProtoArgumentMappingContext(op)); // resort input&output according to kernel_sign ::llvm::SmallVector inputs, ori_output; ::llvm::SmallVector output_types; @@ -109,10 +125,10 @@ void phiOpCvtPass::diapatchStage() { } mlir::OpBuilder builder(&block, block.begin()); - std::map phi_context; + std::map phi_context; for (infrt::KernelOp kernel_op : worklist) { std::string kernel_name = kernel_op.name().str(); - std::vector candidates = + std::vector candidates = getCandidateKernels(kernel_name, valid_places_); if (candidates.empty()) { LOG(FATAL) << "No candidate kernels for op:" << kernel_name; @@ -121,12 +137,13 @@ void phiOpCvtPass::diapatchStage() { builder.setInsertionPoint(kernel_op); // Todo: Implimentation the concrete pass pick strategy - const PhiKernelDesc &phi_kernel_desc = candidates.front(); + const infrt::PhiKernelDesc &phi_kernel_desc = candidates.front(); - kernel_name = getPhiTargetPrefix(phi_kernel_desc.kernelType.target) + - kernel_name + - getPhiPrecisionSuffix(phi_kernel_desc.kernelType.precision) + - getPhiLayoutSuffix(phi_kernel_desc.kernelType.layout); + kernel_name = + infrt::getPhiTargetPrefix(phi_kernel_desc.kernelType.target) + + kernel_name + + infrt::getPhiPrecisionSuffix(phi_kernel_desc.kernelType.precision) + + infrt::getPhiLayoutSuffix(phi_kernel_desc.kernelType.layout); mlir::OperationName operation_name(kernel_name, kernel_op.getContext()); mlir::OperationState operation_state(kernel_op.getLoc(), operation_name); @@ -134,18 +151,18 @@ void phiOpCvtPass::diapatchStage() { if (phi_context.find(phi_kernel_desc.kernelType.target) == phi_context.end()) { switch (phi_kernel_desc.kernelType.target) { - case TargetType::CPU: { + case infrt::TargetType::CPU: { auto context_value = builder .create( kernel_op.getLoc(), - phi::ContextType::get(kernel_op.getContext(), - TargetType::CPU)) + infrt::phi::ContextType::get(kernel_op.getContext(), + infrt::TargetType::CPU)) .output(); - phi_context[TargetType::CPU] = context_value; + phi_context[infrt::TargetType::CPU] = context_value; } break; - case TargetType::GPU: - case TargetType::UNK: + case infrt::TargetType::GPU: + case infrt::TargetType::UNK: default: LOG(FATAL) << "Unsupported TargetType"; break; @@ -155,29 +172,30 @@ void phiOpCvtPass::diapatchStage() { phi_context.at(phi_kernel_desc.kernelType.target)); for (size_t index = 0; index < phi_kernel_desc.inputsType.size(); ++index) { mlir::Value input = kernel_op.getOperand(index); - auto cvt_tensor_type_op = builder.create( + auto cvt_tensor_type_op = builder.create( kernel_op.getLoc(), - DenseTensorType::get(kernel_op.getContext(), - phi_kernel_desc.inputsType[index].target, - phi_kernel_desc.inputsType[index].precision, - phi_kernel_desc.inputsType[index].layout), + infrt::DenseTensorType::get( + kernel_op.getContext(), + phi_kernel_desc.inputsType[index].target, + phi_kernel_desc.inputsType[index].precision, + phi_kernel_desc.inputsType[index].layout), input); operation_state.addOperands(cvt_tensor_type_op.output()); } for (size_t index = 0; index < phi_kernel_desc.outputsType.size(); ++index) { - operation_state.addTypes( - DenseTensorType::get(kernel_op.getContext(), - phi_kernel_desc.outputsType[index].target, - phi_kernel_desc.outputsType[index].precision, - phi_kernel_desc.outputsType[index].layout)); + 
operation_state.addTypes(infrt::DenseTensorType::get( + kernel_op.getContext(), + phi_kernel_desc.outputsType[index].target, + phi_kernel_desc.outputsType[index].precision, + phi_kernel_desc.outputsType[index].layout)); } operation_state.addAttributes(kernel_op.attrsAttr().getValue()); mlir::Operation *phi_operation = builder.createOperation(operation_state); for (size_t index = 0; index < phi_kernel_desc.outputsType.size(); ++index) { mlir::Value input = phi_operation->getResult(index); - auto cvt_tensor_type_op = builder.create( + auto cvt_tensor_type_op = builder.create( kernel_op.getLoc(), kernel_op.getResultTypes()[index], input); kernel_op.getResult(index).replaceAllUsesWith( cvt_tensor_type_op.output()); @@ -185,4 +203,10 @@ void phiOpCvtPass::diapatchStage() { kernel_op.erase(); } } -} // namespace infrt + +} // namespace + +std::unique_ptr infrt::createPhiOpCvtPass( + std::vector valid_places) { + return std::make_unique(valid_places); +} diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h index 051fee9b61a..8b1944042aa 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h +++ b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h @@ -14,44 +14,14 @@ #pragma once #include -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" namespace infrt { /* * phiOpCvtPass. - * - * Convert the general operators in pd Dialect to a infrt.kernelOp. - * - * source func: - * - * func @main() -> tensor { - * %a = "pd.feed"()... - * %c = "pd.conv2d"(%a) ... - * %d = "pd.conv3d"(%c) ... - * %f = "pd.conv2d"(%a) ... - * "pd.fetch" (%d, %f) - * } - * - * destination func: - * func @main() -> tensor { - * %a = "pd.feed"()... - * %c = "infrt.kernel"(%a){name = "conv2d"} ... - * %d = "infrt.kernel"(%c){name = "conv3d"}... - * %f = "infrt.kernel"(%a){name = "conv2d"}... - * "pd.fetch" (%d, %f) - * } + * Convert the general operators from pd Dialect to phi dialect. 
*/ -class phiOpCvtPass - : public mlir::PassWrapper { - public: - ::llvm::StringRef getName() const override { return "phiOpCvtPass"; } - void runOnFunction() override; - explicit phiOpCvtPass(std::vector valid_places = std::vector()) - : valid_places_(valid_places) {} +std::unique_ptr createPhiOpCvtPass( + std::vector valid_places = std::vector()); - private: - void convertStage(); - void diapatchStage(); - std::vector valid_places_; -}; } // namespace infrt diff --git a/paddle/infrt/dialect/phi/phi_ir_exec.cc b/paddle/infrt/dialect/phi/phi_ir_exec.cc index 559fb90a64a..de61dba8e74 100644 --- a/paddle/infrt/dialect/phi/phi_ir_exec.cc +++ b/paddle/infrt/dialect/phi/phi_ir_exec.cc @@ -38,7 +38,7 @@ int main(int argc, char** argv) { std::vector valid_places = {{infrt::TargetType::CPU, infrt::PrecisionType::FLOAT32, infrt::LayoutType::NCHW}}; - phi_pass_manager.addPass(std::make_unique(valid_places)); + phi_pass_manager.addPass(infrt::createPhiOpCvtPass(valid_places)); phi_pass_manager.addPass(infrt::createInfrtOpFusePass()); if (mlir::failed(pm.run(*module))) { std::cout << "\npass failed!\n" << std::endl; diff --git a/paddle/infrt/dialect/print_ir.cc b/paddle/infrt/dialect/print_ir.cc index a37df265955..b118a5f7a9c 100644 --- a/paddle/infrt/dialect/print_ir.cc +++ b/paddle/infrt/dialect/print_ir.cc @@ -31,7 +31,7 @@ #include #include "paddle/infrt/common/global.h" -#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/init_dialects.h" namespace cl = llvm::cl; diff --git a/paddle/infrt/dialect/rewrite.td b/paddle/infrt/dialect/rewrite.td index 5e228fed4d5..62e7471a390 100644 --- a/paddle/infrt/dialect/rewrite.td +++ b/paddle/infrt/dialect/rewrite.td @@ -1,7 +1,7 @@ #ifndef INFRT_REWRITE #define INFRT_REWRITE -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "paddle/infrt/dialect/pd_ops.td" include "paddle/infrt/dialect/pd_extra_ops.td" diff --git a/paddle/infrt/dialect/tensor_shape.td b/paddle/infrt/dialect/tensor_shape.td index d3714c8ed14..2be21d6aa77 100644 --- a/paddle/infrt/dialect/tensor_shape.td +++ b/paddle/infrt/dialect/tensor_shape.td @@ -2,7 +2,7 @@ #else #define INFRT_OPS -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "paddle/infrt/dialect/tensor_shape_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" diff --git a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td index 68ca1559ace..46c250b0549 100644 --- a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td +++ b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td @@ -2,7 +2,7 @@ #define PD_LOWER_TO_TRT include "mlir/Interfaces/SideEffectInterfaces.td" -include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "paddle/infrt/dialect/pd_ops.td" include "paddle/infrt/dialect/tensorrt/trt_ops.td" diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h index 803e53e3244..18afba19e06 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h @@ -14,7 +14,6 @@ #pragma once #include -#include "paddle/infrt/dialect/infrt_base.h" namespace infrt { namespace trt { @@ -28,17 +27,17 @@ namespace trt { * func @main(%a : tensor) -> tensor { * %c = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... 
- * "infrt.return" (%m) + * infrt.return %m... * } ... * %d = "pd.graph"(%c) { * %m = "pd.conv3d"(%c)... - * "infrt.return" (%m) + * infrt.return %m... * } ... * %f = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "infrt.return" (%m) + * infrt.return %m... * } ... - * "infrt.return" (%d, %f).. + * infrt.return %d, %f :... * } * * destination func: @@ -47,9 +46,9 @@ namespace trt { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "infrt.return" (%n, %s) + * infrt.return %n, %s:... * } ... - * "infrt.return" (%d, %f) + * infrt.return %d, %f:... * } */ class TRTGraphFusePass diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h index 1c44a13cf9d..a5dd4f14b29 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h @@ -14,7 +14,6 @@ #pragma once #include -#include "paddle/infrt/dialect/infrt_base.h" namespace infrt { namespace trt { @@ -31,9 +30,9 @@ namespace trt { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "infrt.return" (%n, %s)... + * infrt.return %n, %s : ... * } ... - * "infrt.return" (%d, %f)... + * infrt.return %d, %f : ... * } * * destination func: @@ -41,7 +40,7 @@ namespace trt { * %c = "pd.conv2d"(%a) ... * %d = "pd.conv3d"(%c) ... * %f = "pd.conv2d"(%a) ... - * "infrt.return" (%d, %f)... + * infrt.return %d, %f:... * } */ class TRTGraphSplitPass diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc index 1be5f4dbc39..83bebdb6bf1 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc @@ -14,7 +14,6 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" #include #include -#include "paddle/infrt/dialect/infrt_base.h" #include "paddle/infrt/dialect/pd_ops.h" #include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" @@ -24,7 +23,7 @@ namespace trt { #include "paddle/infrt/dialect/tensorrt/pd_lower_to_trt.cpp.inc" // NOLINT struct PD2TRT_GraphLower : public ::mlir::RewritePattern { - PD2TRT_GraphLower(::mlir::MLIRContext *context) + explicit PD2TRT_GraphLower(::mlir::MLIRContext *context) : ::mlir::RewritePattern("pd.graph", 1, context, {"trt.create_engine"}) {} ::mlir::LogicalResult matchAndRewrite( ::mlir::Operation *op, ::mlir::PatternRewriter &rewriter) const override { diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h index 7550d8c84e1..ede64f8bcd5 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h @@ -15,7 +15,7 @@ #pragma once #include "mlir/IR/Dialect.h" #include "mlir/Pass/Pass.h" -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { @@ -29,9 +29,9 @@ namespace trt { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "infrt.return" (%n, %s)... + * infrt.return %n, %s:... * } ... - * "infrt.return" (%d, %f)... + * infrt.return %d, %f:... * } * * destination ir: @@ -40,10 +40,10 @@ namespace trt { * %m = "trt.Convolution"(%a)... * %n = "trt.Convolution"(%m)... * %s = "trt.Convolution"(%a)... - * "infrt.return" (%n, %s)... + * infrt.return %n, %s :... * }){run_once = true} ... * %d, %f = "trt.execute"(%engine, %a)... 
- * "infrt.return" (%d, %f)... + * infrt.return %d, %f :... * } */ struct TRTOpConverterPass diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 13b7f1aee55..9f348b4122f 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -15,8 +15,8 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" #include -#include "paddle/infrt/dialect/basic_kernels.h" -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/pd_ops.h" namespace infrt { diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h index b9e461c8633..1cb08dc0a21 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h @@ -14,7 +14,6 @@ #pragma once #include -#include "paddle/infrt/dialect/infrt_base.h" namespace infrt { namespace trt { @@ -29,24 +28,24 @@ namespace trt { * %c = "pd.conv2d"(%a) ... * %d = "pd.conv3d"(%c) ... * %f = "pd.conv2d"(%a) ... - * "infrt.return"(%d, %f) ... + * infrt.return %d, %f: ... * } * * destination func: * func @main(%a : tensor) -> tensor { * %c = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "infrt.return" (%m) + * infrt.return %m:... * } ... * %d = "pd.graph"(%c) { * %m = "pd.conv3d"(%c)... - * "infrt.return" (%m) + * infrt.return %m:... * } ... * %f = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "infrt.return" (%m) + * infrt.return %m:... * } ... - * "infrt.return" (%d, %f) + * infrt.return %d, %f:... * } * TODO(winter-wang): Supplementary how to judge the operators can be supported * by tensorrt. 
diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h index 44444232915..78d960b5120 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.h +++ b/paddle/infrt/dialect/tensorrt/trt_ops.h @@ -28,8 +28,8 @@ #include #include #include -#include "paddle/infrt/dialect/basic_kernels.h" -#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/pd_ops.h" namespace infrt { diff --git a/paddle/infrt/external_kernels/basic.mlir b/paddle/infrt/external_kernels/basic.mlir index 1a7ea854c9c..843b12ced21 100644 --- a/paddle/infrt/external_kernels/basic.mlir +++ b/paddle/infrt/external_kernels/basic.mlir @@ -1,7 +1,7 @@ // CHECK: basic func @basic() -> f32 { - %v0 = Infrt.constant.f32 1.0 - %v1 = Infrt.constant.f32 2.0 + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 %v2 = "external.add.f32"(%v0, %v1) : (f32, f32) -> f32 // CHECK: 1 @@ -17,5 +17,5 @@ func @basic() -> f32 { // CHECK: 6 "external.print.f32"(%v3) : (f32) -> () - Infrt.return %v3 : f32 + infrt.return %v3 : f32 } diff --git a/paddle/infrt/external_kernels/fc.mlir b/paddle/infrt/external_kernels/fc.mlir index b0cabddc3eb..26b2d24cace 100644 --- a/paddle/infrt/external_kernels/fc.mlir +++ b/paddle/infrt/external_kernels/fc.mlir @@ -1,43 +1,43 @@ // CHECK-LABEL: @fc -func @fc(%input : !Infrt.tensor, - %w : !Infrt.tensor, - %bias : !Infrt.tensor) -> !Infrt.tensor +func @fc(%input : !infrt.dense_tensor, + %w : !infrt.dense_tensor, + %bias : !infrt.dense_tensor) -> !infrt.dense_tensor { - %out = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor - // dt.fill_tensor_with_constant.f32 (%out : !Infrt.tensor) {value=0.0:f32} + %out = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.dense_tensor + // dt.fill_tensor_with_constant.f32 (%out : !infrt.dense_tensor) {value=0.0:f32} // fc1 - "external.matmul"(%input, %w, %out) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - "external.sigmoid"(%out, %out) {}: (!Infrt.tensor, !Infrt.tensor) -> () + "external.matmul"(%input, %w, %out) {}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + "external.sigmoid"(%out, %out) {}: (!infrt.dense_tensor, !infrt.dense_tensor) -> () // fc2 - "external.matmul"(%out, %w, %out) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - "external.sigmoid"(%out, %out) {}: (!Infrt.tensor, !Infrt.tensor) -> () + "external.matmul"(%out, %w, %out) {}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + "external.sigmoid"(%out, %out) {}: (!infrt.dense_tensor, !infrt.dense_tensor) -> () - Infrt.return %out : !Infrt.tensor + infrt.return %out : !infrt.dense_tensor } // CHECK-LABEL: @benchmark func @benchmark() { - %input = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor) {value=1.0:f32} + %input = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%input : 
!infrt.dense_tensor) {value=1.0:f32} - %w = dt.create_uninit_tensor.f32 [50, 50] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%w : !Infrt.tensor) {value=2.0:f32} + %w = dt.create_uninit_tensor.f32 [50, 50] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%w : !infrt.dense_tensor) {value=2.0:f32} - %bias = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%bias : !Infrt.tensor) {value=3.0:f32} + %bias = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%bias : !infrt.dense_tensor) {value=3.0:f32} - Infrt.benchmark "add.f32"( - %input:!Infrt.tensor, - %w:!Infrt.tensor, - %bias:!Infrt.tensor) + infrt.benchmark "add.f32"( + %input:!infrt.dense_tensor, + %w:!infrt.dense_tensor, + %bias:!infrt.dense_tensor) duration_secs = 100, max_count = 300000, num_warmup_runs = 3 { - %res = Infrt.call @fc(%input, %w, %bias) : (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> (!Infrt.tensor) - Infrt.return %res : !Infrt.tensor + %res = infrt.call @fc(%input, %w, %bias) : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor) + infrt.return %res : !infrt.dense_tensor } - Infrt.return + infrt.return } diff --git a/paddle/infrt/external_kernels/paddle.mlir b/paddle/infrt/external_kernels/paddle.mlir index d55d9904b5b..97781e5c8c5 100644 --- a/paddle/infrt/external_kernels/paddle.mlir +++ b/paddle/infrt/external_kernels/paddle.mlir @@ -1,50 +1,50 @@ // CHECK: paddle_func func @paddle_func() -> () { - %input = dt.create_uninit_tensor.f32 [3, 5] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor) {value=1.0:f32} + %input = dt.create_uninit_tensor.f32 [3, 5] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%input : !infrt.dense_tensor) {value=1.0:f32} - %w = dt.create_uninit_tensor.f32 [5, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%w : !Infrt.tensor) {value=2.0:f32} + %w = dt.create_uninit_tensor.f32 [5, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%w : !infrt.dense_tensor) {value=2.0:f32} - %bias = dt.create_uninit_tensor.f32 [4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%bias : !Infrt.tensor) {value=3.0:f32} + %bias = dt.create_uninit_tensor.f32 [4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%bias : !infrt.dense_tensor) {value=3.0:f32} - %out = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%out : !Infrt.tensor) {value=0.0:f32} + %out = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%out : !infrt.dense_tensor) {value=0.0:f32} - "external.fc2"(%input, %w, %bias, %out) {in_num_col_dims=3:i32, test_attr=5:i32}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.fc2"(%input, %w, %bias, %out) {in_num_col_dims=3:i32, test_attr=5:i32}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () // CHECK-LABEL: tensor: shape=shape[3,5], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - dt.print_tensor (%input : !Infrt.tensor) + dt.print_tensor (%input : !infrt.dense_tensor) // CHECK-LABEL: tensor: shape=shape[5,4], values=[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] - dt.print_tensor (%w : !Infrt.tensor) - dt.print_tensor (%bias : !Infrt.tensor) - dt.print_tensor (%out : !Infrt.tensor) + dt.print_tensor (%w : !infrt.dense_tensor) + dt.print_tensor (%bias : !infrt.dense_tensor) + dt.print_tensor (%out : !infrt.dense_tensor) // test 
external.matmul - %out1 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%out1 : !Infrt.tensor) {value=0.0:f32} - "external.matmul"(%input, %w, %out1) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - dt.print_tensor (%out1 : !Infrt.tensor) + %out1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%out1 : !infrt.dense_tensor) {value=0.0:f32} + "external.matmul"(%input, %w, %out1) {}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + dt.print_tensor (%out1 : !infrt.dense_tensor) // test external.elementwise_add - %out2 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%out2 : !Infrt.tensor) {value=0.0:f32} - %bias1 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%bias1 : !Infrt.tensor) {value=3.0:f32} - "external.elementwise_add"(%out1, %bias1, %out2) {axis=-1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - dt.print_tensor (%out2 : !Infrt.tensor) + %out2 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%out2 : !infrt.dense_tensor) {value=0.0:f32} + %bias1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%bias1 : !infrt.dense_tensor) {value=3.0:f32} + "external.elementwise_add"(%out1, %bias1, %out2) {axis=-1}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + dt.print_tensor (%out2 : !infrt.dense_tensor) // test external.relu - %out3 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%out3 : !Infrt.tensor) {value=0.0:f32} - "external.relu"(%out1, %out3) {}: (!Infrt.tensor, !Infrt.tensor) -> () - dt.print_tensor (%out3 : !Infrt.tensor) + %out3 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%out3 : !infrt.dense_tensor) {value=0.0:f32} + "external.relu"(%out1, %out3) {}: (!infrt.dense_tensor, !infrt.dense_tensor) -> () + dt.print_tensor (%out3 : !infrt.dense_tensor) // test external.sigmoid - %out4 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%out4 : !Infrt.tensor) {value=0.0:f32} - "external.sigmoid"(%out1, %out4) {}: (!Infrt.tensor, !Infrt.tensor) -> () - dt.print_tensor (%out4 : !Infrt.tensor) + %out4 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%out4 : !infrt.dense_tensor) {value=0.0:f32} + "external.sigmoid"(%out1, %out4) {}: (!infrt.dense_tensor, !infrt.dense_tensor) -> () + dt.print_tensor (%out4 : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index 90bcb1df220..1506282f626 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -92,7 +92,7 @@ int main(int argc, char** argv) { std::vector valid_places = {{infrt::TargetType::CPU, infrt::PrecisionType::FLOAT32, infrt::LayoutType::NCHW}}; - phi_pass_manager.addPass(std::make_unique(valid_places)); + phi_pass_manager.addPass(infrt::createPhiOpCvtPass(valid_places)); phi_pass_manager.addPass(infrt::createInfrtOpFusePass()); #endif diff --git a/paddle/infrt/host_context/mlir_tests/basic.mlir b/paddle/infrt/host_context/mlir_tests/basic.mlir index 1b55b408f2b..263d5884134 100644 --- a/paddle/infrt/host_context/mlir_tests/basic.mlir +++ b/paddle/infrt/host_context/mlir_tests/basic.mlir @@ -1,30 +1,30 @@ // 
CHECK-LABEL: basic func @basic() -> f32 { - %v0 = Infrt.constant.f32 1.0 - %v1 = Infrt.constant.f32 2.0 - %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 // CHECK: 1 - "Infrt.print.f32"(%v0) : (f32) -> () + "infrt.print.f32"(%v0) : (f32) -> () // CHECK: 2 - "Infrt.print.f32"(%v1) : (f32) -> () + "infrt.print.f32"(%v1) : (f32) -> () // CHECK: 3 - "Infrt.print.f32"(%v2) : (f32) -> () + "infrt.print.f32"(%v2) : (f32) -> () - %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 // CHECK: 6 - "Infrt.print.f32"(%v3) : (f32) -> () + "infrt.print.f32"(%v3) : (f32) -> () - Infrt.return %v3 : f32 + infrt.return %v3 : f32 } // CHECK-LABEL: basic1 // Check the mlir executor can work with more than one function in a file. func @basic1() -> () { - %v0 = Infrt.constant.f32 1.0 - "Infrt.print.f32"(%v0) : (f32) -> () + %v0 = infrt.constant.f32 1.0 + "infrt.print.f32"(%v0) : (f32) -> () // CHECK: 1 - Infrt.return + infrt.return } \ No newline at end of file diff --git a/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir index 5a973a3eb23..1a7fa28f1e5 100644 --- a/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir +++ b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir @@ -1,9 +1,9 @@ // CHECK-LABEL: build_tensor1 func @build_tensor1() { - %a = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%a : !Infrt.tensor) {value=1.0:f32} + %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.dense_tensor) {value=1.0:f32} // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - dt.print_tensor (%a : !Infrt.tensor) + dt.print_tensor (%a : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/host_context/mlir_tests/shape.mlir b/paddle/infrt/host_context/mlir_tests/shape.mlir index 22df1c8010d..691ce62cbf8 100644 --- a/paddle/infrt/host_context/mlir_tests/shape.mlir +++ b/paddle/infrt/host_context/mlir_tests/shape.mlir @@ -3,5 +3,5 @@ func @build_tensor1() { %a = ts.build_shape [1:i64, 57:i64, 92:i64] // CHECK: shape[1,57,92] ts.print_shape %a - Infrt.return -} \ No newline at end of file + infrt.return +} diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index a901c323ec0..b3ea930e8ce 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -75,7 +75,7 @@ struct MlirToRuntimeTranslator::Impl { }; bool MlirToRuntimeTranslator::EmitConstantOp(mlir::Operation* op) { - if (!infrt::Startswith(op->getName().getStringRef().str(), "Infrt.constant")) + if (!infrt::Startswith(op->getName().getStringRef().str(), "infrt.constant")) return false; VLOG(3) << "Emitting constant op [" << op->getName().getStringRef().str() << "]"; @@ -267,7 +267,7 @@ boost::optional> MlirToRuntimeTranslator::EmitAttribute( } static bool IsReturn(mlir::Operation* op) { - return op->getName().getStringRef() == "Infrt.return"; + return op->getName().getStringRef() == "infrt.return"; } bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { @@ -405,7 +405,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { bool MlirToRuntimeTranslator::EmitReturnOp( mlir::Operation* op, llvm::SmallVectorImpl* 
results) { CHECK(results); - if (op->getName().getStringRef() == "Infrt.return") { + if (op->getName().getStringRef() == "infrt.return") { for (size_t i = 0; i < op->getNumOperands(); i++) { results->push_back(op->getOperand(i)); } @@ -478,7 +478,7 @@ bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op, function_defs_t* function_table) { CHECK(op); CHECK(function_table); - if (op->getName().getStringRef() != "Infrt.call") return false; + if (op->getName().getStringRef() != "infrt.call") return false; impl_->cur_op = impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.h b/paddle/infrt/host_context/mlir_to_runtime_translate.h index 0c453651d9e..fcd79eaf386 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.h +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.h @@ -57,7 +57,7 @@ class MlirToRuntimeTranslator { protected: //! Emit a "infrt.constant.*" operation, return true if succeed. bool EmitConstantOp(mlir::Operation* op); - //! Emit a "Infrt.return" operation. + //! Emit a "infrt.return" operation. bool EmitReturnOp(mlir::Operation* op, llvm::SmallVectorImpl* results); //! Emit a "ts.build_shape" operation. diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc index 5824e40abf9..31615fbc3f6 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc @@ -37,14 +37,14 @@ TEST(MlirToRuntimeTranslate, basic) { auto source = R"ROC( func @main() -> () { - %v0 = Infrt.constant.f32 1.0 - %v1 = Infrt.constant.f32 2.0 - %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 - %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 - "Infrt.print.f32"(%v1) : (f32) -> () + "infrt.print.f32"(%v1) : (f32) -> () - Infrt.return + infrt.return } )ROC"; @@ -63,14 +63,14 @@ TEST(TestMlir, basic) { auto source = R"ROC( func @main() -> () { - %v0 = Infrt.constant.f32 1.0 - %v1 = Infrt.constant.f32 2.0 - %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 - %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 - "Infrt.print.f32"(%v1) : (f32) -> () + "infrt.print.f32"(%v1) : (f32) -> () - Infrt.return + infrt.return } )ROC"; @@ -101,7 +101,7 @@ func @predict(%a: !infrt.dense_tensor, %b: !infrt.dense_tensor< "!infrt.dense_tensor"; auto end = R"ROC( -Infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor +infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor } )ROC"; diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc index 6afef5935c7..18c25827b8e 100644 --- a/paddle/infrt/host_context/paddle_mlir.cc +++ b/paddle/infrt/host_context/paddle_mlir.cc @@ -19,7 +19,6 @@ MLIRModelGenImpl::MLIRModelGenImpl() : context_(infrt::Global::getMLIRContext()), builder_(context_) { context_->allowUnregisteredDialects(); context_->getOrLoadDialect(); - context_->getOrLoadDialect(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); diff --git a/paddle/infrt/host_context/paddle_mlir.h b/paddle/infrt/host_context/paddle_mlir.h index 
78dfefcfda2..e825cbb5a11 100644 --- a/paddle/infrt/host_context/paddle_mlir.h +++ b/paddle/infrt/host_context/paddle_mlir.h @@ -25,10 +25,10 @@ #include "mlir/IR/MLIRContext.h" #include "paddle/infrt/common/global.h" #include "paddle/infrt/common/string.h" -#include "paddle/infrt/dialect/basic_kernels.h" #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/infrt_base.h" -#include "paddle/infrt/dialect/init_infrt_dialects.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" + +#include "paddle/infrt/dialect/init_dialects.h" #include "paddle/infrt/dialect/pd_ops.h" #include "paddle/infrt/dialect/tensor_shape.h" #include "paddle/infrt/paddle/model_parser.h" diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index 86df3508cf8..957d852442b 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -22,7 +22,7 @@ #include "paddle/infrt/common/object.h" #include "paddle/infrt/common/shared.h" -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/host_context/function.h" #include "paddle/infrt/support/variant.h" #include "paddle/infrt/tensor/dense_host_tensor.h" diff --git a/paddle/infrt/kernel/basic_kernels.cc b/paddle/infrt/kernel/basic_kernels.cc index 23e50a5ddc8..b186cfcfd2b 100644 --- a/paddle/infrt/kernel/basic_kernels.cc +++ b/paddle/infrt/kernel/basic_kernels.cc @@ -63,24 +63,24 @@ static void PrintString(const std::string &str) { void RegisterBasicKernels(host_context::KernelRegistry *registry) { RegisterIntBasicKernels(registry); RegisterFloatBasicKernels(registry); - registry->AddKernel("Infrt.get_string", INFRT_KERNEL(GetString)); - registry->AddKernel("Infrt.print_string", INFRT_KERNEL(PrintString)); + registry->AddKernel("infrt.get_string", INFRT_KERNEL(GetString)); + registry->AddKernel("infrt.print_string", INFRT_KERNEL(PrintString)); } void RegisterIntBasicKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("Infrt.add.i32", INFRT_KERNEL(add)); - registry->AddKernel("Infrt.sub.i32", INFRT_KERNEL(sub)); - registry->AddKernel("Infrt.mul.i32", INFRT_KERNEL(mul)); - registry->AddKernel("Infrt.div.i32", INFRT_KERNEL(div)); - registry->AddKernel("Infrt.print.i32", INFRT_KERNEL(print)); + registry->AddKernel("infrt.add.i32", INFRT_KERNEL(add)); + registry->AddKernel("infrt.sub.i32", INFRT_KERNEL(sub)); + registry->AddKernel("infrt.mul.i32", INFRT_KERNEL(mul)); + registry->AddKernel("infrt.div.i32", INFRT_KERNEL(div)); + registry->AddKernel("infrt.print.i32", INFRT_KERNEL(print)); } void RegisterFloatBasicKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("Infrt.add.f32", INFRT_KERNEL(add)); - registry->AddKernel("Infrt.sub.f32", INFRT_KERNEL(sub)); - registry->AddKernel("Infrt.mul.f32", INFRT_KERNEL(mul)); - registry->AddKernel("Infrt.div.f32", INFRT_KERNEL(div)); - registry->AddKernel("Infrt.print.f32", INFRT_KERNEL(print)); + registry->AddKernel("infrt.add.f32", INFRT_KERNEL(add)); + registry->AddKernel("infrt.sub.f32", INFRT_KERNEL(sub)); + registry->AddKernel("infrt.mul.f32", INFRT_KERNEL(mul)); + registry->AddKernel("infrt.div.f32", INFRT_KERNEL(div)); + registry->AddKernel("infrt.print.f32", INFRT_KERNEL(print)); } } // namespace kernel diff --git a/paddle/infrt/kernel/control_flow_kernels.cc b/paddle/infrt/kernel/control_flow_kernels.cc index 8b18aca0210..6cc94dbcce0 100644 --- a/paddle/infrt/kernel/control_flow_kernels.cc +++ 
b/paddle/infrt/kernel/control_flow_kernels.cc @@ -37,7 +37,7 @@ static void INFRTCall( } void RegisterControlFlowKernels(host_context::KernelRegistry* registry) { - registry->AddKernel("Infrt.call", INFRT_KERNEL(INFRTCall)); + registry->AddKernel("infrt.call", INFRT_KERNEL(INFRTCall)); } } // namespace kernel diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h index 187e5c64511..e77e9becb6f 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/infrt/backends/host/phi_allocator.h" -#include "paddle/infrt/dialect/infrt/common_type.h" +#include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/host_context/kernel_utils.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/infrt/kernel/test_kernels.cc b/paddle/infrt/kernel/test_kernels.cc index d15bbe221f9..bcf475d1bc0 100644 --- a/paddle/infrt/kernel/test_kernels.cc +++ b/paddle/infrt/kernel/test_kernels.cc @@ -193,7 +193,7 @@ tensor::DenseHostTensor ShadowCopyTensor(tensor::DenseHostTensor src) { } void RegisterTestKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("Infrt.benchmark", INFRT_KERNEL(benchmark)); + registry->AddKernel("infrt.benchmark", INFRT_KERNEL(benchmark)); registry->AddKernel("Infrt.test.shadow_copy_tensor", INFRT_KERNEL(ShadowCopyTensor)); } diff --git a/paddle/infrt/tests/dialect/basic.mlir b/paddle/infrt/tests/dialect/basic.mlir index 2d4d6f2629e..f534a3aa44a 100644 --- a/paddle/infrt/tests/dialect/basic.mlir +++ b/paddle/infrt/tests/dialect/basic.mlir @@ -1,33 +1,33 @@ // RUN: infrtexec -i %s | FileCheck %s // CHECK-LABEL: @basic_f32 func @basic_f32() -> f32 { - %v0 = Infrt.constant.f32 1.0 - %v1 = Infrt.constant.f32 2.0 - %value = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v0 = infrt.constant.f32 1.0 + %v1 = infrt.constant.f32 2.0 + %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 // CHECK-NEXT: 3 - "Infrt.print.f32"(%value) : (f32) -> () + "infrt.print.f32"(%value) : (f32) -> () - Infrt.return %value : f32 + infrt.return %value : f32 } /// ================================================================ /// @caller call the other function @callee func @callee.add.f32(%x : f32, %y : f32, %y1 : f32) -> f32 { - %z = "Infrt.add.f32"(%x, %y) : (f32, f32) -> f32 - %z1 = "Infrt.add.f32"(%z, %y1) : (f32, f32) -> f32 - Infrt.return %z1 : f32 + %z = "infrt.add.f32"(%x, %y) : (f32, f32) -> f32 + %z1 = "infrt.add.f32"(%z, %y1) : (f32, f32) -> f32 + infrt.return %z1 : f32 } // CHECK-LABEL: @caller.add.f32 func @caller.add.f32() -> f32 { - %x = Infrt.constant.f32 1.0 - %y = Infrt.constant.f32 2.0 - %y1 = Infrt.constant.f32 3.0 - %z = Infrt.call @callee.add.f32(%x, %y, %y1) : (f32, f32, f32) -> f32 + %x = infrt.constant.f32 1.0 + %y = infrt.constant.f32 2.0 + %y1 = infrt.constant.f32 3.0 + %z = infrt.call @callee.add.f32(%x, %y, %y1) : (f32, f32, f32) -> f32 // CHECK-NEXT: 6 - "Infrt.print.f32"(%z) : (f32) -> () - Infrt.return %z : f32 + "infrt.print.f32"(%z) : (f32) -> () + infrt.return %z : f32 } /// <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/paddle/infrt/tests/dialect/benchmark.mlir b/paddle/infrt/tests/dialect/benchmark.mlir index 381fd534f6a..1a57b434990 100644 --- a/paddle/infrt/tests/dialect/benchmark.mlir +++ b/paddle/infrt/tests/dialect/benchmark.mlir @@ -12,13 +12,13 @@ func @benchmark() { // CHECK-LABEL: BM:add.f32:CPU 95%(ns) // CHECK-LABEL: BM:add.f32:CPU 99%(ns) 
// CHECK-LABEL: BM:add.f32:CPU utilization(percent) - Infrt.benchmark "add.f32"() duration_secs = 1, max_count = 3, num_warmup_runs = 3 + infrt.benchmark "add.f32"() duration_secs = 1, max_count = 3, num_warmup_runs = 3 { - %0 = Infrt.constant.f32 1.0 - %1 = Infrt.constant.f32 2.0 - %res = "Infrt.add.f32"(%0, %1) : (f32, f32) -> f32 - "Infrt.print.f32"(%res) : (f32) -> () - Infrt.return %res : f32 + %0 = infrt.constant.f32 1.0 + %1 = infrt.constant.f32 2.0 + %res = "infrt.add.f32"(%0, %1) : (f32, f32) -> f32 + "infrt.print.f32"(%res) : (f32) -> () + infrt.return %res : f32 } - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/dense_tensor.mlir b/paddle/infrt/tests/dialect/dense_tensor.mlir index faade62d350..6dc99046104 100644 --- a/paddle/infrt/tests/dialect/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/dense_tensor.mlir @@ -4,14 +4,14 @@ func @dense_shape0() { %shape = ts.build_shape [1:i64, 57:i64] %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - Infrt.return + infrt.return } func @predict(%a: !infrt.dense_tensor, %b: !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) { %a0 = dt.shallow_copy_tensor %a : !infrt.dense_tensor -> !infrt.dense_tensor %b0 = dt.shallow_copy_tensor %b : !infrt.dense_tensor -> !infrt.dense_tensor - Infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor + infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor } @@ -19,6 +19,6 @@ func @main() { %shape = ts.build_shape [1:i64, 57:i64] %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - %b, %c = Infrt.call @predict(%a, %a) : (!infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) - Infrt.return + %b, %c = infrt.call @predict(%a, %a) : (!infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) + infrt.return } diff --git a/paddle/infrt/tests/dialect/disabled_tensor_map.mlir b/paddle/infrt/tests/dialect/disabled_tensor_map.mlir index 1cae065bd5f..936c8f32c01 100644 --- a/paddle/infrt/tests/dialect/disabled_tensor_map.mlir +++ b/paddle/infrt/tests/dialect/disabled_tensor_map.mlir @@ -1,30 +1,30 @@ // CHECK-LABEL: @predict -func @predict(%input:!Infrt.tensor, %map: !Infrt.tensor_map) -> (!Infrt.tensor) { - %w = dt.get_param(%map, "create_parameter_0.w_0") -> !Infrt.tensor - %bias = dt.get_param(%map, "create_parameter_1.w_0") -> !Infrt.tensor +func @predict(%input:!infrt.dense_tensor, %map: !infrt.dense_tensor_map) -> (!infrt.dense_tensor) { + %w = dt.get_param(%map, "create_parameter_0.w_0") -> !infrt.dense_tensor + %bias = dt.get_param(%map, "create_parameter_1.w_0") -> !infrt.dense_tensor - %out = dt.create_uninit_tensor.f32 [3, 3] -> !Infrt.tensor + %out = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.dense_tensor // fc - "external.matmul"(%input, %w, %out) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () - "external.sigmoid"(%out, %out) {}: (!Infrt.tensor, !Infrt.tensor) -> () - //dt.print_tensor (%out : !Infrt.tensor) + "external.matmul"(%input, %w, %out) {}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> () + "external.sigmoid"(%out, %out) {}: (!infrt.dense_tensor, !infrt.dense_tensor) -> () + //dt.print_tensor (%out : !infrt.dense_tensor) - Infrt.return %out : !Infrt.tensor + 
infrt.return %out : !infrt.dense_tensor } // CHECK-LABEL: @main func @main() { - %input = dt.create_uninit_tensor.f32 [3, 3] -> !Infrt.tensor - dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor) {value=1.0:f32} + %input = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%input : !infrt.dense_tensor) {value=1.0:f32} // CHECK-LABEL: loading params %map = dt.load_params() {path="/Infrt/build/paddle/paddle_1.8_fc_model"} - %out = Infrt.call @predict(%input, %map): (!Infrt.tensor, !Infrt.tensor_map) -> (!Infrt.tensor) - dt.print_tensor (%out : !Infrt.tensor) + %out = infrt.call @predict(%input, %map): (!infrt.dense_tensor, !infrt.dense_tensor_map) -> (!infrt.dense_tensor) + dt.print_tensor (%out : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/paddle_ops.mlir b/paddle/infrt/tests/dialect/paddle_ops.mlir index 48ee4b9d725..4b805551493 100644 --- a/paddle/infrt/tests/dialect/paddle_ops.mlir +++ b/paddle/infrt/tests/dialect/paddle_ops.mlir @@ -5,5 +5,5 @@ func @ops() { %b = pd.feed() {name="input1"}: tensor %d = pd.feed() {name="input3"}: !infrt.lod_tensor<3x4x9xf32, 0> %c = "pd.matmul"(%a, %b) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir index e8f09f07c82..b40184e7266 100644 --- a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir @@ -11,6 +11,6 @@ func @sign_any_float32_execute() { // CHECK: dense_tensor: shape=shape[1], values=[1] "phi_dt.print_tensor" (%e) : (!infrt.dense_tensor) -> () - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/phi/phi_test.mlir b/paddle/infrt/tests/dialect/phi/phi_test.mlir index 923f4e9d9d2..5b0fa735897 100644 --- a/paddle/infrt/tests/dialect/phi/phi_test.mlir +++ b/paddle/infrt/tests/dialect/phi/phi_test.mlir @@ -2,14 +2,14 @@ module { func @predict(%arg0: !infrt.dense_tensor) -> !infrt.dense_tensor { %2 = "pd.abs"(%arg0) : (!infrt.dense_tensor) -> !infrt.dense_tensor - Infrt.return %2 : !infrt.dense_tensor + infrt.return %2 : !infrt.dense_tensor } func @main() { %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context %t = "phi_dt.create_dense_tensor" (%ctx) {precision=#infrt.precision, layout=#infrt.layout, lod=[1:i64], dims=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () - %2 = Infrt.call@predict(%t) : (!infrt.dense_tensor) -> !infrt.dense_tensor + %2 = infrt.call@predict(%t) : (!infrt.dense_tensor) -> !infrt.dense_tensor phi_dt.print_tensor(%2 : !infrt.dense_tensor) - Infrt.return + infrt.return } } diff --git a/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir b/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir index 76ae140dd6c..47bc1f78331 100644 --- a/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir @@ -3,14 +3,14 @@ func @dense_shape0() { %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - Infrt.return + infrt.return } func @predict(%a: !infrt.dense_tensor, %b: !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) { %a0 = dt.shallow_copy_tensor %a : !infrt.dense_tensor -> !infrt.dense_tensor %b0 = dt.shallow_copy_tensor %b : !infrt.dense_tensor -> !infrt.dense_tensor - Infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor + 
infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor } @@ -18,6 +18,6 @@ func @main() { %shape = ts.build_shape [1:i64, 57:i64] %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - %b, %c = Infrt.call @predict(%a, %a) : (!infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) - Infrt.return + %b, %c = infrt.call @predict(%a, %a) : (!infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir b/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir index 52b296e06cd..d6b69fdd595 100644 --- a/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir +++ b/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir @@ -13,7 +13,7 @@ func @naive_elementwise_add() { // CHECK: tensor: shape=shape[2,8], values=[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] dt.print_tensor (%c : !infrt.dense_tensor) - Infrt.return + infrt.return } // RUN: infrtexec -i %s | FileCheck %s @@ -31,5 +31,5 @@ func @naive_matmul() { // CHECK: tensor: shape=shape[2,4], values=[16, 16, 16, 16, 16, 16, 16, 16] dt.print_tensor (%c : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in index 28450ed6bd8..7aeb3f8a4d0 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in +++ b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in @@ -3,12 +3,12 @@ func @load_tensor_map() { %map = dt.load_params(){path="@CMAKE_BINARY_DIR@/multi_fc_model"} %size = dt.tensor_map_get_size(%map) -> i32 - Infrt.print.i32 %size + infrt.print.i32 %size %a = dt.tensor_map_get_tensor(%map) {name="fc_bias"} -> !infrt.dense_tensor // CHECK: tensor: shape=shape[2], values=[0, 0] dt.print_tensor (%a : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir b/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir index 5623aef71aa..09210078b9d 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir +++ b/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir @@ -4,5 +4,5 @@ func @build_tensor1() { %a = ts.build_shape [1:i64, 57:i64, 92:i64] // CHECK: shape[1,57,92] ts.print_shape %a - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_type.mlir b/paddle/infrt/tests/dialect/tensor/tensor_type.mlir index e580634055a..5847d567cf6 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_type.mlir +++ b/paddle/infrt/tests/dialect/tensor/tensor_type.mlir @@ -6,5 +6,5 @@ func @test_tensor_type() { // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] dt.print_tensor (%a : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor_shape.mlir b/paddle/infrt/tests/dialect/tensor_shape.mlir index 5623aef71aa..09210078b9d 100644 --- a/paddle/infrt/tests/dialect/tensor_shape.mlir +++ b/paddle/infrt/tests/dialect/tensor_shape.mlir @@ -4,5 +4,5 @@ func @build_tensor1() { %a = ts.build_shape [1:i64, 57:i64, 92:i64] // CHECK: shape[1,57,92] ts.print_shape %a - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor_type.mlir b/paddle/infrt/tests/dialect/tensor_type.mlir index e580634055a..5847d567cf6 100644 --- a/paddle/infrt/tests/dialect/tensor_type.mlir +++ b/paddle/infrt/tests/dialect/tensor_type.mlir @@ -6,5 +6,5 @@ func @test_tensor_type() { // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1] dt.print_tensor (%a : !infrt.dense_tensor) - Infrt.return + infrt.return } diff --git a/paddle/infrt/tests/dialect/trt_ops.mlir b/paddle/infrt/tests/dialect/trt_ops.mlir index 6d25044d139..e3cb9670bec 100644 --- a/paddle/infrt/tests/dialect/trt_ops.mlir +++ b/paddle/infrt/tests/dialect/trt_ops.mlir @@ -12,5 +12,5 @@ func @main(%bias:tensor, %c:tensor, %b1:tensor, %b2:tensor< %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (tensor, tensor) -> tensor %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor - "infrt.return"(%e2) : (tensor)->() + infrt.return %e2 : tensor } -- GitLab From d6e99fe4eeb24efd445cfd1093c35dc43e6e0e15 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Mon, 14 Mar 2022 13:47:13 +0800 Subject: [PATCH 027/176] Adjust Yaml name parsing to satisfy Sparse-related APIs (#40480) --- .../final_state_generator/eager_gen.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 967891fe522..537c2bb7f02 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -148,6 +148,12 @@ def ReadBwdFile(filepath): ###################### ### Yaml Parsers ### ###################### +def RemoveSpecialSymbolsInName(string): + # Remove any name after '@' + ret = string.split("@")[0] + return ret + + def IntermediateValidationCheck(intermediate_outputs, forward_returns_list): # intermediate_outputs : [name0, name1, ...] # forward_returns_list : [[ret_name, type, orig_pos], ...] @@ -166,15 +172,19 @@ def IntermediateValidationCheck(intermediate_outputs, forward_returns_list): def ParseDispensable(string): # string: "X, Y" + string = RemoveSpecialSymbolsInName(string) return [v.strip() for v in string.split(",")] def ParseIntermediate(string): + string = RemoveSpecialSymbolsInName(string) return [v.strip() for v in string.split(",")] def ParseNoNeedBuffer(string): # string: "x, y" + string = RemoveSpecialSymbolsInName(string) + no_need_buffer_set = set() for name in string.split(","): no_need_buffer_set.add(name.strip()) @@ -204,6 +214,8 @@ def ParseYamlArgs(string): assert arg_type in yaml_types_mapping.keys() arg_type = yaml_types_mapping[arg_type] + + arg_name = RemoveSpecialSymbolsInName(arg_name) if "Tensor" in arg_type: assert default_value is None inputs_list.append([arg_name, arg_type, i]) @@ -239,6 +251,7 @@ def ParseYamlReturns(string): ret_type = yaml_types_mapping[ret_type] assert "Tensor" in ret_type + ret_name = RemoveSpecialSymbolsInName(ret_name) returns_list.append([ret_name, ret_type, i]) return returns_list -- GitLab From e14a6ec976cd2fdcaf5644d72dede00b2b131f9e Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Mon, 14 Mar 2022 14:35:34 +0800 Subject: [PATCH 028/176] fix_group_sharded_note (#40488) --- .../meta_parallel/sharding/sharding_stage3.py | 1 - .../distributed/sharding/group_sharded.py | 26 +++++++++++-------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py index 9886ca4e2de..f96273cc84c 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ -912,7 +912,6 @@ 
def _device2cpu(trans_param, convert_dtype=False): def _cpu2device(param): tmp_p = param.fw_storage.cuda(DEV_ID) - param.fw_storage._clear() if tmp_p.dtype == Type.fp32.value and param2dtype[ param.name] == Type.fp16.value: tmp_p = paddle.cast(tmp_p, Type.fp16.value) diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py index 2fdb20600f6..6fd4caa7b4a 100644 --- a/python/paddle/distributed/sharding/group_sharded.py +++ b/python/paddle/distributed/sharding/group_sharded.py @@ -39,19 +39,20 @@ def group_sharded_parallel(model, segment_size=2**20, sync_comm=False): """ - Use this module to configure and wrap up the parameters of the group shared module. + Use group_sharded_parallel can perform group shared configuration on the model, optimizer and GradScaler. Level has three string options, 'os', 'os_g' and 'p_g_os' corresponds to three different usage scenarios: optimizer state segmentation, optimizer state + gradient segmentation, and parameter + gradient + optimizer state segmentation. + Usually, optimizer state + gradient segmentation is actually a re optimization of optimizer state segmentation, so optimizer state + gradient segmentation can be used to realize optimizer state segmentation. Args: model (Layer): The layer to be wrapped with group_sharded_parallel. optimizer (Optimizer): The optimizer to be wrapped with group_sharded_parallel. level (str): The different level of the group sharded. Such as `os`, `os_g`, `p_g_os`. - scaler (GradScaler, optional): The scaler to be wrapped with group_sharded_parallel. Defaults to None. - group (Group, optional): The group instance. Defaults to None.d - offload (bool, optional): Whether to perform optimizer state and gradient transfer CPU. Defaults to False. - sync_buffers (bool, optional): Whether to broadcast model buffers. Defaults to False. - buffer_max_size (int, optional): The max size of the buffer used to integrate gradient in `os_g`. Defaults to 2**23. - segment_size (int, optional): The smallest size of parameter to be sharded in `p_g_os`. Defaults to 2**20. - sync_comm (bool, optional): Whether to use synchronous communication, only in `p_g_os` used. Defaults to False. + scaler (GradScaler, optional): If AMP is used, you need to pass GradScaler. Defaults to None, indicating that GradScaler is not used. + group (Group, optional): The group instance. Defaults to None, indicating that the default environment group is used. + offload (bool, optional): Whether to use the offload function. Defaults to False, which means that the offload function is not used. + sync_buffers (bool, optional): Whether to broadcast model buffers. It is generally used when there are registered model buffers. Defaults to False, indicating that model buffers are not used. + buffer_max_size (int, optional): The max size of the buffer used to integrate gradient in `os_g`. The larger the size, the more GPU memory will be used. Defaults to 2**23, which means that the dimension of the buffer is 2**23. + segment_size (int, optional): The smallest size of parameter to be sharded in `p_g_os`. Defaults to 2**20, indicating that the dimension of the minimum segmented parameter is 2**20. + sync_comm (bool, optional): Whether to use synchronous communication, only in `p_g_os` used. Defaults to False, indicating that asynchronous communication is used. Returns: model: A wrapper for group sharded given model. 
@@ -101,7 +102,7 @@ def group_sharded_parallel(model, def check_dtype(param): return param.dtype == paddle.float16 - params_fp16 = filter(check_dtype, model.parameters()) + params_fp16 = list(filter(check_dtype, model.parameters())) if scaler is None and len(params_fp16) > 0: raise ValueError("Please enter the correct scaler.") # convert model/optimizer/scaler @@ -146,10 +147,13 @@ def save_group_sharded_model(model, output, optimizer=None): """ Group sharded encapsulated model and optimizer state saving module. + .. note:: + If using save_group_sharded_model saves the model. When loading again, you need to set the model or optimizer state before using group_sharded_parallel. + Args: model (Layer): A wrapper for group sharded given model. output (str): Save directory. - optimizer (Optimizer, optional): Group sharded encapsulated optimizer. Defaults to None. + optimizer (Optimizer, optional): Group sharded encapsulated optimizer. Defaults to None, indicating that the optimizer state is not saved. Examples: .. code-block:: python @@ -182,7 +186,7 @@ def save_group_sharded_model(model, output, optimizer=None): optimizer.clear_grad() # save model and optimizer state_dict - save_group_sharded_model(model, optimizer,output=output_dir) + save_group_sharded_model(model, optimizer, output=output_dir) """ logger_.info( "==========Begin to save group sharded model and optimizer==========") -- GitLab From 89a70c765d986312f2294d66a7d019bcfb5f5856 Mon Sep 17 00:00:00 2001 From: liutiexing <74819124+liutiexing@users.noreply.github.com> Date: Mon, 14 Mar 2022 15:03:40 +0800 Subject: [PATCH 029/176] Update profiler (#40460) --- paddle/fluid/platform/profiler.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index feb72bce72b..940fc98d3b3 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -77,7 +77,9 @@ RecordEvent::RecordEvent(const char *name, const TracerEventType type, #endif #endif if (FLAGS_enable_host_event_recorder_hook == false) { - OriginalConstruct(name, role, "none"); + if (g_state != ProfilerState::kDisabled) { // avoid temp string + OriginalConstruct(name, role, "none"); + } return; } if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) { @@ -165,8 +167,8 @@ void RecordEvent::End() { } #endif #endif - uint64_t end_ns = PosixInNsec(); if (LIKELY(FLAGS_enable_host_event_recorder_hook && is_enabled_)) { + uint64_t end_ns = PosixInNsec(); if (LIKELY(shallow_copy_name_ != nullptr)) { HostEventRecorder::GetInstance().RecordEvent( shallow_copy_name_, start_ns_, end_ns, role_, type_); @@ -190,6 +192,7 @@ void RecordEvent::End() { // lock is not needed, the code below is thread-safe DeviceTracer *tracer = GetDeviceTracer(); if (tracer) { + uint64_t end_ns = PosixInNsec(); tracer->AddCPURecords(CurAnnotationName(), start_ns_, end_ns, BlockDepth(), g_thread_id); } -- GitLab From 930a5136078b0ad32b4846631fcf9ce982adb9c2 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Mon, 14 Mar 2022 15:45:26 +0800 Subject: [PATCH 030/176] [Phi] Migrate triangular_solve dependence to phi (#40417) --- paddle/fluid/operators/lstsq_op.cu | 15 +++-- paddle/fluid/operators/lstsq_op.h | 1 - paddle/fluid/operators/lu_op.h | 28 +++++--- paddle/fluid/operators/math/matrix_solve.cc | 39 ----------- .../fluid/operators/math/matrix_solve.cu.cc | 61 ------------------ paddle/fluid/operators/math/matrix_solve.h | 8 --- paddle/fluid/operators/triangular_solve_op.cc | 2 
- paddle/fluid/operators/triangular_solve_op.h | 64 ------------------- .../kernels/cpu/triangular_solve_kernel.cc | 1 + 9 files changed, 31 insertions(+), 188 deletions(-) delete mode 100644 paddle/fluid/operators/triangular_solve_op.h diff --git a/paddle/fluid/operators/lstsq_op.cu b/paddle/fluid/operators/lstsq_op.cu index 92c9857f0b9..10e2867bf29 100644 --- a/paddle/fluid/operators/lstsq_op.cu +++ b/paddle/fluid/operators/lstsq_op.cu @@ -17,9 +17,11 @@ #include #include +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/lstsq_op.h" #include "paddle/fluid/operators/qr_op.h" #include "paddle/fluid/platform/dynload/cusolver.h" +#include "paddle/phi/kernels/triangular_solve_kernel.h" namespace paddle { namespace operators { @@ -70,6 +72,10 @@ class LstsqCUDAKernel : public framework::OpKernel { Tensor tau = dito.Fill(tau_dims_vec, 0); auto tau_data = tau.mutable_data(context.GetPlace()); + using Context = + typename framework::ConvertToPhiContext::TYPE; + auto& phi_dev_ctx = static_cast(dev_ctx); + if (m >= n) { Tensor tmp_x = dito.Transpose(new_x); Tensor tmp_y = dito.Transpose(new_y); @@ -93,8 +99,9 @@ class LstsqCUDAKernel : public framework::OpKernel { Tensor slice_y = dito.Slice(trans_y, {-2}, {0}, {min_mn}); // Step 3, solve R X = Y - triangular_solve(dev_ctx, res_r, slice_y, solution, - true, false, false); + phi::TriangularSolveKernel(phi_dev_ctx, res_r, slice_y, true, + false, false, solution); + } else { auto x_data = new_x.mutable_data(context.GetPlace()); auto y_data = new_y.mutable_data(context.GetPlace()); @@ -105,8 +112,8 @@ class LstsqCUDAKernel : public framework::OpKernel { // Step 2, solve R^H Z = Y Tensor trans_r = dito.Transpose(new_x); - triangular_solve(dev_ctx, trans_r, new_y, solution, - true, true, false); + phi::TriangularSolveKernel(phi_dev_ctx, trans_r, new_y, true, + true, false, solution); // Step 3, X <- Q Z BatchedOrgqr(dev_ctx, batch_count, n, n, min_mn, x_data, diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index 3cbbc62e7be..520722dafcb 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -22,7 +22,6 @@ #include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/operators/triangular_solve_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index f323e2e041d..214b2eccae9 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -15,12 +15,13 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/set_value_op.h" #include "paddle/fluid/operators/svd_helper.h" -#include "paddle/fluid/operators/triangular_solve_op.h" #include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/triangular_solve_kernel.h" namespace paddle { namespace operators { @@ -555,6 +556,11 @@ class LUGradKernel : public framework::OpKernel { framework::Tensor Pmat; Unpack_Pivot(dev_ctx, *P, &Pmat, m, k); + + using Context = + typename framework::ConvertToPhiContext::TYPE; + auto& phi_dev_ctx = static_cast(dev_ctx); + if (m <= n) { if (k < n) { framework::Tensor U_complement, U_grad_complement, phi_complement, @@ -605,8 +611,9 @@ class LUGradKernel : public framework::OpKernel { framework::Tensor psi_principal, phi_mH, psi_tmp; Tensor_Conj(dev_ctx, phi, &phi_mH); phi_mH = helper.Transpose(phi_mH); - triangular_solve(dev_ctx, U_narrow, phi_mH, - &psi_principal, true, false, false); + + phi::TriangularSolveKernel( + phi_dev_ctx, U_narrow, phi_mH, true, false, false, &psi_principal); Tensor_Conj(dev_ctx, psi_principal, &psi_principal); psi_principal = helper.Transpose(psi_principal); @@ -620,8 +627,9 @@ class LUGradKernel : public framework::OpKernel { SetValueCompute_dispatch(ctx, &psi, &psi_principal, &psi, axes, &slice_starts, &slice_ends, valuedims, xrank); - triangular_solve(dev_ctx, L_narrow_mH, psi, &psi_tmp, - true, false, true); + + phi::TriangularSolveKernel(phi_dev_ctx, L_narrow_mH, psi, + true, false, true, &psi_tmp); auto mat_dim_p = phi::funcs::CreateMatrixDescriptor(Pmat.dims(), 0, false); @@ -672,8 +680,10 @@ class LUGradKernel : public framework::OpKernel { &psi, axes, &slice_starts, &slice_ends, valuedims, xrank); framework::Tensor psi_principal, phi_mH, psi_tmp, U_narrow_mH; - triangular_solve(dev_ctx, L_narrow_mH, phi, - &psi_principal, true, false, true); + + phi::TriangularSolveKernel(phi_dev_ctx, L_narrow_mH, phi, + true, false, true, &psi_principal); + slice_starts[0] = 0; slice_starts[1] = 0; slice_ends[0] = k; @@ -695,8 +705,8 @@ class LUGradKernel : public framework::OpKernel { psi_tmp = helper.Transpose(psi_tmp); Tensor_Conj(dev_ctx, U_narrow, &U_narrow_mH); - triangular_solve(dev_ctx, U_narrow_mH, psi_tmp, &psi, - true, false, false); + phi::TriangularSolveKernel(phi_dev_ctx, U_narrow_mH, psi_tmp, + true, false, false, &psi); *dx = helper.Transpose(psi); } } diff --git a/paddle/fluid/operators/math/matrix_solve.cc b/paddle/fluid/operators/math/matrix_solve.cc index 883ee9b1486..7b239b81666 100644 --- a/paddle/fluid/operators/math/matrix_solve.cc +++ b/paddle/fluid/operators/math/matrix_solve.cc @@ -34,45 +34,6 @@ class MatrixSolveFunctor { template class MatrixSolveFunctor; template class MatrixSolveFunctor; -template -class TriangularSolveFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor* a, framework::Tensor* b, bool left, - bool upper, bool transpose, bool unitriangular) { - CBLAS_SIDE side = left ? CblasLeft : CblasRight; - CBLAS_UPLO uplo = upper ? CblasUpper : CblasLower; - CBLAS_TRANSPOSE transA = transpose ? CblasTrans : CblasNoTrans; - CBLAS_DIAG diag = unitriangular ? 
CblasUnit : CblasNonUnit; - - const T* a_data = a->data(); - T* b_data = b->mutable_data(context.GetPlace()); - - int a_dim_size = a->dims().size(); - int b_dim_size = b->dims().size(); - - int M = static_cast(b->dims()[b_dim_size - 2]); - int N = static_cast(b->dims()[b_dim_size - 1]); - auto lda = left ? std::max(1, M) : std::max(1, N); - auto ldb = std::max(1, N); - - int batch_size = 1; - auto& a_dim = a->dims(); - for (int i = 0; i < a_dim_size - 2; i++) { - batch_size *= a_dim[i]; - } - - auto blas = phi::funcs::GetBlas(context); - for (int i = 0; i < batch_size; i++) { - blas.TRSM(side, uplo, transA, diag, M, N, T(1), a_data + i * M * M, lda, - b_data + i * N * M, ldb); - } - } -}; - -template class TriangularSolveFunctor; -template class TriangularSolveFunctor; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc index d3490ead212..737196dde1d 100644 --- a/paddle/fluid/operators/math/matrix_solve.cu.cc +++ b/paddle/fluid/operators/math/matrix_solve.cu.cc @@ -161,67 +161,6 @@ class MatrixSolveFunctor { template class MatrixSolveFunctor; template class MatrixSolveFunctor; -template -class TriangularSolveFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, const Tensor* a, - Tensor* b, bool left, bool upper, bool transpose, - bool unitriangular) { - CBLAS_SIDE side = left ? CblasLeft : CblasRight; - CBLAS_UPLO uplo = upper ? CblasUpper : CblasLower; - CBLAS_TRANSPOSE transA = transpose ? CblasTrans : CblasNoTrans; - CBLAS_DIAG diag = unitriangular ? CblasUnit : CblasNonUnit; - - const T* a_data = a->data(); - T* b_data = b->mutable_data(context.GetPlace()); - - int a_dim_size = a->dims().size(); - int b_dim_size = b->dims().size(); - - int M = static_cast(b->dims()[b_dim_size - 2]); - int N = static_cast(b->dims()[b_dim_size - 1]); - auto lda = left ? std::max(1, M) : std::max(1, N); - auto ldb = std::max(1, N); - - int batch_size = 1; - auto& a_dim = a->dims(); - for (int i = 0; i < a_dim_size - 2; i++) { - batch_size *= a_dim[i]; - } - - auto blas = phi::funcs::GetBlas(context); - if (batch_size <= 8 && M >= 64) { - for (auto i = 0; i < batch_size; i++) { - blas.TRSM(side, uplo, transA, diag, M, N, static_cast(1.0), - a_data + i * M * M, lda, b_data + i * N * M, ldb); - } - } else { - std::vector cpu_ptrs(batch_size * 2); - for (int i = 0; i < batch_size; ++i) { - cpu_ptrs[i] = a_data + i * M * M; - cpu_ptrs[i + batch_size] = b_data + i * M * N; - } - - // Copy the addresses of A and tmp_b from host to device. 
- memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), - platform::CPUPlace(), static_cast(cpu_ptrs.data()), - cpu_ptrs.size() * sizeof(T*), context.stream()); - - const T** gpu_a_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()); - T** gpu_b_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; - blas.BatchedTRSM(side, uplo, transA, diag, M, N, static_cast(1.0), - gpu_a_ptrs, lda, gpu_b_ptrs, ldb, batch_size); - } - } -}; - -template class TriangularSolveFunctor; -template class TriangularSolveFunctor; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.h b/paddle/fluid/operators/math/matrix_solve.h index 1dc43205592..415d0c6dd8e 100644 --- a/paddle/fluid/operators/math/matrix_solve.h +++ b/paddle/fluid/operators/math/matrix_solve.h @@ -117,14 +117,6 @@ class MatrixSolveFunctor { const framework::Tensor& b, framework::Tensor* out); }; -template -class TriangularSolveFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor* a, - framework::Tensor* b, bool left, bool upper, bool transpose, - bool unitriangular); -}; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/triangular_solve_op.cc b/paddle/fluid/operators/triangular_solve_op.cc index df84659a00f..35b925ca172 100644 --- a/paddle/fluid/operators/triangular_solve_op.cc +++ b/paddle/fluid/operators/triangular_solve_op.cc @@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/triangular_solve_op.h" #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/solve_op.h" #include "paddle/phi/infermeta/binary.h" namespace paddle { diff --git a/paddle/fluid/operators/triangular_solve_op.h b/paddle/fluid/operators/triangular_solve_op.h deleted file mode 100644 index fd46aca456c..00000000000 --- a/paddle/fluid/operators/triangular_solve_op.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "glog/logging.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/operators/solve_op.h" -#include "paddle/fluid/operators/tril_triu_op.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -static void triangular_solve(const DeviceContext &context, const Tensor &x, - const Tensor &y, Tensor *out, bool upper, - bool transpose, bool unitriangular) { - // Tensor broadcast use eigen library - std::vector x_bst_dims_vec; - std::vector y_bst_dims_vec; - std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(x, y); - - Tensor x_bst(x.type()); - TensorExpand(context, x, &x_bst, x_bst_dims_vec); - - Tensor y_bst(y.type()); - TensorExpand(context, y, &y_bst, y_bst_dims_vec); - - // TriangularSolveFunctor performs calculations in-place - // x_clone should be a copy of 'x' after broadcast - // out should be a copy of 'y' after broadcast - Tensor x_clone(x.type()); - x_clone.Resize(phi::make_ddim(x_bst_dims_vec)); - x_clone.mutable_data(context.GetPlace()); - framework::TensorCopy(x_bst, context.GetPlace(), context, &x_clone); - - out->Resize(phi::make_ddim(y_bst_dims_vec)); - out->mutable_data(context.GetPlace()); - framework::TensorCopy(y_bst, context.GetPlace(), context, out); - - math::TriangularSolveFunctor functor; - functor(context, &x_clone, out, /*left=*/true, upper, transpose, - unitriangular); -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/kernels/cpu/triangular_solve_kernel.cc b/paddle/phi/kernels/cpu/triangular_solve_kernel.cc index 5aca5be1279..c91e7475f5b 100644 --- a/paddle/phi/kernels/cpu/triangular_solve_kernel.cc +++ b/paddle/phi/kernels/cpu/triangular_solve_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/triangular_solve_kernel.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" -- GitLab From edd97f94464a53153ac8ec38b62c9a64a8f25893 Mon Sep 17 00:00:00 2001 From: WangXi Date: Mon, 14 Mar 2022 16:06:20 +0800 Subject: [PATCH 031/176] [hybrid fix] fix sharding save offload (#40477) --- .../distributed/fleet/meta_optimizers/sharding/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index b42f21989ab..1a3a8a4883d 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -900,11 +900,12 @@ def save_persistables(exe, dirname, main_program, filename=None): def is_opt_vars(var): # NOTE(JZ-LIANG): The checks should be updated when add new compatible optimizer - # now only Momentum and adam are compatible with sharding - # support EMA optimizer + # now only Momentum and adam are compatible with sharding, + # support EMA optimizer with '_ema_0', + # support offload with '@offload_0' and '.cast_fp16' checks = [ "_moment1_0", "_moment2_0", "_beta1_pow_acc_0", "_beta2_pow_acc_0", - "_velocity_0", "_ema_0" + "_velocity_0", "_ema_0", "@offload_0", ".cast_fp16" ] for check in checks: if var.name.endswith(check) and var.persistable: -- GitLab From b9d4285b7eea3abe1eaafada9560401c6df92362 Mon Sep 17 00:00:00 2001 From: crystal <62974595+Zjq9409@users.noreply.github.com> Date: Mon, 14 Mar 2022 16:11:05 +0800 Subject: [PATCH 032/176] =?UTF-8?q?=E3=80=90phi=E3=80=91migrate=20matrix?= =?UTF-8?q?=5Frank=20to=20phi=20(#40074)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * migrate matrix_rank to phi * migrate eigh and matrix_rank to phi * fix matrix_rank * optimize code * move matrix_rank to phi * add max functor * migrate matrix_rank to phi * optimize code --- paddle/fluid/operators/matrix_rank_op.cc | 139 +----- paddle/fluid/operators/matrix_rank_op.cu | 316 ------------- paddle/phi/kernels/cpu/matrix_rank_kernel.cc | 43 ++ .../phi/kernels/cpu/matrix_rank_tol_kernel.cc | 178 +++++++ paddle/phi/kernels/gpu/matrix_rank_kernel.cu | 52 +++ .../phi/kernels/gpu/matrix_rank_tol_kernel.cu | 438 ++++++++++++++++++ .../kernels/impl/matrix_rank_kernel_impl.h} | 28 +- paddle/phi/kernels/matrix_rank_kernel.h | 29 ++ paddle/phi/kernels/matrix_rank_tol_kernel.h | 29 ++ paddle/phi/ops/compat/matrix_rank_sig.cc | 38 ++ 10 files changed, 828 insertions(+), 462 deletions(-) delete mode 100644 paddle/fluid/operators/matrix_rank_op.cu create mode 100644 paddle/phi/kernels/cpu/matrix_rank_kernel.cc create mode 100644 paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc create mode 100644 paddle/phi/kernels/gpu/matrix_rank_kernel.cu create mode 100644 paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu rename paddle/{fluid/operators/matrix_rank_op.h => phi/kernels/impl/matrix_rank_kernel_impl.h} (72%) create mode 100644 paddle/phi/kernels/matrix_rank_kernel.h create mode 100644 paddle/phi/kernels/matrix_rank_tol_kernel.h create mode 100644 paddle/phi/ops/compat/matrix_rank_sig.cc diff --git a/paddle/fluid/operators/matrix_rank_op.cc b/paddle/fluid/operators/matrix_rank_op.cc index 1f04875c220..e7d08b65973 100644 --- a/paddle/fluid/operators/matrix_rank_op.cc +++ b/paddle/fluid/operators/matrix_rank_op.cc @@ -12,7 +12,6 @@ // See 
the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/matrix_rank_op.h" #include #include #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" @@ -70,9 +69,9 @@ class MatrixRankeOp : public framework::OperatorWithKernel { std::vector x_batch_dims_array(max_dim); std::vector tol_dims_array(max_dim); std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(dim_x_batch, dim_tol, x_batch_dims_array.data(), - tol_dims_array.data(), out_dims_array.data(), - max_dim, axis); + phi::funcs::GetBroadcastDimsArrays( + dim_x_batch, dim_tol, x_batch_dims_array.data(), + tol_dims_array.data(), out_dims_array.data(), max_dim, axis); ctx->SetOutputDim("Out", phi::make_ddim(out_dims_array)); } } else { @@ -115,141 +114,9 @@ class MatrixRankeOpMaker : public framework::OpProtoAndCheckerMaker { } }; -template -void BatchEigenvalues(const T* x_data, T* eigenvalues_data, int batches, - int rows, int cols, int k) { - // Eigen::Matrix API need non-const pointer. - T* input = const_cast(x_data); - int stride = rows * cols; - for (int i = 0; i < batches; i++) { - auto m = Eigen::Map< - Eigen::Matrix>( - input + i * stride, rows, rows); - Eigen::SelfAdjointEigenSolver< - Eigen::Matrix> - eigen_solver(m); - auto eigenvalues = eigen_solver.eigenvalues().cwiseAbs(); - for (int j = 0; j < k; j++) { - *(eigenvalues_data + i * k + j) = eigenvalues[j]; - } - } -} - -template -void BatchSVD(const T* x_data, T* eigenvalues_data, int batches, int rows, - int cols, int k) { - // Eigen::Matrix API need non-const pointer. - T* input = const_cast(x_data); - int stride = rows * cols; - Eigen::BDCSVD< - Eigen::Matrix> - svd; - for (int i = 0; i < batches; i++) { - auto m = Eigen::Map< - Eigen::Matrix>( - input + i * stride, rows, cols); - svd.compute(m); - auto res_s = svd.singularValues(); - for (int j = 0; j < k; j++) { - eigenvalues_data[i * k + j] = res_s[j]; - } - } -} - -template -class MatrixRankCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - auto* x_data = x->data(); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - bool hermitian = context.Attr("hermitian"); - - auto dim_x = x->dims(); - auto dim_out = out->dims(); - int rows = dim_x[dim_x.size() - 2]; - int cols = dim_x[dim_x.size() - 1]; - int k = std::min(rows, cols); - auto numel = x->numel(); - int batches = numel / (rows * cols); - - bool use_default_tol = context.Attr("use_default_tol"); - const Tensor* atol_tensor = nullptr; - Tensor temp_tensor; - T rtol_T = 0; - if (use_default_tol) { - framework::TensorFromVector(std::vector{0}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - rtol_T = std::numeric_limits::epsilon() * std::max(rows, cols); - } else if (context.HasInput("TolTensor")) { - atol_tensor = context.Input("TolTensor"); - } else { - framework::TensorFromVector(std::vector{context.Attr("tol")}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - } - - Tensor eigenvalue_tensor; - auto* eigenvalue_data = eigenvalue_tensor.mutable_data( - detail::GetEigenvalueDim(dim_x, k), context.GetPlace()); - if (hermitian) { - BatchEigenvalues(x_data, eigenvalue_data, batches, rows, cols, k); - } else { - BatchSVD(x_data, eigenvalue_data, batches, rows, cols, k); - } - - auto dito_T = - math::DeviceIndependenceTensorOperations( - context); - std::vector max_eigenvalue_shape 
= - phi::vectorize(detail::RemoveLastDim(eigenvalue_tensor.dims())); - Tensor max_eigenvalue_tensor = - dito_T.ReduceMax(eigenvalue_tensor, max_eigenvalue_shape); - - Tensor temp_rtol_tensor; - framework::TensorFromVector(std::vector{rtol_T}, &temp_rtol_tensor); - Tensor rtol_tensor = dito_T.Mul(temp_rtol_tensor, max_eigenvalue_tensor); - Tensor tol_tensor; - tol_tensor.mutable_data(dim_out, context.GetPlace()); - ElementwiseComputeEx, platform::CPUDeviceContext, - T, T>(context, atol_tensor, &rtol_tensor, -1, - GreaterElementFunctor(), &tol_tensor); - - tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); - - Tensor compare_result; - compare_result.mutable_data(detail::NewAxisDim(dim_out, k), - context.GetPlace()); - - int axis = -1; - if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) { - ElementwiseComputeEx, - platform::CPUDeviceContext, T, int>( - context, &eigenvalue_tensor, &tol_tensor, axis, - phi::funcs::GreaterThanFunctor(), &compare_result); - } else { - ElementwiseComputeEx, - platform::CPUDeviceContext, T, int>( - context, &eigenvalue_tensor, &tol_tensor, axis, - phi::funcs::LessThanFunctor(), &compare_result); - } - auto dito_int = - math::DeviceIndependenceTensorOperations(context); - std::vector result_shape = phi::vectorize(dim_out); - Tensor result = dito_int.ReduceSum(compare_result, result_shape); - out->ShareDataWith(result); - } -}; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(matrix_rank, ops::MatrixRankeOp, ops::MatrixRankeOpMaker); - -REGISTER_OP_CPU_KERNEL(matrix_rank, ops::MatrixRankCPUKernel, - ops::MatrixRankCPUKernel); diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu deleted file mode 100644 index dccd716022d..00000000000 --- a/paddle/fluid/operators/matrix_rank_op.cu +++ /dev/null @@ -1,316 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef PADDLE_WITH_HIP -// HIP not support cusolver -#include -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/matrix_rank_op.h" -#include "paddle/fluid/operators/svd_helper.h" -#include "paddle/fluid/platform/dynload/cusolver.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/compare_functors.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { -namespace detail { -DDim GetUDDim(const DDim& x_dim, int k) { - auto x_vec = phi::vectorize(x_dim); - x_vec[x_vec.size() - 1] = k; - return phi::make_ddim(x_vec); -} - -DDim GetVHDDim(const DDim& x_dim, int k) { - auto x_vec = phi::vectorize(x_dim); - x_vec[x_vec.size() - 2] = k; - return phi::make_ddim(x_vec); -} -} // namespace detail - -template -class MatrixRankGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = - context.template device_context(); - - const Tensor* x = context.Input("X"); - auto* x_data = x->data(); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - bool hermitian = context.Attr("hermitian"); - - auto dim_x = x->dims(); - auto dim_out = out->dims(); - int rows = dim_x[dim_x.size() - 2]; - int cols = dim_x[dim_x.size() - 1]; - int k = std::min(rows, cols); - auto numel = x->numel(); - int batches = numel / (rows * cols); - - bool use_default_tol = context.Attr("use_default_tol"); - const Tensor* atol_tensor = nullptr; - Tensor temp_tensor; - T rtol_T = 0; - if (use_default_tol) { - framework::TensorFromVector(std::vector{0}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - rtol_T = std::numeric_limits::epsilon() * std::max(rows, cols); - } else if (context.HasInput("TolTensor")) { - atol_tensor = context.Input("TolTensor"); - } else { - framework::TensorFromVector(std::vector{context.Attr("tol")}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - } - - // Must Copy X once, because the gesvdj will destory the content when exit. 
- Tensor x_tmp; - paddle::framework::TensorCopy(*x, context.GetPlace(), &x_tmp); - auto info = memory::Alloc(dev_ctx, sizeof(int) * batches); - int* info_ptr = reinterpret_cast(info->ptr()); - - Tensor eigenvalue_tensor; - auto* eigenvalue_data = eigenvalue_tensor.mutable_data( - detail::GetEigenvalueDim(dim_x, k), context.GetPlace()); - if (hermitian) { - SyevjBatched(dev_ctx, batches, rows, x_tmp.data(), eigenvalue_data, - info_ptr); - platform::ForRange for_range( - dev_ctx, eigenvalue_tensor.numel()); - phi::funcs::AbsFunctor functor(eigenvalue_data, eigenvalue_data, - eigenvalue_tensor.numel()); - for_range(functor); - } else { - Tensor U, VH; - auto* u_data = - U.mutable_data(detail::GetUDDim(dim_x, k), context.GetPlace()); - auto* vh_data = - VH.mutable_data(detail::GetVHDDim(dim_x, k), context.GetPlace()); - GesvdjBatched(dev_ctx, batches, cols, rows, k, x_tmp.data(), vh_data, - u_data, eigenvalue_data, info_ptr, 1); - } - - auto dito_T = - math::DeviceIndependenceTensorOperations(context); - std::vector max_eigenvalue_shape = - phi::vectorize(detail::RemoveLastDim(eigenvalue_tensor.dims())); - Tensor max_eigenvalue_tensor = - dito_T.ReduceMax(eigenvalue_tensor, max_eigenvalue_shape); - Tensor temp_rtol_tensor; - framework::TensorFromVector(std::vector{rtol_T}, - context.device_context(), &temp_rtol_tensor); - Tensor rtol_tensor = dito_T.Mul(temp_rtol_tensor, max_eigenvalue_tensor); - Tensor tol_tensor; - tol_tensor.mutable_data(dim_out, context.GetPlace()); - ElementwiseComputeEx, platform::CUDADeviceContext, - T, T>(context, atol_tensor, &rtol_tensor, -1, - GreaterElementFunctor(), &tol_tensor); - - tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); - - Tensor compare_result; - compare_result.mutable_data(detail::NewAxisDim(dim_out, k), - context.GetPlace()); - int axis = -1; - ElementwiseComputeEx, - platform::CUDADeviceContext, T, int64_t>( - context, &eigenvalue_tensor, &tol_tensor, axis, - phi::funcs::GreaterThanFunctor(), &compare_result); - auto dito_int = - math::DeviceIndependenceTensorOperations(context); - std::vector result_shape = phi::vectorize(dim_out); - Tensor result = dito_int.ReduceSum(compare_result, result_shape); - out->ShareDataWith(result); - } - - void GesvdjBatched(const platform::CUDADeviceContext& dev_ctx, int batchSize, - int m, int n, int k, T* A, T* U, T* V, T* S, int* info, - int thin_UV = 1) const; - - void SyevjBatched(const platform::CUDADeviceContext& dev_ctx, int batchSize, - int n, T* A, T* W, int* info) const; -}; - -template <> -void MatrixRankGPUKernel::GesvdjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int m, int n, - int k, float* A, float* U, float* V, float* S, int* info, - int thin_UV) const { - // do not compute singular vectors - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - gesvdjInfo_t gesvdj_params = NULL; - int lda = m; - int ldu = m; - int ldt = n; - int lwork = 0; - auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize( - handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, - gesvdj_params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); - float* workspace_ptr = reinterpret_cast(workspace->ptr()); - int stride_A = lda * n; - int stride_U = ldu * (thin_UV ? k : m); - int stride_V = ldt * (thin_UV ? 
k : n); - for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj( - handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, - U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, - info, gesvdj_params)); - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); -} - -template <> -void MatrixRankGPUKernel::GesvdjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int m, int n, - int k, double* A, double* U, double* V, double* S, int* info, - int thin_UV) const { - // do not compute singular vectors - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - gesvdjInfo_t gesvdj_params = NULL; - int lda = m; - int ldu = m; - int ldt = n; - int lwork = 0; - auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize( - handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, - gesvdj_params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); - double* workspace_ptr = reinterpret_cast(workspace->ptr()); - int stride_A = lda * n; - int stride_U = ldu * (thin_UV ? k : m); - int stride_V = ldt * (thin_UV ? k : n); - for (int i = 0; i < batchSize; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj( - handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, - U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, - info, gesvdj_params)); - // check the error info - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); -} - -template <> -void MatrixRankGPUKernel::SyevjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int n, float* A, - float* W, int* info) const { - auto handle = dev_ctx.cusolver_dn_handle(); - // Compute eigenvalues only - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - // matrix is saved as column-major in cusolver. 
- // numpy and torch use lower triangle to compute eigenvalues, so here use - // upper triangle - cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; - int lda = n; - int stride_A = lda * n; - int lwork = 0; - syevjInfo_t params = NULL; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateSyevjInfo(¶ms)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj_bufferSize( - handle, jobz, uplo, n, A, lda, W, &lwork, params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); - float* workspace_ptr = reinterpret_cast(workspace->ptr()); - for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj( - handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, - lwork, info, params)); - - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i, - error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroySyevjInfo(params)); -} - -template <> -void MatrixRankGPUKernel::SyevjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int n, double* A, - double* W, int* info) const { - auto handle = dev_ctx.cusolver_dn_handle(); - // Compute eigenvalues only - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - // upper triangle of A is stored - cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; - int lda = n; - int stride_A = lda * n; - int lwork = 0; - syevjInfo_t params = NULL; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateSyevjInfo(¶ms)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj_bufferSize( - handle, jobz, uplo, n, A, lda, W, &lwork, params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); - double* workspace_ptr = reinterpret_cast(workspace->ptr()); - - for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj( - handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, - lwork, info, params)); - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", i, - error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroySyevjInfo(params)); -} - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(matrix_rank, ops::MatrixRankGPUKernel, - ops::MatrixRankGPUKernel); -#endif // not PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/cpu/matrix_rank_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_kernel.cc new file mode 100644 index 00000000000..5e13abe8aed --- /dev/null +++ b/paddle/phi/kernels/cpu/matrix_rank_kernel.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/matrix_rank_kernel.h" +#include "paddle/phi/kernels/matrix_rank_tol_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/full_kernel.h" + +namespace phi { + +template +void MatrixRankKernel(const Context& dev_ctx, + const DenseTensor& x, + float tol, + bool use_default_tol, + bool hermitian, + DenseTensor* out) { + DenseTensor atol_tensor; + if (use_default_tol) { + atol_tensor = phi::Full(dev_ctx, {1}, static_cast(0)); + } else { + atol_tensor = phi::Full(dev_ctx, {1}, static_cast(tol)); + } + MatrixRankTolKernel( + dev_ctx, x, atol_tensor, use_default_tol, hermitian, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + matrix_rank, CPU, ALL_LAYOUT, phi::MatrixRankKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc new file mode 100644 index 00000000000..210750da1e0 --- /dev/null +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -0,0 +1,178 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/matrix_rank_tol_kernel.h" + +#include +#include +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/reduce_max_kernel.h" + +namespace phi { + +template +void BatchEigenvalues(const T* x_data, + T* eigenvalues_data, + int batches, + int rows, + int cols, + int k) { + // Eigen::Matrix API need non-const pointer. + T* input = const_cast(x_data); + int stride = rows * cols; + for (int i = 0; i < batches; i++) { + auto m = Eigen::Map< + Eigen::Matrix>( + input + i * stride, rows, rows); + Eigen::SelfAdjointEigenSolver< + Eigen::Matrix> + eigen_solver(m); + auto eigenvalues = eigen_solver.eigenvalues().cwiseAbs(); + for (int j = 0; j < k; j++) { + *(eigenvalues_data + i * k + j) = eigenvalues[j]; + } + } +} + +template +void BatchSVD(const T* x_data, + T* eigenvalues_data, + int batches, + int rows, + int cols, + int k) { + // Eigen::Matrix API need non-const pointer. 
+ T* input = const_cast(x_data); + int stride = rows * cols; + Eigen::BDCSVD< + Eigen::Matrix> + svd; + for (int i = 0; i < batches; i++) { + auto m = Eigen::Map< + Eigen::Matrix>( + input + i * stride, rows, cols); + svd.compute(m); + auto res_s = svd.singularValues(); + for (int j = 0; j < k; j++) { + eigenvalues_data[i * k + j] = res_s[j]; + } + } +} + +template +void MatrixRankTolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& atol_tensor, + bool use_default_tol, + bool hermitian, + DenseTensor* out) { + auto* x_data = x.data(); + dev_ctx.template Alloc(out); + auto dim_x = x.dims(); + auto dim_out = out->dims(); + int rows = dim_x[dim_x.size() - 2]; + int cols = dim_x[dim_x.size() - 1]; + int k = std::min(rows, cols); + auto numel = x.numel(); + int batches = numel / (rows * cols); + + T rtol_T = 0; + + if (use_default_tol) { + rtol_T = std::numeric_limits::epsilon() * std::max(rows, cols); + } + + DenseTensor eigenvalue_tensor; + eigenvalue_tensor.Resize(detail::GetEigenvalueDim(dim_x, k)); + auto* eigenvalue_data = dev_ctx.template Alloc(&eigenvalue_tensor); + + if (hermitian) { + BatchEigenvalues(x_data, eigenvalue_data, batches, rows, cols, k); + } else { + BatchSVD(x_data, eigenvalue_data, batches, rows, cols, k); + } + + DenseTensor max_eigenvalue_tensor; + max_eigenvalue_tensor.Resize(detail::RemoveLastDim(eigenvalue_tensor.dims())); + dev_ctx.template Alloc(&max_eigenvalue_tensor); + phi::MaxKernel(dev_ctx, + eigenvalue_tensor, + std::vector{-1}, + false, + &max_eigenvalue_tensor); + + DenseTensor temp_rtol_tensor; + temp_rtol_tensor = + phi::Full(dev_ctx, {1}, static_cast(rtol_T)); + + DenseTensor rtol_tensor = + phi::Multiply(dev_ctx, temp_rtol_tensor, max_eigenvalue_tensor); + + DenseTensor tol_tensor; + tol_tensor.Resize(dim_out); + dev_ctx.template Alloc(&tol_tensor); + funcs::ElementwiseCompute, T, T>( + dev_ctx, + atol_tensor, + rtol_tensor, + -1, + GreaterElementFunctor(), + &tol_tensor); + + tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); + + DenseTensor compare_result; + compare_result.Resize(detail::NewAxisDim(dim_out, k)); + dev_ctx.template Alloc(&compare_result); + int axis = -1; + if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) { + funcs::ElementwiseCompute, T, int>( + dev_ctx, + eigenvalue_tensor, + tol_tensor, + axis, + funcs::GreaterThanFunctor(), + &compare_result); + } else { + funcs::ElementwiseCompute, T, int>( + dev_ctx, + eigenvalue_tensor, + tol_tensor, + axis, + funcs::LessThanFunctor(), + &compare_result); + } + + phi::SumKernel(dev_ctx, + compare_result, + std::vector{-1}, + compare_result.dtype(), + false, + out); +} +} // namespace phi + +PD_REGISTER_KERNEL( + matrix_rank_tol, CPU, ALL_LAYOUT, phi::MatrixRankTolKernel, float, double) { +} diff --git a/paddle/phi/kernels/gpu/matrix_rank_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_kernel.cu new file mode 100644 index 00000000000..9b889a9b4c0 --- /dev/null +++ b/paddle/phi/kernels/gpu/matrix_rank_kernel.cu @@ -0,0 +1,52 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PADDLE_WITH_HIP +// HIP not support cusolver + +#include "paddle/phi/kernels/matrix_rank_kernel.h" +#include "paddle/phi/kernels/matrix_rank_tol_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/full_kernel.h" + +namespace phi { + +template +void MatrixRankKernel(const Context& dev_ctx, + const DenseTensor& x, + float tol, + bool use_default_tol, + bool hermitian, + DenseTensor* out) { + DenseTensor atol_tensor; + if (use_default_tol) { + atol_tensor = phi::Full(dev_ctx, {1}, static_cast(0)); + } else { + atol_tensor = phi::Full(dev_ctx, {1}, static_cast(tol)); + } + MatrixRankTolKernel( + dev_ctx, x, atol_tensor, use_default_tol, hermitian, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(matrix_rank, // cuda_only + GPU, + ALL_LAYOUT, + phi::MatrixRankKernel, + float, + double) {} + +#endif // not PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu new file mode 100644 index 00000000000..ccd9f714956 --- /dev/null +++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu @@ -0,0 +1,438 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef PADDLE_WITH_HIP +// HIP not support cusolver + +#include "paddle/phi/kernels/matrix_rank_tol_kernel.h" + +#include +#include +#include "paddle/fluid/memory/memory.h" +#include "paddle/phi/backends/dynload/cusolver.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/abs_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/reduce_max_kernel.h" + +namespace phi { + +template +void GesvdjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int m, + int n, + int k, + T* A, + T* U, + T* V, + T* S, + int* info, + int thin_UV = 1); + +template +void SyevjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int n, + T* A, + T* W, + int* info); + +template <> +void GesvdjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int m, + int n, + int k, + float* A, + float* U, + float* V, + float* S, + int* info, + int thin_UV) { + // do not compute singular vectors + const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; + gesvdjInfo_t gesvdj_params = NULL; + int lda = m; + int ldu = m; + int ldt = n; + int lwork = 0; + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnSgesvdj_bufferSize(handle, + jobz, + thin_UV, + m, + n, + A, + lda, + S, + U, + ldu, + V, + ldt, + &lwork, + gesvdj_params)); + auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float)); + float* workspace_ptr = reinterpret_cast(workspace->ptr()); + int stride_A = lda * n; + int stride_U = ldu * (thin_UV ? k : m); + int stride_V = ldt * (thin_UV ? k : n); + for (int i = 0; i < batchSize; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSgesvdj(handle, + jobz, + thin_UV, + m, + n, + A + stride_A * i, + lda, + S + k * i, + U + stride_U * i, + ldu, + V + stride_V * i, + ldt, + workspace_ptr, + lwork, + info, + gesvdj_params)); + int error_info; + paddle::memory::Copy(phi::CPUPlace(), + &error_info, + dev_ctx.GetPlace(), + info, + sizeof(int), + dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + error_info, + 0, + phi::errors::PreconditionNotMet( + "For batch [%d]: CUSolver SVD is not zero. 
[%d]", i, error_info)); + } + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); +} + +template <> +void GesvdjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int m, + int n, + int k, + double* A, + double* U, + double* V, + double* S, + int* info, + int thin_UV) { + // do not compute singular vectors + const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; + gesvdjInfo_t gesvdj_params = NULL; + int lda = m; + int ldu = m; + int ldt = n; + int lwork = 0; + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnDgesvdj_bufferSize(handle, + jobz, + thin_UV, + m, + n, + A, + lda, + S, + U, + ldu, + V, + ldt, + &lwork, + gesvdj_params)); + auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double)); + double* workspace_ptr = reinterpret_cast(workspace->ptr()); + int stride_A = lda * n; + int stride_U = ldu * (thin_UV ? k : m); + int stride_V = ldt * (thin_UV ? k : n); + for (int i = 0; i < batchSize; ++i) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDgesvdj(handle, + jobz, + thin_UV, + m, + n, + A + stride_A * i, + lda, + S + k * i, + U + stride_U * i, + ldu, + V + stride_V * i, + ldt, + workspace_ptr, + lwork, + info, + gesvdj_params)); + // check the error info + int error_info; + paddle::memory::Copy(phi::CPUPlace(), + &error_info, + dev_ctx.GetPlace(), + info, + sizeof(int), + dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + error_info, + 0, + phi::errors::PreconditionNotMet( + "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); + } + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); +} + +template <> +void SyevjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int n, + float* A, + float* W, + int* info) { + auto handle = dev_ctx.cusolver_dn_handle(); + // Compute eigenvalues only + const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; + // matrix is saved as column-major in cusolver. + // numpy and torch use lower triangle to compute eigenvalues, so here use + // upper triangle + cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; + int lda = n; + int stride_A = lda * n; + int lwork = 0; + syevjInfo_t params = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(¶ms)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize( + handle, jobz, uplo, n, A, lda, W, &lwork, params)); + auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float)); + float* workspace_ptr = reinterpret_cast(workspace->ptr()); + for (int i = 0; i < batchSize; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj(handle, + jobz, + uplo, + n, + A + stride_A * i, + lda, + W + n * i, + workspace_ptr, + lwork, + info, + params)); + + int error_info; + paddle::memory::Copy(phi::CPUPlace(), + &error_info, + dev_ctx.GetPlace(), + info, + sizeof(int), + dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + error_info, + 0, + phi::errors::PreconditionNotMet( + "For batch [%d]: CUSolver eigenvalues is not zero. 
[%d]", + i, + error_info)); + } + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDestroySyevjInfo(params)); +} + +template <> +void SyevjBatched(const phi::GPUContext& dev_ctx, + int batchSize, + int n, + double* A, + double* W, + int* info) { + auto handle = dev_ctx.cusolver_dn_handle(); + // Compute eigenvalues only + const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; + // upper triangle of A is stored + cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; + int lda = n; + int stride_A = lda * n; + int lwork = 0; + syevjInfo_t params = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(¶ms)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevj_bufferSize( + handle, jobz, uplo, n, A, lda, W, &lwork, params)); + auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double)); + double* workspace_ptr = reinterpret_cast(workspace->ptr()); + + for (int i = 0; i < batchSize; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevj(handle, + jobz, + uplo, + n, + A + stride_A * i, + lda, + W + n * i, + workspace_ptr, + lwork, + info, + params)); + int error_info; + paddle::memory::Copy(phi::CPUPlace(), + &error_info, + dev_ctx.GetPlace(), + info, + sizeof(int), + dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + error_info, + 0, + phi::errors::PreconditionNotMet( + "For batch [%d]: CUSolver eigenvalues is not zero. [%d]", + i, + error_info)); + } + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDestroySyevjInfo(params)); +} + +template +void MatrixRankTolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& atol_tensor, + bool use_default_tol, + bool hermitian, + DenseTensor* out) { + auto* x_data = x.data(); + dev_ctx.template Alloc(out); + + auto dim_x = x.dims(); + auto dim_out = out->dims(); + int rows = dim_x[dim_x.size() - 2]; + int cols = dim_x[dim_x.size() - 1]; + int k = std::min(rows, cols); + auto numel = x.numel(); + int batches = numel / (rows * cols); + + T rtol_T = 0; + if (use_default_tol) { + rtol_T = std::numeric_limits::epsilon() * std::max(rows, cols); + } + + // Must Copy X once, because the gesvdj will destory the content when exit. 
+ DenseTensor x_tmp; + paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), &x_tmp); + auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batches); + int* info_ptr = reinterpret_cast(info->ptr()); + + DenseTensor eigenvalue_tensor; + eigenvalue_tensor.Resize(detail::GetEigenvalueDim(dim_x, k)); + auto* eigenvalue_data = dev_ctx.template Alloc(&eigenvalue_tensor); + + if (hermitian) { + SyevjBatched( + dev_ctx, batches, rows, x_tmp.data(), eigenvalue_data, info_ptr); + + phi::AbsKernel(dev_ctx, eigenvalue_tensor, &eigenvalue_tensor); + + } else { + DenseTensor U, VH; + U.Resize(detail::GetUDDim(dim_x, k)); + VH.Resize(detail::GetVHDDim(dim_x, k)); + auto* u_data = dev_ctx.template Alloc(&U); + auto* vh_data = dev_ctx.template Alloc(&VH); + GesvdjBatched(dev_ctx, + batches, + cols, + rows, + k, + x_tmp.data(), + vh_data, + u_data, + eigenvalue_data, + info_ptr, + 1); + } + + DenseTensor max_eigenvalue_tensor; + dev_ctx.template Alloc(&max_eigenvalue_tensor); + max_eigenvalue_tensor.Resize(detail::RemoveLastDim(eigenvalue_tensor.dims())); + + phi::MaxKernel(dev_ctx, + eigenvalue_tensor, + std::vector{-1}, + false, + &max_eigenvalue_tensor); + + DenseTensor temp_rtol_tensor; + temp_rtol_tensor = + phi::Full(dev_ctx, {1}, static_cast(rtol_T)); + + DenseTensor rtol_tensor = + phi::Multiply(dev_ctx, temp_rtol_tensor, max_eigenvalue_tensor); + DenseTensor tol_tensor; + tol_tensor.Resize(dim_out); + dev_ctx.template Alloc(&tol_tensor); + + funcs::ElementwiseCompute, T, T>( + dev_ctx, + atol_tensor, + rtol_tensor, + -1, + GreaterElementFunctor(), + &tol_tensor); + + tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); + + DenseTensor compare_result; + compare_result.Resize(detail::NewAxisDim(dim_out, k)); + dev_ctx.template Alloc(&compare_result); + + int axis = -1; + funcs::ElementwiseCompute, T, int64_t>( + dev_ctx, + eigenvalue_tensor, + tol_tensor, + axis, + funcs::GreaterThanFunctor(), + &compare_result); + + phi::SumKernel(dev_ctx, + compare_result, + std::vector{-1}, + compare_result.dtype(), + false, + out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(matrix_rank_tol, // cuda_only + GPU, + ALL_LAYOUT, + phi::MatrixRankTolKernel, + float, + double) {} + +#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/matrix_rank_op.h b/paddle/phi/kernels/impl/matrix_rank_kernel_impl.h similarity index 72% rename from paddle/fluid/operators/matrix_rank_op.h rename to paddle/phi/kernels/impl/matrix_rank_kernel_impl.h index 93545fd3103..b0dd76a17ee 100644 --- a/paddle/fluid/operators/matrix_rank_op.h +++ b/paddle/phi/kernels/impl/matrix_rank_kernel_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,14 +13,11 @@ // limitations under the License. 
 #pragma once
-#include
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/phi/core/ddim.h"
-namespace paddle {
-namespace operators {
-using Tensor = framework::Tensor;
-using DDim = framework::DDim;
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/matrix_rank_kernel.h"
+
+namespace phi {
 namespace detail {
 static DDim GetEigenvalueDim(const DDim& dim, int k) {
@@ -44,6 +41,18 @@ static DDim RemoveLastDim(const DDim& dim) {
   vec.erase(vec.end() - 1, vec.end());
   return phi::make_ddim(vec);
 }
+
+static DDim GetUDDim(const DDim& x_dim, int k) {
+  auto x_vec = phi::vectorize(x_dim);
+  x_vec[x_vec.size() - 1] = k;
+  return phi::make_ddim(x_vec);
+}
+
+static DDim GetVHDDim(const DDim& x_dim, int k) {
+  auto x_vec = phi::vectorize(x_dim);
+  x_vec[x_vec.size() - 2] = k;
+  return phi::make_ddim(x_vec);
+}
 }  // namespace detail
 
 template <typename T>
@@ -57,5 +66,4 @@ struct GreaterElementFunctor {
   }
 };
 
-}  // namespace operators
-}  // namespace paddle
+}  // namespace phi
diff --git a/paddle/phi/kernels/matrix_rank_kernel.h b/paddle/phi/kernels/matrix_rank_kernel.h
new file mode 100644
index 00000000000..6edea2723e5
--- /dev/null
+++ b/paddle/phi/kernels/matrix_rank_kernel.h
@@ -0,0 +1,29 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MatrixRankKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      float tol,
+                      bool use_default_tol,
+                      bool hermitian,
+                      DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/matrix_rank_tol_kernel.h b/paddle/phi/kernels/matrix_rank_tol_kernel.h
new file mode 100644
index 00000000000..351358dfa04
--- /dev/null
+++ b/paddle/phi/kernels/matrix_rank_tol_kernel.h
@@ -0,0 +1,29 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MatrixRankTolKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& atol_tensor,
+                         bool use_default_tol,
+                         bool hermitian,
+                         DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/ops/compat/matrix_rank_sig.cc b/paddle/phi/ops/compat/matrix_rank_sig.cc
new file mode 100644
index 00000000000..40dc29579b4
--- /dev/null
+++ b/paddle/phi/ops/compat/matrix_rank_sig.cc
@@ -0,0 +1,38 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +// we have to return every specific KernelSignature for infrt now +KernelSignature MatrixRankOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("TolTensor")) { + return KernelSignature("matrix_rank_tol", + {"X", "TolTensor"}, + {"use_default_tol", "hermitian"}, + {"Out"}); + } else { + return KernelSignature("matrix_rank", + {"X"}, + { + "tol", "use_default_tol", "hermitian", + }, + {"Out"}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(matrix_rank, phi::MatrixRankOpArgumentMapping); -- GitLab From 3881b6cb809052c8e8b581bfcfe5059078e96bcb Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Date: Mon, 14 Mar 2022 16:15:00 +0800 Subject: [PATCH 033/176] [AutoParallel] Converter (#40434) * [AutoParallel] Converter Converter API --- .../distributed/auto_parallel/converter.py | 455 ++++++++++++++++++ .../unittests/auto_parallel/CMakeLists.txt | 2 + .../unittests/auto_parallel/converter.py | 83 ++++ .../unittests/auto_parallel/test_converter.py | 69 +++ .../auto_parallel/test_engine_api.py | 2 +- 5 files changed, 610 insertions(+), 1 deletion(-) create mode 100644 python/paddle/distributed/auto_parallel/converter.py create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/converter.py create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py diff --git a/python/paddle/distributed/auto_parallel/converter.py b/python/paddle/distributed/auto_parallel/converter.py new file mode 100644 index 00000000000..d88f9fe7501 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/converter.py @@ -0,0 +1,455 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import warnings +import logging +import numpy as np +from ..utils import get_logger + + +class Converter(object): + """ + Converter is a class object for auto parallel to convert tensors from + one parallel strategy to another one. Tensors will merge and slice value + with their strategy when strategies are different. + """ + + def __init__(self, tensors_dict, pre_strategy, cur_strategy): + """ + Args: + tensors_dict(dict): tensors' value of all ranks that to be converted. + key is tensor's name(str), value is all ranks' data(list(numpy.ndarray)) + pre_strategy(dict): tensors' distributed attribute of last training process. 
+ key is tensor's name(str), value is tensor's distributed attribute in last + training process. + cur_strategy(dict): tensors' distributed attribute of current rank. + key is tensor's name(str), value is tensor's distributed attribute in current + rank. + """ + self._tensors_dict = self._check_tensor_dict(tensors_dict) + self._pre_strategy = self._check_pre_strategy(pre_strategy) + self._cur_strategy = self._check_cur_strategy(cur_strategy) + self._logger = get_logger(logging.INFO) + + def _check_tensor_dict(self, tensors_dict): + if not tensors_dict: + raise ValueError("'tensors_dict' is None, " + "the tensors to be converted cannot be None.") + if not isinstance(tensors_dict, dict): + raise TypeError( + "The type of 'tensors_dict' should be 'dict', but got '{}'.". + format(str(type(tensors_dict)))) + return tensors_dict + + def _check_pre_strategy(self, pre_strategy): + if not pre_strategy: + raise ValueError("'pre_strategy' is None, " + "there are not tensors in pre process.") + if not isinstance(pre_strategy, dict): + raise TypeError("The type of 'pre_strategy' should be 'dict', " + "but got '{}'.".format(str(type(pre_strategy)))) + return pre_strategy + + def _check_cur_strategy(self, cur_strategy): + if not cur_strategy: + warnings.warn("'cur_strategy' is None, " + "there are not tensors in cur process") + if not isinstance(cur_strategy, dict): + raise TypeError("The type of 'cur_strategy' should be 'dict', " + "but got '{}'.".format(str(type(cur_strategy)))) + return cur_strategy + + def convert(self, strict=True): + """ + Convert tensors + + Args: + strict(bool): whether to strict convert tensor with tensor's name. If False, it will + convert tensors by prefix matching. Otherwise, tensors will be converted with + their name strictly. + + Returns: + converted tensors(dict) + + Examples: + .. code-block:: python + + import numpy as np + complete_tensors = np.arange(4).reshape([2, 2]) + partitial_tensors = np.split(complete_tensors, 2, axis=0) + name = "tmp_0" + tensors_dict = {name: partitial_tensors} + strategy_1 = { + name: { + "process_shape": [2], + "process_group": [0, 1], + "dims_mapping": [0, -1] + } + } + strategy_2 = { + name: { + "process_shape": [2], + "process_group": [0, 1], + "dims_mapping": [-1, -1] + } + } + converter = Converter(tensors_dict, strategy_1, strategy_2) + result = converter.convert() + # the result's value is equal to `complete_tensors` + """ + tensors_dict = {} + # the name which is in cur_process but not in pre_process + tensor_not_in_pre = [] + # the name which is in pre_process but not in cur_process + tensor_not_in_cur = [] + # the name which is in strategy but not in ckpt files + tensor_not_in_ckpt = [] + self._logger.info("Start to convert tensors.") + for tensor_name in self._cur_strategy: + if tensor_name not in self._pre_strategy: + tensor_not_in_pre.append(tensor_name) + continue + if tensor_name not in self._tensors_dict: + tensor_not_in_ckpt.append(tensor_name) + continue + self._pre_name = tensor_name + self._cur_name = tensor_name + tensor_list = self._tensors_dict[tensor_name] + pre_dist_attr = self._pre_strategy[tensor_name] + cur_dist_attr = self._cur_strategy[tensor_name] + try: + tensors_dict[tensor_name] = Converter.merge_and_slice( + tensor_list, pre_dist_attr, cur_dist_attr) + except ValueError as err: + raise ValueError("Fail to convert tensor '{}'. 
" + .format(str(tensor_name)) + str(err)) + + for tensor_name in self._pre_strategy: + if tensor_name not in self._cur_strategy: + tensor_not_in_cur.append(tensor_name) + + if not strict: + tensors_dict, tensor_match_with_pre, tensor_match_with_cur = self.convert_with_prefix_match( + tensors_dict, tensor_not_in_pre, tensor_not_in_cur) + else: + tensors_dict, tensor_match_with_pre, tensor_match_with_cur = tensors_dict, [], [] + + tensor_not_in_pre = set(tensor_not_in_pre) - set(tensor_match_with_pre) + tensor_not_in_cur = set(tensor_not_in_cur) - set(tensor_match_with_cur) + if tensor_not_in_pre: + warnings.warn( + "tensors [{}] are not found in last training strategy." + .format(str(tensor_not_in_pre))) + if tensor_not_in_cur: + warnings.warn( + "tensors [{}] are not found in current training strategy." + .format(str(tensor_not_in_cur))) + if tensor_not_in_ckpt: + warnings.warn( + "tensors [{}] are found in pre_strategy, but are not found" + "in checkpoint files, please check your checkpoint files." + .format(str(tensor_not_in_ckpt))) + + return tensors_dict + + def convert_with_prefix_match(self, tensors_dict, tensor_not_in_pre, + tensor_not_in_cur): + # the name which in cur_process and can match with pre_process + tensor_match_with_pre = [] + # the name which in pre_process and can match with cur_process + tensor_match_with_cur = [] + for cur_name in tensor_not_in_pre: + prefix_name = cur_name + while prefix_name.find("_") != -1: + prefix_name = prefix_name[:prefix_name.rfind("_")] + for pre_name in tensor_not_in_cur: + if prefix_name in pre_name: + # 'cur_name' of cur_process can match with 'pre_name' of pre_process + self._pre_name = pre_name + self._cur_name = cur_name + pre_tensor_list = self._tensors_dict[pre_name] + pre_dist_attr = self._pre_strategy[pre_name] + cur_dist_attr = self._cur_strategy[cur_name] + try: + tensors_dict[cur_name] = Converter.merge_and_slice( + pre_tensor_list, pre_dist_attr, cur_dist_attr) + except ValueError as err: + raise ValueError( + "Fail to convert tensor '{}' by '{}'. ".format( + str(cur_name), str(pre_name)) + str(err)) + self._logger.info( + "tensor [{}] is matched with tensor [{}]".format( + cur_name, pre_name)) + tensor_match_with_pre.append(cur_name) + tensor_match_with_cur.append(pre_name) + break + break + + return tensors_dict, tensor_match_with_pre, tensor_match_with_cur + + @staticmethod + def merge_and_slice(tensor_list, pre_dist_attr, cur_dist_attr): + """ + Merge tensors with previous dist_attr and slice tensors with current dist_attr + + Returns: + tensor(numpy.narray): a tensor's value of current rank. 
+ """ + assert isinstance(tensor_list, list) + assert all(isinstance(p, np.ndarray) for p in tensor_list) + + if pre_dist_attr == cur_dist_attr: + # skip merge and slice tensor + rank_id = paddle.distributed.get_rank() + index = cur_dist_attr["process_group"].index(rank_id) + tensor = tensor_list[index] + else: + pre_dims_mapping = pre_dist_attr["dims_mapping"] + cur_dims_mapping = cur_dist_attr["dims_mapping"] + if len(set(pre_dims_mapping)) > 1 or -1 not in pre_dims_mapping: + # merge tensor + tensor = Converter.merge_with_dist_attr(tensor_list, + pre_dist_attr) + else: + # skip merge tensor + tensor = tensor_list[0] + + if len(set(cur_dims_mapping)) > 1 or -1 not in cur_dims_mapping: + # slice tensor + tensor = Converter.slice_with_dist_attr(tensor, cur_dist_attr) + + return tensor + + @staticmethod + def merge_with_dist_attr(tensor_list, dist_attr): + """ Merge tensor with distributed attribute """ + from .reshard import _compute_complete_shape, _compute_partition_index + + dims_mapping = dist_attr["dims_mapping"] + process_shape = dist_attr["process_shape"] + process_group = dist_attr["process_group"] + # get the complete shape of the tensor + complete_shape = _compute_complete_shape(tensor_list[0].shape, + process_shape, dims_mapping) + # merge the tensor with dist_attr + partition_tensor_list = [] + merged_partiton = [] + for process in process_group: + partition_index = _compute_partition_index( + process, complete_shape, dims_mapping, process_shape, + process_group) + index = process_group.index(process) + if partition_index not in merged_partiton: + merged_partiton.append(partition_index) + Converter.merge(partition_tensor_list, tensor_list[index], + partition_index, complete_shape) + + if len(partition_tensor_list) != 1: + raise ValueError("Fail to merge tensor with dist_attr '{}'.".format( + str(dist_attr))) + complete_tensor = partition_tensor_list[0][0] + return complete_tensor + + @staticmethod + def slice_with_dist_attr(tensor, dist_attr): + """ Slice tensor with distributed attribute """ + dims_mapping = dist_attr["dims_mapping"] + process_shape = dist_attr["process_shape"] + process_group = dist_attr["process_group"] + # slice the tensor with dist_attr + partition_index_list = Converter._get_split_indices( + tensor.shape, dims_mapping, process_shape, process_group) + sliced_tensor_list = Converter.split(tensor, partition_index_list, + len(partition_index_list)) + # get the current tensor's index in sliced_tensor_list + rank_id = paddle.distributed.get_rank() + sliced_tensor_index = Converter._get_sliced_index( + rank_id, tensor.shape, dims_mapping, process_shape, process_group) + if sliced_tensor_index not in range(len(sliced_tensor_list)): + raise ValueError("Fail to slice tensor with dist_attr '{}'.".format( + str(dist_attr))) + sliced_tensor = sliced_tensor_list[sliced_tensor_index] + return sliced_tensor + + @staticmethod + def merge(partition_tensor_list, tensor, partition_index, complete_shape): + """ + Merge partitial tensors to a complete. + + Returns: + None + + Examples: + .. 
code-block:: python + + import numpy as np + partition_tensor_list = [(np.array([[[1.11, 1.12]]]), [[0,1],[0,1],[0,2]])] + tensor = np.array([[[1.13, 1.14]]]) + partition_index = [[0,1],[0,1],[2,4]] + + _merge_tensor(partition_tensor_list, tensor, partition_index) + # partition_tensor_list: [(np.array([[[1.11, 1.12, 1.13, 1.14]]]), [[0,1],[0,1],[0,4]])] + """ + from .reshard import _compute_concat_info + + if len(partition_tensor_list) == 1: + is_complete_data = True + for idx, item in enumerate(partition_tensor_list[0][1]): + if item[0] != 0 or item[1] != complete_shape[idx]: + is_complete_data = False + break + if is_complete_data: + return + + if not partition_tensor_list: + partition_tensor_list.append((tensor, partition_index)) + else: + i = 0 + while i < len(partition_tensor_list): + concat_axis, first_order, new_partition = _compute_concat_info( + partition_tensor_list[i][1], partition_index) + if concat_axis != -1: + if first_order == 0: + new_tensor = np.concatenate( + (partition_tensor_list[i][0], tensor), + axis=concat_axis) + else: + new_tensor = np.concatenate( + (tensor, partition_tensor_list[i][0]), + axis=concat_axis) + + partition_tensor_list.pop(i) + Converter.merge(partition_tensor_list, new_tensor, + new_partition, complete_shape) + break + i += 1 + + @staticmethod + def split(complete_tensor, partition_index_list, length): + """ + Slice a complete tensor. + + Returns: + sliced_tensor_list(list): sliced tensors with 'partition_index_list' + + Examples: + .. code-block:: python + + import numpy as np + complete_tensor = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]]) + rank = 2 + complete_shape = [1, 1, 6] + dims_mapping = [-1, -1, 0] + process_shape = [3] + process_group = [0, 1, 2] + + sliced_tensor_list = split(complete_tensor, [[], [], [2, 4]], 3) + # [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])] + """ + sliced_tensor_list = [] + axis = len(complete_tensor.shape) - length + sliced_tensor = np.split( + complete_tensor, partition_index_list[axis], axis=axis) + if length == 1: + return sliced_tensor + for tensor in sliced_tensor: + sliced_tensor_list.extend( + Converter.split(tensor, partition_index_list, length - 1)) + return sliced_tensor_list + + @staticmethod + def _get_split_indices(complete_shape, dims_mapping, process_shape, + process_group): + """ + Get split indices of every dimension. + + Returns: + split_indices_list(list): the split indices of every dimension of the tensor + + Examples: + .. 
code-block:: python + + import numpy as np + complete_tensor = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]]) + complete_shape = [1, 1, 6] + dims_mapping = [-1, -1, 0] + process_shape = [3] + process_group = [0, 1, 2] + + index = _get_split_indices(complete_shape, dims_mapping, process_shape, process_group) + # index: [[], [], [2, 4]] + """ + from .reshard import _compute_partition_index + + split_indices_list = [] + for process in process_group: + partition_index = _compute_partition_index( + process, complete_shape, dims_mapping, process_shape, + process_group) + if split_indices_list: + for dim in range(len(partition_index)): + split_indices_list[dim].extend(partition_index[dim]) + else: + split_indices_list = partition_index + split_indices_list = list( + map(lambda x, y: list(set(x) - set([y]) - set([0])), + split_indices_list, complete_shape)) + split_indices_list = [sorted(x) for x in split_indices_list] + return split_indices_list + + @staticmethod + def _get_sliced_index(rank_id, complete_shape, dims_mapping, process_shape, + process_group): + """ + Get sliced_tensor's index of current rank in all sliced tensors list. + + Returns: + sliced_tensor_index(int): the index of sliced tensor in sliced_tensor_list + + Examples: + .. code-block:: python + + import numpy as np + complete_tensor = np.array([[[1.11, 1.12, 1.13, 1.14, 1.15, 1.16]]]) + rank = 2 + complete_shape = [1, 1, 6] + dims_mapping = [-1, -1, 0] + process_shape = [3] + process_group = [0, 1, 2] + + slice_tensor = _slice_tensor(complete_tensor, [[], [], [2, 4]], 3) + # slice_tensor: + # [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])] + + index = _get_sliced_index(rank, complete_shape, dims_mapping + process_shape, process_group) + # index: 2 + """ + from .reshard import _compute_partition_index + + partition_index = _compute_partition_index( + rank_id, complete_shape, dims_mapping, process_shape, process_group) + sliced_index = 0 + for i, shape in enumerate(complete_shape): + if dims_mapping[i] == -1: + slice_shape = shape + else: + slice_shape = shape // process_shape[dims_mapping[i]] + if shape == 1: + index = 0 + else: + index = (partition_index[i][0] + 1) // slice_shape + sliced_index = sliced_index * (shape // slice_shape) + index + return sliced_index diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 80bc206ae7b..1f7ae53acdf 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -9,4 +9,6 @@ if(WITH_DISTRIBUTE AND WITH_GPU) set_tests_properties(test_relaunch_with_gpt_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 240) py_test_modules(test_engine_api MODULES test_engine_api ENVS ${dist_ENVS}) set_tests_properties(test_engine_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80) + py_test_modules(test_converter MODULES test_converter ENVS ${dist_ENVS}) + set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/converter.py b/python/paddle/fluid/tests/unittests/auto_parallel/converter.py new file mode 100644 index 00000000000..e34f267b423 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/converter.py @@ -0,0 +1,83 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle +from paddle.distributed.auto_parallel.converter import Converter + + +def test_convert(): + rank_id = paddle.distributed.get_rank() + complete_tensor = np.arange(64).reshape([8, 8]) + tensor_row = np.split(complete_tensor, 2, axis=0) + tensor_col = np.split(complete_tensor, 2, axis=1) + tensor_name = "tensor_0" + complet_strategy = { + tensor_name: { + "process_shape": [2], + "process_group": [0, 1], + "dims_mapping": [-1, -1] + } + } + row_strategy = { + tensor_name: { + "process_shape": [2], + "process_group": [0, 1], + "dims_mapping": [0, -1] + } + } + col_strategy = { + tensor_name: { + "process_shape": [2], + "process_group": [0, 1], + "dims_mapping": [-1, 0] + } + } + + # test merge + tensor_dict = {tensor_name: tensor_row} + converter = Converter(tensor_dict, row_strategy, complet_strategy) + convert_tensor_dict = converter.convert() + assert np.equal(convert_tensor_dict[tensor_name], complete_tensor).all() + + # test slice + tensor_dict = {tensor_name: [complete_tensor]} + converter = Converter(tensor_dict, complet_strategy, col_strategy) + convert_tensor_dict = converter.convert() + assert np.equal(convert_tensor_dict[tensor_name], tensor_col[rank_id]).all() + + # test merge and slice + tensor_dict = {tensor_name: tensor_col} + converter = Converter(tensor_dict, col_strategy, row_strategy) + convert_tensor_dict = converter.convert() + assert np.equal(convert_tensor_dict[tensor_name], tensor_row[rank_id]).all() + + # test merge and slice with prefix match + new_name = "tensor_1" + row_strategy = { + new_name: { + "process_shape": [2], + "process_group": [0, 1], + "dims_mapping": [0, -1] + } + } + converter = Converter(tensor_dict, col_strategy, row_strategy) + convert_tensor_dict = converter.convert(strict=False) + assert np.equal(convert_tensor_dict[new_name], tensor_row[rank_id]).all() + + +if __name__ == "__main__": + test_convert() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py new file mode 100644 index 00000000000..fbadbb7d8c1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_converter.py @@ -0,0 +1,69 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import os +import sys +import shutil +import subprocess +from paddle.distributed.fleet.launch_utils import run_with_coverage +from paddle.distributed.auto_parallel.converter import Converter + + +class TestConverter(unittest.TestCase): + def test_converter(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + launch_model_path = os.path.join(file_dir, "converter.py") + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + + cmd = [sys.executable, "-u"] + coverage_args + [ + "-m", "launch", "--gpus", "0,1", launch_model_path + ] + + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) + + # Remove unnecessary files + log_path = os.path.join(file_dir, "log") + if os.path.exists(log_path): + shutil.rmtree(log_path) + + def test_input_invalid(self): + with self.assertRaises(ValueError): + Converter({}, [], []) + with self.assertRaises(TypeError): + Converter([0, 1], [], []) + with self.assertRaises(ValueError): + Converter({"tmp_0": [0]}, {}, []) + with self.assertRaises(TypeError): + Converter({"tmp_0": [0]}, [0], []) + + strategy_1 = { + 'tmp_0': { + "process_shape": [1], + "process_group": [0], + "dims_mapping": [-1] + } + } + with self.assertRaises(TypeError): + Converter({"tmp_0": [0]}, strategy_1, []) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py index a7d51a7e176..d150da761aa 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
-- GitLab From f269ca3f78083f41e9ff56ad0af5ef9e24905734 Mon Sep 17 00:00:00 2001 From: Lijunhui <1578034415@qq.com> Date: Mon, 14 Mar 2022 16:24:19 +0800 Subject: [PATCH 034/176] [KP] Add unittests for brelu,ceil,celu,elu,floor,hard_shrink,hard_sigmoid,log1p,logsigmoid,relu6,silu,soft_relu,softsign,swish (#40448) * solve unexecuted UT * add 24 activation op UT * append swish&thresholded_relu to kpfirst_list * rm thresholded_relu --- .../platform/device/xpu/xpu_op_kpfirst_list.h | 3 + .../unittests/xpu/test_activation_op_xpu.py | 405 ++++++++++++++++++ 2 files changed, 408 insertions(+) diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index c5dff84723c..ce9b09f60ca 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -56,6 +56,9 @@ XPUOpMap& get_kp_ops() { {"hard_shrink", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"hard_sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"thresholded_relu", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, }; return s_xpu_kp_kernels; diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py index d50c0fecdee..69bca8dd9ef 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py @@ -474,5 +474,410 @@ def ref_softplus(x, beta=1, threshold=20): return out +# XPU_KP unittests, these ops can be found from xpu_op_kpfirst_list.h +class XPUTestBReluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'brelu' + self.use_dynamic_create_class = False + + class XPUTestBRelu(TestActivationOPBase): + def set_case(self): + self.op_type = "brelu" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-5, 10, [10, 12]).astype(self.dtype) + t_min = 1.0 + t_max = 4.0 + # The same with TestAbs + x[np.abs(x - t_min) < 0.005] = t_min + 0.02 + x[np.abs(x - t_max) < 0.005] = t_max + 0.02 + t = np.copy(x) + t[t < t_min] = t_min + t[t > t_max] = t_max + + self.inputs = {'X': x} + self.outputs = {'Out': t} + self.attrs = {'use_xpu': True, 't_min': t_min, 't_max': t_max} + + +support_types = get_xpu_op_support_types('brelu') +for stype in support_types: + create_test_class(globals(), XPUTestBReluOP, stype) + + +class XPUTestCeilOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'ceil' + self.use_dynamic_create_class = False + + class XPUTestCeil(TestActivationOPBase): + def set_case(self): + self.op_type = "ceil" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + out = np.ceil(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('ceil') +for stype in support_types: + create_test_class(globals(), XPUTestCeilOP, stype) + + +class XPUTestCeluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'celu' + self.use_dynamic_create_class = False + + class XPUTestCelu(TestActivationOPBase): + def set_case(self): + self.op_type = "celu" + self.dtype = self.in_type + + alpha = 1.5 + x = np.random.uniform(-3, 3, [10, 12]).astype(self.dtype) + out = ref_celu(x, alpha) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + 
self.attrs = {'use_xpu': True, 'alpha': alpha} + + +support_types = get_xpu_op_support_types('celu') +for stype in support_types: + create_test_class(globals(), XPUTestCeluOP, stype) + + +def ref_celu(x, alpha): + out_ref = np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x / alpha) - 1)) + return out_ref.astype(x.dtype) + + +class XPUTestEluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'elu' + self.use_dynamic_create_class = False + + class XPUTestElu(TestActivationOPBase): + def set_case(self): + self.op_type = "elu" + self.dtype = self.in_type + + alpha = 1. + x = np.random.uniform(-3, 3, [10, 12]).astype(self.dtype) + out = ref_elu(x, alpha) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True, 'alpha': alpha} + + +support_types = get_xpu_op_support_types('elu') +for stype in support_types: + create_test_class(globals(), XPUTestEluOP, stype) + + +def ref_elu(x, alpha): + out_ref = np.where(x > 0, x, alpha * (np.exp(x) - 1)) + return out_ref.astype(x.dtype) + + +class XPUTestFloorOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'floor' + self.use_dynamic_create_class = False + + class XPUTestFloor(TestActivationOPBase): + def set_case(self): + self.op_type = "floor" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + out = np.floor(x) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('floor') +for stype in support_types: + create_test_class(globals(), XPUTestFloorOP, stype) + + +class XPUTestHardShrinkOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'hard_shrink' + self.use_dynamic_create_class = False + + class XPUTestHardShrink(TestActivationOPBase): + def set_case(self): + self.op_type = "hard_shrink" + self.dtype = self.in_type + + threshold = 0.5 + # self.set_attrs() + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) * 10 + out = ref_hardshrink(x, threshold) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': x} + self.outputs = {'Out': out} + + +support_types = get_xpu_op_support_types('hard_shrink') +for stype in support_types: + create_test_class(globals(), XPUTestHardShrinkOP, stype) + + +def ref_hardshrink(x, threshold): + out = np.copy(x) + out[(out >= -threshold) & (out <= threshold)] = 0 + return out + + +class XPUTestHardSigmoidOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'hard_sigmoid' + self.use_dynamic_create_class = False + + class XPUTestHardSigmoid(TestActivationOPBase): + def set_case(self): + self.op_type = "hard_sigmoid" + self.dtype = self.in_type + self.slope = 0.166666666666667 + self.offset = 0.5 + + x = np.random.uniform(-5, 5, [10, 12]).astype(self.dtype) + lower_threshold = -self.offset / self.slope + upper_threshold = (1. 
- self.offset) / self.slope + + # Same reason as TestAbs + delta = 0.005 + x[np.abs(x - lower_threshold) < delta] = lower_threshold - 0.02 + x[np.abs(x - upper_threshold) < delta] = upper_threshold - 0.02 + + out = ref_hardsigmoid(x, self.slope, self.offset) + + self.attrs = { + 'use_xpu': True, + 'slope': self.slope, + 'offset': self.offset + } + self.inputs = {'X': x} + self.outputs = {'Out': out} + + +support_types = get_xpu_op_support_types('hard_sigmoid') +for stype in support_types: + create_test_class(globals(), XPUTestHardSigmoidOP, stype) + + +def ref_hardsigmoid(x, slope=0.166666666666667, offset=0.5): + return np.maximum(np.minimum(x * slope + offset, 1.), 0.).astype(x.dtype) + + +class XPUTestLog1pOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'log1p' + self.use_dynamic_create_class = False + + class XPUTestLog1p(TestActivationOPBase): + def set_case(self): + self.op_type = "log1p" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.log1p(x) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('log1p') +for stype in support_types: + create_test_class(globals(), XPUTestLog1pOP, stype) + + +class XPUTestLogsigmoidOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'logsigmoid' + self.use_dynamic_create_class = False + + class XPUTestLogsigmoid(TestActivationOPBase): + def set_case(self): + self.op_type = "logsigmoid" + self.dtype = self.in_type + + np.random.seed(2048) + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + out = np.log(1 / (1 + np.exp(-x))) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('logsigmoid') +for stype in support_types: + create_test_class(globals(), XPUTestLogsigmoidOP, stype) + + +class XPUTestRelu6OP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'relu6' + self.use_dynamic_create_class = False + + class XPUTestRelu6(TestActivationOPBase): + def set_case(self): + self.op_type = "relu6" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-1, 10, [10, 12]).astype(self.dtype) + x[np.abs(x) < 0.005] = 0.02 + out = ref_relu6(x) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': x} + self.outputs = {'Out': out} + + +support_types = get_xpu_op_support_types('relu6') +for stype in support_types: + create_test_class(globals(), XPUTestRelu6OP, stype) + + +def ref_relu6(x, threshold=6.0): + out = np.copy(x) + out[np.abs(x - threshold) < 0.005] = threshold + 0.02 + out = np.minimum(np.maximum(x, 0), threshold) + return out + + +class XPUTestSiluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'silu' + self.use_dynamic_create_class = False + + class XPUTestSilu(TestActivationOPBase): + def set_case(self): + self.op_type = "silu" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + out = x / (np.exp(-x) + 1) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('silu') +for stype in support_types: + create_test_class(globals(), XPUTestSiluOP, stype) + + +class XPUTestSoftReluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'soft_relu' + self.use_dynamic_create_class = False + + class XPUTestSoftRelu(TestActivationOPBase): + def set_case(self): + self.op_type = "soft_relu" + self.dtype = 
self.in_type + + np.random.seed(4096) + x = np.random.uniform(-3, 3, [4, 4]).astype(self.dtype) + threshold = 2.0 + # The same reason with TestAbs + x[np.abs(x - threshold) < 0.005] = threshold + 0.02 + x[np.abs(x + threshold) < 0.005] = -threshold - 0.02 + t = np.copy(x) + t[t < -threshold] = -threshold + t[t > threshold] = threshold + out = np.log((np.exp(t) + 1)) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True, 'threshold': threshold} + + +support_types = get_xpu_op_support_types('soft_relu') +for stype in support_types: + create_test_class(globals(), XPUTestSoftReluOP, stype) + + +class XPUTestSoftSignOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'softsign' + self.use_dynamic_create_class = False + + class XPUTestSoftSign(TestActivationOPBase): + def set_case(self): + self.op_type = "softsign" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + out = ref_softsign(x) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('softsign') +for stype in support_types: + create_test_class(globals(), XPUTestSoftSignOP, stype) + + +def ref_softsign(x): + out = np.divide(x, 1 + np.abs(x)) + return out + + +class XPUTestSwishOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'swish' + self.use_dynamic_create_class = False + + class XPUTestSwish(TestActivationOPBase): + def set_case(self): + self.op_type = "swish" + self.dtype = self.in_type + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [10, 12]).astype(self.dtype) + out = ref_swish(x) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('swish') +for stype in support_types: + create_test_class(globals(), XPUTestSwishOP, stype) + + +def ref_swish(x): + from scipy.special import expit + out = x * expit(x) + return out + + if __name__ == "__main__": unittest.main() -- GitLab From c6ec8b9f698c55b62062131b38b263c4c2658522 Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Mon, 14 Mar 2022 16:31:55 +0800 Subject: [PATCH 035/176] adjust params order for eager.Tensor._copy_to (#40449) --- paddle/fluid/pybind/eager_method.cc | 4 ++-- python/paddle/fluid/tests/unittests/test_egr_python_api.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index b0dbce34d34..082ec382c79 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -214,8 +214,8 @@ static PyObject* tensor_method__is_initialized(TensorObject* self, static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY - bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 0), 0); - auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 1), 1); + auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 0), 0); + bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1); auto cp_tensor = self->tensor.copy_to(phi::TransToPhiBackend(place), blocking); egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 156fdcb9b0a..9744cda629e 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ 
b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -632,13 +632,13 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase): tensor2.persistable = True tensor2.stop_gradient = False if core.is_compiled_with_cuda(): - tensor3 = tensor2._copy_to(True, core.CUDAPlace(0)) + tensor3 = tensor2._copy_to(core.CUDAPlace(0), True) self.assertTrue(np.array_equal(tensor3.numpy(), arr2)) self.assertTrue(tensor3.persistable, True) self.assertTrue(tensor3.stop_gradient, True) self.assertTrue(tensor3.place.is_gpu_place()) else: - tensor3 = tensor2._copy_to(True, core.CPUPlace()) + tensor3 = tensor2._copy_to(core.CPUPlace(), True) self.assertTrue(np.array_equal(tensor3.numpy(), arr2)) self.assertTrue(tensor3.persistable, True) self.assertTrue(tensor3.stop_gradient, True) -- GitLab From 65adfecf37311fba6d5ecf657a18c15525f14e48 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 14 Mar 2022 16:44:13 +0800 Subject: [PATCH 036/176] [Eager] [Bug Fix] fix eager trace op bug (#40402) * fix some slice bug, test=develop * refine, test=develop --- python/paddle/fluid/dygraph/tracer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index 563cd433910..a7dd938a1cf 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -140,7 +140,12 @@ class Tracer(core.Tracer): outputs[retname][j].reconstruct_from_(returns[i][j], False) else: - outputs[retname][0].reconstruct_from_(returns[i], False) + if isinstance(outputs[retname], list): + outputs[retname][0].reconstruct_from_(returns[i], + False) + else: + outputs[retname].reconstruct_from_(returns[i], + False) elif isinstance(returns, list): assert len(outputs.keys()) == 1 key = list(outputs.keys())[0] -- GitLab From 02e80f5996aba2cca5cb1e29ed2f582d2643b33a Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Mon, 14 Mar 2022 16:48:02 +0800 Subject: [PATCH 037/176] set WITH_ONNXRUNTIME=off in windows (#40502) --- paddle/scripts/paddle_build.bat | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 35b2ce751b1..75afa4ef43f 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -66,6 +66,7 @@ if not defined WITH_TESTING set WITH_TESTING=ON if not defined MSVC_STATIC_CRT set MSVC_STATIC_CRT=ON if not defined WITH_PYTHON set WITH_PYTHON=ON if not defined ON_INFER set ON_INFER=ON +if not defined WITH_ONNXRUNTIME set WITH_ONNXRUNTIME=OFF if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON if not defined WITH_STATIC_LIB set WITH_STATIC_LIB=ON if not defined WITH_TPCACHE set WITH_TPCACHE=OFF @@ -757,7 +758,7 @@ for /F %%i in ("%libsize%") do ( ) cd /d %work_dir%\paddle\fluid\inference\api\demo_ci -%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %WITH_TENSORRT% %TENSORRT_ROOT% %MSVC_STATIC_CRT% +%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %WITH_TENSORRT% %TENSORRT_ROOT% %WITH_ONNXRUNTIME% %MSVC_STATIC_CRT% goto:eof :test_inference_error @@ -857,7 +858,7 @@ echo Step 7. Testing fluid library with infer_ut for inference ... 
echo ======================================== cd /d %work_dir%\paddle\fluid\inference\tests\infer_ut -%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %TENSORRT_ROOT% %MSVC_STATIC_CRT% +%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %TENSORRT_ROOT% %WITH_ONNXRUNTIME% %MSVC_STATIC_CRT% goto:eof :test_inference_ut_error -- GitLab From 250e254fac69680e07510a3c4f34337656a56a79 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Mon, 14 Mar 2022 16:48:30 +0800 Subject: [PATCH 038/176] Optimize performance of log_softmax (#38992) * Optimize performance of log_softmax * delete unity build * modify to phi * fix * fixfixfixfix * fix * fix * fix * fix * simplify * fix * fix enforce --- paddle/fluid/operators/log_softmax_op.cu | 464 ++------------------- paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 122 ++++-- 2 files changed, 124 insertions(+), 462 deletions(-) diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu index 8770abdac83..26b6ce43303 100644 --- a/paddle/fluid/operators/log_softmax_op.cu +++ b/paddle/fluid/operators/log_softmax_op.cu @@ -12,459 +12,43 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include "paddle/fluid/operators/log_softmax_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/funcs/functors.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" namespace paddle { namespace operators { -#define LAUNCH_WARP_FORWAR_COMPUTE(near_greater_power_of_two) \ - case near_greater_power_of_two: \ - ComputeLogSoftmaxForwardInWarp< \ - T, AccT, near_greater_power_of_two><<>>( \ - dst, src, outer_size, dim_size); \ - break; - -template -__device__ __forceinline__ T WarpReduceSum(T value) { -#pragma unroll - for (int offset = KernelWarpSize / 2; offset > 0; offset /= 2) { - T sum_val = platform::CudaShuffleXorSync(0xFFFFFFFF, value, offset); - value = value + sum_val; - } - return value; -} - -template -__device__ __forceinline__ T WarpReduceMax(T value) { -#pragma unroll - for (int offset = KernelWarpSize / 2; offset > 0; offset /= 2) { - T max_val = platform::CudaShuffleXorSync(0xFFFFFFFF, value, offset); - value = max(value, max_val); - } - return value; -} - -int GetNearGreaterPowerOfTwo(int value) { - int log2_value = 0; - while ((1 << log2_value) < value) { - ++log2_value; - } - return 1 << log2_value; -} - -template -__global__ void ComputeLogSoftmaxForwardInWarp(T *dst, const T *src, - int batch_size, - int element_count) { - constexpr int near_greater_power_of_two = NearGreaterPowerOfTwo; - constexpr int kernel_warp_size = - (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32; - constexpr int warp_iter = near_greater_power_of_two / kernel_warp_size; - int batch_id = blockDim.y * blockIdx.x + threadIdx.y; - - int thread_in_warp_idx = threadIdx.x; - - // 1.read data from global memory to registers - AccT elements[warp_iter]; - // set effective_element_count as the num of elements when warps do effective - // work - // set effective_element_count as 0, when warps do ineffective work - int effective_element_count = (batch_id < batch_size) ? 
element_count : 0; - for (int it = 0; it < warp_iter; ++it) { - int element_index = thread_in_warp_idx + it * kernel_warp_size; - if (element_index < effective_element_count) { - elements[it] = - static_cast(src[batch_id * element_count + element_index]); - } else { - elements[it] = -std::numeric_limits::infinity(); - } - } - - // 2.compute max_value. For each thread, loop all registers to find max - AccT max_value = elements[0]; -#pragma unroll - for (int it = 1; it < warp_iter; ++it) { - max_value = (max_value > elements[it]) ? max_value : elements[it]; - } - max_value = WarpReduceMax(max_value); - - // 3.For each warp, accumulate all thread registers - AccT sum = 0.0f; -#pragma unroll - for (int it = 0; it < warp_iter; ++it) { - sum += std::exp(elements[it] - max_value); - } - sum = WarpReduceSum(sum); - - // 4.store result. - sum = std::log(sum); -#pragma unroll - for (int it = 0; it < warp_iter; ++it) { - int element_index = thread_in_warp_idx + it * kernel_warp_size; - if (element_index < effective_element_count) { - dst[batch_id * element_count + element_index] = - static_cast(elements[it] - max_value - sum); - } else { - break; - } - } -} - -template -void LaunchSoftmaxForwardForLastAxis(T *dst, const T *src, int dim_size, - int outer_size, gpuStream_t stream) { - int threads_per_block = 128; - int near_greater_power_of_two = GetNearGreaterPowerOfTwo(dim_size); - int kernel_warp_size = - (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32; - int warps_per_block = (threads_per_block / kernel_warp_size); - int blocks = (outer_size + warps_per_block - 1) / warps_per_block; - dim3 threads(kernel_warp_size, warps_per_block, 1); - - switch (near_greater_power_of_two) { - LAUNCH_WARP_FORWAR_COMPUTE(1); - LAUNCH_WARP_FORWAR_COMPUTE(2); - LAUNCH_WARP_FORWAR_COMPUTE(4); // dim_size: 3~4 - LAUNCH_WARP_FORWAR_COMPUTE(8); // dim_size: 5~8 - LAUNCH_WARP_FORWAR_COMPUTE(16); // dim_size: 9~16 - LAUNCH_WARP_FORWAR_COMPUTE(32); // dim_size: 17~32 - LAUNCH_WARP_FORWAR_COMPUTE(64); // dim_size: 33~64 - LAUNCH_WARP_FORWAR_COMPUTE(128); // dim_size 65~128 - LAUNCH_WARP_FORWAR_COMPUTE(256); // dim_size 129~256 - LAUNCH_WARP_FORWAR_COMPUTE(512); // dim_size 257~512 - LAUNCH_WARP_FORWAR_COMPUTE(1024); // dim_size 513~1024 - - default: - break; - } -} - -// Returns the final item after reduce operation along block.x. -// Firstly, get shared memory(smem) offset, find the starting position for every -// y. -// Secondly, initialise every smem position with value 'val' of thread itself. -// Thirdly, apply standard reduction along x direction as below: -// -// -> x direction -// [o o o o o o o o] time 0 -// | |/ / -// | /| / -// | / | / -// |/ |/ -// [o o o o x x x x] time 1 -// | |/ / -// |/|/ -// [o o x x x x x x] time 2 -// |/ -// [o x x x x x x x] time 3 -// -// Finally, return the first item. -// Imaging multiple reductions executed in paralell along y axis, -// Note that when blockDim.x is not 1, it's a EVEN number in all cases, -// and the size of shared memory is even as well. -template class Functor> -__forceinline__ __device__ T BlockReduceAlongDimX(T *shared, T val) { - Functor func; - // This reduction is not Block-wise reduction, only reduce along block.x. - // therefore the shared mem has offsets for different block.y. 
- shared += threadIdx.y * blockDim.x; - shared[threadIdx.x] = val; - int offset = blockDim.x / 2; - - while (offset > 0) { - __syncthreads(); - if (threadIdx.x < offset) { - shared[threadIdx.x] = - func(shared[threadIdx.x], shared[threadIdx.x + offset]); - } - offset /= 2; - } - __syncthreads(); - return shared[0]; -} - -template -__global__ void LogSoftmaxForwardCUDAKernelNotLastAxis( - T *output, const T *input, int outer_size, int dim_size, int inner_size) { - extern __shared__ unsigned char smem[]; - auto sdata = reinterpret_cast(smem); - - const int outer_stride = inner_size * dim_size; - const int dim_stride = inner_size; - - for (int x_id = blockIdx.x; x_id < outer_size; x_id += gridDim.x) { - for (int y_id = blockIdx.y * blockDim.y + threadIdx.y; y_id < inner_size; - y_id += blockDim.y * gridDim.y) { - const int data_offset = x_id * outer_stride + y_id; - // When blockDim.x==1, no block.x-reduction opetaions are needed. - // And threadIdx.x is 0 all the time, so the for-loops below are literally - // loops (No parallel executions). Loop all elements along axis and - // calculate the Max, Sum and (input[id]-Max-log(Sum)) to get the final - // log_softmax values along that axis. - // 1. reduce max - AccT max_value = -std::numeric_limits::infinity(); - // For one thread, iterate all items it responsable for, and get - // max_value. - // If there are N threads, N max_value will be returned. - for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { - const AccT value = - static_cast(input[data_offset + d * dim_stride]); - max_value = phi::funcs::MaxFunctor()(max_value, value); - } - // If there are more than 1 threads along block x, reduce all max_values - // and get the global max_value, which is the max value along "axis". - // If there is only one thread along block x, no need to reduce, as the - // 'max_value' is the global max_value. - if (blockDim.x > 1) { - max_value = BlockReduceAlongDimX( - sdata, max_value); - } - - // 2. reduce sum - AccT sum = 0; - // Below is the same execution as '1. reduce max' - for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { - sum += std::exp(static_cast(input[data_offset + d * dim_stride]) - - max_value); - } - if (blockDim.x > 1) { - sum = BlockReduceAlongDimX(sdata, sum); - } - - // 3. input-max-log_sum and write to output - for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { - output[data_offset + d * dim_stride] = static_cast( - static_cast(input[data_offset + d * dim_stride]) - max_value - - std::log(sum)); - } - } - } -} - -// block.y covers inner_size. Threads along the x axis process dim_size -// elements, and make sure not to exceed the 1024 threads per block. -// Note that dim_threads namely blockDim.x is either 1 or a even number. -inline dim3 GetBlockSize(int dim_size, int inner_size) { - int inner_threads = inner_size; - inner_threads = std::min(inner_threads, 1024); - int dim_threads = 1; - - while (dim_threads * inner_threads <= 1024 && dim_threads <= dim_size) { - dim_threads *= 2; - } - dim_threads /= 2; - return dim3(dim_threads, inner_threads); -} - -// First cover the y axis as many blocks as possible. -// Then cover the x axis as many blocks as possible, -// and make sure not to exceed the max_active_blocks. 
-inline dim3 GetGridSize(dim3 block, int max_active_blocks, int outer_size, - int dim_size, int inner_size) { - int inner_blocks = (inner_size + block.y - 1) / block.y; - if (inner_blocks > max_active_blocks) inner_blocks = max_active_blocks; - - int outer_blocks = (max_active_blocks + inner_blocks - 1) / inner_blocks; - if (outer_blocks > outer_size) outer_blocks = outer_size; - return dim3(outer_blocks, inner_blocks); -} - -// When designing grid size and block size, priority is given to block size, -// and grid will be determined according to the maximum number of active blocks, -// which is set by as a experience value. -template -void ComputeLaunchConfigure(Kernel k, int outer_size, int dim_size, - int inner_size, dim3 &grid, dim3 &block, - int &shared_mem, int num_sm) { - block = GetBlockSize(dim_size, inner_size); - int block_threads = block.x * block.y; - shared_mem = block.x == 1 ? 0 : block_threads * sizeof(T); - int max_active_blocks = num_sm * 2; - grid = - GetGridSize(block, max_active_blocks, outer_size, dim_size, inner_size); -} - -template -void LaunchLogSoftmaxForwardCUDAKernelNotLastAxis(T *output_data, - const T *input_data, - int outer_size, int dim_size, - int inner_size, int num_sm, - gpuStream_t stream) { - int shared_mem; - dim3 grid; - dim3 block; - - ComputeLaunchConfigure( - &LogSoftmaxForwardCUDAKernelNotLastAxis, outer_size, dim_size, - inner_size, grid, block, shared_mem, num_sm); - - LogSoftmaxForwardCUDAKernelNotLastAxis< - T, MPDType><<>>( - output_data, input_data, outer_size, dim_size, inner_size); -} +using Tensor = framework::Tensor; template class LogSoftmaxKernel : public framework::OpKernel { - using MPDType = typename phi::dtype::MPTypeTrait::Type; - public: - void Compute(const framework::ExecutionContext &context) const override { - const auto *x = context.Input("X"); - auto *out = context.Output("Out"); - const auto *input_data = x->data(); - auto *output_data = out->mutable_data(context.GetPlace()); - - const int rank = x->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto *out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); - int dim_size = x->dims()[axis]; - int inner_size = 1; - for (int i = axis + 1; i < x->dims().size(); ++i) { - inner_size *= x->dims()[i]; - } - int outer_size = SizeToAxis(axis, x->dims()); - gpuStream_t stream = context.cuda_device_context().stream(); - int num_sm = context.cuda_device_context().GetSMCount(); - - if (inner_size == 1 && dim_size <= 1024 && dim_size * sizeof(T) <= 4096) { - LaunchSoftmaxForwardForLastAxis(output_data, input_data, - dim_size, outer_size, stream); - } else { - LaunchLogSoftmaxForwardCUDAKernelNotLastAxis( - output_data, input_data, outer_size, dim_size, inner_size, num_sm, - stream); - } + int input_axis = ctx.Attr("axis"); + auto &dev_ctx = ctx.template device_context(); + phi::SoftmaxForwardCUDAKernelDriver(dev_ctx, *x, input_axis, out); } }; -// Backward below -#define LAUNCH_WARP_BACKWARD_COMPUTE(near_greater_power_of_two) \ - case near_greater_power_of_two: \ - ComputeLogSoftmaxBackwardInWarp< \ - T, AccT, near_greater_power_of_two><<>>( \ - output, grad_output, grad_input, outer_size, dim_size); \ - break; - -template -__global__ void ComputeLogSoftmaxBackwardInWarp(const T *output, - const T *grad_output, - T *grad_input, int batch_size, - int element_count) { - constexpr int near_greater_power_of_two = NearGreaterPowerOfTwo; - constexpr int 
kernel_warp_size = - (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32; - constexpr int warp_iter = near_greater_power_of_two / kernel_warp_size; - int batch_id = blockDim.y * blockIdx.x + threadIdx.y; - - int thread_in_warp_idx = threadIdx.x; - - // 1.read data from global memory to registers - AccT output_register[warp_iter]; - AccT grad_output_register[warp_iter]; - int effective_element_count = (batch_id < batch_size) ? element_count : 0; - for (int iter = 0; iter < warp_iter; ++iter) { - int element_index = thread_in_warp_idx + iter * kernel_warp_size; - if (element_index < effective_element_count) { - output_register[iter] = - static_cast(output[batch_id * element_count + element_index]); - grad_output_register[iter] = static_cast( - grad_output[batch_id * element_count + element_index]); - } else { - output_register[iter] = static_cast(0); - grad_output_register[iter] = static_cast(0); - } - } - - // 2. For each warp, accumulate all thread registers - AccT sum = grad_output_register[0]; -#pragma unroll - for (int iter = 1; iter < warp_iter; ++iter) { - sum += grad_output_register[iter]; - } - sum = WarpReduceSum(sum); - -// 3. write result in grad_input -#pragma unroll - for (int iter = 0; iter < warp_iter; ++iter) { - int element_index = thread_in_warp_idx + iter * kernel_warp_size; - if (element_index < effective_element_count) { - grad_input[batch_id * element_count + element_index] = static_cast( - (grad_output_register[iter] - std::exp(output_register[iter]) * sum)); - } - } -} - -template -void LaunchSoftmaxBackwardForLastAxis(T *grad_input, const T *grad_output, - const T *output, int dim_size, - int outer_size, gpuStream_t stream) { - int threads_per_block = 128; - int near_greater_power_of_two = GetNearGreaterPowerOfTwo(dim_size); - int kernel_warp_size = - (near_greater_power_of_two < 32) ? 
near_greater_power_of_two : 32; - int warps_per_block = (threads_per_block / kernel_warp_size); - int blocks = (outer_size + warps_per_block - 1) / warps_per_block; - dim3 threads(kernel_warp_size, warps_per_block, 1); - - switch (near_greater_power_of_two) { - LAUNCH_WARP_BACKWARD_COMPUTE(1); // dim_size: 1 - LAUNCH_WARP_BACKWARD_COMPUTE(2); // dim_size: 2 - LAUNCH_WARP_BACKWARD_COMPUTE(4); // dim_size: 3~4 - LAUNCH_WARP_BACKWARD_COMPUTE(8); // dim_size: 5~8 - LAUNCH_WARP_BACKWARD_COMPUTE(16); // dim_size: 9~16 - LAUNCH_WARP_BACKWARD_COMPUTE(32); // dim_size: 17~32 - LAUNCH_WARP_BACKWARD_COMPUTE(64); // dim_size: 33~64 - LAUNCH_WARP_BACKWARD_COMPUTE(128); // dim_size: 65~128 - LAUNCH_WARP_BACKWARD_COMPUTE(256); // dim_size: 129~256 - LAUNCH_WARP_BACKWARD_COMPUTE(512); // dim_size: 257~512 - LAUNCH_WARP_BACKWARD_COMPUTE(1024); // dim_size: 513~1024 - - default: - break; - } -} - template class LogSoftmaxGradKernel : public framework::OpKernel { - using MPDType = typename phi::dtype::MPTypeTrait::Type; - public: - void Compute(const framework::ExecutionContext &context) const override { - const auto *out = context.Input("Out"); - const auto *d_out = - context.Input(framework::GradVarName("Out")); - auto *d_x = context.Output(framework::GradVarName("X")); + void Compute(const framework::ExecutionContext &ctx) const override { + auto *out = ctx.Input("Out"); + auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); - const auto *out_data = out->data(); - const auto *d_out_data = d_out->data(); - auto *d_x_data = d_x->mutable_data(context.GetPlace()); - - const int rank = out->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - int dim_size = out->dims()[axis]; - int inner_size = 1; - for (int i = axis + 1; i < out->dims().size(); ++i) { - inner_size *= out->dims()[i]; - } - int outer_size = SizeToAxis(axis, out->dims()); - gpuStream_t stream = context.cuda_device_context().stream(); - - if (inner_size == 1 && dim_size <= 1024 && dim_size * sizeof(T) <= 4096) { - LaunchSoftmaxBackwardForLastAxis( - d_x_data, d_out_data, out_data, dim_size, outer_size, stream); - } else { - LogSoftmaxGradFunctor()( - context.template device_context(), out, - d_out, d_x, axis); - } + int input_axis = ctx.Attr("axis"); + auto &dev_ctx = ctx.template device_context(); + phi::SoftmaxBackwardCUDAKernelDriver(dev_ctx, *out, *dout, + input_axis, dx); } }; @@ -473,6 +57,17 @@ class LogSoftmaxGradKernel namespace ops = paddle::operators; namespace plat = paddle::platform; + +#ifdef PADDLE_WITH_HIP +REGISTER_OP_CUDA_KERNEL( + log_softmax, ops::LogSoftmaxKernel, + ops::LogSoftmaxKernel, + ops::LogSoftmaxKernel); +REGISTER_OP_CUDA_KERNEL( + log_softmax_grad, ops::LogSoftmaxGradKernel, + ops::LogSoftmaxGradKernel, + ops::LogSoftmaxGradKernel); +#else REGISTER_OP_CUDA_KERNEL( log_softmax, ops::LogSoftmaxKernel, ops::LogSoftmaxKernel, @@ -483,3 +78,4 @@ REGISTER_OP_CUDA_KERNEL( ops::LogSoftmaxGradKernel, ops::LogSoftmaxGradKernel, ops::LogSoftmaxGradKernel); +#endif diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 0352fdf6fa2..2b2dd511896 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -351,8 +351,17 @@ __global__ void WarpSoftmaxForward(T* softmax, VecT* softmax_v = reinterpret_cast(&softmax[(first_batch + i) * stride]); VecT* reg_v = reinterpret_cast(&out_tmp[i][0][0]); - 
kps::ElementwiseUnary>( - &out_tmp[i][0][0], &srcdata[i][0][0], UnaryDivFunctor(sum[i])); + if (LogMode) { + kps::ElementwiseUnary>( + &srcdata[i][0][0], &srcdata[i][0][0], UnaryLogFunctor()); + kps::ElementwiseUnary>( + &out_tmp[i][0][0], + &srcdata[i][0][0], + UnarySubFunctor(std::log(sum[i]))); + } else { + kps::ElementwiseUnary>( + &out_tmp[i][0][0], &srcdata[i][0][0], UnaryDivFunctor(sum[i])); + } kps::WriteData( &softmax_v[0], ®_v[0], idx_max_v[i], 0, kWarpSize, 1); } @@ -434,15 +443,25 @@ __global__ void WarpSoftmaxBackward(T* dst, AccT sum_tmp[kBatchSize][kLoopsV][kVSize]; AccT* gradptr = reinterpret_cast(&grad_tmp[0][0][0]); AccT* srcptr = reinterpret_cast(&src_tmp[0][0][0]); - kps::ElementwiseBinary>( - &sum_tmp[0][0][0], &gradptr[0], &srcptr[0], kps::MulFunctor()); - kps::Reduce, - kps::details::ReduceMode::kLocalMode>( - &sum[0], &sum_tmp[0][0][0], kps::AddFunctor(), true); + if (LogMode) { + kps::Reduce, + kps::details::ReduceMode::kLocalMode>( + &sum[0], &grad_tmp[0][0][0], kps::AddFunctor(), true); + } else { + kps::ElementwiseBinary>( + &sum_tmp[0][0][0], &gradptr[0], &srcptr[0], kps::MulFunctor()); + kps::Reduce, + kps::details::ReduceMode::kLocalMode>( + &sum[0], &sum_tmp[0][0][0], kps::AddFunctor(), true); + } WarpReduceSum(sum); // write result to global memory @@ -453,10 +472,23 @@ __global__ void WarpSoftmaxBackward(T* dst, if (i >= local_batches) break; AccT* gradptr = reinterpret_cast(&grad_tmp[i][0][0]); AccT* srcptr = reinterpret_cast(&src_tmp[i][0][0]); - kps::ElementwiseUnary>( - &out[i][0][0], &gradptr[0], UnarySubFunctor(sum[i])); - kps::ElementwiseBinary>( - &out_tmp[i][0][0], &srcptr[0], &out[i][0][0], kps::MulFunctor()); + if (LogMode) { + kps::ElementwiseUnary>( + &out[i][0][0], &srcptr[0], ExpMulFunctor(sum[i])); + kps::ElementwiseBinary>( + &out_tmp[i][0][0], + &gradptr[0], + &out[i][0][0], + kps::SubFunctor()); + } else { + kps::ElementwiseUnary>( + &out[i][0][0], &gradptr[0], UnarySubFunctor(sum[i])); + kps::ElementwiseBinary>( + &out_tmp[i][0][0], + &srcptr[0], + &out[i][0][0], + kps::MulFunctor()); + } VecT* dst_v = reinterpret_cast(&dst[(first_batch + i) * stride]); VecT* reg_v = reinterpret_cast(&out_tmp[i][0][0]); kps::WriteData( @@ -639,7 +671,8 @@ __global__ void NormalSoftmaxForward( template class Functor> + template class Functor, + bool LogMode> __global__ void NormalSoftmaxBackward(T* input_grad, const T* output_grad, const T* output, @@ -656,10 +689,17 @@ __global__ void NormalSoftmaxBackward(T* input_grad, // 1. 
reduce sum AccT sum = 0; - for (int mid_id = threadIdx.y; mid_id < mid_dim; mid_id += blockDim.y) { - int data_offset = grad_offset + mid_id * mid_stride; - sum += static_cast(output_grad[data_offset]) * - static_cast(output[data_offset]); + if (LogMode) { + for (int mid_id = threadIdx.y; mid_id < mid_dim; mid_id += blockDim.y) { + int data_offset = grad_offset + mid_id * mid_stride; + sum += static_cast(output_grad[data_offset]); + } + } else { + for (int mid_id = threadIdx.y; mid_id < mid_dim; mid_id += blockDim.y) { + int data_offset = grad_offset + mid_id * mid_stride; + sum += static_cast(output_grad[data_offset]) * + static_cast(output[data_offset]); + } } if (blockDim.y > 1) { kps::Reduce, kMode::kGlobalMode>( @@ -715,10 +755,10 @@ void LaunchNormalSoftmaxBackward(const GPUContext& dev_ctx, dim3 grid, block; GetLaunchConfig(high_dim, mid_dim, low_dim, &grid, &block); if (LogMode) { - NormalSoftmaxBackward< - T, - AccT, - LogSoftmaxBackwardFunctor><<>>( + NormalSoftmaxBackward<<>>( input_grad_data, output_grad_data, output_data, @@ -726,10 +766,10 @@ void LaunchNormalSoftmaxBackward(const GPUContext& dev_ctx, mid_dim, low_dim); } else { - NormalSoftmaxBackward< - T, - AccT, - SoftmaxBackwardFunctor><<>>( + NormalSoftmaxBackward<<>>( input_grad_data, output_grad_data, output_data, @@ -864,6 +904,32 @@ static bool CanUseCudnnSoftmax(const GPUContext& dev_ctx) { return false; } +#if CUDNN_VERSION < 8100 +template <> +inline void SoftmaxForwardCudnnKernel( + const GPUContext& dev_ctx, + const DenseTensor& x, + const int axis, + const bool log_mode, + DenseTensor* out) { + PADDLE_THROW(errors::Unavailable( + "This kernel is not supported when the dtype is bf16 and CUDNN_VERSION < " + "8100.")); +} +template <> +inline void SoftmaxBackwardCudnnKernel( + const GPUContext& dev_ctx, + const DenseTensor& out, + const DenseTensor& dout, + const int axis, + const bool log_mode, + DenseTensor* dx) { + PADDLE_THROW(errors::Unavailable( + "This kernel is not supported when the dtype is bf16 and CUDNN_VERSION < " + "8100.")); +} +#endif + template void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx, const DenseTensor& x, -- GitLab From 227fa4083efa2a0aa902aa4edee18e7a367d0f2a Mon Sep 17 00:00:00 2001 From: Jiabin Yang <360788950@qq.com> Date: Mon, 14 Mar 2022 16:59:55 +0800 Subject: [PATCH 039/176] Support custom op and paddle.autograd.bacward in eager (#40423) * eager, test=develop * fix bug, test=develop * eager, test=develop * merge legacy to fluid * eager, test=develop * eager, test=develop * Refactor TensorAdd func by template and remove gradient_accumulation in eager * Remove needless target name * eager, test=develop * eager, test=develop * Use overload instead of template * Remove legacy code * Remove legacy code * selectedrows, test=develop * Remove DataType test * eager, test=develop * eager, test=develop * support gan, test=develop * Using Tensor directly instead of using EagerTensor * support gradient_accumulation * make test_imperative_lod_tensor_to_selected_rows longer * make test_imperative_lod_tensor_to_selected_rows longer * refine code * ptb, test=develop * Rename all EagerTensor to Tensor * Rename some EagerTensor to Tensor * rename EagerTensor to EagerVariable * eager, test=develop * eager, test=develop * eager, test=develop * eager, test=develop * add more test * eager, test=develop * Support copiable selected rows and merge develop * save load, eager, test=develop * save load, eager, test=develop * refine, test=develop * remove useless _set_value method * refine, 
test=develop * refine, test=develop * revert static_runner, test=develop * EagerTensor to Tensor, test=develop * refine, test=develop * refine, test=develop * clear grad, test=develop * merge, develop * merge, develop * merge, test=develop * merge, test=develop * Support quant and part of slice * support legacy static save * extend slim tests time * remove imperative on inference * remove imperative on inference * merge develop * fix typo * fix typo * split slice related code into 2 part for imperative and eager * split slice from inference * split slice from inference * fix test_tensor_register_hook * support custom op in eager mode * fix inference deps error * split eager utils from custom operator * fix type match * fix typo Co-authored-by: Wang Huan Co-authored-by: Weilong Wu Co-authored-by: wanghuancoder --- paddle/fluid/eager/CMakeLists.txt | 5 +- paddle/fluid/eager/api/utils/global_utils.h | 22 +- paddle/fluid/eager/backward.cc | 30 +- .../eager/custom_operator/CMakeLists.txt | 1 + .../custom_operator/custom_operator_node.cc | 90 ++++++ .../custom_operator/custom_operator_node.h | 77 +++++ paddle/fluid/eager/grad_node_info.cc | 2 +- paddle/fluid/framework/CMakeLists.txt | 1 + paddle/fluid/framework/custom_operator.cc | 6 +- paddle/fluid/framework/custom_operator.h | 5 +- paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/fluid/pybind/eager_functions.cc | 277 +++++++++++++++++- paddle/fluid/pybind/eager_properties.cc | 2 +- paddle/fluid/pybind/eager_utils.cc | 21 +- paddle/fluid/pybind/eager_utils.h | 6 +- paddle/fluid/pybind/exception.cc | 4 +- paddle/fluid/pybind/pybind.cc | 62 +++- paddle/phi/api/ext/op_meta_info.h | 18 +- paddle/phi/api/lib/op_meta_info.cc | 17 +- paddle/phi/api/lib/tensor.cc | 4 +- paddle/phi/api/lib/tensor_method.cc | 1 + python/paddle/autograd/backward_mode.py | 27 +- .../fluid/dygraph/varbase_patch_methods.py | 2 +- .../fluid/tests/custom_op/custom_relu_op.cc | 2 + .../fluid/tests/custom_op/custom_relu_op.cu | 2 + .../tests/custom_op/test_custom_attrs_jit.py | 15 +- .../tests/custom_op/test_custom_concat.py | 15 +- .../fluid/tests/custom_op/test_custom_conj.py | 8 +- .../tests/custom_op/test_custom_linear.py | 8 +- .../custom_op/test_custom_raw_op_kernel_op.py | 6 - .../tests/custom_op/test_custom_relu_model.py | 31 +- .../custom_op/test_custom_relu_op_jit.py | 19 +- .../custom_op/test_custom_relu_op_setup.py | 8 +- .../custom_op/test_custom_simple_slice.py | 8 +- .../tests/custom_op/test_dispatch_jit.py | 9 +- .../tests/custom_op/test_multi_out_jit.py | 9 +- .../tests/unittests/test_custom_grad_input.py | 42 ++- .../tests/unittests/test_egr_python_api.py | 4 +- python/paddle/utils/code_gen/api.yaml | 1 + .../utils/cpp_extension/extension_utils.py | 43 ++- 40 files changed, 803 insertions(+), 109 deletions(-) create mode 100644 paddle/fluid/eager/custom_operator/CMakeLists.txt create mode 100644 paddle/fluid/eager/custom_operator/custom_operator_node.cc create mode 100644 paddle/fluid/eager/custom_operator/custom_operator_node.h diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index f9d1b705390..691a381405e 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,4 +1,5 @@ -set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) +set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder 
accumulation_node custom_operator_node) + set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) set(generated_deps final_dygraph_function final_dygraph_node dygraph_function dygraph_node) @@ -9,6 +10,8 @@ endif() add_subdirectory(api) add_subdirectory(accumulation) +add_subdirectory(custom_operator) + cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor) cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index 00578d9a359..a9a62fcd50e 100644 --- a/paddle/fluid/eager/api/utils/global_utils.h +++ b/paddle/fluid/eager/api/utils/global_utils.h @@ -18,7 +18,7 @@ #include #include #include "paddle/fluid/imperative/tracer.h" - +#include "paddle/phi/api/ext/op_meta_info.h" namespace egr { class UniqueNameGenerator { @@ -70,6 +70,21 @@ class Controller { void SetInEagerMode(bool in_eager_mode) { in_eager_mode_ = in_eager_mode; } + const std::unordered_map>& + GetOpMetaInfoMap() { + return op_meta_info_map_; + } + + void MergeOpMetaInfoMap(const std::unordered_map< + std::string, std::vector>& map) { + op_meta_info_map_.insert(map.begin(), map.end()); + } + + std::unordered_map>>& + GetCustomEdgesSlotMap() { + return custom_edges_slot_map_; + } + private: Controller() = default; static Controller* controller_; @@ -77,6 +92,11 @@ class Controller { new paddle::imperative::Tracer()}; // TODO(jiabin): remove when we don't need imperative. bool in_eager_mode_{false}; + std::unordered_map> + op_meta_info_map_; + /* op_type : {{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}*/ + std::unordered_map>> + custom_edges_slot_map_; DISABLE_COPY_AND_ASSIGN(Controller); }; diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 934497d7d17..603f93d9ddc 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -112,7 +112,8 @@ void RunBackward(const std::vector& tensors, // Prepare GradTensorHolder if (!node_input_buffers_dict.count(grad_node)) { - VLOG(6) << "Create Value for grad input tensor " << i; + VLOG(6) << "Create Value for grad input tensor " << i + << " of grad node: " << grad_node->name(); node_input_buffers_dict[grad_node] = std::make_unique(grad_node->InputMeta()); } @@ -158,19 +159,23 @@ void RunBackward(const std::vector& tensors, VLOG(6) << "Run Backward"; while (!queue.empty()) { GradNodeBase* node = queue.front(); - queue.pop(); + if (queue.size() > 1 && node_in_degree_map[node] != 0) { + queue.pop(); + continue; + } + queue.pop(); // Run node: This is where Hook happens PADDLE_ENFORCE( node_input_buffers_dict.count(node), paddle::platform::errors::Fatal( - "Unable to find next node in the InputBuufer" + "Unable to find next node in the GradTensorHolder \n" "Trying to run Node without configuring its GradTensorHolder")); std::unique_ptr node_input_buffer = std::move(node_input_buffers_dict[node]); - VLOG(6) << "Run Backward Kernel with input_buffer"; + VLOG(6) << "Run Backward Kernel with GradTensorHolder"; // Run Pre Backward Node and get outputs std::vector> grad_output_tensors = (*node)(node_input_buffer->Buffers()); @@ -215,9 +220,8 @@ void RunBackward(const std::vector& tensors, if ((!grad_output_tensor.defined() || !grad_output_tensor.initialized())) { - VLOG(6) - << "We get grad_output_tensor with slot: " << i << ", rank: " << j - << " as uninitialized or undefined in both tensor and variable"; + VLOG(6) << "We get 
grad_output_tensor with slot: " << i + << ", rank: " << j << " as uninitialized or undefined tensor"; } VLOG(6) << "Get Edge and grad_output_tensor with slot: " << i << ", rank: " << j @@ -228,6 +232,8 @@ void RunBackward(const std::vector& tensors, const auto& input_meta = next_node->InputMeta(); auto grad_tensor_holder = std::make_unique(input_meta); + VLOG(6) << "Construct GradTensorHolder for grad node: " + << next_node->name(); node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); } VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first @@ -237,10 +243,12 @@ void RunBackward(const std::vector& tensors, // Update queue node_in_degree_map[next_node]--; - PADDLE_ENFORCE(node_in_degree_map[next_node] >= 0, - paddle::platform::errors::Fatal( - "Detected in-degree value smaller than zero." - "Node's in-degree cannot be negative")); + PADDLE_ENFORCE( + node_in_degree_map[next_node] >= 0, + paddle::platform::errors::Fatal( + "Detected in-degree value smaller than zero. For Node: %s" + "Node's in-degree cannot be negative", + next_node->name())); if (node_in_degree_map[next_node] == 0) { queue.emplace(std::move(next_node)); } diff --git a/paddle/fluid/eager/custom_operator/CMakeLists.txt b/paddle/fluid/eager/custom_operator/CMakeLists.txt new file mode 100644 index 00000000000..ccc9a03a556 --- /dev/null +++ b/paddle/fluid/eager/custom_operator/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(custom_operator_node SRCS custom_operator_node.cc DEPS phi_tensor phi_api grad_node_info custom_operator op_meta_info) diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc new file mode 100644 index 00000000000..48ac8c8358a --- /dev/null +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/eager/custom_operator/custom_operator_node.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/op_meta_info_helper.h" +#include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace egr { +std::vector> RunCustomOpNode:: +operator()( + const std::vector>& grads) { + paddle::CustomOpKernelContext ctx; + auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs( + egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); + auto grad_outputs_names = paddle::framework::OpMetaInfoHelper::GetOutputs( + egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); + auto map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_); + auto kernel_map = egr::Controller::Instance().GetOpMetaInfoMap(); + + std::vector> tmp_ins( + grad_inputs_name.size()); + VLOG(7) << " Prepare Backward inputs of grads with size: " << grads.size() + << ", whose grad_inputs_name size is: " << grad_inputs_name.size(); + for (size_t i = 0; i < grads.size(); i++) { + if (map[1].find(i) != map[1].end()) { + VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[1][i]; + tmp_ins[map[1][i]] = grads[i]; + } + } + + for (auto it : fwd_outs) { + VLOG(7) << "Insert fwd_outs to grad_inputs: " << it.first; + tmp_ins[it.first] = RunCustomOpNode::Recover(&(it.second)); + } + + for (auto it : fwd_ins) { + VLOG(7) << "Insert fwd_ins to grad_inputs: " << it.first; + tmp_ins[it.first] = RunCustomOpNode::Recover(&(it.second)); + } + + VLOG(6) << "Prepare Grad inputs"; + for (const auto& in : tmp_ins) { + ctx.EmplaceBackInputs(in); + } + VLOG(6) << "Prepare Grad attrs"; + ctx.EmplaceBackAttrs(attrs_); + std::vector> outs( + GetEdges().size()); + std::vector> tmp_outs( + grad_outputs_names.size()); + VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size(); + for (size_t i = 0; i < GetEdges().size(); i++) { + if (map[0].find(i) != map[0].end()) { + VLOG(7) << "Insert grad outputs: " << i + << " with size: " << GetEdges()[i].size() + << " to tmp_outputs: " << map[0][i]; + for (size_t j = 0; j < GetEdges()[i].size(); j++) { + outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */ + std::make_shared( + phi::DataType::UNDEFINED), + egr::Controller::Instance().GenerateUniqueName( + "custom_tmp_grad")); + } + tmp_outs[map[0][i]] = outs[i]; + } + } + for (size_t i = 0; i < tmp_outs.size(); i++) { + VLOG(7) << "Prepare grad outputs size: " << tmp_outs[i].size(); + ctx.EmplaceBackOutputs(tmp_outs[i]); + } + VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_; + + (*paddle::framework::OpMetaInfoHelper::GetKernelFn( + kernel_map.at(op_type_)[1]))(&ctx); + return outs; +} +} // namespace egr diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h new file mode 100644 index 00000000000..e5ddef9c062 --- /dev/null +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h @@ -0,0 +1,77 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" +#include "paddle/fluid/eager/tensor_wrapper.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/utils/any.h" + +namespace egr { +class RunCustomOpNode : public GradNodeBase { + public: + // Constructor: configure fwd input tensors to grad node + explicit RunCustomOpNode(size_t bwd_in_slot_num, size_t bwd_out_slot_num, + const std::string& op_type) + : GradNodeBase(bwd_in_slot_num, bwd_out_slot_num), op_type_(op_type) { + VLOG(6) << "Construct RunCustomOpNode for op: " << op_type; + } + + ~RunCustomOpNode() override { + VLOG(6) << "Destruct RunCustomOpNode for op: " << op_type_; + } + + // Functor: perform backward computations + virtual std::vector> operator()( + const std::vector>& grads) + override; + + std::string name() { + return paddle::string::Sprintf("RunCustomOpNode: %s_grad", op_type_); + } + + static std::vector ConstructTensorWrapper( + const std::vector& fwd_var) { + std::vector res; + for (auto const& var : fwd_var) { + res.emplace_back(var); + } + return res; + } + + static std::vector Recover( + std::vector* fwd_var) { + std::vector res; + for (size_t i = 0; i < fwd_var->size(); i++) { + res.emplace_back(fwd_var->at(i).recover(nullptr)); + } + return res; + } + + void SetAttrs(const std::vector& attr) { attrs_ = attr; } + + public: + std::unordered_map> fwd_outs; + std::unordered_map> fwd_ins; + std::unordered_map grads2grad_in_map; + + private: + std::vector attrs_; + std::string op_type_{""}; +}; + +} // namespace egr diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 427be83c3bb..7eb2902d935 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -25,7 +25,7 @@ #include "glog/logging.h" /** - * Implementation of GradNodeBase, Edge and InputBuffer. + * Implementation of GradNodeBase, Edge and GradTensorHolder. **/ namespace egr { diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index aa92a3b2226..5dc3d9e89c5 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -440,6 +440,7 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper phi_tensor op_meta_info phi_api) + #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index b9e3bee25f6..478e39b99dc 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -25,6 +25,7 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_meta_info_helper.h" @@ -946,15 +947,16 @@ void RegisterOperatorWithMetaInfoMap( ////////////////////// User APIs /////////////////////// // load op api -void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) { +const std::unordered_map>& +LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) { void* handle = paddle::platform::dynload::GetOpDsoHandle(dso_name); VLOG(3) << "load custom_op lib: " << dso_name; typedef OpMetaInfoMap& get_op_meta_info_map_t(); auto* get_op_meta_info_map = detail::DynLoad(handle, "PD_GetOpMetaInfoMap"); auto& op_meta_info_map = get_op_meta_info_map(); - RegisterOperatorWithMetaInfoMap(op_meta_info_map, handle); + return op_meta_info_map.GetMap(); } } // namespace framework diff --git a/paddle/fluid/framework/custom_operator.h b/paddle/fluid/framework/custom_operator.h index 4310b564371..fef1e82a14f 100644 --- a/paddle/fluid/framework/custom_operator.h +++ b/paddle/fluid/framework/custom_operator.h @@ -20,9 +20,9 @@ limitations under the License. */ namespace paddle { namespace framework { - // Load custom op api: register op after user compiled -void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name); +const std::unordered_map>& +LoadOpMetaInfoAndRegisterOp(const std::string& dso_name); // Register custom op api: register op directly void RegisterOperatorWithMetaInfoMap( @@ -31,6 +31,5 @@ void RegisterOperatorWithMetaInfoMap( // Interface for selective register custom op. void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos, void* dso_handle = nullptr); - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 2e901f3bffd..7b223f7ed27 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -351,7 +351,7 @@ if(WITH_PYTHON) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_library(paddle_eager SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc - DEPS eager_api autograd_meta backward grad_node_info phi op_function_common final_dygraph_function final_dygraph_node dygraph_function dygraph_node accumulation_node global_utils utils python) + DEPS eager_api autograd_meta backward grad_node_info phi op_function_common final_dygraph_function final_dygraph_node dygraph_function dygraph_node accumulation_node global_utils utils python custom_operator custom_operator_node) add_dependencies(paddle_eager eager_codegen) add_dependencies(paddle_eager eager_op_function_generator_cmd) list(APPEND PYBIND_DEPS paddle_eager) diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 0b04dc7347c..e110432c67d 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -21,21 +21,25 @@ limitations under the License. 
*/ #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" +#include "paddle/fluid/eager/custom_operator/custom_operator_node.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/op_meta_info_helper.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" - namespace paddle { namespace pybind { @@ -168,7 +172,276 @@ static PyObject* eager_api_read_next_tensor_list(PyObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static void ConstructFwdAndBwdMap( + const std::vector& vec_map, + const std::string& op_type) { + auto& in_out_map = egr::Controller::Instance().GetCustomEdgesSlotMap(); + if (in_out_map.find(op_type) != in_out_map.end()) { + VLOG(7) << "Find Exist CustomEdgesSlotMap Skip >>>> "; + return; + } else { + VLOG(7) << "Construct CustomEdgesSlotMap "; + auto inputs_names = + paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[0]); + auto outputs_names = + paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[0]); + auto attrs_names = + paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[0]); + auto grad_outputs_names = + paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[1]); + auto grad_inputs_names = + paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[1]); + auto grad_attrs_names = + paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[1]); + std::vector> res(5); + in_out_map.insert({op_type, res}); + // Prepare pos map for grad_outputs + VLOG(7) << "Prepare pos map for grad_outputs"; + PADDLE_ENFORCE_LE( + grad_outputs_names.size(), inputs_names.size(), + paddle::platform::errors::InvalidArgument( + "Grad outputs num should be less equal than forward inputs num.")); + for (size_t i = 0; i < grad_outputs_names.size(); i++) { + size_t end = grad_outputs_names[i].find("@GRAD"); + PADDLE_ENFORCE_NE( + end, std::string::npos, + paddle::platform::errors::NotFound( + "All Grad outputs should be grad and we got %s is not grad var, " + "please check your op and change to fit the rule.", + grad_outputs_names[i])); + for (size_t j = 0; j < inputs_names.size(); j++) { + if (grad_outputs_names[i].substr(0, end) == inputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j + << " inputs: " << inputs_names[j] << " related to No." << i + << " grad_outputs: " << grad_outputs_names[i]; + in_out_map[op_type][0][j] = i; + } + } + } + // Prepare pos map for grad_inputs + for (size_t i = 0; i < grad_inputs_names.size(); i++) { + size_t end = grad_inputs_names[i].find("@GRAD"); + if (end != std::string::npos) { + for (size_t j = 0; j < outputs_names.size(); j++) { + if (grad_inputs_names[i].substr(0, end) == outputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j + << " outputs: " << outputs_names[j] << " related to No." 
+ << i << " grad_inputs's grad: " << grad_inputs_names[i]; + in_out_map[op_type][1][j] = i; + } + } + } else { + if (std::find(outputs_names.begin(), outputs_names.end(), + grad_inputs_names[i]) != outputs_names.end()) { + for (size_t j = 0; j < outputs_names.size(); j++) { + if (grad_inputs_names[i] == outputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j + << " outputs: " << outputs_names[j] << " related to No." + << i + << " grad_inputs fwd outputs: " << grad_inputs_names[i]; + in_out_map[op_type][2][j] = i; + } + } + } else { + for (size_t j = 0; j < inputs_names.size(); j++) { + if (grad_inputs_names[i] == inputs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j + << " inputs: " << inputs_names[j] << " related to No." + << i + << " grad_inputs fwd inputs: " << grad_inputs_names[i]; + in_out_map[op_type][3][j] = i; + } + } + } + } + } + + // Prepare pos map for grad attrs_ + for (size_t i = 0; i < grad_attrs_names.size(); i++) { + auto end = std::find(attrs_names.begin(), attrs_names.end(), + grad_attrs_names[i]); + PADDLE_ENFORCE_NE(end, attrs_names.end(), + paddle::platform::errors::NotFound( + "All Grad attrs should be one of forward attrs and " + "we got %s is not one of them, please check your " + "op and change to fit the rule.", + grad_attrs_names[i])); + for (size_t j = 0; j < attrs_names.size(); j++) { + if (grad_attrs_names[i] == attrs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j + << " attrs: " << attrs_names[j] << " related to No." << i + << " grad_attrs: " << grad_attrs_names[i]; + in_out_map[op_type][4][j] = i; + } + } + } + } +} + +static std::vector CastAttrsToTragetType( + const std::vector& src, + const std::vector& attrs_names) { + std::vector res; + PADDLE_ENFORCE_EQ(src.size(), attrs_names.size(), + paddle::platform::errors::InvalidArgument( + "We Expected same size of attrs and attrs_name list, " + "if u got this error indicate your custom op setting " + "%s attrs, but you just give %s", + attrs_names.size(), src.size())); + for (size_t i = 0; i < src.size(); i++) { + size_t end = attrs_names[i].find(": "); + std::string type_name = + attrs_names[i].substr(end + 2, attrs_names.size() - end - 2); + if (type_name == "int") { + if (src[i].type() == typeid(bool)) { + res.emplace_back(static_cast(paddle::any_cast(src[i]))); + } else if (src[i].type() == typeid(int)) { + res.emplace_back(src[i]); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Your No. %s attrs should only can be bool or int32, other type is " + "forbidden for now but we got %s. Check your code first please", + i, src[i].type().name())); + } + } else if (type_name == "int64_t") { + if (src[i].type() == typeid(bool)) { + res.emplace_back(static_cast(paddle::any_cast(src[i]))); + } else if (src[i].type() == typeid(int)) { + res.emplace_back(static_cast(paddle::any_cast(src[i]))); + } else if (src[i].type() == typeid(int64_t)) { + res.emplace_back(src[i]); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Your No. %s attrs should only can be bool or int32 or int64_t, " + "other type is forbidden for now but we got %s. 
Check your code " + "first please", + i, src[i].type().name())); + } + } else { + res.emplace_back(src[i]); + } + } + return res; +} + +static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + paddle::CustomOpKernelContext ctx = + CastPyArg2CustomOpKernelContext(PyTuple_GET_ITEM(args, 0), 0); + std::string op_type = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 1), 1); + bool trace_backward = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2); + VLOG(7) << "Get things for python for Custom Op: " << op_type + << ", trace_backward is: " << trace_backward; + auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap(); + PADDLE_ENFORCE_NE(meta_info_map.find(op_type), meta_info_map.end(), + paddle::platform::errors::NotFound( + "Can't find %s in Eager OpMetaInfoMap which should be " + "created by LoadOpMetaInfoAndRegisterOp, please make " + "sure you registered your op first and try again. ", + op_type)); + VLOG(7) << "Run Kernel of Custom Op: " << op_type; + std::vector res_attrs = CastAttrsToTragetType( + ctx.Attrs(), paddle::framework::OpMetaInfoHelper::GetAttrs( + meta_info_map.at(op_type)[0])); + ctx.EmplaceBackAttrs(res_attrs); + const auto& vec_map = meta_info_map.at(op_type); + (*paddle::framework::OpMetaInfoHelper::GetKernelFn(vec_map[0]))(&ctx); + + VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom Op"; + std::vector> ins_auto_grad_metas; + std::vector> outs_auto_grad_metas; + VLOG(7) << "We got slot num of ins is: " << ctx.InputRange().size(); + ins_auto_grad_metas.resize(ctx.InputRange().size()); + VLOG(7) << "We got slot num of outs is: " << ctx.OutputRange().size(); + outs_auto_grad_metas.resize(ctx.OutputRange().size()); + for (size_t i = 0; i < ctx.InputRange().size(); i++) { + ins_auto_grad_metas[i] = + egr::EagerUtils::nullable_autograd_meta(ctx.InputsBetween( + ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second)); + } + for (size_t i = 0; i < ctx.OutputRange().size(); i++) { + outs_auto_grad_metas[i] = + egr::EagerUtils::unsafe_autograd_meta(ctx.OutputsBetweeen( + ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second)); + } + bool require_any_grad = false; + for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { + require_any_grad = + require_any_grad || egr::EagerUtils::ComputeRequireGrad( + trace_backward, &(ins_auto_grad_metas[i])); + } + if (require_any_grad) { + VLOG(6) << " Construct Grad for Custom Op: " << op_type; + ConstructFwdAndBwdMap(vec_map, op_type); + for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { + egr::EagerUtils::PassStopGradient(false, &(outs_auto_grad_metas[i])); + } + auto grad_node = std::make_shared( + outs_auto_grad_metas.size(), ins_auto_grad_metas.size(), op_type); + auto slot_map = + egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type); + // Prepare Grad outputs + size_t no_grad_cnt = 0; + for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { + if (slot_map[0].find(i) != slot_map[0].end()) { + grad_node->SetGradOutMeta(&ins_auto_grad_metas[i], slot_map[0][i]); + grad_node->AddEdges(&ins_auto_grad_metas[i], slot_map[0][i]); + } else { + grad_node->SetGradOutMeta(&ins_auto_grad_metas[i], + ins_auto_grad_metas.size() - 1 - no_grad_cnt); + grad_node->AddEdges(&ins_auto_grad_metas[i], + ins_auto_grad_metas.size() - 1 - no_grad_cnt); + no_grad_cnt++; + } + } + // Prepare Grad inputs with grad of fwd outputs + for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { + egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i); + 
egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node); + grad_node->SetGradInMeta(&(outs_auto_grad_metas[i]), i); + egr::EagerUtils::CheckAndRetainGrad(ctx.OutputsBetweeen( + ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second)); + } + + // Prepare Grad inputs with fwd outputs + for (auto it = slot_map[2].begin(); it != slot_map[2].end(); it++) { + VLOG(7) << "Prepare fwd_outs: " << it->first + << " to grad_inputs: " << it->second; + grad_node->fwd_outs[it->second] = + egr::RunCustomOpNode::ConstructTensorWrapper( + ctx.OutputsBetweeen(ctx.OutputRangeAt(it->first).first, + ctx.OutputRangeAt(it->first).second)); + } + + // Prepare Grad inputs with fwd inputs + for (auto it = slot_map[3].begin(); it != slot_map[3].end(); it++) { + VLOG(7) << "Prepare fwd_ins: " << it->first + << " to grad_inputs: " << it->second; + grad_node->fwd_ins[it->second] = + egr::RunCustomOpNode::ConstructTensorWrapper( + ctx.InputsBetween(ctx.InputRangeAt(it->first).first, + ctx.InputRangeAt(it->first).second)); + } + + auto attrs_names = paddle::framework::OpMetaInfoHelper::GetAttrs( + meta_info_map.at(op_type)[1]); + std::vector attrs(attrs_names.size()); + // Prepare attrs for Grad node + for (auto it = slot_map[4].begin(); it != slot_map[4].end(); it++) { + VLOG(7) << "Prepare fwd attrs: " << it->first + << " to grad_attrs: " << it->second; + attrs[it->second] = res_attrs[it->first]; + } + grad_node->SetAttrs(attrs); + } + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_functions[] = { + // TODO(jiabin): Remove scale when we have final state tests {"scale", (PyCFunction)(void (*)(void))eager_api_scale, METH_VARARGS | METH_KEYWORDS, NULL}, {"_set_expected_place", @@ -179,6 +452,8 @@ PyMethodDef variable_functions[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_run_custom_op", (PyCFunction)(void (*)(void))eager_api_run_costum_op, + METH_VARARGS | METH_KEYWORDS, NULL}, {"tensor_copy", (PyCFunction)(void (*)(void))eager_api_tensor_copy, METH_VARARGS | METH_KEYWORDS, NULL}, {"read_next_tensor_list", diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 2e1390cb961..2572866b8f5 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -72,7 +72,7 @@ PyObject* tensor_properties_get_grad(TensorObject* self, void* closure) { EAGER_TRY VLOG(6) << "Get grad for tensor: " << self->tensor.name(); auto meta = egr::EagerUtils::nullable_autograd_meta(self->tensor); - if (meta) { + if (meta && meta->Grad().initialized()) { return ToPyObject(meta->Grad()); } else { Py_INCREF(Py_None); diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index f4e148cf8dc..217edad0c0a 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -27,10 +27,10 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/op_function_common.h" #include "paddle/fluid/pybind/tensor_py.h" +#include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" - namespace paddle { namespace pybind { @@ -46,6 +46,7 @@ extern PyTypeObject* g_npuplace_pytype; extern PyTypeObject* g_cudapinnedplace_pytype; extern PyTypeObject* g_framework_tensor_pytype; extern PyTypeObject* g_framework_lodtensorarray_pytype; +extern PyTypeObject* g_custom_op_kernel_ctx_pytype; int TensorDtype2NumpyDtype(phi::DataType dtype) { switch (dtype) { @@ -184,7 +185,7 @@ paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " - "EagerVariable, but got %s", + "Tensor, but got %s", arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); } } @@ -319,7 +320,7 @@ framework::Tensor CastPyArg2FrameworkTensor(PyObject* obj, ssize_t arg_pos) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "argument (position %d) must be " - "EagerVariable, but got %s", + "DenseTensor, but got %s", arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); } } @@ -391,6 +392,19 @@ paddle::framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, return dtype; } +paddle::CustomOpKernelContext CastPyArg2CustomOpKernelContext(PyObject* obj, + ssize_t arg_pos) { + if (PyObject_IsInstance( + obj, reinterpret_cast(g_custom_op_kernel_ctx_pytype))) { + return ::pybind11::handle(obj).cast(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "one of(Place,CUDAPlace,CPUPlace,XPUPlace,NPUPlace,CUDAPinnedPlace), " + "but got %s", + arg_pos + 1, reinterpret_cast(obj->ob_type)->tp_name)); + } +} PyObject* ToPyObject(bool value) { if (value) { Py_INCREF(Py_True); @@ -928,6 +942,5 @@ paddle::experimental::DataType CastPyArg2DataType(PyObject* obj, framework::proto::VarType::Type type = CastPyArg2ProtoType(obj, arg_pos); return framework::TransToPhiDataType(type); } - } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 966a920377b..2187555e1c3 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -20,10 +20,10 @@ limitations under the License. 
*/ #include "pybind11/pybind11.h" #include "pybind11/stl.h" namespace paddle { +class CustomOpKernelContext; namespace framework { class Scope; } - namespace pybind { typedef struct { @@ -40,6 +40,8 @@ int CastPyArg2AttrInt(PyObject* obj, ssize_t arg_pos); int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos); float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos); std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos); +paddle::CustomOpKernelContext CastPyArg2CustomOpKernelContext(PyObject* obj, + ssize_t arg_pos); paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos); std::shared_ptr CastPyArg2VarBase(PyObject* obj, ssize_t arg_pos); @@ -52,6 +54,7 @@ std::vector CastPyArg2VectorOfTensorBase(PyObject* obj, std::vector CastPyArg2VectorOfInt(PyObject* obj, size_t arg_pos); framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, ssize_t arg_pos); + PyObject* ToPyObject(int value); PyObject* ToPyObject(bool value); PyObject* ToPyObject(int64_t value); @@ -138,6 +141,7 @@ std::vector GetTensorPtrListFromArgs( ssize_t arg_idx, bool dispensable = false); // end of Slice related methods + std::vector GetScopePtrListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable); diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc index 362a3e44fab..4f25a6f1a5c 100644 --- a/paddle/fluid/pybind/exception.cc +++ b/paddle/fluid/pybind/exception.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/pybind/exception.h" - +#include "paddle/phi/api/ext/exception.h" namespace paddle { namespace pybind { @@ -122,6 +122,8 @@ void ThrowExceptionToPython(std::exception_ptr p) { PyErr_SetString(EnforceNotMetException, e.what()); break; } + } catch (const paddle::PD_Exception& e) { + PyErr_SetString(PyExc_OSError, e.what()); } } } // namespace pybind diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index ee6dce5dc23..21bbc7f3e36 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -164,6 +164,9 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/fleet_py.h" #endif +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/phi/api/ext/op_meta_info.h" #include "pybind11/stl.h" DECLARE_bool(use_mkldnn); @@ -187,6 +190,7 @@ PyTypeObject *g_cudapinnedplace_pytype = nullptr; PyTypeObject *g_mluplace_pytype = nullptr; PyTypeObject *g_framework_tensor_pytype = nullptr; PyTypeObject *g_framework_lodtensorarray_pytype = nullptr; +PyTypeObject *g_custom_op_kernel_ctx_pytype = nullptr; bool IsCompiledWithCUDA() { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) @@ -757,6 +761,57 @@ PYBIND11_MODULE(core_noavx, m) { m.def("_promote_types_if_complex_exists", &paddle::framework::PromoteTypesIfComplexExists); + py::class_ custom_op_kernel_ctx( + m, "CustomOpKernelContext", R"DOC()DOC"); + g_custom_op_kernel_ctx_pytype = + reinterpret_cast(custom_op_kernel_ctx.ptr()); + custom_op_kernel_ctx.def(py::init<>()) + .def("add_inputs", + [](paddle::CustomOpKernelContext &self, const py::handle &input) { + PyObject *obj = input.ptr(); + if (PyList_Check(obj) || PyTuple_Check(obj)) { + self.EmplaceBackInputs( + std::move(CastPyArg2VectorOfTensor(obj, 1))); + } else { + self.EmplaceBackInput(std::move(CastPyArg2Tensor(obj, 1))); + } + }) + .def("add_outputs", + [](paddle::CustomOpKernelContext &self, py::handle &outputs) { + PyObject *obj = outputs.ptr(); + if (PyList_Check(obj) || PyTuple_Check(obj)) { + self.EmplaceBackOutputs( + std::move(CastPyArg2VectorOfTensor(obj, 1))); + } else { + self.EmplaceBackOutput(std::move(CastPyArg2Tensor(obj, 1))); + } + }) + .def("add_attr", [](paddle::CustomOpKernelContext &self, + bool attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", [](paddle::CustomOpKernelContext &self, + int attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", [](paddle::CustomOpKernelContext &self, + float attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", [](paddle::CustomOpKernelContext &self, + int64_t attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, const std::string &attr) { + self.EmplaceBackAttr(attr); + }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, + const std::vector &attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, + const std::vector &attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", + [](paddle::CustomOpKernelContext &self, + const std::vector &attr) { self.EmplaceBackAttr(attr); }) + .def("add_attr", [](paddle::CustomOpKernelContext &self, + const std::vector &attr) { + self.EmplaceBackAttr(attr); + }); + py::class_ framework_tensor(m, "Tensor", py::buffer_protocol()); g_framework_tensor_pytype = @@ -2827,10 +2882,11 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("init_gflags", framework::InitGflags); m.def("init_glog", framework::InitGLOG); - m.def("load_op_meta_info_and_register_op", - framework::LoadOpMetaInfoAndRegisterOp); + m.def("load_op_meta_info_and_register_op", [](const std::string dso_name) { + egr::Controller::Instance().MergeOpMetaInfoMap( + framework::LoadOpMetaInfoAndRegisterOp(dso_name)); + }); m.def("init_devices", []() { framework::InitDevices(); }); - m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_ascend", IsCompiledWithAscend); m.def("is_compiled_with_rocm", IsCompiledWithROCM); diff --git a/paddle/phi/api/ext/op_meta_info.h b/paddle/phi/api/ext/op_meta_info.h index 7601696293a..88660449b68 100644 --- a/paddle/phi/api/ext/op_meta_info.h +++ b/paddle/phi/api/ext/op_meta_info.h @@ -86,19 +86,28 @@ class PADDLE_API CustomOpKernelContext { CustomOpKernelContext() = default; void EmplaceBackInput(Tensor&& input); - void EmplaceBackInputs(std::vector&& inputs); + void EmplaceBackInputs(const std::vector& inputs); void EmplaceBackOutput(Tensor&& output); - void EmplaceBackOutputs(std::vector&& outputs); + void EmplaceBackOutputs(const std::vector& outputs); void EmplaceBackAttr(paddle::any attr); - + void EmplaceBackAttrs(const std::vector& attrs) { + attrs_ = std::move(attrs); + } const std::pair& InputRangeAt(size_t idx) const; const std::pair& OutputRangeAt(size_t idx) const; const Tensor& InputAt(size_t idx) const; std::vector InputsBetween(size_t start, size_t end) const; - + const std::vector& Attrs() const { return attrs_; } + const std::vector>& InputRange() { + return input_range_; + } + const std::vector>& OutputRange() { + return output_range_; + } Tensor* MutableOutputAt(size_t idx); std::vector MutableOutputBetweeen(size_t start, size_t end); + std::vector OutputsBetweeen(size_t start, size_t end); std::vector* AllMutableOutput(); template @@ -552,7 +561,6 @@ class PADDLE_API OpMetaInfo { std::vector inputs_; std::vector outputs_; std::vector attrs_; - // 2. func info KernelFunc kernel_fn_{nullptr}; InferShapeFunc infer_shape_fn_{nullptr}; diff --git a/paddle/phi/api/lib/op_meta_info.cc b/paddle/phi/api/lib/op_meta_info.cc index 51d51c954de..14dba664c41 100644 --- a/paddle/phi/api/lib/op_meta_info.cc +++ b/paddle/phi/api/lib/op_meta_info.cc @@ -51,7 +51,8 @@ void CustomOpKernelContext::EmplaceBackInput(Tensor&& input) { input_range_.emplace_back(std::make_pair(index, index + 1)); } -void CustomOpKernelContext::EmplaceBackInputs(std::vector&& inputs) { +void CustomOpKernelContext::EmplaceBackInputs( + const std::vector& inputs) { size_t index = inputs_.size(); input_range_.emplace_back(std::make_pair(index, index + inputs.size())); inputs_.insert(inputs_.end(), @@ -65,7 +66,8 @@ void CustomOpKernelContext::EmplaceBackOutput(Tensor&& output) { output_range_.emplace_back(std::make_pair(index, index + 1)); } -void CustomOpKernelContext::EmplaceBackOutputs(std::vector&& outputs) { +void CustomOpKernelContext::EmplaceBackOutputs( + const std::vector& outputs) { size_t index = outputs_.size(); output_range_.emplace_back(std::make_pair(index, index + outputs.size())); outputs_.insert(outputs_.end(), @@ -75,6 +77,8 @@ void CustomOpKernelContext::EmplaceBackOutputs(std::vector&& outputs) { void CustomOpKernelContext::EmplaceBackAttr(paddle::any attr) { attrs_.emplace_back(std::move(attr)); + VLOG(7) << "attrs_ No." 
<< attrs_.size() - 1 + << " has value of type: " << attrs_[attrs_.size() - 1].type().name(); } const Tensor& CustomOpKernelContext::InputAt(size_t idx) const { @@ -102,6 +106,15 @@ std::vector CustomOpKernelContext::MutableOutputBetweeen(size_t start, return rlt; } +std::vector CustomOpKernelContext::OutputsBetweeen(size_t start, + size_t end) { + std::vector rlt; + for (size_t i = start; i < end; ++i) { + rlt.emplace_back(outputs_.at(i)); + } + return rlt; +} + std::vector* CustomOpKernelContext::AllMutableOutput() { return &outputs_; } diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 311dd0fc309..40174a505dc 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -111,8 +111,8 @@ void Tensor::reshape(const std::vector &shape) { "touching underlying data, this requires the total size of " "the tensor to remain constant."; if (is_dense_tensor()) { - std::dynamic_pointer_cast(impl_)->set_meta( - phi::DenseTensorMeta(dtype(), phi::make_ddim(shape))); + std::dynamic_pointer_cast(impl_)->Resize( + phi::make_ddim(shape)); } else { PADDLE_THROW(phi::errors::Unimplemented( "Only support reshape operation on DenseTensor now.")); diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index aefa26952d1..885e29b27fa 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/lib/ext_compat_utils.h" +#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/tensor_base.h" diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py index 36ca048c512..6fc6f7d3d49 100644 --- a/python/paddle/autograd/backward_mode.py +++ b/python/paddle/autograd/backward_mode.py @@ -81,15 +81,14 @@ def backward(tensors, grad_tensors=None, retain_graph=False): if isinstance(in_out_list, (list, tuple)): assert len(in_out_list) > 0, "{} connot be empyt".format(name) for each_var in in_out_list: - assert isinstance( - each_var, paddle. - Tensor), "Elements of {} must be paddle.Tensor".format(name) + assert isinstance(each_var, ( + paddle.Tensor, core.eager.Tensor + )), "Elements of {} must be paddle.Tensor".format(name) return in_out_list else: - assert isinstance( - in_out_list, - paddle.Tensor), "{} must be Tensor or list of Tensor".format( - name) + assert isinstance(in_out_list, ( + paddle.Tensor, core.eager.Tensor + )), "{} must be Tensor or list of Tensor".format(name) return [in_out_list] tensors = check_tensors(tensors, "tensors") @@ -105,10 +104,13 @@ def backward(tensors, grad_tensors=None, retain_graph=False): for each_tensor in grad_tensors: if each_tensor is not None: assert isinstance( - each_tensor, paddle.Tensor + each_tensor, (paddle.Tensor, core.eager.Tensor) ), "The argument 'grad_tensors' of paddle.autograd.backward is invalid, it can be 'None', 'paddle.Tensor' or 'list[None/paddle.Tensor]'." 
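The backward_mode.py hunks here widen the type checks so that core.eager.Tensor is accepted alongside paddle.Tensor, and (just below) route the call to core.eager.run_backward when eager mode is on. For reference, a minimal usage sketch of the public API being adapted, paddle.autograd.backward:

    import paddle

    x = paddle.to_tensor([1.0, 2.0, 3.0], stop_gradient=False)
    y = x * x

    # grad_tensors may be None or a list with one (possibly None) entry per tensor.
    paddle.autograd.backward([y], grad_tensors=None, retain_graph=False)
    print(x.grad.numpy())   # [2. 4. 6.]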
else: - grad_tensors = [None] * len(tensors) + if core._in_eager_mode(): + grad_tensors = [] + else: + grad_tensors = [None] * len(tensors) if len(grad_tensors) > 0: assert len(tensors) == len( @@ -116,5 +118,8 @@ def backward(tensors, grad_tensors=None, retain_graph=False): assert isinstance(retain_graph, bool), "retain_graph must be True or False" - core.dygraph_run_backward(tensors, grad_tensors, retain_graph, - framework._dygraph_tracer()) + if core._in_eager_mode(): + core.eager.run_backward(tensors, grad_tensors, retain_graph) + else: + core.dygraph_run_backward(tensors, grad_tensors, retain_graph, + framework._dygraph_tracer()) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 6843c0e4c3f..2b67a202972 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -311,7 +311,7 @@ def monkey_patch_varbase(): """ if core._in_eager_mode(): - if not self.grad._is_initialized(): + if self.grad is None: return None # TODO(wanghuancoder) support SELECTED_ROWS return self.grad.numpy() diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc index c89990be34c..acaf7cb7428 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc @@ -153,6 +153,7 @@ PD_BUILD_GRAD_OP(custom_relu_no_x_in_backward) .SetInferShapeFn(PD_INFER_SHAPE(ReluBackwardWithoutXInferShape)); void relu_cpu_forward_out(const paddle::Tensor& x, paddle::Tensor* out) { + out->reshape(x.shape()); PD_DISPATCH_FLOATING_TYPES( x.type(), "relu_cpu_forward", ([&] { relu_cpu_forward_kernel( @@ -164,6 +165,7 @@ void relu_cpu_backward_out(const paddle::Tensor& x, const paddle::Tensor& out, const paddle::Tensor& grad_out, paddle::Tensor* grad_x) { + grad_x->reshape(x.shape()); PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] { relu_cpu_backward_kernel( grad_out.data(), diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu index 33c5ede299b..4bb773cdaec 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu @@ -94,6 +94,7 @@ void relu_cuda_forward_out(const paddle::Tensor& x, paddle::Tensor* out) { int numel = x.size(); int block = 512; int grid = (numel + block - 1) / block; + out->reshape(x.shape()); PD_DISPATCH_FLOATING_AND_HALF_TYPES( x.type(), "relu_cuda_forward_kernel", ([&] { relu_cuda_forward_kernel<<>>( @@ -108,6 +109,7 @@ void relu_cuda_backward_out(const paddle::Tensor& x, int numel = out.size(); int block = 512; int grid = (numel + block - 1) / block; + grad_x->reshape(x.shape()); PD_DISPATCH_FLOATING_AND_HALF_TYPES( out.type(), "relu_cuda_backward_kernel", ([&] { relu_cuda_backward_kernel<<>>( diff --git a/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py index 1c9c6eedbae..785bfc74229 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py @@ -20,6 +20,7 @@ import paddle from paddle.utils.cpp_extension import load, get_build_directory from utils import paddle_includes, extra_cc_args, extra_nvcc_args from paddle.utils.cpp_extension.extension_utils import run_cmd +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode # Because 
Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. @@ -53,7 +54,7 @@ class TestJitCustomAttrs(unittest.TestCase): self.int64_vec_attr = [10000000000, 10000000000, 10000000000] self.str_vec_attr = ["StrAttr", "StrAttr", "StrAttr"] - def test_attr_value(self): + def func_attr_value(self): x = paddle.ones([2, 2], dtype='float32') x.stop_gradient = False out = custom_attrs.attr_test( @@ -65,7 +66,12 @@ class TestJitCustomAttrs(unittest.TestCase): self.assertTrue(np.array_equal(x.numpy(), out.numpy())) - def test_const_attr_value(self): + def test_attr_value(self): + with _test_eager_guard(): + self.func_attr_value() + self.func_attr_value() + + def func_const_attr_value(self): x = paddle.ones([2, 2], dtype='float32') x.stop_gradient = False out = custom_attrs.const_attr_test( @@ -77,6 +83,11 @@ class TestJitCustomAttrs(unittest.TestCase): self.assertTrue(np.array_equal(x.numpy(), out.numpy())) + def test_const_attr_value(self): + with _test_eager_guard(): + self.func_const_attr_value() + self.func_const_attr_value() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_custom_concat.py b/python/paddle/fluid/tests/custom_op/test_custom_concat.py index 9049b604c91..62e61c5bc7f 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_concat.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_concat.py @@ -21,6 +21,7 @@ import paddle.static as static from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
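This and the remaining custom-op test files in the patch all apply the same refactor: the original test_* body is renamed to func_*, and a new test_* wrapper runs it twice, first under _test_eager_guard() (eager mode) and then again in legacy dygraph mode. The shape of the pattern, abstracted into a standalone sketch:

    import unittest
    from paddle.fluid.framework import _test_eager_guard

    class ExampleCase(unittest.TestCase):
        def func_something(self):
            pass                         # original test body, unchanged

        def test_something(self):
            with _test_eager_guard():    # first pass: eager mode
                self.func_something()
            self.func_something()        # second pass: legacy dygraph mode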
@@ -116,7 +117,7 @@ class TestCustomConcatDynamicAxisJit(unittest.TestCase): "custom op {}: {},\n paddle api {}: {}".format(name, out, name, pd_out)) - def test_dynamic(self): + def func_dynamic(self): for dtype in self.dtypes: for axis in self.axises: out, grad_inputs = concat_dynamic(custom_ops.custom_concat, @@ -128,6 +129,11 @@ class TestCustomConcatDynamicAxisJit(unittest.TestCase): for x_grad, pd_x_grad in zip(grad_inputs, pd_grad_inputs): self.check_output(x_grad, pd_x_grad, "x_grad") + def test_dynamic(self): + with _test_eager_guard(): + self.func_dynamic() + self.func_dynamic() + def test_static(self): for dtype in self.dtypes: for axis in self.axises: @@ -140,7 +146,7 @@ class TestCustomConcatDynamicAxisJit(unittest.TestCase): self.check_output(x1_grad, pd_x1_grad, "x1_grad") self.check_output(x2_grad, pd_x2_grad, "x2_grad") - def test_dynamic_with_attr(self): + def func_dynamic_with_attr(self): for dtype in self.dtypes: for axis in self.axises: out, grad_inputs = concat_dynamic( @@ -153,6 +159,11 @@ class TestCustomConcatDynamicAxisJit(unittest.TestCase): for x_grad, pd_x_grad in zip(grad_inputs, pd_grad_inputs): self.check_output(x_grad, pd_x_grad, "x_grad") + def test_dynamic_with_attr(self): + with _test_eager_guard(): + self.func_dynamic_with_attr() + self.func_dynamic_with_attr() + def test_static_with_attr(self): for dtype in self.dtypes: for axis in self.axises: diff --git a/python/paddle/fluid/tests/custom_op/test_custom_conj.py b/python/paddle/fluid/tests/custom_op/test_custom_conj.py index 25c88ee6c6b..5f3c107a9b2 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_conj.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_conj.py @@ -21,6 +21,7 @@ import paddle.static as static from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args +from paddle.fluid.framework import _test_eager_guard # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. @@ -116,11 +117,16 @@ class TestCustomConjJit(unittest.TestCase): self.check_output(out, pd_out, "out") self.check_output(x_grad, pd_x_grad, "x's grad") - def test_dynamic(self): + def func_dynamic(self): for dtype in self.dtypes: np_input = np.random.random(self.shape).astype(dtype) self.run_dynamic(dtype, np_input) + def test_dynamic(self): + with _test_eager_guard(): + self.func_dynamic() + self.func_dynamic() + def test_static(self): for dtype in self.dtypes: np_input = np.random.random(self.shape).astype(dtype) diff --git a/python/paddle/fluid/tests/custom_op/test_custom_linear.py b/python/paddle/fluid/tests/custom_op/test_custom_linear.py index 0ba70eaa3e0..811eedf1eda 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_linear.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_linear.py @@ -22,6 +22,7 @@ import paddle.nn.functional as F from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
@@ -94,7 +95,7 @@ class TestCustomLinearJit(unittest.TestCase): self.np_bias) self.check_output(pten_out, pd_out, "pten_out") - def test_dynamic(self): + def func_dynamic(self): for dtype in self.dtypes: pten_out = linear_dynamic(custom_ops.pten_linear, dtype, self.np_x, self.np_weight, self.np_bias) @@ -102,6 +103,11 @@ class TestCustomLinearJit(unittest.TestCase): self.np_bias) self.check_output(pten_out, pd_out, "pten_out") + def test_dynamic(self): + with _test_eager_guard(): + self.func_dynamic() + self.func_dynamic() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py b/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py index 207ea879741..4da99b1ea10 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_raw_op_kernel_op.py @@ -68,12 +68,6 @@ class TestCustomRawReluOp(unittest.TestCase): self.assertTrue(custom_raw_relu_op is not None) return custom_raw_relu_op(x) - def test_dygraph(self): - x = paddle.to_tensor(np.random.uniform(low=-1.0, high=1.0, size=[2, 3])) - y1 = self.custom_raw_relu(x) - y2 = paddle.nn.ReLU()(x) - self.assertTrue(np.array_equal(y1.numpy(), y2.numpy())) - def test_static(self): paddle.enable_static() shape = [2, 3] diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py index dddb14eb78e..81793f1391d 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py @@ -22,6 +22,7 @@ from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args, IS_MAC +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
@@ -98,7 +99,7 @@ class TestDygraphModel(unittest.TestCase): self.x_spec = paddle.static.InputSpec( shape=[None, self.in_dim], dtype='float32', name='x') - def test_train_eval(self): + def func_train_eval(self): for device in self.devices: # set device paddle.set_device(device) @@ -106,26 +107,34 @@ class TestDygraphModel(unittest.TestCase): # for train origin_relu_train_out = self.train_model(use_custom_op=False) custom_relu_train_out = self.train_model(use_custom_op=True) - custom_relu_dy2stat_train_out = self.train_model( - use_custom_op=True, dy2stat=True) # for to_static + # open this when dy2stat is ready for eager + if not _in_eager_mode(): + custom_relu_dy2stat_train_out = self.train_model( + use_custom_op=True, dy2stat=True) # for to_static + self.assertTrue( + np.array_equal(origin_relu_train_out, + custom_relu_dy2stat_train_out)) self.assertTrue( np.array_equal(origin_relu_train_out, custom_relu_train_out)) - self.assertTrue( - np.array_equal(origin_relu_train_out, - custom_relu_dy2stat_train_out)) # for eval origin_relu_eval_out = self.eval_model(use_custom_op=False) custom_relu_eval_out = self.eval_model(use_custom_op=True) - custom_relu_dy2stat_eval_out = self.eval_model( - use_custom_op=True, dy2stat=True) # for to_static + if not _in_eager_mode(): + custom_relu_dy2stat_eval_out = self.eval_model( + use_custom_op=True, dy2stat=True) # for to_static + self.assertTrue( + np.array_equal(origin_relu_eval_out, + custom_relu_dy2stat_eval_out)) self.assertTrue( np.array_equal(origin_relu_eval_out, custom_relu_eval_out)) - self.assertTrue( - np.array_equal(origin_relu_eval_out, - custom_relu_dy2stat_eval_out)) + + def test_train_eval(self): + with _test_eager_guard(): + self.func_train_eval() + self.func_train_eval() def train_model(self, use_custom_op=False, dy2stat=False): # reset random seed diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py index 407eb342ba9..a747d10823e 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py @@ -20,7 +20,7 @@ from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args, IS_WINDOWS, IS_MAC from test_custom_relu_op_setup import custom_relu_dynamic, custom_relu_static - +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
file = '{}\\custom_relu_module_jit\\custom_relu_module_jit.pyd'.format( @@ -75,7 +75,7 @@ class TestJITLoad(unittest.TestCase): "custom op out: {},\n paddle api out: {}".format( out, pd_out)) - def test_dynamic(self): + def func_dynamic(self): for device in self.devices: for dtype in self.dtypes: if device == 'cpu' and dtype == 'float16': @@ -95,8 +95,14 @@ class TestJITLoad(unittest.TestCase): "custom op x grad: {},\n paddle api x grad: {}".format( x_grad, pd_x_grad)) - def test_exception(self): + def test_dynamic(self): + with _test_eager_guard(): + self.func_dynamic() + self.func_dynamic() + + def func_exception(self): caught_exception = False + # if not _in_eager_mode(): try: x = np.random.uniform(-1, 1, [4, 8]).astype('int32') custom_relu_dynamic(custom_module.custom_relu, 'cpu', 'int32', x) @@ -114,11 +120,11 @@ class TestJITLoad(unittest.TestCase): "python/paddle/fluid/tests/custom_op/custom_relu_op.cc" in str(e)) self.assertTrue(caught_exception) - caught_exception = False # MAC-CI don't support GPU if IS_MAC: return + # if not _in_eager_mode(): try: x = np.random.uniform(-1, 1, [4, 8]).astype('int32') custom_relu_dynamic(custom_module.custom_relu, 'gpu', 'int32', x) @@ -132,6 +138,11 @@ class TestJITLoad(unittest.TestCase): str(e)) self.assertTrue(caught_exception) + def test_exception(self): + with _test_eager_guard(): + self.func_exception() + self.func_exception() + def test_load_multiple_module(self): custom_module = load( name='custom_conj_jit', diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py index 0af0aa16466..7c61e11a18e 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py @@ -21,6 +21,7 @@ import paddle.static as static import subprocess import numpy as np from paddle.utils.cpp_extension.extension_utils import run_cmd +from paddle.fluid.framework import _test_eager_guard def custom_relu_dynamic(func, device, dtype, np_x, use_func=True): @@ -216,7 +217,7 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): "custom op out: {},\n paddle api out: {}".format( out, pd_out)) - def test_dynamic(self): + def func_dynamic(self): for device in self.devices: for dtype in self.dtypes: if device == 'cpu' and dtype == 'float16': @@ -236,6 +237,11 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): "custom op x grad: {},\n paddle api x grad: {}".format( x_grad, pd_x_grad)) + def test_dynamic(self): + with _test_eager_guard(): + self.func_dynamic() + self.func_dynamic() + def test_static_save_and_load_inference_model(self): paddle.enable_static() np_data = np.random.random((1, 1, 28, 28)).astype("float32") diff --git a/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py b/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py index c60bac4060b..f68a37b1a2f 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_simple_slice.py @@ -20,6 +20,7 @@ import paddle from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
@@ -39,7 +40,7 @@ custom_ops = load( class TestCustomSimpleSliceJit(unittest.TestCase): - def test_slice_output(self): + def func_slice_output(self): np_x = np.random.random((5, 2)).astype("float32") x = paddle.to_tensor(np_x) custom_op_out = custom_ops.custom_simple_slice(x, 2, 3) @@ -48,6 +49,11 @@ class TestCustomSimpleSliceJit(unittest.TestCase): np.array_equal(custom_op_out, np_out), "custom op: {},\n numpy: {}".format(np_out, custom_op_out.numpy())) + def test_slice_output(self): + with _test_eager_guard(): + self.func_slice_output() + self.func_slice_output() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py index 12e9f50a5e4..0d2cb941eaf 100644 --- a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py @@ -19,7 +19,7 @@ import numpy as np from paddle.utils.cpp_extension import load, get_build_directory from utils import paddle_includes, extra_cc_args from paddle.utils.cpp_extension.extension_utils import run_cmd - +from paddle.fluid.framework import _test_eager_guard # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. file = '{}\\dispatch_op\\dispatch_op.pyd'.format(get_build_directory()) @@ -39,7 +39,7 @@ class TestJitDispatch(unittest.TestCase): def setUp(self): paddle.set_device('cpu') - def run_dispatch_test(self, func, dtype): + def run_dispatch_test_impl(self, func, dtype): np_x = np.ones([2, 2]).astype(dtype) x = paddle.to_tensor(np_x) out = func(x) @@ -50,6 +50,11 @@ class TestJitDispatch(unittest.TestCase): np.array_equal(np_x, np_out), "custom op x: {},\n custom op out: {}".format(np_x, np_out)) + def run_dispatch_test(self, func, dtype): + with _test_eager_guard(): + self.run_dispatch_test_impl(func, dtype) + self.run_dispatch_test_impl(func, dtype) + def test_dispatch_integer(self): dtypes = ["int32", "int64", "int8", "uint8", "int16"] for dtype in dtypes: diff --git a/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py b/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py index 97b37498c4d..4fc9270b0f4 100644 --- a/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_multi_out_jit.py @@ -22,7 +22,7 @@ from paddle.utils.cpp_extension import load from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args - +from paddle.fluid.framework import _test_eager_guard # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
file = '{}\\multi_out_jit\\multi_out_jit.pyd'.format(get_build_directory()) @@ -84,7 +84,7 @@ class TestMultiOutputDtypes(unittest.TestCase): self.check_multi_outputs(res) paddle.disable_static() - def test_dynamic(self): + def func_dynamic(self): for device in self.devices: for dtype in self.dtypes: paddle.set_device(device) @@ -95,6 +95,11 @@ class TestMultiOutputDtypes(unittest.TestCase): self.assertTrue(len(outs) == 3) self.check_multi_outputs(outs, True) + def test_dynamic(self): + with _test_eager_guard(): + self.func_dynamic() + self.func_dynamic() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py index bc280a01890..83a25b71626 100644 --- a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py +++ b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py @@ -20,6 +20,7 @@ import numpy as np import paddle import paddle.fluid.dygraph as dg from op_test import OpTest +from paddle.fluid.framework import _test_eager_guard class TestTensorBackward(unittest.TestCase): @@ -29,7 +30,7 @@ class TestTensorBackward(unittest.TestCase): if paddle.is_compiled_with_cuda(): self._places.append(paddle.CUDAPlace(0)) - def test_tensor_backward(self): + def func_tensor_backward(self): for dtype in self._dtypes: x = np.random.random([2, 100]).astype(dtype) y = np.random.random([100, 2]).astype(dtype) @@ -48,6 +49,11 @@ class TestTensorBackward(unittest.TestCase): self.assertTrue(np.allclose(x_grad, x_tensor.grad.numpy())) + def test_tensor_backward(self): + with _test_eager_guard(): + self.func_tensor_backward() + self.func_tensor_backward() + class TestBackwardAPI(unittest.TestCase): def setUp(self): @@ -56,7 +62,7 @@ class TestBackwardAPI(unittest.TestCase): if paddle.is_compiled_with_cuda(): self._places.append(paddle.CUDAPlace(0)) - def test_backward_api(self): + def func_backward_api(self): for dtype in self._dtypes: x = np.random.random([2, 2]).astype(dtype) y = np.random.random([2, 2]).astype(dtype) @@ -78,7 +84,12 @@ class TestBackwardAPI(unittest.TestCase): self.assertTrue( np.allclose(x_grad * 2, x_tensor.grad.numpy())) - def test_backward_single_tensor(self): + def test_backward_api(self): + with _test_eager_guard(): + self.func_backward_api() + self.func_backward_api() + + def func_backward_single_tensor(self): for dtype in self._dtypes: x = np.random.random([2, 2]).astype(dtype) y = np.random.random([2, 2]).astype(dtype) @@ -97,7 +108,12 @@ class TestBackwardAPI(unittest.TestCase): self.assertTrue(np.allclose(x_grad, x_tensor.grad.numpy())) - def test_backward_none_grad_tensor(self): + def test_backward_single_tensor(self): + with _test_eager_guard(): + self.func_backward_single_tensor() + self.func_backward_single_tensor() + + def func_backward_none_grad_tensor(self): for dtype in self._dtypes: x = np.random.random([2, 2]).astype(dtype) y = np.random.random([2, 2]).astype(dtype) @@ -115,7 +131,12 @@ class TestBackwardAPI(unittest.TestCase): self.assertTrue(np.allclose(x_grad, x_tensor.grad.numpy())) - def test_backward_accumulator_with_init_grad(self): + def test_backward_none_grad_tensor(self): + with _test_eager_guard(): + self.func_backward_none_grad_tensor() + self.func_backward_none_grad_tensor() + + def func_backward_accumulator_with_init_grad(self): for dtype in self._dtypes: x = np.random.random([10, ]).astype(dtype) y_grad = np.random.random([10, ]).astype(dtype) @@ -134,11 +155,14 @@ class TestBackwardAPI(unittest.TestCase): y = x**2 z 
= x**3 - x_grad = 2 * x_tensor * ( - y_grad_tensor + 3 * y_tensor * y_tensor * z_grad_tensor) + x_grad = 2 * x * (y_grad + 3 * y * y * z_grad) - self.assertTrue( - np.allclose(x_grad.numpy(), x_tensor.grad.numpy())) + self.assertTrue(np.allclose(x_grad, x_tensor.grad.numpy())) + + def test_backward_accumulator_with_init_grad(self): + with _test_eager_guard(): + self.func_backward_accumulator_with_init_grad() + self.func_backward_accumulator_with_init_grad() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 9744cda629e..27aec284de4 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -50,7 +50,7 @@ class EagerScaleTestCase(unittest.TestCase): data_eager.retain_grads() out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True) - self.assertFalse(data_eager.grad._is_initialized()) + self.assertIsNone(data_eager.grad) out_eager.backward(grad_eager, False) self.assertTrue(data_eager.grad._is_initialized()) self.assertTrue(np.array_equal(data_eager.grad.numpy(), input_data)) @@ -72,7 +72,7 @@ class EagerScaleTestCase(unittest.TestCase): data_eager.retain_grads() out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True) - self.assertFalse(data_eager.grad._is_initialized()) + self.assertIsNone(data_eager.grad) with self.assertRaisesRegexp( AssertionError, "The type of grad_tensor must be paddle.Tensor"): diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 6c27d465cb1..aac68efc59a 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -158,6 +158,7 @@ param : [x] kernel : func : scale, scale_sr + inplace : (x -> out) - api : sign args : (Tensor x) diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 853a98a62b5..b0a5d37a535 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -146,6 +146,9 @@ def custom_write_stub(resource, pyfile): import types import paddle + cur_dir = os.path.dirname(os.path.abspath(__file__)) + so_path = os.path.join(cur_dir, "{resource}") + def inject_ext_module(module_name, api_names): if module_name in sys.modules: return sys.modules[module_name] @@ -157,9 +160,6 @@ def custom_write_stub(resource, pyfile): return new_module def __bootstrap__(): - cur_dir = os.path.dirname(os.path.abspath(__file__)) - so_path = os.path.join(cur_dir, "{resource}") - assert os.path.exists(so_path) # load custom op shared library with abs path @@ -169,6 +169,7 @@ def custom_write_stub(resource, pyfile): __bootstrap__() {custom_api} + """).lstrip() # Parse registerring op information @@ -900,7 +901,7 @@ def _generate_python_module(module_name, # delete the temp file before exit python process atexit.register(lambda: remove_if_exit(api_file)) - # write into .py file with RWLock + # write into .py file with RWLockc api_content = [_custom_api_content(op_name) for op_name in op_names] with open(api_file, 'w') as f: f.write('\n\n'.join(api_content)) @@ -911,13 +912,15 @@ def _generate_python_module(module_name, def _custom_api_content(op_name): - params_str, ins_str, attrs_str, outs_str = _get_api_inputs_str(op_name) - + params_str, ins_str, attrs_str, outs_str, in_names, attrs_names = _get_api_inputs_str( + op_name) + lower_in_names = [p.split("@")[0].lower() 
for p in in_names] API_TEMPLATE = textwrap.dedent(""" - from paddle.fluid.core import VarBase - from paddle.fluid.framework import in_dygraph_mode, _dygraph_tracer + import paddle.fluid.core as core + from paddle.fluid.core import VarBase, CustomOpKernelContext + from paddle.fluid.framework import in_dygraph_mode, _dygraph_tracer, _in_eager_mode from paddle.fluid.layer_helper import LayerHelper - + def {op_name}({inputs}): # prepare inputs and outputs ins = {ins} @@ -928,9 +931,20 @@ def _custom_api_content(op_name): # The output variable's dtype use default value 'float32', # and the actual dtype of output variable will be inferred in runtime. if in_dygraph_mode(): - for out_name in out_names: - outs[out_name] = VarBase() - _dygraph_tracer().trace_op(type="{op_name}", inputs=ins, outputs=outs, attrs=attrs) + if _in_eager_mode(): + ctx = CustomOpKernelContext() + for i in {in_names}: + ctx.add_inputs(i) + for j in {attr_names}: + ctx.add_attr(j) + for out_name in out_names: + outs[out_name] = core.eager.Tensor() + ctx.add_outputs(outs[out_name]) + core.eager._run_custom_op(ctx, "{op_name}", True) + else: + for out_name in out_names: + outs[out_name] = VarBase() + _dygraph_tracer().trace_op(type="{op_name}", inputs=ins, outputs=outs, attrs=attrs) else: helper = LayerHelper("{op_name}", **locals()) for out_name in out_names: @@ -949,6 +963,9 @@ def _custom_api_content(op_name): inputs=params_str, ins=ins_str, attrs=attrs_str, + # "[x, y, z]"" + in_names="[" + ",".join(lower_in_names) + "]", + attr_names="[" + ",".join(attrs_names) + "]", out_names=outs_str) return api_content @@ -996,7 +1013,7 @@ def _get_api_inputs_str(op_name): ]) # e.g: ['Out', 'Index'] outs_str = "[%s]" % ','.join(["'{}'".format(name) for name in out_names]) - return params_str, ins_str, attrs_str, outs_str + return params_str, ins_str, attrs_str, outs_str, in_names, attr_names def _write_setup_file(name, -- GitLab From bb801960a24e6364b5a156d829a05668cf85eb0b Mon Sep 17 00:00:00 2001 From: Xiaoxu Chen Date: Mon, 14 Mar 2022 17:21:04 +0800 Subject: [PATCH 040/176] [phi]migrate fmax,fmin kernel to phi (#40140) --- .../elementwise/elementwise_functor.h | 83 ------- .../elementwise/elementwise_max_op.cc | 18 -- .../elementwise/elementwise_max_op.cu | 18 -- .../elementwise/elementwise_max_op.h | 98 -------- .../elementwise/elementwise_min_op.cc | 18 -- .../elementwise/elementwise_min_op.cu | 18 -- .../elementwise/elementwise_min_op.h | 99 -------- .../kernels/cpu/elementwise_grad_kernel.cc | 17 ++ paddle/phi/kernels/cpu/elementwise_kernel.cc | 35 +++ paddle/phi/kernels/elementwise_grad_kernel.h | 18 ++ paddle/phi/kernels/elementwise_kernel.h | 36 +++ .../phi/kernels/funcs/elementwise_functor.h | 213 ++++++++++++++++++ .../kernels/gpu/elementwise_grad_kernel.cu | 17 ++ paddle/phi/kernels/gpu/elementwise_kernel.cu | 35 +++ .../impl/elementwise_grad_kernel_impl.h | 96 ++++++++ .../kernels/impl/elementwise_kernel_impl.h | 47 ++++ paddle/phi/ops/compat/elementwise_sig.cc | 22 ++ 17 files changed, 536 insertions(+), 352 deletions(-) create mode 100644 paddle/phi/kernels/cpu/elementwise_kernel.cc create mode 100644 paddle/phi/kernels/elementwise_kernel.h create mode 100644 paddle/phi/kernels/gpu/elementwise_kernel.cu create mode 100644 paddle/phi/kernels/impl/elementwise_kernel_impl.h diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 14baeaa74d2..54931d99292 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ 
b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -1,11 +1,8 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -90,86 +87,6 @@ struct MinFunctor { template using Complex = paddle::platform::complex; -// Fmax -template -struct FMaxFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { - return std::fmax(a, b); - } -}; - -template <> -struct FMaxFunctor { - inline HOSTDEVICE paddle::platform::float16 operator()( - const paddle::platform::float16 a, - const paddle::platform::float16 b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmax(float_a, float_b); - return static_cast(result); - } -}; - -template <> -struct FMaxFunctor { - inline HOSTDEVICE int operator()(const int a, const int b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmax(float_a, float_b); - return std::lrint(result); - } -}; - -template <> -struct FMaxFunctor { - inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const { - double double_a = static_cast(a); - double double_b = static_cast(b); - auto result = std::fmax(double_a, double_b); - return std::llrint(result); - } -}; - -// Fmin -template -struct FMinFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { - return std::fmin(a, b); - } -}; - -template <> -struct FMinFunctor { - inline HOSTDEVICE paddle::platform::float16 operator()( - const paddle::platform::float16 a, - const paddle::platform::float16 b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmin(float_a, float_b); - return static_cast(result); - } -}; - -template <> -struct FMinFunctor { - inline HOSTDEVICE int operator()(const int a, const int b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmin(float_a, float_b); - return std::lrint(result); - } -}; - -template <> -struct FMinFunctor { - inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const { - double double_a = static_cast(a); - double double_b = static_cast(b); - auto result = std::fmin(double_a, double_b); - return std::llrint(result); - } -}; - template struct MinGradXFunctor { inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc index 91da732ef0d..d91315cc511 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc @@ -151,21 +151,3 @@ REGISTER_OPERATOR(elementwise_fmax, ops::ElementwiseOp, ops::ElementwiseFMaxGradOpMaker); REGISTER_OPERATOR(elementwise_fmax_grad, ops::ElementwiseOpGrad); - -REGISTER_OP_CPU_KERNEL( - elementwise_fmax, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel); -REGISTER_OP_CPU_KERNEL( - elementwise_fmax_grad, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - 
ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu index 123332a4a23..0d5f56fda17 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu @@ -86,21 +86,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMaxGradKernel, ops::ElementwiseMaxGradKernel); - -REGISTER_OP_CUDA_KERNEL( - elementwise_fmax, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel); -REGISTER_OP_CUDA_KERNEL( - elementwise_fmax_grad, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.h b/paddle/fluid/operators/elementwise/elementwise_max_op.h index cff30be50a3..afe1073d89a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.h @@ -35,21 +35,6 @@ class ElementwiseMaxKernel : public framework::OpKernel { } }; -template -class ElementwiseFMaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - - z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - FMaxFunctor(), z); - } -}; - template struct MaxGradDx { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { @@ -104,88 +89,5 @@ class ElementwiseMaxGradKernel : public ElemwiseGradKernel { } }; -template -struct FMaxGradDx { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast((x >= y) || isnan(y)); - } -}; - -template <> -struct FMaxGradDx { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - (x >= y) || paddle::platform::isnan(y)); - } -}; - -template <> -struct FMaxGradDx { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast((x >= y)); - } -}; - -template <> -struct FMaxGradDx { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast((x >= y)); - } -}; - -template -struct FMaxGradDy { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast(!((x >= y) || isnan(y))); - } -}; - -template <> -struct FMaxGradDy { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - !((x >= y) || paddle::platform::isnan(y))); - } -}; - -template <> -struct FMaxGradDy { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast(!((x >= y))); - } -}; - -template <> -struct FMaxGradDy { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast(!((x >= y))); - } -}; - -template -class ElementwiseFMaxGradKernel : public ElemwiseGradKernel { - public: - void Compute(const 
framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* out = dout; // Fake out, not used - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, FMaxGradDy>( - ctx, *x, *y, *out, *dout, axis, dx, dy, FMaxGradDx(), - FMaxGradDy()); - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc index 3a195199954..dad80a2c33f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc @@ -147,21 +147,3 @@ REGISTER_OPERATOR(elementwise_fmin, ops::ElementwiseOp, ops::ElementwiseFMinGradOpMaker); REGISTER_OPERATOR(elementwise_fmin_grad, ops::ElementwiseOpGrad); - -REGISTER_OP_CPU_KERNEL( - elementwise_fmin, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel); -REGISTER_OP_CPU_KERNEL( - elementwise_fmin_grad, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu index 5af985567d8..fb8bc9ac7f8 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu @@ -82,21 +82,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMinGradKernel, ops::ElementwiseMinGradKernel); - -REGISTER_OP_CUDA_KERNEL( - elementwise_fmin, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel); -REGISTER_OP_CUDA_KERNEL( - elementwise_fmin_grad, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.h b/paddle/fluid/operators/elementwise/elementwise_min_op.h index 88fb044d422..283ad2adde9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.h @@ -35,21 +35,6 @@ class ElementwiseMinKernel : public framework::OpKernel { } }; -template -class ElementwiseFMinKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - - z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - FMinFunctor(), z); - } -}; - template struct MinGradDx { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { @@ -124,89 +109,5 @@ class ElementwiseMinGradKernel : public ElemwiseGradKernel { ElementwiseMinGrad(ctx, x, y, out, dout, dx, dy); } }; - -template -struct FMinGradDx { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast((x <= y) || isnan(y)); - } -}; - -template <> -struct FMinGradDx { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, 
paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - (x <= y) || paddle::platform::isnan(y)); - } -}; - -template <> -struct FMinGradDx { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast((x <= y)); - } -}; - -template <> -struct FMinGradDx { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast((x <= y)); - } -}; - -template -struct FMinGradDy { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast(!((x <= y) || isnan(y))); - } -}; - -template <> -struct FMinGradDy { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - !((x <= y) || paddle::platform::isnan(y))); - } -}; - -template <> -struct FMinGradDy { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast(!((x <= y))); - } -}; - -template <> -struct FMinGradDy { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast(!((x <= y))); - } -}; - -template -class ElementwiseFMinGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* out = dout; // Fake out, not used - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, FMinGradDy>( - ctx, *x, *y, *out, *dout, axis, dx, dy, FMinGradDx(), - FMinGradDy()); - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index cd513e809fd..bf6ec012b24 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -259,3 +259,20 @@ PD_REGISTER_KERNEL(multiply_triple_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} +PD_REGISTER_KERNEL(elementwise_fmax_grad, + CPU, + ALL_LAYOUT, + phi::ElementwiseFMaxGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(elementwise_fmin_grad, + CPU, + ALL_LAYOUT, + phi::ElementwiseFMinGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc new file mode 100644 index 00000000000..37ad18df56e --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
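The phi registrations here (elementwise_fmax_grad / elementwise_fmin_grad on CPU, with the forward kernels following) back Paddle's fmax / fmin ops. Unlike maximum / minimum, these follow std::fmax / std::fmin semantics: when exactly one operand is NaN, the non-NaN operand is returned. A hedged illustration, assuming the Python-level paddle.fmax / paddle.fmin wrappers that sit on top of these ops:

    import paddle

    x = paddle.to_tensor([1.0, float('nan'), 3.0])
    y = paddle.to_tensor([2.0, 5.0, float('nan')])

    # NaN is ignored when the other operand is a number; only NaN-vs-NaN stays NaN.
    print(paddle.fmax(x, y).numpy())   # expected: [2. 5. 3.]
    print(paddle.fmin(x, y).numpy())   # expected: [1. 5. 3.]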
+ +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +PD_REGISTER_KERNEL(elementwise_fmax, + CPU, + ALL_LAYOUT, + phi::ElementwiseFMaxKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(elementwise_fmin, + CPU, + ALL_LAYOUT, + phi::ElementwiseFMinKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/elementwise_grad_kernel.h b/paddle/phi/kernels/elementwise_grad_kernel.h index 58ae11a9c42..fb2633cc9fc 100644 --- a/paddle/phi/kernels/elementwise_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_grad_kernel.h @@ -124,4 +124,22 @@ void MultiplyTripleGradKernel(const Context& dev_ctx, DenseTensor* d_ddx, DenseTensor* d_ddy); +template +void ElementwiseFMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad, + DenseTensor* y_grad); + +template +void ElementwiseFMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad, + DenseTensor* y_grad); + } // namespace phi diff --git a/paddle/phi/kernels/elementwise_kernel.h b/paddle/phi/kernels/elementwise_kernel.h new file mode 100644 index 00000000000..c1e73ad91c6 --- /dev/null +++ b/paddle/phi/kernels/elementwise_kernel.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void ElementwiseFMaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void ElementwiseFMinKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index b01d50015f0..f9e66836a62 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -159,6 +159,219 @@ struct DivGradYFunctor> { return -a * out_div_c_conj; } }; +// Fmin +template +struct FMinFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { + return std::fmin(a, b); + } +}; + +template <> +struct FMinFunctor { + inline HOSTDEVICE dtype::float16 operator()(const dtype::float16 a, + const dtype::float16 b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmin(float_a, float_b); + return static_cast(result); + } +}; + +template <> +struct FMinFunctor { + inline HOSTDEVICE int operator()(const int a, const int b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmin(float_a, float_b); + return std::lrint(result); + } +}; + +template <> +struct FMinFunctor { + inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const { + double double_a = static_cast(a); + double double_b = static_cast(b); + auto result = std::fmin(double_a, double_b); + return std::llrint(result); + } +}; + +// Fmax +template +struct FMaxFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { + return std::fmax(a, b); + } +}; + +template <> +struct FMaxFunctor { + inline HOSTDEVICE dtype::float16 operator()(const dtype::float16 a, + const dtype::float16 b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmax(float_a, float_b); + return static_cast(result); + } +}; + +template <> +struct FMaxFunctor { + inline HOSTDEVICE int operator()(const int a, const int b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmax(float_a, float_b); + return std::lrint(result); + } +}; + +template <> +struct FMaxFunctor { + inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const { + double double_a = static_cast(a); + double double_b = static_cast(b); + auto result = std::fmax(double_a, double_b); + return std::llrint(result); + } +}; + +template +struct FMaxGradDx { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast((x >= y) || isnan(y)); + } +}; + +template <> +struct FMaxGradDx { + HOSTDEVICE dtype::float16 operator()(dtype::float16 x, + dtype::float16 y, + dtype::float16 out, + dtype::float16 dout) const { + return dout * static_cast((x >= y) || dtype::isnan(y)); + } +}; + +template <> +struct FMaxGradDx { + HOSTDEVICE int operator()(int x, int y, int out, int dout) const { + return dout * static_cast((x >= y)); + } +}; + +template <> +struct FMaxGradDx { + HOSTDEVICE int64_t operator()(int64_t x, + int64_t y, + int64_t out, + int64_t dout) const { + return dout * static_cast((x >= y)); + } +}; + +template +struct FMaxGradDy { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast(!((x >= y) || isnan(y))); 
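The FMaxGradDx / FMaxGradDy functors added here split the upstream gradient between the two operands: x receives it wherever x >= y or y is NaN, and y receives it everywhere else (the FMin* counterparts below use x <= y). In numpy terms, roughly:

    import numpy as np

    def fmax_grad(x, y, dout):
        # Same routing rule as FMaxGradDx / FMaxGradDy.
        mask = (x >= y) | np.isnan(y)
        dx = dout * mask
        dy = dout * ~mask
        return dx, dy

    def fmin_grad(x, y, dout):
        # FMinGradDx / FMinGradDy flip the comparison.
        mask = (x <= y) | np.isnan(y)
        return dout * mask, dout * ~mask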
+ } +}; + +template <> +struct FMaxGradDy { + HOSTDEVICE dtype::float16 operator()(dtype::float16 x, + dtype::float16 y, + dtype::float16 out, + dtype::float16 dout) const { + return dout * static_cast(!((x >= y) || dtype::isnan(y))); + } +}; + +template <> +struct FMaxGradDy { + HOSTDEVICE int64_t operator()(int64_t x, + int64_t y, + int64_t out, + int64_t dout) const { + return dout * static_cast(!((x >= y))); + } +}; + +template <> +struct FMaxGradDy { + HOSTDEVICE int operator()(int x, int y, int out, int dout) const { + return dout * static_cast(!((x >= y))); + } +}; + +template +struct FMinGradDx { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast((x <= y) || isnan(y)); + } +}; + +template <> +struct FMinGradDx { + HOSTDEVICE dtype::float16 operator()(dtype::float16 x, + dtype::float16 y, + dtype::float16 out, + dtype::float16 dout) const { + return dout * static_cast((x <= y) || dtype::isnan(y)); + } +}; + +template <> +struct FMinGradDx { + HOSTDEVICE int operator()(int x, int y, int out, int dout) const { + return dout * static_cast((x <= y)); + } +}; + +template <> +struct FMinGradDx { + HOSTDEVICE int64_t operator()(int64_t x, + int64_t y, + int64_t out, + int64_t dout) const { + return dout * static_cast((x <= y)); + } +}; + +template +struct FMinGradDy { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * static_cast(!((x <= y) || isnan(y))); + } +}; + +template <> +struct FMinGradDy { + HOSTDEVICE dtype::float16 operator()(dtype::float16 x, + dtype::float16 y, + dtype::float16 out, + dtype::float16 dout) const { + return dout * static_cast(!((x <= y) || dtype::isnan(y))); + } +}; + +template <> +struct FMinGradDy { + HOSTDEVICE int operator()(int x, int y, int out, int dout) const { + return dout * static_cast(!((x <= y))); + } +}; + +template <> +struct FMinGradDy { + HOSTDEVICE int64_t operator()(int64_t x, + int64_t y, + int64_t out, + int64_t dout) const { + return dout * static_cast(!((x <= y))); + } +}; template struct MultiplyGradFunctor { diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index 81f7fac1088..c4481bf6ce3 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -282,3 +282,20 @@ PD_REGISTER_KERNEL(multiply_triple_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} +PD_REGISTER_KERNEL(elementwise_fmax_grad, + GPU, + ALL_LAYOUT, + phi::ElementwiseFMaxGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(elementwise_fmin_grad, + GPU, + ALL_LAYOUT, + phi::ElementwiseFMinGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/elementwise_kernel.cu b/paddle/phi/kernels/gpu/elementwise_kernel.cu new file mode 100644 index 00000000000..2cffc68fa06 --- /dev/null +++ b/paddle/phi/kernels/gpu/elementwise_kernel.cu @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +PD_REGISTER_KERNEL(elementwise_fmax, + GPU, + ALL_LAYOUT, + phi::ElementwiseFMaxKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(elementwise_fmin, + GPU, + ALL_LAYOUT, + phi::ElementwiseFMinKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 65427e87506..0b7a5d3bcb2 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -258,6 +258,102 @@ void DivideDoubleGradKernel(const Context& dev_ctx, dout_result.device(place) = static_cast(-1) * dout_result; } } +template +void ElementwiseFMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad, + DenseTensor* y_grad) { + funcs::ElementwiseGradPreProcess(out_grad, x_grad); + + auto out = out_grad; // Fake out, not used + auto x_dim = x.dims(); + auto y_dim = y.dims(); + if (x.dims() == y.dims()) { + funcs::ElemwiseGradComputeNoBroadcast, + funcs::FMaxGradDy>( + dev_ctx, + x_dim, + y_dim, + x, + y, + out, + out_grad, + axis, + x_grad, + y_grad, + funcs::FMaxGradDx(), + funcs::FMaxGradDy()); + } else { + funcs::ElemwiseGradComputeWithBroadcast, + funcs::FMaxGradDy>( + dev_ctx, + x_dim, + y_dim, + x, + y, + out, + out_grad, + axis, + x_grad, + y_grad, + funcs::FMaxGradDx(), + funcs::FMaxGradDy()); + } +} + +template +void ElementwiseFMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad, + DenseTensor* y_grad) { + funcs::ElementwiseGradPreProcess(out_grad, x_grad); + auto out = out_grad; // Fake out, not used + auto x_dim = x.dims(); + auto y_dim = y.dims(); + if (x.dims() == y.dims()) { + funcs::ElemwiseGradComputeNoBroadcast, + funcs::FMinGradDy>( + dev_ctx, + x_dim, + y_dim, + x, + y, + out, + out_grad, + axis, + x_grad, + y_grad, + funcs::FMinGradDx(), + funcs::FMinGradDy()); + } else { + funcs::ElemwiseGradComputeWithBroadcast, + funcs::FMinGradDy>( + dev_ctx, + x_dim, + y_dim, + x, + y, + out, + out_grad, + axis, + x_grad, + y_grad, + funcs::FMinGradDx(), + funcs::FMinGradDy()); + } +} template struct MulGradDX { diff --git a/paddle/phi/kernels/impl/elementwise_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_kernel_impl.h new file mode 100644 index 00000000000..775a91bf026 --- /dev/null +++ b/paddle/phi/kernels/impl/elementwise_kernel_impl.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
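The gradient functors above encode a winner-take-all convention: for fmax, the whole upstream gradient dout is routed to x whenever x >= y or y is NaN (fmax ignores a NaN operand) and to y otherwise, while fmin mirrors this with x <= y; the int and int64_t specializations simply drop the NaN test, since integers cannot be NaN. Below is a minimal standalone sketch of that rule in plain C++; the function name fmax_grad and the main() driver are illustrative and not part of the patch.

#include <cmath>
#include <cstdio>

// dx/dy receive all-or-nothing shares of dout, matching FMaxGradDx / FMaxGradDy.
static void fmax_grad(float x, float y, float dout, float* dx, float* dy) {
  const bool take_x = (x >= y) || std::isnan(y);
  *dx = dout * static_cast<float>(take_x);
  *dy = dout * static_cast<float>(!take_x);
}

int main() {
  float dx = 0.f, dy = 0.f;
  fmax_grad(2.f, 3.f, 1.f, &dx, &dy);            // y wins: dx = 0, dy = 1
  std::printf("dx=%g dy=%g\n", dx, dy);
  fmax_grad(2.f, std::nanf(""), 1.f, &dx, &dy);  // y is NaN, x wins: dx = 1, dy = 0
  std::printf("dx=%g dy=%g\n", dx, dy);
  return 0;
}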
+ +#pragma once + +#include "paddle/phi/kernels/elementwise_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#endif + +namespace phi { +template +void ElementwiseFMaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + dev_ctx.template Alloc(out); + funcs::ElementwiseCompute, T, T>( + dev_ctx, x, y, axis, funcs::FMaxFunctor(), out); +} + +template +void ElementwiseFMinKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + dev_ctx.template Alloc(out); + funcs::ElementwiseCompute, T, T>( + dev_ctx, x, y, axis, funcs::FMinFunctor(), out); +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index fc890fa3a49..1d2aaa04f05 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -114,6 +114,14 @@ KernelSignature ElementwiseDivGradOpArgumentMapping( {GradVarName("X"), GradVarName("Y")}); } +KernelSignature ElementwiseFMinGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("elementwise_fmin_grad", + {"X", "Y", GradVarName("Out")}, + {"axis"}, + {GradVarName("X"), GradVarName("Y")}); +} + KernelSignature ElementwiseDivDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("divide_double_grad", @@ -130,6 +138,14 @@ KernelSignature ElementwiseMulGradOpArgumentMapping( {GradVarName("X"), GradVarName("Y")}); } +KernelSignature ElementwiseFMaxGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("elementwise_fmax_grad", + {"X", "Y", GradVarName("Out")}, + {"axis"}, + {GradVarName("X"), GradVarName("Y")}); +} + KernelSignature ElementwiseMulDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("multiply_double_grad", @@ -192,3 +208,9 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_grad_grad, phi::ElementwiseMulDoubleGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_triple_grad, phi::ElementwiseMulTripleGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(elementwise_fmax_grad, + phi::ElementwiseFMaxGradOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(elementwise_fmin_grad, + phi::ElementwiseFMinGradOpArgumentMapping); -- GitLab From 2c21d24038093084512763f073ddbc9cfa8749fe Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 14 Mar 2022 17:59:44 +0800 Subject: [PATCH 041/176] fix gpu callback (#40445) * fix gpu conetxt callback * fix gpu callback * fix callback early destruct problem --- paddle/phi/backends/gpu/gpu_context.cc | 13 ++++++++++--- .../phi/kernels/funcs/concat_and_split_functor.cu | 2 ++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 09deb575f24..a3b25259858 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -654,10 +654,17 @@ struct GPUContext::Impl { } void AddStreamCallback(const std::function& callback) const { - // TODO(wilber): Do we need ThreadPool? - auto* func = new std::function([this, callback] { + // NOTE(zhiqiu): better use threadpool here, otherwise "std::async" may + // launch too + // many threads and result in thread oversubscription. 
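The crux of this fix is ownership: the callback is copied onto the heap, the asynchronous task captures the raw pointer, and a unique_ptr inside the task releases it only after invocation, so the std::function can no longer be destroyed before it runs, which was possible with the earlier deferred, capture-by-reference version. A minimal sketch of the same ownership-transfer pattern outside the GPU context follows; run_later and main are illustrative names, not Paddle code.

#include <cstdio>
#include <functional>
#include <future>
#include <memory>

// Copy the callback onto the heap so its lifetime is decoupled from the
// caller's stack frame; the async task deletes it after the call.
std::future<void> run_later(const std::function<void()>& callback) {
  auto* owned = new std::function<void()>(callback);
  return std::async(std::launch::async, [owned]() {
    std::unique_ptr<std::function<void()>> releaser(owned);
    (*owned)();
  });
}

int main() {
  auto fut = run_later([] { std::printf("callback ran\n"); });
  fut.wait();  // in the real context the future is stored and awaited later
  return 0;
}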
+ auto* callback_func = new std::function(std::move(callback)); + auto* func = new std::function([this, callback_func] { std::lock_guard lock(stream_call_back_mtx_); - last_future_ = std::async(std::launch::deferred, [&]() { callback(); }); + VLOG(4) << "Stream callback"; + last_future_ = std::async(std::launch::async, [callback_func]() { + std::unique_ptr> releaser(callback_func); + (*callback_func)(); + }); }); #ifdef PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 840c8872f50..06be592dd93 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -395,6 +395,8 @@ struct ConcatFunctor { auto* data_alloc_released = data_alloc.release(); auto* col_alloc_released = col_alloc.release(); context.AddStreamCallback([data_alloc_released, col_alloc_released] { + VLOG(4) << "Delete cuda pinned at " << data_alloc_released; + VLOG(4) << "Delete cuda pinned at " << col_alloc_released; paddle::memory::allocation::Allocator::AllocationDeleter( data_alloc_released); paddle::memory::allocation::Allocator::AllocationDeleter( -- GitLab From 9e1f762c33478a2feb64d038686a5bdbfe09332b Mon Sep 17 00:00:00 2001 From: Lijunhui <1578034415@qq.com> Date: Mon, 14 Mar 2022 19:36:58 +0800 Subject: [PATCH 042/176] Optimize bilinear_interp backward (#39423) * bilinear_bw init * optimize code * optimize * optimize 2 * optimize functions * modify func name --- paddle/fluid/operators/interpolate_v2_op.cu | 185 ++++++++++++-------- 1 file changed, 108 insertions(+), 77 deletions(-) diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index d61eb46d97e..cd297c53f89 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -61,13 +61,13 @@ inline platform::GpuLaunchConfig GetGpuLaunchConfig3D( template __forceinline__ __device__ void PreCalculatorForLinearInterpInputIndex( - int* in_img_idx, int* w_id, T* w1lambda, T* w2lambda, T src_w, - const int in_img_w) { - src_w = (src_w > 0) ? src_w : 0.f; - *in_img_idx = static_cast(src_w); - *w_id = (*in_img_idx < in_img_w - 1) ? 1 : 0; - *w1lambda = src_w - *in_img_idx; - *w2lambda = 1.f - *w1lambda; + int* in_img_idx, int* x_id, T* lambda1, T* lambda2, T src_x, + const int in_img_x) { + src_x = (src_x > 0) ? src_x : 0.f; + *in_img_idx = static_cast(src_x); + *x_id = (*in_img_idx < in_img_x - 1) ? 
1 : 0; + *lambda1 = src_x - *in_img_idx; + *lambda2 = 1.f - *lambda1; } struct FastDivModForInterpolate { @@ -670,83 +670,102 @@ __global__ void KeBilinearInterpBwShareMemory( } } +__device__ __forceinline__ int GetInputIndex(const size_t nc, const int height, + const int width, const int h, + const int w) { + return (nc * height + h) * width + w; +} + +template +__global__ void KeBilinearInterpNCHWBw(T* in, const int in_h, const int in_w, + const int out_h, const int out_w, + const int n, const int num_channels, + float ratio_h, float ratio_w, + const T* __restrict__ out, + const T align_type_value) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + int stride = blockDim.x * gridDim.x; + int num_out = n * num_channels * out_h * out_w; + int num_in = n * num_channels * in_h * in_w; + + for (; index < num_out; index += stride) { + int index_tmp = index; + int w2 = index_tmp % out_w; + index_tmp /= out_w; + int h2 = index_tmp % out_h; + int nc = index_tmp / out_h; + + int h1, y_id; + T h1lambda, h0lambda; + T src_y = ratio_h * (h2 + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex(&h1, &y_id, &h1lambda, &h0lambda, + src_y, in_h); + int w1, x_id; + T w1lambda, w0lambda; + T src_x = ratio_w * (w2 + align_type_value) - align_type_value; + PreCalculatorForLinearInterpInputIndex(&w1, &x_id, &w1lambda, &w0lambda, + src_x, in_w); + + T d2val = out[index]; + + platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1), + h0lambda * w0lambda * d2val); + platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1 + x_id), + h0lambda * w1lambda * d2val); + platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1), + h1lambda * w0lambda * d2val); + platform::CudaAtomicAdd( + in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1 + x_id), + h1lambda * w1lambda * d2val); + } +} + template __global__ void KeBilinearInterpBw(T* in, const int in_h, const int in_w, const T* __restrict__ out, const int out_h, const int out_w, const int n, - const int num_channels, float ratio_h, - float ratio_w, const T align_type_value, - bool is_nchw) { + const int out_chw, const int num_channels, + float ratio_h, float ratio_w, + const T align_type_value, + FastDivModForInterpolate divmods) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; int in_chw = in_h * in_w * num_channels; - int out_chw = num_channels * out_h * out_w; int nthreads = n * out_chw; - if (is_nchw) { - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / out_chw; - int out_id_w = tid % out_chw; - const int in_img_size = in_h * in_w; - const int out_img_size = out_h * out_w; - T value = out[out_id_h * out_chw + out_id_w]; - - int channel_id = out_id_w / out_img_size; - int out_img_idy = (out_id_w % out_img_size) / out_w; - int out_img_idx = tid % out_w; - int in_img_idx, in_img_idy, w_id, h_id; - T w1lambda, h1lambda, w2lambda, h2lambda; - - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_h); - - T* in_pos = &in[out_id_h * in_chw + channel_id * in_img_size + - in_img_idy * in_w + in_img_idx]; - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * value); - 
platform::CudaAtomicAdd(&in_pos[h_id * in_w], - h1lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[h_id * in_w + w_id], - h1lambda * w1lambda * value); - } - } else { - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / out_chw; - int out_id_w = tid % out_chw; - const int in_img_size = in_h * in_w; - const int out_img_size = out_h * out_w; - T value = out[out_id_h * out_chw + out_id_w]; - - int out_img_idy = out_id_w / (out_w * num_channels); - int out_img_idx = out_id_w % (out_w * num_channels) / num_channels; - int channel_id = tid % num_channels; - - int in_img_idx, in_img_idy, w_id, h_id; - T w1lambda, h1lambda, w2lambda, h2lambda; - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_h); - - T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + - in_img_idx * num_channels + channel_id]; - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - h2lambda * w1lambda * value); - platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], - h1lambda * w2lambda * value); - platform::CudaAtomicAdd( - &in_pos[h_id * in_w * num_channels + w_id * num_channels], - h1lambda * w1lambda * value); - } + for (; tid < nthreads; tid += stride) { + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; + + int channel_id = divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; + + int in_img_idx, in_img_idy, w_id, h_id; + T w1lambda, h1lambda, w2lambda, h2lambda; + T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; + T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, + &w2lambda, src_w, in_w); + PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, + &h2lambda, src_h, in_h); + + T value = out[tid]; + T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + + in_img_idx * num_channels + channel_id]; + platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); + platform::CudaAtomicAdd(&in_pos[w_id * num_channels], + h2lambda * w1lambda * value); + platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], + h1lambda * w2lambda * value); + platform::CudaAtomicAdd( + &in_pos[h_id * in_w * num_channels + w_id * num_channels], + h1lambda * w1lambda * value); } } @@ -1907,11 +1926,23 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, ctx.cuda_device_context().stream()>>>( input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, c, ratio_h, ratio_w, align_type_value, is_nchw); + } else if (!optimize_flag & is_nchw) { + // + const int num_kernels = n * c * out_h * out_w; + const int num_threads = + std::min(ctx.cuda_device_context().GetMaxThreadsPerBlock(), 1024); + KeBilinearInterpNCHWBw< + T><<>>( + input_grad_data, in_h, in_w, out_h, out_w, n, c, ratio_h, ratio_w, + output_grad_data, align_type_value); } else { + int64_t cw = c * out_w; + auto interp_divmods = 
FastDivModForInterpolate(c, out_chw, cw); KeBilinearInterpBw<<>>( - input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, c, - ratio_h, ratio_w, align_type_value, is_nchw); + input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w, align_type_value, interp_divmods); } } else if ("bicubic" == interp_method) { #ifdef __HIPCC__ -- GitLab From 5720537e84ef38c2c5c94786839491700edd64db Mon Sep 17 00:00:00 2001 From: crystal <62974595+Zjq9409@users.noreply.github.com> Date: Mon, 14 Mar 2022 19:39:24 +0800 Subject: [PATCH 043/176] optimize group_norm op backward (#39944) * optimize backwad * optimize group_norm backward * Add vectorized code * move assignment code * merge function * move code * optimize code * Modify function name --- paddle/fluid/operators/group_norm_op.cc | 4 + paddle/fluid/operators/group_norm_op.cu | 367 +++++++++++++++++++----- 2 files changed, 299 insertions(+), 72 deletions(-) diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 2d284fb516e..4331523d26e 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -167,9 +167,11 @@ class GroupNormGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { // check input + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GroupNormGrad"); OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "GroupNormGrad"); OP_INOUT_CHECK(ctx->HasInput("Variance"), "Input", "Variance", "GroupNormGrad"); + OP_INOUT_CHECK(ctx->HasInput("Mean"), "Input", "Mean", "GroupNormGrad"); OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input", framework::GradVarName("Y"), "GroupNormGrad"); @@ -216,10 +218,12 @@ class GroupNormGradMaker : public framework::SingleGradOpMaker { void Apply(GradOpPtr op) const override { op->SetType("group_norm_grad"); + op->SetInput("X", this->Input("X")); op->SetInput("Scale", this->Input("Scale")); op->SetInput("Bias", this->Input("Bias")); op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); op->SetInput("Y", this->Output("Y")); + op->SetInput("Mean", this->Output("Mean")); op->SetInput("Variance", this->Output("Variance")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index b376334f1e9..ab8c50d90b8 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -81,46 +81,74 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var); } -template -__device__ __forceinline__ void ThreadReduce(const T* input, int size, - const int offset, AccT* mean, - AccT* var) { +template +__device__ __forceinline__ void ThreadReduce(phi::Array arrs, + int size, const int offset, + AccT* out_mean, AccT* out_var) { + const T* x = arrs[0]; + const T* y; + if (Num == 2) { + y = arrs[1]; + } using VecT = kps::details::VectorType; int tid = threadIdx.x; if (offset > 0) { - input -= offset; + x -= offset; + if (Num == 2) { + y -= offset; + } size += offset; if (tid >= offset) { - AccT temp = input[tid]; - *mean += temp; - *var += temp * temp; + if (Num == 1) { + *out_mean += x[tid]; + *out_var += x[tid] * x[tid]; + } else if (Num == 2) { + *out_mean += y[tid]; + *out_var += y[tid] * x[tid]; + } } size -= blockDim.x; - input += blockDim.x; + x += blockDim.x; + if (Num == 2) { + y += blockDim.x; + } } int 
remain = size % (VecSize * blockDim.x); - T ins[VecSize]; - VecT* ins_vec = reinterpret_cast(&ins); + T ins_x[VecSize]; + T ins_y[VecSize]; + VecT* ins_vec_x = reinterpret_cast(&ins_x); + VecT* ins_vec_y = reinterpret_cast(&ins_y); // vector part for (; VecSize * tid < (size - remain); tid += blockDim.x) { - *ins_vec = reinterpret_cast(input)[tid]; + *ins_vec_x = reinterpret_cast(x)[tid]; + if (Num == 2) { + *ins_vec_y = reinterpret_cast(y)[tid]; + } #pragma unroll for (int i = 0; i < VecSize; ++i) { - AccT temp = ins[i]; - *mean += temp; - *var += temp * temp; + if (Num == 1) { + *out_mean += ins_x[i]; + *out_var += ins_x[i] * ins_x[i]; + } else if (Num == 2) { + *out_mean += ins_y[i]; + *out_var += ins_y[i] * ins_x[i]; + } } } // scalar part tid = size - remain + threadIdx.x; for (; tid < size; tid += blockDim.x) { - AccT temp = input[tid]; - *mean += temp; - *var += temp * temp; + if (Num == 1) { + *out_mean += x[tid]; + *out_var += x[tid] * x[tid]; + } else if (Num == 2) { + *out_mean += y[tid]; + *out_var += y[tid] * x[tid]; + } } } @@ -148,7 +176,10 @@ __global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var, AccT x_var = static_cast(0); const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); x += i * size; - ThreadReduce(x, size, input_offset, &x_mean, &x_var); + phi::Array ins; + ins[0] = x; + ThreadReduce(ins, size, input_offset, &x_mean, &x_var); + x_mean = kps::details::BlockXReduce>( x_mean, kps::AddFunctor()); x_var = kps::details::BlockXReduce>( @@ -310,10 +341,12 @@ class GroupNormKernel }; template -__global__ void GroupNormBackwardGetMeanAndVar( - const T* x, const T* scale, const T* bias, const T* d_y, int N, int C, - int W, int imsize, int groups, int group_size, T epsilon, T* d_mean, - T* d_var, T* d_scale, T* d_bias, const DataLayout data_layout) { +__global__ void GroupNormBackwardGetMeanAndVar(const T* x, const T* scale, + const T* bias, const T* d_y, + int N, int C, int W, int imsize, + int groups, int group_size, + T epsilon, T* d_mean, T* d_var, + T* d_scale, T* d_bias) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; @@ -329,15 +362,11 @@ __global__ void GroupNormBackwardGetMeanAndVar( for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T val, dval; - if (data_layout == DataLayout::kNCHW) { - val = x[(bid * C + ccid) * imsize + imid] - x_bias; - dval = d_y[(bid * C + ccid) * imsize + imid]; - } else { - int hid = imid / W; - int wid = imid % W; - val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias; - dval = d_y[(bid * H + hid) * W * C + wid * C + ccid]; - } + + int hid = imid / W; + int wid = imid % W; + val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias; + dval = d_y[(bid * H + hid) * W * C + wid * C + ccid]; d_var_data += val * dval; d_mean_data += dval * x_scale; @@ -357,8 +386,7 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale, const T* bias, const T* var, const T* d_mean, const T* d_var, int N, int C, int W, int imsize, int groups, int group_size, - T epsilon, T* d_x, - const DataLayout data_layout) { + T epsilon, T* d_x) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; @@ -379,26 +407,142 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale, if (x_scale != 0) x_scale_inv = 1.0 / x_scale; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { - if (data_layout == DataLayout::kNCHW) { - T tmp = x[(bid * C + ccid) * imsize + imid]; - T v_y = (tmp - x_bias) * x_scale_inv; - T dly = d_y[(bid * C + ccid) * 
imsize + imid]; - d_x[(bid * C + ccid) * imsize + imid] = - x_var_inv * - (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); - } else { - int hid = imid / W; - int wid = imid % W; - T tmp = x[(bid * H + hid) * W * C + wid * C + ccid]; - T v_y = (tmp - x_bias) * x_scale_inv; - T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid]; - d_x[(bid * H + hid) * W * C + wid * C + ccid] = - x_var_inv * - (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); + int hid = imid / W; + int wid = imid % W; + T tmp = x[(bid * H + hid) * W * C + wid * C + ccid]; + T v_y = (tmp - x_bias) * x_scale_inv; + T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid]; + d_x[(bid * H + hid) * W * C + wid * C + ccid] = + x_var_inv * + (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); + } +} + +template +__global__ void VectorizedGetDsDbCUDAKernel(int imsize, const T* x, const T* dy, + T* ds, T* db) { + int i = blockIdx.x; + AccT ds_sum = static_cast(0); + AccT db_sum = static_cast(0); + const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); + x += i * imsize; + + phi::Array ins; + ins[0] = x; + ins[1] = dy; + ThreadReduce(ins, imsize, input_offset, &db_sum, + &ds_sum); + + ds_sum = kps::details::BlockXReduce>( + ds_sum, kps::AddFunctor()); + db_sum = kps::details::BlockXReduce>( + db_sum, kps::AddFunctor()); + __syncthreads(); + if (threadIdx.x == 0) { + ds[i] = ds_sum; + db[i] = db_sum; + } +} + +template +__global__ void ScalarGetDsDbCUDAKernel(int imsize, const T* x, const T* dy, + T* ds, T* db) { + const int nc = blockIdx.x; + T ds_sum = 0; + T db_sum = 0; + for (int i = threadIdx.x; i < imsize; i += blockDim.x) { + const int index = nc * imsize + i; + ds_sum += dy[index] * x[index]; + db_sum += dy[index]; + } + CudaAtomicAddWithWarp(&ds[nc], ds_sum); + CudaAtomicAddWithWarp(&db[nc], db_sum); +} + +template +__global__ void GetScaleBiasGradientCUDAKernel(int N, int C, int group, + T epsilon, const T* mean, + const T* var, const T* ds, + const T* db, T* d_scale, + T* d_bias) { + const int c = blockIdx.x * blockDim.x + threadIdx.x; + if (c < C) { + const int G = group; + const int D = C / G; + T sum1 = 0; + T sum2 = 0; + for (int n = 0; n < N; ++n) { + const int nc = n * C + c; + const int ng = n * G + c / D; + sum1 += (d_scale == nullptr) + ? T(0) + : ((ds[nc] - db[nc] * static_cast(mean[ng])) * + static_cast(rsqrt(var[ng] + epsilon))); + sum2 += (d_bias == nullptr) ? T(0) : db[nc]; + } + if (d_scale != nullptr) { + d_scale[c] = sum1; + } + if (d_bias != nullptr) { + d_bias[c] = sum2; } } } +template +__global__ void GetBackwardParamsCUDAKernel(int imsize, int groups, + int group_size, T epsilon, + const T* mean, const T* var, + const T* scale, const T* ds, + const T* db, T* p1, T* p2, T* p3) { + const int n = blockIdx.x; + const int g = blockIdx.y; + const int ng = n * groups + g; + T sum1 = 0; + T sum2 = 0; + T var_inv = rsqrt(var[ng] + epsilon); + for (int64_t i = threadIdx.x; i < group_size; i += blockDim.x) { + const int64_t index = ng * group_size + i; + const int64_t c = g * group_size + i; + const T scale_v = scale == nullptr ? T(1) : static_cast(scale[c]); + sum1 += ds[index] * scale_v; + sum2 += db[index] * scale_v; + const T scale_c = scale == nullptr ? 
T(0) : static_cast(scale[c]); + p1[index] = scale_c * var_inv; + } + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + sum1 = BlockReduce(ds_storage).Reduce(sum1, cub::Sum()); + sum2 = BlockReduce(db_storage).Reduce(sum2, cub::Sum()); + + if (threadIdx.x == 0) { + const T s = T(1) / static_cast(group_size * imsize); + const T x = (sum2 * static_cast(mean[ng]) - sum1) * + static_cast(var_inv) * static_cast(var_inv) * + static_cast(var_inv) * s; + p2[ng] = x; + p3[ng] = -x * static_cast(mean[ng]) - sum2 * static_cast(var_inv) * s; + } +} + +template +__global__ void GetXGradientCUDAKernel(int imsize, int C, int group_size, + int groups, T* p1, T* p2, T* p3, + const T* x, const T* dy, T* dx) { + int cid = blockIdx.x; + int gid = blockIdx.y; + int bid = blockIdx.z; + int ccid = bid * C + gid * group_size + cid; + int ng = bid * groups + gid; + int nc = gid * group_size + cid; + for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { + int index = (bid * C + nc) * imsize + imid; + dx[index] = p1[ccid] * dy[index] + p2[ng] * x[index] + p3[ng]; + } +} + template class GroupNormGradKernel : public framework::OpKernel { @@ -408,7 +552,9 @@ class GroupNormGradKernel const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); const float epsilon = ctx.Attr("epsilon"); - auto* x = ctx.Input("Y"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* mean = ctx.Input("Mean"); auto* var = ctx.Input("Variance"); auto* scale = ctx.Input("Scale"); auto* bias = ctx.Input("Bias"); @@ -433,31 +579,27 @@ class GroupNormGradKernel phi::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); - Tensor temp_var; - temp_var.mutable_data(var->dims(), ctx.GetPlace()); - set_zero(dev_ctx, &temp_var, static_cast(0)); - T* temp_var_data = temp_var.data(); - - Tensor temp_mean; - temp_mean.mutable_data(var->dims(), ctx.GetPlace()); - set_zero(dev_ctx, &temp_mean, static_cast(0)); - T* temp_mean_data = temp_mean.data(); + Tensor ds, db; + ds.mutable_data({x_dims[0], C}, ctx.GetPlace()); + db.mutable_data({x_dims[0], C}, ctx.GetPlace()); + T* ds_data = ds.data(); + T* db_data = db.data(); + auto* y_data = y->data(); auto* x_data = x->data(); T* d_x_data = nullptr; if (d_x) d_x_data = d_x->data(); - auto* y_data = d_y->data(); + auto* dy_data = d_y->data(); auto* var_data = var->data(); + auto* mean_data = mean->data(); T* d_scale_data = nullptr; if (d_scale) { d_scale->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_scale, static_cast(0)); d_scale_data = d_scale->data(); } T* d_bias_data = nullptr; if (d_bias) { d_bias->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_bias, static_cast(0)); d_bias_data = d_bias->data(); } @@ -479,22 +621,103 @@ class GroupNormGradKernel #ifdef __HIPCC__ int block_size = std::max(std::min(256, imsize), 64); + const int block_dims = 256; #else int block_size = std::min(1024, imsize); + const int block_dims = 1024; #endif dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); int flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; - UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, x_data, scale_data, - bias_data, y_data, x_dims[0], C, W, imsize, groups, - group_size, epsilon, temp_mean_data, temp_var_data, - d_scale_data, d_bias_data, data_layout); - if (d_x_data != nullptr) { - UNROLL_ALL_CASES(flags, GroupNormBackward, x_data, y_data, scale_data, - bias_data, 
var_data, temp_mean_data, temp_var_data, - x_dims[0], C, W, imsize, groups, group_size, epsilon, - d_x_data, data_layout); + if (data_layout == DataLayout::kNCHW) { + using AccT = typename details::MPTypeTrait::Type; + constexpr int vec_size = sizeof(float4) / sizeof(T); + const int max_num_threads = 1024; + int max_block_size = std::min(imsize / vec_size, max_num_threads); + int block_size_nchw = 1; + while (block_size_nchw < max_block_size) { + block_size_nchw *= 2; + } + block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); + dim3 blocks(block_size_nchw); + if (imsize < vec_size) { + if (d_scale) { + set_zero(dev_ctx, d_scale, static_cast(0)); + } + if (d_bias) { + set_zero(dev_ctx, d_bias, static_cast(0)); + } + ScalarGetDsDbCUDAKernel< + T><<>>( + imsize, x_data, dy_data, ds_data, db_data); + } else { + VectorizedGetDsDbCUDAKernel< + T, AccT, vec_size><<>>( + imsize, x_data, dy_data, ds_data, db_data); + } + + if (d_scale || d_bias) { + const int block = 256; + GetScaleBiasGradientCUDAKernel< + T><<<(C + block - 1) / block, block, 0, dev_ctx.stream()>>>( + x_dims[0], C, groups, epsilon, mean_data, var_data, ds_data, + db_data, d_scale_data, d_bias_data); + } + + if (d_x_data != nullptr) { + // p1 * dy + p2 * x + p3, + // p1, p2, p3 represent the reverse calculation of temporary variables + // p1 = scale * var_inv + // p2 = (db * scale * mean - ds * scale) * pow(var_inv, 3) * (1/n) + // p3 = -p2 * mean[ng] - db * scale * var_inv * (1/n); + Tensor p1, p2, p3; + p1.mutable_data({x_dims[0] * C}, ctx.GetPlace()); + p2.mutable_data({x_dims[0], groups}, ctx.GetPlace()); + p3.mutable_data({x_dims[0], groups}, ctx.GetPlace()); + T* p1_data = p1.data(); + T* p2_data = p2.data(); + T* p3_data = p3.data(); + + GetBackwardParamsCUDAKernel<<< + dim3(x_dims[0], groups), block_dims, 0, dev_ctx.stream()>>>( + imsize, groups, group_size, epsilon, mean_data, var_data, + scale_data, ds_data, db_data, p1_data, p2_data, p3_data); + GetXGradientCUDAKernel<<>>( + imsize, C, group_size, groups, p1_data, p2_data, p3_data, x_data, + dy_data, d_x_data); + } + + } else { + if (d_scale) { + set_zero(dev_ctx, d_scale, static_cast(0)); + } + if (d_bias) { + set_zero(dev_ctx, d_bias, static_cast(0)); + } + + Tensor temp_var; + temp_var.mutable_data(var->dims(), ctx.GetPlace()); + set_zero(dev_ctx, &temp_var, static_cast(0)); + T* temp_var_data = temp_var.data(); + + Tensor temp_mean; + temp_mean.mutable_data(var->dims(), ctx.GetPlace()); + set_zero(dev_ctx, &temp_mean, static_cast(0)); + T* temp_mean_data = temp_mean.data(); + + int flags = (scale_data != nullptr) * kHasScale + + (bias_data != nullptr) * kHasBias; + UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, y_data, + scale_data, bias_data, dy_data, x_dims[0], C, W, imsize, + groups, group_size, epsilon, temp_mean_data, + temp_var_data, d_scale_data, d_bias_data); + if (d_x_data != nullptr) { + UNROLL_ALL_CASES(flags, GroupNormBackward, y_data, dy_data, scale_data, + bias_data, var_data, temp_mean_data, temp_var_data, + x_dims[0], C, W, imsize, groups, group_size, epsilon, + d_x_data); + } } } }; -- GitLab From 1f7b25160387552f71f9913afe015fbfe1438878 Mon Sep 17 00:00:00 2001 From: fwenguang <95677191+fwenguang@users.noreply.github.com> Date: Mon, 14 Mar 2022 19:48:54 +0800 Subject: [PATCH 044/176] [MLU] add merged_momentum mlu kernel (#40406) --- .../operators/controlflow/compare_op_mlu.cc | 2 +- .../optimizers/merged_momentum_op_mlu.cc | 163 ++++++++ .../mlu/test_merged_momentum_op_mlu.py | 373 ++++++++++++++++++ 3 files changed, 537 
insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py diff --git a/paddle/fluid/operators/controlflow/compare_op_mlu.cc b/paddle/fluid/operators/controlflow/compare_op_mlu.cc index 9dc287ab76a..c39743ef991 100644 --- a/paddle/fluid/operators/controlflow/compare_op_mlu.cc +++ b/paddle/fluid/operators/controlflow/compare_op_mlu.cc @@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc new file mode 100644 index 00000000000..e5399ee36ba --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class MLUMergedMomentumOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto params = ctx.MultiInput("Param"); + auto params_out = ctx.MultiOutput("ParamOut"); + size_t n = params.size(); + PADDLE_ENFORCE_EQ(n, params_out.size(), + platform::errors::InvalidArgument( + "The size of Output(ParamOut) must be equal to " + "Input(Param), but got the size of Output(ParamOut) " + "is %d, the size of Input(Param) is %d.", + params_out.size(), n)); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ(params[i], params_out[i], + platform::errors::InvalidArgument( + "The size of Input(Param) and Output(ParamOut) " + "must be the same Tensors.")); + } + + auto grads = ctx.MultiInput("Grad"); + PADDLE_ENFORCE_EQ( + n, grads.size(), + platform::errors::InvalidArgument( + "The size of Input(Grad) must be equal to Input(Param), but got " + "the size of Input(Grad) is %d, the size of Input(Param) is %d.", + grads.size(), n)); + + auto velocitys = ctx.MultiInput("Velocity"); + PADDLE_ENFORCE_EQ(n, velocitys.size(), + platform::errors::InvalidArgument( + "The size of Input(Velocity) must be equal to " + "Input(Param), but got the size of Input(Velocity) " + "is %d, the size of Input(Param) is %d.", + velocitys.size(), n)); + + auto velocitys_out = ctx.MultiOutput("VelocityOut"); + PADDLE_ENFORCE_EQ( + n, velocitys_out.size(), + platform::errors::InvalidArgument( + "The size of Output(VelocityOut) must be " + "equal to Input(Param), but got the size of Output(VelocityOut) is " + "%d, the size of Input(Param) is %d.", + 
velocitys_out.size(), n)); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ(velocitys[i], velocitys_out[i], + platform::errors::InvalidArgument( + "Input(Velocity) and Output(VelocityOut) must be " + "the same Tensors.")); + } + + auto mu = ctx.Attr("mu"); + auto lrs = ctx.MultiInput("LearningRate"); + if (lrs.size() != 1) { + PADDLE_ENFORCE_EQ( + n, lrs.size(), + platform::errors::InvalidArgument( + "If the size of Input(LearningRate) is not 1, the size of " + "Input(LearningRate) must be " + "equal to Input(Param), but got the size of Input(LearningRate) " + "is %d, the size of Input(Param) is %d.", + lrs.size(), n)); + } + auto use_nesterov = ctx.Attr("use_nesterov"); + auto regularization_methods = + ctx.Attr>("regularization_method"); + auto regularization_coeffs = + ctx.Attr>("regularization_coeff"); + if (regularization_methods.size() != 0) { + PADDLE_ENFORCE_EQ( + n, regularization_methods.size(), + platform::errors::InvalidArgument( + "The size of Attr(regularization_method) must be equal " + "to Input(Param), but got the size of " + "Attr(regularization_method) is %d, the size of Input(Param) is " + "%d.", + regularization_methods.size(), n)); + PADDLE_ENFORCE_EQ( + n, regularization_coeffs.size(), + platform::errors::InvalidArgument( + "The size of Attr(regularization_coeff) must be equal " + "to Input(Param), but got the size of Attr(regularization_coeff) " + "is %d, the size of Input(Param) is %d.", + regularization_coeffs.size(), n)); + } + + VLOG(5) << "use_nesterov: " << use_nesterov + << ", regularization_methods.size(): " + << regularization_methods.size() + << ", regularization_coeffs.size(): " + << regularization_coeffs.size(); + + auto& dev_ctx = ctx.template device_context(); + + Tensor mu_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); + MLUCnnlTensorDesc mu_tensor_desc(mu_tensor); + MLUCnnl::Fill(ctx, mu, mu_tensor_desc.get(), GetBasePtr(&mu_tensor)); + + for (size_t idx = 0; idx < n; ++idx) { + RegularizationType regularization_flag = + regularization_methods.size() > 0 && + regularization_methods[idx] == "l2_decay" + ? RegularizationType::kL2DECAY + : RegularizationType::kNONE; + T regularization_coeff = static_cast(0.0); + if (regularization_coeffs.size() != 0) { + regularization_coeff = static_cast(regularization_coeffs[idx]); + } + + auto learning_rate = lrs.size() > 1 ? 
lrs[idx] : lrs[0]; + auto param_out = params_out[idx]; + auto velocity_out = velocitys_out[idx]; + + auto grad = grads[idx]; + Tensor regularized_grad; + MLUCnnlTensorDesc param_desc(*param_out); + if (regularization_flag == RegularizationType::kL2DECAY) { + regularized_grad = ctx.AllocateTmpTensor( + param_out->dims(), dev_ctx); + MLUCnnlOpTensorDesc op_tensor_desc( + CNNL_OP_TENSOR_ADD, ToCnnlDataType(), CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), param_desc.get(), + GetBasePtr(param_out), param_desc.get(), + GetBasePtr(grad), param_desc.get(), + GetBasePtr(®ularized_grad), ToCnnlDataType(), + regularization_coeff); + } else { + regularized_grad = *grad; + } + MLUCnnl::ApplyMomentum(ctx, param_desc.get(), + GetBasePtr(®ularized_grad), use_nesterov, + GetBasePtr(learning_rate), GetBasePtr(&mu_tensor), + GetBasePtr(param_out), GetBasePtr(velocity_out)); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_MLU_KERNEL(merged_momentum, ops::MLUMergedMomentumOpKernel, + ops::MLUMergedMomentumOpKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py new file mode 100644 index 00000000000..f3699da15b5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_merged_momentum_op_mlu.py @@ -0,0 +1,373 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
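For reference while reading the tests below: per parameter, the kernel above first builds a regularized gradient (grad plus coeff * param when l2_decay is selected) and then applies the momentum step through MLUCnnl::ApplyMomentum. The following standalone C++ sketch assumes the standard Paddle momentum semantics for that step; the helper name momentum_step is illustrative.

#include <cstddef>
#include <vector>

// One parameter tensor's update: velocity = mu * velocity + g, then
// param -= lr * (g + mu * velocity) with Nesterov, else lr * velocity,
// where g is the (optionally L2-regularized) gradient.
void momentum_step(std::vector<float>* param, const std::vector<float>& grad,
                   std::vector<float>* velocity, float lr, float mu,
                   float l2_coeff, bool use_nesterov) {
  for (std::size_t i = 0; i < param->size(); ++i) {
    const float g = grad[i] + l2_coeff * (*param)[i];
    (*velocity)[i] = mu * (*velocity)[i] + g;
    (*param)[i] -= use_nesterov ? lr * (g + mu * (*velocity)[i])
                                : lr * (*velocity)[i];
  }
}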
+ +import sys +sys.path.append('..') +import unittest +import paddle +import numpy as np +from paddle.fluid.layer_helper import LayerHelper +from collections import OrderedDict + + +def run_momentum_op(params, + grads, + velocitys, + master_params, + learning_rate, + place, + multi_precision, + mu=0.9, + rescale_grad=0.01, + use_merged=False): + assert len(params) == len(grads) + assert len(params) == len(velocitys) + if multi_precision: + assert len(params) == len(master_params) + op_type = 'merged_momentum' if use_merged else 'momentum' + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + helper = LayerHelper(op_type, **locals()) + attrs = { + 'mu': mu, + 'multi_precision': multi_precision, + 'rescale_grad': rescale_grad, + } + + param_vars = [ + helper.create_variable( + persistable=True, shape=p.shape, dtype=p.dtype) for p in params + ] + grad_vars = [ + helper.create_variable( + shape=g.shape, dtype=g.dtype) for g in grads + ] + velocity_vars = [ + helper.create_variable( + persistable=True, shape=v.shape, dtype=v.dtype) + for v in velocitys + ] + lr_var = helper.create_variable( + persistable=True, + shape=learning_rate.shape, + dtype=learning_rate.dtype) + + feed_dict = OrderedDict() + + feed_dict.update( + OrderedDict([(p_var.name, p_val) + for p_var, p_val in zip(param_vars, params)])) + feed_dict.update( + OrderedDict([(v_var.name, v_val) + for v_var, v_val in zip(velocity_vars, velocitys)])) + fetch_list = list(feed_dict.keys()) + + feed_dict.update( + OrderedDict([(g_var.name, g_val) + for g_var, g_val in zip(grad_vars, grads)])) + feed_dict.update({lr_var.name: learning_rate}) + + if multi_precision: + master_param_vars = [ + helper.create_variable( + persistable=True, shape=p.shape, dtype=p.dtype) + for p in master_params + ] + feed_dict.update( + OrderedDict([(mp_var.name, mp_val) + for mp_var, mp_val in zip(master_param_vars, + master_params)])) + # CPUPlace does not use MasterParam + if isinstance(place, paddle.CUDAPlace): + fetch_list = fetch_list + [ + mp_var.name for mp_var in master_param_vars + ] + else: + master_param_vars = None + + if not use_merged: + for i, (p, g, + v) in enumerate(zip(param_vars, grad_vars, velocity_vars)): + inputs = { + 'Param': p, + 'Grad': g, + 'Velocity': v, + 'LearningRate': lr_var, + } + outputs = {'ParamOut': p, 'VelocityOut': v} + if multi_precision: + inputs['MasterParam'] = master_param_vars[i] + outputs['MasterParamOut'] = master_param_vars[i] + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + else: + inputs = { + 'Param': param_vars, + 'Grad': grad_vars, + 'Velocity': velocity_vars, + 'LearningRate': lr_var, + } + outputs = {'ParamOut': param_vars, 'VelocityOut': velocity_vars} + if multi_precision: + inputs['MasterParam'] = master_param_vars + outputs['MasterParamOut'] = master_param_vars + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + + exe = paddle.static.Executor(place) + with paddle.static.scope_guard(paddle.static.Scope()): + exe.run(startup) + return exe.run(main, feed=feed_dict, fetch_list=fetch_list) + + +def run_momentum_op2(params, + grads, + velocitys, + master_params, + learning_rate, + place, + multi_precision, + mu=0.9, + rescale_grad=0.01, + use_merged=False, + use_nesterov=True): + assert len(params) == len(grads) + assert len(params) == len(velocitys) + if multi_precision: + assert len(params) == len(master_params) + op_type = 'merged_momentum' if use_merged else 'momentum' + main 
= paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + helper = LayerHelper(op_type, **locals()) + + param_vars = [ + helper.create_variable( + persistable=True, shape=p.shape, dtype=p.dtype) for p in params + ] + grad_vars = [ + helper.create_variable( + shape=g.shape, dtype=g.dtype) for g in grads + ] + velocity_vars = [ + helper.create_variable( + persistable=True, shape=v.shape, dtype=v.dtype) + for v in velocitys + ] + lr_var = helper.create_variable( + persistable=True, + shape=learning_rate.shape, + dtype=learning_rate.dtype) + + feed_dict = OrderedDict() + + feed_dict.update( + OrderedDict([(p_var.name, p_val) + for p_var, p_val in zip(param_vars, params)])) + feed_dict.update( + OrderedDict([(v_var.name, v_val) + for v_var, v_val in zip(velocity_vars, velocitys)])) + fetch_list = list(feed_dict.keys()) + + feed_dict.update( + OrderedDict([(g_var.name, g_val) + for g_var, g_val in zip(grad_vars, grads)])) + feed_dict.update({lr_var.name: learning_rate}) + + if multi_precision: + master_param_vars = [ + helper.create_variable( + persistable=True, shape=p.shape, dtype=p.dtype) + for p in master_params + ] + feed_dict.update( + OrderedDict([(mp_var.name, mp_val) + for mp_var, mp_val in zip(master_param_vars, + master_params)])) + # CPUPlace does not use MasterParam + if isinstance(place, paddle.CUDAPlace): + fetch_list = fetch_list + [ + mp_var.name for mp_var in master_param_vars + ] + else: + master_param_vars = None + + if not use_merged: + for i, (p, g, + v) in enumerate(zip(param_vars, grad_vars, velocity_vars)): + inputs = { + 'Param': p, + 'Grad': g, + 'Velocity': v, + 'LearningRate': lr_var, + } + outputs = {'ParamOut': p, 'VelocityOut': v} + if multi_precision: + inputs['MasterParam'] = master_param_vars[i] + outputs['MasterParamOut'] = master_param_vars[i] + attrs = { + 'mu': mu, + 'multi_precision': multi_precision, + 'rescale_grad': rescale_grad, + 'use_nesterov': use_nesterov, + 'regularization_method': 'l2_decay', + 'regularization_coeff': 2.0, + } + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + else: + inputs = { + 'Param': param_vars, + 'Grad': grad_vars, + 'Velocity': velocity_vars, + 'LearningRate': lr_var, + } + outputs = {'ParamOut': param_vars, 'VelocityOut': velocity_vars} + if multi_precision: + inputs['MasterParam'] = master_param_vars + outputs['MasterParamOut'] = master_param_vars + attrs = { + 'mu': mu, + 'multi_precision': multi_precision, + 'rescale_grad': rescale_grad, + 'use_nesterov': use_nesterov, + 'regularization_method': + ['l2_decay' for i in range(len(param_vars))], + 'regularization_coeff': [2.0 for i in range(len(param_vars))], + } + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs) + + exe = paddle.static.Executor(place) + with paddle.static.scope_guard(paddle.static.Scope()): + exe.run(startup) + return exe.run(main, feed=feed_dict, fetch_list=fetch_list) + + +class TestMergedMomentum(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]] + self.seed = 10 + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def gen_rand_data(self, shapes, dtype): + return [np.random.random(s).astype(dtype) for s in shapes] + + def prepare_data(self, shapes, multi_precision, seed, place): + np.random.seed(seed) + mp_dtype = np.float32 + dtype = np.float32 + params = self.gen_rand_data(shapes, dtype) + grads = self.gen_rand_data(shapes, dtype) + velocitys = 
self.gen_rand_data(shapes, mp_dtype) + learning_rate = self.gen_rand_data([[1]], mp_dtype)[0] + if multi_precision: + master_params = [p.astype(mp_dtype) for p in params] + else: + master_params = None + return params, grads, velocitys, master_params, learning_rate + + def check_with_place(self, place, multi_precision): + params, grads, velocitys, master_params, learning_rate = self.prepare_data( + self.shapes, multi_precision, self.seed, place) + + def run_op(use_merged): + # MLU Momentum Op does not support rescale_grad + rescale_grad = 1.0 + return run_momentum_op( + params, + grads, + velocitys, + master_params, + learning_rate, + place, + multi_precision, + rescale_grad=rescale_grad, + use_merged=use_merged) + + outs1 = run_op(True) + outs2 = run_op(False) + self.assertEqual(len(outs1), len(outs2)) + for i, (out1, out2) in enumerate(zip(outs1, outs2)): + self.assertTrue(np.allclose(out1, out2, atol=1e-7)) + + def test_main(self): + self.check_with_place(self.place, multi_precision=False) + + +class TestMergedMomentum2(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]] + self.seed = 10 + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + + def gen_rand_data(self, shapes, dtype): + return [np.random.random(s).astype(dtype) for s in shapes] + + def prepare_data(self, shapes, multi_precision, seed, place): + np.random.seed(seed) + mp_dtype = np.float32 + dtype = np.float32 # np.float16 + params = self.gen_rand_data(shapes, dtype) + grads = self.gen_rand_data(shapes, dtype) + velocitys = self.gen_rand_data(shapes, mp_dtype) + learning_rate = self.gen_rand_data([[1]], mp_dtype)[0] + if multi_precision: + master_params = [p.astype(mp_dtype) for p in params] + else: + master_params = None + return params, grads, velocitys, master_params, learning_rate + + def check_with_place(self, place, multi_precision): + params, grads, velocitys, master_params, learning_rate = self.prepare_data( + self.shapes, multi_precision, self.seed, place) + + def run_op(use_nesterov, use_merged): + # MLU Momentum Op does not support rescale_grad + rescale_grad = 1.0 + return run_momentum_op2( + params, + grads, + velocitys, + master_params, + learning_rate, + place, + multi_precision, + rescale_grad=rescale_grad, + use_merged=use_merged, + use_nesterov=use_nesterov) + + outs1 = run_op(use_nesterov=True, use_merged=True) + outs2 = run_op(use_nesterov=True, use_merged=False) + self.assertEqual(len(outs1), len(outs2)) + for i, (out1, out2) in enumerate(zip(outs1, outs2)): + self.assertTrue(np.allclose(out1, out2, atol=1e-7)) + + outs3 = run_op(use_nesterov=False, use_merged=True) + outs4 = run_op(use_nesterov=False, use_merged=False) + self.assertEqual(len(outs3), len(outs4)) + for j, (out3, out4) in enumerate(zip(outs3, outs4)): + self.assertTrue(np.allclose(out3, out4, atol=1e-7)) + + def test_main(self): + self.check_with_place(self.place, multi_precision=False) + + +if __name__ == "__main__": + unittest.main() -- GitLab From 3f219160bee15a3afa7107439197361f8266dc57 Mon Sep 17 00:00:00 2001 From: Tomasz Socha Date: Mon, 14 Mar 2022 12:53:02 +0100 Subject: [PATCH 045/176] Add an elementwise + activation fusion pass. 
(#36541) * Add elementwise add and activation fuse pass * Fix copy ellision * More flexible pattern detector * More flexible fusion pass * Update lists for pass * Add support for Pow operator * Add support for more activation types * Style * Rename fusion pass * First version of tests * Dirty version of pass * Polished version * Update pbtxt * Style * Update names * Style * Use PADDLE_ENFORCE_EQ * Save error message to variable * WO for error checks * CR * Static style check * Add missing 'activation_scale' attribute * Add relu6 and sigmoid activations * Style * Fix fuse list formating * Sync filenames for fuse pass files * Fix cmake after move * Fix registration * Fix pass name in tests * Add missing activations to checker * WIPS * Working mul op * Working sub * Working Add * Remove pten includes * Remove some forward declarations * Remove Includes * Fixes * Remove default kernels * Add check if post_ops attributes are avaliable * Style * Code adjustment * Register default kernels * We have year 2022 not 2021... Co-authored-by: jakpiase Co-authored-by: Sylwester Fraczek * Fast review fixes Co-authored-by: jakpiase Co-authored-by: Sylwester Fraczek * Review Fix * Rename one_dnn -> onednn * Style after review * Fast and dirty fix for quantization * Update tests * Style * Fix mkldnn_quantizer config * Add Joanna's suggestion. * Check if operator is explicitly disables on OneDNN * Try to use unregistered attributes * Style * Test new framework * FXI * FXII * Update test * Style Co-authored-by: jakpiase Co-authored-by: Sylwester Fraczek --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../framework/ir/graph_pattern_detector.cc | 30 ++ .../framework/ir/graph_pattern_detector.h | 22 ++ .../ir/mkldnn/elt_act_mkldnn_fuse_pass.cc | 145 ++++++++ .../ir/mkldnn/elt_act_mkldnn_fuse_pass.h | 44 +++ .../inference/api/paddle_pass_builder.cc | 1 + .../mkldnn/elementwise_mkldnn_op.h | 45 ++- paddle/fluid/platform/mkldnn_reuse.h | 14 +- .../test_mkldnn_elt_act_fuse_pass.py | 328 ++++++++++++++++++ .../test_mkldnn_elt_act_fuse_pass_new.py | 82 +++++ 10 files changed, 702 insertions(+), 10 deletions(-) create mode 100644 paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass_new.py diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index a1f2d6edca6..623c8a048c2 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -126,6 +126,7 @@ if(WITH_MKLDNN) pass_library(interpolate_mkldnn_pass inference DIR mkldnn) pass_library(softplus_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn) pass_library(cpu_quantize_placement_pass base DIR mkldnn) pass_library(cpu_quantize_pass inference DIR mkldnn) pass_library(cpu_quantize_squash_pass inference DIR mkldnn) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index d7d866fa98b..18068e22b7f 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -918,6 +918,36 @@ PDNode *patterns::ConvActivation::operator()( return activation_out_var; } +PDNode 
*patterns::ElementwiseActivation::operator()( + paddle::framework::ir::PDNode *elementwise_a, + const std::string &elementwise_type, const std::string &activation_type) { + // Create Operators + elementwise_a->assert_is_op_input(elementwise_type, "X"); + auto *elementwise_op = + pattern->NewNode(elementwise_repr())->assert_is_op(elementwise_type); + auto *activation_op = + pattern->NewNode(activation_repr())->assert_is_op(activation_type); + // Create variables + auto *elementwise_b = pattern->NewNode(elementwise_b_repr()) + ->AsInput() + ->assert_is_op_input(elementwise_type, "Y"); + // intermediate variable, will be removed in the IR after fuse. + auto *elementwise_out_var = + pattern->NewNode(elementwise_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op(elementwise_type) + ->assert_is_op_input(activation_type); + // output + auto *activation_out_var = pattern->NewNode(activation_out_repr()) + ->AsOutput() + ->assert_is_op_output(activation_type); + + elementwise_op->LinksFrom({elementwise_a, elementwise_b}) + .LinksTo({elementwise_out_var}); + activation_op->LinksFrom({elementwise_out_var}).LinksTo({activation_out_var}); + return activation_out_var; +} + PDNode *patterns::SeqConvEltAddRelu::operator()( paddle::framework::ir::PDNode *seqconv_input) { // Create Operators diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 0f21906d08d..062d2f9dedc 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -487,6 +487,28 @@ struct ConvActivation : public PatternBase { PATTERN_DECL_NODE(activation_out); }; +// Elementwise with Activation +// op: elementwise + activation +// named nodes: +// elementwise_a, elementwise_b, +// elementwise_out, elementwise, +// activation_out, activation +struct ElementwiseActivation : public PatternBase { + ElementwiseActivation(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elementwise_add_activation") {} + + PDNode* operator()(PDNode* elementwise_a, const std::string& elementwise_type, + const std::string& activation_type); + + // declare operator node's name + PATTERN_DECL_NODE(elementwise); + PATTERN_DECL_NODE(activation); + // declare variable node's name + PATTERN_DECL_NODE(elementwise_b); + PATTERN_DECL_NODE(elementwise_out); + PATTERN_DECL_NODE(activation_out); +}; + // SEQCONV with Elementwise_Add ReLU // op: seqconv + elementwise_add + relu // named nodes: diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc new file mode 100644 index 00000000000..b7f7a8071d2 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
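+
+// Summary of the pass implemented below: every elementwise_add,
+// elementwise_sub or elementwise_mul followed by one of the supported
+// activations (relu, tanh, leaky_relu, swish, hardswish, sqrt, abs, clip,
+// gelu, relu6, sigmoid) has the activation folded into the elementwise op.
+// Activation attributes are renamed on the fused op: swish's "beta" and
+// relu6's "threshold" map to "activation_alpha", clip's "min"/"max" map to
+// "activation_alpha"/"activation_beta", and the remaining activations keep
+// their "alpha"/"beta" as "activation_alpha"/"activation_beta".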
+ +#include "paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void ElementwiseActivationOneDNNPass::ApplyImpl(Graph *graph) const { + std::vector act_types = { + "relu", "tanh", "leaky_relu", "swish", "hardswish", "sqrt", + "abs", "clip", "gelu", "relu6", "sigmoid"}; + std::vector elt_types = {"elementwise_add", "elementwise_sub", + "elementwise_mul"}; + + for (const auto &elt_type : elt_types) + for (const auto &act_type : act_types) { + std::unordered_map attr_map; + + if (act_type == "swish") + attr_map.emplace("beta", "activation_alpha"); + else if (act_type == "relu6") + attr_map.emplace("threshold", "activation_alpha"); + else if (act_type == "clip") { + attr_map.emplace("min", "activation_alpha"); + attr_map.emplace("max", "activation_beta"); + } else { + attr_map.emplace("alpha", "activation_alpha"); + attr_map.emplace("beta", "activation_beta"); + } + FuseElementwiseAct(graph, elt_type, act_type, attr_map); + } +} + +void ElementwiseActivationOneDNNPass::FuseElementwiseAct( + Graph *graph, const std::string &elt_type, const std::string &act_type, + const std::unordered_map &attr_map) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + FusePassBase::Init("elementwise_act", graph); + + GraphPatternDetector gpd; + auto *elementwise_input = gpd.mutable_pattern() + ->NewNode(elt_type + "_act/elementwise_input") + ->AsInput() + ->assert_is_op_input(elt_type, "X"); + patterns::ElementwiseActivation elementwise_act_pattern(gpd.mutable_pattern(), + elt_type + "_act"); + elementwise_act_pattern(elementwise_input, elt_type, act_type); + + int found_elementwise_activation_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "Fuse " << elt_type << " with activation op."; + // Elementwise output + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_act_pattern); + // ACT output + GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out, + elementwise_act_pattern); + // ops + GET_IR_NODE_FROM_SUBGRAPH(elementwise, elementwise, + elementwise_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(activation, activation, elementwise_act_pattern); + + auto *elementwise_op = elementwise->Op(); + + if (elementwise_op->HasAttr("use_mkldnn")) { + const std::string wo_elt_type = + "The " + elt_type; // Workaround for PP error message checking. 
+ PADDLE_ENFORCE_EQ( + BOOST_GET_CONST(bool, elementwise_op->GetAttr("use_mkldnn")), true, + platform::errors::PreconditionNotMet( + wo_elt_type + "+Act fusion may happen only when oneDNN library " + "is used.")); + } + + auto *activation_op = activation->Op(); + for (const auto &attr : attr_map) { + if (activation_op->HasAttr(attr.first)) { + elementwise_op->SetAttr(attr.second, + activation_op->GetAttr(attr.first)); + } + } + + if (act_type == "gelu" && activation_op->HasAttr("approximate") && + BOOST_GET_CONST(bool, activation_op->GetAttr("approximate"))) + elementwise_op->SetAttr("activation_type", std::string("gelu_tanh")); + else + elementwise_op->SetAttr("activation_type", act_type); + + elementwise_op->SetOutput("Out", {activation_out->Name()}); + + IR_OP_VAR_LINK(elementwise, activation_out); + GraphSafeRemoveNodes(g, {activation, elementwise_out}); + found_elementwise_activation_count++; + }; + + gpd(graph, handler); + AddStatis(found_elementwise_activation_count); + PrettyLogDetail("--- fused %d %s with %s activation", + found_elementwise_activation_count, elt_type, act_type); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(elt_act_mkldnn_fuse_pass, + paddle::framework::ir::ElementwiseActivationOneDNNPass); +REGISTER_PASS_CAPABILITY(elt_act_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .LE("elementwise_sub", 1) + .LE("elementwise_mul", 1) + .LE("relu", 0) + .LE("tanh", 0) + .LE("leaky_relu", 1) + .LE("swish", 0) + .LE("hard_swish", 0) + .LE("sqrt", 0) + .LE("abs", 0) + .LE("clip", 1) + .LE("gelu", 0) + .LE("relu6", 0) + .LE("sigmoid", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h new file mode 100644 index 00000000000..b8b7d06a828 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * \brief Fuse the Elementwise and activation operators into single + * OneDNN's Elementwise with post-op. 
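+ *
+ * For example, a subgraph
+ *   tmp = elementwise_add(X, Y); out = relu(tmp)
+ * is rewritten into a single elementwise_add whose "activation_type"
+ * attribute is set to "relu"; at execution time the oneDNN kernel appends
+ * the activation as an eltwise post-op of the binary primitive (see
+ * get_post_ops() in elementwise_mkldnn_op.h).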
+ */ +class ElementwiseActivationOneDNNPass : public FusePassBase { + public: + virtual ~ElementwiseActivationOneDNNPass() {} + + protected: + void ApplyImpl(Graph *graph) const override; + + void FuseElementwiseAct( + Graph *graph, const std::string &elt_types, const std::string &act_types, + const std::unordered_map &attr_map) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index f5f36d805b4..22d9dedb32e 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -262,6 +262,7 @@ void CpuPassStrategy::EnableMKLDNN() { // "fc_act_mkldnn_fuse_pass", "batch_norm_act_fuse_pass", // "softplus_activation_mkldnn_fuse_pass", // + "elt_act_mkldnn_fuse_pass", // // TODO(intel): Please fix the bug on windows. // https://github.com/PaddlePaddle/Paddle/issues/29710 // "mkldnn_inplace_pass", // This pass should be activated after diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index 763fc5f2674..ad8fd317013 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -32,6 +32,45 @@ using dnnl::stream; template class EltwiseMKLDNNKernel : public framework::OpKernel { + private: + dnnl::post_ops get_post_ops(const framework::ExecutionContext& ctx) const { + dnnl::post_ops post_operations; + if (ctx.HasAttr("activation_type")) { + const float scale = ctx.HasAttr("activation_scale") + ? ctx.Attr("activation_scale") + : 1.0f; + const float alpha = ctx.HasAttr("activation_alpha") + ? ctx.Attr("activation_alpha") + : 0.0f; + const float beta = ctx.HasAttr("activation_beta") + ? 
ctx.Attr("activation_beta") + : 0.0f; + + static std::unordered_map algo_map = { + {"relu", dnnl::algorithm::eltwise_relu}, + {"tanh", dnnl::algorithm::eltwise_tanh}, + {"leaky_relu", dnnl::algorithm::eltwise_relu}, + {"swish", dnnl::algorithm::eltwise_swish}, + {"hardswish", dnnl::algorithm::eltwise_hardswish}, + {"sqrt", dnnl::algorithm::eltwise_sqrt}, + {"abs", dnnl::algorithm::eltwise_abs}, + {"clip", dnnl::algorithm::eltwise_clip}, + {"gelu", dnnl::algorithm::eltwise_gelu_erf}, + {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh}, + {"relu6", dnnl::algorithm::eltwise_bounded_relu}, + {"sigmoid", dnnl::algorithm::eltwise_logistic}}; + + const auto& activation_type = + algo_map.find(ctx.Attr("activation_type")); + + if (activation_type != algo_map.end()) { + post_operations.append_eltwise(scale, activation_type->second, alpha, + beta); + } + } + return post_operations; + } + public: void Compute(const framework::ExecutionContext& ctx) const override { const auto& dev_ctx = @@ -47,9 +86,9 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { float scale_o = ctx.Attr("Scale_out"); int axis = ctx.Attr("axis"); - platform::BinaryMKLDNNHandler handler(BINARY_OP, axis, mkldnn_engine, - ctx.GetPlace(), x, y, z, scale_x, - scale_y, scale_o); + platform::BinaryMKLDNNHandler handler( + BINARY_OP, axis, mkldnn_engine, ctx.GetPlace(), x, y, z, scale_x, + scale_y, scale_o, get_post_ops(ctx)); const auto src_x_memory = handler.AcquireSrcMemory(x); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 01de7349f48..1254331835b 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -618,7 +618,7 @@ class BinaryMKLDNNHandler const dnnl::engine engine, platform::Place cpu_place, const Tensor* x, const Tensor* y, Tensor* z, float scale_x, float scale_y, float scale_z, - const dnnl::post_ops& post_ops = dnnl::post_ops()) + const dnnl::post_ops& post_ops = dnnl::post_ops{}) : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { PADDLE_ENFORCE_EQ( x->layout(), DataLayout::kMKLDNN, @@ -676,8 +676,8 @@ class BinaryMKLDNNHandler const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::any); - auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z); - attributes.set_post_ops(post_ops); + auto attributes = + CreateAttributes(algo, scale_x, scale_y, scale_z, post_ops); this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md, dst_md); @@ -690,10 +690,9 @@ class BinaryMKLDNNHandler } private: - static inline dnnl::primitive_attr CreateAttributes(dnnl::algorithm op, - float scale_x, - float scale_y, - float scale_z) { + static inline dnnl::primitive_attr CreateAttributes( + dnnl::algorithm op, float scale_x, float scale_y, float scale_z, + dnnl::post_ops post_ops = dnnl::post_ops{}) { // Scales set in attributes for inputs contibute to the output equation // in the following way (assuming no broadcasting takes place): // output_i = scale_0 * x_i <+ or *> scale_1 * y_i; @@ -718,6 +717,7 @@ class BinaryMKLDNNHandler {scale_0}); attributes.set_scales(/* input_y_id = */ DNNL_ARG_SRC_1, /* mask = */ 0, {scale_1}); + if (post_ops.len() > 0) attributes.set_post_ops(post_ops); return attributes; } }; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py new file mode 100644 index 
00000000000..893bd383343 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py @@ -0,0 +1,328 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle +import paddle.fluid as fluid +from paddle.fluid.core import PassVersionChecker + + +class ElementwiseActivationMkldnnFusePassTest(InferencePassTest): + act_alpha = None + act_beta = None + pass_name = 'elt_act_mkldnn_fuse_pass' + + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data_A = fluid.data( + name="data_A", shape=[-1, 3, 100, 100], dtype="float32") + data_B = fluid.data( + name="data_B", shape=[-1, 3, 100, 100], dtype="float32") + elt_out = self.operand(data_A, data_B) + if self.act is not None: + if self.act_beta is not None: + elt_out = self.act(elt_out, self.act_alpha, self.act_beta) + elif self.act_alpha is not None: + elt_out = self.act(elt_out, self.act_alpha) + else: + elt_out = self.act(elt_out) + + self.feeds = { + "data_A": np.random.random((1, 3, 100, 100)).astype("float32"), + "data_B": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.fetch_list = [elt_out] + self.enable_mkldnn = True + + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = None + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + def test_pass_compatible(self): + self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name)) + + +class ElementwiseActivationMkldnnFusePassTest_Add_Relu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.relu + + +class ElementwiseActivationMkldnnFusePassTest_Add_Tanh( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.tanh + + +class ElementwiseActivationMkldnnFusePassTest_Add_LeakyRelu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act_alpha = 0.2 + self.act = fluid.layers.leaky_relu + + +class ElementwiseActivationMkldnnFusePassTest_Add_Swish( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act_alpha = 4 + self.act = fluid.layers.swish + + +class ElementwiseActivationMkldnnFusePassTest_Add_HardSwish( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.hard_swish + + +class ElementwiseActivationMkldnnFusePassTest_Add_SQRT( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.sqrt + + +class ElementwiseActivationMkldnnFusePassTest_Add_ABS( 
+ ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.abs + + +class ElementwiseActivationMkldnnFusePassTest_Add_Clip( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.clip + self.act_alpha = 0.0 + self.act_beta = 10.0 + + +class ElementwiseActivationMkldnnFusePassTest_Add_Gelu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.gelu + + +class ElementwiseActivationMkldnnFusePassTest_Add_Gelu_Tanh( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.gelu + self.act_alpha = True + + +class ElementwiseActivationMkldnnFusePassTest_Add_Relu6( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.relu6 + self.act_alpha = 5.0 + + +class ElementwiseActivationMkldnnFusePassTest_Add_Sigmoid( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_add + self.act = fluid.layers.sigmoid + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Relu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.relu + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Tanh( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.tanh + + +class ElementwiseActivationMkldnnFusePassTest_Sub_LeakyRelu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act_alpha = 0.2 + self.act = fluid.layers.leaky_relu + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Swish( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.swish + + +class ElementwiseActivationMkldnnFusePassTest_Sub_HardSwish( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.hard_swish + + +class ElementwiseActivationMkldnnFusePassTest_Sub_ABS( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.abs + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Clip( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.clip + self.act_alpha = 0.0 + self.act_beta = 10.0 + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Gelu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.gelu + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Gelu_Tanh( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.gelu + self.act_alpha = True + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Relu6( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.relu6 + self.act_alpha = 5.0 + + +class ElementwiseActivationMkldnnFusePassTest_Sub_Sigmoid( + 
ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_sub + self.act = fluid.layers.sigmoid + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Relu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.relu + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Tanh( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.tanh + + +class ElementwiseActivationMkldnnFusePassTest_Mul_LeakyRelu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act_alpha = 0.2 + self.act = fluid.layers.leaky_relu + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Swish( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.swish + + +class ElementwiseActivationMkldnnFusePassTest_Mul_HardSwish( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.hard_swish + + +class ElementwiseActivationMkldnnFusePassTest_Mul_SQRT( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.sqrt + + +class ElementwiseActivationMkldnnFusePassTest_Mul_ABS( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.abs + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Clip( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.clip + self.act_alpha = 0.0 + self.act_beta = 10.0 + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Gelu( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.gelu + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Gelu_Tanh( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.gelu + self.act_alpha = True + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Relu6( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.relu6 + self.act_alpha = 5.0 + + +class ElementwiseActivationMkldnnFusePassTest_Mul_Sigmoid( + ElementwiseActivationMkldnnFusePassTest): + def set_params(self): + self.operand = fluid.layers.elementwise_mul + self.act = fluid.layers.sigmoid + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass_new.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass_new.py new file mode 100644 index 00000000000..0f5279b0eda --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass_new.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import PassAutoScanTest +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume +import hypothesis.strategies as st + + +class TestElementWiseAddReluFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_config(self, draw): + batch_size = draw(st.integers(min_value=1, max_value=4)) + + def generate_input(): + return np.random.random( + [batch_size, 3, 100, 100]).astype(np.float32) + + ops_config = [{ + "op_type": "elementwise_add", + "op_inputs": { + "X": ["A"], + "Y": ["B"] + }, + "op_outputs": { + "Out": ["add_output"] + }, + "op_attrs": {} + }, { + "op_type": "relu", + "op_inputs": { + "X": ["add_output"] + }, + "op_outputs": { + "Out": ["relu_output"] + }, + "op_attrs": {} + }] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "A": TensorConfig(data_gen=partial(generate_input)), + "B": TensorConfig(data_gen=partial(generate_input)) + }, + outputs=["relu_output"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ["elementwise_add"], (1e-5, 1e-5) + + def test(self): + self.run_and_statis( + quant=False, passes=["elt_act_mkldnn_fuse_pass"], min_success_num=4) + + +if __name__ == "__main__": + unittest.main() -- GitLab From 3149e399d31045c1357d07e0f5de6dda1e2798d8 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Mon, 14 Mar 2022 20:01:10 +0800 Subject: [PATCH 046/176] [PHI] Move set_value_grad kernel form fluid to phi (#40478) * move set_value_grad kernel form fluid to phi * add unittest for passing coverage ci --- paddle/fluid/operators/set_value_op.cc | 8 - paddle/fluid/operators/set_value_op.cu | 25 -- paddle/fluid/operators/set_value_op.h | 268 -------------- .../phi/kernels/cpu/set_value_grad_kernel.cc | 29 ++ .../phi/kernels/gpu/set_value_grad_kernel.cu | 29 ++ .../kernels/impl/set_value_grad_kernel_impl.h | 344 ++++++++++++++++++ .../phi/kernels/impl/set_value_kernel_impl.h | 1 - paddle/phi/kernels/set_value_grad_kernel.h | 34 ++ paddle/phi/ops/compat/set_value_sig.cc | 102 ++++++ paddle/phi/tests/ops/test_op_signature.cc | 65 ++++ 10 files changed, 603 insertions(+), 302 deletions(-) delete mode 100644 paddle/fluid/operators/set_value_op.cu create mode 100644 paddle/phi/kernels/cpu/set_value_grad_kernel.cc create mode 100644 paddle/phi/kernels/gpu/set_value_grad_kernel.cu create mode 100644 paddle/phi/kernels/impl/set_value_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/set_value_grad_kernel.h diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 7d0d782b837..513ab46e9b5 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -243,14 +243,6 @@ REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker, 
REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad); -REGISTER_OP_CPU_KERNEL( - set_value_grad, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel); - REGISTER_OP_VERSION(set_value) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/set_value_op.cu b/paddle/fluid/operators/set_value_op.cu deleted file mode 100644 index 9f291a863c0..00000000000 --- a/paddle/fluid/operators/set_value_op.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/set_value_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - set_value_grad, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel); diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 4d459f8c01b..4696907f32e 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -19,14 +19,10 @@ #include #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/assign_value_op.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/slice_utils.h" -#include "paddle/fluid/operators/strided_slice_op.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/enforce.h" @@ -36,23 +32,6 @@ namespace operators { using Tensor = framework::Tensor; using DDim = framework::DDim; -inline void GetOffsets(const DDim& big_dim, const DDim& small_dim, - DDim start_offset, int cur_dim, - std::vector* offsets) { - if (cur_dim == big_dim.size()) { - offsets->push_back(start_offset); - return; - } - if (small_dim[cur_dim] == big_dim[cur_dim]) { - GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); - } else { - for (int i = 0; i < big_dim[cur_dim]; i++) { - GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); - start_offset[cur_dim] += 1; - } - } -} - inline std::string GetValueName(framework::proto::VarType::Type data_type) { std::string value_name; switch (data_type) { @@ -121,253 +100,6 @@ inline void CheckIsDimsMatch(const framework::DDim first, "of target shape: %d, but now shape is %d.", second.to_str(), first.to_str())); } -template -class SetValueGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - int rank = ctx.Input(framework::GradVarName("Out"))->dims().size(); - - switch (rank) { - case 1: - SetValueGradCompute<1>(ctx); - break; - case 2: - SetValueGradCompute<2>(ctx); - break; - case 3: - SetValueGradCompute<3>(ctx); - break; - case 4: - SetValueGradCompute<4>(ctx); - break; - 
case 5: - SetValueGradCompute<5>(ctx); - break; - case 6: - SetValueGradCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of set_value_grad's input should be less than 7, but " - "received %d.", - rank)); - } - } - - private: - template - void SetValueGradCompute(const framework::ExecutionContext& context) const { - auto starts = context.Attr>("starts"); - auto ends = context.Attr>("ends"); - auto steps = context.Attr>("steps"); - - auto axes_int64 = context.Attr>("axes"); - std::vector axes(axes_int64.begin(), axes_int64.end()); - - auto starts_indices = Eigen::DSizes(); - auto ends_indices = Eigen::DSizes(); - auto steps_indices = Eigen::DSizes(); - auto reverse_axis = Eigen::array(); - - auto list_new_ends_tensor = - context.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - context.MultiInput("StartsTensorList"); - auto list_new_steps_tensor = - context.MultiInput("StepsTensorList"); - - if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } - - if (list_new_ends_tensor.size() > 0) { - ends = GetDataFromTensorList(list_new_ends_tensor); - } - - if (list_new_steps_tensor.size() > 0) { - steps = GetDataFromTensorList(list_new_steps_tensor); - } - - auto in = context.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ( - in->IsInitialized(), true, - platform::errors::PermissionDenied( - "The input of `set_value_grad`(%s) has not been initialized", - framework::GradVarName("Out"))); - auto grad_value = context.Output( - framework::GradVarName("ValueTensor")); - auto grad_input = - context.Output(framework::GradVarName("Input")); - auto in_dims = in->dims(); - - auto decrease_axis_int64 = - context.Attr>("decrease_axes"); - std::vector decrease_axis(decrease_axis_int64.begin(), - decrease_axis_int64.end()); - std::vector infer_flags(axes.size(), 1); - std::vector out_dims_vector(in_dims.size(), -1); - StridedSliceOutDims(starts, ends, steps, axes, infer_flags, in_dims, - decrease_axis, out_dims_vector.data(), axes.size(), - false); - - framework::DDim out_dims(phi::make_ddim(out_dims_vector)); - - std::vector reverse_vector(starts.size(), 0); - StridedSliceFunctor(starts.data(), ends.data(), steps.data(), axes.data(), - reverse_vector.data(), in_dims, infer_flags, - decrease_axis, starts.size()); - - for (size_t axis = 0; axis < D; axis++) { - starts_indices[axis] = 0; - ends_indices[axis] = out_dims[axis]; - steps_indices[axis] = 1; - reverse_axis[axis] = false; - } - - for (size_t axis = 0; axis < axes.size(); axis++) { - int axis_index = axes[axis]; - starts_indices[axis_index] = starts[axis]; - ends_indices[axis_index] = ends[axis]; - steps_indices[axis_index] = steps[axis]; - reverse_axis[axis_index] = (reverse_vector[axis] == 1) ? 
true : false; - } - - bool need_reverse = false; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - need_reverse = true; - break; - } - } - - auto& dev_ctx = context.template device_context(); - auto& place = - *context.template device_context().eigen_device(); - phi::funcs::SetConstant set_zero; - - if (grad_input) { - // Set gradient of `Input` - paddle::framework::TensorCopy(*in, context.GetPlace(), grad_input); - - auto grad_input_t = - framework::EigenTensor::From(*grad_input); - - framework::Tensor tmp(grad_input->dtype()); - tmp.mutable_data(out_dims, context.GetPlace()); - set_zero(dev_ctx, &tmp, static_cast(0)); - auto tmp_t = framework::EigenTensor::From(tmp); - - grad_input_t.stridedSlice(starts_indices, ends_indices, steps_indices) - .device(place) = tmp_t; - } - if (grad_value) { - grad_value->mutable_data(context.GetPlace()); - set_zero(dev_ctx, grad_value, static_cast(0)); - - auto in_t = framework::EigenTensor::From(*in); - - if (grad_value->dims() == out_dims) { - auto grad_value_t = - framework::EigenTensor::From(*grad_value); - if (need_reverse) { - framework::Tensor tmp(grad_value->dtype()); - tmp.mutable_data(out_dims, context.GetPlace()); - set_zero(dev_ctx, &tmp, static_cast(0)); - auto tmp_t = framework::EigenTensor::From(tmp); - - tmp_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, steps_indices); - grad_value_t.device(place) = tmp_t.reverse(reverse_axis); - } else { - grad_value_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, steps_indices); - } - } else { - int out_dims_size = out_dims.size(); - auto grad_value_dims = grad_value->dims(); - auto fake_grad_value_dims = out_dims; - - // Create an extented shape according to the rules of broadcast. 
- auto grad_value_dims_size = grad_value_dims.size(); - - int num_decrease = 0; - - int decrease_axis_size = decrease_axis.size(); - for (int i = 0; i < out_dims_size; i++) { - if (decrease_axis.end() != - std::find(decrease_axis.begin(), decrease_axis.end(), i)) { - fake_grad_value_dims[i] = 1; - num_decrease++; - } else if (i < out_dims_size - (grad_value_dims_size + - decrease_axis_size - num_decrease)) { - fake_grad_value_dims[i] = 1; - } else { - auto index_grad = - i - (out_dims_size - (grad_value_dims_size + - decrease_axis_size - num_decrease)); - fake_grad_value_dims[i] = grad_value_dims[index_grad]; - - PADDLE_ENFORCE_EQ((out_dims[i] == grad_value_dims[index_grad]) || - (grad_value_dims[index_grad] == 1), - true, - platform::errors::InvalidArgument( - "An error occurred while calculating %s: " - "[%s] can not be accumulated into [%s].", - framework::GradVarName("ValueTensor"), - out_dims, grad_value_dims)); - } - } - - VLOG(3) << "Dimensions of " << framework::GradVarName("ValueTensor") - << "([" << grad_value_dims << "])is broadcasted into [" - << fake_grad_value_dims << "]."; - - auto extent = Eigen::DSizes(); - auto offset = out_dims; - for (int i = 0; i < out_dims_size; i++) { - offset[i] = 0; - extent[i] = fake_grad_value_dims[i]; - } - std::vector offsets; - GetOffsets(out_dims, fake_grad_value_dims, offset, 0, &offsets); - - auto grad_value_t = - framework::EigenTensor:: - From(*grad_value, fake_grad_value_dims); - - framework::Tensor tmp(grad_value->dtype()); - tmp.mutable_data(out_dims, context.GetPlace()); - set_zero(dev_ctx, &tmp, static_cast(0)); - auto tmp_t = framework::EigenTensor::From(tmp); - - tmp_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, steps_indices); - - // accumulate gradient - for (auto offset : offsets) { - grad_value_t.device(place) = - grad_value_t + - tmp_t.slice(framework::EigenDim::From(offset), extent); - } - if (need_reverse) { - framework::Tensor tmp_value(grad_value->dtype()); - tmp_value.mutable_data(fake_grad_value_dims, context.GetPlace()); - auto tmp_value_t = - framework::EigenTensor::From(tmp_value); - tmp_value_t.device(place) = grad_value_t.reverse(reverse_axis); - grad_value_t.device(place) = tmp_value_t; - } - } - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/phi/kernels/cpu/set_value_grad_kernel.cc b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc new file mode 100644 index 00000000000..44df36bb9fd --- /dev/null +++ b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
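+
+// This file only registers the CPU set_value_grad kernel; the computation
+// lives in impl/set_value_grad_kernel_impl.h and is shared with the GPU
+// registration in gpu/set_value_grad_kernel.cu.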
+ +#include "paddle/phi/kernels/set_value_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/set_value_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(set_value_grad, + CPU, + ALL_LAYOUT, + phi::SetValueGradKernel, + float, + double, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/gpu/set_value_grad_kernel.cu b/paddle/phi/kernels/gpu/set_value_grad_kernel.cu new file mode 100644 index 00000000000..7eed96699e7 --- /dev/null +++ b/paddle/phi/kernels/gpu/set_value_grad_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/set_value_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/set_value_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(set_value_grad, + GPU, + ALL_LAYOUT, + phi::SetValueGradKernel, + float, + double, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h new file mode 100644 index 00000000000..4947170088c --- /dev/null +++ b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h @@ -0,0 +1,344 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
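+
+// Gradient of set_value: the forward op writes `value` into the slice of
+// `x` selected by (starts, ends, steps, axes). Consequently:
+//   * x_grad is out_grad with that slice zeroed out (the overwritten
+//     elements of x do not influence the output);
+//   * value_grad is the strided slice of out_grad, accumulated back to the
+//     (possibly broadcast) shape of `value` and reversed along axes with a
+//     negative step.
+// E.g. with x of shape [5], starts={1}, ends={4}, steps={1}, axes={0}:
+// x_grad = {dout[0], 0, 0, 0, dout[4]} and
+// value_grad = {dout[1], dout[2], dout[3]}.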
+ +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#include "paddle/fluid/operators/strided_slice_op.h" + +namespace phi { + +inline void GetOffsets(const DDim& big_dim, + const DDim& small_dim, + DDim start_offset, + int cur_dim, + std::vector* offsets) { + if (cur_dim == big_dim.size()) { + offsets->push_back(start_offset); + return; + } + if (small_dim[cur_dim] == big_dim[cur_dim]) { + GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); + } else { + for (int i = 0; i < big_dim[cur_dim]; i++) { + GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); + start_offset[cur_dim] += 1; + } + } +} + +template +void SetValueGradImpl(const Context& dev_ctx, + const DenseTensor& out_grad, + const ScalarArray& starts, + const ScalarArray& ends, + const ScalarArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + DenseTensor* x_grad, + DenseTensor* value_grad) { + PADDLE_ENFORCE_EQ( + out_grad.IsInitialized(), + true, + errors::PermissionDenied( + "The input of `set_value_grad`(out_grad) has not been initialized")); + + auto in_dims = out_grad.dims(); + + std::vector decrease_axis_int32(decrease_axes.begin(), + decrease_axes.end()); + std::vector axes_int32(axes.begin(), axes.end()); + std::vector infer_flags(axes.size(), 1); + std::vector out_dims_vector(in_dims.size(), -1); + std::vector starts_local = starts.GetData(); + std::vector ends_local = ends.GetData(); + std::vector steps_local = steps.GetData(); + paddle::operators::StridedSliceOutDims(starts_local, + ends_local, + steps_local, + axes_int32, + infer_flags, + in_dims, + decrease_axis_int32, + out_dims_vector.data(), + axes.size(), + false); + + DDim out_dims(phi::make_ddim(out_dims_vector)); + + std::vector reverse_vector(starts_local.size(), 0); + paddle::operators::StridedSliceFunctor(starts_local.data(), + ends_local.data(), + steps_local.data(), + axes_int32.data(), + reverse_vector.data(), + in_dims, + infer_flags, + decrease_axis_int32, + starts_local.size()); + + auto starts_indices = Eigen::DSizes(); + auto ends_indices = Eigen::DSizes(); + auto steps_indices = Eigen::DSizes(); + auto reverse_axis = Eigen::array(); + + for (size_t axis = 0; axis < RANK; axis++) { + starts_indices[axis] = 0; + ends_indices[axis] = out_dims[axis]; + steps_indices[axis] = 1; + reverse_axis[axis] = false; + } + + for (size_t axis = 0; axis < axes.size(); axis++) { + int axis_index = axes[axis]; + starts_indices[axis_index] = starts_local[axis]; + ends_indices[axis_index] = ends_local[axis]; + steps_indices[axis_index] = steps_local[axis]; + reverse_axis[axis_index] = (reverse_vector[axis] == 1) ? 
true : false; + } + + bool need_reverse = false; + for (size_t axis = 0; axis < axes.size(); axis++) { + if (reverse_vector[axis] == 1) { + need_reverse = true; + break; + } + } + + auto& place = *dev_ctx.eigen_device(); + phi::funcs::SetConstant set_zero; + + if (x_grad) { + // Set gradient of `Input` + Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); + + auto x_grad_t = + EigenTensor::From(*x_grad); + + DenseTensor tmp = Full(dev_ctx, out_dims_vector, static_cast(0)); + auto tmp_t = + EigenTensor::From(tmp); + + x_grad_t.stridedSlice(starts_indices, ends_indices, steps_indices) + .device(place) = tmp_t; + } + if (value_grad) { + dev_ctx.template Alloc(value_grad); + set_zero(dev_ctx, value_grad, static_cast(0)); + + auto in_t = EigenTensor::From( + out_grad); + + if (value_grad->dims() == out_dims) { + auto value_grad_t = + EigenTensor::From( + *value_grad); + if (need_reverse) { + DenseTensor tmp = Full(dev_ctx, out_dims_vector, static_cast(0)); + auto tmp_t = + EigenTensor::From(tmp); + + tmp_t.device(place) = + in_t.stridedSlice(starts_indices, ends_indices, steps_indices); + value_grad_t.device(place) = tmp_t.reverse(reverse_axis); + } else { + value_grad_t.device(place) = + in_t.stridedSlice(starts_indices, ends_indices, steps_indices); + } + } else { + int out_dims_size = out_dims.size(); + auto value_grad_dims = value_grad->dims(); + auto fake_value_grad_dims = out_dims; + + // Create an extented shape according to the rules of broadcast. + auto value_grad_dims_size = value_grad_dims.size(); + + int num_decrease = 0; + + int decrease_axis_size = decrease_axes.size(); + for (int i = 0; i < out_dims_size; i++) { + if (decrease_axes.end() != + std::find(decrease_axes.begin(), decrease_axes.end(), i)) { + fake_value_grad_dims[i] = 1; + num_decrease++; + } else if (i < out_dims_size - (value_grad_dims_size + + decrease_axis_size - num_decrease)) { + fake_value_grad_dims[i] = 1; + } else { + auto index_grad = + i - (out_dims_size - + (value_grad_dims_size + decrease_axis_size - num_decrease)); + fake_value_grad_dims[i] = value_grad_dims[index_grad]; + + PADDLE_ENFORCE_EQ((out_dims[i] == value_grad_dims[index_grad]) || + (value_grad_dims[index_grad] == 1), + true, + errors::InvalidArgument( + "An error occurred while calculating %s: " + "[%s] can not be accumulated into [%s].", + paddle::framework::GradVarName("ValueTensor"), + out_dims, + value_grad_dims)); + } + } + + VLOG(3) << "Dimensions of " + << paddle::framework::GradVarName("ValueTensor") << "([" + << value_grad_dims << "])is broadcasted into [" + << fake_value_grad_dims << "]."; + + auto extent = Eigen::DSizes(); + auto offset = out_dims; + for (int i = 0; i < out_dims_size; i++) { + offset[i] = 0; + extent[i] = fake_value_grad_dims[i]; + } + std::vector offsets; + GetOffsets(out_dims, fake_value_grad_dims, offset, 0, &offsets); + + auto value_grad_t = + EigenTensor::From( + *value_grad, fake_value_grad_dims); + + DenseTensor tmp = Full(dev_ctx, out_dims_vector, static_cast(0)); + auto tmp_t = + EigenTensor::From(tmp); + + tmp_t.device(place) = + in_t.stridedSlice(starts_indices, ends_indices, steps_indices); + + // accumulate gradient + for (auto offset : offsets) { + value_grad_t.device(place) = + value_grad_t + tmp_t.slice(EigenDim::From(offset), extent); + } + if (need_reverse) { + DenseTensor tmp_value = + Full(dev_ctx, + {fake_value_grad_dims.Get(), fake_value_grad_dims.size()}, + static_cast(0)); + auto tmp_value_t = + EigenTensor::From( + tmp_value); + tmp_value_t.device(place) = 
value_grad_t.reverse(reverse_axis); + value_grad_t.device(place) = tmp_value_t; + } + } + } +} + +template +void SetValueGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const ScalarArray& starts, + const ScalarArray& ends, + const ScalarArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + DenseTensor* x_grad, + DenseTensor* value_grad) { + const int rank = out_grad.dims().size(); + + switch (rank) { + case 1: + SetValueGradImpl(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + value_grad); + break; + case 2: + SetValueGradImpl(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + value_grad); + break; + case 3: + SetValueGradImpl(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + value_grad); + break; + case 4: + SetValueGradImpl(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + value_grad); + break; + case 5: + SetValueGradImpl(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + value_grad); + break; + case 6: + SetValueGradImpl(dev_ctx, + out_grad, + starts, + ends, + steps, + axes, + decrease_axes, + none_axes, + x_grad, + value_grad); + break; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "The rank of set_value_grad's input should be less than 7, but " + "received %d.", + rank)); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/set_value_kernel_impl.h b/paddle/phi/kernels/impl/set_value_kernel_impl.h index 5aebffe51b5..99db559f3b8 100644 --- a/paddle/phi/kernels/impl/set_value_kernel_impl.h +++ b/paddle/phi/kernels/impl/set_value_kernel_impl.h @@ -25,7 +25,6 @@ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/slice_utils.h" namespace phi { diff --git a/paddle/phi/kernels/set_value_grad_kernel.h b/paddle/phi/kernels/set_value_grad_kernel.h new file mode 100644 index 00000000000..6a028b0c8dc --- /dev/null +++ b/paddle/phi/kernels/set_value_grad_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
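+
+// Declares the public set_value_grad kernel. Both outputs are optional in
+// practice: the implementation fills x_grad and/or value_grad only when the
+// corresponding pointer is non-null.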
+ +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SetValueGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const ScalarArray& starts, + const ScalarArray& ends, + const ScalarArray& steps, + const std::vector& axes, + const std::vector& decrease_axes, + const std::vector& none_axes, + DenseTensor* x_grad, + DenseTensor* value_grad); + +} // namespace phi diff --git a/paddle/phi/ops/compat/set_value_sig.cc b/paddle/phi/ops/compat/set_value_sig.cc index eacfff26d53..9653250bded 100644 --- a/paddle/phi/ops/compat/set_value_sig.cc +++ b/paddle/phi/ops/compat/set_value_sig.cc @@ -731,6 +731,108 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { } return KernelSignature("unregistered", {}, {}, {}); } + +KernelSignature SetValueGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + if (ctx.HasInput("StartsTensorList")) { + if (ctx.HasInput("EndsTensorList")) { + if (ctx.HasInput("StepsTensorList")) { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"StartsTensorList", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } else { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"StartsTensorList", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } + } else { + if (ctx.HasInput("StepsTensorList")) { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"StartsTensorList", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } else { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"StartsTensorList", + "ends", + "steps", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } + } + } else { + if (ctx.HasInput("EndsTensorList")) { + if (ctx.HasInput("StepsTensorList")) { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"starts", + "EndsTensorList", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } else { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"starts", + "EndsTensorList", + "steps", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } + } else { + if (ctx.HasInput("StepsTensorList")) { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"starts", + "ends", + "StepsTensorList", + "axes", + "decrease_axes", + "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } else { + return KernelSignature( + "set_value_grad", + {GradVarName("Out")}, + {"starts", "ends", "steps", "axes", "decrease_axes", "none_axes"}, + {GradVarName("Input"), GradVarName("ValueTensor")}); + } + } + } +} + } // namespace phi PD_REGISTER_ARG_MAPPING_FN(set_value, phi::SetValueOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(set_value_grad, phi::SetValueGradOpArgumentMapping); diff --git a/paddle/phi/tests/ops/test_op_signature.cc b/paddle/phi/tests/ops/test_op_signature.cc index c74049e0f04..36923972ea4 100644 --- a/paddle/phi/tests/ops/test_op_signature.cc +++ b/paddle/phi/tests/ops/test_op_signature.cc @@ -484,6 +484,71 @@ TEST(ARG_MAP, set_value) { "set_value"); } 
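+// The cases below exercise SetValueGradOpArgumentMapping: whichever of
+// StartsTensorList / EndsTensorList / StepsTensorList is present in the
+// context takes precedence over the corresponding "starts" / "ends" /
+// "steps" attribute.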
+TEST(ARG_MAP, set_value_grad) { + TestArgumentMappingContext arg_case( + {"Out@GRAD", "StartsTensorList", "EndsTensorList"}, + {}, + {}, + {"Input@GRAD", "ValueTensor@GRAD"}, + {}); + ASSERT_EQ(OpUtilsMap::Instance() + .GetArgumentMappingFn("set_value_grad")(arg_case) + .name, + "set_value_grad"); + + TestArgumentMappingContext arg_case1( + {"Out@GRAD", "StartsTensorList", "StepsTensorList"}, + {}, + {}, + {"Input@GRAD", "ValueTensor@GRAD"}, + {}); + ASSERT_EQ(OpUtilsMap::Instance() + .GetArgumentMappingFn("set_value_grad")(arg_case1) + .name, + "set_value_grad"); + + TestArgumentMappingContext arg_case2({"Out@GRAD", "StartsTensorList"}, + {}, + {}, + {"Input@GRAD", "ValueTensor@GRAD"}, + {}); + ASSERT_EQ(OpUtilsMap::Instance() + .GetArgumentMappingFn("set_value_grad")(arg_case2) + .name, + "set_value_grad"); + + TestArgumentMappingContext arg_case3( + {"Out@GRAD", "EndsTensorList", "StepsTensorList"}, + {}, + {}, + {"Input@GRAD", "ValueTensor@GRAD"}, + {}); + ASSERT_EQ(OpUtilsMap::Instance() + .GetArgumentMappingFn("set_value_grad")(arg_case3) + .name, + "set_value_grad"); + + TestArgumentMappingContext arg_case4({"Out@GRAD", "EndsTensorList"}, + {}, + {}, + {"Input@GRAD", "ValueTensor@GRAD"}, + {}); + ASSERT_EQ(OpUtilsMap::Instance() + .GetArgumentMappingFn("set_value_grad")(arg_case4) + .name, + "set_value_grad"); + + TestArgumentMappingContext arg_case5({"Out@GRAD", "StepsTensorList"}, + {}, + {}, + {"Input@GRAD", "ValueTensor@GRAD"}, + {}); + ASSERT_EQ(OpUtilsMap::Instance() + .GetArgumentMappingFn("set_value_grad")(arg_case5) + .name, + "set_value_grad"); +} + TEST(ARG_MAP, allclose) { TestArgumentMappingContext arg_case1( {"Input", "Other", "Rtol"}, -- GitLab From e157f2afb78a9183d71bf03ffd67379040173056 Mon Sep 17 00:00:00 2001 From: Siming Dai <908660116@qq.com> Date: Mon, 14 Mar 2022 20:05:16 +0800 Subject: [PATCH 047/176] [Phi]Add diag_v2 grad kernel (#40447) * Add diag grad kernel * fix unittest case * add float16, remove const & * delete diag_grad in op_utils.h --- paddle/fluid/operators/diag_v2_op.cc | 53 ++++++- paddle/phi/kernels/cpu/diag_grad_kernel.cc | 72 +++++++++ paddle/phi/kernels/cpu/diag_kernel.cc | 11 +- paddle/phi/kernels/diag_grad_kernel.h | 28 ++++ paddle/phi/kernels/gpu/diag_grad_kernel.cu | 139 ++++++++++++++++++ paddle/phi/kernels/gpu/diag_kernel.cu | 11 +- paddle/phi/ops/compat/diag_sig.cc | 7 + .../fluid/tests/unittests/test_diag_v2.py | 8 +- 8 files changed, 316 insertions(+), 13 deletions(-) create mode 100644 paddle/phi/kernels/cpu/diag_grad_kernel.cc create mode 100644 paddle/phi/kernels/diag_grad_kernel.h create mode 100644 paddle/phi/kernels/gpu/diag_grad_kernel.cu diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc index 93fbff67e22..ac8c12bcd7e 100644 --- a/paddle/fluid/operators/diag_v2_op.cc +++ b/paddle/fluid/operators/diag_v2_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include - #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/unary.h" @@ -58,15 +56,56 @@ class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { } }; +class DiagV2GradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "X", "X", "DiagV2Grad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + framework::GradVarName("X"), "DiagV2Grad"); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } +}; + +template +class DiagV2GradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("diag_v2_grad"); + grad_op->SetInput("X", this->Input("X")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagGradV2NoNeedBufferVarsInferer, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; + DECLARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, PD_INFER_META(phi::DiagInferMeta)); -REGISTER_OPERATOR( - diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - DiagInferShapeFunctor); +REGISTER_OPERATOR(diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, + ops::DiagV2GradOpMaker, + ops::DiagV2GradOpMaker, + DiagInferShapeFunctor); + +REGISTER_OPERATOR(diag_v2_grad, ops::DiagV2GradOp, + ops::DiagGradV2NoNeedBufferVarsInferer); diff --git a/paddle/phi/kernels/cpu/diag_grad_kernel.cc b/paddle/phi/kernels/cpu/diag_grad_kernel.cc new file mode 100644 index 00000000000..c56b225e2a7 --- /dev/null +++ b/paddle/phi/kernels/cpu/diag_grad_kernel.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
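Two details of the registration above are easy to miss: the grad op maker passes the forward X into diag_v2_grad purely so InferShape can copy its dims, and DiagGradV2NoNeedBufferVarsInferer then declares that X's data buffer is never read in the backward pass (the gradient depends only on Out@GRAD, as the kernels below show). A compressed restatement of that wiring (names mirror the C++ above; nothing here is a real API):

    def diag_v2_grad_desc(x_name="X", out_name="Out"):
        # Mirrors DiagV2GradOpMaker::Apply: X and Out@GRAD in, X@GRAD out,
        # attributes copied as-is; only X's *shape* is ever consulted.
        return {
            "type": "diag_v2_grad",
            "inputs": {"X": x_name, "Out@GRAD": out_name + "@GRAD"},
            "outputs": {"X@GRAD": x_name + "@GRAD"},
            "no_need_buffer": ["X"],
        }

    print(diag_v2_grad_desc()["no_need_buffer"])   # ['X']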
+ +#include "paddle/phi/kernels/diag_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void DiagGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + int offset, + DenseTensor* x_grad) { + T* dx_data = dev_ctx.template Alloc(x_grad); + const T* dout_data = out_grad.data(); + auto dx_dims = x_grad->dims(); + auto dout_dims = out_grad.dims(); + + if (dx_dims.size() == 1) { + auto dx_length = dx_dims[0]; + int dx_stride = phi::funcs::ComputeStride(0, dx_dims); + + auto dout_stride_0 = phi::funcs::ComputeStride(0, dout_dims); + auto dout_stride_1 = phi::funcs::ComputeStride(1, dout_dims); + dout_data += + (offset >= 0 ? offset * dout_stride_1 : -offset * dout_stride_0); + + for (int i = 0; i < dx_length; i++) { + dx_data[i * dx_stride] = dout_data[i * (dout_stride_0 + dout_stride_1)]; + } + } else { + phi::funcs::SetConstant set_padding_value; + set_padding_value(dev_ctx, x_grad, static_cast(0)); + + int dx_stride_0 = phi::funcs::ComputeStride(0, dx_dims); + int dx_stride_1 = phi::funcs::ComputeStride(1, dx_dims); + auto dout_stride_0 = phi::funcs::ComputeStride(0, dout_dims); + dx_data += (offset >= 0 ? offset * dx_stride_1 : -offset * dx_stride_0); + + auto dout_length = dout_dims[0]; + for (int i = 0; i < dout_length; i++) { + dx_data[i * (dx_stride_0 + dx_stride_1)] = dout_data[i * dout_stride_0]; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(diag_grad, + CPU, + ALL_LAYOUT, + phi::DiagGradKernel, + phi::dtype::float16, + int, + int64_t, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/diag_kernel.cc b/paddle/phi/kernels/cpu/diag_kernel.cc index d1e0b8e31e7..4b060f0372a 100644 --- a/paddle/phi/kernels/cpu/diag_kernel.cc +++ b/paddle/phi/kernels/cpu/diag_kernel.cc @@ -62,5 +62,12 @@ void DiagKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL( - diag, CPU, ALL_LAYOUT, phi::DiagKernel, int, float, double, int64_t) {} +PD_REGISTER_KERNEL(diag, + CPU, + ALL_LAYOUT, + phi::DiagKernel, + phi::dtype::float16, + int, + float, + double, + int64_t) {} diff --git a/paddle/phi/kernels/diag_grad_kernel.h b/paddle/phi/kernels/diag_grad_kernel.h new file mode 100644 index 00000000000..b9edab9bec4 --- /dev/null +++ b/paddle/phi/kernels/diag_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
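The CPU kernel above never builds 2-D indices: after an offset-dependent starting element, it walks the diagonal of the row-major dOut buffer with a single step of dout_stride_0 + dout_stride_1. A quick numpy check of that arithmetic (illustrative values; ComputeStride is assumed to return the usual row-major element strides):

    import numpy as np

    dout = np.arange(16.0).reshape(4, 4)    # row-major, element strides (4, 1)
    offset = 1
    s0, s1 = 4, 1                           # ComputeStride(0, dims), ComputeStride(1, dims)
    start = offset * s1 if offset >= 0 else -offset * s0
    flat = dout.ravel()
    dx_len = dout.shape[0] - abs(offset)    # length of the 1-D dX in this case
    dx = [flat[start + i * (s0 + s1)] for i in range(dx_len)]
    print(dx)                               # [1.0, 6.0, 11.0]
    print(list(np.diagonal(dout, offset)))  # the same diagonal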
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void DiagGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + int offset, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/diag_grad_kernel.cu b/paddle/phi/kernels/gpu/diag_grad_kernel.cu new file mode 100644 index 00000000000..65bf837e6cf --- /dev/null +++ b/paddle/phi/kernels/gpu/diag_grad_kernel.cu @@ -0,0 +1,139 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/diag_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +// Extract the diagonal of a matrix 'dout' to a matrix 'dx' +template +__global__ void ExtractDiagonalKernel(const T* dout, + T* dx, + std::ptrdiff_t start, + std::ptrdiff_t dx_length, + const std::ptrdiff_t sumStride, + const std::ptrdiff_t xStride) { + for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < dx_length; + idx += gridDim.x * blockDim.x) { + const std::ptrdiff_t outOffset = start + sumStride * idx; + dx[xStride * idx] = dout[outOffset]; + } +} + +// Paste a vector 'dout' to the diagonal of a matrix 'dx' +template +__global__ void PasteDiagonalKernel(const T* dout, + T* dx, + std::ptrdiff_t start, + std::ptrdiff_t size, + const std::ptrdiff_t sumStride, + const std::ptrdiff_t outStride) { + for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + std::ptrdiff_t xOffset = start + sumStride * idx; + dx[xOffset] = dout[outStride * idx]; + } +} + +template +void DiagGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + int offset, + DenseTensor* x_grad) { + T* dx_data = dev_ctx.template Alloc(x_grad); + auto* dout_data = out_grad.data(); + auto dx_dims = x_grad->dims(); + auto dout_dims = out_grad.dims(); + + auto GetBlockGridSize = [&dev_ctx](int64_t size) { + const int64_t block_size = + std::min(size, static_cast(dev_ctx.GetMaxThreadsPerBlock())); + int64_t max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (size + block_size - 1) / block_size); + return std::tuple{block_size, grid_size}; + }; + + if (dx_dims.size() == 1) { + auto dx_length = dx_dims[0]; + auto size = (offset > 0) ? dx_length + offset : dx_length - offset; + int dx_stride = phi::funcs::ComputeStride(0, dx_dims); + if (size > 0) { + auto dout_stride_0 = phi::funcs::ComputeStride(0, dout_dims); + auto dout_stride_1 = phi::funcs::ComputeStride(1, dout_dims); + auto start = + (offset >= 0 ? 
offset * dout_stride_1 : -offset * dout_stride_0); + + std::tuple block_grid_size = GetBlockGridSize(size); + ExtractDiagonalKernel<<(block_grid_size), + std::get<0>(block_grid_size), + 0, + dev_ctx.stream()>>>( + dout_data, + dx_data, + start, + dx_length, + dout_stride_0 + dout_stride_1, + dx_stride); + } + } else { + phi::funcs::SetConstant set_padding_value; + set_padding_value(dev_ctx, x_grad, static_cast(0)); + + int dx_stride_0 = phi::funcs::ComputeStride(0, dx_dims); + int dx_stride_1 = phi::funcs::ComputeStride(1, dx_dims); + int64_t size; + if (offset > 0) { + size = std::min(dx_dims[0], dx_dims[1] - offset); + } else { + size = std::min(dx_dims[0] + offset, dx_dims[1]); + } + + if (size > 0) { + auto start = (offset >= 0 ? offset * dx_stride_1 : -offset * dx_stride_0); + auto dout_stride_0 = phi::funcs::ComputeStride(0, dout_dims); + std::tuple block_grid_size = GetBlockGridSize(size); + PasteDiagonalKernel<<(block_grid_size), + std::get<0>(block_grid_size), + 0, + dev_ctx.stream()>>>(dout_data, + dx_data, + start, + size, + dx_stride_0 + dx_stride_1, + dout_stride_0); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(diag_grad, + GPU, + ALL_LAYOUT, + phi::DiagGradKernel, + phi::dtype::float16, + int, + int64_t, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/diag_kernel.cu b/paddle/phi/kernels/gpu/diag_kernel.cu index fc706397871..95d3d3365d9 100644 --- a/paddle/phi/kernels/gpu/diag_kernel.cu +++ b/paddle/phi/kernels/gpu/diag_kernel.cu @@ -130,5 +130,12 @@ void DiagKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL( - diag, GPU, ALL_LAYOUT, phi::DiagKernel, int, int64_t, float, double) {} +PD_REGISTER_KERNEL(diag, + GPU, + ALL_LAYOUT, + phi::DiagKernel, + phi::dtype::float16, + int, + int64_t, + float, + double) {} diff --git a/paddle/phi/ops/compat/diag_sig.cc b/paddle/phi/ops/compat/diag_sig.cc index 0a14b9095c8..f3245b922c0 100644 --- a/paddle/phi/ops/compat/diag_sig.cc +++ b/paddle/phi/ops/compat/diag_sig.cc @@ -20,8 +20,15 @@ KernelSignature DiagOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("diag", {"X"}, {"offset", "padding_value"}, {"Out"}); } +KernelSignature DiagGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "diag_grad", {"X", GradVarName("Out")}, {"offset"}, {GradVarName("X")}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(diag_v2, diag); +PD_REGISTER_BASE_KERNEL_NAME(diag_v2_grad, diag_grad); PD_REGISTER_ARG_MAPPING_FN(diag_v2, phi::DiagOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(diag_v2_grad, phi::DiagGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/test_diag_v2.py b/python/paddle/fluid/tests/unittests/test_diag_v2.py index 0371fa05428..9f727608f81 100644 --- a/python/paddle/fluid/tests/unittests/test_diag_v2.py +++ b/python/paddle/fluid/tests/unittests/test_diag_v2.py @@ -44,6 +44,10 @@ class TestDiagV2Op(OpTest): paddle.enable_static() self.check_output(check_eager=True) + def test_check_grad(self): + paddle.enable_static() + self.check_grad(['X'], 'Out', check_eager=True) + def init_config(self): pass @@ -62,14 +66,14 @@ class TestDiagV2OpCase2(TestDiagV2Op): class TestDiagV2OpCase3(TestDiagV2Op): def init_config(self): - self.x = np.random.randint(-10, 10, size=(10, 10)) + self.x = np.random.randint(-10, 10, size=(10, 10)).astype("float64") self.out = np.diag(self.x, self.offset) class TestDiagV2OpCase4(TestDiagV2Op): def init_config(self): self.x = np.random.rand(100) - self.padding_value = 8 + self.padding_value = 2 n = 
self.x.size self.out = self.padding_value * np.ones((n, n)) + np.diag( self.x, self.offset) - np.diag(self.padding_value * np.ones(n)) -- GitLab From 5d08a4471973e1c2b2a595781d0a0840875a0c77 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 15 Mar 2022 00:00:04 +0800 Subject: [PATCH 048/176] move allclose infershape (#40508) --- paddle/fluid/operators/allclose_op.cc | 41 ++++------------------ paddle/phi/infermeta/binary.cc | 50 +++++++++++++++++++++++++++ paddle/phi/infermeta/binary.h | 5 +++ 3 files changed, 61 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/operators/allclose_op.cc b/paddle/fluid/operators/allclose_op.cc index 706a132878d..88d7cb7c1f5 100644 --- a/paddle/fluid/operators/allclose_op.cc +++ b/paddle/fluid/operators/allclose_op.cc @@ -15,10 +15,13 @@ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -61,40 +64,6 @@ class AllcloseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Allclose"); - OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Allclose"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Allclose"); - - auto input_dim = ctx->GetInputDim("Input"); - auto other_dim = ctx->GetInputDim("Other"); - PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(), - platform::errors::PreconditionNotMet( - "Input(Input) and Input(Other) must have the same " - "dimension size.")); - int n = input_dim.size(); - bool is_runtime = ctx->IsRuntime(); - for (int i = 0; i < n; i++) { - if (is_runtime) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } else { - if (!(input_dim[i] < 0 || other_dim[i] < 0)) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } - } - } - - ctx->SetOutputDim("Out", phi::make_ddim({1})); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -117,11 +86,13 @@ class AllcloseOpVarTypeInference : public framework::VarTypeInference { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(allclose, AllcloseInferShapeFunctor, + PD_INFER_META(phi::AllValueCompareInferMeta)); REGISTER_OPERATOR( allclose, ops::AllcloseOp, ops::AllcloseOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, - ops::AllcloseOpVarTypeInference); + ops::AllcloseOpVarTypeInference, AllcloseInferShapeFunctor); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(allclose) diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index b9d43224456..2947661517e 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -21,6 +21,56 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/common_shape.h" namespace phi { +namespace detail { + +static void BinarySameInputDimsCheck(const MetaTensor& x, + const MetaTensor& y, + MetaConfig config) { + auto input_dim = x.dims(); + auto other_dim = y.dims(); + PADDLE_ENFORCE_EQ(input_dim.size(), + other_dim.size(), + phi::errors::PreconditionNotMet( + "Input(Input) and Input(Other) must have the same " + "dimension size.")); + int n = input_dim.size(); + bool is_runtime = config.is_runtime; + for (int i = 0; i < n; i++) { + if (is_runtime) { + PADDLE_ENFORCE_EQ(input_dim[i], + other_dim[i], + phi::errors::PreconditionNotMet( + "The value at dim %d of Input(Input) is not " + "equal to the Input(Other): %ld != %ld.", + i, + input_dim[i], + other_dim[i])); + } else { + if (!(input_dim[i] < 0 || other_dim[i] < 0)) { + PADDLE_ENFORCE_EQ(input_dim[i], + other_dim[i], + phi::errors::PreconditionNotMet( + "The value at dim %d of Input(Input) is not " + "equal to the Input(Other): %ld != %ld.", + i, + input_dim[i], + other_dim[i])); + } + } + } +} + +} // namespace detail + +void AllValueCompareInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config) { + detail::BinarySameInputDimsCheck(x, y, config); + + out->set_dims(phi::make_ddim({1})); + out->set_dtype(DataType::BOOL); +} void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { out->share_meta(x); diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 307ecc29cac..cfae45cf04b 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -29,6 +29,11 @@ namespace phi { // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. +void AllValueCompareInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); void BCELossInferMeta(const MetaTensor& input, -- GitLab From f181d47f250bd801111419bdac6b6abb806a208b Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Tue, 15 Mar 2022 08:43:29 +0800 Subject: [PATCH 049/176] [Phi]Move kron kernel to phi (#40427) * first commit * fix * fix * fix compile eeror * fix * fix complex * fix * fix * fix npu * fix * modify accroding to comments * fix --- paddle/fluid/operators/gather_op_npu.cc | 1 - paddle/fluid/operators/kron_op.cc | 27 +- paddle/fluid/operators/kron_op.cu | 42 -- paddle/fluid/operators/kron_op.h | 415 ------------------ paddle/fluid/operators/scatter_op_npu.cc | 2 +- paddle/phi/kernels/cpu/kron_grad_kernel.cc | 31 ++ paddle/phi/kernels/cpu/kron_kernel.cc | 31 ++ paddle/phi/kernels/gpu/kron_grad_kernel.cu | 31 ++ paddle/phi/kernels/gpu/kron_kernel.cu | 31 ++ .../phi/kernels/impl/kron_grad_kernel_impl.h | 295 +++++++++++++ paddle/phi/kernels/impl/kron_kernel_impl.h | 167 +++++++ paddle/phi/kernels/kron_grad_kernel.h | 29 ++ paddle/phi/kernels/kron_kernel.h | 27 ++ paddle/phi/ops/compat/kron_sig.cc | 28 ++ 14 files changed, 672 insertions(+), 485 deletions(-) delete mode 100644 paddle/fluid/operators/kron_op.cu delete mode 100644 paddle/fluid/operators/kron_op.h create mode 100644 paddle/phi/kernels/cpu/kron_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/kron_kernel.cc create mode 100644 paddle/phi/kernels/gpu/kron_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/kron_kernel.cu create mode 100644 paddle/phi/kernels/impl/kron_grad_kernel_impl.h create mode 100644 
paddle/phi/kernels/impl/kron_kernel_impl.h create mode 100644 paddle/phi/kernels/kron_grad_kernel.h create mode 100644 paddle/phi/kernels/kron_kernel.h create mode 100644 paddle/phi/ops/compat/kron_sig.cc diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc index a83abb24522..21093f585b5 100644 --- a/paddle/fluid/operators/gather_op_npu.cc +++ b/paddle/fluid/operators/gather_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/kron_op.h" #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/kron_op.cc b/paddle/fluid/operators/kron_op.cc index 58d51ab1c72..68d0c7978b4 100644 --- a/paddle/fluid/operators/kron_op.cc +++ b/paddle/fluid/operators/kron_op.cc @@ -17,9 +17,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -178,27 +176,4 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(kron, ops::KronOp, ops::KronOpMaker, ops::KronGradOpMaker, ops::KronGradOpMaker); -REGISTER_OP_CPU_KERNEL( - kron, ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel>, - ops::KronKernel>); - REGISTER_OPERATOR(kron_grad, ops::KronGradOp); -REGISTER_OP_CPU_KERNEL( - kron_grad, ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel>, - ops::KronGradKernel>); diff --git a/paddle/fluid/operators/kron_op.cu b/paddle/fluid/operators/kron_op.cu deleted file mode 100644 index e5124e65007..00000000000 --- a/paddle/fluid/operators/kron_op.cu +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - kron, ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel>, - ops::KronKernel>); - -REGISTER_OP_CUDA_KERNEL( - kron_grad, ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel>, - ops::KronGradKernel>); diff --git a/paddle/fluid/operators/kron_op.h b/paddle/fluid/operators/kron_op.h deleted file mode 100644 index 274b47c03a4..00000000000 --- a/paddle/fluid/operators/kron_op.h +++ /dev/null @@ -1,415 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" -#if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "thrust/device_vector.h" -#endif - -namespace paddle { -namespace operators { - -// Process an element in the output, used with a parallel-for -template -struct KronElemFunctor { - KronElemFunctor(const T* a, const T* b, T* out, const int64_t* shape_b, - const int64_t* stride_a, const int64_t* stride_b, - const int64_t* stride_out, int ndims) - : a_(a), - b_(b), - out_(out), - shape_b_(shape_b), - stride_a_(stride_a), - stride_b_(stride_b), - stride_out_(stride_out), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) const { - // it computes 1 element in the output - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_out_[i]; - index = index % stride_out_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - out_[idx] = a_[index_a] * b_[index_b]; - } - - private: - const T* a_; - const T* b_; - T* out_; - const int64_t* shape_b_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* stride_out_; - const int ndims_; -}; - -template -struct KronOpFunctor { - void operator()(const DeviceContext& dev_ctx, const framework::Tensor& x, - const framework::Tensor& y, framework::Tensor* out) { - int ndims = out->dims().size(); - int64_t numel = out->numel(); - - const framework::DDim& dim_x = x.dims(); - const framework::DDim& dim_y = y.dims(); - const framework::DDim& dim_out = out->dims(); - const framework::DDim stride_x = phi::stride(dim_x); - const framework::DDim stride_y = phi::stride(dim_y); - const framework::DDim stride_out = phi::stride(dim_out); - - const int64_t *p_stride_x = nullptr, *p_stride_y = nullptr, - *p_stride_out = nullptr, *p_shape_y = nullptr; -#if defined(__NVCC__) || defined(__HIPCC__) - thrust::device_vector d_stride_x(ndims); - thrust::device_vector d_stride_y(ndims); - thrust::device_vector d_stride_out(ndims); - thrust::device_vector d_shape_y(ndims); - thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin()); - thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin()); - thrust::copy(stride_out.Get(), stride_out.Get() + ndims, - d_stride_out.begin()); - thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin()); - - p_stride_x = thrust::raw_pointer_cast(d_stride_x.data()); - p_stride_y = thrust::raw_pointer_cast(d_stride_y.data()); - p_stride_out = thrust::raw_pointer_cast(d_stride_out.data()); - p_shape_y = thrust::raw_pointer_cast(d_shape_y.data()); -#else - p_stride_x = stride_x.Get(); - p_stride_y = stride_y.Get(); - p_stride_out = stride_out.Get(); - p_shape_y = dim_y.Get(); -#endif - - platform::ForRange for_range(dev_ctx, numel); - KronElemFunctor functor(x.data(), y.data(), out->data(), - p_shape_y, p_stride_x, p_stride_y, 
p_stride_out, - ndims); - for_range(functor); - } -}; - -template -struct KronGradElemFunctor { - KronGradElemFunctor(const T* dout, const T* A, const T* B, T* dout_a, - T* dout_b, const int64_t* stride_dout, - const int64_t* stride_a, const int64_t* stride_b, - const int64_t* shape_b, const int64_t numel_a, - const int64_t numel_b, const int ndims) - : dout_(dout), - A_(A), - B_(B), - dout_a_(dout_a), - dout_b_(dout_b), - stride_dout_(stride_dout), - stride_a_(stride_a), - stride_b_(stride_b), - shape_b_(shape_b), - numel_a_(numel_a), - numel_b_(numel_b), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) { - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_dout_[i]; - index = index % stride_dout_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - - if (dout_a_) { - size_t index_out_a = index_a * numel_b_ + index_b; - dout_a_[index_out_a] = dout_[idx] * B_[index_b]; - } - if (dout_b_) { - size_t index_out_b = index_b * numel_a_ + index_a; - dout_b_[index_out_b] = dout_[idx] * A_[index_a]; - } - } - - private: - const T* dout_; - const T* A_; - const T* B_; - T* dout_a_; - T* dout_b_; - const int64_t* stride_dout_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* shape_b_; - const int64_t numel_a_; - const int64_t numel_b_; - const int ndims_; -}; - -template -struct KronGradElemFunctor> { - KronGradElemFunctor(const platform::complex* dout, - const platform::complex* A, - const platform::complex* B, - platform::complex* dout_a, - platform::complex* dout_b, const int64_t* stride_dout, - const int64_t* stride_a, const int64_t* stride_b, - const int64_t* shape_b, const int64_t numel_a, - const int64_t numel_b, const int ndims) - : dout_(dout), - A_(A), - B_(B), - dout_a_(dout_a), - dout_b_(dout_b), - stride_dout_(stride_dout), - stride_a_(stride_a), - stride_b_(stride_b), - shape_b_(shape_b), - numel_a_(numel_a), - numel_b_(numel_b), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) { - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_dout_[i]; - index = index % stride_dout_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - - if (dout_a_) { - size_t index_out_a = index_a * numel_b_ + index_b; - dout_a_[index_out_a] = - dout_[idx] * - platform::complex(B_[index_b].real, -B_[index_b].imag); - } - if (dout_b_) { - size_t index_out_b = index_b * numel_a_ + index_a; - dout_b_[index_out_b] = - dout_[idx] * - platform::complex(A_[index_a].real, -A_[index_a].imag); - } - } - - private: - const platform::complex* dout_; - const platform::complex* A_; - const platform::complex* B_; - platform::complex* dout_a_; - platform::complex* dout_b_; - const int64_t* stride_dout_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* shape_b_; - const int64_t numel_a_; - const int64_t numel_b_; - const int ndims_; -}; - -template -struct KronGradOpFunctor { - void operator()(const DeviceContext& dev_ctx, const framework::Tensor& dout, - const framework::Tensor& x, const framework::Tensor& y, - framework::Tensor* dx, framework::Tensor* dy) { - int ndims = dout.dims().size(); - int64_t numel = dout.numel(); - int64_t numel_x = x.numel(); - int64_t numel_y = y.numel(); - - 
const framework::DDim& dim_x = x.dims(); - const framework::DDim& dim_y = y.dims(); - const framework::DDim& dim_dout = dout.dims(); - - const framework::DDim stride_x = phi::stride(dim_x); - const framework::DDim stride_y = phi::stride(dim_y); - const framework::DDim stride_dout = phi::stride(dim_dout); - - const int64_t* p_stride_x = nullptr; - const int64_t* p_stride_y = nullptr; - const int64_t* p_stride_dout = nullptr; - const int64_t* p_shape_y = nullptr; -#if defined(__NVCC__) || defined(__HIPCC__) - thrust::device_vector d_stride_x(ndims); - thrust::device_vector d_stride_y(ndims); - thrust::device_vector d_stride_dout(ndims); - thrust::device_vector d_shape_y(ndims); - thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin()); - thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin()); - thrust::copy(stride_dout.Get(), stride_dout.Get() + ndims, - d_stride_dout.begin()); - thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin()); - - p_stride_x = thrust::raw_pointer_cast(d_stride_x.data()); - p_stride_y = thrust::raw_pointer_cast(d_stride_y.data()); - p_stride_dout = thrust::raw_pointer_cast(d_stride_dout.data()); - p_shape_y = thrust::raw_pointer_cast(d_shape_y.data()); -#else - p_stride_x = stride_x.Get(); - p_stride_y = stride_y.Get(); - p_stride_dout = stride_dout.Get(); - p_shape_y = dim_y.Get(); -#endif - // dout_x: dout * kron(ones(X), Y) re-aranged in shape (numel_x, numel_y) - // dout_y: dout * kron(X, ones(Y)) re-aranged in shaoe (numel_y, numel_x) - framework::Tensor dout_x; - T* p_dout_x = nullptr; - if (dx) { - dout_x.mutable_data({numel_x, numel_y}, dev_ctx.GetPlace()); - p_dout_x = dout_x.data(); - } - framework::Tensor dout_y; - T* p_dout_y = nullptr; - if (dy) { - dout_y.mutable_data({numel_y, numel_x}, dev_ctx.GetPlace()); - p_dout_y = dout_y.data(); - } - - platform::ForRange for_range(dev_ctx, numel); - KronGradElemFunctor func(dout.data(), x.data(), y.data(), - p_dout_x, p_dout_y, p_stride_dout, p_stride_x, - p_stride_y, p_shape_y, numel_x, numel_y, ndims); - for_range(func); - -// reduce_sum along aixs 1 -#if defined(__NVCC__) || defined(__HIPCC__) - auto stream = dev_ctx.stream(); // it is a cuda device_context - if (dx) { - TensorReduceImpl>( - dev_ctx, dout_x, dx, kps::IdentityFunctor(), {1}, stream); - } - if (dy) { - TensorReduceImpl>( - dev_ctx, dout_y, dy, kps::IdentityFunctor(), {1}, stream); - } -#else - auto* place = dev_ctx.eigen_device(); - Eigen::array reduce_dim = {1}; - if (dx) { - auto eigen_dout_x = framework::EigenMatrix::Reshape(dout_x, 1); - auto eigen_vec_dx = framework::EigenVector::Flatten(*dx); - eigen_vec_dx.device(*place) = eigen_dout_x.sum(reduce_dim); - } - if (dy) { - auto eigen_dout_y = framework::EigenMatrix::Reshape(dout_y, 1); - auto eigen_vec_dy = framework::EigenVector::Flatten(*dy); - eigen_vec_dy.device(*place) = eigen_dout_y.sum(reduce_dim); - } -#endif - } -}; - -inline framework::Tensor UnsqueezeTo(const framework::Tensor& src, int ndims) { - const framework::DDim& shape = src.dims(); - int rank = shape.size(); - framework::Tensor res; - res.ShareDataWith(src); - PADDLE_ENFORCE_LE( - rank, ndims, - platform::errors::InvalidArgument( - "The input Tensor's rank should be less than or equal to ndims" - "Received input Tensor's rank = %d, ndims = %d", - rank, ndims)); - if (rank < ndims) { - std::vector new_dim(ndims, 1); - for (int i = ndims - rank; i < ndims; i++) { - new_dim[i] = shape[i - ndims + rank]; - } - res.Resize(phi::make_ddim(new_dim)); - } - return res; -} - -template 
-class KronKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int ndims = out->dims().size(); - framework::Tensor xx = UnsqueezeTo(*x, ndims); - framework::Tensor yy = UnsqueezeTo(*y, ndims); - - KronOpFunctor func; - func(dev_ctx, xx, yy, out); - } -}; - -template -class KronGradKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - if (dx) { - dx->mutable_data(ctx.GetPlace()); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - } - - int ndims = dout->dims().size(); - framework::Tensor xx = UnsqueezeTo(*x, ndims); - framework::Tensor yy = UnsqueezeTo(*y, ndims); - - framework::Tensor* pdxx = nullptr; - framework::Tensor* pdyy = nullptr; - framework::Tensor dxx; - framework::Tensor dyy; - if (dx) { - dxx = UnsqueezeTo(*dx, ndims); - pdxx = &dxx; - } - - if (dy) { - dyy = UnsqueezeTo(*dy, ndims); - pdyy = &dyy; - } - - KronGradOpFunctor func; - func(dev_ctx, *dout, xx, yy, pdxx, pdyy); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc index 815984ac307..d5ef95269b4 100644 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ b/paddle/fluid/operators/scatter_op_npu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/kron_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/phi/kernels/cpu/kron_grad_kernel.cc b/paddle/phi/kernels/cpu/kron_grad_kernel.cc new file mode 100644 index 00000000000..01f5e5404b6 --- /dev/null +++ b/paddle/phi/kernels/cpu/kron_grad_kernel.cc @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kron_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kron_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(kron_grad, + CPU, + ALL_LAYOUT, + phi::KronGradKernel, + int, + int64_t, + float, + double, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/kron_kernel.cc b/paddle/phi/kernels/cpu/kron_kernel.cc new file mode 100644 index 00000000000..aaea509dc76 --- /dev/null +++ b/paddle/phi/kernels/cpu/kron_kernel.cc @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kron_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kron_kernel_impl.h" + +PD_REGISTER_KERNEL(kron, + CPU, + ALL_LAYOUT, + phi::KronKernel, + int, + int64_t, + float, + double, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/kron_grad_kernel.cu b/paddle/phi/kernels/gpu/kron_grad_kernel.cu new file mode 100644 index 00000000000..13ef2adaab3 --- /dev/null +++ b/paddle/phi/kernels/gpu/kron_grad_kernel.cu @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kron_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kron_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(kron_grad, + GPU, + ALL_LAYOUT, + phi::KronGradKernel, + int, + int64_t, + float, + double, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/kron_kernel.cu b/paddle/phi/kernels/gpu/kron_kernel.cu new file mode 100644 index 00000000000..a2124fd5af7 --- /dev/null +++ b/paddle/phi/kernels/gpu/kron_kernel.cu @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
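The CPU and GPU registrations above all route into the shared impl headers that follow, so it is worth restating what they compute: every output element of the Kronecker product is one element of X times one element of Y, laid out as blocks of Y scaled by the entries of X. A numpy reminder of that layout (illustration only):

    import numpy as np

    x = np.array([[1.0, 2.0],
                  [3.0, 4.0]])
    y = np.array([[0.0, 10.0],
                  [20.0, 30.0]])
    out = np.kron(x, y)
    p, q = y.shape
    i, j, k, l = 1, 0, 0, 1
    assert out[i * p + k, j * q + l] == x[i, j] * y[k, l]   # 3 * 10
    print(out.shape)                                        # (4, 4)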
+ +#include "paddle/phi/kernels/kron_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kron_kernel_impl.h" + +PD_REGISTER_KERNEL(kron, + GPU, + ALL_LAYOUT, + phi::KronKernel, + int, + int64_t, + float, + double, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/impl/kron_grad_kernel_impl.h b/paddle/phi/kernels/impl/kron_grad_kernel_impl.h new file mode 100644 index 00000000000..30297b53eab --- /dev/null +++ b/paddle/phi/kernels/impl/kron_grad_kernel_impl.h @@ -0,0 +1,295 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/impl/kron_kernel_impl.h" + +namespace phi { + +template +struct KronGradElemFunctor { + KronGradElemFunctor(const T* dout, + const T* A, + const T* B, + T* dout_a, + T* dout_b, + const int64_t* stride_dout, + const int64_t* stride_a, + const int64_t* stride_b, + const int64_t* shape_b, + const int64_t numel_a, + const int64_t numel_b, + const int ndims) + : dout_(dout), + A_(A), + B_(B), + dout_a_(dout_a), + dout_b_(dout_b), + stride_dout_(stride_dout), + stride_a_(stride_a), + stride_b_(stride_b), + shape_b_(shape_b), + numel_a_(numel_a), + numel_b_(numel_b), + ndims_(ndims) {} + + HOSTDEVICE void operator()(int64_t idx) { + int64_t index = idx; + int64_t index_a = 0; + int64_t index_b = 0; + for (int i = 0; i < ndims_; i++) { + auto pos_i = index / stride_dout_[i]; + index = index % stride_dout_[i]; + auto pos_ai = pos_i / shape_b_[i]; + auto pos_bi = pos_i % shape_b_[i]; + index_a += stride_a_[i] * pos_ai; + index_b += stride_b_[i] * pos_bi; + } + + if (dout_a_) { + size_t index_out_a = index_a * numel_b_ + index_b; + dout_a_[index_out_a] = dout_[idx] * B_[index_b]; + } + if (dout_b_) { + size_t index_out_b = index_b * numel_a_ + index_a; + dout_b_[index_out_b] = dout_[idx] * A_[index_a]; + } + } + + private: + const T* dout_; + const T* A_; + const T* B_; + T* dout_a_; + T* dout_b_; + const int64_t* stride_dout_; + const int64_t* stride_a_; + const int64_t* stride_b_; + const int64_t* shape_b_; + const int64_t numel_a_; + const int64_t numel_b_; + const int ndims_; +}; + +template +struct KronGradElemFunctor> { + KronGradElemFunctor(const dtype::complex* dout, + const dtype::complex* A, + const dtype::complex* B, + dtype::complex* dout_a, + dtype::complex* dout_b, + const int64_t* stride_dout, + const int64_t* stride_a, + const int64_t* stride_b, + const int64_t* shape_b, + const int64_t numel_a, + const int64_t numel_b, + const int ndims) + : dout_(dout), + A_(A), + B_(B), + dout_a_(dout_a), + dout_b_(dout_b), + stride_dout_(stride_dout), + stride_a_(stride_a), + stride_b_(stride_b), + shape_b_(shape_b), + numel_a_(numel_a), + numel_b_(numel_b), + ndims_(ndims) {} + + HOSTDEVICE void operator()(int64_t idx) { + int64_t index = idx; + int64_t index_a = 0; + int64_t index_b = 0; + for (int i = 0; i < 
ndims_; i++) { + auto pos_i = index / stride_dout_[i]; + index = index % stride_dout_[i]; + auto pos_ai = pos_i / shape_b_[i]; + auto pos_bi = pos_i % shape_b_[i]; + index_a += stride_a_[i] * pos_ai; + index_b += stride_b_[i] * pos_bi; + } + + if (dout_a_) { + size_t index_out_a = index_a * numel_b_ + index_b; + dout_a_[index_out_a] = + dout_[idx] * dtype::complex(B_[index_b].real, -B_[index_b].imag); + } + if (dout_b_) { + size_t index_out_b = index_b * numel_a_ + index_a; + dout_b_[index_out_b] = + dout_[idx] * dtype::complex(A_[index_a].real, -A_[index_a].imag); + } + } + + private: + const dtype::complex* dout_; + const dtype::complex* A_; + const dtype::complex* B_; + dtype::complex* dout_a_; + dtype::complex* dout_b_; + const int64_t* stride_dout_; + const int64_t* stride_a_; + const int64_t* stride_b_; + const int64_t* shape_b_; + const int64_t numel_a_; + const int64_t numel_b_; + const int ndims_; +}; + +template +struct KronGradOpFunctor { + void operator()(const Context& dev_ctx, + const DenseTensor& dout, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* dx, + DenseTensor* dy) { + int ndims = dout.dims().size(); + int64_t numel = dout.numel(); + int64_t numel_x = x.numel(); + int64_t numel_y = y.numel(); + + const phi::DDim& dim_x = x.dims(); + const phi::DDim& dim_y = y.dims(); + const phi::DDim& dim_dout = dout.dims(); + + const phi::DDim stride_x = phi::stride(dim_x); + const phi::DDim stride_y = phi::stride(dim_y); + const phi::DDim stride_dout = phi::stride(dim_dout); + + const int64_t* p_stride_x = nullptr; + const int64_t* p_stride_y = nullptr; + const int64_t* p_stride_dout = nullptr; + const int64_t* p_shape_y = nullptr; +#if defined(__NVCC__) || defined(__HIPCC__) + thrust::device_vector d_stride_x(ndims); + thrust::device_vector d_stride_y(ndims); + thrust::device_vector d_stride_dout(ndims); + thrust::device_vector d_shape_y(ndims); + thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin()); + thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin()); + thrust::copy( + stride_dout.Get(), stride_dout.Get() + ndims, d_stride_dout.begin()); + thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin()); + + p_stride_x = thrust::raw_pointer_cast(d_stride_x.data()); + p_stride_y = thrust::raw_pointer_cast(d_stride_y.data()); + p_stride_dout = thrust::raw_pointer_cast(d_stride_dout.data()); + p_shape_y = thrust::raw_pointer_cast(d_shape_y.data()); +#else + p_stride_x = stride_x.Get(); + p_stride_y = stride_y.Get(); + p_stride_dout = stride_dout.Get(); + p_shape_y = dim_y.Get(); +#endif + // dout_x: dout * kron(ones(X), Y) re-aranged in shape (numel_x, numel_y) + // dout_y: dout * kron(X, ones(Y)) re-aranged in shaoe (numel_y, numel_x) + DenseTensor dout_x; + T* p_dout_x = nullptr; + if (dx) { + dout_x.Resize({numel_x, numel_y}); + dev_ctx.template Alloc(&dout_x); + p_dout_x = dout_x.data(); + } + DenseTensor dout_y; + T* p_dout_y = nullptr; + if (dy) { + dout_y.Resize({numel_y, numel_x}); + dev_ctx.template Alloc(&dout_y); + p_dout_y = dout_y.data(); + } + + funcs::ForRange for_range(dev_ctx, numel); + KronGradElemFunctor func(dout.data(), + x.data(), + y.data(), + p_dout_x, + p_dout_y, + p_stride_dout, + p_stride_x, + p_stride_y, + p_shape_y, + numel_x, + numel_y, + ndims); + for_range(func); + +// reduce_sum along aixs 1 +#if defined(__NVCC__) || defined(__HIPCC__) + auto stream = dev_ctx.stream(); // it is a cuda device_context + if (dx) { + funcs::ReduceKernel>( + dev_ctx, dout_x, dx, kps::IdentityFunctor(), {1}); + } + 
if (dy) { + funcs::ReduceKernel>( + dev_ctx, dout_y, dy, kps::IdentityFunctor(), {1}); + } +#else + auto* place = dev_ctx.eigen_device(); + Eigen::array reduce_dim = {1}; + if (dx) { + auto eigen_dout_x = EigenMatrix::Reshape(dout_x, 1); + auto eigen_vec_dx = EigenVector::Flatten(*dx); + eigen_vec_dx.device(*place) = eigen_dout_x.sum(reduce_dim); + } + if (dy) { + auto eigen_dout_y = EigenMatrix::Reshape(dout_y, 1); + auto eigen_vec_dy = EigenVector::Flatten(*dy); + eigen_vec_dy.device(*place) = eigen_dout_y.sum(reduce_dim); + } +#endif + } +}; + +template +void KronGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + if (x_grad) { + ctx.template Alloc(x_grad); + } + if (y_grad) { + ctx.template Alloc(y_grad); + } + + int ndims = out_grad.dims().size(); + DenseTensor xx = UnsqueezeTo(x, ndims); + DenseTensor yy = UnsqueezeTo(y, ndims); + + DenseTensor* pdxx = nullptr; + DenseTensor* pdyy = nullptr; + DenseTensor dxx; + DenseTensor dyy; + if (x_grad) { + dxx = UnsqueezeTo(*x_grad, ndims); + pdxx = &dxx; + } + + if (y_grad) { + dyy = UnsqueezeTo(*y_grad, ndims); + pdyy = &dyy; + } + + KronGradOpFunctor func; + func(ctx, out_grad, xx, yy, pdxx, pdyy); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/kron_kernel_impl.h b/paddle/phi/kernels/impl/kron_kernel_impl.h new file mode 100644 index 00000000000..47c76f59df2 --- /dev/null +++ b/paddle/phi/kernels/impl/kron_kernel_impl.h @@ -0,0 +1,167 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
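The grad functor above writes dOut * Y into a (numel_x, numel_y) scratch tensor (and dOut * X into its counterpart) and then reduces over axis 1, which is equivalent to dX[i, j] = sum over k, l of dOut[i*p + k, j*q + l] * Y[k, l]. A numpy check of that identity against a finite difference of L = sum(dOut * kron(X, Y)) (editorial verification code, not part of the patch):

    import numpy as np

    rng = np.random.default_rng(0)
    x = rng.standard_normal((2, 3))
    y = rng.standard_normal((4, 2))
    dout = rng.standard_normal((8, 6))
    p, q = y.shape

    blocks = dout.reshape(x.shape[0], p, x.shape[1], q)   # indexed [i, k, j, l]
    dx = np.einsum('ikjl,kl->ij', blocks, y)              # what the axis-1 reduction yields
    dy = np.einsum('ikjl,ij->kl', blocks, x)

    eps = 1e-6
    num = np.zeros_like(x)
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            e = np.zeros_like(x); e[i, j] = eps
            num[i, j] = (np.sum(dout * np.kron(x + e, y)) -
                         np.sum(dout * np.kron(x, y))) / eps
    print(np.allclose(dx, num, atol=1e-4))                 # True

The same check passes for dy with the roles of X and Y exchanged, which is the second branch of the functor.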
+ +#pragma once + +#include +#include + +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "thrust/device_vector.h" +#endif + +namespace phi { + +inline DenseTensor UnsqueezeTo(const DenseTensor& src, int ndims) { + const phi::DDim& shape = src.dims(); + int rank = shape.size(); + DenseTensor res; + res.ShareDataWith(src); + PADDLE_ENFORCE_LE( + rank, + ndims, + errors::InvalidArgument( + "The input Tensor's rank should be less than or equal to ndims" + "Received input Tensor's rank = %d, ndims = %d", + rank, + ndims)); + if (rank < ndims) { + std::vector new_dim(ndims, 1); + for (int i = ndims - rank; i < ndims; i++) { + new_dim[i] = shape[i - ndims + rank]; + } + res.Resize(phi::make_ddim(new_dim)); + } + return res; +} + +template +struct KronElemFunctor { + KronElemFunctor(const T* a, + const T* b, + T* out, + const int64_t* shape_b, + const int64_t* stride_a, + const int64_t* stride_b, + const int64_t* stride_out, + int ndims) + : a_(a), + b_(b), + out_(out), + shape_b_(shape_b), + stride_a_(stride_a), + stride_b_(stride_b), + stride_out_(stride_out), + ndims_(ndims) {} + + HOSTDEVICE void operator()(int64_t idx) const { + // it computes 1 element in the output + int64_t index = idx; + int64_t index_a = 0; + int64_t index_b = 0; + for (int i = 0; i < ndims_; i++) { + auto pos_i = index / stride_out_[i]; + index = index % stride_out_[i]; + auto pos_ai = pos_i / shape_b_[i]; + auto pos_bi = pos_i % shape_b_[i]; + index_a += stride_a_[i] * pos_ai; + index_b += stride_b_[i] * pos_bi; + } + out_[idx] = a_[index_a] * b_[index_b]; + } + + private: + const T* a_; + const T* b_; + T* out_; + const int64_t* shape_b_; + const int64_t* stride_a_; + const int64_t* stride_b_; + const int64_t* stride_out_; + const int ndims_; +}; + +template +struct KronOpFunctor { + void operator()(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int ndims = out->dims().size(); + int64_t numel = out->numel(); + + const phi::DDim& dim_x = x.dims(); + const phi::DDim& dim_y = y.dims(); + const phi::DDim& dim_out = out->dims(); + const phi::DDim stride_x = phi::stride(dim_x); + const phi::DDim stride_y = phi::stride(dim_y); + const phi::DDim stride_out = phi::stride(dim_out); + + const int64_t *p_stride_x = nullptr, *p_stride_y = nullptr, + *p_stride_out = nullptr, *p_shape_y = nullptr; +#if defined(__NVCC__) || defined(__HIPCC__) + thrust::device_vector d_stride_x(ndims); + thrust::device_vector d_stride_y(ndims); + thrust::device_vector d_stride_out(ndims); + thrust::device_vector d_shape_y(ndims); + thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin()); + thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin()); + thrust::copy( + stride_out.Get(), stride_out.Get() + ndims, d_stride_out.begin()); + thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin()); + + p_stride_x = thrust::raw_pointer_cast(d_stride_x.data()); + p_stride_y = thrust::raw_pointer_cast(d_stride_y.data()); + p_stride_out = thrust::raw_pointer_cast(d_stride_out.data()); + p_shape_y = thrust::raw_pointer_cast(d_shape_y.data()); +#else + p_stride_x = stride_x.Get(); + p_stride_y = stride_y.Get(); + p_stride_out = stride_out.Get(); + p_shape_y = dim_y.Get(); +#endif + + funcs::ForRange for_range(dev_ctx, numel); + KronElemFunctor 
functor(x.data(), + y.data(), + out->data(), + p_shape_y, + p_stride_x, + p_stride_y, + p_stride_out, + ndims); + for_range(functor); + } +}; + +template +void KronKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + ctx.template Alloc(out); + + int ndims = out->dims().size(); + DenseTensor xx = UnsqueezeTo(x, ndims); + DenseTensor yy = UnsqueezeTo(y, ndims); + + KronOpFunctor func; + func(ctx, xx, yy, out); +} + +} // namespace phi diff --git a/paddle/phi/kernels/kron_grad_kernel.h b/paddle/phi/kernels/kron_grad_kernel.h new file mode 100644 index 00000000000..3daa9dcfba9 --- /dev/null +++ b/paddle/phi/kernels/kron_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void KronGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/kron_kernel.h b/paddle/phi/kernels/kron_kernel.h new file mode 100644 index 00000000000..4451ac757a9 --- /dev/null +++ b/paddle/phi/kernels/kron_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void KronKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/kron_sig.cc b/paddle/phi/ops/compat/kron_sig.cc new file mode 100644 index 00000000000..06b6545f58e --- /dev/null +++ b/paddle/phi/ops/compat/kron_sig.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
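KronElemFunctor above assigns one output element per index and recovers the matching X and Y offsets purely from strides: the flat output index is split into per-dimension positions with stride_out, each position is divided by shape_b for the X coordinate and taken modulo shape_b for the Y coordinate, and the two flat offsets are rebuilt with stride_a and stride_b. A plain-Python rehearsal of that loop for the 2-D case (same convention as phi::stride is assumed; illustration only):

    import numpy as np

    def strides(shape):                      # product of the trailing dims, like phi::stride
        s = [1] * len(shape)
        for i in range(len(shape) - 2, -1, -1):
            s[i] = s[i + 1] * shape[i + 1]
        return s

    x = np.arange(6.0).reshape(2, 3)
    y = np.arange(8.0).reshape(2, 4)
    out_shape = (x.shape[0] * y.shape[0], x.shape[1] * y.shape[1])
    sx, sy, so = strides(x.shape), strides(y.shape), strides(out_shape)

    out = np.empty(out_shape).ravel()
    for idx in range(out.size):              # one "thread" per output element
        rem, ia, ib = idx, 0, 0
        for d in range(len(out_shape)):
            pos = rem // so[d]
            rem = rem % so[d]
            ia += sx[d] * (pos // y.shape[d])   # pos_ai
            ib += sy[d] * (pos % y.shape[d])    # pos_bi
        out[idx] = x.ravel()[ia] * y.ravel()[ib]

    print(np.array_equal(out.reshape(out_shape), np.kron(x, y)))   # True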
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature KronGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("kron_grad", + {"X", "Y", GradVarName("Out")}, + {}, + {GradVarName("X"), GradVarName("Y")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(kron_grad, phi::KronGradOpArgumentMapping); -- GitLab From 47d764a3ffdfed1b1f20a1fdba60e995ebed76a5 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Tue, 15 Mar 2022 09:14:41 +0800 Subject: [PATCH 050/176] Remove pybind index error (#40538) * change the exception of getitem from pybind type to PADDLE_ENFORCE * fix bug * remove pybind::index_error exception --- paddle/fluid/pybind/slice_utils.h | 18 ++++++++---------- paddle/fluid/pybind/tensor_py.h | 26 ++++++++++++++++++-------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h index a037fa13eb5..add332abd30 100644 --- a/paddle/fluid/pybind/slice_utils.h +++ b/paddle/fluid/pybind/slice_utils.h @@ -188,16 +188,14 @@ static void ParseIndexingSlice( int start = static_cast(PyLong_AsLong(slice_item)); auto s_t = start; start = start < 0 ? start + dim_len : start; - if (start >= dim_len || start < 0) { - std::string str_error_message = - "The starting index " + std::to_string(s_t) + - " of slice is out of bounds in tensor " + std::to_string(dim) + - "-th axis, it shound be in the range of [" + - std::to_string(-dim_len) + ", " + std::to_string(dim_len) + ")"; - // py::index_error is corresponding to IndexError in Python - // Used to indicate out of bounds access in __getitem__, __setitem__ - throw py::index_error(str_error_message); - } + + PADDLE_ENFORCE( + 0 <= start && start < dim_len, + platform::errors::OutOfRange("The starting index %d of slice is out " + "of bounds in tensor %d-th axis, it " + "shound be in the range of [%d, %d).", + s_t, dim, -dim_len, dim_len)); + slice_axes->push_back(dim); slice_starts->push_back(start); slice_ends->push_back(start + 1); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index c593c7df3e0..6849fcb0394 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -585,14 +585,20 @@ inline void _getSliceinfo(const framework::Tensor &self, py::object obj, auto &step = *pstep; auto &slicelength = *pslicelength; const framework::DDim &srcDDim = self.dims(); - if (dim < 0 || dim >= srcDDim.size()) { - throw py::index_error(); - } + PADDLE_ENFORCE( + 0 <= dim && dim < srcDDim.size(), + platform::errors::OutOfRange("The dim %d of slice is out of bounds, it " + "shound be in the range of [0, %d).", + dim, srcDDim.size())); + if (py::isinstance(obj)) { size_t lstart, lstop, lstep, lslicelength; py::slice s = static_cast(obj); if (!s.compute(srcDDim[dim], &lstart, &lstop, &lstep, &lslicelength)) { - throw py::index_error(); + PADDLE_THROW(platform::errors::OutOfRange( + "Slice on dim: %d is error, please check the validity of tensor " + "dims or slice item.", + dim)); } start = static_cast(lstart); stop = static_cast(lstop); @@ -600,15 +606,19 @@ inline void _getSliceinfo(const framework::Tensor &self, py::object obj, slicelength = static_cast(lslicelength); } else if (py::isinstance(obj)) { start = static_cast(static_cast(obj)); - if (std::abs(start) >= srcDDim[dim]) { - throw py::index_error(); - } + PADDLE_ENFORCE( + std::abs(start) < srcDDim[dim], + platform::errors::OutOfRange("The start %d of slice is out of bounds, " + "it shound be in the range of (%d, 
%d).", + start, -srcDDim[dim], srcDDim[dim])); start = (start >= 0) ? start : srcDDim[dim] - start; stop = start + 1; step = 1; slicelength = 1; } else { - throw py::index_error(); + PADDLE_THROW( + platform::errors::OutOfRange("Index object error, the index object for " + "slice only supports slice(::) and int.")); } } -- GitLab From e7057932d222e433ccaed851b7bdbb51131a5975 Mon Sep 17 00:00:00 2001 From: crystal <62974595+Zjq9409@users.noreply.github.com> Date: Tue, 15 Mar 2022 10:12:04 +0800 Subject: [PATCH 051/176] remove useless head file (#40519) --- paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc | 4 ---- paddle/phi/kernels/cpu/reduce_prod_kernel.cc | 2 +- paddle/phi/kernels/eigh_kernel.h | 1 - paddle/phi/kernels/funcs/values_vectors_functor.h | 1 - paddle/phi/kernels/gather_tree_kernel.h | 1 + paddle/phi/kernels/gpu/eigh_grad_kernel.cu | 1 - paddle/phi/kernels/gpu/gather_tree_kernel.cu | 4 ++-- paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu | 5 ----- paddle/phi/kernels/gpu/reduce_prod_kernel.cu | 4 ++-- paddle/phi/kernels/impl/eigh_grad_kernel_impl.h | 8 +------- 10 files changed, 7 insertions(+), 24 deletions(-) diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc index 210750da1e0..70b6316e104 100644 --- a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -16,15 +16,11 @@ #include #include -#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/reduce.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" #include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/reduce_max_kernel.h" diff --git a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc index cf0179124eb..9a9bf46e948 100644 --- a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc @@ -13,7 +13,7 @@ // limitations under the License. 
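For reference, the PADDLE_ENFORCE checks introduced above in slice_utils.h and tensor_py.h guard the usual normalization of a possibly negative index: shift it by the dimension length and require the result to lie in [0, dim_len). A minimal standalone sketch of that logic follows; it uses std::out_of_range in place of PADDLE_ENFORCE with platform::errors::OutOfRange and only illustrates the check, not the pybind code itself.

// Sketch of the index normalization guarded by the new checks
// (assumed helper name NormalizeIndex, plain C++, not Paddle code).
#include <iostream>
#include <stdexcept>
#include <string>

static int NormalizeIndex(int start, int dim_len, int dim) {
  const int s_t = start;                        // raw value, kept for the message
  start = start < 0 ? start + dim_len : start;  // e.g. -1 -> dim_len - 1
  if (start < 0 || start >= dim_len) {
    throw std::out_of_range(
        "The starting index " + std::to_string(s_t) +
        " of slice is out of bounds in tensor " + std::to_string(dim) +
        "-th axis, it should be in the range of [" +
        std::to_string(-dim_len) + ", " + std::to_string(dim_len) + ")");
  }
  return start;
}

int main() {
  std::cout << NormalizeIndex(-1, 4, 0) << "\n";  // prints 3
  std::cout << NormalizeIndex(2, 4, 0) << "\n";   // prints 2
  try {
    NormalizeIndex(5, 4, 0);                      // out of range
  } catch (const std::out_of_range& e) {
    std::cout << e.what() << "\n";
  }
  return 0;
}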
#include "paddle/phi/kernels/reduce_prod_kernel.h" -#include "paddle/phi/backends/cpu/cpu_context.h" + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/reduce.h" #include "paddle/phi/kernels/funcs/reduce_functor.h" diff --git a/paddle/phi/kernels/eigh_kernel.h b/paddle/phi/kernels/eigh_kernel.h index dd28752d929..19653918302 100644 --- a/paddle/phi/kernels/eigh_kernel.h +++ b/paddle/phi/kernels/eigh_kernel.h @@ -15,7 +15,6 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h index b3189fc5cc3..336e9c80942 100644 --- a/paddle/phi/kernels/funcs/values_vectors_functor.h +++ b/paddle/phi/kernels/funcs/values_vectors_functor.h @@ -20,7 +20,6 @@ #endif // PADDLE_WITH_CUDA #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/transpose_kernel.h" diff --git a/paddle/phi/kernels/gather_tree_kernel.h b/paddle/phi/kernels/gather_tree_kernel.h index e5a1a684dae..b3e6ffbc429 100644 --- a/paddle/phi/kernels/gather_tree_kernel.h +++ b/paddle/phi/kernels/gather_tree_kernel.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" + namespace phi { template diff --git a/paddle/phi/kernels/gpu/eigh_grad_kernel.cu b/paddle/phi/kernels/gpu/eigh_grad_kernel.cu index fdf61dc7399..5e33966055e 100644 --- a/paddle/phi/kernels/gpu/eigh_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/eigh_grad_kernel.cu @@ -15,7 +15,6 @@ #include "paddle/phi/kernels/eigh_grad_kernel.h" #include "paddle/phi/kernels/impl/eigh_grad_kernel_impl.h" -#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/complex_functors.h" diff --git a/paddle/phi/kernels/gpu/gather_tree_kernel.cu b/paddle/phi/kernels/gpu/gather_tree_kernel.cu index a9e73ec37c8..2906b81cb40 100644 --- a/paddle/phi/kernels/gpu/gather_tree_kernel.cu +++ b/paddle/phi/kernels/gpu/gather_tree_kernel.cu @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/phi/kernels/gather_tree_kernel.h" + #include -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gather_tree_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu index ccd9f714956..2009547fc8d 100644 --- a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu +++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu @@ -21,16 +21,11 @@ #include #include "paddle/fluid/memory/memory.h" #include "paddle/phi/backends/dynload/cusolver.h" -#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/abs_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/compare_functors.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/funcs/for_range.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/reduce_function.h" #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" #include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/reduce_max_kernel.h" diff --git a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu index 14084d0f4f3..278d4a6e5ab 100644 --- a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/reduce_prod_kernel.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/gpu/reduce.h" -#include "paddle/phi/kernels/reduce_prod_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h index 2f0530b638f..5b71fd7fa3a 100644 --- a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h @@ -15,21 +15,15 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" -#include "paddle/phi/kernels/cast_kernel.h" #include "paddle/phi/kernels/complex_kernel.h" -#include "paddle/phi/kernels/full_kernel.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" #include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - namespace phi { template -- GitLab From af6ef8881438201fbf135500cc7d652a32ae583b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Tue, 15 Mar 2022 10:36:47 +0800 Subject: [PATCH 052/176] adjusts the mlir attrs order, test=develop (#40514) --- paddle/infrt/dialect/phi/data_type.cc | 28 ++++---- paddle/infrt/dialect/phi/data_type.h | 16 ++--- .../infrt/dialect/phi/pass/kernel_op_desc.cc | 6 +- paddle/infrt/host_context/kernel_registry.cc | 30 +++++--- paddle/infrt/host_context/kernel_registry.h | 8 ++- .../host_context/mlir_function_executable.cc | 4 +- 
.../host_context/mlir_function_executable.h | 1 + .../host_context/mlir_to_runtime_translate.cc | 71 ++++++++++++++----- .../host_context/mlir_to_runtime_translate.h | 3 +- .../infrt/kernel/phi/dense_tensor_kernels.cc | 12 ++-- .../infrt/kernel/phi/dense_tensor_kernels.h | 2 +- paddle/infrt/kernel/phi/registry.cc | 12 ++-- paddle/infrt/kernel/tensor_kernels.cc | 6 +- .../infrt/tests/dialect/phi/dense_tensor.mlir | 2 +- 14 files changed, 131 insertions(+), 70 deletions(-) diff --git a/paddle/infrt/dialect/phi/data_type.cc b/paddle/infrt/dialect/phi/data_type.cc index 5da7ec88312..bbc296ea748 100644 --- a/paddle/infrt/dialect/phi/data_type.cc +++ b/paddle/infrt/dialect/phi/data_type.cc @@ -16,7 +16,7 @@ namespace infrt { -phi::Backend cvtTarget2Phi(TargetType target) { +phi::Backend ConvertTargetToPhi(TargetType target) { switch (target) { case TargetType::CPU: return phi::Backend::CPU; @@ -27,7 +27,7 @@ phi::Backend cvtTarget2Phi(TargetType target) { } } -TargetType cvtTargetFromPhi(phi::Backend backend) { +TargetType ConvertTargetFromPhi(phi::Backend backend) { switch (backend) { case phi::Backend::CPU: return TargetType::CPU; @@ -38,7 +38,7 @@ TargetType cvtTargetFromPhi(phi::Backend backend) { } } -phi::DataType cvtPrecision2Phi(PrecisionType precision) { +phi::DataType ConvertPrecisionToPhi(PrecisionType precision) { #define CONVERT_PRECISION_TO_PHI(Precision) \ case PrecisionType::Precision: \ return phi::DataType::Precision; @@ -61,7 +61,7 @@ phi::DataType cvtPrecision2Phi(PrecisionType precision) { #undef CONVERT_PRECISION_TO_PHI } -PrecisionType cvtPrecisionFromPhi(phi::DataType datatype) { +PrecisionType ConvertPrecisionFromPhi(phi::DataType datatype) { #define CONVERT_PRECISION_FROM_PHI(Precision) \ case phi::DataType::Precision: \ return PrecisionType::Precision; @@ -84,7 +84,7 @@ PrecisionType cvtPrecisionFromPhi(phi::DataType datatype) { #undef CONVERT_PRECISION_FROM_PHI } -phi::DataLayout cvtLayout2Phi(LayoutType layout) { +phi::DataLayout ConvertLayoutToPhi(LayoutType layout) { switch (layout) { case LayoutType::NCHW: return phi::DataLayout::NCHW; @@ -97,7 +97,7 @@ phi::DataLayout cvtLayout2Phi(LayoutType layout) { } } -LayoutType cvtLayoutFromPhi(phi::DataLayout layout) { +LayoutType ConvertLayoutFromPhi(phi::DataLayout layout) { switch (layout) { case phi::DataLayout::NCHW: return LayoutType::NCHW; @@ -110,16 +110,16 @@ LayoutType cvtLayoutFromPhi(phi::DataLayout layout) { } } -phi::KernelKey cvtPlace2Phi(const Place& place) { - return phi::KernelKey(cvtTarget2Phi(place.target), - cvtLayout2Phi(place.layout), - cvtPrecision2Phi(place.precision)); +phi::KernelKey ConvertPlaceToPhi(const Place& place) { + return phi::KernelKey(ConvertTargetToPhi(place.target), + ConvertLayoutToPhi(place.layout), + ConvertPrecisionToPhi(place.precision)); } -Place cvtPlaceFromPhi(phi::TensorArgDef tensor_arg) { - return Place(cvtTargetFromPhi(tensor_arg.backend), - cvtPrecisionFromPhi(tensor_arg.dtype), - cvtLayoutFromPhi(tensor_arg.layout)); +Place ConvertPlaceFromPhi(phi::TensorArgDef tensor_arg) { + return Place(ConvertTargetFromPhi(tensor_arg.backend), + ConvertPrecisionFromPhi(tensor_arg.dtype), + ConvertLayoutFromPhi(tensor_arg.layout)); } } // namespace infrt diff --git a/paddle/infrt/dialect/phi/data_type.h b/paddle/infrt/dialect/phi/data_type.h index f2a76507b85..bd258cb1038 100644 --- a/paddle/infrt/dialect/phi/data_type.h +++ b/paddle/infrt/dialect/phi/data_type.h @@ -23,16 +23,16 @@ namespace infrt { -phi::Backend cvtTarget2Phi(TargetType target); -TargetType 
cvtTargetFromPhi(phi::Backend backend); +phi::Backend ConvertTargetToPhi(TargetType target); +TargetType ConvertTargetFromPhi(phi::Backend backend); -phi::DataType cvtPrecision2Phi(PrecisionType precision); -PrecisionType cvtPrecisionFromPhi(phi::DataType datatype); +phi::DataType ConvertPrecisionToPhi(PrecisionType precision); +PrecisionType ConvertPrecisionFromPhi(phi::DataType datatype); -phi::DataLayout cvtLayout2Phi(LayoutType layout); -LayoutType cvtLayoutFromPhi(phi::DataLayout layout); +phi::DataLayout ConvertLayoutToPhi(LayoutType layout); +LayoutType ConvertLayoutFromPhi(phi::DataLayout layout); -phi::KernelKey cvtPlace2Phi(const Place& place); -Place cvtPlaceFromPhi(phi::TensorArgDef tensor_arg); +phi::KernelKey ConvertPlaceToPhi(const Place& place); +Place ConvertPlaceFromPhi(phi::TensorArgDef tensor_arg); } // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc index d1763897b4a..353b1054e71 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -80,7 +80,7 @@ std::vector getCandidateKernels( phi::KernelKeyMap kernel_key_map = phi::KernelFactory::Instance().SelectKernelMap(name); for (Place place : valid_palces) { - phi::KernelKey kernel_key = cvtPlace2Phi(place); + phi::KernelKey kernel_key = ConvertPlaceToPhi(place); if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) { kernel_key = phi::KernelKey(kernel_key.backend(), phi::DataLayout::ALL_LAYOUT, @@ -97,10 +97,10 @@ std::vector getCandidateKernels( const paddle::SmallVector& output_arg = args_def.output_defs(); for (auto tensor_arg : input_arg) { - phi_kernel_desc.inputsType.emplace_back(cvtPlaceFromPhi(tensor_arg)); + phi_kernel_desc.inputsType.emplace_back(ConvertPlaceFromPhi(tensor_arg)); } for (auto tensor_arg : output_arg) { - phi_kernel_desc.outputsType.emplace_back(cvtPlaceFromPhi(tensor_arg)); + phi_kernel_desc.outputsType.emplace_back(ConvertPlaceFromPhi(tensor_arg)); } candidate_kernels.emplace_back(phi_kernel_desc); } diff --git a/paddle/infrt/host_context/kernel_registry.cc b/paddle/infrt/host_context/kernel_registry.cc index f343dfc71b0..4209b2a9648 100644 --- a/paddle/infrt/host_context/kernel_registry.cc +++ b/paddle/infrt/host_context/kernel_registry.cc @@ -23,8 +23,9 @@ namespace infrt { namespace host_context { struct KernelRegistry::Impl { - std::unordered_map data; - std::unordered_map> attr_names; + std::unordered_map>> + data; }; KernelRegistry::KernelRegistry() : impl_(std::make_unique()) {} @@ -33,20 +34,29 @@ void KernelRegistry::AddKernel(const std::string &key, KernelImplementation fn) { CHECK(!impl_->data.count(key)) << "kernel [" << key << "] is registered twice"; - impl_->data.emplace(key, fn); + impl_->data.emplace( + key, std::make_pair(std::move(fn), std::vector{})); } -void KernelRegistry::AddKernelAttrNameList( - const std::string &key, const std::vector &names) { - CHECK(!impl_->attr_names.count(key)) - << "kernel [" << key << "] is registered twice in attribute names"; - impl_->attr_names.emplace( - key, llvm::SmallVector(names.begin(), names.end())); +const std::vector &KernelRegistry::GetAttrNameList( + const std::string &key) const { + CHECK(impl_->data.count(key)); + return impl_->data[key].second; +} + +void KernelRegistry::AddKernelWithAttrs( + const std::string &key, + KernelImplementation fn, + std::vector &&attr_order) { + CHECK(!impl_->data.count(key)) << "kernel [" << key + << "] is registered twice"; + impl_->data.emplace(key, + 
std::make_pair(std::move(fn), std::move(attr_order))); } KernelImplementation KernelRegistry::GetKernel(const std::string &key) const { auto it = impl_->data.find(key); - return it != impl_->data.end() ? it->second : KernelImplementation{}; + return it != impl_->data.end() ? it->second.first : KernelImplementation{}; } std::vector KernelRegistry::GetKernelList() const { diff --git a/paddle/infrt/host_context/kernel_registry.h b/paddle/infrt/host_context/kernel_registry.h index a813f690efb..a146b2b3c4c 100644 --- a/paddle/infrt/host_context/kernel_registry.h +++ b/paddle/infrt/host_context/kernel_registry.h @@ -34,10 +34,14 @@ class KernelRegistry { KernelRegistry(); void AddKernel(const std::string &key, KernelImplementation fn); - void AddKernelAttrNameList(const std::string &key, - const std::vector &names); + void AddKernelWithAttrs(const std::string &key, + KernelImplementation fn, + std::vector &&attrs_order); KernelImplementation GetKernel(const std::string &key) const; + const std::vector &GetAttrNameList( + const std::string &key) const; + std::vector GetKernelList() const; size_t size() const; diff --git a/paddle/infrt/host_context/mlir_function_executable.cc b/paddle/infrt/host_context/mlir_function_executable.cc index 47ec27ebec3..ec8d43f99ba 100644 --- a/paddle/infrt/host_context/mlir_function_executable.cc +++ b/paddle/infrt/host_context/mlir_function_executable.cc @@ -43,6 +43,7 @@ MlirFunctionExecutable::MlirFunctionExecutable( func_op.getNumResults()), MlirToRuntimeTranslator(&core_runtime_builder_), region_(&func_op.getRegion()), + kernel_registry_(kernel_registry), core_runtime_builder_(kernel_registry), function_table_(function_table) {} @@ -54,6 +55,7 @@ MlirFunctionExecutable::MlirFunctionExecutable( : Function("", func_type.getNumInputs(), func_type.getNumResults()), MlirToRuntimeTranslator(&core_runtime_builder_), region_(region), + kernel_registry_(kernel_registry), core_runtime_builder_(kernel_registry), function_table_(function_table) {} @@ -90,7 +92,7 @@ void MlirFunctionExecutable::BuildExecutables( if (EmitCallOp(&op, &function_table_)) continue; - if (EmitGeneralOp(&op)) continue; + if (EmitGeneralOp(&op, *kernel_registry_)) continue; LOG(FATAL) << "Not supported op: " << DumpToString(op); } diff --git a/paddle/infrt/host_context/mlir_function_executable.h b/paddle/infrt/host_context/mlir_function_executable.h index a6428df86e6..cd9161d01bb 100644 --- a/paddle/infrt/host_context/mlir_function_executable.h +++ b/paddle/infrt/host_context/mlir_function_executable.h @@ -70,6 +70,7 @@ class MlirFunctionExecutable : public Function, public MlirToRuntimeTranslator { private: mlir::Region* region_{}; + KernelRegistry* kernel_registry_{}; CoreRuntimeBuilder core_runtime_builder_; MlirToRuntimeTranslator::function_defs_t& function_table_; std::function copy_res_fn_; diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index b3ea930e8ce..c613843cd17 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -270,7 +270,8 @@ static bool IsReturn(mlir::Operation* op) { return op->getName().getStringRef() == "infrt.return"; } -bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { +bool MlirToRuntimeTranslator::EmitGeneralOp( + mlir::Operation* op, const KernelRegistry& kernel_registry) { CHECK(impl_->runtime); impl_->cur_op = impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); @@ -308,42 +309,80 @@ 
bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { // process attributes auto attrs = op->getAttrs(); + // MLIR's underlying attr storage type is `Builtin_Dictionary`, and its + // elements + // are sorted by name. The following code adapts the order of function + // signatures + // of the phi operator library. + llvm::SmallVector tmp; + tmp.resize(attrs.size()); + const std::string& kernel_name = op->getName().getStringRef().str(); + const auto& attr_names = kernel_registry.GetAttrNameList(kernel_name); + if (attrs.size() && attr_names.empty()) { + LOG(WARNING) << "The kernel `" << kernel_name + << "` has no specified attr order."; + } + auto get_offset = [](const char* attr, + const std::vector& names, + const std::string& kernel_name) -> int { + for (size_t i = 0; i < names.size(); ++i) { + if (!std::strcmp(attr, names[i])) { + return i; + } + } + LOG(WARNING) << "The attribute `" << attr << "` of kernel `" << kernel_name + << "` is not properly registered with " + "`KernelRegistry::AddKernelWithAttrs()`."; + return -1; + }; + for (size_t i = 0; i < attrs.size(); i++) { auto& attr = attrs[i]; + int offset{}; + if (attr_names.size()) { + offset = get_offset(attr.getName().data(), attr_names, kernel_name); + } else { + offset = i; + } + CHECK_NE(offset, -1); if (auto v = EmitAttribute(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + tmp[offset] = new Value(std::move(*v)); } else if (auto v = EmitAttribute(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute<::infrt::TargetType>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute<::infrt::PrecisionType>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute<::infrt::LayoutType>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(*v)); + tmp[offset] = new Value(*v); } else if (auto v = EmitAttribute>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + tmp[offset] = new Value(std::move(*v)); } else if (auto v = EmitAttribute>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + tmp[offset] = new Value(std::move(*v)); } else if (auto v = EmitAttribute>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + tmp[offset] = new Value(std::move(*v)); } else if (auto v = EmitAttribute>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + tmp[offset] = new Value(std::move(*v)); } else if (auto v = EmitAttribute>(attr.getValue())) { - impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + tmp[offset] = new Value(std::move(*v)); } else { LOG(FATAL) << "Not supported attribute type"; } } + for (size_t i = 0; i < tmp.size(); i++) { + impl_->cur_op->AppendAttribute(tmp[i]); + } + // process results 
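The reordering above compensates for MLIR's Builtin_Dictionary storage, which keeps attributes sorted by name, while the phi kernel expects them in the order passed to AddKernelWithAttrs. A small standalone sketch of that mapping follows; the kernel name, attribute names, and values are hypothetical, and plain strings stand in for host_context::Value.

// Sketch of mapping name-sorted MLIR attributes onto the registered
// attribute order (toy types, hypothetical kernel, not Paddle code).
#include <cstring>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

static int GetOffset(const char* attr, const std::vector<const char*>& names) {
  for (size_t i = 0; i < names.size(); ++i) {
    if (!std::strcmp(attr, names[i])) return static_cast<int>(i);
  }
  return -1;  // attribute not registered via AddKernelWithAttrs
}

int main() {
  // Registered order for a hypothetical kernel, mirroring "dims, lod, layout, precision".
  std::vector<const char*> attr_names = {"dims", "lod", "layout", "precision"};
  // MLIR hands the attributes over sorted by name.
  std::vector<std::pair<std::string, std::string>> mlir_attrs = {
      {"dims", "[1]"}, {"layout", "NCHW"}, {"lod", "[0]"}, {"precision", "FLOAT32"}};

  std::vector<std::string> reordered(mlir_attrs.size());
  for (const auto& kv : mlir_attrs) {
    int offset = GetOffset(kv.first.c_str(), attr_names);
    if (offset >= 0) reordered[offset] = kv.second;  // place at registered position
  }
  for (const auto& v : reordered) std::cout << v << " ";
  std::cout << "\n";  // prints: [1] [0] NCHW FLOAT32
  return 0;
}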
llvm::SmallVector res_values; for (int i = 0, e = op->getNumResults(); i < e; i++) { @@ -598,7 +637,7 @@ class MlirProgramTestExecutor : public MlirToRuntimeTranslator { llvm::SmallVector results; if (EmitReturnOp(&op, &results)) continue; if (EmitCallOp(&op, &impl_->func_defs)) continue; - if (EmitGeneralOp(&op)) continue; + if (EmitGeneralOp(&op, *registry)) continue; LOG(FATAL) << "Not supported op: " << DumpToString(op); } diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.h b/paddle/infrt/host_context/mlir_to_runtime_translate.h index fcd79eaf386..27a7f201686 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.h +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.h @@ -63,7 +63,8 @@ class MlirToRuntimeTranslator { //! Emit a "ts.build_shape" operation. bool EmitBuildShapeOp(mlir::Operation* op); //! Emit an operation other than the special cases above. - bool EmitGeneralOp(mlir::Operation* op); + bool EmitGeneralOp(mlir::Operation* op, + const KernelRegistry& kernel_registry); //! Emit all the functions. bool EmitFunctions(); diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index e89ee7cfe5d..777fb29ac60 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -23,23 +23,23 @@ namespace phi { ::phi::DenseTensor CreateDenseTensor( const ::phi::CPUContext& context, host_context::Attribute> dims, - host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute> lod, + host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute<::infrt::PrecisionType> precision) { return ::phi::DenseTensor( const_cast<::phi::Allocator*>(&context.GetAllocator()), - ::phi::DenseTensorMeta(cvtPrecision2Phi(precision.get()), + ::phi::DenseTensorMeta(ConvertPrecisionToPhi(precision.get()), ::phi::make_ddim(dims.get()), - cvtLayout2Phi(layout.get()), + ConvertLayoutToPhi(layout.get()), {})); } void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, - host_context::Attribute> values) { + host_context::Attribute> value) { auto place = ::phi::CPUPlace(); float* a_data = dense_tensor->mutable_data(place); for (int64_t i = 0; i < dense_tensor->numel(); ++i) { - a_data[i] = (values.get())[i]; + a_data[i] = (value.get())[i]; } } @@ -57,7 +57,7 @@ void PrintDenseTensor(::phi::DenseTensor* dense_tensor) { ::phi::DDim dims = dense_tensor->dims(); std::cout << "dense_tensor: shape=shape" << dims.to_str() << "," - << " values=["; + << " value=["; switch (dense_tensor->dtype()) { PRINT_META_DATA(FLOAT32, float); PRINT_META_DATA(INT32, int32_t); diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h index e77e9becb6f..8cc0e39e0e4 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -26,8 +26,8 @@ namespace phi { ::phi::DenseTensor CreateDenseTensor( const ::phi::CPUContext& context, host_context::Attribute> dims, - host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute> lod, + host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute<::infrt::PrecisionType> precision); void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc index 90570484179..0e071418603 100644 --- a/paddle/infrt/kernel/phi/registry.cc +++ b/paddle/infrt/kernel/phi/registry.cc @@ -34,10 +34,14 @@ namespace kernel { void 
RegisterPhiKernels(host_context::KernelRegistry* registry) { registry->AddKernel("phi_dt.create_context.cpu", INFRT_KERNEL(infrt::kernel::phi::CreateCPUContext)); - registry->AddKernel("phi_dt.create_dense_tensor", - INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensor)); - registry->AddKernel("phi_dt.fill_dense_tensor.f32", - INFRT_KERNEL(infrt::kernel::phi::FillDenseTensorF32)); + registry->AddKernelWithAttrs( + "phi_dt.create_dense_tensor", + INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensor), + {"dims", "lod", "layout", "precision"}); + registry->AddKernelWithAttrs( + "phi_dt.fill_dense_tensor.f32", + INFRT_KERNEL(infrt::kernel::phi::FillDenseTensorF32), + {"value"}); registry->AddKernel("phi_dt.print_tensor", INFRT_KERNEL(infrt::kernel::phi::PrintDenseTensor)); } diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index d5922af9ada..b7503aa4ef3 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -111,9 +111,9 @@ void NaiveMatmul(const DenseHostTensor &x, /// ===== Kernel end ==== void RegisterTensorKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("dt.create_uninit_tensor.f32", - INFRT_KERNEL(CreateUninitTensor)); - registry->AddKernelAttrNameList("dt.create_uninit_tensor.f32", {"shape"}); + registry->AddKernelWithAttrs("dt.create_uninit_tensor.f32", + INFRT_KERNEL(CreateUninitTensor), + {"shape"}); registry->AddKernel("dt.print_tensor", INFRT_KERNEL(PrintTensor)); registry->AddKernel("dt.fill_tensor_with_constant.f32", INFRT_KERNEL(FillTensorWithConstant)); diff --git a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir index b40184e7266..3657777a5b0 100644 --- a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir @@ -9,7 +9,7 @@ func @sign_any_float32_execute() { "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () %e = "phi_cpu.sign.float32.any"(%ctx, %t) : (!phi.context, !infrt.dense_tensor) -> (!infrt.dense_tensor) - // CHECK: dense_tensor: shape=shape[1], values=[1] + // CHECK: dense_tensor: shape=shape[1], value=[1] "phi_dt.print_tensor" (%e) : (!infrt.dense_tensor) -> () infrt.return } -- GitLab From 1a32391c66484a3466bc1a5595e97816097a60f5 Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Tue, 15 Mar 2022 10:49:55 +0800 Subject: [PATCH 053/176] [Dygraph] Refactoring of reducer in DataParallel (#40389) * refactor reducer * modify cmakelists * solve conflicts * rename group and update process_group * fix bugs of ProcessGroupNCCL * modify for CIs * refactoring reducer --- .../distributed/collective/CMakeLists.txt | 3 +- .../collective/ProcessGroupNCCL.cc | 4 +- .../fluid/distributed/collective/reducer.cc | 426 +++++++++++++++++- paddle/fluid/distributed/collective/reducer.h | 99 +++- paddle/fluid/pybind/distributed_py.cc | 23 + python/paddle/fluid/dygraph/parallel.py | 63 ++- ...llel_dygraph_dataparallel_in_eager_mode.py | 127 ++++++ .../test_parallel_dygraph_dataparallel.py | 5 + python/paddle/optimizer/optimizer.py | 9 +- 9 files changed, 736 insertions(+), 23 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index f88c993d85e..3fca45cc068 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ 
b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -1,8 +1,9 @@ cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api) +cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup phi phi_api) + if (WITH_DISTRIBUTE) cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper) endif() -cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup) if(WITH_NCCL) cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 67715f410d4..7f21bcee87a 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -88,8 +88,8 @@ void SyncDefaultStream( for (size_t i = 0; i < places.size(); ++i) { auto* default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(places[i])); - ncclEvents[i].Record(*dev_ctx[i]); - ncclEvents[i].Block(*default_ctx); + ncclEvents[i].Record(*default_ctx); + ncclEvents[i].Block(*dev_ctx[i]); } } diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 59f3ea3b0a7..5533f3f4cbf 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/distributed/collective/reducer.h" -#include "paddle/phi/common/data_type.h" namespace paddle { namespace distributed { @@ -127,5 +126,430 @@ std::vector> Eager_AssignGroupBySize( return res; } +template +static void ConcatTensorsForAllReduce( + const DeviceContext &context, + const std::vector &dense_tensors_, + Tensor *p_dense_contents) { + operators::math::ConcatFunctor concat_functor_; + concat_functor_( + context, dense_tensors_, 0, + std::dynamic_pointer_cast(p_dense_contents->impl()) + .get()); +} + +template +static void SplitTensorsForAllReduce( + const DeviceContext &context, Tensor *p_dense_contents, + std::vector *p_dense_tensors) { + auto *in = + std::dynamic_pointer_cast(p_dense_contents->impl()) + .get(); + std::vector outs; + std::vector shape_refer; + + outs.reserve(p_dense_tensors->size()); + shape_refer.reserve(p_dense_tensors->size()); + + for (auto &tensor : *p_dense_tensors) { + outs.emplace_back(&tensor); + shape_refer.emplace_back(&tensor); + } + + operators::math::SplitFunctor split_functor_; + split_functor_(context, *in, shape_refer, 0, &outs); +} + +// context is used to select the stream for concat +template +static void ConcatTensorsWithType( + const DeviceContext &context, + const std::vector &dense_tensors_, + Tensor *p_dense_contents, phi::DataType type) { + switch (type) { + case phi::DataType::FLOAT16: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + case phi::DataType::FLOAT32: + ConcatTensorsForAllReduce(context, dense_tensors_, + p_dense_contents); + break; + case phi::DataType::FLOAT64: + ConcatTensorsForAllReduce(context, dense_tensors_, + p_dense_contents); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it concats tensors for " + "allreduce.", + type)); + } +} + +// context is used to select the stream for split +template +static void SplitTensorsWithType(const DeviceContext &context, + Tensor *p_dense_contents, + std::vector *p_dense_tensors, + phi::DataType 
type) { + switch (type) { + case phi::DataType::FLOAT16: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + case phi::DataType::FLOAT32: + SplitTensorsForAllReduce(context, p_dense_contents, + p_dense_tensors); + break; + case phi::DataType::FLOAT64: + SplitTensorsForAllReduce(context, p_dense_contents, + p_dense_tensors); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it splits tensors for " + "allreduce.", + type)); + } +} + +void EagerGroup::ConcatTensors(const platform::Place &place) { + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + ConcatTensorsWithType(*default_ctx, dense_tensors_, &dense_contents_, + dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't concat grad tensors since it's not compiled with NCCL," + "Please recompile or reinstall Paddle with NCCL support.")); +#endif + } else if (platform::is_cpu_place(place)) { + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + ConcatTensorsWithType(*default_ctx, dense_tensors_, &dense_contents_, + dtype_); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Concat grad tensor not supported on place (%s)", place)); + } +} + +void EagerGroup::SplitTensors(const platform::Place &place) { + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + SplitTensorsWithType(*default_ctx, &dense_contents_, &dense_tensors_, + dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't split grad tensor since it's not compiled with NCCL," + "Please recompile or reinstall Paddle with NCCL support.")); +#endif + } else if (platform::is_cpu_place(place)) { + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + SplitTensorsWithType(*default_ctx, &dense_contents_, &dense_tensors_, + dtype_); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Split grad tensor not supported on place (%s)", place)); + } +} + +EagerReducer::EagerReducer( + const std::vector tensors, + const std::vector> &group_indices, + const std::vector &is_sparse_gradient, + std::shared_ptr process_group, + const std::vector &group_size_limits, bool find_unused_parameters) + : tensors_(tensors), + group_indices_(group_indices), + is_sparse_gradient_(is_sparse_gradient), + process_group_(process_group), + group_size_limits_(group_size_limits), + find_unused_vars_each_step_(find_unused_parameters) { + VLOG(3) << "Start construct the Reducer ..."; + + nranks_ = process_group_->GetSize(); + + // initialize groups + InitializeGroups(group_indices); + + for (size_t global_var_index = 0; global_var_index < tensors_.size(); + ++global_var_index) { + auto tensor = tensors_[global_var_index]; + auto reduce_hook = [=](void) -> void { + this->AddDistHook(global_var_index); + }; + + const auto &grad_node = GetGradNodeFromTensor(&tensor); + + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node," + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation")); + const auto &accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + accumulation_grad_node->RegisterReduceHook( + std::make_shared(reduce_hook)); + } + 
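The group machinery above follows the usual fused-allreduce pattern: the dense gradients of a group are concatenated into one contiguous buffer, the buffer is scaled by 1/nranks and reduced once, and the reduced buffer is split back into the per-parameter tensors. The standalone sketch below walks through that sequence with plain std::vector buffers; the collective is only simulated here, whereas the real code dispatches to ProcessGroup::AllReduce.

// Sketch of the concat -> scale -> (simulated) allreduce -> split sequence
// used for gradient bucketing (toy data, not Paddle code).
#include <algorithm>
#include <iostream>
#include <vector>

struct FakeGroup {
  std::vector<std::vector<float>> dense_tensors;  // per-parameter gradients
  std::vector<float> dense_contents;              // fused flat buffer

  void ConcatTensors() {
    dense_contents.clear();
    for (const auto& t : dense_tensors) {
      dense_contents.insert(dense_contents.end(), t.begin(), t.end());
    }
  }

  void SplitTensors() {
    size_t offset = 0;
    for (auto& t : dense_tensors) {
      std::copy(dense_contents.begin() + offset,
                dense_contents.begin() + offset + t.size(), t.begin());
      offset += t.size();
    }
  }
};

int main() {
  FakeGroup group;
  group.dense_tensors = {{1.f, 2.f}, {3.f, 4.f, 5.f}};
  const int nranks = 2;

  group.ConcatTensors();                  // concat into one buffer
  for (auto& v : group.dense_contents) {
    // div nranks, then a stand-in for the collective: summing nranks
    // identical shares of v / nranks restores v.
    v = (v / nranks) * nranks;
  }
  group.SplitTensors();                   // scatter back into the gradients

  for (const auto& t : group.dense_tensors) {
    for (float v : t) std::cout << v << " ";
    std::cout << "\n";
  }
  return 0;
}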
+ vars_marked_ready_.resize(tensors_.size(), false); + local_used_vars_.resize(tensors_.size(), 0); +} + +std::shared_ptr EagerReducer::GetGradNodeFromTensor( + Tensor *tensor) { + auto *autograd_meta = tensor->get_autograd_meta(); + const auto &grad_node = + static_cast(autograd_meta)->GetMutableGradNode(); + return grad_node; +} + +void EagerReducer::InitializeGroups( + const std::vector> &group_indices) { + VLOG(3) << "Start initialize groups .."; + + // clear the group + groups_.clear(); + groups_.reserve(group_indices.size()); + + variable_locators_.clear(); + variable_locators_.resize(tensors_.size()); + + auto group_nums = group_indices.size(); + for (size_t group_index = 0; group_index < group_nums; ++group_index) { + const auto &tensor_indices_ = group_indices[group_index]; + PADDLE_ENFORCE_GT( + tensor_indices_.size(), 0, + platform::errors::PreconditionNotMet( + "The number of group[%d]'s elements is 0.", group_index)); + + EagerGroup group; + + // It's just for check the sparse or dense + auto first_var = tensors_[tensor_indices_.front()]; + if (tensor_indices_.size() == 1 && + is_sparse_gradient_[tensor_indices_.front()]) { + // process the sparse gradient. one sparse, one group + group.dtype_ = first_var.dtype(); + } else { + // process the dense gradient. + InitializeDenseGroups(tensor_indices_, &group); + experimental::Backend backend; + switch (inner_place_.GetType()) { + case phi::AllocationType::GPU: + backend = experimental::Backend::GPU; + break; + case phi::AllocationType::CPU: + backend = experimental::Backend::CPU; + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Place type (%s) is not supported. ", inner_place_)); + break; + } + group.dense_contents_ = paddle::experimental::empty( + ScalarArray({group.all_length_}), group.dtype_, backend); + } + + // map tensors to this group by VariableLocator + size_t inside_group_index = 0; + for (const auto var_index : tensor_indices_) { + TensorLocator tensor_locator; + tensor_locator.group_index = group_index; + tensor_locator.inside_group_index = inside_group_index++; + variable_locators_[var_index] = tensor_locator; + } + group.tensor_indices_ = std::move(tensor_indices_); + groups_.emplace_back(std::move(group)); + + VLOG(3) << "The Group[" << group_index << "]:" << groups_.back(); + } +} + +void EagerReducer::InitializeDenseGroups( + const std::vector &tensor_indices_, EagerGroup *p_group) { + VLOG(3) << "InitializeDenseGroups."; + int64_t all_length = 0; + for (size_t index = 0; index < tensor_indices_.size(); ++index) { + auto tensor_index = tensor_indices_[index]; + auto &tensor = tensors_[tensor_index]; + auto &tensor_name = tensor.name(); + + PADDLE_ENFORCE_EQ(tensor.is_initialized(), true, + platform::errors::PreconditionNotMet( + "Tensor %s is not initialized.", tensor_name)); + const auto size = tensor.numel(); + PADDLE_ENFORCE_GT( + size, 0, platform::errors::PreconditionNotMet( + "The number of tensor %s's elements is 0.", tensor_name)); + all_length += size; + + p_group->length_.push_back(size); + + // for concat operator + p_group->origin_shapes_.push_back(ScalarArray(tensor.shape())); + p_group->dense_tensors_.push_back(phi::DenseTensor()); + + const auto &dtype = tensor.dtype(); + const auto &place = tensor.place(); + const auto &inner_place = tensor.impl()->place(); + if (index > 0) { + PADDLE_ENFORCE_EQ(dtype, p_group->dtype_, + platform::errors::PreconditionNotMet( + "Tensor %s has unexpected dtype.", tensor_name)); + PADDLE_ENFORCE_EQ(place, place_, + 
platform::errors::PreconditionNotMet( + "Tensor %s has different place. Expected place is " + "%s, but actual place is %s", + tensor_name, inner_place_, inner_place)); + } else { + p_group->dtype_ = dtype; + place_ = place; + inner_place_ = inner_place; + } + } + p_group->all_length_ = all_length; +} + +void EagerReducer::PrepareForBackward(const std::vector &outputs) { + VLOG(3) << "after forward, then reset count for backward."; + grad_need_hooks_ = true; + next_group_ = 0; + std::for_each(groups_.begin(), groups_.end(), [](EagerGroup &group) { + group.pending_ = group.tensor_indices_.size(); + }); + + // reinitialize vars_marked_ready_ for next iteration + vars_marked_ready_.clear(); + vars_marked_ready_.resize(tensors_.size(), false); +} + +void EagerReducer::AddDistHook(size_t var_index) { + PADDLE_ENFORCE_LT(var_index, variable_locators_.size(), + platform::errors::OutOfRange( + "Out of bounds variable index. it must be less" + "than %d, but it is %d", + variable_locators_.size(), var_index)); + + // gradient synchronization is not required when grad_need_hooks_ is false. + if (!grad_need_hooks_) { + return; + } + + auto &tensor = tensors_[var_index]; + const auto &grad_node = GetGradNodeFromTensor(&tensor); + + VLOG(3) << "Var[" << var_index << "] [" << (*grad_node).name() + << "] arrived and triggered disthook"; + + local_used_vars_[var_index] = 1; + + MarkVarReady(var_index, true); +} + +void EagerReducer::MarkVarReady(const size_t var_index, + const bool is_used_var) { + const auto &var_locator = variable_locators_[var_index]; + const auto group_index = var_locator.group_index; + const auto inside_group_index = var_locator.inside_group_index; + + auto &group = groups_[group_index]; + auto &group_tensor = group.dense_tensors_[inside_group_index]; + auto *autograd_meta = tensors_[var_index].get_autograd_meta(); + auto &grad_tensor = static_cast(autograd_meta)->Grad(); + + group_tensor + .ShareDataWith( + *(std::dynamic_pointer_cast(grad_tensor.impl()))) + .Resize({grad_tensor.numel()}); + + vars_marked_ready_[var_index] = true; + + if (--group.pending_ == 0) { + // can start allreduce + MarkGroupReady(group_index); + } +} + +void EagerReducer::MarkGroupReady(size_t group_index) { + VLOG(3) << "Group[" << group_index << "] is ready"; + + PADDLE_ENFORCE_GE( + group_index, next_group_, + platform::errors::PreconditionNotMet( + "The index of the incoming group must be greater " + "than or equal to the previously synchronized group index, " + "expect it to greater than or equal to %d, but got %d.", + next_group_, group_index)); + + if (group_index > next_group_) { + VLOG(3) << "It will adjust the order of group in next batch automatically"; + return; + } + + for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0; + ++next_group_) { + UNUSED auto &group = groups_[next_group_]; + FusedAllReduceSchedule(&group, next_group_); + } +} + +void EagerReducer::FusedAllReduceSchedule(EagerGroup *group, + const int curr_group_index) { + // The overall timeline: concat > div_nranks > allreduce > split + distributed::AllreduceOptions opts; + opts.reduce_op = ReduceOp::SUM; + + VLOG(3) << "group [" << curr_group_index << "] start fused_allreduce."; + + // concat tensors + group->ConcatTensors(inner_place_); + + // div nranks + double scaling = 1.0 / nranks_; + paddle::experimental::scale_(group->dense_contents_, scaling, 0.0, false); + + // all_reduce + std::vector reduce_tensors = {group->dense_contents_}; + tasks_.push_back(process_group_->AllReduce(reduce_tensors, opts)); + + if 
(tasks_.size() == groups_.size()) { + for (size_t index = 0; index < tasks_.size(); index++) { + auto &task = tasks_.back(); + task->Synchronize(); + tasks_.pop_back(); + } + for (size_t index = 0; index < groups_.size(); index++) { + auto &group = groups_[index]; + group.SplitTensors(inner_place_); + } + } +} + +std::ostream &operator<<(std::ostream &out, const EagerGroup &group) { + const auto &tensors_ = group.tensor_indices_; + out << "numel: " << group.all_length_ << " ;var number: " << tensors_.size() + << "\n"; + auto begin = tensors_.begin(); + auto end = tensors_.end(); + out << "["; + for (int i = 0; begin != end && i < 100; ++i, ++begin) { + if (i > 0) out << ' '; + out << *begin; + } + if (begin != end) { + out << " ..."; + } + out << "]\n"; + return out; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h index f8c75385ef8..ac6f3fbe595 100644 --- a/paddle/fluid/distributed/collective/reducer.h +++ b/paddle/fluid/distributed/collective/reducer.h @@ -17,16 +17,109 @@ #include #include #include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/utils/hook_utils.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/api/lib/ext_compat_utils.h" +#include "paddle/phi/common/data_type.h" namespace paddle { namespace distributed { using Tensor = paddle::experimental::Tensor; +using Scalar = paddle::experimental::ScalarBase; +using ScalarArray = + paddle::experimental::ScalarArrayBase; std::vector> Eager_AssignGroupBySize( - const std::vector, const std::vector& is_sparse_gradient, - const std::vector& group_size_limits, - const std::vector& tensor_indices = {}); + const std::vector, const std::vector &is_sparse_gradient, + const std::vector &group_size_limits, + const std::vector &tensor_indices = {}); + +class EagerGroup { + public: + Tensor dense_contents_; + + // for concat kernel + std::vector dense_tensors_; + std::vector length_; + int64_t all_length_{0}; + std::vector origin_shapes_; + + // Global indices of participating tensors in the group + std::vector tensor_indices_; + + // Number of params that haven't been ready. When it is 0, it means + // the group is ready. 
+ size_t pending_ = -1; + + // external message of group + phi::DataType dtype_; + + // context is used to select the stream for concat + void ConcatTensors(const platform::Place &); + + // context is used to select the stream for split + void SplitTensors(const platform::Place &); + + friend std::ostream &operator<<(std::ostream &, const EagerGroup &); +}; + +struct TensorLocator { + // record the index in groups_ + size_t group_index; + size_t inside_group_index; +}; + +class EagerReducer { + public: + explicit EagerReducer( + const std::vector tensors, + const std::vector> &group_indices, + const std::vector &is_sparse_gradient, + std::shared_ptr process_group, + const std::vector &group_size_limits, + bool find_unused_parameters); + + virtual ~EagerReducer() {} + + std::shared_ptr GetGradNodeFromTensor(Tensor *tensor); + + void InitializeGroups(const std::vector> &group_indices); + void InitializeDenseGroups(const std::vector &tensor_indices_, + EagerGroup *p_group); + void PrepareForBackward(const std::vector &outputs); + void AddDistHook(size_t var_index); + void MarkVarReady(const size_t var_index, const bool is_used_var); + void MarkGroupReady(const size_t group_index); + void FusedAllReduceSchedule(EagerGroup *group, const int curr_group_index); + + private: + std::vector tensors_; + std::vector> group_indices_; + std::vector is_sparse_gradient_; + std::shared_ptr process_group_; + std::vector group_size_limits_; + bool find_unused_vars_each_step_; + + std::vector groups_; + std::vector variable_locators_; + PlaceType place_; + platform::Place inner_place_; + size_t next_group_ = 0; + int64_t nranks_ = -1; + std::vector> tasks_; + + bool grad_need_hooks_{false}; + + std::vector vars_marked_ready_; + std::vector local_used_vars_; +}; } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 0b179670381..1df917b8c35 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -51,6 +51,18 @@ namespace pybind { using Tensor = paddle::experimental::Tensor; +std::shared_ptr CreateEagerReducer( + py::handle py_tensors, + const std::vector> &group_indices, + const std::vector &is_sparse_gradient, + std::shared_ptr process_group, + const std::vector &group_size_limits, bool find_unused_parameters) { + auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); + return std::make_shared( + params, group_indices, is_sparse_gradient, process_group, + group_size_limits, find_unused_parameters); +} + #if defined(PADDLE_WITH_GLOO) using ProcessGroupGloo = paddle::distributed::ProcessGroupGloo; using GlooStore = paddle::distributed::ProcessGroupGloo::GlooStore; @@ -271,6 +283,17 @@ void BindDistributed(py::module *m) { py::arg("group_size_limits") = std::vector{25 * 1024 * 1024}, py::arg("tensor_indices") = std::vector{}, py::call_guard()); + + py::class_>(*m, "EagerReducer", + R"DOC()DOC") + .def(py::init(&CreateEagerReducer)) + .def("prepare_for_backward", + [](distributed::EagerReducer &self, py::handle py_tensors) { + auto params = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); + self.PrepareForBackward(params); + }, + py::arg("tensors"), py::call_guard()); } } // end namespace pybind diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 652916491ee..86d76f1b20a 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -30,7 +30,7 @@ from paddle.fluid.dygraph import to_variable, 
no_grad from paddle.utils import deprecated from ..layers import collective from paddle.fluid.dygraph import base as imperative_base -from paddle.fluid.framework import ParamBase +from paddle.fluid.framework import ParamBase, _in_eager_mode __all__ = ["prepare_context", "ParallelEnv", "DataParallel"] @@ -397,6 +397,16 @@ def sync_params_buffers(model, 'axis': 0}) +@imperative_base.no_grad +@framework.dygraph_only +def sync_eager_params(model, comm_group=None, src_rank=0): + for _, param in model._obtain_parameters_buffers().items(): + if not isinstance(param, core.eager.Tensor): + raise TypeError("The data type of '%s' must be '%s'" % + (param.name, core.eager.Tensor)) + comm_group.broadcast(param, src_rank).synchronize() + + class DataParallel(layers.Layer): """ Run the dygraph module with data parallelism. @@ -576,6 +586,7 @@ class DataParallel(layers.Layer): self.process_group = process_group self.gradient_as_buffer_view = gradient_as_buffer_view self.static_graph = static_graph + self.var_dtype = core.eager.Tensor if _in_eager_mode() else core.VarBase # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy. # It just stores some environment variables, which can be constructed by @@ -592,11 +603,20 @@ class DataParallel(layers.Layer): "ParallelContext must be initialized before. You should use init_parallel_env() before" \ "constructing the DataParallel." + if self.process_group is None and _in_eager_mode(): + raise RuntimeError( + "Process group should be built in DataParallel of eager mode." + ) + # sync buffer and params # TODO(liuyuhui) Currently not support xpu. xpu is # still broadcasting parameters when calling layer if not paddle.is_compiled_with_xpu(): - sync_params_buffers(self._layers) + if _in_eager_mode(): + sync_eager_params( + self._layers, comm_group=self.process_group) + else: + sync_params_buffers(self._layers) self.comm_buffer_size = int(comm_buffer_size * 1024 * 1024) # NOTE(shenliang03): We can set environment variables to control @@ -620,9 +640,9 @@ class DataParallel(layers.Layer): if param is None or param in params_set: continue params_set.add(param) - if not isinstance(param, core.VarBase): - raise TypeError("The data type of '%s' must be Varbase" % - param.name) + if not isinstance(param, self.var_dtype): + raise TypeError("The data type of '%s' must be '%s'" % + (param.name, self.var_dtype)) if param.trainable: layers_param.append((sublayer, param)) @@ -649,19 +669,32 @@ class DataParallel(layers.Layer): check_layer_sparse(sublayer) for sublayer, _ in layers_param ] - self.group_indices = core.assign_group_by_size( - trainable_parameters, is_sparse_gradient, - [self.last_comm_buffer_size, self.comm_buffer_size]) + if _in_eager_mode(): + self.group_indices = core.eager_assign_group_by_size( + trainable_parameters, is_sparse_gradient, + [self.last_comm_buffer_size, self.comm_buffer_size]) + + self._reducer = core.EagerReducer( + trainable_parameters, + list(reversed(self.group_indices)), is_sparse_gradient, + self.process_group, + [self.last_comm_buffer_size, self.comm_buffer_size], + self.find_unused_parameters) + else: + self.group_indices = core.assign_group_by_size( + trainable_parameters, is_sparse_gradient, + [self.last_comm_buffer_size, self.comm_buffer_size]) - self._reducer = core.Reducer( - trainable_parameters, - list(reversed(self.group_indices)), is_sparse_gradient, - parallel_helper.__parallel_ctx__clz__, - [self.last_comm_buffer_size, self.comm_buffer_size], - self.find_unused_parameters) + self._reducer = core.Reducer( + 
trainable_parameters, + list(reversed(self.group_indices)), is_sparse_gradient, + parallel_helper.__parallel_ctx__clz__, + [self.last_comm_buffer_size, self.comm_buffer_size], + self.find_unused_parameters) def _find_varbase(self, obj): - if isinstance(obj, core.VarBase): + var_type = core.eager.Tensor if _in_eager_mode() else core.VarBase + if isinstance(obj, var_type): return [obj] if isinstance(obj, (list, tuple)): return itertools.chain(*map(self._find_varbase, obj)) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py new file mode 100644 index 00000000000..8ff68a1ce0d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py @@ -0,0 +1,127 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import unittest +import os +import numpy as np +import random + +import paddle +import paddle.nn as nn +from paddle.fluid.dygraph.nn import Linear +import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard +import paddle.distributed as dist +from paddle.fluid.dygraph.parallel import ParallelEnv +from paddle.optimizer import SGD +from paddle.fluid.initializer import NumpyArrayInitializer + + +def init_process_group(strategy=None): + nranks = ParallelEnv().nranks + rank = ParallelEnv().local_rank + is_master = True if rank == 0 else False + store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master, nranks) + group = core.ProcessGroupNCCL(store, rank, nranks) + return group + + +class LinearModel(nn.Layer): + def __init__(self, attr_list): + super(LinearModel, self).__init__() + self._linear1 = paddle.nn.Linear( + 50, 30, weight_attr=attr_list[0], bias_attr=False) + self._linear2 = paddle.nn.Linear( + 30, 10, weight_attr=attr_list[1], bias_attr=False) + self._linear3 = paddle.nn.Linear( + 10, 10, weight_attr=attr_list[2], bias_attr=False) + + def forward(self, x): + output = self._linear1(x) + output = self._linear2(output) + output = self._linear3(output) + return output + + +class TestDistTraning(unittest.TestCase): + def test_multiple_gpus(self): + process_group = init_process_group() + self.generate_reducer("float32", process_group) + self.generate_reducer("float16", process_group) + + def generate_reducer(self, dtype, process_group): + dev_id = ParallelEnv().dev_id + np.random.seed(2022 + dev_id) + paddle.set_default_dtype(dtype) + + w_1 = paddle.ParamAttr(initializer=NumpyArrayInitializer( + np.random.rand(50, 30).astype(dtype))) + w_2 = paddle.ParamAttr(initializer=NumpyArrayInitializer( + np.random.rand(30, 10).astype(dtype))) + w_3 = paddle.ParamAttr(initializer=NumpyArrayInitializer( + np.random.rand(10, 10).astype(dtype))) + + attr_list = [w_1, w_2, w_3] + inp = np.random.rand(10, 50).astype(dtype) + + # original reducer + params_a = 
self.model_train(attr_list, inp) + + # refactored reducer in eager mode + with _test_eager_guard(): + params_b = self.model_train( + attr_list, inp, process_group=process_group) + + for i in range(len(params_a)): + np.testing.assert_allclose(params_a[i].numpy(), params_b[i].numpy()) + + def model_train(self, attr_list, inp, process_group=None): + model = LinearModel(attr_list) + model = paddle.DataParallel(model, process_group=process_group) + optimizer = SGD(learning_rate=0.0003, parameters=model.parameters()) + + x = paddle.to_tensor(inp) + x.stop_gradient = False + + for step in range(10): + y = model(x) + loss = y.mean() + + loss.backward() + optimizer.step() + optimizer.clear_grad() + + return model.parameters() + + +class TestCatchErrors1(unittest.TestCase): + def test_multiple_gpus(self): + linear = paddle.nn.Linear(2, 4) + with _test_eager_guard(): + self.assertRaises(RuntimeError, paddle.DataParallel, linear) + + +class TestCatchErrors2(unittest.TestCase): + def test_multiple_gpus(self): + with _test_eager_guard(): + linear = paddle.nn.Linear(2, 4) + self.assertRaises(RuntimeError, paddle.DataParallel, linear) + + +if __name__ == '__main__': + dist.init_parallel_env() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py index edf9aed04f5..802fcc96288 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py @@ -200,5 +200,10 @@ class TestDataParallelWithPyLayer(TestMultipleGpus): self.run_mnist_2gpu('parallel_dygraph_dataparallel_with_pylayer.py') +class TestDataParallelInEagerMode(TestMultipleGpus): + def test_multiple_gpus_dynamic(self): + self.run_mnist_2gpu('parallel_dygraph_dataparallel_in_eager_mode.py') + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 47dc02705f8..96f35eb9d27 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -42,6 +42,7 @@ from .. 
import compat as cpt from .lr import LRScheduler import copy from paddle import _C_ops +from paddle.fluid.framework import _in_eager_mode __all__ = [] @@ -1108,7 +1109,13 @@ class Optimizer(object): for p in param_group['params']: if not p.stop_gradient: param_list.append(p) - core.clear_gradients(param_list, set_to_zero) + + if _in_eager_mode(): + for p in param_list: + clear_func = p._zero_grads if set_to_zero else p.clear_gradient + clear_func() + else: + core.clear_gradients(param_list, set_to_zero) @imperative_base.no_grad def minimize(self, -- GitLab From 85f8fd9b360694d92ac06730e0d3470b1e611ff4 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Tue, 15 Mar 2022 10:55:44 +0800 Subject: [PATCH 054/176] [Phi]Move searchsorted kernel to phi (#40520) --- paddle/fluid/operators/searchsorted_op.cc | 10 +- paddle/phi/kernels/cpu/searchsorted_kernel.cc | 28 +++++ paddle/phi/kernels/gpu/searchsorted_kernel.cu | 28 +++++ .../kernels/impl/searchsorted_kernel_impl.h} | 107 +++++++++--------- .../kernels/searchsorted_kernel.h} | 24 ++-- 5 files changed, 124 insertions(+), 73 deletions(-) create mode 100644 paddle/phi/kernels/cpu/searchsorted_kernel.cc create mode 100644 paddle/phi/kernels/gpu/searchsorted_kernel.cu rename paddle/{fluid/operators/searchsorted_op.h => phi/kernels/impl/searchsorted_kernel_impl.h} (58%) rename paddle/{fluid/operators/searchsorted_op.cu => phi/kernels/searchsorted_kernel.h} (54%) diff --git a/paddle/fluid/operators/searchsorted_op.cc b/paddle/fluid/operators/searchsorted_op.cc index bbd5b9c4e7d..d0290795455 100644 --- a/paddle/fluid/operators/searchsorted_op.cc +++ b/paddle/fluid/operators/searchsorted_op.cc @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/searchsorted_op.h" - +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -117,10 +116,3 @@ class SearchSortedOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker); - -REGISTER_OP_CPU_KERNEL( - searchsorted, - ops::SearchSortedKernel, - ops::SearchSortedKernel, - ops::SearchSortedKernel, - ops::SearchSortedKernel); diff --git a/paddle/phi/kernels/cpu/searchsorted_kernel.cc b/paddle/phi/kernels/cpu/searchsorted_kernel.cc new file mode 100644 index 00000000000..c036c2d438a --- /dev/null +++ b/paddle/phi/kernels/cpu/searchsorted_kernel.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/searchsorted_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/searchsorted_kernel_impl.h" + +PD_REGISTER_KERNEL(searchsorted, + CPU, + ALL_LAYOUT, + phi::SearchsortedKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/searchsorted_kernel.cu b/paddle/phi/kernels/gpu/searchsorted_kernel.cu new file mode 100644 index 00000000000..4a2ce2241c2 --- /dev/null +++ b/paddle/phi/kernels/gpu/searchsorted_kernel.cu @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/searchsorted_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/searchsorted_kernel_impl.h" + +PD_REGISTER_KERNEL(searchsorted, + GPU, + ALL_LAYOUT, + phi::SearchsortedKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/fluid/operators/searchsorted_op.h b/paddle/phi/kernels/impl/searchsorted_kernel_impl.h similarity index 58% rename from paddle/fluid/operators/searchsorted_op.h rename to paddle/phi/kernels/impl/searchsorted_kernel_impl.h index 6aa38a81581..82bd9fba2a6 100644 --- a/paddle/fluid/operators/searchsorted_op.h +++ b/paddle/phi/kernels/impl/searchsorted_kernel_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -16,16 +16,11 @@ #include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/algorithm.h" +#include "paddle/phi/kernels/funcs/for_range.h" -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; +namespace phi { template class GpuAndCpuSearchSortedCompute { @@ -65,9 +60,11 @@ class GpuAndCpuSearchSortedCompute { static HOSTDEVICE bool IsInf(int64_t x) { return false; } HOSTDEVICE GpuAndCpuSearchSortedCompute(const T1* sequence_data, - const T2* value_data, bool right, + const T2* value_data, + bool right, bool is_1d_boundaries, - int64_t val_size, int64_t seq_size, + int64_t val_size, + int64_t seq_size, OutType* out_data) : sequence_data_(sequence_data), value_data_(value_data), @@ -104,12 +101,13 @@ class GpuAndCpuSearchSortedCompute { OutType* out_data_; }; -template +template class SearchSortedFunctor { public: - SearchSortedFunctor(const framework::ExecutionContext& context, - const framework::Tensor* sorted_sequence, - const framework::Tensor* value, bool right, + SearchSortedFunctor(const Context& context, + const DenseTensor* sorted_sequence, + const DenseTensor* value, + bool right, OutType* out_data) : context_(context), sorted_sequence_(sorted_sequence), @@ -121,74 +119,73 @@ class SearchSortedFunctor { void apply() { const T1* sequence_data = sorted_sequence_->data(); const T2* value_data = value_->data(); - const framework::DDim& seq_dims = sorted_sequence_->dims(); - const framework::DDim& val_dims = value_->dims(); + const phi::DDim& seq_dims = sorted_sequence_->dims(); + const phi::DDim& val_dims = value_->dims(); bool is_1d_boundaries = seq_dims.size() == 1; int64_t val_size = val_dims[val_dims.size() - 1]; int64_t seq_size = seq_dims[seq_dims.size() - 1]; - auto& dev_ctx = context_.template device_context(); - platform::ForRange for_range(dev_ctx, value_->numel()); + funcs::ForRange for_range(context_, value_->numel()); GpuAndCpuSearchSortedCompute - gpu_and_cpu_search_sorted_compute(sequence_data, value_data, right_, - is_1d_boundaries, val_size, seq_size, + gpu_and_cpu_search_sorted_compute(sequence_data, + value_data, + right_, + is_1d_boundaries, + val_size, + seq_size, out_data_); for_range(gpu_and_cpu_search_sorted_compute); } private: - const framework::ExecutionContext& context_; - const framework::Tensor* sorted_sequence_; - const framework::Tensor* value_; + const Context& context_; + const DenseTensor* sorted_sequence_; + const DenseTensor* value_; bool right_; OutType* out_data_; }; template -static void VisitDataType(framework::proto::VarType::Type type, - Visitor visitor) { - if (type == framework::proto::VarType::FP32) { +static void VisitDataType(DataType type, Visitor visitor) { + if (type == DataType::FLOAT32) { visitor.template apply(); - } else if (type == framework::proto::VarType::FP64) { + } else if (type == DataType::FLOAT64) { visitor.template apply(); - } else if (type == framework::proto::VarType::INT32) { + } else if (type == DataType::INT32) { visitor.template apply(); - } else if (type == framework::proto::VarType::INT64) { + } else if (type == DataType::INT64) { visitor.template apply(); } else { - PADDLE_THROW(platform::errors::InvalidArgument( + PADDLE_THROW(errors::InvalidArgument( "The recieved values data type %s can not meet input requirements. 
" "Because the given values data type of searchsorted operators must be " "float32, float64, int32 or int64. Please input appropriate " "sorted_sequence again! ", - framework::DataTypeToString(type))); + type)); } } -template -class SearchSortedKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* sorted_sequence = context.Input("SortedSequence"); - auto* value = context.Input("Values"); - bool out_int32 = context.Attr("out_int32"); - bool right = context.Attr("right"); - auto* out = context.Output("Out"); - - if (out_int32) { - int* out_data = out->mutable_data(context.GetPlace()); - SearchSortedFunctor functor( - context, sorted_sequence, value, right, out_data); - VisitDataType(framework::TransToProtoVarType(value->dtype()), functor); - } else { - int64_t* out_data = out->mutable_data(context.GetPlace()); - SearchSortedFunctor functor( - context, sorted_sequence, value, right, out_data); - VisitDataType(framework::TransToProtoVarType(value->dtype()), functor); - } +template +void SearchsortedKernel(const Context& ctx, + const DenseTensor& sorted_sequence, + const DenseTensor& value, + bool out_int32, + bool right, + DenseTensor* out) { + if (out_int32) { + ctx.template Alloc(out); + int* out_data = out->data(); + SearchSortedFunctor functor( + ctx, &sorted_sequence, &value, right, out_data); + VisitDataType(value.dtype(), functor); + } else { + ctx.template Alloc(out); + int64_t* out_data = out->data(); + SearchSortedFunctor functor( + ctx, &sorted_sequence, &value, right, out_data); + VisitDataType(value.dtype(), functor); } -}; +} -} // namespace operators -} // namespace paddle +} // namespace phi diff --git a/paddle/fluid/operators/searchsorted_op.cu b/paddle/phi/kernels/searchsorted_kernel.h similarity index 54% rename from paddle/fluid/operators/searchsorted_op.cu rename to paddle/phi/kernels/searchsorted_kernel.h index 4633ab43efb..e425c7fd795 100644 --- a/paddle/fluid/operators/searchsorted_op.cu +++ b/paddle/phi/kernels/searchsorted_kernel.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,12 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/searchsorted_op.h" -namespace ops = paddle::operators; -namespace plat = paddle::platform; +#pragma once -REGISTER_OP_CUDA_KERNEL( - searchsorted, ops::SearchSortedKernel, - ops::SearchSortedKernel, - ops::SearchSortedKernel, - ops::SearchSortedKernel); +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SearchsortedKernel(const Context& ctx, + const DenseTensor& sorted_sequence, + const DenseTensor& value, + bool out_int32, + bool right, + DenseTensor* out); + +} // namespace phi -- GitLab From 8852591fff5eef40c3bd2bcafb1d97690517a6c9 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Tue, 15 Mar 2022 11:07:25 +0800 Subject: [PATCH 055/176] [IPU] add IPU related CI configures (#40354) * add ci * rm retry tests * format * restore retry tests * update timeout for ipu uts --- paddle/scripts/paddle_build.sh | 135 ++++++++++++++++++ .../fluid/tests/unittests/ipu/CMakeLists.txt | 6 + .../unittests/ipu/test_activation_x_op.py | 126 ---------------- .../unittests/ipu/test_batch_norm_op_ipu.py | 4 +- .../unittests/ipu/test_ipu_fp16_support.py | 109 -------------- 5 files changed, 143 insertions(+), 237 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py delete mode 100644 python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index d1db8feb217..39676b916e5 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -229,6 +229,7 @@ function cmake_base() { -DWITH_CNCL=${WITH_CNCL:-OFF} -DWITH_XPU=${WITH_XPU:-OFF} -DWITH_MLU=${WITH_MLU:-OFF} + -DWITH_IPU=${WITH_IPU:-OFF} -DLITE_GIT_TAG=release/v2.10 -DWITH_UNITY_BUILD=${WITH_UNITY_BUILD:-OFF} -DWITH_XPU_BKCL=${WITH_XPU_BKCL:-OFF} @@ -280,6 +281,7 @@ EOF -DLITE_GIT_TAG=release/v2.10 \ -DWITH_XPU=${WITH_XPU:-OFF} \ -DWITH_MLU=${WITH_MLU:-OFF} \ + -DWITH_IPU=${WITH_IPU:-OFF} \ -DWITH_CNCL=${WITH_CNCL:-OFF} \ -DXPU_SDK_ROOT=${XPU_SDK_ROOT:-""} \ -DWITH_LITE=${WITH_LITE:-OFF} \ @@ -1283,6 +1285,8 @@ function card_test() { CUDA_DEVICE_COUNT=$(rocm-smi -i | grep GPU | wc -l) elif [ "${WITH_MLU}" == "ON" ];then CUDA_DEVICE_COUNT=1 + elif [ "${WITH_IPU}" == "ON" ];then + CUDA_DEVICE_COUNT=1 else CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l) fi @@ -2240,6 +2244,130 @@ set -ex fi } +function parallel_test_base_ipu() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/ipu + if [ ${WITH_TESTING:-ON} == "ON" ] ; then + cat <> ${PADDLE_ROOT}/build/build_summary.txt + ut_actual_total_endTime_s=`date +%s` + echo "ipipe_log_param_actual_TestCases_Total_Time: $[ $ut_actual_total_endTime_s - $ut_actual_total_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt + if [[ "$EXIT_CODE" != "0" ]]; then + show_ut_retry_result + fi +set -ex + fi +} + function parallel_test() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build @@ -2257,6 +2385,8 @@ function parallel_test() { parallel_test_base_npu elif [ "$WITH_MLU" == "ON" ];then parallel_test_base_mlu + elif [ "$WITH_IPU" == "ON" ];then + parallel_test_base_ipu else parallel_test_base_cpu ${PROC_RUN:-1} fi @@ -3022,6 +3152,11 @@ function main() { parallel_test check_coverage ;; + check_ipu_coverage) + cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} + parallel_test + check_coverage + ;; reuse_so_cicheck_py35) reuse_so_cache parallel_test diff --git a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt 
index 959700ad743..79a2430a161 100644 --- a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt @@ -4,5 +4,11 @@ if(WITH_IPU) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) + # set all UTs timeout to 200s + set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 200) endforeach(TEST_OP) + + set_tests_properties(test_conv_op_ipu PROPERTIES TIMEOUT 300) + set_tests_properties(test_elemetwise_x_op_ipu PROPERTIES TIMEOUT 300) + set_tests_properties(test_reduce_x_op_ipu PROPERTIES TIMEOUT 600) endif() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py deleted file mode 100644 index 58a88c113fc..00000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.nn.functional as F -import paddle.optimizer -import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestRelu(IPUOpTest): - def setUp(self): - self.set_atol() - self.set_training() - self.init_op() - - def init_op(self): - self.op = paddle.fluid.layers.relu - - def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] - - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - - with fluid.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) - out = self.op(x, **self.attrs) - - fetch_list = [out.name] - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IpuCompiler( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog - - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] - - def run_test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - self.assertTrue(res0.shape == res1.shape) - - 
def test_case0(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - } - self.attrs = {} - self.set_feed_attr() - self.run_test_base() - - -class TestTanh(TestRelu): - def init_op(self): - self.op = F.tanh - - -class TestLog(TestRelu): - def init_op(self): - self.op = paddle.fluid.layers.log - - -class TestSigmoid(TestRelu): - def init_op(self): - self.op = F.sigmoid - - -class TestSqrt(TestRelu): - def init_op(self): - self.op = paddle.fluid.layers.sqrt - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py index 1dab958c1ec..c640cd441f1 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py @@ -115,7 +115,7 @@ class TestBase(IPUOpTest): class TestCase1(TestBase): def set_atol(self): - self.atol = 1e-7 + self.atol = 1e-6 self.rtol = 1e-6 self.atol_fp16 = 1e-3 self.rtol_fp16 = 1e-3 @@ -129,7 +129,7 @@ class TestCase1(TestBase): class TestCase2(TestBase): def set_atol(self): - self.atol = 1e-7 + self.atol = 1e-6 self.rtol = 1e-6 self.atol_fp16 = 1e-3 self.rtol_fp16 = 1e-3 diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py deleted file mode 100644 index aa6c05dc59a..00000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_fp16_support.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np -import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer -import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestBase(IPUOpTest): - def setUp(self): - self.set_atol() - self.set_feed() - self.set_feed_attr() - self.set_attrs() - - def set_feed(self): - np_data = np.random.uniform(low=-1, high=1, size=[1, 3, 100, 100]) - self.feed_ipu = {"x": np_data.astype('float16')} - self.feed_cpu = {"x": np_data.astype('float32')} - - def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed_cpu.values()] - self.feed_list = list(self.feed_cpu.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed_cpu.values() - ] - - def set_attrs(self): - self.attrs = {} - - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - - with fluid.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - x = paddle.static.data( - name=self.feed_list[0], - shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) - conv1 = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - conv2 = paddle.static.nn.conv2d( - x, num_filters=3, filter_size=3, bias_attr=False) - add1 = conv1 + conv2 - conv3 = paddle.static.nn.conv2d( - add1, num_filters=8, filter_size=8, bias_attr=False) - out = paddle.fluid.layers.relu(conv3, **self.attrs) - fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - feed = self.feed_ipu if run_ipu else self.feed_cpu - if run_ipu: - feed_list = self.feed_list - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=False) - ipu_strategy.SetHalfConfig(enable_fp16=True) - program = compiler.IPUCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - feed_list = self.feed_list - program = main_prog - result = exe.run(program, feed=feed, fetch_list=fetch_list) - return result[0] - - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue(res0.shape == res1.shape) - mae = np.mean(np.abs(res0.flatten() - res1.flatten())) - print("mae is ", mae) - self.assertTrue(mae < 0.001) - - -if __name__ == "__main__": - unittest.main() -- GitLab From 5cb506b09b3cedaa0f85cfba716424a7ae22ad62 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Tue, 15 Mar 2022 11:14:33 +0800 Subject: [PATCH 056/176] add yaml (#40533) --- python/paddle/utils/code_gen/api.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index aac68efc59a..639afeb4c86 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -150,6 +150,15 @@ func : reshape inplace : (x -> out) +- api : relu + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : relu + inplace : (x -> out) + - api : scale args : (Tensor x, Scalar scale, float bias, bool bias_after_scale) output : Tensor -- GitLab From 42c7bb4794832e5eb5f6f2e789cadc0dedac2345 Mon 
Sep 17 00:00:00 2001 From: qipengh Date: Tue, 15 Mar 2022 11:20:46 +0800 Subject: [PATCH 057/176] [MLU] add check_finite_and_unscale op for amp (#40458) --- .../amp/check_finite_and_unscale_op_mlu.cc | 88 +++++++++++ .../test_amp_check_finite_and_scale_op_mlu.py | 145 ++++++++++++++++++ 2 files changed, 233 insertions(+) create mode 100644 paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc new file mode 100644 index 00000000000..237cfcc6f11 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto& dev_ctx = ctx.template device_context(); + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); + + found_inf->mutable_data(dev_ctx.GetPlace()); + + MLUCnnlTensorDesc scale_desc(*scale); + MLUCnnlTensorDesc found_inf_desc(*found_inf, CNNL_LAYOUT_ARRAY, + ToCnnlDataType()); + + for (size_t i = 0; i < xs.size(); ++i) { + const auto* x = xs[i]; + auto* out = outs[i]; + out->mutable_data(ctx.GetPlace()); + + // check is_finite or is_nan + Tensor is_finite(found_inf->type()); + if (i != 0) { + is_finite.Resize(phi::make_ddim({1})); + is_finite.mutable_data(ctx.GetPlace()); + } else { + is_finite.ShareDataWith(*found_inf); + } + + MLUCnnlTensorDesc x_desc(*x); + + MLUCnnl::IsNanInf(ctx, x_desc.get(), GetBasePtr(x), + GetBasePtr(&is_finite)); + + // save is_finite by logical_and op after checking every input + if (i != 0) { + MLUCnnlTensorDesc is_finite_desc(is_finite, CNNL_LAYOUT_ARRAY, + ToCnnlDataType()); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_OR, found_inf_desc.get(), + GetBasePtr(found_inf), is_finite_desc.get(), + GetBasePtr(&is_finite), found_inf_desc.get(), + GetBasePtr(found_inf)); + } + + // The normal logic is : + // out = in, if found_inf = true + // out = in/scale, if found_inf = false + // But when found_inf is true, the data of Out should not be used. + // So, on MLU, we always compute out with in/scale. 
+ MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::Div(ctx, CNNL_COMPUTATION_HIGH_PRECISION, x_desc.get(), + GetBasePtr(x), scale_desc.get(), GetBasePtr(scale), + out_desc.get(), GetBasePtr(out)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_MLU_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleMLUKernel, + ops::CheckFiniteAndUnscaleMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py new file mode 100644 index 00000000000..57fa56acd68 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_amp_check_finite_and_scale_op_mlu.py @@ -0,0 +1,145 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle + +paddle.enable_static() +SEED = 2022 + + +class TestCheckFiniteAndUnscaleOp(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "check_finite_and_unscale" + self.init_dtype() + self.init_test_case() + + def init_test_case(self): + x = np.random.random((129, 129)).astype(self.dtype) + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([0]), + 'Out': [('out0', x / scale)], + } + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestCheckFiniteAndUnscaleOpWithNan(TestCheckFiniteAndUnscaleOp): + def init_test_case(self): + x = np.random.random((129, 129)).astype(self.dtype) + x[128][128] = np.nan + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([1]), + 'Out': [('out0', x)], + } + + def test_check_output(self): + # When input contains nan, do not check the output, + # since the output may be nondeterministic and will be discarded. + self.check_output_with_place(self.place, no_check_set=['Out']) + + +class TestCheckFiniteAndUnscaleOpWithInf(TestCheckFiniteAndUnscaleOp): + def init_test_case(self): + x = np.random.random((129, 129)).astype(self.dtype) + x[128][128] = np.inf + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([1]), + 'Out': [('out0', x)], + } + + def test_check_output(self): + # When input contains inf, do not check the output, + # since the output may be nondeterministic and will be discarded. 
+ self.check_output_with_place(self.place, no_check_set=['Out']) + + +class TestCheckFiniteAndUnscaleOpMultiInput(TestCheckFiniteAndUnscaleOp): + def init_test_case(self): + x0 = np.random.random((129, 129)).astype(self.dtype) + x1 = np.random.random((129, 129)).astype(self.dtype) + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x0), ('x1', x1)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([0]), + 'Out': [('out0', x0 / scale), ('out1', x1 / scale)], + } + + +class TestCheckFiniteAndUnscaleOpMultiInputWithNan(TestCheckFiniteAndUnscaleOp): + def init_test_case(self): + x0 = np.random.random((129, 129)).astype(self.dtype) + x0[128][128] = np.nan + x1 = np.random.random((129, 129)).astype(self.dtype) + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x0), ('x1', x1)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([1]), + 'Out': [('out0', x0 / scale), ('out1', x1 / scale)], + } + + def test_check_output(self): + # When input contains inf, do not check the output, + # since the output may be nondeterministic and will be discarded. + self.check_output_with_place(self.place, no_check_set=['Out']) + + +class TestCheckFiniteAndUnscaleOpMultiInputWithInf(TestCheckFiniteAndUnscaleOp): + def init_test_case(self): + x0 = np.random.random((129, 129)).astype(self.dtype) + x0[128][128] = np.nan + x1 = np.random.random((129, 129)).astype(self.dtype) + x1[128][128] = np.inf + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x0), ('x1', x1)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([1]), + 'Out': [('out0', x0 / scale), ('out1', x1 / scale)], + } + + def test_check_output(self): + # When input contains inf, do not check the output, + # since the output may be nondeterministic and will be discarded. 
+ self.check_output_with_place(self.place, no_check_set=['Out']) + + +if __name__ == '__main__': + unittest.main() -- GitLab From d7112180a7889c239026147694ca10f0e4bfee09 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Tue, 15 Mar 2022 11:35:32 +0800 Subject: [PATCH 058/176] [Phi]Move Tanh/BRelu/LeakyRelu/ThresholdedRelu Kernels to Phi (#40385) * move activation op * adjust code format * fix compile bugs * fix ci bugs * code format adjust * code format adjust2 * activate ci status * modify according to comment * move activation kernel * revert relu6 * reduce add code * perfect use_phi_functor * completing func name * fix bugs when run ci * fix bugs when run infr * modifpy infrt get kernel signature --- .../ir/mkldnn/mkldnn_inplace_pass_tester.cc | 4 +- .../new_executor/standalone_executor_test.cc | 4 +- .../tensorrt/convert/test_activation_op.cc | 2 +- .../tensorrt/convert/test_leaky_relu_op.cc | 2 +- paddle/fluid/operators/activation_op.cc | 30 +- paddle/fluid/operators/activation_op.h | 375 +---------------- paddle/fluid/operators/activation_op.kps | 184 +------- paddle/phi/kernels/activation_grad_kernel.h | 49 +++ paddle/phi/kernels/activation_kernel.h | 20 + .../phi/kernels/cpu/activation_grad_kernel.cc | 150 +++++-- paddle/phi/kernels/cpu/activation_kernel.cc | 63 ++- paddle/phi/kernels/funcs/activation_functor.h | 392 +++++++++++++++++- .../phi/kernels/gpu/activation_grad_kernel.cu | 201 ++++----- paddle/phi/kernels/gpu/activation_kernel.cu | 125 +++--- .../phi/kernels/impl/activation_grad_impl.h | 72 ++++ paddle/phi/ops/compat/activation_sig.cc | 97 ++++- tools/infrt/get_compat_kernel_signature.py | 5 +- 17 files changed, 983 insertions(+), 792 deletions(-) diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index d578ada0db0..ef2e83ced26 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -25,11 +25,11 @@ USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(leaky_relu); +USE_OP_ITSELF(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); USE_OP(gelu); USE_OP_ITSELF(relu); -USE_OP(tanh); +USE_OP_ITSELF(tanh); USE_OP_DEVICE_KERNEL(tanh, MKLDNN); namespace paddle { diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 219aae71127..eadb00b9e88 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -32,7 +32,7 @@ USE_OP(concat); USE_OP(matmul); USE_OP_ITSELF(elementwise_add); USE_OP(sigmoid); -USE_OP(tanh); +USE_OP_ITSELF(tanh); USE_OP(elementwise_mul); USE_OP(softmax_with_cross_entropy); USE_OP_ITSELF(reduce_mean); @@ -48,7 +48,7 @@ USE_OP(transpose2_grad); USE_OP(concat_grad); USE_OP_ITSELF(elementwise_mul_grad); USE_OP(sigmoid_grad); -USE_OP(tanh_grad); +USE_OP_ITSELF(tanh_grad); USE_OP(sum); USE_OP(slice_grad); USE_OP(lookup_table_grad); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 7f7313fbcb5..1946f9e2838 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -54,5 +54,5 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); 
} USE_OP_ITSELF(relu); USE_OP(sigmoid); -USE_OP(tanh); +USE_OP_ITSELF(tanh); USE_OP(relu6); diff --git a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc index 1725888abc3..f17e00de0ee 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc @@ -45,4 +45,4 @@ TEST(leaky_relu_op, test_leaky_relu) { } // namespace paddle // USE_OP(leaky_relu); -USE_OP(leaky_relu); +USE_OP_ITSELF(leaky_relu); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 66f1bcc8b68..4205f2253a6 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1482,6 +1482,9 @@ REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor); REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); +REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor); +REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu, + ThresholdedReluFunctor, ThresholdedReluGradFunctor); /* ========================== sigmoid register ============================= */ @@ -1567,23 +1570,6 @@ REGISTER_OPERATOR( ops::ActivationOpTripleGrad::FwdDeps()>, ops::ActivationTripleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); -REGISTER_OP_CPU_KERNEL( - tanh_grad_grad, ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>); -// Register TripleGrad Kernel -REGISTER_OP_CPU_KERNEL( - tanh_triple_grad, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>); /* ========================================================================== */ /* ========================== relu register ============================= */ @@ -1623,16 +1609,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad2::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, - LeakyReluGradFunctor); -REGISTER_OP_CPU_KERNEL( - leaky_relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel< - plat::CPUDeviceContext, ops::LeakyReluGradGradFunctor>); /* ========================================================================== */ /* ======================== elu register ============================ */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 4b79397b6cd..b076db01c22 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -253,6 +253,14 @@ struct SigmoidFunctor : public BaseActivationFunctor { template \ using name##GradFunctor = phi::funcs::name##GradFunctor; +#define USE_PHI_DOUBLE_GRAD_FUNCTOR(name) \ + template \ + using name##GradGradFunctor = phi::funcs::name##GradGradFunctor; + +#define USE_PHI_TRIPLE_GRAD_FUNCTOR(name) \ + template \ + using name##TripleGradFunctor = phi::funcs::name##TripleGradFunctor; + USE_PHI_FUNCTOR(Cos) USE_PHI_FUNCTOR(Tan) USE_PHI_FUNCTOR(Acos) @@ -264,6 +272,13 @@ USE_PHI_FUNCTOR(Cosh) USE_PHI_FUNCTOR(Asinh) USE_PHI_FUNCTOR(Acosh) USE_PHI_FUNCTOR(Atanh) +USE_PHI_FUNCTOR(Tanh) +USE_PHI_DOUBLE_GRAD_FUNCTOR(Tanh) +USE_PHI_TRIPLE_GRAD_FUNCTOR(Tanh) +USE_PHI_FUNCTOR(BRelu) +USE_PHI_FUNCTOR(ThresholdedRelu) 
+USE_PHI_FUNCTOR(LeakyRelu) +USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu) template struct SigmoidGradFunctor : public BaseActivationFunctor { @@ -497,117 +512,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor; template using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor; -// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) -template -struct TanhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.tanh(); - } -}; - -template -struct TanhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (static_cast(1) - out * out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct TanhGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - framework::Tensor* dOutNew, framework::Tensor* ddOut) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad")); - // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out - // * ddx) - if (dOutNew) { - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); - auto dout_new = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad")); - dout_new.device(*d) = - static_cast(-1) * dout * static_cast(2) * out * ddx; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad")); - ddout.device(*d) = (static_cast(1) - out * out) * ddx; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; -/* - Out - DOut D_Dout - DDx -> TanhTripleGrad -> D_DDx - D_DDout d_OutNew - D_Dout_new - - D_Dout = (-2) * Out * DDx * D_Dout_new - D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new - D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new - - Out, DDX, DOut, D_DDOut, D_DOut_New // input - D_OutNew, D_DOut, D_DDx // output -*/ -template -struct TanhTripleGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - const framework::Tensor* d_DDOut, - const framework::Tensor* d_dOut_New, - framework::Tensor* d_d_Out, framework::Tensor* d_Out_New, - framework::Tensor* d_DDx) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad")); - auto d_ddOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad")); - auto d_dOutNew = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad")); - - if (d_Out_New) { - auto d_OutNew = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad")); - d_OutNew.device(*d) = (static_cast(-2) * out * ddx * d_ddOut) - - 
(static_cast(2) * dout * ddx * d_dOutNew); - } - if (d_d_Out) { - auto d_dOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad")); - d_dOut.device(*d) = static_cast(-2) * out * ddx * d_dOutNew; - } - if (d_DDx) { - auto d_ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad")); - d_ddx.device(*d) = (static_cast(1) - (out * out)) * d_ddOut - - static_cast(2) * out * dout * d_dOutNew; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - // tanhshrink(x) = x - tanh(x) // where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template @@ -909,42 +813,6 @@ struct SquareGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct BReluFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - - // NOTE: Explicit hides the `BaseActivationFunctor::GetAttrs` - // not polymorphism for speed. - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - template - void operator()(Device d, X x, Out out) const { - out.device(d) = - x.cwiseMax(static_cast(t_min)).cwiseMin(static_cast(t_max)); - } -}; - -template -struct BReluGradFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * - ((x > static_cast(t_min)) * (x < static_cast(t_max))) - .template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - // relu6(x) = min(max(0, x), 6) template struct Relu6Functor : public BaseActivationFunctor { @@ -1168,41 +1036,6 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { } }; -template -struct LeakyReluFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - template - void operator()(Device d, X x, Out out) const { - if (alpha < 1.f) { - out.device(d) = x.cwiseMax(static_cast(alpha) * x); - } else { - out.device(d) = x.cwiseMin(static_cast(alpha) * x); - } - } -}; - -template -struct LeakyReluGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = - static_cast(alpha) * (x < static_cast(0)).template cast(); - auto temp2 = (x >= static_cast(0)).template cast(); - dx.device(d) = dout * (temp1 + temp2).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct ELUFunctor : public BaseActivationFunctor { float alpha; @@ -1430,37 +1263,6 @@ struct STanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct ThresholdedReluFunctor : public BaseActivationFunctor { - float threshold; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out) const { - auto th = static_cast(threshold); - out.device(d) = (x > th).template cast() * x; - } -}; - -template -struct ThresholdedReluGradFunctor : public BaseActivationFunctor { - float 
threshold; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto th = static_cast(threshold); - dx.device(d) = dout * (x > th).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct HardSigmoidFunctor : public BaseActivationFunctor { float slope; @@ -1531,121 +1333,6 @@ struct SwishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -/* - * in arguments: x, out, ddx - * out arguments: ddout, dout, dx - */ -template -inline void ExtractActivationDoubleGradTensor( - const framework::ExecutionContext& ctx, const framework::Tensor** X, - const framework::Tensor** Out, const framework::Tensor** ddX, - framework::Tensor** dX, framework::Tensor** dOut, - framework::Tensor** ddOut) { - auto ddx_var = ctx.InputVar("DDX"); - auto ddo_var = ctx.OutputVar("DDOut"); - PADDLE_ENFORCE_NOT_NULL( - ddx_var, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("DDX"))); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *ddX = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*ddx_var); - if (ddo_var) { - *ddOut = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - ddo_var); - } - } else { - *ddX = ctx.Input("DDX"); - if (ddo_var) { - *ddOut = ctx.Output("DDOut"); - } - } - PADDLE_ENFORCE_NOT_NULL( - *ddX, - platform::errors::NotFound( - "Cannot get the tensor from the Variable Output, variable name = %s", - ctx.OutputName("DDX"))); - - if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { - auto x_var = ctx.InputVar("X"); - PADDLE_ENFORCE_NOT_NULL( - x_var, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("X"))); - auto dx_var = ctx.OutputVar("DX"); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var); - if (dx_var) { - *dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - dx_var); - } - } else { - *X = ctx.Input("X"); - if (dx_var) { - *dX = ctx.Output("DX"); - } - } - } else { - VLOG(10) << "Inplace activation of Op: " << ctx.Type(); - *X = *ddX; - } - if (static_cast(kDepValue) & - static_cast(ActBwdOpFwdDeps::kDepOut)) { - auto out_var = ctx.InputVar("Out"); - PADDLE_ENFORCE_NOT_NULL( - out_var, - platform::errors::NotFound( - "Cannot get the tensor from the Variable Out, variable name = %s", - ctx.InputName("Out"))); - auto dout_var = ctx.OutputVar("DOut"); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *Out = - paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var); - if (dout_var) { - *dOut = - paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - dout_var); - } - } else { - *Out = ctx.Input("Out"); - if (dout_var) { - *dOut = ctx.Output("DOut"); - } - } - } else { - VLOG(10) << "Inplace activation of Op: " << ctx.Type(); - *Out = *ddX; - } -} - -template -class ActivationDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *X, *Out, *ddX; - X = Out = ddX = nullptr; - framework::Tensor *ddOut, *dOut, *dX; - ddOut = dOut = dX = nullptr; - - ExtractActivationDoubleGradTensor(ctx, &X, &Out, &ddX, - &dX, &dOut, &ddOut); - - if 
(ddOut) ddOut->mutable_data(ctx.GetPlace()); - if (dOut) dOut->mutable_data(ctx.GetPlace()); - if (dX) dX->mutable_data(Out->dims(), ctx.GetPlace()); - - auto& place = ctx.template device_context(); - - Functor functor; - auto attrs = functor.GetAttrs(); - for (auto& attr : attrs) { - *attr.second = ctx.Attr(attr.first); - } - functor(place, X, Out, ddX, ddOut, dOut, dX); - } -}; - template struct AbsGradGradFunctor : public BaseActivationFunctor { template @@ -1667,35 +1354,6 @@ struct AbsGradGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct LeakyReluGradGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* Out, const framework::Tensor* ddX, - framework::Tensor* ddOut, framework::Tensor* dOut, - framework::Tensor* dX) const { - if (ddOut) { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad")); - auto x = framework::EigenVector::Flatten( - GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad")); - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad")); - ddout.device(*d) = - ddx * - ((x > static_cast(0)).template cast() + - static_cast(alpha) * (x <= static_cast(0)).template cast()) - .template cast(); - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct ELUGradGradFunctor : public BaseActivationFunctor { float alpha; @@ -2504,7 +2162,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor { __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ __macro(log2, Log2, Log2Functor, Log2GradFunctor); \ __macro(log10, Log10, Log10Functor, Log10GradFunctor); \ - __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \ __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ @@ -2515,7 +2172,5 @@ struct LogGradGradFunctor : public BaseActivationFunctor { __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \ HardSigmoidGradFunctor); \ __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ - __macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor, \ - ThresholdedReluGradFunctor); \ __macro(mish, Mish, MishFunctor, MishGradFunctor); \ __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor); diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 92a101451e2..256f20db084 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -18,38 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -template -struct CudaLeakyReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // leakyrelu(x) = x > 0 ? x : alpha * x - __device__ __forceinline__ T operator()(const T x) const { - return x > zero ? 
x : static_cast(alpha) * x; - } -}; - -template -struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // dx = dout * (x > 0 ? 1 : alpha) - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return x > zero ? dout : static_cast(alpha) * dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaSigmoidFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -224,31 +192,6 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor { } }; -template -struct CudaTanhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tanh(x) = tanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(tanh(x)); - } -}; - -template -struct CudaTanhGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout * (1 - out^2) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return dout * (one - out * out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct CudaReciprocalFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); @@ -476,45 +419,6 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaBReluFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - // brelu(x) = min(max(x, t_min), t_max) - __device__ __forceinline__ T operator()(const T x) const { - T t_min_cast = static_cast(t_min); - T t_max_cast = static_cast(t_max); - T temp_max = x > t_min_cast ? x : t_min_cast; - T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast; - return temp_min; - } -}; - -template -struct CudaBReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float t_min; - float t_max; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - // dx = (x > t_min && x < t_max) ? dout : 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - T t_min_cast = static_cast(t_min); - T t_max_cast = static_cast(t_max); - return (x > t_min_cast && x < t_max_cast) ? dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaSoftReluFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -907,38 +811,6 @@ struct CudaMishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaThresholdedReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // thresholded_relu(x) = x > threshold ? x : 0 - __device__ __forceinline__ T operator()(const T x) const { - return x > static_cast(threshold) ? 
x : zero; - } -}; - -template -struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // dx = x > threshold ? dout : 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return x > static_cast(threshold) ? dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaHardSwishFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); @@ -1212,6 +1084,22 @@ class ActivationGradCudaKernel } }; +USE_PHI_FUNCTOR(CudaCos) +USE_PHI_FUNCTOR(CudaTan) +USE_PHI_FUNCTOR(CudaAcos) +USE_PHI_FUNCTOR(CudaSin) +USE_PHI_FUNCTOR(CudaAsin) +USE_PHI_FUNCTOR(CudaAtan) +USE_PHI_FUNCTOR(CudaSinh) +USE_PHI_FUNCTOR(CudaCosh) +USE_PHI_FUNCTOR(CudaAsinh) +USE_PHI_FUNCTOR(CudaAcosh) +USE_PHI_FUNCTOR(CudaAtanh) +USE_PHI_FUNCTOR(CudaTanh) +USE_PHI_FUNCTOR(CudaBRelu) +USE_PHI_FUNCTOR(CudaLeakyRelu) +USE_PHI_FUNCTOR(CudaThresholdedRelu) + } // namespace operators } // namespace paddle @@ -1270,20 +1158,6 @@ namespace plat = paddle::platform; ops::ActivationGradCudaKernel>); -/* ======================== leaky relu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, - CudaLeakyReluGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - leaky_relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel< - plat::CUDADeviceContext, ops::LeakyReluGradGradFunctor>); -/* ========================================================================== */ - /* ======================== elu register ============================ */ REGISTER_OP_CUDA_KERNEL( elu, ops::ActivationCudaKernel>); /* ========================================================================== */ -/* =========================== tanh register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor, - CudaTanhGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - tanh_grad_grad, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - tanh_triple_grad, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>); -/* ========================================================================== */ - /* =========================== sqrt register ============================= */ REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, CudaSqrtGradFunctor); @@ -1521,7 +1372,6 @@ REGISTER_OP_CUDA_KERNEL( __macro(log1p, Log1p, CudaLog1pFunctor, CudaLog1pGradFunctor); \ __macro(log2, Log2, CudaLog2Functor, CudaLog2GradFunctor); \ __macro(log10, Log10, CudaLog10Functor, CudaLog10GradFunctor); \ - __macro(brelu, BRelu, CudaBReluFunctor, CudaBReluGradFunctor); \ __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \ __macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor); \ __macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor); \ @@ -1535,8 +1385,6 @@ REGISTER_OP_CUDA_KERNEL( CudaHardSigmoidGradFunctor); \ __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \ __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor); \ - __macro(thresholded_relu, ThresholdedRelu, CudaThresholdedReluFunctor, \ - CudaThresholdedReluGradFunctor); \ __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ CudaHardSwishGradFunctor); 
FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index f34e5710ab7..a5b737b28c2 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -39,6 +39,54 @@ void ReluDoubleGradKernel(const Context& dev_ctx, const DenseTensor& ddx, DenseTensor* ddout); +template +void TanhDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + DenseTensor* dout_new, + DenseTensor* ddout); + +template +void TanhTripleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + const DenseTensor& d_ddout, + const DenseTensor& d_dout_new, + DenseTensor* d_out_new, + DenseTensor* d_dout, + DenseTensor* d_ddx); + +template +void BReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + float t_min, + float t_max, + DenseTensor* dx); + +template +void LeakyReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + float alpha, + DenseTensor* dx); + +template +void LeakyReluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& ddx, + float alpha, + DenseTensor* ddout); + +template +void ThresholdedReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + float threshold, + DenseTensor* dx); + DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos); @@ -51,5 +99,6 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh); DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu); +DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh); } // namespace phi diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h index bdf8f436359..885dccad8e3 100644 --- a/paddle/phi/kernels/activation_kernel.h +++ b/paddle/phi/kernels/activation_kernel.h @@ -36,5 +36,25 @@ DECLARE_ACTIVATION_KERNEL(Asinh) DECLARE_ACTIVATION_KERNEL(Acosh) DECLARE_ACTIVATION_KERNEL(Atanh) DECLARE_ACTIVATION_KERNEL(Relu) +DECLARE_ACTIVATION_KERNEL(Tanh) + +template +void BReluKernel(const Context& dev_ctx, + const DenseTensor& x, + float t_min, + float t_max, + DenseTensor* out); + +template +void LeakyReluKernel(const Context& dev_ctx, + const DenseTensor& x, + float alpha, + DenseTensor* out); + +template +void ThresholdedReluKernel(const Context& dev_ctx, + const DenseTensor& x, + float threshold, + DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index fe43ebb8160..f9af50f6832 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -27,65 +27,135 @@ namespace phi { const DenseTensor& x, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradImpl( \ + functor_class functor; \ + ActivationGradImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + 
ActivationGradImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + #define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradImpl( \ + functor_class functor; \ + ActivationGradImpl>( \ dev_ctx, nullptr, &out, &dout, dx, functor); \ } -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor); +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor); + +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, funcs::TanhGradFunctor); + +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, + funcs::LeakyReluGradFunctor, + alpha); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( + ThresholdedRelu, funcs::ThresholdedReluGradFunctor, threshold); + +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, + funcs::BReluGradFunctor, + t_min, + t_max); } // namespace phi -PD_REGISTER_KERNEL( - cos_grad, CPU, ALL_LAYOUT, phi::CosGradKernel, float, double) {} 
-PD_REGISTER_KERNEL( - tan_grad, CPU, ALL_LAYOUT, phi::TanGradKernel, float, double) {} -PD_REGISTER_KERNEL( - acos_grad, CPU, ALL_LAYOUT, phi::AcosGradKernel, float, double) {} -PD_REGISTER_KERNEL( - sin_grad, CPU, ALL_LAYOUT, phi::SinGradKernel, float, double) {} -PD_REGISTER_KERNEL( - asin_grad, CPU, ALL_LAYOUT, phi::AsinGradKernel, float, double) {} -PD_REGISTER_KERNEL( - atan_grad, CPU, ALL_LAYOUT, phi::AtanGradKernel, float, double) {} -PD_REGISTER_KERNEL( - sinh_grad, CPU, ALL_LAYOUT, phi::SinhGradKernel, float, double) {} -PD_REGISTER_KERNEL( - cosh_grad, CPU, ALL_LAYOUT, phi::CoshGradKernel, float, double) {} -PD_REGISTER_KERNEL( - asinh_grad, CPU, ALL_LAYOUT, phi::AsinhGradKernel, float, double) {} -PD_REGISTER_KERNEL( - acosh_grad, CPU, ALL_LAYOUT, phi::AcoshGradKernel, float, double) {} -PD_REGISTER_KERNEL( - atanh_grad, CPU, ALL_LAYOUT, phi::AtanhGradKernel, float, double) {} PD_REGISTER_KERNEL( relu_grad, CPU, ALL_LAYOUT, phi::ReluGradKernel, float, double) {} -PD_REGISTER_KERNEL(relu_double_grad, + +#define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \ + PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func, float, double) {} + +#define PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(name, func) \ + PD_REGISTER_KERNEL( \ + name, CPU, ALL_LAYOUT, phi::func, float, double, phi::dtype::float16) {} + +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sin_grad, SinGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(cos_grad, CosGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tan_grad, TanGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(acos_grad, AcosGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(asin_grad, AsinGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(atan_grad, AtanGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sinh_grad, SinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(cosh_grad, CoshGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(asinh_grad, AsinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(acosh_grad, AcoshGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(atanh_grad, AtanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_grad, TanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, + ThresholdedReluGradKernel) + +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad, + ReluDoubleGradKernel) +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(tanh_double_grad, + TanhDoubleGradKernel) +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(leaky_relu_double_grad, + LeakyReluDoubleGradKernel) + +PD_REGISTER_KERNEL(tanh_triple_grad, CPU, ALL_LAYOUT, - phi::ReluDoubleGradKernel, + phi::TanhTripleGradKernel, float, double, phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index 51883f25183..0d13429c8f6 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -27,6 +27,33 @@ namespace phi { ActivationImpl(dev_ctx, x, out, functor); \ } +#define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr, \ + DenseTensor* out) { \ + functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationImpl>(dev_ctx, x, out, functor); \ + } + +#define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS( \ + name, functor_class, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + 
const DenseTensor& x, \ + float attr1, \ + float attr2, \ + DenseTensor* out) { \ + functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationImpl>(dev_ctx, x, out, functor); \ + } + DEFINE_CPU_ACTIVATION_KERNEL(Sin, funcs::SinFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Cos, funcs::CosFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Tan, funcs::TanFunctor) @@ -39,17 +66,31 @@ DEFINE_CPU_ACTIVATION_KERNEL(Asinh, funcs::AsinhFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Acosh, funcs::AcoshFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Atanh, funcs::AtanhFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Relu, funcs::ReluCPUFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Tanh, funcs::TanhFunctor) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, funcs::LeakyReluFunctor, alpha) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, + funcs::ThresholdedReluFunctor, + threshold) +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, funcs::BReluFunctor, t_min, t_max) } // namespace phi -PD_REGISTER_KERNEL(sin, CPU, ALL_LAYOUT, phi::SinKernel, float, double) {} -PD_REGISTER_KERNEL(cos, CPU, ALL_LAYOUT, phi::CosKernel, float, double) {} -PD_REGISTER_KERNEL(tan, CPU, ALL_LAYOUT, phi::TanKernel, float, double) {} -PD_REGISTER_KERNEL(acos, CPU, ALL_LAYOUT, phi::AcosKernel, float, double) {} -PD_REGISTER_KERNEL(asin, CPU, ALL_LAYOUT, phi::AsinKernel, float, double) {} -PD_REGISTER_KERNEL(atan, CPU, ALL_LAYOUT, phi::AtanKernel, float, double) {} -PD_REGISTER_KERNEL(sinh, CPU, ALL_LAYOUT, phi::SinhKernel, float, double) {} -PD_REGISTER_KERNEL(cosh, CPU, ALL_LAYOUT, phi::CoshKernel, float, double) {} -PD_REGISTER_KERNEL(asinh, CPU, ALL_LAYOUT, phi::AsinhKernel, float, double) {} -PD_REGISTER_KERNEL(acosh, CPU, ALL_LAYOUT, phi::AcoshKernel, float, double) {} -PD_REGISTER_KERNEL(atanh, CPU, ALL_LAYOUT, phi::AtanhKernel, float, double) {} PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {} + +#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \ + PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func##Kernel, float, double) {} + +PD_REGISTER_ACTIVATION_KERNEL(sin, Sin) +PD_REGISTER_ACTIVATION_KERNEL(cos, Cos) +PD_REGISTER_ACTIVATION_KERNEL(tan, Tan) +PD_REGISTER_ACTIVATION_KERNEL(acos, Acos) +PD_REGISTER_ACTIVATION_KERNEL(asin, Asin) +PD_REGISTER_ACTIVATION_KERNEL(atan, Atan) +PD_REGISTER_ACTIVATION_KERNEL(sinh, Sinh) +PD_REGISTER_ACTIVATION_KERNEL(cosh, Cosh) +PD_REGISTER_ACTIVATION_KERNEL(asinh, Asinh) +PD_REGISTER_ACTIVATION_KERNEL(acosh, Acosh) +PD_REGISTER_ACTIVATION_KERNEL(atanh, Atanh) +PD_REGISTER_ACTIVATION_KERNEL(tanh, Tanh) +PD_REGISTER_ACTIVATION_KERNEL(brelu, BRelu) +PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyRelu) +PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedRelu) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 1a36e4e132f..c8fb54bb102 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -513,7 +513,270 @@ struct ReluGradGradFunctor : public BaseActivationFunctor { } }; -#if defined(__NVCC__) || defined(__HIPCC__) +// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct TanhFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.tanh(); + } +}; + +template +struct TanhGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (static_cast(1) - out * 
out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct TanhGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const DenseTensor* ddX, + const DenseTensor* dOut, + DenseTensor* dOutNew, + DenseTensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad")); + // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out + // * ddx) + if (dOutNew) { + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); + auto dout_new = EigenVector::Flatten( + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad")); + dout_new.device(*d) = + static_cast(-1) * dout * static_cast(2) * out * ddx; + } + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad")); + ddout.device(*d) = (static_cast(1) - out * out) * ddx; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; +/* + Out + DOut D_Dout + DDx -> TanhTripleGrad -> D_DDx + D_DDout d_OutNew + D_Dout_new + + D_Dout = (-2) * Out * DDx * D_Dout_new + D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new + D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new + + Out, DDX, DOut, D_DDOut, D_DOut_New // input + D_OutNew, D_DOut, D_DDx // output +*/ +template +struct TanhTripleGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const DenseTensor* ddX, + const DenseTensor* dOut, + const DenseTensor* d_DDOut, + const DenseTensor* d_dOut_New, + DenseTensor* d_d_Out, + DenseTensor* d_Out_New, + DenseTensor* d_DDx) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad")); + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad")); + auto d_ddOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad")); + auto d_dOutNew = EigenVector::Flatten( + GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad")); + + if (d_Out_New) { + auto d_OutNew = EigenVector::Flatten( + GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad")); + d_OutNew.device(*d) = (static_cast(-2) * out * ddx * d_ddOut) - + (static_cast(2) * dout * ddx * d_dOutNew); + } + if (d_d_Out) { + auto d_dOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad")); + d_dOut.device(*d) = static_cast(-2) * out * ddx * d_dOutNew; + } + if (d_DDx) { + auto d_ddx = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad")); + d_ddx.device(*d) = (static_cast(1) - (out * out)) * d_ddOut - + static_cast(2) * out * dout * d_dOutNew; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct BReluFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + + // NOTE: Explicit hides the `BaseActivationFunctor::GetAttrs` + // not polymorphism for speed. 
+ typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + x.cwiseMax(static_cast(t_min)).cwiseMin(static_cast(t_max)); + } +}; + +template +struct BReluGradFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((x > static_cast(t_min)) * (x < static_cast(t_max))) + .template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct LeakyReluFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + template + void operator()(Device d, X x, Out out) const { + if (alpha < 1.f) { + out.device(d) = x.cwiseMax(static_cast(alpha) * x); + } else { + out.device(d) = x.cwiseMin(static_cast(alpha) * x); + } + } +}; + +template +struct LeakyReluGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = + static_cast(alpha) * (x < static_cast(0)).template cast(); + auto temp2 = (x >= static_cast(0)).template cast(); + dx.device(d) = dout * (temp1 + temp2).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct LeakyReluGradGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(const Device& dev, + const DenseTensor* X, + const DenseTensor* Out, + const DenseTensor* ddX, + DenseTensor* ddOut, + DenseTensor* dOut, + DenseTensor* dX) const { + if (ddOut) { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad")); + auto x = EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad")); + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad")); + ddout.device(*d) = + ddx * + ((x > static_cast(0)).template cast() + + static_cast(alpha) * (x <= static_cast(0)).template cast()) + .template cast(); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct ThresholdedReluFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto th = static_cast(threshold); + out.device(d) = (x > th).template cast() * x; + } +}; + +template +struct ThresholdedReluGradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto th = static_cast(threshold); + dx.device(d) = dout * (x > th).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) template struct CudaReluFunctor : public 
BaseActivationFunctor { T zero = static_cast(0.0f); @@ -824,6 +1087,133 @@ struct CudaAtanGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +template +struct CudaTanhFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // tanh(x) = tanh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(tanh(x)); + } +}; + +template +struct CudaTanhGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout * (1 - out^2) + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return dout * (one - out * out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CudaBReluFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + // brelu(x) = min(max(x, t_min), t_max) + __device__ __forceinline__ T operator()(const T x) const { + T t_min_cast = static_cast(t_min); + T t_max_cast = static_cast(t_max); + T temp_max = x > t_min_cast ? x : t_min_cast; + T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast; + return temp_min; + } +}; + +template +struct CudaBReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float t_min; + float t_max; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + // dx = (x > t_min && x < t_max) ? dout : 0 + __device__ __forceinline__ T operator()(const T dout, const T x) const { + T t_min_cast = static_cast(t_min); + T t_max_cast = static_cast(t_max); + return (x > t_min_cast && x < t_max_cast) ? dout : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaThresholdedReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // thresholded_relu(x) = x > threshold ? x : 0 + __device__ __forceinline__ T operator()(const T x) const { + return x > static_cast(threshold) ? x : zero; + } +}; + +template +struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = x > threshold ? dout : 0 + __device__ __forceinline__ T operator()(const T dout, const T x) const { + return x > static_cast(threshold) ? dout : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaLeakyReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // leakyrelu(x) = x > 0 ? x : alpha * x + __device__ __forceinline__ T operator()(const T x) const { + return x > zero ? x : static_cast(alpha) * x; + } +}; + +template +struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // dx = dout * (x > 0 ? 
1 : alpha) + __device__ __forceinline__ T operator()(const T dout, const T x) const { + return x > zero ? dout : static_cast(alpha) * dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; #endif } // namespace funcs diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index c2995c79a7e..00792b8ab60 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -79,113 +79,97 @@ void ActivationGradGPUImpl(const Context& dev_ctx, const DenseTensor& x, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradGPUImpl( \ + funcs::functor_class functor; \ + ActivationGradGPUImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradGPUImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradGPUImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + #define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradGPUImpl( \ + funcs::functor_class functor; \ + ActivationGradGPUImpl>( \ dev_ctx, nullptr, &out, &dout, dx, functor); \ } -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::CudaReluGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CudaCosGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::CudaTanGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::CudaAcosGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::CudaSinGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::CudaAsinGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::CudaAtanGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::CudaSinhGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CudaCoshGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::CudaAsinhGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::CudaAcoshGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::CudaAtanhGradFunctor); +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradGPUImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, CudaReluGradFunctor); 
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, CudaTanhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, CudaCosGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, CudaTanGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, CudaAcosGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, CudaSinGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, CudaAsinGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, CudaAtanGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, CudaSinhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, CudaCoshGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, CudaAsinhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, CudaAcoshGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, CudaAtanhGradFunctor); + +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, + CudaLeakyReluGradFunctor, + alpha); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, + CudaThresholdedReluGradFunctor, + threshold); + +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, + CudaBReluGradFunctor, + t_min, + t_max); } // namespace phi -PD_REGISTER_KERNEL(cos_grad, - GPU, - ALL_LAYOUT, - phi::CosGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(tan_grad, - GPU, - ALL_LAYOUT, - phi::TanGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(acos_grad, - GPU, - ALL_LAYOUT, - phi::AcosGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(sin_grad, - GPU, - ALL_LAYOUT, - phi::SinGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(asin_grad, - GPU, - ALL_LAYOUT, - phi::AsinGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(atan_grad, - GPU, - ALL_LAYOUT, - phi::AtanGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(sinh_grad, - GPU, - ALL_LAYOUT, - phi::SinhGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(cosh_grad, - GPU, - ALL_LAYOUT, - phi::CoshGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(asinh_grad, - GPU, - ALL_LAYOUT, - phi::AsinhGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(acosh_grad, - GPU, - ALL_LAYOUT, - phi::AcoshGradKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(atanh_grad, - GPU, - ALL_LAYOUT, - phi::AtanhGradKernel, - float, - double, - phi::dtype::float16) {} + #ifdef PADDLE_WITH_HIP PD_REGISTER_KERNEL(relu_grad, GPU, @@ -219,3 +203,34 @@ PD_REGISTER_KERNEL(relu_double_grad, phi::dtype::float16, phi::dtype::bfloat16) {} #endif + +#define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \ + PD_REGISTER_KERNEL(name, \ + GPU, \ + ALL_LAYOUT, \ + phi::func, \ + float, \ + double, \ + phi::dtype::float16, \ + phi::dtype::bfloat16) {} + +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sin_grad, SinGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(cos_grad, CosGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tan_grad, TanGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(acos_grad, AcosGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(asin_grad, AsinGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(atan_grad, AtanGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sinh_grad, SinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(cosh_grad, CoshGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(asinh_grad, AsinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(acosh_grad, AcoshGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(atanh_grad, AtanhGradKernel) 
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_grad, TanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_double_grad, TanhDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_triple_grad, TanhTripleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad, + LeakyReluDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, + ThresholdedReluGradKernel) diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index 26752b89e7c..3c340a89f57 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -46,6 +46,35 @@ void ActivationGPUImpl(const Context& dev_ctx, ActivationGPUImpl(dev_ctx, x, out, functor); \ } +#define DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGPUImpl>( \ + dev_ctx, x, out, functor); \ + } + +#define DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS( \ + name, functor_class, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr1, \ + float attr2, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGPUImpl>( \ + dev_ctx, x, out, functor); \ + } + DEFINE_GPU_ACTIVATION_KERNEL(Cos, funcs::CudaCosFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Tan, funcs::CudaTanFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Acos, funcs::CudaAcosFunctor) @@ -58,6 +87,14 @@ DEFINE_GPU_ACTIVATION_KERNEL(Asinh, funcs::CudaAsinhFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Acosh, funcs::CudaAcoshFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Atanh, funcs::CudaAtanhFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Relu, funcs::CudaReluFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Tanh, funcs::CudaTanhFunctor) + +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, + CudaThresholdedReluFunctor, + threshold) + +DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max) } // namespace phi @@ -79,65 +116,29 @@ PD_REGISTER_KERNEL(relu, phi::dtype::float16, phi::dtype::bfloat16) {} #endif -PD_REGISTER_KERNEL( - sin, GPU, ALL_LAYOUT, phi::SinKernel, float, double, phi::dtype::float16) {} -PD_REGISTER_KERNEL( - cos, GPU, ALL_LAYOUT, phi::CosKernel, float, double, phi::dtype::float16) {} -PD_REGISTER_KERNEL( - tan, GPU, ALL_LAYOUT, phi::TanKernel, float, double, phi::dtype::float16) {} -PD_REGISTER_KERNEL(acos, - GPU, - ALL_LAYOUT, - phi::AcosKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(asin, - GPU, - ALL_LAYOUT, - phi::AsinKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(atan, - GPU, - ALL_LAYOUT, - phi::AtanKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(sinh, - GPU, - ALL_LAYOUT, - phi::SinhKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(cosh, - GPU, - ALL_LAYOUT, - phi::CoshKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(asinh, - GPU, - ALL_LAYOUT, - phi::AsinhKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(acosh, - GPU, - 
ALL_LAYOUT, - phi::AcoshKernel, - float, - double, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(atanh, - GPU, - ALL_LAYOUT, - phi::AtanhKernel, - float, - double, - phi::dtype::float16) {} + +#define PD_REGISTER_ACTIVATION_KERNEL(name, func) \ + PD_REGISTER_KERNEL(name, \ + GPU, \ + ALL_LAYOUT, \ + phi::func, \ + float, \ + double, \ + phi::dtype::float16, \ + phi::dtype::bfloat16) {} + +PD_REGISTER_ACTIVATION_KERNEL(sin, SinKernel) +PD_REGISTER_ACTIVATION_KERNEL(cos, CosKernel) +PD_REGISTER_ACTIVATION_KERNEL(tan, TanKernel) +PD_REGISTER_ACTIVATION_KERNEL(acos, AcosKernel) +PD_REGISTER_ACTIVATION_KERNEL(asin, AsinKernel) +PD_REGISTER_ACTIVATION_KERNEL(atan, AtanKernel) +PD_REGISTER_ACTIVATION_KERNEL(sinh, SinhKernel) +PD_REGISTER_ACTIVATION_KERNEL(cosh, CoshKernel) +PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel) +PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel) +PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel) +PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel) +PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h index 80e23d2b8e2..a48a6226f23 100644 --- a/paddle/phi/kernels/impl/activation_grad_impl.h +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -130,4 +130,76 @@ void ReluDoubleGradKernel(const Context& dev_ctx, relu_double_grad_functor); } +template +void LeakyReluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& ddx, + float alpha, + DenseTensor* ddout) { + funcs::LeakyReluGradGradFunctor leaky_relu_double_grad_functor; + leaky_relu_double_grad_functor.alpha = alpha; + ActivationDoubleGradImpl>( + dev_ctx, + &x, + nullptr, + &ddx, + nullptr, + nullptr, + ddout, + leaky_relu_double_grad_functor); +} + +template +void TanhDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + DenseTensor* dout_new, + DenseTensor* ddout) { + if (dout_new) { + dout_new->Resize(out.dims()); + dev_ctx.template Alloc(dout_new); + } + if (ddout) { + ddout->Resize(out.dims()); + dev_ctx.template Alloc(ddout); + } + funcs::TanhGradGradFunctor functor; + functor(dev_ctx, &out, &ddx, &dout, dout_new, ddout); +} + +template +void TanhTripleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + const DenseTensor& d_ddout, + const DenseTensor& d_dout_new, + DenseTensor* d_out_new, + DenseTensor* d_dout, + DenseTensor* d_ddx) { + if (d_dout) { + d_dout->Resize(out.dims()); + dev_ctx.template Alloc(d_dout); + } + if (d_out_new) { + d_dout->Resize(out.dims()); + dev_ctx.template Alloc(d_out_new); + } + if (d_ddx) { + d_dout->Resize(ddx.dims()); + dev_ctx.template Alloc(d_ddx); + } + funcs::TanhTripleGradFunctor functor; + functor(dev_ctx, + &out, + &ddx, + &dout, + &d_ddout, + &d_dout_new, // input + d_dout, + d_out_new, + d_ddx); // output +} + } // namespace phi diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index 396830ca207..cbfca5b17ae 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -16,40 +16,80 @@ limitations under the License. 
*/ namespace phi { -#define DefineActGradDepXOpArgMap(func_name, op_name) \ - KernelSignature func_name##GradOpArgumentMapping( \ - const ArgumentMappingContext& ctx) { \ - return KernelSignature( \ - op_name "_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); \ +#define DefineActGradDepXOpArgMap(func_name, op_name, attrs) \ + KernelSignature func_name##GradOpArgumentMapping( \ + const ArgumentMappingContext& ctx) { \ + return KernelSignature(op_name "_grad", \ + {"X", GradVarName("Out")}, \ + {attrs}, \ + {GradVarName("X")}); \ } -#define DefineActGradDepOutOpArgMap(func_name, op_name) \ - KernelSignature func_name##GradOpArgumentMapping( \ - const ArgumentMappingContext& ctx) { \ - return KernelSignature( \ - op_name "_grad", {"Out", GradVarName("Out")}, {}, {GradVarName("X")}); \ +#define DefineActGradDepOutOpArgMap(func_name, op_name, attrs) \ + KernelSignature func_name##GradOpArgumentMapping( \ + const ArgumentMappingContext& ctx) { \ + return KernelSignature(op_name "_grad", \ + {"Out", GradVarName("Out")}, \ + {attrs}, \ + {GradVarName("X")}); \ } +#define comma , + +DefineActGradDepXOpArgMap(Cos, "cos", ); // NOLINT +DefineActGradDepXOpArgMap(Tan, "tan", ); // NOLINT +DefineActGradDepXOpArgMap(Acos, "acos", ); // NOLINT +DefineActGradDepXOpArgMap(Sin, "sin", ); // NOLINT +DefineActGradDepXOpArgMap(Asin, "asin", ); // NOLINT +DefineActGradDepXOpArgMap(Atan, "atan", ); // NOLINT +DefineActGradDepXOpArgMap(Sinh, "sinh", ); // NOLINT +DefineActGradDepXOpArgMap(Cosh, "cosh", ); // NOLINT +DefineActGradDepXOpArgMap(Asinh, "asinh", ); // NOLINT +DefineActGradDepXOpArgMap(Acosh, "acosh", ); // NOLINT +DefineActGradDepXOpArgMap(Atanh, "atanh", ); // NOLINT +DefineActGradDepXOpArgMap(BRelu, "brelu", "t_min" comma "t_max"); // NOLINT +DefineActGradDepXOpArgMap(LeakyRelu, "leaky_relu", "alpha"); // NOLINT +DefineActGradDepXOpArgMap(ThresholdedRelu, + "thresholded_relu", + "threshold"); // NOLINT + +DefineActGradDepOutOpArgMap(Relu, "relu", ); // NOLINT +DefineActGradDepOutOpArgMap(Tanh, "tanh", ); // NOLINT + KernelSignature ReluDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("relu_double_grad", {"Out", "DDX"}, {}, {"DDOut"}); } -DefineActGradDepXOpArgMap(Cos, "cos"); -DefineActGradDepXOpArgMap(Tan, "tan"); -DefineActGradDepXOpArgMap(Acos, "acos"); -DefineActGradDepXOpArgMap(Sin, "sin"); -DefineActGradDepXOpArgMap(Asin, "asin"); -DefineActGradDepXOpArgMap(Atan, "atan"); -DefineActGradDepXOpArgMap(Sinh, "sinh"); -DefineActGradDepXOpArgMap(Cosh, "cosh"); -DefineActGradDepXOpArgMap(Asinh, "asinh"); -DefineActGradDepXOpArgMap(Acosh, "acosh"); -DefineActGradDepXOpArgMap(Atanh, "atanh"); -DefineActGradDepOutOpArgMap(Relu, "relu"); +KernelSignature TanhDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "tanh_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"}); +} + +KernelSignature TanhTripleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("tanh_triple_grad", + {"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"}, + {}, + {"D_OutNew", "D_DOut", "D_DDx"}); +} + +KernelSignature LeakyReluDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "leaky_relu_double_grad", {"X", "DDX"}, {"alpha"}, {"DDOut"}); +} + +KernelSignature LeakyReluOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("leaky_relu", {"X"}, {"alpha"}, {"Out"}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, 
relu_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(tanh_grad_grad, tanh_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad); PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping); @@ -65,3 +105,16 @@ PD_REGISTER_ARG_MAPPING_FN(atanh_grad, phi::AtanhGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(relu_grad, phi::ReluGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(relu_grad_grad, phi::ReluDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tanh_grad, phi::TanhGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tanh_grad_grad, + phi::TanhDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tanh_triple_grad, + phi::TanhTripleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(brelu_grad, phi::BReluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(leaky_relu, phi::LeakyReluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad, + phi::LeakyReluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad_grad, + phi::LeakyReluDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad, + phi::ThresholdedReluGradOpArgumentMapping); diff --git a/tools/infrt/get_compat_kernel_signature.py b/tools/infrt/get_compat_kernel_signature.py index b8c4232076c..0680e87b38b 100644 --- a/tools/infrt/get_compat_kernel_signature.py +++ b/tools/infrt/get_compat_kernel_signature.py @@ -58,8 +58,9 @@ def get_compat_kernels_info(): content += line if (registry and ";" in line): data = content.replace("\n", "").replace( - " ", "").strip("return").strip( - "KernelSignature(").strip("\);").replace("\"", "") + " ", + "").strip("return").strip("KernelSignature(").strip( + "\);").replace("\"", "").replace("\\", "") registry = False name, registry_info = parse_compat_registry(data) -- GitLab From 0c3335433525d4f156ee7afc475274df75a34736 Mon Sep 17 00:00:00 2001 From: Chang Xu Date: Tue, 15 Mar 2022 11:37:49 +0800 Subject: [PATCH 059/176] Fix truncated norm operator (#40287) --- .../ps/table/depends/initializers.h | 11 +++++--- .../operators/truncated_gaussian_random_op.h | 17 ++---------- .../truncated_gaussian_random_op_npu.cc | 9 +++++-- .../truncated_gaussian_random_op_xpu.cc | 9 +++++-- .../cpu/truncated_gaussian_random_kernel.cc | 9 +++++-- .../gpu/truncated_gaussian_random_kernel.cu | 27 ++++++++++++------- .../truncated_gaussian_random_kernel.h | 14 ++-------- 7 files changed, 49 insertions(+), 47 deletions(-) diff --git a/paddle/fluid/distributed/ps/table/depends/initializers.h b/paddle/fluid/distributed/ps/table/depends/initializers.h index f46e659a88b..5ac0c08f97d 100644 --- a/paddle/fluid/distributed/ps/table/depends/initializers.h +++ b/paddle/fluid/distributed/ps/table/depends/initializers.h @@ -23,7 +23,6 @@ #include "gflags/gflags.h" #include "paddle/fluid/framework/generator.h" - #include "paddle/fluid/operators/truncated_gaussian_random_op.h" namespace paddle { @@ -118,9 +117,13 @@ class TruncatedGaussianInitializer : public Initializer { seed_ = static_cast(std::stoi(attrs[1])); mean_ = std::stof(attrs[2]); std_ = std::stof(attrs[3]); - - std::uniform_real_distribution dist_( - std::numeric_limits::min(), 1.0); + auto normal_cdf = [](float x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + float a_normal_cdf = normal_cdf((-2.0 - mean_) / std_); + float b_normal_cdf = normal_cdf((2.0 - mean_) / std_); + std::uniform_real_distribution dist_(2.0 * a_normal_cdf - 1.0, + 2.0 * b_normal_cdf - 1.0); random_engine_ = 
framework::GetCPURandomEngine(seed_); } diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.h b/paddle/fluid/operators/truncated_gaussian_random_op.h index a6ff2f686cb..8af6e281424 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.h +++ b/paddle/fluid/operators/truncated_gaussian_random_op.h @@ -1,11 +1,8 @@ /* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -140,19 +137,9 @@ T Erfinv(T x) { template struct TruncatedNormal { T mean, std; - T a_normal_cdf; - T b_normal_cdf; - TruncatedNormal(T mean, T std) : mean(mean), std(std) { - auto normal_cdf = [](T x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - a_normal_cdf = normal_cdf(-2.0); - b_normal_cdf = normal_cdf(2.0); - } - + TruncatedNormal(T mean, T std) : mean(mean), std(std) {} T operator()(T value) const { - auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean; + return std::sqrt(2.0) * Erfinv(value) * std + mean; } }; diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc index 261d9cee2d5..4ed0dd22ec0 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -84,8 +84,13 @@ class NPUTruncatedGaussianRandomKernel : public framework::OpKernel { Tensor cpu_tensor(tensor->dtype()); cpu_tensor.Resize(tensor->dims()); T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); - std::uniform_real_distribution dist(std::numeric_limits::min(), - 1.0); + auto normal_cdf = [](float x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + float a_normal_cdf = normal_cdf((-2.0 - mean) / std); + float b_normal_cdf = normal_cdf((2.0 - mean) / std); + std::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, + 2.0 * b_normal_cdf - 1.0); TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc index 803b61fbe81..984d9f397cc 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc @@ -32,8 +32,13 @@ class XPUTruncatedGaussianRandomKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); - std::uniform_real_distribution dist(std::numeric_limits::min(), - 1.0); + auto normal_cdf = [](float x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + float a_normal_cdf = normal_cdf((-2.0 - mean) / std); + float b_normal_cdf = normal_cdf((2.0 - mean) / std); + std::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, + 2.0 * b_normal_cdf - 1.0); TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); diff --git a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc index 4247e597ace..ab3d3c2376b 100644 --- 
a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc +++ b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc @@ -37,8 +37,13 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx, T* data = dev_ctx.template Alloc(tensor); - std::uniform_real_distribution dist(std::numeric_limits::min(), - 1.0); + auto normal_cdf = [](float x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + float a_normal_cdf = normal_cdf((-2.0 - mean) / std); + float b_normal_cdf = normal_cdf((2.0 - mean) / std); + std::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, + 2.0 * b_normal_cdf - 1.0); TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); diff --git a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu index f27b32ca7b8..bb04e7ee851 100644 --- a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu @@ -33,23 +33,27 @@ struct GPUTruncatedNormal { T mean, std; T a_normal_cdf; T b_normal_cdf; + unsigned int seed; T numeric_min; __host__ __device__ GPUTruncatedNormal(T mean, T std, T numeric_min, int seed) : mean(mean), std(std), seed(seed), numeric_min(numeric_min) { - a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; - b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; + auto normal_cdf = [](float x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + a_normal_cdf = normal_cdf((-2.0 - mean) / std); + b_normal_cdf = normal_cdf((2.0 - mean) / std); } __host__ __device__ T operator()(const unsigned int n) const { thrust::minstd_rand rng; rng.seed(seed); - thrust::uniform_real_distribution dist(numeric_min, 1); + thrust::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, + 2.0 * b_normal_cdf - 1.0); rng.discard(n); T value = dist(rng); - auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean; + return std::sqrt(2.0) * erfinvf(value) * std + mean; } }; @@ -69,18 +73,21 @@ struct TruncatedNormalOffset { seed(seed), numeric_min(numeric_min), offset_(offset) { - a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; - b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; + auto normal_cdf = [](float x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + a_normal_cdf = normal_cdf((-2.0 - mean) / std); + b_normal_cdf = normal_cdf((2.0 - mean) / std); } __host__ __device__ T operator()(const unsigned int n) const { thrust::minstd_rand rng; rng.seed(seed); - thrust::uniform_real_distribution dist(numeric_min, 1); + thrust::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, + 2.0 * b_normal_cdf - 1.0); rng.discard(n + offset_); T value = dist(rng); - auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean; + return std::sqrt(2.0) * erfinvf(value) * std + mean; } }; diff --git a/paddle/phi/kernels/truncated_gaussian_random_kernel.h b/paddle/phi/kernels/truncated_gaussian_random_kernel.h index f8547ced419..c4c13578a98 100644 --- a/paddle/phi/kernels/truncated_gaussian_random_kernel.h +++ b/paddle/phi/kernels/truncated_gaussian_random_kernel.h @@ -141,19 +141,9 @@ T Erfinv(T x) { template struct TruncatedNormal { T mean, std; - T a_normal_cdf; - T b_normal_cdf; - TruncatedNormal(T mean, T std) : mean(mean), std(std) { - auto normal_cdf = [](T x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - a_normal_cdf = normal_cdf(-2.0); - b_normal_cdf = 
normal_cdf(2.0); - } - + TruncatedNormal(T mean, T std) : mean(mean), std(std) {} T operator()(T value) const { - auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean; + return std::sqrt(2.0) * Erfinv(value) * std + mean; } }; -- GitLab From f84b54ebd80214af059af455b1480bb710e3ba2e Mon Sep 17 00:00:00 2001 From: Yulong Ao Date: Tue, 15 Mar 2022 11:39:07 +0800 Subject: [PATCH 060/176] [Auto parallel] Redesign the tuner for auto parallel (#40121) * [Auto Parallel] Redesign the tunner for Auto Parallel --- .../auto_parallel/tuner/__init__.py | 13 + .../auto_parallel/tuner/storable.py | 36 +++ .../auto_parallel/tuner/tunable_space.py | 151 +++++++++++ .../auto_parallel/tuner/tunable_variable.py | 242 ++++++++++++++++++ .../auto_parallel/test_tunable_space.py | 138 ++++++++++ .../auto_parallel/test_tunable_variable.py | 99 +++++++ python/setup.py.in | 1 + 7 files changed, 680 insertions(+) create mode 100644 python/paddle/distributed/auto_parallel/tuner/__init__.py create mode 100644 python/paddle/distributed/auto_parallel/tuner/storable.py create mode 100644 python/paddle/distributed/auto_parallel/tuner/tunable_space.py create mode 100644 python/paddle/distributed/auto_parallel/tuner/tunable_variable.py create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py diff --git a/python/paddle/distributed/auto_parallel/tuner/__init__.py b/python/paddle/distributed/auto_parallel/tuner/__init__.py new file mode 100644 index 00000000000..513558501a0 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/distributed/auto_parallel/tuner/storable.py b/python/paddle/distributed/auto_parallel/tuner/storable.py new file mode 100644 index 00000000000..d61e53a0272 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/storable.py @@ -0,0 +1,36 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
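A note on the truncated_gaussian_random changes above: folding the normal-CDF terms into the bounds of the uniform distribution is the inverse-CDF construction for a normal with the given mean and std truncated to the absolute interval [-2, 2] (the removed code truncated at mean +/- 2*std instead), so the functor only applies erfinv once. The following self-contained NumPy/SciPy sketch reproduces that construction; it is an illustration only, not part of the patch, and assumes scipy is available.

import math
import numpy as np
from scipy.special import erfinv

def truncated_normal(mean, std, size, seed=0):
    # CDF of the standard normal, same helper the patched kernels use
    def normal_cdf(x):
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
    # uniform sample over [2*Phi(a)-1, 2*Phi(b)-1] with a=(-2-mean)/std, b=(2-mean)/std
    lo = 2.0 * normal_cdf((-2.0 - mean) / std) - 1.0
    hi = 2.0 * normal_cdf((2.0 - mean) / std) - 1.0
    v = np.random.default_rng(seed).uniform(lo, hi, size)
    # inverse-CDF map back to the truncated normal: x = mean + std * sqrt(2) * erfinv(v)
    return math.sqrt(2.0) * erfinv(v) * std + mean

samples = truncated_normal(mean=0.5, std=1.0, size=10000)
assert samples.min() >= -2.0 and samples.max() <= 2.0  # hard truncation at +/-2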
+ +import json + + +class Storable(object): + def get_state(self): + raise NotImplementedError + + def set_state(self, state): + raise NotImplementedError + + def save(self, path): + state = self.get_state() + state_json = json.dumps(state) + with open(path, "w") as f: + f.write(state_json) + return str(path) + + def load(self, path): + with open(path, "r") as f: + state_data = f.read() + state = json.loads(state_data) + self.set_state(state) diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_space.py b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py new file mode 100644 index 00000000000..f63364c5b75 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import contextlib +import copy +import math +import random +import numpy as np + +from .tunable_variable import Boolean +from .tunable_variable import Fixed +from .tunable_variable import Choice +from .tunable_variable import IntRange +from .tunable_variable import FloatRange + + +class TunableSpace(object): + """ + A TunableSpace is constructed by the tunable variables. + """ + + def __init__(self): + # Tunable variables for this tunable variables + self._variables = {} + # Specific values coresponding to each tunable variable + self._values = {} + + @property + def variables(self): + return self._variables + + @property + def values(self): + return self._values + + def get_value(self, name): + if name in self.values: + return self.values[name] + else: + raise KeyError("{} does not exist.".format(name)) + + def set_value(self, name, value): + if name in self.values: + self.values[name] = value + else: + raise KeyError("{} does not exist.".format(name)) + + def _exists(self, name): + if name in self._variables: + return True + return False + + def _retrieve(self, tv): + tv = tv.__class__.from_state(tv.get_state()) + if self._exists(tv.name): + return self.get_value(tv.name) + return self._register(tv) + + def _register(self, tv): + self._variables[tv.name] = tv + if tv.name not in self.values: + self.values[tv.name] = tv.default + return self.values[tv.name] + + def __getitem__(self, name): + return self.get_value(name) + + def __setitem__(self, name, value): + self.set_value(name, value) + + def __contains__(self, name): + try: + self.get_value(name) + return True + except (KeyError, ValueError): + return False + + def fixed(self, name, default): + tv = Fixed(name=name, default=default) + return self._retrieve(tv) + + def boolean(self, name, default=False): + tv = Boolean(name=name, default=default) + return self._retrieve(tv) + + def choice(self, name, values, default=None): + tv = Choice(name=name, values=values, default=default) + return self._retrieve(tv) + + def int_range(self, name, start, stop, step=1, default=None): + tv = IntRange( + name=name, start=start, stop=stop, step=step, default=default) + return self._retrieve(tv) 
+ + def float_range(self, name, start, stop, step=None, default=None): + tv = FloatRange( + name=name, start=start, stop=stop, step=step, default=default) + return self._retrieve(tv) + + def get_state(self): + return { + "variables": [{ + "class_name": v.__class__.__name__, + "state": v.get_state() + } for v in self._variables.values()], + "values": dict((k, v) for (k, v) in self.values.items()) + } + + @classmethod + def from_state(cls, state): + ts = cls() + for v in state["variables"]: + v = _deserialize_tunable_variable(v) + ts._variables[v.name] = v + ts._values = dict((k, v) for (k, v) in state["values"].items()) + return ts + + +def _deserialize_tunable_variable(state): + classes = (Boolean, Fixed, Choice, IntRange, FloatRange) + cls_name_to_cls = {cls.__name__: cls for cls in classes} + + if isinstance(state, classes): + return state + + if (not isinstance(state, dict) or "class_name" not in state or + "state" not in state): + raise ValueError( + "Expect state to be a python dict containing class_name and state as keys, but found {}" + .format(state)) + + cls_name = state["class_name"] + cls = cls_name_to_cls[cls_name] + if cls is None: + raise ValueError("Unknown class name {}".format(cls_name)) + + cls_state = state["state"] + deserialized_object = cls.from_state(cls_state) + return deserialized_object diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py b/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py new file mode 100644 index 00000000000..9549b44c48e --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py @@ -0,0 +1,242 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +class TunableVariable(object): + """ + Tunablevariable base class. + """ + + def __init__(self, name, default=None): + self.name = name + self._default = default + + @property + def default(self): + return self._default + + def get_state(self): + return {"name": self.name, "default": self.default} + + @classmethod + def from_state(cls, state): + return cls(**state) + + +class Fixed(TunableVariable): + """ + Fixed variable which cannot be changed. + """ + + def __init__(self, name, default): + super(Fixed, self).__init__(name=name, default=default) + self.name = name + if not isinstance(default, (str, int, float, bool)): + raise ValueError( + "Fixed must be an str, int, float or bool, but found {}" + .format(default)) + self._default = default + + def random(self, seed=None): + return self._default + + def __repr__(self): + return "Fixed(name: {}, value: {})".format(self.name, self.default) + + +class Boolean(TunableVariable): + """ + Choice between True and False. 
+ """ + + def __init__(self, name, default=False): + super(Boolean, self).__init__(name=name, default=default) + if default not in {True, False}: + raise ValueError( + "default must be a Python boolean, but got {}".format(default)) + + def random(self, seed=None): + rng = np.random.default_rng(seed) + return rng.choice((True, False)) + + def __repr__(self): + return 'Boolean(name: "{}", default: {})'.format(self.name, + self.default) + + +class Choice(TunableVariable): + def __init__(self, name, values, default=None): + super(Choice, self).__init__(name=name, default=default) + + types = set(type(v) for v in values) + if len(types) > 1: + raise TypeError( + "Choice can contain only one type of value, but found values: {} with types: {}." + .format(str(values), str(types))) + + if isinstance(values[0], str): + values = [str(v) for v in values] + if default is not None: + default = str(default) + elif isinstance(values[0], int): + values = [int(v) for v in values] + if default is not None: + default = int(default) + elif isinstance(values[0], float): + values = [float(v) for v in values] + if default is not None: + default = float(default) + elif isinstance(values[0], bool): + values = [bool(v) for v in values] + if default is not None: + default = bool(default) + else: + raise TypeError( + "Choice can only contain str, int, float, or boll, but found: {} " + .format(str(values))) + self.values = values + + if default is not None and default not in values: + raise ValueError( + "The default value should be one of the choices {}, but found {}". + format(values, default)) + self._default = default + + @property + def default(self): + if self._default is None: + if None in self.values: + return None + return self.values[0] + return self._default + + def random(self, seed=None): + rng = np.random.default_rng(seed) + return rng.choice(self.values) + + def get_state(self): + state = super(Choice, self).get_state() + state["values"] = self.values + return state + + def __repr__(self): + return 'Choice(name: "{}", values: {}, default: {})'.format( + self.name, self.values, self.default) + + +class IntRange(TunableVariable): + """ + Integer range. 
+ """ + + def __init__(self, name, start, stop, step=1, default=None, endpoint=False): + super(IntRange, self).__init__(name=name, default=default) + self.start = self._check_int(start) + self.stop = self._check_int(stop) + self.step = self._check_int(step) + self._default = default + self.endpoint = endpoint + + @property + def default(self): + if self._default is not None: + return self._default + return self.start + + def random(self, seed=None): + rng = np.random.default_rng(seed) + value = (self.stop - self.start) * rng.random() + self.start + if self.step is not None: + if self.endpoint: + values = np.arange(self.start, self.stop + 1e-7, step=self.step) + else: + values = np.arange(self.start, self.stop, step=self.step) + closest_index = np.abs(values - value).argmin() + value = values[closest_index] + return int(value) + + def get_state(self): + state = super(IntRange, self).get_state() + state["start"] = self.start + state["stop"] = self.stop + state["step"] = self.step + state["default"] = self._default + return state + + def _check_int(self, val): + int_val = int(val) + if int_val != val: + raise ValueError("Expects val is an int, but found: {}.".format( + str(val))) + return int_val + + def __repr__(self): + return "IntRange(name: {}, start: {}, stop: {}, step: {}, default: {})".format( + self.name, self.start, self.stop, self.step, self.default) + + +class FloatRange(TunableVariable): + """ + Float range. + """ + + def __init__(self, + name, + start, + stop, + step=None, + default=None, + endpoint=False): + super(FloatRange, self).__init__(name=name, default=default) + self.stop = float(stop) + self.start = float(start) + if step is not None: + self.step = float(step) + else: + self.step = None + self._default = default + self.endpoint = endpoint + + @property + def default(self): + if self._default is not None: + return self._default + return self.start + + def random(self, seed=None): + rng = np.random.default_rng(seed) + value = (self.stop - self.start) * rng.random() + self.start + if self.step is not None: + if self.endpoint: + values = np.arange(self.start, self.stop + 1e-7, step=self.step) + else: + values = np.arange(self.start, self.stop, step=self.step) + closest_index = np.abs(values - value).argmin() + value = values[closest_index] + return value + + def get_state(self): + state = super(FloatRange, self).get_state() + state["start"] = self.start + state["stop"] = self.stop + state["step"] = self.step + state["endpoint"] = self.endpoint + return state + + def __repr__(self): + return "FloatRange(name: {}, start: {}, stop: {}, step: {}, default: {}, endpoint: {})".format( + self.name, self.start, self.stop, self.step, self.default, + self.endpoint) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py new file mode 100644 index 00000000000..cb7104f9ef6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py @@ -0,0 +1,138 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from paddle.distributed.auto_parallel.tuner import tunable_space as ts + + +class TestTunableSpace(unittest.TestCase): + def test_fixed(self): + space = ts.TunableSpace() + fixed = space.fixed("fixed", default=4) + self.assertEqual(space.values["fixed"], 4) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["fixed"].name, "fixed") + + space.values["fixed"] = 2 + self.assertEqual(space.get_value("fixed"), 2) + self.assertEqual(space.values, {"fixed": 2}) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["fixed"].name, "fixed") + + def test_boolean(self): + space = ts.TunableSpace() + boolean = space.boolean("boolean") + self.assertEqual(space.values["boolean"], False) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["boolean"].name, "boolean") + + space.values["boolean"] = True + self.assertEqual(space.get_value("boolean"), True) + self.assertEqual(space.values, {"boolean": True}) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["boolean"].name, "boolean") + + def test_choice(self): + space = ts.TunableSpace() + choice = space.choice("choice", [1, 2, 3, 4], default=4) + self.assertEqual(space.values["choice"], 4) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["choice"].name, "choice") + + space.values["choice"] = 2 + self.assertEqual(space.get_value("choice"), 2) + self.assertEqual(space.values, {"choice": 2}) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["choice"].name, "choice") + + def test_int_range(self): + space = ts.TunableSpace() + int_range = space.int_range("int_range", start=1, stop=4, default=2) + self.assertEqual(space.values["int_range"], 2) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["int_range"].name, "int_range") + + space.values["int_range"] = 3 + self.assertEqual(space.get_value("int_range"), 3) + self.assertEqual(space.values, {"int_range": 3}) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["int_range"].name, "int_range") + + def test_float_range(self): + space = ts.TunableSpace() + float_range = space.float_range( + "float_range", start=0.4, stop=4.4, default=2.0) + self.assertEqual(space.values["float_range"], 2.0) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["float_range"].name, "float_range") + + space.values["float_range"] = 3.0 + self.assertEqual(space.get_value("float_range"), 3.0) + self.assertEqual(space.values, {"float_range": 3.0}) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["float_range"].name, "float_range") + + def test_varibles(self): + space = ts.TunableSpace() + choice = space.choice("choice", [1, 2, 3, 4], default=4) + self.assertEqual(space.values["choice"], 4) + self.assertEqual(len(space.variables), 1) + self.assertEqual(space.variables["choice"].name, "choice") + + int_range = space.int_range("int_range", start=1, stop=4, default=2) + self.assertEqual(space.values["int_range"], 2) + self.assertEqual(len(space.variables), 2) + self.assertEqual(space.variables["int_range"].name, "int_range") + + def test_not_populated_variable(self): + space = ts.TunableSpace() + choice = space.choice("choice", [1, 2, 3, 4], default=2) + self.assertEqual(choice, 2) + + def test_populated_variable(self): + space = ts.TunableSpace() 
+ space.values["choice"] = 2 + choice = space.choice("choice", [1, 2, 3, 4], default=4) + self.assertEqual(choice, 2) + + space["choice"] = 3 + self.assertNotEqual(space.values["choice"], 2) + self.assertEqual(space.values["choice"], 3) + + def test_state(self): + space = ts.TunableSpace() + choice = space.choice("choice", [1, 2, 3, 4], default=4) + int_range = space.int_range("int_range", start=1, stop=4, default=2) + + new_space = space.from_state(space.get_state()) + self.assertEqual(new_space.get_value("choice"), 4) + self.assertEqual(new_space.get_value("int_range"), 2) + self.assertEqual(len(new_space.variables), 2) + self.assertEqual(len(new_space.values), 2) + + self.assertEqual(new_space.variables["choice"].name, "choice") + self.assertEqual(new_space.variables["choice"].default, 4) + self.assertEqual(new_space.variables["choice"].values, [1, 2, 3, 4]) + + self.assertEqual(new_space.variables["int_range"].name, "int_range") + self.assertEqual(new_space.variables["int_range"].default, 2) + self.assertEqual(new_space.variables["int_range"].start, 1) + self.assertEqual(new_space.variables["int_range"].stop, 4) + self.assertEqual(new_space.variables["int_range"].step, 1) + self.assertEqual(new_space.variables["int_range"].endpoint, False) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py new file mode 100644 index 00000000000..c36fca7a9d0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_variable.py @@ -0,0 +1,99 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from paddle.distributed.auto_parallel.tuner import tunable_variable as tv + + +class TestTunableVariable(unittest.TestCase): + def test_fixed(self): + fixed = tv.Fixed("fixed", True) + fixed = tv.Fixed.from_state(fixed.get_state()) + self.assertEqual(fixed.default, True) + self.assertEqual(fixed.random(), True) + + fixed = tv.Fixed("fixed", 1) + fixed = tv.Fixed.from_state(fixed.get_state()) + self.assertEqual(fixed.default, 1) + self.assertEqual(fixed.random(), 1) + + def test_boolean(self): + boolean = tv.Boolean("bool") + boolean = tv.Boolean.from_state(boolean.get_state()) + self.assertEqual(boolean.default, False) + self.assertIn(boolean.random(), [True, False]) + self.assertIn(boolean.random(1234), [True, False]) + + boolean = tv.Boolean("bool", True) + boolean = tv.Boolean.from_state(boolean.get_state()) + self.assertEqual(boolean.default, True) + self.assertIn(boolean.random(), [True, False]) + self.assertIn(boolean.random(1234), [True, False]) + + def test_choice(self): + choice = tv.Choice("choice", [1, 2, 3, 4]) + choice = tv.Choice.from_state(choice.get_state()) + self.assertEqual(choice.default, 1) + self.assertIn(choice.random(), [1, 2, 3, 4]) + self.assertIn(choice.random(1234), [1, 2, 3, 4]) + + choice = tv.Choice("choice", [1, 2, 3, 4], default=2) + choice = tv.Choice.from_state(choice.get_state()) + self.assertEqual(choice.default, 2) + self.assertIn(choice.random(), [1, 2, 3, 4]) + self.assertIn(choice.random(1234), [1, 2, 3, 4]) + + def test_int_range(self): + int_range = tv.IntRange("int_range", start=1, stop=4, default=2) + int_range = tv.IntRange.from_state(int_range.get_state()) + self.assertEqual(int_range.default, 2) + self.assertIn(int_range.random(), [1, 2, 3, 4]) + self.assertIn(int_range.random(1234), [1, 2, 3, 4]) + self.assertNotEqual(int_range.default, 4) + + int_range = tv.IntRange( + "int_range", start=1, stop=8, step=2, default=3, endpoint=True) + int_range = tv.IntRange.from_state(int_range.get_state()) + self.assertEqual(int_range.default, 3) + self.assertIn(int_range.random(), [1, 3, 5, 7]) + self.assertIn(int_range.random(1234), [1, 3, 5, 7]) + self.assertNotEqual(int_range.default, 2) + + def test_float_range(self): + float_range = tv.FloatRange( + "float_range", start=0.4, stop=4.4, default=2.0) + float_range = tv.FloatRange.from_state(float_range.get_state()) + self.assertEqual(float_range.default, 2.0) + self.assertGreater(float_range.random(), 0.4) + self.assertLess(float_range.random(1234), 4.4) + self.assertNotAlmostEqual(float_range.random(), 1) + self.assertNotAlmostEqual(float_range.random(), 4.4) + + float_range = tv.FloatRange( + "float_range", + start=0.4, + stop=8.4, + step=2.0, + default=3.0, + endpoint=True) + float_range = tv.FloatRange.from_state(float_range.get_state()) + self.assertEqual(float_range.default, 3.0) + self.assertGreater(float_range.random(), 0.4) + self.assertLessEqual(float_range.random(1234), 8.4) + self.assertNotAlmostEqual(float_range.random(), 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index 689f63c0f00..44998bd3e16 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -300,6 +300,7 @@ packages=['paddle', 'paddle.distributed.fleet.meta_parallel.parallel_layers', 'paddle.distributed.auto_parallel', 'paddle.distributed.auto_parallel.operators', + 'paddle.distributed.auto_parallel.tuner', 'paddle.distributed.passes', 'paddle.framework', 'paddle.jit', -- GitLab From 464f65b16725401dadcdcf91c03f593fa4671cea Mon Sep 17 00:00:00 
2001 From: ronnywang <524019753@qq.com> Date: Tue, 15 Mar 2022 12:27:05 +0800 Subject: [PATCH 061/176] add CHECK_VERSION macro (#40512) --- paddle/phi/backends/device_ext.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/paddle/phi/backends/device_ext.h b/paddle/phi/backends/device_ext.h index bbd4966b727..6315fe15afd 100644 --- a/paddle/phi/backends/device_ext.h +++ b/paddle/phi/backends/device_ext.h @@ -523,6 +523,15 @@ struct CustomRuntimeParams { char reserved[32]; }; +#define PADDLE_CUSTOM_RUNTIME_CHECK_VERSION(params) \ + if ((params)->size != sizeof(DevicePluginParams) && \ + (params)->interface->size != sizeof(C_DeviceInterface)) { \ + return; \ + } \ + (params)->version.major = PADDLE_DEVICE_PLUGIN_MAJOR_VERSION; \ + (params)->version.minor = PADDLE_DEVICE_PLUGIN_MINOR_VERSION; \ + (params)->version.patch = PADDLE_DEVICE_PLUGIN_PATCH_VERSION; + // Plugin implement it and fill CustomRuntimeParams void InitPlugin(CustomRuntimeParams*); -- GitLab From 67c6ddff478c9a4232f8b85c5766e0355153f34d Mon Sep 17 00:00:00 2001 From: kuizhiqing Date: Tue, 15 Mar 2022 13:03:06 +0800 Subject: [PATCH 062/176] New design for launch/run (#40086) --- python/paddle/distributed/run/__init__.py | 86 ++++++ python/paddle/distributed/run/__main__.py | 28 ++ .../distributed/run/context/__init__.py | 219 +++++++++++++ .../paddle/distributed/run/context/device.py | 88 ++++++ .../paddle/distributed/run/context/event.py | 20 ++ python/paddle/distributed/run/context/node.py | 64 ++++ .../distributed/run/context/resource.py | 18 ++ .../paddle/distributed/run/context/status.py | 58 ++++ .../distributed/run/controllers/__init__.py | 32 ++ .../distributed/run/controllers/collective.py | 185 +++++++++++ .../distributed/run/controllers/controller.py | 192 ++++++++++++ .../distributed/run/controllers/master.py | 289 ++++++++++++++++++ .../paddle/distributed/run/controllers/ps.py | 221 ++++++++++++++ python/paddle/distributed/run/job/__init__.py | 25 ++ .../paddle/distributed/run/job/container.py | 179 +++++++++++ python/paddle/distributed/run/job/job.py | 80 +++++ python/paddle/distributed/run/job/pod.py | 185 +++++++++++ python/paddle/distributed/run/job/status.py | 24 ++ .../distributed/run/plugins/__init__.py | 50 +++ python/paddle/distributed/run/plugins/ip.py | 30 ++ .../paddle/distributed/run/utils/kv_client.py | 94 ++++++ .../paddle/distributed/run/utils/kv_server.py | 121 ++++++++ .../distributed/run/utils/process_context.py | 83 +++++ .../fluid/tests/unittests/CMakeLists.txt | 1 + .../paddle/fluid/tests/unittests/test_run.py | 174 +++++++++++ 25 files changed, 2546 insertions(+) create mode 100644 python/paddle/distributed/run/__init__.py create mode 100644 python/paddle/distributed/run/__main__.py create mode 100644 python/paddle/distributed/run/context/__init__.py create mode 100644 python/paddle/distributed/run/context/device.py create mode 100644 python/paddle/distributed/run/context/event.py create mode 100644 python/paddle/distributed/run/context/node.py create mode 100644 python/paddle/distributed/run/context/resource.py create mode 100644 python/paddle/distributed/run/context/status.py create mode 100644 python/paddle/distributed/run/controllers/__init__.py create mode 100644 python/paddle/distributed/run/controllers/collective.py create mode 100644 python/paddle/distributed/run/controllers/controller.py create mode 100644 python/paddle/distributed/run/controllers/master.py create mode 100644 python/paddle/distributed/run/controllers/ps.py create mode 100644 
python/paddle/distributed/run/job/__init__.py create mode 100644 python/paddle/distributed/run/job/container.py create mode 100644 python/paddle/distributed/run/job/job.py create mode 100644 python/paddle/distributed/run/job/pod.py create mode 100644 python/paddle/distributed/run/job/status.py create mode 100644 python/paddle/distributed/run/plugins/__init__.py create mode 100644 python/paddle/distributed/run/plugins/ip.py create mode 100644 python/paddle/distributed/run/utils/kv_client.py create mode 100644 python/paddle/distributed/run/utils/kv_server.py create mode 100644 python/paddle/distributed/run/utils/process_context.py create mode 100644 python/paddle/fluid/tests/unittests/test_run.py diff --git a/python/paddle/distributed/run/__init__.py b/python/paddle/distributed/run/__init__.py new file mode 100644 index 00000000000..f25ddb794cc --- /dev/null +++ b/python/paddle/distributed/run/__init__.py @@ -0,0 +1,86 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .job.container import Container +from .job.pod import Pod +from .job.job import Job +from . import plugins + +#__all__ = [Container, Pod, Job] +''' +Paddle distribution training entry ``python -m paddle.distributed.run``. 
+ +Help + +# for arg usage and explanation, try the following command +# python -m paddle.distributed.run -h + +Collective Mode + +Case 1: 1 node + +use all visible devices +# python -m paddle.distributed.run train.py + +use specified devices +# python -m paddle.distributed.run --devices=0,1,2,3 train.py + +Case 2: multi-node, auto detect ip/port + +# python -m paddle.distributed.run --np 2 train.py +# auto print following command +# python -m paddle.distributed.run --master 10.0.0.1:13538 --np 2 demo.py +# then copy and paste above command to other nodes + +Case 3: multi-node, specified master/rendezvous server + +# python -m paddle.distributed.run --np 2 --master 10.0.0.1:2379 train.py +# the master ip must be one of the node and the port must available + +Parameter Server Mode + +Case 1.1: 1 node, 1 ps, 1 trainer + +# python -m paddle.distributed.run --mode ps train.py +# python -m paddle.distributed.run --server_num=1 --trainer_num=1 train.py + +Case 1.2: 1 node, 2 ps, 2 trainer + +# python -m paddle.distributed.run --server_num=2 --trainer_num=2 train.py + +Case 2: 2 node, 2 ps, 2 trainer per node + +# python -m paddle.distributed.run --server_num=2 --trainer_num=2 --np 2 train.py +# auto print following command +# python -m paddle.distributed.run --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --np 2 train.py +# then copy and paste above command to other nodes + +Case 3: multi-node, specified master/rendezvous server + +# python -m paddle.distributed.run --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --np 2 train.py +# the master ip must be one of the node and the port must available + +Case 4: specified servers and trainers in each node + +python -m paddle.distributed.run --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903 train.py + + +Elastic Mode + +# run following command in 3 node to run immediately, or in 2 node to run after elastic_timeout +# python -m paddle.distributed.run --master etcd://10.0.0.1:2379 --np 2:3 train.py + +# once the peer number changes between 2:3, the strategy holds + +''' diff --git a/python/paddle/distributed/run/__main__.py b/python/paddle/distributed/run/__main__.py new file mode 100644 index 00000000000..e32df59a328 --- /dev/null +++ b/python/paddle/distributed/run/__main__.py @@ -0,0 +1,28 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .context import Context +from . import controllers + +# initialize the context to run +ctx = Context() + +# initialize the selected controller +c = controllers.init(ctx) + +# run the pods +c.run() + +# manager or just wait pod +c.finalize() diff --git a/python/paddle/distributed/run/context/__init__.py b/python/paddle/distributed/run/context/__init__.py new file mode 100644 index 00000000000..86dff0f1f80 --- /dev/null +++ b/python/paddle/distributed/run/context/__init__.py @@ -0,0 +1,219 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from argparse import ArgumentParser, REMAINDER +import os, copy + +from paddle.distributed.run import plugins + +from .node import Node +from .status import Status + +import logging + + +class Context(object): + def __init__(self, enable_plugin=True): + os.environ.pop('http_proxy', None) + os.environ.pop('https_proxy', None) + + self.args = self.parse_args() + self.envs = self.fetch_envs() + self.logger = self.get_logger() + + self.node = Node() + self.status = Status() + + self.set_env_in_args() + + # design for event queue, later + self.events = [] + + if enable_plugin: + self._enable_plugin() + + def get_envs(self): + return self.envs.copy() + + def _enable_plugin(self): + for pl in plugins.enabled_plugins: + pl(self) + + def parse_args(self): + parser = ArgumentParser() + + base_group = parser.add_argument_group("Base Parameters") + + base_group.add_argument( + "--master", + type=str, + default=None, + help="the master/rendezvous server, ip:port") + + base_group.add_argument( + "--rank", type=int, default=-1, help="the peer rank") + + base_group.add_argument( + "--log", type=str, default="INFO", help="log level. Default INFO") + + base_group.add_argument( + "--np", + type=str, + default="1", + help="the number of peers, i.e. pod/node number") + + base_group.add_argument( + "--nproc_per_node", + type=int, + default=None, + help="the number of processes in a pod") + + base_group.add_argument( + "--log_dir", + type=str, + default="log", + help="the path for each process's log. Default ./log") + base_group.add_argument( + "--mode", + type=str, + default="collective", + help="run mode of the job, collective/ps/ps-heter") + + base_group.add_argument( + "--id", + type=str, + default="default", + help="unique id of the job. Default default") + + base_group.add_argument( + "--devices", + type=str, + default=None, + help="accelerate devices. 
as --gpus,npus,xps") + + base_group.add_argument( + "--host", type=str, default=None, help="host ip") + + base_group.add_argument( + "training_script", + type=str, + help="the full path of py script," + "followed by arguments for the " + "training script") + + base_group.add_argument('training_script_args', nargs=REMAINDER) + + ps_group = parser.add_argument_group("Parameter-Server Parameters") + # for parameter server + ps_group.add_argument( + "--servers", + type=str, + default='', + help="servers endpoints full list") + ps_group.add_argument( + "--trainers", + type=str, + default='', + help="trainers endpoints full list") + + ps_group.add_argument( + "--trainer_num", type=int, default=None, help="number of trainers") + ps_group.add_argument( + "--server_num", type=int, default=None, help="number of servers") + ps_group.add_argument( + "--gloo_port", type=int, default=6767, help="gloo http port") + ps_group.add_argument( + "--with_gloo", type=str, default="0", help="use gloo or not") + + # parameter elastic mode + elastic_group = parser.add_argument_group("Elastic Parameters") + elastic_group.add_argument( + "--max_restart", + type=int, + default=3, + help="the times can restart. Default 3") + + elastic_group.add_argument( + "--elastic_level", + type=int, + default=-1, + help="elastic level: -1 disable, 0 failed exit, peers hold, 1 internal restart" + ) + + elastic_group.add_argument( + "--elastic_timeout", + type=int, + default=30, + help="seconds to wait before elastic perform training") + return parser.parse_args() + + def _valide_env(self, key): + if key in ['POD_IP']: + return True + if key.endswith('_VISIBLE_DEVICES'): + return True + if key.startswith('PADDLE_'): + return True + + return False + + def fetch_envs(self): + ge = os.environ.copy() + + black_env_list = ['http_proxy', 'https_proxy'] + for key in black_env_list: + ge.pop(key, None) + + return ge + ''' + # use black list instead white list + return {k: ge[k] for k in ge if self._valide_env(k)} + ''' + + def get_logger(self, level=logging.INFO): + logger = logging.getLogger("PADDLERUN") + logger.setLevel(self.args.log.upper() or level) + formatter = logging.Formatter( + fmt='%(name)s %(levelname)s %(asctime)s %(message)s') + ch = logging.StreamHandler() + ch.setFormatter(formatter) + logger.addHandler(ch) + return logger + + def set_env_in_args(self): + env_args = { + 'POD_IP': 'host', + 'PADDLE_MASTER': 'master', + 'PADDLE_DEVICES': 'devices', + 'PADDLE_NP': 'np', + 'PADDLE_MODE': 'mode', + 'PADDLE_LOG': 'log', + 'PADDLE_NPROC_PER_NODE': 'nproc_per_node', + 'PADDLE_JOB_ID': 'id', + 'PADDLE_RANK': 'rank', + 'PADDLE_LOG_DIR': 'log_dir', + 'PADDLE_MAX_RESTlRT': 'max_restart', + 'PADDLE_ELASTIC_LEVEL': 'elastic_level', + 'PADDLE_ELASTIC_TIMEOUT': 'elastic_timeout', + 'PADDLE_SERVER_NUM': 'server_num', + 'PADDLE_TRAINER_NUM': 'trainer_num', + 'PADDLE_SERVERS_ENDPOINTS': 'servers', + 'PADDLE_TRAINERS_ENDPOINTS': 'trainers', + 'PADDLE_GLOO_PORT': 'gloo_port', + 'PADDLE_WITH_GLOO': 'with_gloo', + } + + for k, v in env_args.items(): + if k in self.envs: + setattr(self.args, v, self.envs[k]) diff --git a/python/paddle/distributed/run/context/device.py b/python/paddle/distributed/run/context/device.py new file mode 100644 index 00000000000..d8bbd851ccf --- /dev/null +++ b/python/paddle/distributed/run/context/device.py @@ -0,0 +1,88 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + + +class DeviceType: + CPU = 'cpu' + GPU = 'gpu' + XPU = 'xpu' + NPU = 'npu' + + +class Device(object): + def __init__(self, dtype=None, count=1, memory="", labels=""): + self.dtype = dtype + self.count = count + self.memory = memory + self.labels = labels + + def __str__(self): + return ",".join(self.labels) + + @classmethod + def parse_device(self): + dev = Device() + visible_devices = None + if 'CUDA_VISIBLE_DEVICES' in os.environ or 'NVIDIA_VISIBLE_DEVICES' in os.environ: + dev.dtype = DeviceType.GPU + visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv( + "NVIDIA_VISIBLE_DEVICES") + elif 'XPU_VISIBLE_DEVICES' in os.environ: + dev.dtype = DeviceType.XPU + visible_devices = os.getenv("XPU_VISIBLE_DEVICES") + elif 'ASCEND_VISIBLE_DEVICES' in os.environ: + dev.dtype = DeviceType.NPU + visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") + + if visible_devices and visible_devices != 'all': + dev.labels = visible_devices.split(',') + dev.count = len(dev.labels) + else: + return self.detect_device() + + return dev + + @classmethod + def detect_device(self): + import paddle.fluid as fluid + + dev = Device() + num = 0 + visible_devices = None + if fluid.core.is_compiled_with_cuda(): + dev.dtype = DeviceType.GPU + num = fluid.core.get_cuda_device_count() + visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv( + "NVIDIA_VISIBLE_DEVICES") + elif fluid.core.is_compiled_with_xpu(): + dev.dtype = DeviceType.XPU + num = fluid.core.get_xpu_device_count() + visible_devices = os.getenv("XPU_VISIBLE_DEVICES") + elif fluid.core.is_compiled_with_npu(): + dev.dtype = DeviceType.NPU + num = fluid.core.get_npu_device_count() + visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") + + if num == 0: + dev.dtype = DeviceType.CPU + elif visible_devices is None or visible_devices == "all" or visible_devices == "": + dev.labels = [str(x) for x in range(0, num)] + dev.count = num + else: + dev.labels = visible_devices.split(',') + dev.count = len(dev.labels) + + return dev diff --git a/python/paddle/distributed/run/context/event.py b/python/paddle/distributed/run/context/event.py new file mode 100644 index 00000000000..23e8e7a5014 --- /dev/null +++ b/python/paddle/distributed/run/context/event.py @@ -0,0 +1,20 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +class Event(object): + def __init__(self, kind="status", message="", fatal=False): + self.kind = kind + self.message = message + self.fatal = fatal diff --git a/python/paddle/distributed/run/context/node.py b/python/paddle/distributed/run/context/node.py new file mode 100644 index 00000000000..1ece4db0fbb --- /dev/null +++ b/python/paddle/distributed/run/context/node.py @@ -0,0 +1,64 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .device import Device + +import socket +import struct +from contextlib import closing + + +class Node(object): + def __init__(self): + # self.device = Device.detect_device() + self.device = Device.parse_device() + self.ip = self.get_host_ip() + self.free_ports = [] + + def get_host_ip(self): + try: + self.hostname = socket.gethostname() + self.ip = socket.gethostbyname(socket.getfqdn(self.hostname)) + return self.ip + except: + return '127.0.0.1' + + def get_free_ports(self, n=1): + free_ports = [self.get_free_port() for i in range(n)] + self.free_ports += free_ports + return free_ports + + def get_ports_occupied(self): + return self.free_ports + + @classmethod + def get_free_port(self): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, + struct.pack('ii', 1, 0)) + s.bind(('', 0)) + return s.getsockname()[1] + + @classmethod + def is_server_ready(self, ip, port): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + #sock.settimeout(0.01) + #sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + if hasattr(socket, 'SO_REUSEPORT'): + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) + result = sock.connect_ex((ip, int(port))) + if result == 0: + return True + else: + return False diff --git a/python/paddle/distributed/run/context/resource.py b/python/paddle/distributed/run/context/resource.py new file mode 100644 index 00000000000..faffed704c1 --- /dev/null +++ b/python/paddle/distributed/run/context/resource.py @@ -0,0 +1,18 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +class Resource(object): + def __init__(self): + self.devices = [] diff --git a/python/paddle/distributed/run/context/status.py b/python/paddle/distributed/run/context/status.py new file mode 100644 index 00000000000..cfbf3623ec2 --- /dev/null +++ b/python/paddle/distributed/run/context/status.py @@ -0,0 +1,58 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class Status(object): + UNINIT = "uninit" + READY = "ready" + RUNNING = "running" + FAILED = "failed" + TERMINATING = "terminating" + RESTARTING = "restarting" + UNKNOWN = "unknown" + COMPLETED = "completed" + DONE = "done" # should exit whatever status + + def __init__(self): + self._current_status = None + + def current(self): + return self._current_status + + def is_running(self): + return self._current_status == self.RUNNING + + def is_restarting(self): + return self._current_status == self.RESTARTING + + def is_done(self): + if self._current_status in [self.DONE, self.COMPLETED, self.FAILED]: + return True + else: + return False + + def run(self): + self._current_status = self.RUNNING + + def fail(self): + self._current_status = self.FAILED + + def complete(self): + self._current_status = self.COMPLETED + + def restart(self): + self._current_status = self.RESTARTING + + def done(self): + self._current_status = self.DONE diff --git a/python/paddle/distributed/run/controllers/__init__.py b/python/paddle/distributed/run/controllers/__init__.py new file mode 100644 index 00000000000..e5557151ad5 --- /dev/null +++ b/python/paddle/distributed/run/controllers/__init__.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ["init"] + +from .collective import CollectiveController +from .collective import CollectiveElasticController +from .ps import PSController + +# the order is extremely important +_controllers = [ + CollectiveElasticController, + PSController, + CollectiveController, +] + + +def init(ctx): + for c in _controllers: + if c.enable(ctx): + return c(ctx) diff --git a/python/paddle/distributed/run/controllers/collective.py b/python/paddle/distributed/run/controllers/collective.py new file mode 100644 index 00000000000..c4feb54428a --- /dev/null +++ b/python/paddle/distributed/run/controllers/collective.py @@ -0,0 +1,185 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .controller import Controller + +import json +import os +import six +import time + + +class CollectiveController(Controller): + @classmethod + def enable(cls, ctx): + if ctx: + ctx.logger.debug("{} enabled".format(cls.__name__)) + return True + else: + return False + + def build_pod(self): + self.pod.replicas = self.pod_replicas() + + # rank will be reset when restart + self.pod.rank = self.ctx.args.rank + + port = self.ctx.node.get_free_port() + + # compatible + endpoints = [ + "{}:{}".format(self.ctx.node.ip, p) + for p in self.ctx.node.get_free_ports(self.pod.replicas) + ] + + data = json.dumps({ + 'name': self.pod.name, + 'rank': self.pod.rank, + 'replicas': self.pod.replicas, + 'dtype': self.ctx.node.device.dtype, + 'candidate': '{}:{}'.format(self.ctx.node.ip, port), + 'endpoints': ",".join(endpoints), + }) + + peer_list, rank = self.master.sync_peers( + '/{}/info'.format(self.job.id), self.pod.name, data, + self.job.replicas, self.pod.rank) + self.pod.rank = rank + + if len(peer_list) < 1: + return False + + peer_list = [json.loads(i) for i in peer_list] + + self.ctx.logger.debug("sync peers done {}".format(peer_list)) + self.save_pod_log(peer_list) + + global_size = sum([i['replicas'] for i in peer_list]) + rank_offset = sum([i['replicas'] for i in peer_list[:rank]]) + ''' + The new designed collective need nothing but a master endpoint + ''' + collective_master = peer_list[0]['candidate'] + + job_endpoints = [i['endpoints'] for i in peer_list] + + self.pod.reset() + for i in range(self.pod.replicas): + e = { + "PADDLE_MASTER": collective_master, + "PADDLE_GLOBAL_SIZE": "{}".format(global_size), + "PADDLE_LOCAL_SIZE": "{}".format(self.pod.replicas), + "PADDLE_GLOBAL_RANK": "{}".format(i + rank_offset), + "PADDLE_LOCAL_RANK": "{}".format(i), + ## compatible env + "PADDLE_TRAINER_ENDPOINTS": ",".join(job_endpoints), + "PADDLE_CURRENT_ENDPOINT": endpoints[i], + "PADDLE_TRAINER_ID": "{}".format(i + rank_offset), + "PADDLE_TRAINERS_NUM": "{}".format(global_size), + "PADDLE_RANK_IN_NODE": str(i), + } + self.add_container(envs=e, log_tag=i) + + return True + + +class CollectiveElasticController(CollectiveController): + @classmethod + def enable(cls, ctx): + if ctx.args.master and ctx.args.master.startswith("etcd://"): + ctx.logger.debug("{} enabled".format(cls.__name__)) + return True + else: + return False + + def register(self): + if self.job.id == 'default': + self.ctx.logger.warning( + 'Using default job name may cause conflict, add --id in args') + + self.master.register_heartbeat(self.job.id, self.pod.name) + + def watch(self) -> bool: + ''' + watch self and peer status, return true to exit + ''' + while not self.ctx.status.is_done(): + # self status + status = self.pod.watch(timeout=2) + self.ctx.logger.debug("Pod status {}, Ctx status {}".format( + status, self.ctx.status.current())) + + # completed + if status == self.ctx.status.COMPLETED: + self.master.set_status(status) + self.ctx.status.complete() + self.ctx.logger.info("Pod 
complete {}".format(status)) + return True + + # self failure + elif status == self.ctx.status.FAILED: + self.master.set_status(status) + self.master.restart_peer() + self.ctx.logger.info("Pod failed {}".format(status)) + self.pod.stop() + + if self.ctx.args.elastic_level <= 0: + return True + else: + return False + + # peer failure + if self.ctx.status.is_restarting() and self.master.get_status( + ) != self.ctx.status.COMPLETED: + self.pod.stop() + return False + + #peers = self.master.fetch_peer_alive() + #print("peers {}".format(peers)) + + def run(self): + + timeout = self.ctx.args.elastic_timeout if self.job.elastic else self.ctx.args.elastic_timeout * 10 + self.register() + + while self.pod.restart <= self.ctx.args.max_restart: + + self.build_job() + + ok, replicas = self.master.wait_peer_ready( + self.job.replicas_min, self.job.replicas_max, timeout) + if ok: + self.job.replicas = replicas + else: + self.ctx.logger.warning("peer not ready {}".format(self.job)) + break + + self.ctx.logger.debug("Run {}".format(self.job)) + + if not self.build_pod(): + continue + + self.master.set_status(self.ctx.status.RUNNING) + self.ctx.status.run() + + assert len(self.pod.containers) > 0, "No container in the pod" + self.ctx.logger.debug("Run {}".format(self.pod)) + self.ctx.logger.debug("Run {}".format(self.pod.containers[0])) + + self.pod.deploy() + + if self.watch(): + break + + self.ctx.logger.debug("Job done {}".format(self.job)) diff --git a/python/paddle/distributed/run/controllers/controller.py b/python/paddle/distributed/run/controllers/controller.py new file mode 100644 index 00000000000..2d904cf2a2c --- /dev/null +++ b/python/paddle/distributed/run/controllers/controller.py @@ -0,0 +1,192 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
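One point worth calling out in CollectiveController.build_pod above: all rendezvous information reaches a container through environment variables only. A minimal, hypothetical trainer-side snippet (not part of the patch; the defaults exist only so it runs outside a launched container) showing what a worker process can read:

import os

# names come from the env dict built in CollectiveController.build_pod above
master = os.environ.get("PADDLE_MASTER", "127.0.0.1:0")        # rank-0 candidate endpoint
world_size = int(os.environ.get("PADDLE_GLOBAL_SIZE", "1"))    # total worker count
global_rank = int(os.environ.get("PADDLE_GLOBAL_RANK", "0"))   # rank across all pods
local_rank = int(os.environ.get("PADDLE_LOCAL_RANK", "0"))     # rank inside this pod
print("worker {}/{} (local {}) rendezvous at {}".format(
    global_rank, world_size, local_rank, master))

The compatible PADDLE_TRAINER_ID and PADDLE_TRAINERS_NUM variables set in the same place carry the same information for existing fleet code.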
+ +import sys +import os +import signal + +from paddle.distributed.run.job import Job +from paddle.distributed.run.job import Pod +from paddle.distributed.run.job import Container + +from .master import Master + +import time + + +class ControleMode: + COLLECTIVE = "collective" + PS = "ps" + + +class ControllerBase(object): + def __init__(self, ctx): + signal.signal(signal.SIGTERM, self.signal_handler) + signal.signal(signal.SIGABRT, self.signal_handler) + signal.signal(signal.SIGINT, self.signal_handler) + + self.ctx = ctx + self.master = Master.factory(self.ctx) + + self.job = Job(np=self.ctx.args.np, + mode=self.ctx.args.mode, + id=self.ctx.args.id) + self.pod = Pod() + + self.join_server = None + + def run(self): + self.build_job() + self.build_pod() + + if len(self.pod.containers) < 1: + self.ctx.logger.error("No container in the pod {}".format(self.pod)) + return + + self.ctx.logger.info("Run {}".format(self.pod)) + self.ctx.logger.debug(self.pod.containers[0]) + + self.pod.deploy() + + self.watch() + + def watch(self) -> bool: + status = self.pod.watch() + + if status == self.ctx.status.COMPLETED: + self.ctx.logger.info("Pod {}".format(status)) + elif status == self.ctx.status.FAILED: + self.ctx.logger.info("Pod {}".format(status)) + self.ctx.logger.error("Container failed !!!\n{}".format( + self.pod.failed_container())) + self.pod.tail() + self.pod.stop() + + def stop(self, sigint=None): + self.ctx.logger.debug("Controller stop") + self.master.stop() + self.pod.stop(sigint) + + def finalize(self): + self.pod.join() + self.master.stop() + + self.ctx.logger.info("Exit code {}".format(self.pod.exit_code)) + sys.exit(self.pod.exit_code) + + def signal_handler(self, sigint, frame): + self.ctx.logger.info("Terminating with signal {}".format(sigint)) + + if hasattr(self, 'sigint'): + time.sleep(5) + sys.exit(sigint) + + self.sigint = sigint + self.ctx.status.done() + self.stop(sigint) + time.sleep(1) + self.ctx.logger.debug("Exit with signal {}".format(sigint)) + sys.exit(sigint) + + +class Controller(ControllerBase): + ''' + Controller API for customization + ''' + + def build_job(self): + ''' + build job fill the job info. + ''' + self.ctx.logger.info(self.job) + + def build_pod(self) -> bool: + ''' + build pod includes creating containers etc. 
+ + Return True if succeed + ''' + raise NotImplementedError + + def _get_entrypoint(self): + entrypoint = [sys.executable, "-u", self.ctx.args.training_script] + entrypoint.extend(self.ctx.args.training_script_args) + return entrypoint + + def _get_out_err_file(self, out=None, err=None): + if out and self.ctx.args.log_dir != "": + out = os.path.join(self.ctx.args.log_dir, out) + if err and self.ctx.args.log_dir != "": + err = os.path.join(self.ctx.args.log_dir, err) + return out, (err or out) + + def new_container(self, + entrypoint=None, + envs={}, + use_ctx_env=True, + out=None, + err=None): + c = Container( + entrypoint=(entrypoint or self._get_entrypoint()), + env=(self.ctx.get_envs() if use_ctx_env else {}), ) + c.outfile, c.errfile = self._get_out_err_file(out, err) + c.update_env(envs) + return c + + def add_container(self, + container=None, + entrypoint=None, + envs={}, + log_tag=None, + is_init=False): + if not is_init and log_tag is not None: + log_file = "{}.{}.{}.log".format(self.job.id, self.pod.name, + log_tag) + else: + log_file = None + + if not container: + container = self.new_container( + entrypoint=entrypoint, envs=envs, out=log_file, err=log_file) + + if is_init: + self.pod.add_init_container(container) + else: + self.pod.add_container(container) + + def pod_replicas(self): + ''' + how many process/container should be run in pod + ''' + + if self.ctx.args.nproc_per_node: + return int(self.ctx.args.nproc_per_node) + else: + return self.ctx.node.device.count + + def save_pod_log(self, info): + ''' + save_pod_log append *info* to the log file of pod.name + ''' + if not self.ctx.args.log_dir: + return + + f = os.path.join(self.ctx.args.log_dir, + '{}.{}.log'.format(self.job.id, self.pod.name)) + try: + os.makedirs(os.path.dirname(f), exist_ok=True) + with open(f, 'a+') as fd: + fd.write(str(info)) + except Exception as e: + self.ctx.logger.error("save log failed because {}".format(e)) diff --git a/python/paddle/distributed/run/controllers/master.py b/python/paddle/distributed/run/controllers/master.py new file mode 100644 index 00000000000..257ba3bad8d --- /dev/null +++ b/python/paddle/distributed/run/controllers/master.py @@ -0,0 +1,289 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
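The Controller above leaves build_pod() to subclasses, which typically loop over the local replicas and call add_container() with a per-rank environment. A reduced sketch of that pattern using plain dictionaries; the keys below are illustrative only and carry no Paddle imports:

def build_rank_envs(endpoints, job_id="default"):
    # one env dict per local process, mirroring what add_container() receives
    containers = []
    for rank, ep in enumerate(endpoints):
        containers.append({
            "PADDLE_TRAINER_ENDPOINTS": ",".join(endpoints),
            "PADDLE_TRAINER_ID": str(rank),
            "LOG_TAG": "{}.trainer.{}".format(job_id, rank),
        })
    return containers

envs = build_rank_envs(["10.0.0.1:6070", "10.0.0.1:6071"])
assert envs[1]["PADDLE_TRAINER_ID"] == "1"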
+ +from paddle.distributed.run.utils.kv_client import KVClient +from paddle.distributed.run.utils.kv_server import KVServer + +import time +import sys +import six +import threading +import copy +import random + +ETCD_PROTOCAL = 'etcd://' + + +class Master(object): + ''' + Master is a distributed store design to exchange info among nodes + ''' + + MAIN = "main" + STANDBY = "standby" + PATICIPANT = "participant" + + def __init__(self, ctx): + self.ctx = ctx + self.server = None + self.initialized = False + self.endpoint = None + + def stop(self): + raise NotImplementedError + + def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int): + raise NotImplementedError + + @classmethod + def factory(cls, ctx): + if ctx.args.master and ctx.args.master.startswith(ETCD_PROTOCAL): + return ETCDMaster(ctx) + else: + return HTTPMaster(ctx) + + +class HTTPMaster(Master): + def lazy_init(self): + if self.initialized: + return + + self.role = Master.PATICIPANT + + if self.ctx.args.master: + self.endpoint = self.ctx.args.master + ip, port = self.endpoint.split(':') + if ip in ['127.0.0.1', self.ctx.node.ip]: + time.sleep(2 * random.random()) + while not self.ctx.node.is_server_ready(ip, int(port)): + try: + self.server = KVServer(int(port)) + self.role = Master.MAIN + break + except Exception as e: + self.ctx.logger.warning("start master failed {}".format( + e)) + time.sleep(0.1) + continue + else: + port = self.ctx.node.get_free_port() + self.endpoint = "{}:{}".format(self.ctx.node.ip, port) + self.server = KVServer(port) + self.role = Master.MAIN + + print("Copy the following command to other nodes to run.") + cmd = [ + sys.executable.split('/')[-1], "-m", "paddle.distributed.run" + ] + cmd.extend(["--master", self.endpoint]) + cmd.extend(sys.argv[1:]) + print("-" * 80) + print(" ".join(cmd)) + print("-" * 80) + + if self.ctx.args.rank >= 0: + self.ctx.logger.warning( + "--rank set in the command may not compatible in auto mode") + + if '127.0.0.1' in self.endpoint: + self.endpoint = self.endpoint.replace('127.0.0.1', self.ctx.node.ip) + self.client = KVClient(self.endpoint) + + self.initialized = True + + self._start_server() + + def _start_server(self): + if self.server and not self.server.started: + self.server.start() + self.ctx.logger.debug("KV server start at {}".format(self.endpoint)) + + def _stop_server(self): + if self.server and not self.server.stopped: + self.server.stop() + self.ctx.logger.debug("KV server stopped") + + def stop(self): + self._stop_server() + + def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int): + if size < 2: + return [value], 0 + + self.lazy_init() + + while not self.ctx.status.is_done(): + if self.client.wait_server_ready(timeout=5): + break + else: + self.ctx.logger.warning("master not ready") + time.sleep(0.1) + + # 'aaaaaa' make suer main pod (master server) as rank 0 + ky = 'aaaaaa' if rank < 0 and self.role == Master.MAIN else key + k = "{}/{}/{}".format(prefix, ky, rank) + + while not self.ctx.status.is_done(): + if not self.client.put(k, value): + self.ctx.logger.warning("put value failed") + time.sleep(0.1) + continue + + rjson = self.client.get_prefix(prefix) + self.ctx.logger.debug("sync peers {}".format(rjson)) + if rjson and len(rjson) == size: + if rank < 0: + keys = list(rjson.keys()) + keys.sort() + ret = [rjson[k] for k in keys] + idx = ret.index(value) + return ret, idx + else: + ret = [None] * size + for k, v in rjson.items(): + ret[int(k.split('/')[-1])] = v + return ret, rank + else: + time.sleep(0.5) + return [], 0 + + 
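sync_peers() above assigns ranks by sorting the collected keys, and the main pod writes under the key 'aaaaaa' so that it always sorts first and therefore receives rank 0. A small sketch of that ordering against a hypothetical in-memory store (illustrative only):

def assign_ranks(store, prefix):
    # sort keys under the prefix; values are the per-node payloads
    keys = sorted(k for k in store if k.startswith(prefix))
    return {store[k]: rank for rank, k in enumerate(keys)}

store = {
    "/job/aaaaaa/-1": "master-node",   # main pod uses the 'aaaaaa' key
    "/job/pqrstu/-1": "worker-a",
    "/job/bcdefg/-1": "worker-b",
}
ranks = assign_ranks(store, "/job/")
assert ranks["master-node"] == 0 and ranks["worker-b"] == 1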
+class ETCDMaster(Master): + def __init__(self, ctx): + super().__init__(ctx) + + if self.ctx.args.master: + # etcd://localhost:2379 + self.endpoint = self.ctx.args.master.strip("etcd://") + + import etcd3 + + host, port = self.endpoint.split(':') + self.client = etcd3.client(host=host, port=port) + + def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int): + ''' + sync_peers gather all value for key under scope prefix + result always be sorted either by rank or alphabet of pod.name + ''' + path = "{}/{}/{}".format(prefix, key, rank) + + self.client.delete_prefix(prefix) + + self.ctx.logger.debug("sync path {} value {}".format(path, value)) + + while not self.ctx.status.is_done(): + self.client.put(path, six.b(value)) + + result = [i for i in self.client.get_prefix(prefix)] + result = copy.deepcopy(result) + self.ctx.logger.debug("sync peers {}".format(result)) + + if len(result) == size: + if rank < 0: + keys = [six.ensure_str(i[1].key) for i in result] + sorted_keys = [six.ensure_str(i[1].key) for i in result] + sorted_keys.sort() + values = [six.ensure_str(i[0]) for i in result] + ret = [values[keys.index(k)] for k in sorted_keys] + idx = ret.index(value) + return ret, idx + else: + ret = [None] * size + for v, k in result: + ii = int(six.ensure_str(k.key).split('/')[-1]) + if ii < 0: + self.ctx.logger.error( + "rank {} error in sync".format(ii)) + ret[ii] = six.ensure_str(v) + return ret, rank + else: + time.sleep(0.5) + + def register_heartbeat(self, job_id, pod_id, ttl=10): + if hasattr(self, 'heartbeat_prefix'): + self.ctx.logger.warning("Heartbeat already done") + return + + self.job_prefix = '/paddle/{}'.format(job_id) + self.heartbeat_prefix = '{}/heartbeat'.format(self.job_prefix) + + lease = self.client.lease(ttl) + + #self.client.delete_prefix(self.job_prefix) + + beat_path = "{}/{}".format(self.heartbeat_prefix, pod_id) + self.client.put(beat_path, six.b(pod_id), lease=lease) + + def _beat_watch(event): + self.ctx.status.restart() + + beat_watch = self.client.add_watch_prefix_callback( + self.heartbeat_prefix, _beat_watch) + + def _heartbeat(): + while not self.ctx.status.is_done(): + try: + lease.refresh() + if pod_id not in self.fetch_peer_alive(): + self.client.put(beat_path, six.b(pod_id), lease=lease) + self.ctx.logger.debug("Heartbeat register again") + except Exception as e: + self.ctx.logger.error("Heartbeat error {}".format(e)) + time.sleep(ttl / 2) + self.ctx.logger.debug("Heartbeat done") + self.client.cancel_watch(beat_watch) + + self.beat_thread = threading.Thread( + name='heartbeat', target=_heartbeat, daemon=True) + self.beat_thread.start() + + def fetch_peer_alive(self): + peer_alive = [ + six.ensure_str(i[0]) + for i in self.client.get_prefix(self.heartbeat_prefix) + ] + self.ctx.logger.debug("peer alive {}".format(peer_alive)) + return peer_alive + + def wait_peer_ready(self, replicas_min, replicas_max, timeout): + end = time.time() + timeout + while not self.ctx.status.is_done() and time.time() < end: + if len(self.fetch_peer_alive()) == replicas_max: + return (True, replicas_max) + else: + time.sleep(0.5) + + np = len(self.fetch_peer_alive()) + if np >= replicas_min and np <= replicas_max: + return (True, np) + else: + return (False, np) + + def restart_peer(self): + self.client.delete_prefix(self.heartbeat_prefix) + + def set_status(self, status): + assert self.client.put( + self.job_prefix, six.b(status), + lease=self.client.lease(600)), "set status failed {}".format(status) + + def get_status(self): + return 
six.ensure_str(self.client.get(self.job_prefix)[0] or '') + + def stop(self): + if hasattr(self, 'beat_thread'): + self.ctx.status.done() + # TODO(kuizhiqing) thread should exit + #self.beat_thread.join() diff --git a/python/paddle/distributed/run/controllers/ps.py b/python/paddle/distributed/run/controllers/ps.py new file mode 100644 index 00000000000..cc43c336cf1 --- /dev/null +++ b/python/paddle/distributed/run/controllers/ps.py @@ -0,0 +1,221 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .controller import Controller, ControleMode + +import json +import os, shutil + + +class PSController(Controller): + @classmethod + def enable(cls, ctx): + if ctx.args.mode == ControleMode.PS or ctx.args.server_num or len( + ctx.args.servers) > 0: + ctx.logger.debug("{} enabled".format(cls.__name__)) + ctx.args.mode = ControleMode.PS + return True + else: + return False + + def build_pod(self): + if self.ctx.args.servers and self.ctx.args.trainers: + self._build_pod_with_args() + else: + self._build_pod_with_master() + + def _build_pod_with_args(self): + if '127.0.0.1' in self.ctx.args.servers: + host = '127.0.0.1' + else: + host = self.ctx.node.ip + + server_endpoints = [s for s in self.ctx.args.servers.split(",")] + trainer_endpoints = [s for s in self.ctx.args.trainers.split(",")] + servers = [ + s for s in self.ctx.args.servers.split(",") if s.startswith(host) + ] + trainers = [ + s for s in self.ctx.args.trainers.split(",") if s.startswith(host) + ] + server_num = len(servers) + trainer_num = len(trainers) + + self.pod.replicas = server_num + trainer_num + + self.save_pod_log([server_endpoints, trainer_endpoints]) + + import tempfile + gloo_rendezvous_dir = tempfile.mkdtemp() + if os.path.exists(gloo_rendezvous_dir): + shutil.rmtree(gloo_rendezvous_dir) + + gloo_port = self.ctx.args.gloo_port + gloo_http = "{}:{}".format(server_endpoints[0].split(":")[0], gloo_port) + + _gloo_envs = { + "PADDLE_GLOO_RENDEZVOUS": "3", + "PADDLE_GLOO_FS_PATH": gloo_rendezvous_dir, + "PADDLE_GLOO_HTTP_ENDPOINT": gloo_http, + "PADDLE_WITH_GLOO": self.ctx.args.with_gloo + } + + for i in range(server_num): + e = { + "PADDLE_PSERVERS_IP_PORT_LIST": self.ctx.args.servers, + "PADDLE_TRAINER_ENDPOINTS": self.ctx.args.trainers, + "PADDLE_PORT": servers[i].split(":")[1], + "PADDLE_ROLE": "PSERVER", + "TRAINING_ROLE": "PSERVER", + "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)), + "POD_IP": self.ctx.node.ip, + } + e.update(_gloo_envs) + log_tag = "ps.{}".format(i) + self.add_container(envs=e, log_tag=log_tag) + + trainer_rank_offset = 0 + for s in trainer_endpoints: + if s.startswith(host): + break + else: + trainer_rank_offset += 1 + + for i in range(trainer_num): + e = { + "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints), + "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), + "PADDLE_PORT": trainers[i].split(":")[1], + "PADDLE_ROLE": "TRAINER", + "TRAINING_ROLE": "TRAINER", + "PADDLE_TRAINER_ID": "{}".format(i + 
trainer_rank_offset), + "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)), + "POD_IP": self.ctx.node.ip, + } + e.update(_gloo_envs) + log_tag = "trainer.{}".format(i) + self.add_container(envs=e, log_tag=log_tag) + + def _build_pod_with_master(self): + + self.pod.rank = self.ctx.args.rank + + server_num = self.ctx.args.server_num or 1 + servers = [ + "{}:{}".format(self.ctx.node.ip, p) + for p in self.ctx.node.get_free_ports(server_num) + ] + trainer_num = self.ctx.args.trainer_num or 1 + trainers = [ + "{}:{}".format(self.ctx.node.ip, p) + for p in self.ctx.node.get_free_ports(trainer_num) + ] + + data = json.dumps({ + 'name': self.pod.name, + 'rank': self.pod.rank, + 'servers': servers, + 'trainers': trainers, + 'dtype': self.ctx.node.device.dtype, + 'gloo_port': self.ctx.node.get_free_port(), + }) + + peer_list, rank = self.master.sync_peers( + '/{}/info'.format(self.job.id), self.pod.name, data, + self.job.replicas, self.pod.rank) + + self.ctx.logger.debug("sync peers done {}".format(peer_list)) + + peer_list = [json.loads(i) for i in peer_list] + + self.save_pod_log(peer_list) + + server_endpoints = [j for i in peer_list for j in i['servers']] + trainer_endpoints = [j for i in peer_list for j in i['trainers']] + #rank_offset = sum([i['replicas'] for i in peer_list[:rank]]) + + server_rank_offset = sum([len(i['servers']) for i in peer_list[:rank]]) + trainer_rank_offset = sum( + [len(i['trainers']) for i in peer_list[:rank]]) + + self.pod.rank = rank + + self.pod.replicas = server_num + trainer_num + + import tempfile + gloo_rendezvous_dir = tempfile.mkdtemp() + if os.path.exists(gloo_rendezvous_dir): + shutil.rmtree(gloo_rendezvous_dir) + + gloo_port = peer_list[0]['gloo_port'] + gloo_http = "{}:{}".format(server_endpoints[0].split(":")[0], gloo_port) + + _gloo_envs = { + "PADDLE_GLOO_RENDEZVOUS": "3", + "PADDLE_GLOO_FS_PATH": gloo_rendezvous_dir, + "PADDLE_GLOO_HTTP_ENDPOINT": gloo_http, + "PADDLE_WITH_GLOO": self.ctx.args.with_gloo + } + + for i in range(server_num): + e = { + "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints), + "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), + "PADDLE_PORT": + server_endpoints[i + server_rank_offset].split(":")[1], + "PADDLE_ROLE": "PSERVER", + "TRAINING_ROLE": "PSERVER", + "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)), + "POD_IP": self.ctx.node.ip, + } + e.update(_gloo_envs) + log_tag = "ps.{}".format(i) + self.add_container(envs=e, log_tag=log_tag) + + for i in range(trainer_num): + e = { + "PADDLE_PSERVERS_IP_PORT_LIST": ",".join(server_endpoints), + "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), + "PADDLE_PORT": + trainer_endpoints[i + trainer_rank_offset].split(":")[1], + "PADDLE_ROLE": "TRAINER", + "TRAINING_ROLE": "TRAINER", + "PADDLE_TRAINER_ID": "{}".format(i + trainer_rank_offset), + "PADDLE_TRAINERS_NUM": "{}".format(len(trainer_endpoints)), + "POD_IP": self.ctx.node.ip, + } + e.update(_gloo_envs) + log_tag = "trainer.{}".format(i) + self.add_container(envs=e, log_tag=log_tag) + ''' NEW VERSION + for i in range(server_num): + e = { + "PADDLE_PSERVER_ENDPOINTS": ",".join(server_endpoints), + "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), + "PADDLE_ROLE": "PSERVER", + "PADDLE_RANK": "{}".format(i + server_rank_offset), + } + log_tag = "ps.{}".format(i) + self.add_container(envs=e, log_tag=log_tag) + + for i in range(trainer_num): + e = { + "PADDLE_PSERVER_ENDPOINTS": ",".join(server_endpoints), + "PADDLE_TRAINER_ENDPOINTS": ",".join(trainer_endpoints), + "PADDLE_ROLE": 
"TRAINER_CPU", + "PADDLE_RANK": "{}".format(i + trainer_rank_offset), + } + log_tag = "trainer.{}".format(i) + self.add_container(envs=e, log_tag=log_tag) + ''' diff --git a/python/paddle/distributed/run/job/__init__.py b/python/paddle/distributed/run/job/__init__.py new file mode 100644 index 00000000000..66d2abbce21 --- /dev/null +++ b/python/paddle/distributed/run/job/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .pod import Pod +from .job import Job +from .container import Container +from .status import Status + +__all__ = [ + 'Pod', + 'Job', + 'Container', + 'Status', +] diff --git a/python/paddle/distributed/run/job/container.py b/python/paddle/distributed/run/job/container.py new file mode 100644 index 00000000000..651932d6c88 --- /dev/null +++ b/python/paddle/distributed/run/job/container.py @@ -0,0 +1,179 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from collections import OrderedDict +from paddle.distributed.run.utils.process_context import ProcessContext + +from .status import Status + +import os, copy, sys +import time + + +class Container(object): + ''' + TODO(kuizhiqing) A container can be run by process/thread or just a callable function + ''' + + def __init__(self, entrypoint=[], rank=-1, env={}): + self._entrypoint = entrypoint + self._rank = rank + self._out = None + self._err = None + self._env = env + self._proc = None + + self._retry: int = 3 + self._grace_period = 10 + + self._log_handler = None + + @property + def entrypoint(self): + return self._entrypoint + + @entrypoint.setter + def entrypoint(self, entry): + self._entrypoint = entry + + @property + def rank(self): + return self._rank + + @rank.setter + def rank(self, r): + self._rank = r + + @property + def outfile(self): + return self._out + + @outfile.setter + def outfile(self, out): + self._out = out + + @property + def errfile(self): + return self._err + + @errfile.setter + def errfile(self, err): + self._err = err + + def update_env(self, env={}, **kwargs): + env = {k: v for k, v in env.items() if isinstance(v, str)} + self._env.update(env) + + kwargs = {k: v for k, v in kwargs.items() if isinstance(v, str)} + self._env.update(kwargs) + + def _get_fd(self, pth): + if not pth: + return None + + try: + d = os.path.dirname(pth) + if not os.path.isdir(d): + os.makedirs(d, exist_ok=True) + return open(pth, 'w') + except: + return None + + def start(self, timeout=-1): + end = time.time() + timeout + + if self._proc and self._proc.alive(): + return True + + self._stdout = self._get_fd(self._out) or sys.stdout + if self._out == self._err: + self._stderr = self._stdout + elif self._err: + self._stderr = self._get_fd(self._err) or sys.stderr + + self._proc = ProcessContext( + self._entrypoint, env=self._env, out=self._stdout, err=self._stderr) + self._proc.start() + + while timeout > 0 and time.time() < end: + if self._proc.alive(): + time.sleep(0.1) + continue + if self._proc.exit_code() == 0: + return True + return False + + def terminate(self, force=False): + if self._log_handler: + self._log_handler.close() + self._log_handler = None + + if self._proc and self._proc.alive(): + return self._proc.terminate(force) + + def wait(self, timeout=None): + self._proc.wait(timeout) + + def exit_code(self): + return self._proc.exit_code() if self._proc else -1 + + def status(self): + if not self._proc: + return Status.UNINIT + if self._proc.alive(): + return Status.RUNNING + elif self._proc.exit_code() == 0: + return Status.COMPLETED + else: + return Status.FAILED + + def __str__(self): + return 'Container rank {} status {} cmd {} code {} log {} \nenv {}'.format( + self._rank, + self.status(), + self._entrypoint, + self.exit_code(), + self.errfile, + self._env, ) + + def logs(self, fn=None, offset=0, whence=1, lines=1000): + if not self._log_handler: + self._log_handler = open(self._out) + + if fn is None: + fn = sys.stdout + + self._log_handler.seek(offset, whence) + + try: + idx = 0 + for line in self._log_handler: + fn.write(line) + idx += 1 + if idx > lines: + break + finally: + return self._log_handler.tell() + + def tail(self, length=3000): + if not self._log_handler: + self._log_handler = open(self._out) + + self._log_handler.seek(0, 2) + ed = self._log_handler.tell() + + if ed > length: + self.logs(offset=ed - length, whence=0) + else: + self.logs(offset=0, whence=0) diff --git a/python/paddle/distributed/run/job/job.py b/python/paddle/distributed/run/job/job.py new 
file mode 100644 index 00000000000..3469ed86257 --- /dev/null +++ b/python/paddle/distributed/run/job/job.py @@ -0,0 +1,80 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class JobMode: + COLLECTIVE = 'collective' + PS = 'ps' + HETER = 'heter' + + +class Job(object): + def __init__(self, id='default', mode=JobMode.COLLECTIVE, np="1"): + self._mode = mode + self._id = id + + self._replicas = 0 + self._replicas_min = self._replicas + self._replicas_max = self._replicas + self._elastic = False + + self.set_replicas(str(np)) + + def __str__(self): + return "Job: {}, mode {}, replicas {}[{}:{}], elastic {}".format( + self.id, self.mode, self._replicas, self._replicas_min, + self._replicas_max, self.elastic) + + @property + def mode(self): + return self._mode + + @property + def id(self): + return self._id + + @property + def elastic(self): + return self._elastic + + @property + def replicas(self): + return self._replicas + + @property + def replicas_min(self): + return self._replicas_min + + @property + def replicas_max(self): + return self._replicas_max + + @replicas.setter + def replicas(self, replicas): + self._replicas = replicas + + def set_replicas(self, np: str): + np = str(np) if np else '1' + + if ':' in np: + nps = np.split(':') + self._replicas_min, self._replicas_max = int(nps[0]), int(nps[1]) + self._replicas = self._replicas_max # default to max + + self._elastic = True + else: + self._replicas = int(np) + self._replicas_min, self._replicas_max = self._replicas, self._replicas + + self._elastic = False diff --git a/python/paddle/distributed/run/job/pod.py b/python/paddle/distributed/run/job/pod.py new file mode 100644 index 00000000000..f7c31edce1d --- /dev/null +++ b/python/paddle/distributed/run/job/pod.py @@ -0,0 +1,185 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
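Job.set_replicas() above accepts either a fixed process count ("4") or an elastic range ("2:4"), defaulting the working replica count to the maximum of the range. A compact restatement of that parsing rule as a standalone sketch:

def parse_np(np):
    np = str(np) if np else "1"
    if ":" in np:
        lo, hi = (int(v) for v in np.split(":"))
        return {"replicas": hi, "min": lo, "max": hi, "elastic": True}
    n = int(np)
    return {"replicas": n, "min": n, "max": n, "elastic": False}

assert parse_np("2:4") == {"replicas": 4, "min": 2, "max": 4, "elastic": True}
assert parse_np(8)["elastic"] is False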
+ +from collections import OrderedDict +from .container import Container + +from .status import Status + +import random +import time + + +class PodSepc(object): + def __init__(self): + self._name = ''.join( + random.choice('abcdefghijklmnopqrstuvwxyz') for _ in range(6)) + + # by controller + self._init_containers: List[Container] = [] + self._containers: List[Container] = [] + + #self.resource: Resource = None + #self.status: Status = None + + self._rank = -1 + self._init_timeout = 120 # 2 min timeout for each init container + self._restart = -1 + self._replicas = 0 # number of containers + self._exit_code = 0 + + +class Pod(PodSepc): + def __init__(self): + super().__init__() + + def __str__(self): + return "Pod: {}, replicas {}, status {}".format(self.name, + self.replicas, + self.status()) + + def failed_container(self): + for c in self._containers: + if c.status() == Status.FAILED: + return c + return None + + @property + def name(self): + return self._name + + @property + def replicas(self): + return self._replicas + + @replicas.setter + def replicas(self, r): + self._replicas = r + + @property + def rank(self): + return self._rank + + @rank.setter + def rank(self, r): + self._rank = r + + @property + def restart(self): + return self._restart + + @property + def containers(self): + return self._containers + + def add_container(self, c): + c.rank = len(self._containers) + self._containers.append(c) + + @property + def init_containers(self): + return self._init_containers + + def add_init_container(self, c): + c.rank = len(self._init_containers) + self._init_containers.append(c) + + @property + def exit_code(self): + for c in self._containers: + if c.exit_code() != 0: + return c.exit_code() + return 0 + + def deploy(self): + for i in self._init_containers: + i.start(self._init_timeout) + + for c in self._containers: + c.start() + + self._restart += 1 + + def stop(self, sigint=0): + for c in self._containers: + force = True if sigint == 9 else False + c.terminate(force) + + def join(self): + for c in self._containers: + c.wait(None) + + def status(self): + if self.is_failed(): + return Status.FAILED + + if self.is_completed(): + return Status.COMPLETED + + return Status.READY + + def reset(self): + self._init_containers = [] + self._containers = [] + + def is_failed(self): + for c in self._containers: + if c.status() == Status.FAILED: + return True + return False + + def is_completed(self): + for c in self._containers: + if c.status() != Status.COMPLETED: + return False + return True + + def logs(self, idx=None): + if idx is None: + if self.failed_container(): + self.failed_container().logs() + else: + self._containers[0].logs() + else: + self._containers[idx].logs() + + def tail(self, idx=None): + if idx is None: + if self.failed_container(): + self.failed_container().tail() + else: + self._containers[0].tail() + else: + self._containers[idx].tail() + + def watch(self, + all_list=[Status.COMPLETED], + any_list=[Status.FAILED], + interval=1, + timeout=-1): + ''' + watch return if any container status in any_list + or all container status in all_list + ''' + end = time.time() + timeout + while timeout < 0 or time.time() < end: + for c in self._containers: + if c.status() in any_list: + return c.status() + + s = [c.status() for c in self._containers] + if len(set(s)) == 1 and s[0] in all_list: + return s[0] + + time.sleep(interval) diff --git a/python/paddle/distributed/run/job/status.py b/python/paddle/distributed/run/job/status.py new file mode 100644 index 00000000000..ae10c5adb6c --- 
/dev/null +++ b/python/paddle/distributed/run/job/status.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class Status(object): + UNINIT = "uninit" + READY = "ready" + RUNNING = "running" + FAILED = "failed" + TERMINATING = "terminating" + RESTARTING = "restarting" + UNKNOWN = "unknown" + COMPLETED = "completed" diff --git a/python/paddle/distributed/run/plugins/__init__.py b/python/paddle/distributed/run/plugins/__init__.py new file mode 100644 index 00000000000..ec91402a7aa --- /dev/null +++ b/python/paddle/distributed/run/plugins/__init__.py @@ -0,0 +1,50 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import six + +__all__ = [] + + +def log(ctx): + ctx.logger.info("----------- Configuration ----------------------") + for arg, value in sorted(six.iteritems(vars(ctx.args))): + ctx.logger.info("%s: %s" % (arg, value)) + ctx.logger.info("--------------------------------------------------") + + +def process_args(ctx): + # reset device by args + #argdev = ctx.args.gpus or ctx.args.xpus or ctx.args.npus + argdev = ctx.args.devices + if argdev: + ctx.node.device.labels = argdev.split(',') + ctx.node.device.count = len(ctx.node.device.labels) + ctx.logger.debug('Device reset by args {}'.format(argdev)) + + +def collective_compatible(ctx): + if 'PADDLE_TRAINER_ENDPOINTS' in ctx.envs: + ctx.master = ctx.envs['PADDLE_TRAINER_ENDPOINTS'].split(',')[0] + if 'DISTRIBUTED_TRAINER_ENDPOINTS' in ctx.envs: + ctx.master = ctx.envs['DISTRIBUTED_TRAINER_ENDPOINTS'].split(',')[0] + + +def rewrite_host_ip(ctx): + if ctx.args.host is not None and "." in ctx.args.host: + ctx.logger.warning('Host ip reset to {}'.format(ctx.args.host)) + ctx.node.ip = ctx.args.host + + +enabled_plugins = [collective_compatible, rewrite_host_ip, process_args, log] diff --git a/python/paddle/distributed/run/plugins/ip.py b/python/paddle/distributed/run/plugins/ip.py new file mode 100644 index 00000000000..0809ed5864d --- /dev/null +++ b/python/paddle/distributed/run/plugins/ip.py @@ -0,0 +1,30 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import socket + + +def get_local_ip(ctx): + _, ip = _get_host_name_ip() + ctx.args.host = ip + ctx.envs["POD_IP"] = ip + + +def _get_host_name_ip(): + try: + host_name = socket.gethostname() + host_ip = socket.gethostbyname(host_name) + return host_name, host_ip + except: + return None diff --git a/python/paddle/distributed/run/utils/kv_client.py b/python/paddle/distributed/run/utils/kv_client.py new file mode 100644 index 00000000000..e1919541226 --- /dev/null +++ b/python/paddle/distributed/run/utils/kv_client.py @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import requests +import time + + +class KVClient(object): + def __init__(self, endpoint='localhost:2379'): + self.endpoint = endpoint if endpoint.startswith( + "http://") else "http://{}".format(endpoint) + + def put(self, key, value): + key = key if key.startswith('/') else "/{}".format(key) + u = "{}{}".format(self.endpoint, key) + try: + r = requests.post(u, data=value, timeout=3) + if r.status_code == 200: + return True + else: + return False + except: + return False + + def get(self, key): + key = key if key.startswith('/') else "/{}".format(key) + u = "{}{}".format(self.endpoint, key) + try: + r = requests.get(u, timeout=3) + if r.status_code == 200: + ret = r.json() + return ret.get(key, '') + else: + return "error" + except: + return "" + + def get_prefix(self, key): + key = key if key.startswith('/') else "/{}".format(key) + u = "{}{}".format(self.endpoint, key) + try: + r = requests.get(u, timeout=3) + if r.status_code == 200: + return r.json() + except: + return "" + + def delete(self, key): + key = key if key.startswith('/') else "/{}".format(key) + u = "{}{}".format(self.endpoint, key) + try: + r = requests.delete(u, timeout=3) + if r.status_code == 200: + return True + else: + return False + except: + return False + + def wait_server_ready(self, timeout=3): + end = time.time() + timeout + while time.time() < end: + if self.get("/healthy") == "ok": + return True + + +if __name__ == '__main__': + cli = PKVClient("http://localhost:8090") + data = {"/workers/1": "rank1", "/workers/2": "rank2"} + for k, v in data.items(): + cli.put(k, v) + x = cli.get_prefix("/workers") + print(x) + for k, v in data.items(): + assert x[k] == v + + cli.put("key", "value") + print(cli.get("key")) + assert cli.get("key") == "value" + cli.delete("key") + print(cli.get("/key")) + print(cli.get("/healthy")) + assert cli.get("/healthy") == "ok" diff --git a/python/paddle/distributed/run/utils/kv_server.py b/python/paddle/distributed/run/utils/kv_server.py 
new file mode 100644 index 00000000000..2d7ae15f13d --- /dev/null +++ b/python/paddle/distributed/run/utils/kv_server.py @@ -0,0 +1,121 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from http.server import HTTPServer +import http.server as SimpleHTTPServer + +from multiprocessing import Process + +import threading +import json + + +class KVHandler(SimpleHTTPServer.SimpleHTTPRequestHandler): + def do_GET(self): + with self.server.kv_lock: + ret = {} + for k, v in self.server.kv.items(): + if k.startswith(self.path): + ret[k] = v.decode(encoding="utf-8") + if ret: + self.output(200, json.dumps(ret).encode("utf-8")) + else: + self.output(404) + + def do_PUT(self): + self.do_POST() + + def do_POST(self): + content_length = int(self.headers['Content-Length'] or 0) + try: + value = self.rfile.read(content_length) + with self.server.kv_lock: + self.server.kv[self.path] = value + self.output(200) + return + except: + self.output(500) + + def do_DELETE(self): + with self.server.kv_lock: + if self.path in self.server.kv: + del self.server.kv[self.path] + self.output(200) + else: + self.output(404) + + def output(self, code, value=''): + self.send_response(code) + self.send_header("Content-Length", len(value)) + self.send_header("Content-Type", "application/json; charset=utf8") + self.end_headers() + if value: + self.wfile.write(value) + + def log_message(self, format, *args): + return + + +class KVServer(HTTPServer, object): + def __init__(self, port): + super(KVServer, self).__init__(('', port), KVHandler) + self.kv_lock = threading.Lock() + self.kv = {'/healthy': b'ok'} + self.port = port + self.stopped = False + self.started = False + + def start(self): + self.listen_thread = threading.Thread(target=self.serve_forever) + self.listen_thread.start() + self.started = True + + def stop(self): + self.shutdown() + self.listen_thread.join() + self.server_close() + self.stopped = True + + +class PKVServer(): + def __init__(self, port): + self._server = KVServer(port) + + def start(self): + self.proc = Process(target=self._server.start) + self.proc.daemon = True + self.proc.start() + + def stop(self): + self._server.stop() + self.proc.join() + + @property + def started(self): + return self._server.started + + @property + def stopped(self): + return self._server.stopped + + +if __name__ == '__main__': + #kv = PKVServer(8090) + kv = KVServer(8090) + kv.start() + import time + + #print("serve at 8090 for 600 s") + + time.sleep(600) diff --git a/python/paddle/distributed/run/utils/process_context.py b/python/paddle/distributed/run/utils/process_context.py new file mode 100644 index 00000000000..4d6fa8de794 --- /dev/null +++ b/python/paddle/distributed/run/utils/process_context.py @@ -0,0 +1,83 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import subprocess +import os, sys, signal, time + + +class ProcessContext(object): + def __init__(self, + cmd, + env=os.environ, + out=sys.stdout, + err=sys.stderr, + group=True, + preexec_fn=None): + self._cmd = cmd + self._env = env + self._preexec_fn = preexec_fn + self._stdout = out + self._stderr = err + self._group = group if os.name != 'nt' else False + self._proc = None + self._code = None + + def _start(self): + pre_fn = os.setsid if self._group else None + self._proc = subprocess.Popen( + self._cmd, + env=self._env, + stdout=self._stdout, + stderr=self._stderr, + preexec_fn=self._preexec_fn or pre_fn) + + def _close_std(self): + try: + if not self._stdout.isatty(): + self._stdout.close() + + if not self._stderr.isatty(): + self._stderr.close() + except: + pass + + def alive(self): + return self._proc and self._proc.poll() is None + + def exit_code(self): + return self._proc.poll() if self._proc else None + + def start(self): + self._start() + + def terminate(self, force=False, max_retry=3): + for i in range(max_retry): + if self.alive(): + if self._group: + os.killpg(os.getpgid(self._proc.pid), signal.SIGTERM) + else: + self._proc.terminate() + time.sleep(0.2) + else: + break + + if force and self.alive(): + self._proc.kill() + + self._close_std() + + return self.alive() + + def wait(self, timeout=None): + self._proc.wait(timeout) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index b05f16a0606..cbe360f556c 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -949,6 +949,7 @@ if (WITH_DISTRIBUTE AND NOT APPLE) endif() # setting timeout value as 15S +set_tests_properties(test_run PROPERTIES TIMEOUT 200) set_tests_properties(test_sync_batch_norm_op PROPERTIES TIMEOUT 120) set_tests_properties(test_cross_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_lod_tensor_to_selected_rows PROPERTIES TIMEOUT 200) diff --git a/python/paddle/fluid/tests/unittests/test_run.py b/python/paddle/fluid/tests/unittests/test_run.py new file mode 100644 index 00000000000..8fe5fb9bb94 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_run.py @@ -0,0 +1,174 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
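ProcessContext.terminate() above retries a graceful SIGTERM on the process group before escalating to a kill. The same pattern applied to a bare subprocess.Popen, using only the standard library (illustrative sketch, not the real API):

import subprocess
import sys
import time

def stop(proc, retries=3):
    # graceful termination first, mirroring ProcessContext.terminate()
    for _ in range(retries):
        if proc.poll() is None:
            proc.terminate()
            time.sleep(0.2)
    # escalate if the process is still alive
    if proc.poll() is None:
        proc.kill()
    return proc.wait()

p = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(60)"])
stop(p)
assert p.poll() is not None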
+ +import unittest +import subprocess +import sys, os +import json +import shutil + +import random + +from os import listdir +from os.path import isfile, join + +pyname = 'train.py' +colpyfile = '''# train.py for unitest +import os +env = os.environ.copy() +assert "PADDLE_MASTER" in env +assert "PADDLE_GLOBAL_SIZE" in env +assert "PADDLE_LOCAL_SIZE" in env +assert "PADDLE_GLOBAL_RANK" in env +assert "PADDLE_LOCAL_RANK" in env +''' + +pspyfile = '''# train.py for unitest +import os +env = os.environ.copy() +assert "PADDLE_PSERVERS_IP_PORT_LIST" in env +assert "PADDLE_TRAINER_ENDPOINTS" in env +#assert "PADDLE_PSERVER_ENDPOINTS" in env +#assert "PADDLE_TRAINER_ENDPOINTS" in env +#assert "PADDLE_ROLE" in env +#assert "PADDLE_RANK" in env +''' + + +def write_file(name, ct): + with open(name, "w") as f: + f.write(ct) + + +def get_files(pth, prefix): + return [ + f for f in listdir(pth) if isfile(join(pth, f)) and f.startswith(prefix) + ] + + +class Collective_Test(unittest.TestCase): + def setUp(self): + write_file(pyname, colpyfile) + + def pdrun(self, args, env=None): + cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.run"] + if args: + cmd.extend(args.split(" ")) + cmd.extend([pyname]) + proc = subprocess.Popen(cmd, env) + return proc + + ''' + def test_collective_1(self): + args = "--id test1" + p = self.pdrun(args) + p.wait() + self.assertTrue(p.poll() == 0) + + ''' + + def test_collective_2(self): + if os.path.exists('./log'): + shutil.rmtree('./log') + + args = "--id test2 --devices 0,1,2" + p = self.pdrun(args) + p.wait() + self.assertTrue(p.poll() == 0) + + c = get_files('log', 'test2') + self.assertTrue(len(c) == 4) + + def test_collective_3(self): + if os.path.exists('./log'): + shutil.rmtree('./log') + + port = random.randrange(6000, 8000) + args = "--id test3 --devices 0,1 --master 127.0.0.1:{} --np 2".format( + port) + p1 = self.pdrun(args) + p2 = self.pdrun(args) + p1.wait() + p2.wait() + self.assertTrue(p1.poll() == 0) + self.assertTrue(p2.poll() == 0) + + c = get_files('log', 'test3') + self.assertTrue(len(c) == 6) + + +class PS_Test(unittest.TestCase): + def setUp(self): + write_file(pyname, pspyfile) + + def pdrun(self, args, env=None): + cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.run"] + if args: + cmd.extend(args.split(" ")) + cmd.extend([pyname]) + proc = subprocess.Popen(cmd, env) + return proc + + ''' + def test_ps_1(self): + args = "--mode ps" + p = self.pdrun(args) + p.wait() + self.assertTrue(p.poll() == 0) + + def test_ps_2(self): + if os.path.exists('./log'): + shutil.rmtree('./log') + + args = "--id ps2 --server_num=2 --trainer_num=2" + p = self.pdrun(args) + p.wait() + self.assertTrue(p.poll() == 0) + + c = get_files('log', 'ps2') + self.assertTrue(len(c) == 5) + ''' + + def test_ps_3(self): + if os.path.exists('./log'): + shutil.rmtree('./log') + + port = random.randrange(6000, 8000) + args = "--id ps3 --master 127.0.0.1:{} --np 2 --server_num=1 --trainer_num=1".format( + port) + p1 = self.pdrun(args) + p2 = self.pdrun(args) + p1.wait() + p2.wait() + self.assertTrue(p1.poll() == 0) + self.assertTrue(p2.poll() == 0) + + c = get_files('log', 'ps3') + self.assertTrue(len(c) == 6) + + def test_ps_4(self): + if os.path.exists('./log'): + shutil.rmtree('./log') + + args = "--id ps4 --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903" + p1 = self.pdrun(args) + p1.wait() + self.assertTrue(p1.poll() == 0) + + c = get_files('log', 'ps4') + self.assertTrue(len(c) == 5) + + +if __name__ == '__main__': + 
unittest.main() -- GitLab From c9f3ad0361d8d4773fb6cd207630ed28fa1cbf02 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 15 Mar 2022 13:11:17 +0800 Subject: [PATCH 063/176] Skip infrt when checking log fatal (#40529) * skip infrt when checking log fatal, test=document_fix * remove test=document_fix * update commit --- tools/check_file_diff_approvals.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 55d2d59c7ec..9c802a56a7b 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -198,7 +198,9 @@ if [ ${HAS_BOOST_GET} ] && [ "${GIT_PR_ID}" != "" ]; then check_approval 1 6836917 47554610 22561442 fi -HAS_LOG_FATAL=`git diff -U0 upstream/$BRANCH $FILTER |grep "^+" |grep -o -m 1 "LOG(FATAL)" || true` +# infrt needs to temporarily use LOG(FATAL) during the debugging period, and will replace it with standard error format in the future. +NO_INFRT_FILES=`git diff --name-only upstream/develop | grep -v "tools/\|paddle/infrt/" || true` +HAS_LOG_FATAL=`git diff -U0 upstream/$BRANCH $NO_INFRT_FILES |grep "^+" |grep -o -m 1 "LOG(FATAL)" || true` if [ ${HAS_LOG_FATAL} ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="LOG(FATAL) is not recommended, because it will throw exception without standard stack information, so please use PADDLE_THROW macro here. If you have to use LOG(FATAL) here, please request chenwhql (Recommend), luotao1 or lanxianghit review and approve.\n" check_approval 1 6836917 47554610 22561442 -- GitLab From 7701db3713e52db6e8feaad9e45a84bbc32c9c44 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Tue, 15 Mar 2022 13:22:06 +0800 Subject: [PATCH 064/176] Move one hot to phi (#39876) * move one hot to phi; test=develop * fix bugs; test=develop * fix bugs; test=develop * add infer meta; test=develop * fix bugs; test=develop * resolve confilct * resolve confilct * fix bug; * fix error; test=develop * update; test=develop * polish code; test=develop * add one api in eager mode; test=develop * add one hot test; test=develop * remove use less code; test=develop * fix bug; test=develop * polish code; test=develop * polish code; test=develop --- paddle/fluid/framework/infershape_utils.cc | 18 +++- paddle/fluid/framework/operator.cc | 45 +++++--- paddle/fluid/imperative/prepared_operator.h | 12 +++ paddle/fluid/operators/one_hot_v2_op.cc | 38 +++---- paddle/fluid/operators/one_hot_v2_op.cu | 100 ------------------ paddle/fluid/operators/one_hot_v2_op_npu.cc | 3 +- paddle/phi/core/compat/op_utils.h | 1 + paddle/phi/core/meta_tensor.cc | 4 + paddle/phi/infermeta/unary.cc | 37 +++++++ paddle/phi/infermeta/unary.h | 8 ++ .../kernels/cpu/one_hot_kernel.cc} | 85 +++++++-------- paddle/phi/kernels/gpu/one_hot_kernel.cu | 86 +++++++++++++++ paddle/phi/kernels/one_hot_kernel.cc | 38 +++++++ paddle/phi/kernels/one_hot_kernel.h | 36 +++++++ paddle/phi/ops/compat/one_hot_sig.cc | 37 +++++++ python/paddle/fluid/dygraph/tracer.py | 6 ++ .../tests/unittests/test_one_hot_v2_op.py | 19 ++-- python/paddle/nn/functional/input.py | 3 + python/paddle/utils/code_gen/api.yaml | 9 ++ 19 files changed, 394 insertions(+), 191 deletions(-) delete mode 100644 paddle/fluid/operators/one_hot_v2_op.cu rename paddle/{fluid/operators/one_hot_v2_op.h => phi/kernels/cpu/one_hot_kernel.cc} (50%) create mode 100644 paddle/phi/kernels/gpu/one_hot_kernel.cu create mode 100644 paddle/phi/kernels/one_hot_kernel.cc create mode 100644 
paddle/phi/kernels/one_hot_kernel.h create mode 100644 paddle/phi/ops/compat/one_hot_sig.cc diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 5119c306906..b1d7059f311 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -500,8 +500,22 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, "Unsupported attribute type is received when call " "InferShapeFunctor.")); } - } else { - // do nothing + } else if (ctx->HasInput(attr_name)) { + // convert from data + if (attr_defs[i].type_index == std::type_index(typeid(int32_t))) { + if (ctx->IsRuntime()) { + const auto& infershape_inputs = ctx->GetInputVarPtrs(attr_name); + auto var_temp = BOOST_GET_CONST(Variable*, infershape_inputs[i]); + auto val = experimental::MakePhiScalarFromVar(*var_temp); + int32_t val_int = val.template to(); + infer_meta_context.EmplaceBackAttr(val_int); + } else { + infer_meta_context.EmplaceBackAttr(-1); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Get value from variable only support int yet")); + } } } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f8e30c1ee29..f23a266ef03 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2250,41 +2250,62 @@ void OperatorWithKernel::BuildPhiKernelContext( } } else { // TODO(chenweihang): support other attrs later - auto& attr = Attrs().at(attr_names[i]); + auto attr_it = attrs_.find(attr_names[i]); if (attr_defs[i].type_index == std::type_index(typeid(int))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); + if (attr_it == attrs_.end()) { + auto in_it = ctx.inputs.find(attr_names[i]); + if (in_it != ctx.inputs.end()) { + // get data from input + auto val = experimental::MakePhiScalarFromVar(*(in_it->second[0])); + int32_t val_int = val.template to(); + pt_kernel_context->EmplaceBackAttr(val_int); + } else { + PADDLE_THROW(platform::errors::NotFound( + "can not find attribute `%s` both in attribute and input ", + attr_names[i])); + } + } else { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(int, attr_it->second)); + } } else if (attr_defs[i].type_index == std::type_index(typeid(float))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(float, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(bool, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(int64_t, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(std::string))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::string, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(phi::DataType))) { auto data_type = paddle::framework::TransToPhiDataType( static_cast( - BOOST_GET_CONST(int, attr))); + BOOST_GET_CONST(int, attr_it->second))); pt_kernel_context->EmplaceBackAttr(data_type); } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { - if (std::type_index(attr.type()) == + if 
(std::type_index(attr_it->second.type()) == std::type_index(typeid(std::vector))) { pt_kernel_context->EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + BOOST_GET_CONST(std::vector, attr_it->second)); + } else if (std::type_index(attr_it->second.type()) == std::type_index(typeid(std::vector))) { // Emplace Back Attr according to the type of Phi_Kernel args. - const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); + const auto& vector_int_attr = + BOOST_GET_CONST(std::vector, attr_it->second); const std::vector vector_int64_attr(vector_int_attr.begin(), vector_int_attr.end()); pt_kernel_context->EmplaceBackAttr(vector_int64_attr); } } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { - const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); + const auto& vector_int_attr = + BOOST_GET_CONST(std::vector, attr_it->second); pt_kernel_context->EmplaceBackAttr(vector_int_attr); } else { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 91e6974fa2e..8deb3b93e9c 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -419,6 +419,17 @@ void BuildDygraphPhiKernelContext( experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); } + } else if (ins.find(attr_names[i]) != ins.end()) { + // deal tensor attr here + auto& ins_vector = ins.at(attr_names[i]); + auto tensor_attr = + experimental::MakePhiScalarFromVar(ins_vector[0]->Var()); + if (attr_defs[i].type_index == std::type_index(typeid(int))) { + int val = tensor_attr.template to(); + kernel_ctx->EmplaceBackAttr(val); + } else { + PADDLE_THROW(platform::errors::Unimplemented("only support int here")); + } } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); @@ -475,6 +486,7 @@ void BuildDygraphPhiKernelContext( } } else { // TODO(chenweihang): support other attrs later + auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); if (attr_defs[i].type_index == std::type_index(typeid(int))) { kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); diff --git a/paddle/fluid/operators/one_hot_v2_op.cc b/paddle/fluid/operators/one_hot_v2_op.cc index e212f4e7e2b..122b6a8a80a 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cc +++ b/paddle/fluid/operators/one_hot_v2_op.cc @@ -12,9 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/one_hot_v2_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -22,26 +26,6 @@ namespace operators { class OneHotV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "one_hot_v2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "one_hot_v2"); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), 1, - platform::errors::InvalidArgument( - "Rank of Input(X) should be at least 1.")); - - int depth = ctx->Attrs().Get("depth"); - if (ctx->HasInput("depth_tensor")) { - depth = -1; - } - - auto out_dims_vec = phi::vectorize(x_dims); - out_dims_vec.push_back(depth); - auto out_dims = phi::make_ddim(out_dims_vec); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /* --> */ "Out"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -52,7 +36,7 @@ class OneHotV2Op : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, + const std::string& var_name, const framework::Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "depth_tensor") { return expected_kernel_type; @@ -114,10 +98,12 @@ Out is a LoDTensor: } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(one_hot_v2, OneHotInferShapeFunctor, + PD_INFER_META(phi::OneHotRawInferMeta)); + REGISTER_OPERATOR( one_hot_v2, ops::OneHotV2Op, ops::OneHotV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - one_hot_v2, ops::OneHotV2Kernel, - ops::OneHotV2Kernel); + paddle::framework::EmptyGradOpMaker, + OneHotInferShapeFunctor); diff --git a/paddle/fluid/operators/one_hot_v2_op.cu b/paddle/fluid/operators/one_hot_v2_op.cu deleted file mode 100644 index 77e2a931e50..00000000000 --- a/paddle/fluid/operators/one_hot_v2_op.cu +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/one_hot_v2_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data, - const int64_t numel, const int depth) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < numel && p_in_data[idx] >= 0 && p_in_data[idx] < depth) { - *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0; - } -} - -template -struct OneHotV2OpCUDAFunctor { - const framework::LoDTensor* in_; - framework::LoDTensor* out_; - const DeviceContext& ctx_; - int depth_; - - OneHotV2OpCUDAFunctor(const framework::LoDTensor* in, - framework::LoDTensor* out, int depth, - const DeviceContext& ctx) - : in_(in), out_(out), depth_(depth), ctx_(ctx) {} - - template - void apply() const { - auto* p_in_data = in_->data(); - auto numel = in_->numel(); - auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); - auto stream = ctx_.stream(); - phi::funcs::set_constant(ctx_, out_, 0.0); - - FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - p_in_data, p_out_data, numel, depth_); - } -}; - -using LoDTensor = framework::LoDTensor; -template -class OneHotV2CUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - - int depth = -1; - if (context.HasInput("depth_tensor")) { - auto* depth_tensor = context.Input("depth_tensor"); - if (platform::is_gpu_place(depth_tensor->place())) { - framework::Tensor temp; - paddle::framework::TensorCopySync(*depth_tensor, platform::CPUPlace(), - &temp); - depth = *temp.data(); - } else { - depth = *depth_tensor->data(); - } - - auto out_dims = out->dims(); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } else { - depth = context.Attr("depth"); - } - framework::VisitDataType( - static_cast( - context.Attr("dtype")), - OneHotV2OpCUDAFunctor( - in, out, depth, context.template device_context())); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - one_hot_v2, - ops::OneHotV2CUDAKernel, - ops::OneHotV2CUDAKernel); diff --git a/paddle/fluid/operators/one_hot_v2_op_npu.cc b/paddle/fluid/operators/one_hot_v2_op_npu.cc index acf6baf50b4..e5702a37bb2 100644 --- a/paddle/fluid/operators/one_hot_v2_op_npu.cc +++ b/paddle/fluid/operators/one_hot_v2_op_npu.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/one_hot_v2_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; template class OneHotV2NPUKernel : public framework::OpKernel { diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 00e9bff9bd5..7f4384545f3 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -55,6 +55,7 @@ const std::unordered_set deprecated_op_names({"diag", "expand_grad", "expand_as_grad", "sum", + "one_hot", "sum_grad", "top_k", "top_k_grad"}); diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc index 38a6e09a61e..bcbb1a4835b 100644 --- a/paddle/phi/core/meta_tensor.cc +++ b/paddle/phi/core/meta_tensor.cc @@ -72,6 +72,10 @@ void MetaTensor::set_layout(DataLayout layout) { } void MetaTensor::share_lod(const MetaTensor& meta_tensor) { + if (meta_tensor.lod().size() == 0) { + // no need share + return; + } if (phi::DenseTensor::classof(tensor_)) { DenseTensorUtils::GetMutableMeta(static_cast(tensor_))->lod = meta_tensor.lod(); diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index d09a2191fb2..4d1cb42bd59 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1602,6 +1602,43 @@ void UnfoldInferMeta(const MetaTensor& x, out->set_dims(phi::make_ddim(out_dims)); } +void OneHotRawInferMeta(const MetaTensor& x, + int32_t depth, + DataType dtype, + bool allow_out_of_range, + MetaTensor* out) { + auto x_dims = x.dims(); + PADDLE_ENFORCE_GE( + x_dims.size(), + 1, + phi::errors::InvalidArgument("Rank of Input(X) should be at least 1.")); + + auto out_dims_vec = phi::vectorize(x_dims); + out_dims_vec.push_back(depth); + auto out_dims = phi::make_ddim(out_dims_vec); + out->set_dims(out_dims); + out->share_lod(x); + out->set_dtype(dtype); +} + +void OneHotInferMeta(const MetaTensor& x, + const Scalar& depth_t, + MetaTensor* out) { + auto x_dims = x.dims(); + PADDLE_ENFORCE_GE( + x_dims.size(), + 1, + phi::errors::InvalidArgument("Rank of Input(X) should be at least 1.")); + + int depth = depth_t.to(); + auto out_dims_vec = phi::vectorize(x_dims); + out_dims_vec.push_back(depth); + auto out_dims = phi::make_ddim(out_dims_vec); + out->set_dims(out_dims); + out->share_lod(x); + out->set_dtype(phi::DataType::FLOAT32); +} + void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out) { auto rank = condition.dims().size(); PADDLE_ENFORCE_GE( diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index a1fc6fd4053..75fb9fadf82 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -228,6 +228,14 @@ void UnfoldInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void OneHotRawInferMeta(const MetaTensor& x, + int32_t depth, + DataType dtype, + bool allow_out_of_range, + MetaTensor* out); + +void OneHotInferMeta(const MetaTensor& x, const Scalar& depth, MetaTensor* out); + void WhereIndexInferMeta(const MetaTensor& condition, MetaTensor* out); } // namespace phi diff --git a/paddle/fluid/operators/one_hot_v2_op.h b/paddle/phi/kernels/cpu/one_hot_kernel.cc similarity index 50% rename from paddle/fluid/operators/one_hot_v2_op.h rename to paddle/phi/kernels/cpu/one_hot_kernel.cc index 9d42c5875bb..dc58489ebf7 100644 --- a/paddle/fluid/operators/one_hot_v2_op.h +++ 
b/paddle/phi/kernels/cpu/one_hot_kernel.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,23 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. -#pragma once -#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/kernels/one_hot_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" -namespace paddle { -namespace operators { +namespace phi { template struct OneHotV2OpFunctor { - const framework::LoDTensor* in_; - framework::LoDTensor* out_; + const DenseTensor* in_; + DenseTensor* out_; int depth_; const DeviceContext& ctx_; bool allow_out_of_range_; - OneHotV2OpFunctor(const framework::LoDTensor* in, framework::LoDTensor* out, - int depth, const DeviceContext& ctx, + OneHotV2OpFunctor(const DenseTensor* in, + DenseTensor* out, + int depth, + const DeviceContext& ctx, bool allow_out_of_range = false) : in_(in), out_(out), @@ -40,8 +42,8 @@ struct OneHotV2OpFunctor { void apply() const { auto* p_in_data = in_->data(); auto numel = in_->numel(); - auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); - phi::funcs::set_constant(ctx_, out_, 0.0); + auto* p_out_data = ctx_.template Alloc(out_); + funcs::set_constant(ctx_, out_, 0.0); if (allow_out_of_range_) { for (int i = 0; i < numel; ++i) { @@ -52,51 +54,46 @@ struct OneHotV2OpFunctor { } else { for (int i = 0; i < numel; ++i) { PADDLE_ENFORCE_GE( - p_in_data[i], 0, - platform::errors::InvalidArgument( + p_in_data[i], + 0, + phi::errors::InvalidArgument( "Illegal index value, Input(input) value should be at least 0, " "but received input (%d) less than 0", p_in_data[i])); PADDLE_ENFORCE_LT( - p_in_data[i], depth_, - platform::errors::InvalidArgument( + p_in_data[i], + depth_, + phi::errors::InvalidArgument( "Illegal index value, Input(input) value should be less than " "Input(depth), " "but received input (%d) not less than depth (%d)", - p_in_data[i], depth_)); + p_in_data[i], + depth_)); *(p_out_data + i * depth_ + p_in_data[i]) = 1.0; } } } }; -using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; -template -class OneHotV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int depth = context.Attr("depth"); - bool allow_out_of_range = context.Attr("allow_out_of_range"); - if (context.HasInput("depth_tensor")) { - auto* depth_tensor = context.Input("depth_tensor"); - auto* depth_data = depth_tensor->data(); - depth = depth_data[0]; - auto out_dims = out->dims(); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } - - framework::VisitDataType( - static_cast( - context.Attr("dtype")), - OneHotV2OpFunctor( - in, out, depth, context.template device_context(), - allow_out_of_range)); +template +void OneHotRawKernel(const Context& dev_ctx, + const DenseTensor& x, + int32_t depth, + DataType dtype, + bool allow_out_of_range, + DenseTensor* out) { + auto out_dims = out->dims(); + if (out_dims[out_dims.size() - 1] == -1) { + out_dims[out_dims.size() - 1] = depth; + out->Resize(out_dims); } -}; -} // namespace operators -} // namespace paddle + 
phi::VisitDataType(dtype, + OneHotV2OpFunctor( + &x, out, depth, dev_ctx, allow_out_of_range)); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + one_hot_raw, CPU, ALL_LAYOUT, phi::OneHotRawKernel, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/one_hot_kernel.cu b/paddle/phi/kernels/gpu/one_hot_kernel.cu new file mode 100644 index 00000000000..32c7fa1e85d --- /dev/null +++ b/paddle/phi/kernels/gpu/one_hot_kernel.cu @@ -0,0 +1,86 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/one_hot_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void FillOutputKernel(const InT* p_in_data, + OutT* p_out_data, + const int64_t numel, + const int depth) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < numel && p_in_data[idx] >= 0 && p_in_data[idx] < depth) { + *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0; + } +} + +template +struct OneHotV2OpCUDAFunctor { + const DenseTensor* in_; + DenseTensor* out_; + const DeviceContext& ctx_; + int depth_; + + OneHotV2OpCUDAFunctor(const DenseTensor* in, + DenseTensor* out, + int depth, + const DeviceContext& ctx) + : in_(in), out_(out), depth_(depth), ctx_(ctx) {} + + template + void apply() const { + auto* p_in_data = in_->data(); + auto numel = in_->numel(); + auto* p_out_data = ctx_.template Alloc(out_); + auto stream = ctx_.stream(); + funcs::set_constant(ctx_, out_, 0.0); + + FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(p_in_data, p_out_data, numel, depth_); + } +}; + +template +void OneHotRawKernel(const Context& dev_ctx, + const DenseTensor& x, + int32_t depth, + DataType dtype, + bool allow_out_of_range, + DenseTensor* out) { + auto out_dims = out->dims(); + if (out_dims[out_dims.size() - 1] == -1) { + out_dims[out_dims.size() - 1] = depth; + out->Resize(out_dims); + } + + phi::VisitDataType( + dtype, OneHotV2OpCUDAFunctor(&x, out, depth, dev_ctx)); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + one_hot_raw, GPU, ALL_LAYOUT, phi::OneHotRawKernel, int, int64_t) {} diff --git a/paddle/phi/kernels/one_hot_kernel.cc b/paddle/phi/kernels/one_hot_kernel.cc new file mode 100644 index 00000000000..633f48cbb62 --- /dev/null +++ b/paddle/phi/kernels/one_hot_kernel.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/one_hot_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void OneHotKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& num_classes_s, + DenseTensor* out) { + int num_classes = num_classes_s.to(); + OneHotRawKernel( + dev_ctx, x, num_classes, phi::DataType::FLOAT32, false, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(one_hot, CPU, ALL_LAYOUT, phi::OneHotKernel, int, int64_t) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(one_hot, GPU, ALL_LAYOUT, phi::OneHotKernel, int, int64_t) {} +#endif diff --git a/paddle/phi/kernels/one_hot_kernel.h b/paddle/phi/kernels/one_hot_kernel.h new file mode 100644 index 00000000000..9f89609ea63 --- /dev/null +++ b/paddle/phi/kernels/one_hot_kernel.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void OneHotKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& num_classes, + DenseTensor* out); + +template +void OneHotRawKernel(const Context& dev_ctx, + const DenseTensor& x, + int32_t depth, + DataType dtype, + bool allow_out_of_range, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/one_hot_sig.cc b/paddle/phi/ops/compat/one_hot_sig.cc new file mode 100644 index 00000000000..655969093c8 --- /dev/null +++ b/paddle/phi/ops/compat/one_hot_sig.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature OneHotOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("depth_tensor")) { + return KernelSignature("one_hot_raw", + {"X"}, + {"depth_tensor", "dtype", "allow_out_of_range"}, + {"Out"}); + } else { + return KernelSignature("one_hot_raw", + {"X"}, + {"depth", "dtype", "allow_out_of_range"}, + {"Out"}); + } +} + +} // namespace phi + +PD_REGISTER_BASE_KERNEL_NAME(one_hot_v2, one_hot); + +PD_REGISTER_ARG_MAPPING_FN(one_hot_v2, phi::OneHotOpArgumentMapping); diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index a7dd938a1cf..d0552ca41f0 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -52,6 +52,12 @@ final_state_name_mapping = { "axis1": "axis1", "axis2": "axis2", "out": "Out", + }, + "one_hot": { + "final_op_name": "final_state_one_hot", + "x": "X", + "num_class": "depth", + "out": "Out", } } diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py index 66de1b30979..fac25819211 100644 --- a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py @@ -22,7 +22,8 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.framework as framework -from paddle.fluid.framework import Program, program_guard +from paddle.framework import _in_eager_mode +from paddle.fluid.framework import Program, program_guard, _test_eager_guard class TestOneHotOp(OpTest): @@ -45,7 +46,7 @@ class TestOneHotOp(OpTest): self.outputs = {'Out': (out, x_lod)} def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() class TestOneHotOp_attr(OpTest): @@ -68,7 +69,7 @@ class TestOneHotOp_attr(OpTest): self.outputs = {'Out': (out, x_lod)} def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() class TestOneHotOp_default_dtype(OpTest): @@ -91,7 +92,7 @@ class TestOneHotOp_default_dtype(OpTest): self.outputs = {'Out': (out, x_lod)} def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() class TestOneHotOp_default_dtype_attr(OpTest): @@ -114,7 +115,7 @@ class TestOneHotOp_default_dtype_attr(OpTest): self.outputs = {'Out': (out, x_lod)} def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() class TestOneHotOp_out_of_range(OpTest): @@ -132,7 +133,7 @@ class TestOneHotOp_out_of_range(OpTest): self.outputs = {'Out': (out, x_lod)} def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() class TestOneHotOp_exception(unittest.TestCase): @@ -190,6 +191,12 @@ class TestOneHotOpApi(unittest.TestCase): one_hot_label = fluid.one_hot( input=fluid.dygraph.to_variable(label), depth=depth) + one_hot_label = paddle.nn.functional.one_hot( + fluid.dygraph.to_variable(label), depth) + with _test_eager_guard(): + one_hot_label = paddle.nn.functional.one_hot( + paddle.to_tensor(label), depth) + def _run(self, depth): label = fluid.layers.data(name="label", shape=[1], dtype="int64") one_hot_label = fluid.one_hot(input=label, depth=depth) diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index de8a7ff6d3c..4c30ed03735 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -19,6 +19,7 @@ from ...fluid.layer_helper import 
LayerHelper from ...fluid.data_feeder import check_variable_and_dtype, check_dtype from paddle import _C_ops from paddle import in_dynamic_mode +from paddle.framework import _in_eager_mode __all__ = [] @@ -87,6 +88,8 @@ def one_hot(x, num_classes, name=None): """ if in_dynamic_mode(): + if _in_eager_mode(): + return _C_ops.final_state_one_hot(x, num_classes) return _C_ops.one_hot_v2(x, 'depth', num_classes, 'allow_out_of_range', False) else: diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 639afeb4c86..0d012685b73 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -204,6 +204,15 @@ output : Tensor invoke : full_like(x, 0, dtype, place) + +- api : one_hot + args : (Tensor x, Scalar num_classes) + output : Tensor + infer_meta : + func : OneHotInferMeta + kernel : + func : one_hot + - api : digamma args : (Tensor x) output : Tensor -- GitLab From 36db75b4f445365c8c28b4b4db269d2be8571b1e Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 15 Mar 2022 13:24:16 +0800 Subject: [PATCH 065/176] Added more profile signposts to dygraph (#40201) * Added more signposts to dygraph profiling * Fixed minor issues * Refactored signpost names * Fixed typo * Removed debug codes * Fixed typo * Adjusted signpost names * Fixed issues from branch merge --- .../auto_code_generator/eager_generator.cc | 33 ++++++++++++++++--- .../final_state_generator/eager_gen.py | 16 ++++++++- .../final_state_generator/python_c_gen.py | 9 +++-- paddle/fluid/eager/backward.cc | 9 +++++ paddle/fluid/imperative/basic_engine.cc | 5 ++- paddle/fluid/imperative/tracer.cc | 31 +++++++++-------- python/paddle/utils/code_gen/api_base.py | 12 +++++-- python/paddle/utils/code_gen/api_gen.py | 3 ++ .../paddle/utils/code_gen/backward_api_gen.py | 2 ++ 9 files changed, 97 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index dc79a8a45a2..6a2e5e7ac6c 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1553,9 +1553,23 @@ static std::pair GenerateForwardFunctionContents( core_ops_returns_info[op_type] = return_contents; // [Generation] ComputeRequireGrad -> GradNodeCreation + if (!bwd_info.GenerateForwardOnly()) { std::string grad_node_creation_body_str = GenerateGradNodeCreationContent(fwd_info, bwd_info); + + // Add event record + std::string event_name = op_type + " node_creation"; + const char* NODE_CREATION_TEMPLATE = + "{\n" + " paddle::platform::RecordEvent node_creation_record_event(\"%s\", " + "paddle::platform::TracerEventType::Operator, 1);\n" + " %s\n" + "}"; + + grad_node_creation_body_str = paddle::string::Sprintf( + NODE_CREATION_TEMPLATE, event_name, grad_node_creation_body_str); + generated_function_body += grad_node_creation_body_str; generated_function_body += "\n"; @@ -1614,10 +1628,20 @@ static std::pair GenerateForwardFunctionContents( if ((*iter) == ',') dygraph_function_args_str.erase(iter); } - const char* FWD_FUNCTION_TEMPLATE = "%s %s(%s) {\n\n%s\n}\n\n"; + const char* DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE = + "paddle::platform::RecordEvent dygraph_entrance_record_event(\"%s\", " + "paddle::platform::TracerEventType::Operator, 1);"; + std::string event_name = op_type + " dygraph"; + std::string fwd_record_event_str = paddle::string::Sprintf( + DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE, event_name); + const 
char* FWD_FUNCTION_TEMPLATE = + "%s %s(%s) {\n\n" + " %s\n" + " %s\n" + "}\n\n"; std::string fwd_function_str = paddle::string::Sprintf( FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name, - dygraph_function_args_str, generated_function_body); + dygraph_function_args_str, fwd_record_event_str, generated_function_body); // [Generation] Generate forward functions header const char* FWD_HEADER_TEMPLATE = "%s %s(%s);\n"; @@ -2240,8 +2264,9 @@ static void GenerateForwardDygraphFile(const std::string& forward_cc_path, "\"paddle/fluid/eager/api/generated/fluid_generated/" "dygraph_forward_api.h\"\n" "#include " - "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n\n" - "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n"; + "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n" + "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n" + "#include \"paddle/fluid/platform/profiler/event_tracing.h\"\n\n"; std::string forward_cc_include_str = paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE); std::ofstream forward_cc_stream(forward_cc_path, std::ios::out); diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 537c2bb7f02..656418a05ad 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -923,8 +923,20 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map, backward_attrs_list, optional_inputs) + node_event_name = fwd_api_name + " node_creation" + NODE_CREATION_TEMPLATE = """{{\n + paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n + {}\n + }}""" + node_creation_str = NODE_CREATION_TEMPLATE.format(node_event_name, + node_creation_str) + + dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{fwd_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);" + FORWARD_FUNCTION_TEMPLATE = """ {} {}({}) {{ + {} + // Forward API Call {} @@ -938,7 +950,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, forward_function_name = GetForwardFunctionName(fwd_api_name) forward_function_str = FORWARD_FUNCTION_TEMPLATE.format( returns_type_str, forward_function_name, inputs_args_definition_str, - forward_call_str, node_creation_str, returns_str) + dygraph_event_str, forward_call_str, node_creation_str, returns_str) forward_function_declaration_str = f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});" return forward_function_str, forward_function_declaration_str @@ -1065,6 +1077,8 @@ def GenerateForwardCCFile(filepath, forward_definition_str): #include "paddle/phi/api/include/sparse_api.h" #include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" + """ file_contents += GenerateCoreOpInfoDefinition() diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index eee32a2c505..9b77f0449e0 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -94,9 +94,13 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, 
dygraph_function_call_list[pos] = f"{name}" dygraph_function_call_str = ",".join(dygraph_function_call_list) + pythonc_event_str = f"paddle::platform::RecordEvent pythonc_record_event(\"{fwd_api_name} pybind_imperative_func\", paddle::platform::TracerEventType::Operator, 1);" + PYTHON_C_FUNCTION_TEMPLATE = """ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObject *kwargs) {{ + {} + PyThreadState *tstate = nullptr; try {{ @@ -136,8 +140,8 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj fwd_function_name = namespace_str + GetForwardFunctionName(fwd_api_name) python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( - fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str, - fwd_function_name, dygraph_function_call_str) + fwd_api_name, pythonc_event_str, fwd_api_name, get_eager_tensor_str, + parse_attributes_str, fwd_function_name, dygraph_function_call_str) python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void)) {namespace_str}eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" @@ -231,6 +235,7 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): #include "paddle/fluid/pybind/op_function_common.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include namespace paddle {{ diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 603f93d9ddc..1987d024d8f 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -19,6 +19,8 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_tensor_holder.h" #include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" @@ -77,6 +79,9 @@ std::unordered_map getInDegreeMap( void RunBackward(const std::vector& tensors, const std::vector& grad_tensors, bool retain_graph) { + paddle::platform::RecordEvent backward_record_event( + "backward", paddle::platform::TracerEventType::Operator, 1); + VLOG(6) << "Start Backward"; // *Gradient Hook should happen at node-level // *Inplace version check should perform at node-level @@ -160,6 +165,10 @@ void RunBackward(const std::vector& tensors, while (!queue.empty()) { GradNodeBase* node = queue.front(); + paddle::platform::RecordEvent node_record_event( + std::string(typeid(*node).name()) + " grad_node", + paddle::platform::TracerEventType::Operator, 1); + if (queue.size() > 1 && node_in_degree_map[node] != 0) { queue.pop(); continue; diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 7416d206fc4..d7478b18dba 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -389,6 +389,9 @@ static void PerformBackwardInplace(const std::string& op_type, } void BasicEngine::Execute() { + platform::RecordEvent backward_record_event( + "backward", platform::TracerEventType::Operator, 1); + if (init_nodes_.empty()) { return; } @@ -412,7 +415,7 @@ void BasicEngine::Execute() { for (auto& cur_op : *shared_cur_node) { platform::RecordEvent op_type_record_event( - cur_op.Type(), platform::TracerEventType::Operator, 1); + cur_op.Type() + " 
grad_node", platform::TracerEventType::Operator, 1); ++op_num; diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 01c9d2847e0..c55599cc9aa 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -177,7 +177,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, paddle::framework::AttributeMap* passed_default_attrs_, bool use_default_attr_map) { platform::RecordEvent op_type_record_event( - type, platform::TracerEventType::Operator, 1); + type + " trace_op", platform::TracerEventType::Operator, 1); platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { @@ -297,19 +297,24 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, program_desc_tracer_->InsertOp(type, new_ins, outs, attrs); } - if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { - PADDLE_ENFORCE_EQ( - passed_default_attrs_, nullptr, - paddle::platform::errors::PermissionDenied( - "We expect passed_default_attrs_ is nullptr while " - "use_default_attr_map is true, however we got not null " - "passed_default_attrs_. Please check your usage of trace_op. ")); - CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, - inplace_map); - } else { - VLOG(3) << "No Grad to track for Op: " << type; + { + platform::RecordEvent node_creation_record_event( + type + " node_creation", platform::TracerEventType::Operator, 1); + + if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { + PADDLE_ENFORCE_EQ( + passed_default_attrs_, nullptr, + paddle::platform::errors::PermissionDenied( + "We expect passed_default_attrs_ is nullptr while " + "use_default_attr_map is true, however we got not null " + "passed_default_attrs_. Please check your usage of trace_op. 
")); + CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, + inplace_map); + } else { + VLOG(3) << "No Grad to track for Op: " << type; + } + VLOG(6) << "Finish Trace Op: " << type; } - VLOG(6) << "Finish Trace Op: " << type; } template void Tracer::TraceOp( diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index fe68548a22a..d91b76bb703 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -696,6 +696,7 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self code_indent) outputs_args, kernel_output_names, output_create = self.gene_output( self.outputs['types'], 'SetKernelOutput', code_indent, inplace_flag) + api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '') return f""" {code_indent} auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( {code_indent} "{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}}); @@ -709,7 +710,10 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self {code_indent} using kernel_signature = {kernel_signature}; {code_indent} auto* kernel_fn = kernel.GetVariadicKernelFn(); -{code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); +{code_indent} {{ +{code_indent} paddle::platform::RecordEvent kernel_record_event(\"{api_func_name} compute\", paddle::platform::TracerEventType::Operator, 1); +{code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); +{code_indent} }} {code_indent} return {self.gene_return_code()};""" @@ -719,6 +723,7 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self outputs_args, kernel_output_names, output_create = self.gene_output( self.outputs['types'], 'SetSelectedRowsKernelOutput', code_indent, inplace_flag) + api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '') return f""" {code_indent} auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( {code_indent} "{self.kernel['func'][1]}", {{kernel_backend, kernel_layout, kernel_data_type}}); @@ -732,7 +737,10 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self {code_indent} using kernel_signature = {kernel_signature}; {code_indent} auto* kernel_fn = kernel.GetVariadicKernelFn(); -{code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); +{code_indent} {{ +{code_indent} paddle::platform::RecordEvent kernel_record_event(\"{api_func_name} compute\", paddle::platform::TracerEventType::Operator, 1); +{code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); +{code_indent} }} {code_indent} return {self.gene_return_code()};""" diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index a404fc01784..98a3606952b 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -147,6 +147,9 @@ def source_include(header_file_path): #include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/infermeta/nullary.h" #include "paddle/phi/infermeta/unary.h" +#include "paddle/phi/kernels/declarations.h" + +#include "paddle/fluid/platform/profiler/event_tracing.h" """ diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index 7417d6bb030..5506f71f4b6 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -154,6 +154,8 @@ def source_include(header_file_path): #include 
"paddle/phi/core/kernel_registry.h" #include "paddle/phi/api/include/api.h" #include "paddle/phi/infermeta/backward.h" + +#include "paddle/fluid/platform/profiler/event_tracing.h" """ -- GitLab From c46e661dfd4dca4ccb2d8bb83225d2dcb0bff52c Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Tue, 15 Mar 2022 14:10:21 +0800 Subject: [PATCH 066/176] [Phi]move reduce_min/any/all kernel (#40374) * add reduce_min kernel * remove raw reduce_min kernel * add reduce min * add reduce any all impl * add bool reduce Kernel * remove raw any/all kernel * add any all kernel * rm comment --- .../operators/reduce_ops/reduce_all_op.cc | 20 ++++-- .../operators/reduce_ops/reduce_all_op.cu | 19 ------ .../operators/reduce_ops/reduce_any_op.cc | 20 ++++-- .../operators/reduce_ops/reduce_any_op.cu | 20 ------ .../reduce_ops/reduce_any_op_npu_test.cc | 2 +- .../operators/reduce_ops/reduce_min_op.cc | 31 +++++++--- .../operators/reduce_ops/reduce_min_op.cu | 23 ------- paddle/phi/core/compat/op_utils.h | 3 + paddle/phi/kernels/cpu/reduce.h | 25 ++++++++ paddle/phi/kernels/cpu/reduce_all_kernel.cc | 37 +++++++++++ paddle/phi/kernels/cpu/reduce_any_kernel.cc | 37 +++++++++++ paddle/phi/kernels/cpu/reduce_min_kernel.cc | 39 ++++++++++++ paddle/phi/kernels/funcs/reduce_functor.h | 24 +++++++ paddle/phi/kernels/gpu/reduce_all_kernel.cu | 36 +++++++++++ paddle/phi/kernels/gpu/reduce_any_kernel.cu | 36 +++++++++++ paddle/phi/kernels/gpu/reduce_min_kernel.cu | 37 +++++++++++ paddle/phi/kernels/reduce_all_kernel.cc | 37 +++++++++++ paddle/phi/kernels/reduce_all_kernel.h | 35 +++++++++++ paddle/phi/kernels/reduce_any_kernel.cc | 37 +++++++++++ paddle/phi/kernels/reduce_any_kernel.h | 35 +++++++++++ paddle/phi/kernels/reduce_max_kernel.h | 3 - paddle/phi/kernels/reduce_min_kernel.cc | 39 ++++++++++++ paddle/phi/kernels/reduce_min_kernel.h | 35 +++++++++++ paddle/phi/ops/compat/reduce_sig.cc | 62 +++++++++++++++++-- 24 files changed, 605 insertions(+), 87 deletions(-) delete mode 100644 paddle/fluid/operators/reduce_ops/reduce_all_op.cu delete mode 100644 paddle/fluid/operators/reduce_ops/reduce_any_op.cu delete mode 100644 paddle/fluid/operators/reduce_ops/reduce_min_op.cu create mode 100644 paddle/phi/kernels/cpu/reduce_all_kernel.cc create mode 100644 paddle/phi/kernels/cpu/reduce_any_kernel.cc create mode 100644 paddle/phi/kernels/cpu/reduce_min_kernel.cc create mode 100644 paddle/phi/kernels/gpu/reduce_all_kernel.cu create mode 100644 paddle/phi/kernels/gpu/reduce_any_kernel.cu create mode 100644 paddle/phi/kernels/gpu/reduce_min_kernel.cu create mode 100644 paddle/phi/kernels/reduce_all_kernel.cc create mode 100644 paddle/phi/kernels/reduce_all_kernel.h create mode 100644 paddle/phi/kernels/reduce_any_kernel.cc create mode 100644 paddle/phi/kernels/reduce_any_kernel.h create mode 100644 paddle/phi/kernels/reduce_min_kernel.cc create mode 100644 paddle/phi/kernels/reduce_min_kernel.h diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc index 955cf8d4448..9115d21b195 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class OpDesc; @@ -28,9 +32,17 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle 
+DECLARE_INFER_SHAPE_FUNCTOR(reduce_all, ReduceAllInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); +class ReduceAllOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_all"; } + virtual std::string GetOpType() const { return "Reduce reduce_all"; } +}; // kernel's device type is decided by input tensor place, to be consistent with // compare and logical ops -REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_all, UseInputPlace); -REGISTER_OP_CPU_KERNEL(reduce_all, - ops::BoolReduceKernel); +REGISTER_OPERATOR( + reduce_all, ops::ReduceOpUseInputPlace, ReduceAllOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ReduceAllInferShapeFunctor); diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu deleted file mode 100644 index a1f1a228aeb..00000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_all, - ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc index fa3800dd3c9..69561b93498 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc @@ -14,6 +14,9 @@ #include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { class OpDesc; @@ -28,9 +31,18 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(reduce_any, ReduceAnyInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +class ReduceAnyOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_any"; } + virtual std::string GetOpType() const { return "Reduce reduce_any"; } +}; // kernel's device type is decided by input tensor place, to be consistent with // compare and logical ops -REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_any, UseInputPlace); -REGISTER_OP_CPU_KERNEL(reduce_any, - ops::BoolReduceKernel); +REGISTER_OPERATOR( + reduce_any, ops::ReduceOpUseInputPlace, ReduceAnyOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ReduceAnyInferShapeFunctor); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu deleted file mode 100644 index 2e93e67debb..00000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_any, - ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc index d057ee8f5d7..e327d19ab3b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc @@ -35,7 +35,7 @@ namespace p = paddle::platform; using Tensor = paddle::framework::Tensor; -USE_OP(reduce_any); +USE_OP_ITSELF(reduce_any); USE_OP_DEVICE_KERNEL(reduce_any, NPU); template diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc index 11aa78382e3..b9915f2b484 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc @@ -14,15 +14,28 @@ #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" -REGISTER_REDUCE_OP(reduce_min); -REGISTER_OP_CPU_KERNEL( - reduce_min, ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +namespace ops = paddle::operators; + +class ReduceMinOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_min"; } + virtual std::string GetOpType() const { return "Reduce reduce_min"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_min, ReduceMinInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +REGISTER_OPERATOR( + reduce_min, ops::ReduceOp, ReduceMinOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceMinInferShapeFunctor); +REGISTER_OPERATOR(reduce_min_grad, ops::ReduceGradOp) + REGISTER_OP_CPU_KERNEL( reduce_min_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu deleted file mode 100644 index 44548b8d2e7..00000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - -// reduce_min -REGISTER_OP_CUDA_KERNEL( - reduce_min, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 7f4384545f3..b1da573c49f 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -48,6 +48,9 @@ const std::unordered_set deprecated_op_names({"diag", "matmul_grad_grad", "mean", "max", + "min", + "any", + "all", "reshape", "reshape_grad", "expand", diff --git a/paddle/phi/kernels/cpu/reduce.h b/paddle/phi/kernels/cpu/reduce.h index 4e268d40038..af67bdf5d62 100644 --- a/paddle/phi/kernels/cpu/reduce.h +++ b/paddle/phi/kernels/cpu/reduce.h @@ -239,4 +239,29 @@ void Reduce(const DeviceContext& dev_ctx, } } +template +void BoolReduceKernel(const DeviceContext& dev_ctx, + const phi::DenseTensor& input, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + phi::DenseTensor* output) { + dev_ctx.template Alloc(output); + + // The dims has full dim, set the reduce_all is True + const auto& input_dim_size = input.dims().size(); + std::set dims_set(dims.begin(), dims.end()); + bool full_dim = true; + for (auto i = 0; i < input_dim_size; i++) { + if (dims_set.find(i) == dims_set.end()) { + full_dim = false; + break; + } + } + reduce_all = (reduce_all || full_dim); + + ReduceKernelImpl( + dev_ctx, input, output, dims, keep_dim, reduce_all); +} + } // namespace phi diff --git a/paddle/phi/kernels/cpu/reduce_all_kernel.cc b/paddle/phi/kernels/cpu/reduce_all_kernel.cc new file mode 100644 index 00000000000..3e8e38ee444 --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_all_kernel.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_all_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void AllRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + phi::BoolReduceKernel( + dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(all_raw, CPU, ALL_LAYOUT, phi::AllRawKernel, bool) {} diff --git a/paddle/phi/kernels/cpu/reduce_any_kernel.cc b/paddle/phi/kernels/cpu/reduce_any_kernel.cc new file mode 100644 index 00000000000..4fd71f1d0b1 --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_any_kernel.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_any_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void AnyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + phi::BoolReduceKernel( + dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(any_raw, CPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} diff --git a/paddle/phi/kernels/cpu/reduce_min_kernel.cc b/paddle/phi/kernels/cpu/reduce_min_kernel.cc new file mode 100644 index 00000000000..0a241c81dbe --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_min_kernel.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/reduce_min_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void MinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + min_raw, CPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h index 4e83d0fa371..c74880e0432 100644 --- a/paddle/phi/kernels/funcs/reduce_functor.h +++ b/paddle/phi/kernels/funcs/reduce_functor.h @@ -49,5 +49,29 @@ struct MaxFunctor { } }; +//////// Min Functor /////// +struct MinFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->minimum(dim); + } +}; + +//////// All Functor /////// +struct AllFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->all(dim); + } +}; + +//////// Any Functor /////// +struct AnyFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->any(dim); + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/reduce_all_kernel.cu b/paddle/phi/kernels/gpu/reduce_all_kernel.cu new file mode 100644 index 00000000000..2963d3f206c --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_all_kernel.cu @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_all_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +template +void AllRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(all_raw, GPU, ALL_LAYOUT, phi::AllRawKernel, bool) {} diff --git a/paddle/phi/kernels/gpu/reduce_any_kernel.cu b/paddle/phi/kernels/gpu/reduce_any_kernel.cu new file mode 100644 index 00000000000..39c8cbe442c --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_any_kernel.cu @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_any_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +template +void AnyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(any_raw, GPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} diff --git a/paddle/phi/kernels/gpu/reduce_min_kernel.cu b/paddle/phi/kernels/gpu/reduce_min_kernel.cu new file mode 100644 index 00000000000..ba37d54895d --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_min_kernel.cu @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_min_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +template +void MinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + min_raw, GPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/reduce_all_kernel.cc b/paddle/phi/kernels/reduce_all_kernel.cc new file mode 100644 index 00000000000..3cbd0976ad8 --- /dev/null +++ b/paddle/phi/kernels/reduce_all_kernel.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/reduce_all_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void AllKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + AllRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {} +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(all, GPU, ALL_LAYOUT, phi::AllKernel, bool) {} +#endif diff --git a/paddle/phi/kernels/reduce_all_kernel.h b/paddle/phi/kernels/reduce_all_kernel.h new file mode 100644 index 00000000000..8d7a9ab3faf --- /dev/null +++ b/paddle/phi/kernels/reduce_all_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void AllRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void AllKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/reduce_any_kernel.cc b/paddle/phi/kernels/reduce_any_kernel.cc new file mode 100644 index 00000000000..371dd972129 --- /dev/null +++ b/paddle/phi/kernels/reduce_any_kernel.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
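reduce_all_kernel.cc above also shows the convention this series keeps repeating: the public AllKernel pins reduce_all to false and forwards to AllRawKernel, which carries the extra flag and is what the *_raw signatures map to. A self-contained sketch of that forwarding shape, with stand-in types (FakeContext and FakeTensor are invented for illustration; they are not the phi DeviceContext or DenseTensor):

#include <cstdint>
#include <iostream>
#include <vector>

// Stand-ins for dev_ctx and DenseTensor; not the phi API.
struct FakeContext {};
struct FakeTensor { std::vector<bool> data; };

// "Raw" variant: carries the reduce_all flag. The input is treated as 1-D,
// so reducing over dims == {0} and reduce_all coincide in this toy version.
void AllRawKernel(const FakeContext&, const FakeTensor& x,
                  const std::vector<int64_t>& dims, bool keep_dim,
                  bool reduce_all, FakeTensor* out) {
  (void)dims; (void)keep_dim; (void)reduce_all;
  bool acc = true;
  for (bool v : x.data) acc = acc && v;
  out->data = {acc};
}

// Public variant: fixes reduce_all = false and forwards, mirroring AllKernel.
void AllKernel(const FakeContext& ctx, const FakeTensor& x,
               const std::vector<int64_t>& dims, bool keep_dim,
               FakeTensor* out) {
  bool reduce_all = false;
  AllRawKernel(ctx, x, dims, keep_dim, reduce_all, out);
}

int main() {
  FakeTensor x{{true, true, false}};
  FakeTensor out;
  AllKernel(FakeContext{}, x, {0}, /*keep_dim=*/false, &out);
  std::cout << std::boolalpha << out.data[0] << "\n";  // false
}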
+ +#include "paddle/phi/kernels/reduce_any_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void AnyKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + AnyRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {} +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {} +#endif diff --git a/paddle/phi/kernels/reduce_any_kernel.h b/paddle/phi/kernels/reduce_any_kernel.h new file mode 100644 index 00000000000..0f505817084 --- /dev/null +++ b/paddle/phi/kernels/reduce_any_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void AnyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void AnyKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/reduce_max_kernel.h b/paddle/phi/kernels/reduce_max_kernel.h index 7560473d43c..49a350519c5 100644 --- a/paddle/phi/kernels/reduce_max_kernel.h +++ b/paddle/phi/kernels/reduce_max_kernel.h @@ -15,9 +15,6 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/infermeta/binary.h" -#include "paddle/phi/infermeta/unary.h" -#include "paddle/phi/kernels/empty_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/reduce_min_kernel.cc b/paddle/phi/kernels/reduce_min_kernel.cc new file mode 100644 index 00000000000..c8ec6b3678c --- /dev/null +++ b/paddle/phi/kernels/reduce_min_kernel.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/reduce_min_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void MinKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + MinRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + min, CPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL( + min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} +#endif diff --git a/paddle/phi/kernels/reduce_min_kernel.h b/paddle/phi/kernels/reduce_min_kernel.h new file mode 100644 index 00000000000..3227ec00e64 --- /dev/null +++ b/paddle/phi/kernels/reduce_min_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void MinKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index 997f1505bd0..dcb00fe1b0c 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -41,8 +41,7 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in // InferShape, so we must return the "mean_raw" KernelSignature. // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with - // the - // "mean_raw" KernelSignature + // the "mean_raw" KernelSignature if (ctx.IsForInferShape() || reduce_all) { return KernelSignature( "mean_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); @@ -63,8 +62,7 @@ KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in // InferShape, so we must return the "max_raw" KernelSignature. // And the InferMeta function(i.e. 
ReduceInferMetaBase) is accordance with - // the - // "max_raw" KernelSignature + // the "max_raw" KernelSignature if (ctx.IsForInferShape() || reduce_all) { return KernelSignature( "max_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); @@ -74,6 +72,54 @@ KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("unregistered", {}, {}, {}); } +KernelSignature ReduceMinOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "min_raw" KernelSignature. + // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with + // the "min_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "min_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("min", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + +KernelSignature ReduceAnyOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "any_raw" KernelSignature. + // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with + // the "any_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "any_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("any", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + +KernelSignature ReduceAllOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "all_raw" KernelSignature. + // And the InferMeta function(i.e. 
ReduceInferMetaBase) is accordance with + // the "all_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "all_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("all", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); +} + KernelSignature ReduceSumGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( @@ -88,11 +134,19 @@ KernelSignature ReduceSumGradOpArgumentMapping( PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); PD_REGISTER_BASE_KERNEL_NAME(reduce_max, max); +PD_REGISTER_BASE_KERNEL_NAME(reduce_min, min); +PD_REGISTER_BASE_KERNEL_NAME(reduce_all, all); +PD_REGISTER_BASE_KERNEL_NAME(reduce_any, any); + PD_REGISTER_BASE_KERNEL_NAME(reduce_sum_grad, sum_grad); PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_prod, phi::ReduceProdOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_max, phi::ReduceMaxOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_min, phi::ReduceMinOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_all, phi::ReduceAllOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_any, phi::ReduceAnyOpArgumentMapping); + PD_REGISTER_ARG_MAPPING_FN(reduce_sum_grad, phi::ReduceSumGradOpArgumentMapping); -- GitLab From 7039f61e92344a4b9f469d9c74b50d9627618736 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 15 Mar 2022 14:21:23 +0800 Subject: [PATCH 067/176] add softmax yaml and add_raw infermeta (#40534) --- paddle/phi/infermeta/binary.cc | 3 +++ python/paddle/utils/code_gen/api.yaml | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 2947661517e..ff2cf81a904 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -18,6 +18,7 @@ limitations under the License. 
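The comments repeated in the new mapping functions above all encode one rule: when mapping for InferShape, or when the op was configured with reduce_all=true, the *_raw signature (which exposes the reduce_all attribute) must be chosen; otherwise the lean signature is used. Distilled into a trivial, purely illustrative helper (PickReduceSignature is invented here, not part of phi):

#include <iostream>
#include <string>

// InferShape always needs the reduce_all attribute, so it gets the raw form.
std::string PickReduceSignature(const std::string& base,
                                bool is_for_infer_shape, bool reduce_all) {
  return (is_for_infer_shape || reduce_all) ? base + "_raw" : base;
}

int main() {
  std::cout << PickReduceSignature("min", true, false) << "\n";   // min_raw
  std::cout << PickReduceSignature("min", false, false) << "\n";  // min
  std::cout << PickReduceSignature("all", false, true) << "\n";   // all_raw
}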
*/ #include #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/kernels/funcs/common_shape.h" namespace phi { @@ -813,3 +814,5 @@ void TriangularSolveInferMeta(const MetaTensor& x, } } // namespace phi + +PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta); diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 0d012685b73..d24b64bf661 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -177,6 +177,14 @@ kernel : func : sign +- api : softmax + args : (Tensor x, int axis) + output : Tensor + infer_meta : + func : SoftmaxInferMeta + kernel : + func : softmax + - api : split args : (Tensor x, ScalarArray num_or_sections, Scalar axis) output : Tensor[] -- GitLab From 64223620e92b1aac5b84af1f2bafad68d0384116 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Tue, 15 Mar 2022 14:47:14 +0800 Subject: [PATCH 068/176] [phi] Transfer lgamma, kldiv_loss, isclose, cumprod kernels into phi and pass the tests of these four kernels (#39770) * transfer and pass the lgamma unittest * merge and pass the test * transfer kldiv_loss and kldiv_loss_grad; pass the unittest * transfer the isclose and cumprod kernel * change PT_REGISTER -> PD_REGISTER * fix by code review * fix by code review * fix * remove enforce include dependence from scalar * fix * fix by code review * fix by code review --- paddle/fluid/operators/cumprod_op.cc | 16 +- paddle/fluid/operators/cumprod_op.cu | 369 ------------------ paddle/fluid/operators/cumprod_op.h | 170 -------- paddle/fluid/operators/isclose_op.cc | 43 -- paddle/fluid/operators/isclose_op.cu | 85 ---- paddle/fluid/operators/isclose_op.h | 93 ----- paddle/fluid/operators/kldiv_loss_op.cc | 8 - paddle/fluid/operators/kldiv_loss_op.cu | 22 -- paddle/fluid/operators/kldiv_loss_op.h | 119 ------ paddle/fluid/operators/kldiv_loss_op_npu.cc | 3 +- paddle/fluid/operators/lgamma_op.cc | 30 +- paddle/fluid/operators/lgamma_op.cu | 59 --- paddle/fluid/operators/lgamma_op.h | 100 ----- paddle/fluid/operators/math/inclusive_scan.h | 18 +- paddle/phi/api/lib/utils/CMakeLists.txt | 2 +- paddle/phi/common/CMakeLists.txt | 1 + paddle/phi/common/scalar.cc | 35 ++ paddle/phi/common/scalar.h | 11 +- paddle/phi/kernels/cpu/cumprod_grad_kernel.cc | 113 ++++++ paddle/phi/kernels/cpu/cumprod_kernel.cc | 65 +++ paddle/phi/kernels/cpu/isclose_kernel.cc | 21 + .../phi/kernels/cpu/kldiv_loss_grad_kernel.cc | 22 ++ paddle/phi/kernels/cpu/kldiv_loss_kernel.cc | 23 ++ paddle/phi/kernels/cpu/lgamma_grad_kernel.cc | 20 + paddle/phi/kernels/cpu/lgamma_kernel.cc | 49 +++ paddle/phi/kernels/cumprod_grad_kernel.h | 28 ++ paddle/phi/kernels/cumprod_kernel.h | 26 ++ paddle/phi/kernels/funcs/cumprod.h | 52 +++ .../phi/kernels/funcs/elementwise_functor.h | 5 + paddle/phi/kernels/gpu/cumprod_grad_kernel.cu | 320 +++++++++++++++ paddle/phi/kernels/gpu/cumprod_kernel.cu | 60 +++ paddle/phi/kernels/gpu/isclose_kernel.cu | 22 ++ .../phi/kernels/gpu/kldiv_loss_grad_kernel.cu | 22 ++ paddle/phi/kernels/gpu/kldiv_loss_kernel.cu | 21 + paddle/phi/kernels/gpu/lgamma_grad_kernel.cu | 21 + paddle/phi/kernels/gpu/lgamma_kernel.cu | 41 ++ paddle/phi/kernels/impl/isclose_kernel_impl.h | 176 +++++++++ .../impl/kldiv_loss_grad_kernel_impl.h | 70 ++++ .../phi/kernels/impl/kldiv_loss_kernel_impl.h | 69 ++++ .../kernels/impl/lgamma_grad_kernel_impl.h | 47 +++ paddle/phi/kernels/isclose_kernel.h | 30 ++ paddle/phi/kernels/kldiv_loss_grad_kernel.h | 29 ++
paddle/phi/kernels/kldiv_loss_kernel.h | 29 ++ paddle/phi/kernels/lgamma_grad_kernel.h | 27 ++ paddle/phi/kernels/lgamma_kernel.h | 26 ++ paddle/phi/ops/compat/cumprod_sig.cc | 29 ++ paddle/phi/ops/compat/isclose_sig.cc | 50 +++ paddle/phi/ops/compat/kldiv_loss_sig.cc | 30 ++ paddle/phi/ops/compat/lgamma_sig.cc | 25 ++ 49 files changed, 1632 insertions(+), 1120 deletions(-) delete mode 100644 paddle/fluid/operators/cumprod_op.cu delete mode 100644 paddle/fluid/operators/cumprod_op.h delete mode 100644 paddle/fluid/operators/isclose_op.cu delete mode 100644 paddle/fluid/operators/isclose_op.h delete mode 100644 paddle/fluid/operators/kldiv_loss_op.cu delete mode 100644 paddle/fluid/operators/kldiv_loss_op.h delete mode 100644 paddle/fluid/operators/lgamma_op.cu delete mode 100644 paddle/fluid/operators/lgamma_op.h create mode 100644 paddle/phi/common/scalar.cc create mode 100644 paddle/phi/kernels/cpu/cumprod_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/cumprod_kernel.cc create mode 100644 paddle/phi/kernels/cpu/isclose_kernel.cc create mode 100644 paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/kldiv_loss_kernel.cc create mode 100644 paddle/phi/kernels/cpu/lgamma_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/lgamma_kernel.cc create mode 100644 paddle/phi/kernels/cumprod_grad_kernel.h create mode 100644 paddle/phi/kernels/cumprod_kernel.h create mode 100644 paddle/phi/kernels/funcs/cumprod.h create mode 100644 paddle/phi/kernels/gpu/cumprod_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/cumprod_kernel.cu create mode 100644 paddle/phi/kernels/gpu/isclose_kernel.cu create mode 100644 paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/kldiv_loss_kernel.cu create mode 100644 paddle/phi/kernels/gpu/lgamma_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/lgamma_kernel.cu create mode 100644 paddle/phi/kernels/impl/isclose_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/isclose_kernel.h create mode 100644 paddle/phi/kernels/kldiv_loss_grad_kernel.h create mode 100644 paddle/phi/kernels/kldiv_loss_kernel.h create mode 100644 paddle/phi/kernels/lgamma_grad_kernel.h create mode 100644 paddle/phi/kernels/lgamma_kernel.h create mode 100644 paddle/phi/ops/compat/cumprod_sig.cc create mode 100644 paddle/phi/ops/compat/isclose_sig.cc create mode 100644 paddle/phi/ops/compat/kldiv_loss_sig.cc create mode 100644 paddle/phi/ops/compat/lgamma_sig.cc diff --git a/paddle/fluid/operators/cumprod_op.cc b/paddle/fluid/operators/cumprod_op.cc index bff6673429d..90910bbbb20 100644 --- a/paddle/fluid/operators/cumprod_op.cc +++ b/paddle/fluid/operators/cumprod_op.cc @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/cumprod_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" namespace paddle { namespace operators { @@ -87,16 +88,3 @@ REGISTER_OPERATOR(cumprod, ops::CumprodOp, ops::CumprodOpMaker, ops::CumprodGradOpMaker); REGISTER_OPERATOR(cumprod_grad, ops::CumprodGradOp); - -REGISTER_OP_CPU_KERNEL( - cumprod, ops::CumprodOpCPUKernel, ops::CumprodOpCPUKernel, - ops::CumprodOpCPUKernel, ops::CumprodOpCPUKernel, - ops::CumprodOpCPUKernel>, - ops::CumprodOpCPUKernel>); - -REGISTER_OP_CPU_KERNEL( - cumprod_grad, ops::CumprodGradOpCPUKernel, - ops::CumprodGradOpCPUKernel, ops::CumprodGradOpCPUKernel, - ops::CumprodGradOpCPUKernel, - ops::CumprodGradOpCPUKernel>, - ops::CumprodGradOpCPUKernel>); diff --git a/paddle/fluid/operators/cumprod_op.cu b/paddle/fluid/operators/cumprod_op.cu deleted file mode 100644 index f792d683291..00000000000 --- a/paddle/fluid/operators/cumprod_op.cu +++ /dev/null @@ -1,369 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "paddle/fluid/operators/cumprod_op.h" -#include "paddle/fluid/operators/math/inclusive_scan.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -template -struct MultiplyFunctor { - HOSTDEVICE T operator()(T a, T b) const { return a * b; } -}; - -template -class CumprodOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - auto *y = ctx.Output("Out"); - auto dim = ctx.Attr("dim"); - size_t outer_dim, mid_dim, inner_dim; - GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim); - - const auto *x_data = x->data(); - auto *y_data = y->mutable_data(ctx.GetPlace()); - const auto &dev_ctx = - ctx.template device_context(); - math::InclusiveScan>( - x_data, y_data, outer_dim, mid_dim, inner_dim, static_cast(1), - MultiplyFunctor(), /*reverse=*/false, dev_ctx); - } -}; - -template -struct IsZeroFunctor { - HOSTDEVICE bool operator()(T x) const { return x == static_cast(0); } -}; - -template -struct CumprodGradFunctorExceptFirstZero { - HOSTDEVICE CumprodGradFunctorExceptFirstZero( - const T *x, const T *y, const T *dy_mul_y_reversed_cumsum, - const uint8_t *zero_mask, size_t mid_dim, size_t inner_dim, T *dx, - int64_t *first_zero_idx, T *x_filled_one) - : x_(x), - y_(y), - dy_mul_y_reversed_cumsum_(dy_mul_y_reversed_cumsum), - zero_mask_(zero_mask), - mid_dim_(mid_dim), - inner_dim_(inner_dim), - dx_(dx), - first_zero_idx_(first_zero_idx), - x_filled_one_(x_filled_one) {} - - HOSTDEVICE void operator()(size_t idx) const { - auto inner_idx = idx % inner_dim_; - auto outer_idx = idx / (mid_dim_ * inner_dim_); - auto mid_idx = (idx - inner_idx) / inner_dim_ % mid_dim_; - auto mask = zero_mask_[idx]; - bool should_fill_one = true; - - if (mask == 0) { - dx_[idx] = 
dy_mul_y_reversed_cumsum_[idx] / x_[idx]; - if (mid_idx == mid_dim_ - 1) { - // record first zero position as -1, i.e., no zero - first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = -1; - } - } else if (mid_idx > 0) { // mask > 0 - if (zero_mask_[idx - inner_dim_] > 0) { // not first zero - dx_[idx] = 0; - should_fill_one = false; - } else { - // idx is the first zero position, it should be recorded - dx_[idx] = y_[idx - inner_dim_]; - first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = mid_idx; - } - } else { // the first zero position is index 0 - dx_[idx] = 1; - first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = 0; - } - - x_filled_one_[idx] = should_fill_one ? 1 : x_[idx]; - } - - private: - const T *x_; - const T *y_; - const T *dy_mul_y_reversed_cumsum_; - const uint8_t *zero_mask_; - size_t mid_dim_; - size_t inner_dim_; - T *dx_; - int64_t *first_zero_idx_; - T *x_filled_one_; -}; - -template -struct FillFirstZeroPositionGradFunctor { - HOSTDEVICE FillFirstZeroPositionGradFunctor(const int64_t *first_zero_idx, - const T *grad_value, - size_t mid_dim, size_t inner_dim, - T *dx) - : first_zero_idx_(first_zero_idx), - grad_value_(grad_value), - mid_dim_(mid_dim), - inner_dim_(inner_dim), - dx_(dx) {} - - HOSTDEVICE void operator()(size_t idx) const { - auto outer_idx = idx / inner_dim_; - auto inner_idx = idx % inner_dim_; - auto mid_idx = first_zero_idx_[idx]; - if (mid_idx >= 0) { - auto full_idx = - outer_idx * mid_dim_ * inner_dim_ + mid_idx * inner_dim_ + inner_idx; - dx_[full_idx] *= grad_value_[full_idx]; - } - } - - private: - const int64_t *first_zero_idx_; - const T *grad_value_; - size_t mid_dim_; - size_t inner_dim_; - T *dx_; -}; - -/* -Reference to -https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/ReduceOps.cpp -input: x, y, dL/dy -output: dL/dx -dL/dx[i] = sum{0<=j k, dL/dx[i] = 0; -i < k, dL/dx[i] = 1/x[i]*sum{i<=j k - dx[i] = 0; - x_filled_one[i] = x[i]; - } - } - } -} -T = reversed_cumsum(dy[j]*cumprod(x_filled_one[j])); -if (zero_index != -1) { - dx[zero_index] *= T[zero_index]; -} -*/ - -template -class CumprodGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *y = ctx.Input("Out"); - const auto *dy = - ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - auto dim = ctx.Attr("dim"); - - size_t outer_dim, mid_dim, inner_dim; - GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim); - if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return; - - size_t numel = outer_dim * mid_dim * inner_dim; - - const auto *x_data = x->data(); - const auto *y_data = y->data(); - const auto *dy_data = dy->data(); - - auto place = ctx.GetPlace(); - const auto &dev_ctx = - ctx.template device_context(); - auto *dx_data = dx->mutable_data(place); - - // deal with complex - const T *x_data_deal; - const T *y_data_deal; - memory::AllocationPtr x_conj; - memory::AllocationPtr y_conj; - if (framework::IsComplex::value) { - x_conj = memory::Alloc(place, numel * sizeof(T)); - auto *x_data_conj = reinterpret_cast(x_conj->ptr()); - y_conj = memory::Alloc(place, numel * sizeof(T)); - auto *y_data_conj = reinterpret_cast(y_conj->ptr()); - - platform::ForRange for_range_x(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); - for_range_x(functor_x); - - platform::ForRange for_range_y(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_y(y_data, numel, 
y_data_conj); - for_range_y(functor_y); - x_data_deal = x_data_conj; - y_data_deal = y_data_conj; - } else { - x_data_deal = x_data; - y_data_deal = y_data; - } - -// Step 1: find cummax-ed zero mask of x -#ifdef PADDLE_WITH_CUDA - const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream()); -#else - const auto &exec_policy = thrust::hip::par.on(dev_ctx.stream()); -#endif - auto zero_mask_without_cummax = - memory::Alloc(place, numel * sizeof(uint8_t)); - auto *zero_mask_without_cummax_data = - reinterpret_cast(zero_mask_without_cummax->ptr()); - thrust::transform( - exec_policy, thrust::device_pointer_cast(x_data_deal), - thrust::device_pointer_cast(x_data_deal) + numel, - thrust::device_pointer_cast(zero_mask_without_cummax_data), - IsZeroFunctor()); - - auto zero_mask = memory::Alloc(place, numel * sizeof(uint8_t)); - auto *zero_mask_data = reinterpret_cast(zero_mask->ptr()); - math::InclusiveScan( - zero_mask_without_cummax_data, zero_mask_data, outer_dim, mid_dim, - inner_dim, static_cast(0), cub::Max(), /*reverse=*/false, - dev_ctx); - zero_mask_without_cummax = nullptr; - - // Step 2: calculate reversed cumsum(dy * y) - auto dy_mul_y = memory::Alloc(place, numel * sizeof(T)); - auto *dy_mul_y_data = reinterpret_cast(dy_mul_y->ptr()); - thrust::transform(exec_policy, thrust::device_pointer_cast(dy_data), - thrust::device_pointer_cast(dy_data) + numel, - thrust::device_pointer_cast(y_data_deal), - thrust::device_pointer_cast(dy_mul_y_data), - MultiplyFunctor()); - - auto dy_mul_y_reversed_cumsum = memory::Alloc(place, numel * sizeof(T)); - auto *dy_mul_y_reversed_cumsum_data = - reinterpret_cast(dy_mul_y_reversed_cumsum->ptr()); - math::InclusiveScan( - dy_mul_y_data, dy_mul_y_reversed_cumsum_data, outer_dim, mid_dim, - inner_dim, static_cast(0), cub::Sum(), /*reverse=*/true, dev_ctx); - - // Step 3: calculate the gradient value except the first zero position. - // The gradient value of the first zero position is filled with out[idx-1], - // while the gradient value of the other positions are calculated out - // completely. This functor also: - // (1) find the first zero index, i.e., first_zero_idx_data. 
- // (2) fill x_filled_one, which satifies - // x_filled_one[i] = x[i], i > pos - // x_filled_one[i] = 1, i <= pos - auto first_zero_idx = - memory::Alloc(place, outer_dim * inner_dim * sizeof(int64_t)); - auto *first_zero_idx_data = - reinterpret_cast(first_zero_idx->ptr()); - auto *x_filled_one_data = dy_mul_y_data; // reuse former allocated memory - platform::ForRange for_range(dev_ctx, numel); - CumprodGradFunctorExceptFirstZero functor_except_first_zero( - x_data_deal, y_data_deal, dy_mul_y_reversed_cumsum_data, zero_mask_data, - mid_dim, inner_dim, dx_data, first_zero_idx_data, x_filled_one_data); - for_range(functor_except_first_zero); - - // Step 4: calculate cumprod of x_filled_one - auto *x_filled_one_cumprod_data = - dy_mul_y_reversed_cumsum_data; // reuse former allocated memory - math::InclusiveScan>( - x_filled_one_data, x_filled_one_cumprod_data, outer_dim, mid_dim, - inner_dim, static_cast(1), MultiplyFunctor(), /*reverse=*/false, - dev_ctx); - - // Step 5: calculate reversed cumsum(dy * x_filled_one_cumprod) - auto *dy_mul_x_filled_one_cumprod = - dy_mul_y_data; // reuse former allocated memory - thrust::transform(exec_policy, thrust::device_pointer_cast(dy_data), - thrust::device_pointer_cast(dy_data) + numel, - thrust::device_pointer_cast(x_filled_one_cumprod_data), - thrust::device_pointer_cast(dy_mul_x_filled_one_cumprod), - MultiplyFunctor()); - auto *dy_mul_x_filled_one_cumprod_reversed_cumsum = - dy_mul_y_reversed_cumsum_data; // reuse former allocated memory - math::InclusiveScan( - dy_mul_x_filled_one_cumprod, - dy_mul_x_filled_one_cumprod_reversed_cumsum, outer_dim, mid_dim, - inner_dim, static_cast(0), cub::Sum(), - /*reverse=*/true, dev_ctx); - - // Step 6: fill zero pos gradient value - platform::ForRange - for_range_fill_zero_pos_grad(dev_ctx, outer_dim * inner_dim); - FillFirstZeroPositionGradFunctor fill_first_zero_pos_grad_functor( - first_zero_idx_data, dy_mul_x_filled_one_cumprod_reversed_cumsum, - mid_dim, inner_dim, dx_data); - for_range_fill_zero_pos_grad(fill_first_zero_pos_grad_functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - cumprod, ops::CumprodOpCUDAKernel, ops::CumprodOpCUDAKernel, - ops::CumprodOpCUDAKernel, ops::CumprodOpCUDAKernel, - ops::CumprodOpCUDAKernel>, - ops::CumprodOpCUDAKernel>); - -REGISTER_OP_CUDA_KERNEL( - cumprod_grad, ops::CumprodGradOpCUDAKernel, - ops::CumprodGradOpCUDAKernel, ops::CumprodGradOpCUDAKernel, - ops::CumprodGradOpCUDAKernel, - ops::CumprodGradOpCUDAKernel>, - ops::CumprodGradOpCUDAKernel>); diff --git a/paddle/fluid/operators/cumprod_op.h b/paddle/fluid/operators/cumprod_op.h deleted file mode 100644 index 74ed2008ae9..00000000000 --- a/paddle/fluid/operators/cumprod_op.h +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -static void GetCumprodDimInfo(const framework::DDim& dim, int cumprod_dim, - size_t* outer_dim, size_t* mid_dim, - size_t* inner_dim) { - PADDLE_ENFORCE_GE( - cumprod_dim, -dim.size(), - platform::errors::InvalidArgument( - "The input dim of CumprodOp should be larger than the opposite " - "rank of input x which is %d.But received dim=%d", - -dim.size(), cumprod_dim)); - PADDLE_ENFORCE_LT(cumprod_dim, dim.size(), - platform::errors::InvalidArgument( - "The input dim of CumprodOp should be smaller than the " - "rank of input x which is %d.But received dim=%d", - dim.size(), cumprod_dim)); - if (cumprod_dim < 0) cumprod_dim += dim.size(); - - *outer_dim = 1; - for (int i = 0; i < cumprod_dim; ++i) { - *outer_dim *= dim[i]; - } - *mid_dim = dim[cumprod_dim]; - *inner_dim = 1; - for (int i = cumprod_dim + 1; i < dim.size(); ++i) { - *inner_dim *= dim[i]; - } -} - -template -class CumprodOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - int dim = context.Attr("dim"); - - auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - framework::DDim shape = x->dims(); - - size_t outer_dim = 1; - size_t mid_dim = 1; - size_t inner_dim = 1; - GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim); - - for (size_t i = 0; i < outer_dim; i++) { - for (size_t j = 0; j < mid_dim; j++) { - for (size_t k = 0; k < inner_dim; k++) { - size_t pos = i * mid_dim * inner_dim + j * inner_dim + k; - if (j == 0) { - out_data[pos] = x_data[pos]; - } else { - out_data[pos] = out_data[pos - inner_dim] * x_data[pos]; - } - } - } - } - } -}; - -template -class CumprodGradOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const { - const Tensor* d_out = context.Input(framework::GradVarName("Out")); - const Tensor* x = context.Input("X"); - const Tensor* out = context.Input("Out"); - - int dim = context.Attr("dim"); - framework::DDim shape = x->dims(); - Tensor* d_x = context.Output(framework::GradVarName("X")); - - auto* d_out_data = d_out->data(); - auto* x_data = x->data(); - auto* out_data = out->data(); - auto* d_x_data = d_x->mutable_data(context.GetPlace()); - - auto place = context.GetPlace(); - const auto& dev_ctx = - context.template device_context(); - - size_t outer_dim = 1; - size_t mid_dim = 1; - size_t inner_dim = 1; - GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim); - size_t numel = outer_dim * mid_dim * inner_dim; - - // deal with complex - const T* x_data_deal; - const T* out_data_deal; - memory::AllocationPtr x_conj; - memory::AllocationPtr out_conj; - if (framework::IsComplex::value) { - x_conj = memory::Alloc(place, numel * sizeof(T)); - auto* x_data_conj = reinterpret_cast(x_conj->ptr()); - out_conj = memory::Alloc(place, numel * sizeof(T)); - auto* out_data_conj = reinterpret_cast(out_conj->ptr()); - - platform::ForRange for_range_x(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); - for_range_x(functor_x); - - platform::ForRange for_range_out(dev_ctx, - numel); - phi::funcs::ConjFunctor 
functor_out(out_data, numel, out_data_conj); - for_range_out(functor_out); - - x_data_deal = x_data_conj; - out_data_deal = out_data_conj; - } else { - x_data_deal = x_data; - out_data_deal = out_data; - } - - for (size_t i = 0; i < outer_dim; i++) { - for (size_t k = 0; k < inner_dim; k++) { - for (size_t j = 0; j < mid_dim; j++) { - size_t index = i * mid_dim * inner_dim + j * inner_dim + k; - d_x_data[index] = 0; - for (size_t n = 0; n < mid_dim; n++) { - size_t pos = i * mid_dim * inner_dim + n * inner_dim + k; - T elem; - if (j == 0) { - elem = d_out_data[pos]; - } else { - elem = d_out_data[pos] * out_data_deal[index - inner_dim]; - } - if (pos > index) { - for (size_t m = index + inner_dim; m <= pos; m += inner_dim) { - elem *= x_data_deal[m]; - } - } else if (pos < index) { - elem = static_cast(0); - } - d_x_data[index] += elem; - } - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/isclose_op.cc b/paddle/fluid/operators/isclose_op.cc index 0ae7a9fa02f..8668de4d3a6 100644 --- a/paddle/fluid/operators/isclose_op.cc +++ b/paddle/fluid/operators/isclose_op.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/isclose_op.h" #include #include #include "paddle/fluid/framework/op_registry.h" @@ -23,45 +22,6 @@ namespace paddle { namespace operators { -template -struct GetTensorValue { - T operator()(const platform::CPUDeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - return *(tensor.data()); - } -}; - -template -struct IscloseFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - auto* in_a = in.data(); - auto* in_b = other.data(); - auto* out_data = output->mutable_data(ctx.GetPlace()); - auto num = in.numel(); - // *out_data = true; - for (int i = 0; i < num; i++) { - out_data[i] = true; - } - for (int i = 0; i < num; i++) { - const T a = in_a[i], b = in_b[i]; - bool val; - if (std::isnan(a) || std::isnan(b)) { - val = equal_nan && std::isnan(a) == std::isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - // *out_data &= val; - out_data[i] = val; - } - } -}; - class IscloseOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -154,12 +114,9 @@ class IscloseOpVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR( isclose, ops::IscloseOp, ops::IscloseOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, ops::IscloseOpVarTypeInference); -REGISTER_OP_CPU_KERNEL(isclose, ops::IscloseKernel, - ops::IscloseKernel); diff --git a/paddle/fluid/operators/isclose_op.cu b/paddle/fluid/operators/isclose_op.cu deleted file mode 100644 index 09710ba0c69..00000000000 --- a/paddle/fluid/operators/isclose_op.cu +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/isclose_op.h" - -namespace paddle { -namespace operators { - -template -struct GetTensorValue { - T operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - const T* data = tensor.data(); - T value; - const auto gpu_place = dev_ctx.GetPlace(); - memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T), - dev_ctx.stream()); - return value; - } -}; - -template -__global__ void IscloseCUDAKernel(const T* in_data, const T* other_data, - const double rtol, const double atol, - bool equal_nan, int num, bool* out_data) { - unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; - bool val; - for (int i = idx; i < num; i += blockDim.x * gridDim.x) { - const T a = in_data[i], b = other_data[i]; - if (isnan(a) || isnan(b)) { - val = equal_nan && isnan(a) == isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - out_data[i] = val; - // if (!val) *out_data = false; - } -} - -template -struct IscloseFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - int num = in.numel(); - const T* in_data = in.data(); - const T* other_data = other.data(); - bool* out_data = output->mutable_data(dev_ctx.GetPlace()); - int block = 1024; - int grid = (block - 1 + num) / block; - grid = (grid > block) ? block : grid; -#ifdef PADDLE_WITH_HIP - hipMemset(out_data, true, num * sizeof(bool)); -#else - cudaMemset(out_data, true, num * sizeof(bool)); -#endif - IscloseCUDAKernel<<>>( - in_data, other_data, rtol, atol, equal_nan, num, out_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(isclose, ops::IscloseKernel, - ops::IscloseKernel); diff --git a/paddle/fluid/operators/isclose_op.h b/paddle/fluid/operators/isclose_op.h deleted file mode 100644 index cde5d2afbf0..00000000000 --- a/paddle/fluid/operators/isclose_op.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
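One detail of the deleted CUDA kernel above worth spelling out: grid = (block - 1 + num) / block is the usual ceiling division, and clamping grid to block bounds the launch while the grid-stride loop inside IscloseCUDAKernel covers any leftover elements. Host-side arithmetic only, no kernel launch:

#include <cstdio>

int main() {
  int num = 1 << 22;                     // number of elements to cover
  int block = 1024;                      // threads per block
  int grid = (block - 1 + num) / block;  // ceil(num / block) = 4096
  grid = (grid > block) ? block : grid;  // capped at 1024; the grid-stride
                                         // loop picks up the remainder
  std::printf("grid=%d block=%d\n", grid, block);
}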
- -#pragma once - -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -template -struct GetTensorValue { - T operator()(const platform::DeviceContext& ctx, - const framework::Tensor& tensor) const; -}; - -template -struct IscloseFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in, - const framework::Tensor& other, const float rtol, - const float atol, bool equal_nan, framework::Tensor* output); -}; - -template -class IscloseKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // get attrs - bool equal_nan = ctx.Attr("equal_nan"); - // get input/output - const auto* input = ctx.Input("Input"); - const auto* other = ctx.Input("Other"); - auto* out = ctx.Output("Out"); - - double rtol_v = std::stod(ctx.Attr("rtol")); - double atol_v = std::stod(ctx.Attr("atol")); - - auto& dev_ctx = ctx.template device_context(); - GetTensorValue get_tensor_value; - if (ctx.HasInput("Rtol")) { - const auto* rtol = ctx.Input("Rtol"); - PADDLE_ENFORCE_EQ( - rtol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Rtol) size must be 1, but get %d.", rtol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(rtol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Rtol) type must be double, but get %s.", - framework::DataTypeToString( - framework::TransToProtoVarType(rtol->dtype())))); - rtol_v = get_tensor_value(dev_ctx, *rtol); - } - if (ctx.HasInput("Atol")) { - const auto* atol = ctx.Input("Atol"); - PADDLE_ENFORCE_EQ( - atol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Atol) size must be 1, but get %d", atol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(atol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Atol) type must be double, but get %s", - framework::DataTypeToString( - framework::TransToProtoVarType(atol->dtype())))); - atol_v = get_tensor_value(dev_ctx, *atol); - } - - IscloseFunctor()(dev_ctx, *input, *other, rtol_v, atol_v, - equal_nan, out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc index a78d8ec1014..dcd98054b05 100644 --- a/paddle/fluid/operators/kldiv_loss_op.cc +++ b/paddle/fluid/operators/kldiv_loss_op.cc @@ -9,7 +9,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/kldiv_loss_op.h" #include #include #include "paddle/fluid/framework/op_registry.h" @@ -177,10 +176,3 @@ REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker, ops::KLDivLossOpGradMaker); REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad, ops::KLDivLossGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - kldiv_loss, ops::KLDivLossKernel, - ops::KLDivLossKernel); -REGISTER_OP_CPU_KERNEL( - kldiv_loss_grad, - ops::KLDivLossGradKernel, - ops::KLDivLossGradKernel); diff --git a/paddle/fluid/operators/kldiv_loss_op.cu b/paddle/fluid/operators/kldiv_loss_op.cu deleted file mode 100644 index 5226cb8c08e..00000000000 --- a/paddle/fluid/operators/kldiv_loss_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
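The element-wise test both deleted isclose kernels implement is, in essence, the conventional |a - b| <= atol + rtol * |b| rule (the originals add a small epsilon guard on top), with NaNs comparing equal only when equal_nan is set. A scalar sketch, assuming that conventional rule:

#include <cmath>
#include <cstdio>

bool IsCloseScalar(double a, double b, double rtol, double atol,
                   bool equal_nan) {
  if (std::isnan(a) || std::isnan(b)) {
    return equal_nan && std::isnan(a) && std::isnan(b);
  }
  return a == b || std::fabs(a - b) <= atol + rtol * std::fabs(b);
}

int main() {
  std::printf("%d\n", IsCloseScalar(1e10, 1.00001e10, 1e-05, 1e-08, false));  // 1
  std::printf("%d\n", IsCloseScalar(1e-8, 1e-9, 1e-05, 1e-08, false));        // 1
  std::printf("%d\n", IsCloseScalar(1.0, 2.0, 1e-05, 1e-08, false));          // 0
}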
-Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/kldiv_loss_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - kldiv_loss, - ops::KLDivLossKernel, - ops::KLDivLossKernel); -REGISTER_OP_CUDA_KERNEL( - kldiv_loss_grad, - ops::KLDivLossGradKernel, - ops::KLDivLossGradKernel); diff --git a/paddle/fluid/operators/kldiv_loss_op.h b/paddle/fluid/operators/kldiv_loss_op.h deleted file mode 100644 index 5a6ef06f5eb..00000000000 --- a/paddle/fluid/operators/kldiv_loss_op.h +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using Array1 = Eigen::DSizes; - -template -struct KLDivLossForward { - HOSTDEVICE KLDivLossForward() {} - - HOSTDEVICE T operator()(const T& target, const T& input) const { - if (target <= 0) { - return 0; - } else { - return target * (std::log(target) - input); - } - } -}; - -template -struct KLDivLossBackward { - HOSTDEVICE KLDivLossBackward() {} - - HOSTDEVICE T operator()(const T& target, const T& grad) const { - if (target <= 0) { - return 0; - } else { - return static_cast(-1.) 
* grad; - } - } -}; - -template -class KLDivLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = *ctx.template device_context().eigen_device(); - auto* input = ctx.Input("X"); - auto* target = ctx.Input("Target"); - auto* loss = ctx.Output("Loss"); - auto reduction = ctx.Attr("reduction"); - - const int n = input->dims()[0]; - - loss->mutable_data(ctx.GetPlace()); - auto input_t = framework::EigenVector::Flatten(*input); - auto target_t = framework::EigenVector::Flatten(*target); - auto loss_t = framework::EigenVector::Flatten(*loss); - auto output = target_t.binaryExpr(input_t, KLDivLossForward()); - if ("none" == reduction) { - loss_t.device(place) = output; - } else if ("batchmean" == reduction) { - auto output_sum = output.sum(); - if (n > 0) { - loss_t.device(place) = output_sum / output_sum.constant(n); - } else { - loss_t.device(place) = output_sum; - } - } else if ("mean" == reduction) { - loss_t.device(place) = output.mean(); - } else if ("sum" == reduction) { - loss_t.device(place) = output.sum(); - } - } -}; - -template -class KLDivLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = *ctx.template device_context().eigen_device(); - auto* target = ctx.Input("Target"); - auto reduction = ctx.Attr("reduction"); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); - - const int n = input_grad->dims()[0]; - const int numel = input_grad->numel(); - const int expand = numel / loss_grad->numel(); - - input_grad->mutable_data(ctx.GetPlace()); - - auto target_t = framework::EigenVector::Flatten(*target); - - auto input_grad_t = framework::EigenVector::Flatten(*input_grad); - auto loss_grad_t = framework::EigenVector::Flatten(*loss_grad); - - auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand)); - auto grad_t = target_t * loss_grad_expand; - input_grad_t.device(place) = - target_t.binaryExpr(grad_t, KLDivLossBackward()); - - if ("mean" == reduction) { - input_grad_t.device(place) = input_grad_t / static_cast(numel); - } else if ("batchmean" == reduction) { - input_grad_t.device(place) = input_grad_t / static_cast(n); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc index 322ae5df4cb..eac181489aa 100644 --- a/paddle/fluid/operators/kldiv_loss_op_npu.cc +++ b/paddle/fluid/operators/kldiv_loss_op_npu.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the Licnse. */ -#include "paddle/fluid/operators/kldiv_loss_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/lgamma_op.cc b/paddle/fluid/operators/lgamma_op.cc index 148fb05afcf..72c6b41efa9 100644 --- a/paddle/fluid/operators/lgamma_op.cc +++ b/paddle/fluid/operators/lgamma_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
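For reference, the removed KLDivLossForward assumes the input already holds log-probabilities, so each element contributes target * (log(target) - input) and non-positive targets contribute nothing; the reduction attribute then picks none, mean, sum, or batchmean. A small standalone sketch of the batchmean path:

#include <cmath>
#include <cstdio>
#include <vector>

double KLDivLoss(const std::vector<double>& log_p,
                 const std::vector<double>& target, bool batchmean, int batch) {
  double sum = 0.0;
  for (size_t i = 0; i < log_p.size(); ++i) {
    // Elements with target <= 0 are skipped, matching KLDivLossForward.
    if (target[i] > 0) sum += target[i] * (std::log(target[i]) - log_p[i]);
  }
  return (batchmean && batch > 0) ? sum / batch : sum;
}

int main() {
  std::vector<double> log_q = {std::log(0.5), std::log(0.5)};  // predicted log-probs
  std::vector<double> p = {0.9, 0.1};                          // target distribution
  std::printf("%f\n", KLDivLoss(log_q, p, /*batchmean=*/true, /*batch=*/1));
}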
-#include "paddle/fluid/operators/lgamma_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -35,16 +38,6 @@ $$out = log\Gamma(x)$$ class LgammaOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Lgamma"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Lgamma"); - - auto in_dims = ctx->GetInputDim("X"); - - ctx->SetOutputDim("Out", in_dims); - ctx->ShareLoD("X", "Out"); - } }; template @@ -83,17 +76,12 @@ class LgammaGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(lgamma, LgammaInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); + REGISTER_OPERATOR(lgamma, ops::LgammaOp, ops::LgammaOpMaker, ops::LgammaGradMaker, - ops::LgammaGradMaker); + ops::LgammaGradMaker, + LgammaInferShapeFunctor); REGISTER_OPERATOR(lgamma_grad, ops::LgammaGradOp); - -REGISTER_OP_CPU_KERNEL( - lgamma, ops::LgammaKernel, - ops::LgammaKernel) - -REGISTER_OP_CPU_KERNEL( - lgamma_grad, - ops::LgammaGradKernel, - ops::LgammaGradKernel); diff --git a/paddle/fluid/operators/lgamma_op.cu b/paddle/fluid/operators/lgamma_op.cu deleted file mode 100644 index b9f273727b0..00000000000 --- a/paddle/fluid/operators/lgamma_op.cu +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/lgamma_op.h" - -namespace paddle { -namespace operators { - -template -struct CudaLgammaFunctor { - __device__ __forceinline__ T operator()(const T x) const { - return Eigen::numext::lgamma(x); - } -}; - -template -class LgammaKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - auto& dev_ctx = context.device_context(); - std::vector ins = {x}; - std::vector outs = {out}; - auto functor = CudaLgammaFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - lgamma, ops::LgammaKernel, - ops::LgammaKernel); - -REGISTER_OP_CUDA_KERNEL( - lgamma_grad, - ops::LgammaGradKernel, - ops::LgammaGradKernel); diff --git a/paddle/fluid/operators/lgamma_op.h b/paddle/fluid/operators/lgamma_op.h deleted file mode 100644 index 674054e7457..00000000000 --- a/paddle/fluid/operators/lgamma_op.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -struct LgammaFunctor { - LgammaFunctor(const T* input, T* output, int64_t numel) - : input_(input), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - output_[idx] = Eigen::numext::lgamma(input_[idx]); - } - - private: - const T* input_; - T* output_; - int64_t numel_; -}; - -template -struct LgammaGradFunctor { - LgammaGradFunctor(const T* dout, const T* x, T* output, int64_t numel) - : dout_(dout), x_(x), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - output_[idx] = dout_[idx] * Eigen::numext::digamma(x_[idx]); - } - - private: - const T* dout_; - const T* x_; - T* output_; - int64_t numel_; -}; - -using Tensor = framework::Tensor; - -template -class LgammaKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - - auto numel = x->numel(); - auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace(), - size_t(x->numel() * sizeof(T))); - - auto& dev_ctx = context.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - LgammaFunctor functor(x_data, out_data, numel); - for_range(functor); - } -}; - -template -class LgammaGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* d_out = - ctx.Input(framework::GradVarName("Out")); - const framework::Tensor* x = ctx.Input("X"); - framework::Tensor* d_x = - ctx.Output(framework::GradVarName("X")); - - auto numel = d_out->numel(); - auto* dout_data = d_out->data(); - auto* x_data = x->data(); - auto* dx_data = d_x->mutable_data( - ctx.GetPlace(), static_cast(numel * sizeof(T))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - LgammaGradFunctor functor(dout_data, x_data, dx_data, numel); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 9994ccc10cb..b77e2345036 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -34,10 +34,10 @@ namespace paddle { namespace operators { namespace math { -template +template static void CubInclusiveScan(InputIterator x_iter, OutputIterator y_iter, - size_t n, BinaryOp op, - const platform::CUDADeviceContext &dev_ctx) { + size_t n, BinaryOp op, const Context &dev_ctx) { memory::AllocationPtr allocation; void *temp_storage = nullptr; size_t temp_storage_bytes = 0; @@ -185,11 +185,10 @@ static __global__ void InclusiveScanInnerDimCUDAKernel(const T *x, T *y, } } -template +template static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim, size_t inner_dim, T init, BinaryOp op, - bool reverse, - const platform::CUDADeviceContext &dev_ctx) { + bool reverse, const Context &dev_ctx) { constexpr size_t kThreadNumX = 16; constexpr size_t kThreadNumY = 32; @@ -209,10 +208,10 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim, } } -template +template void InclusiveScan(const T *x, T *y, size_t outer_dim, size_t mid_dim, size_t inner_dim, T init, BinaryOp op, bool reverse, - 
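The removed lgamma functors wrap two standard special functions: the forward pass is log|Gamma(x)| and the backward pass multiplies the upstream gradient by digamma(x). A standalone sketch using std::lgamma; the digamma below is a crude central difference, an assumption made only to keep the example self-contained (the real kernels call Eigen::numext::digamma):

#include <cmath>
#include <cstdio>

// Finite-difference digamma, for illustration only.
double DigammaApprox(double x) {
  const double h = 1e-6;
  return (std::lgamma(x + h) - std::lgamma(x - h)) / (2 * h);
}

int main() {
  double x = 3.5, dout = 1.0;
  std::printf("lgamma(%.1f) = %f\n", x, std::lgamma(x));  // forward value
  std::printf("grad = %f\n", dout * DigammaApprox(x));    // backward value
}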
const platform::CUDADeviceContext &dev_ctx) { + const Context &dev_ctx) { if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return; if (outer_dim == 1 && inner_dim == 1) { @@ -224,8 +223,7 @@ void InclusiveScan(const T *x, T *y, size_t outer_dim, size_t mid_dim, CubInclusiveScan(x, y, mid_dim, op, dev_ctx); } } else if (inner_dim != 1) { - platform::ForRange for_range( - dev_ctx, outer_dim * inner_dim); + platform::ForRange for_range(dev_ctx, outer_dim * inner_dim); if (reverse) { for_range( InclusiveScanOuterOrMidDimFunctor( diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt index 6d056b54b70..271a58222f0 100644 --- a/paddle/phi/api/lib/utils/CMakeLists.txt +++ b/paddle/phi/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS -tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits) +tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits scalar) diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt index 85a1424ee34..0947870dcd3 100644 --- a/paddle/phi/common/CMakeLists.txt +++ b/paddle/phi/common/CMakeLists.txt @@ -1 +1,2 @@ cc_library(phi_place SRCS place.cc) +cc_library(scalar SRCS scalar.cc) diff --git a/paddle/phi/common/scalar.cc b/paddle/phi/common/scalar.cc new file mode 100644 index 00000000000..5cd55c1e88b --- /dev/null +++ b/paddle/phi/common/scalar.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/common/scalar.h" + +#include "paddle/phi/core/enforce.h" + +namespace paddle { +namespace experimental { + +// NOTE(xiongkun): why we put definition here? +// test_custom_op can't include enforce.h, because enforce.h includes gflags. +// so we decouple the include dependence of enforce.h by link. +void ThrowTensorConvertError(int num) { + PADDLE_ENFORCE_EQ(num, + 1, + phi::errors::InvalidArgument( + "The Scalar only supports Tensor with 1 element, but " + "now Tensor has `%d` elements", + num)); +} + +} // namespace experimental +} // namespace paddle diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h index 72cef89d300..5134f4eb726 100644 --- a/paddle/phi/common/scalar.h +++ b/paddle/phi/common/scalar.h @@ -19,9 +19,12 @@ limitations under the License. 
*/ #include "paddle/phi/api/ext/exception.h" #include "paddle/phi/api/include/tensor.h" + namespace paddle { namespace experimental { +void ThrowTensorConvertError(int); + template class ScalarBase { public: @@ -104,11 +107,7 @@ class ScalarBase { // The Tensor must have one dim ScalarBase(const T& tensor) : dtype_(tensor.dtype()) { // NOLINT is_from_tensor_ = true; - PD_CHECK( - tensor.numel() == 1, - "The Scalar only supports Tensor with 1 element, but now Tensor has `", - tensor.numel(), - "` element."); + ThrowTensorConvertError(tensor.numel()); switch (dtype_) { case DataType::FLOAT32: data_.f32 = tensor.template data()[0]; @@ -156,6 +155,8 @@ class ScalarBase { CopyScalar(other, this); } + // NOTE(xiongkun): some op need to judge the dtype of the Scalar, we expose a + // interface. bool FromTensor() const { return is_from_tensor_; } void SetFromTensor(bool from_tensor) { is_from_tensor_ = from_tensor; } diff --git a/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc b/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc new file mode 100644 index 00000000000..a25f9650fc5 --- /dev/null +++ b/paddle/phi/kernels/cpu/cumprod_grad_kernel.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/cumprod_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/cumprod.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +// NOTE(@xiongkun): use of IsComplex<> +#include "paddle/fluid/framework/data_type.h" + +namespace phi { +template +void CumprodGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& d_out, + int dim, + DenseTensor* d_x) { + DDim shape = x.dims(); + + auto* d_out_data = d_out.data(); + auto* x_data = x.data(); + auto* out_data = out.data(); + auto* d_x_data = dev_ctx.template Alloc(d_x); + + size_t outer_dim = 1; + size_t mid_dim = 1; + size_t inner_dim = 1; + GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim); + size_t numel = outer_dim * mid_dim * inner_dim; + + // deal with complex + const T* x_data_deal; + const T* out_data_deal; + Allocator::AllocationPtr x_conj; + Allocator::AllocationPtr out_conj; + if (paddle::framework::IsComplex::value) { + x_conj = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(T)); + auto* x_data_conj = reinterpret_cast(x_conj->ptr()); + out_conj = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(T)); + auto* out_data_conj = reinterpret_cast(out_conj->ptr()); + + phi::funcs::ForRange for_range_x(dev_ctx, numel); + phi::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); + for_range_x(functor_x); + + phi::funcs::ForRange for_range_out(dev_ctx, numel); + phi::funcs::ConjFunctor functor_out(out_data, numel, out_data_conj); + for_range_out(functor_out); + + x_data_deal = x_data_conj; + out_data_deal = out_data_conj; + } else { + x_data_deal = x_data; + out_data_deal = out_data; + } + + for (size_t i = 0; i < outer_dim; i++) { + for (size_t k = 0; k < inner_dim; k++) { + for (size_t j = 0; j < mid_dim; j++) { + size_t index = i * mid_dim * inner_dim + j * inner_dim + k; + d_x_data[index] = 0; + for (size_t n = 0; n < mid_dim; n++) { + size_t pos = i * mid_dim * inner_dim + n * inner_dim + k; + T elem; + if (j == 0) { + elem = d_out_data[pos]; + } else { + elem = d_out_data[pos] * out_data_deal[index - inner_dim]; + } + if (pos > index) { + for (size_t m = index + inner_dim; m <= pos; m += inner_dim) { + elem *= x_data_deal[m]; + } + } else if (pos < index) { + elem = static_cast(0); + } + d_x_data[index] += elem; + } + } + } + } +} +} // namespace phi +PD_REGISTER_KERNEL(cumprod_grad, + CPU, + ALL_LAYOUT, + phi::CumprodGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/cumprod_kernel.cc b/paddle/phi/kernels/cpu/cumprod_kernel.cc new file mode 100644 index 00000000000..aea338027f5 --- /dev/null +++ b/paddle/phi/kernels/cpu/cumprod_kernel.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cumprod_kernel.h" + +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/cumprod.h" + +namespace phi { +template +void CumprodKernel(const Context& dev_ctx, + const DenseTensor& input, + int dim, + DenseTensor* out) { + const DenseTensor* x = &input; + auto* x_data = x->data(); + auto* out_data = dev_ctx.template Alloc(out); + DDim shape = x->dims(); + + size_t outer_dim = 1; + size_t mid_dim = 1; + size_t inner_dim = 1; + GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim); + + for (size_t i = 0; i < outer_dim; i++) { + for (size_t j = 0; j < mid_dim; j++) { + for (size_t k = 0; k < inner_dim; k++) { + size_t pos = i * mid_dim * inner_dim + j * inner_dim + k; + if (j == 0) { + out_data[pos] = x_data[pos]; + } else { + out_data[pos] = out_data[pos - inner_dim] * x_data[pos]; + } + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(cumprod, + CPU, + ALL_LAYOUT, + phi::CumprodKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/isclose_kernel.cc b/paddle/phi/kernels/cpu/isclose_kernel.cc new file mode 100644 index 00000000000..633c6ba093e --- /dev/null +++ b/paddle/phi/kernels/cpu/isclose_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/isclose_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/isclose_kernel_impl.h" + +PD_REGISTER_KERNEL( + isclose, CPU, ALL_LAYOUT, phi::IscloseKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc new file mode 100644 index 00000000000..f9399d38d71 --- /dev/null +++ b/paddle/phi/kernels/cpu/kldiv_loss_grad_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
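For readers following the flattened indexing, the forward cumprod kernel above is the textbook running product once the shape is viewed as [outer, mid, inner]; a compact reference sketch (illustrative only; the name and std::vector usage are not part of the patch):

#include <cstddef>
#include <vector>

// Running product over the "mid" axis of a tensor viewed as
// [outer, mid, inner], mirroring the loop order used above:
//   out[i, 0, k] = x[i, 0, k];  out[i, j, k] = out[i, j - 1, k] * x[i, j, k]
void CumprodReference(const std::vector<float>& x, std::vector<float>* out,
                      size_t outer, size_t mid, size_t inner) {
  for (size_t i = 0; i < outer; ++i) {
    for (size_t j = 0; j < mid; ++j) {
      for (size_t k = 0; k < inner; ++k) {
        size_t pos = (i * mid + j) * inner + k;
        (*out)[pos] = (j == 0) ? x[pos] : (*out)[pos - inner] * x[pos];
      }
    }
  }
}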
+ +#include "paddle/phi/kernels/kldiv_loss_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + kldiv_loss_grad, CPU, ALL_LAYOUT, phi::KLDivLossGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc b/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc new file mode 100644 index 00000000000..c462b8ec32c --- /dev/null +++ b/paddle/phi/kernels/cpu/kldiv_loss_kernel.cc @@ -0,0 +1,23 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kldiv_loss_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h" + +namespace phi {} // namespace phi + +PD_REGISTER_KERNEL( + kldiv_loss, CPU, ALL_LAYOUT, phi::KLDivLossKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc b/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc new file mode 100644 index 00000000000..116fa3f8d3f --- /dev/null +++ b/paddle/phi/kernels/cpu/lgamma_grad_kernel.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/lgamma_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h" +PD_REGISTER_KERNEL( + lgamma_grad, CPU, ALL_LAYOUT, phi::LgammaGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/lgamma_kernel.cc b/paddle/phi/kernels/cpu/lgamma_kernel.cc new file mode 100644 index 00000000000..d0226894089 --- /dev/null +++ b/paddle/phi/kernels/cpu/lgamma_kernel.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/lgamma_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +namespace phi { +template +struct LgammaFunctor { + LgammaFunctor(const T* input, T* output, int64_t numel) + : input_(input), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = Eigen::numext::lgamma(input_[idx]); + } + + private: + const T* input_; + T* output_; + int64_t numel_; +}; + +template +void LgammaKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + auto numel = x.numel(); + auto* x_data = x.data(); + auto* out_data = dev_ctx.template Alloc(out); + phi::funcs::ForRange for_range(dev_ctx, numel); + LgammaFunctor functor(x_data, out_data, numel); + for_range(functor); +} +} // namespace phi + +PD_REGISTER_KERNEL(lgamma, CPU, ALL_LAYOUT, phi::LgammaKernel, float, double) {} diff --git a/paddle/phi/kernels/cumprod_grad_kernel.h b/paddle/phi/kernels/cumprod_grad_kernel.h new file mode 100644 index 00000000000..b3cb17b28e0 --- /dev/null +++ b/paddle/phi/kernels/cumprod_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CumprodGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + int dim, + DenseTensor* dx); +} // phi diff --git a/paddle/phi/kernels/cumprod_kernel.h b/paddle/phi/kernels/cumprod_kernel.h new file mode 100644 index 00000000000..96d76cb0f43 --- /dev/null +++ b/paddle/phi/kernels/cumprod_kernel.h @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CumprodKernel(const Context& dev_ctx, + const DenseTensor& x, + int dim, + DenseTensor* out); +} // phi diff --git a/paddle/phi/kernels/funcs/cumprod.h b/paddle/phi/kernels/funcs/cumprod.h new file mode 100644 index 00000000000..ac40523c1c4 --- /dev/null +++ b/paddle/phi/kernels/funcs/cumprod.h @@ -0,0 +1,52 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
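A brief numerical aside on the lgamma kernels in this patch: lgamma computes log|Γ(x)|, so its gradient is the digamma function ψ(x) = d/dx log Γ(x), which is exactly what the matching grad kernels multiply into the upstream gradient. A tiny standalone sanity check using only the standard library (illustrative only; not part of the patch):

#include <cmath>
#include <cstdio>

// Finite-difference check that d/dx lgamma(x) ~= digamma(x).
// digamma is approximated numerically because <cmath> does not provide it.
int main() {
  const double x = 3.0, h = 1e-6;
  double approx_digamma = (std::lgamma(x + h) - std::lgamma(x - h)) / (2 * h);
  std::printf("digamma(3) ~= %f (exact: 1.5 - euler_gamma ~= 0.922784)\n",
              approx_digamma);
  return 0;
}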
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +static void GetCumprodDimInfo(const DDim& dim, + int cumprod_dim, + size_t* outer_dim, + size_t* mid_dim, + size_t* inner_dim) { + PADDLE_ENFORCE_GE( + cumprod_dim, + -dim.size(), + phi::errors::InvalidArgument( + "The input dim of CumprodOp should be larger than the opposite " + "rank of input x which is %d.But received dim=%d", + -dim.size(), + cumprod_dim)); + PADDLE_ENFORCE_LT(cumprod_dim, + dim.size(), + phi::errors::InvalidArgument( + "The input dim of CumprodOp should be smaller than the " + "rank of input x which is %d.But received dim=%d", + dim.size(), + cumprod_dim)); + if (cumprod_dim < 0) cumprod_dim += dim.size(); + + *outer_dim = 1; + for (int i = 0; i < cumprod_dim; ++i) { + *outer_dim *= dim[i]; + } + *mid_dim = dim[cumprod_dim]; + *inner_dim = 1; + for (int i = cumprod_dim + 1; i < dim.size(); ++i) { + *inner_dim *= dim[i]; + } +} +} // namespace phi diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index f9e66836a62..ac262fe2d57 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -67,6 +67,11 @@ struct InverseMultiplyFunctor { } }; +template +struct IsZeroFunctor { + HOSTDEVICE bool operator()(T x) const { return x == static_cast(0); } +}; + // Divide #define DIV_ERROR_INFO \ "InvalidArgumentError: Integer division by zero encountered in " \ diff --git a/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu new file mode 100644 index 00000000000..6e871246292 --- /dev/null +++ b/paddle/phi/kernels/gpu/cumprod_grad_kernel.cu @@ -0,0 +1,320 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
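As quick orientation for the dimension helper above, a hedged usage sketch (it assumes phi::make_ddim from paddle/phi/core/ddim.h is available; the wrapper function and the concrete shape are only for illustration):

#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/funcs/cumprod.h"

// For shape [2, 3, 4] and dim == 1 the helper yields outer_dim == 2
// (product of dims before `dim`), mid_dim == 3 (the scanned dim) and
// inner_dim == 4 (product of dims after `dim`); dim == -2 wraps to 1 first.
void CumprodDimInfoExample() {
  size_t outer = 0, mid = 0, inner = 0;
  phi::GetCumprodDimInfo(phi::make_ddim({2, 3, 4}), 1, &outer, &mid, &inner);
}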
+ +#include "paddle/phi/kernels/cumprod_grad_kernel.h" + +#include +#include "paddle/fluid/operators/math/inclusive_scan.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/cumprod.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/for_range.h" +// NOTE(@xiongkun): use of IsComplex<> +#include "paddle/fluid/framework/data_type.h" + +namespace phi { + +template +struct CumprodGradFunctorExceptFirstZero { + HOSTDEVICE CumprodGradFunctorExceptFirstZero( + const T *x, + const T *y, + const T *dy_mul_y_reversed_cumsum, + const uint8_t *zero_mask, + size_t mid_dim, + size_t inner_dim, + T *dx, + int64_t *first_zero_idx, + T *x_filled_one) + : x_(x), + y_(y), + dy_mul_y_reversed_cumsum_(dy_mul_y_reversed_cumsum), + zero_mask_(zero_mask), + mid_dim_(mid_dim), + inner_dim_(inner_dim), + dx_(dx), + first_zero_idx_(first_zero_idx), + x_filled_one_(x_filled_one) {} + + HOSTDEVICE void operator()(size_t idx) const { + auto inner_idx = idx % inner_dim_; + auto outer_idx = idx / (mid_dim_ * inner_dim_); + auto mid_idx = (idx - inner_idx) / inner_dim_ % mid_dim_; + auto mask = zero_mask_[idx]; + bool should_fill_one = true; + + if (mask == 0) { + dx_[idx] = dy_mul_y_reversed_cumsum_[idx] / x_[idx]; + if (mid_idx == mid_dim_ - 1) { + // record first zero position as -1, i.e., no zero + first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = -1; + } + } else if (mid_idx > 0) { // mask > 0 + if (zero_mask_[idx - inner_dim_] > 0) { // not first zero + dx_[idx] = 0; + should_fill_one = false; + } else { + // idx is the first zero position, it should be recorded + dx_[idx] = y_[idx - inner_dim_]; + first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = mid_idx; + } + } else { // the first zero position is index 0 + dx_[idx] = 1; + first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = 0; + } + + x_filled_one_[idx] = should_fill_one ? 
1 : x_[idx]; + } + + private: + const T *x_; + const T *y_; + const T *dy_mul_y_reversed_cumsum_; + const uint8_t *zero_mask_; + size_t mid_dim_; + size_t inner_dim_; + T *dx_; + int64_t *first_zero_idx_; + T *x_filled_one_; +}; + +template +struct FillFirstZeroPositionGradFunctor { + HOSTDEVICE FillFirstZeroPositionGradFunctor(const int64_t *first_zero_idx, + const T *grad_value, + size_t mid_dim, + size_t inner_dim, + T *dx) + : first_zero_idx_(first_zero_idx), + grad_value_(grad_value), + mid_dim_(mid_dim), + inner_dim_(inner_dim), + dx_(dx) {} + + HOSTDEVICE void operator()(size_t idx) const { + auto outer_idx = idx / inner_dim_; + auto inner_idx = idx % inner_dim_; + auto mid_idx = first_zero_idx_[idx]; + if (mid_idx >= 0) { + auto full_idx = + outer_idx * mid_dim_ * inner_dim_ + mid_idx * inner_dim_ + inner_idx; + dx_[full_idx] *= grad_value_[full_idx]; + } + } + + private: + const int64_t *first_zero_idx_; + const T *grad_value_; + size_t mid_dim_; + size_t inner_dim_; + T *dx_; +}; + +template +void CumprodGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &out, + const DenseTensor &dout, + int dim, + DenseTensor *dx) { + const auto *y = &out; + const auto *dy = &dout; + + size_t outer_dim, mid_dim, inner_dim; + GetCumprodDimInfo(x.dims(), dim, &outer_dim, &mid_dim, &inner_dim); + if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return; + + size_t numel = outer_dim * mid_dim * inner_dim; + + const auto *x_data = x.data(); + const auto *y_data = y->data(); + const auto *dy_data = dy->data(); + + auto place = dev_ctx.GetPlace(); + auto *dx_data = dev_ctx.template Alloc(dx); + + // deal with complex + const T *x_data_deal; + const T *y_data_deal; + Allocator::AllocationPtr x_conj; + Allocator::AllocationPtr y_conj; + if (paddle::framework::IsComplex::value) { + x_conj = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(T)); + auto *x_data_conj = reinterpret_cast(x_conj->ptr()); + y_conj = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(T)); + auto *y_data_conj = reinterpret_cast(y_conj->ptr()); + + phi::funcs::ForRange for_range_x(dev_ctx, numel); + phi::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); + for_range_x(functor_x); + + phi::funcs::ForRange for_range_y(dev_ctx, numel); + phi::funcs::ConjFunctor functor_y(y_data, numel, y_data_conj); + for_range_y(functor_y); + x_data_deal = x_data_conj; + y_data_deal = y_data_conj; + } else { + x_data_deal = x_data; + y_data_deal = y_data; + } + +// Step 1: find cummax-ed zero mask of x +#ifdef PADDLE_WITH_CUDA + const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream()); +#else + const auto &exec_policy = thrust::hip::par.on(dev_ctx.stream()); +#endif + auto zero_mask_without_cummax = + const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(uint8_t)); + auto *zero_mask_without_cummax_data = + reinterpret_cast(zero_mask_without_cummax->ptr()); + thrust::transform(exec_policy, + thrust::device_pointer_cast(x_data_deal), + thrust::device_pointer_cast(x_data_deal) + numel, + thrust::device_pointer_cast(zero_mask_without_cummax_data), + funcs::IsZeroFunctor()); + + auto zero_mask = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(uint8_t)); + auto *zero_mask_data = reinterpret_cast(zero_mask->ptr()); + paddle::operators::math::InclusiveScan( + zero_mask_without_cummax_data, + zero_mask_data, + outer_dim, + mid_dim, + inner_dim, + static_cast(0), + cub::Max(), + /*reverse=*/false, + dev_ctx); + zero_mask_without_cummax = nullptr; + + // Step 2: 
calculate reversed cumsum(dy * y) + auto dy_mul_y = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(T)); + auto *dy_mul_y_data = reinterpret_cast(dy_mul_y->ptr()); + thrust::transform(exec_policy, + thrust::device_pointer_cast(dy_data), + thrust::device_pointer_cast(dy_data) + numel, + thrust::device_pointer_cast(y_data_deal), + thrust::device_pointer_cast(dy_mul_y_data), + funcs::MultiplyFunctor()); + + auto dy_mul_y_reversed_cumsum = + const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(T)); + auto *dy_mul_y_reversed_cumsum_data = + reinterpret_cast(dy_mul_y_reversed_cumsum->ptr()); + paddle::operators::math::InclusiveScan( + dy_mul_y_data, + dy_mul_y_reversed_cumsum_data, + outer_dim, + mid_dim, + inner_dim, + static_cast(0), + cub::Sum(), + /*reverse=*/true, + dev_ctx); + + // Step 3: calculate the gradient value except the first zero position. + // The gradient value of the first zero position is filled with out[idx-1], + // while the gradient value of the other positions are calculated out + // completely. This functor also: + // (1) find the first zero index, i.e., first_zero_idx_data. + // (2) fill x_filled_one, which satifies + // x_filled_one[i] = x[i], i > pos + // x_filled_one[i] = 1, i <= pos + auto first_zero_idx = const_cast(dev_ctx.GetAllocator()) + .Allocate(numel * sizeof(int64_t)); + auto *first_zero_idx_data = + reinterpret_cast(first_zero_idx->ptr()); + auto *x_filled_one_data = dy_mul_y_data; // reuse former allocated memory + phi::funcs::ForRange for_range(dev_ctx, numel); + CumprodGradFunctorExceptFirstZero functor_except_first_zero( + x_data_deal, + y_data_deal, + dy_mul_y_reversed_cumsum_data, + zero_mask_data, + mid_dim, + inner_dim, + dx_data, + first_zero_idx_data, + x_filled_one_data); + for_range(functor_except_first_zero); + + // Step 4: calculate cumprod of x_filled_one + auto *x_filled_one_cumprod_data = + dy_mul_y_reversed_cumsum_data; // reuse former allocated memory + paddle::operators::math::InclusiveScan>( + x_filled_one_data, + x_filled_one_cumprod_data, + outer_dim, + mid_dim, + inner_dim, + static_cast(1), + funcs::MultiplyFunctor(), + /*reverse=*/false, + dev_ctx); + + // Step 5: calculate reversed cumsum(dy * x_filled_one_cumprod) + auto *dy_mul_x_filled_one_cumprod = + dy_mul_y_data; // reuse former allocated memory + thrust::transform(exec_policy, + thrust::device_pointer_cast(dy_data), + thrust::device_pointer_cast(dy_data) + numel, + thrust::device_pointer_cast(x_filled_one_cumprod_data), + thrust::device_pointer_cast(dy_mul_x_filled_one_cumprod), + funcs::MultiplyFunctor()); + auto *dy_mul_x_filled_one_cumprod_reversed_cumsum = + dy_mul_y_reversed_cumsum_data; // reuse former allocated memory + paddle::operators::math::InclusiveScan( + dy_mul_x_filled_one_cumprod, + dy_mul_x_filled_one_cumprod_reversed_cumsum, + outer_dim, + mid_dim, + inner_dim, + static_cast(0), + cub::Sum(), + /*reverse=*/true, + dev_ctx); + + // Step 6: fill zero pos gradient value + phi::funcs::ForRange for_range_fill_zero_pos_grad( + dev_ctx, outer_dim * inner_dim); + FillFirstZeroPositionGradFunctor fill_first_zero_pos_grad_functor( + first_zero_idx_data, + dy_mul_x_filled_one_cumprod_reversed_cumsum, + mid_dim, + inner_dim, + dx_data); + for_range_fill_zero_pos_grad(fill_first_zero_pos_grad_functor); +} + +} // namespace phi + +PD_REGISTER_KERNEL(cumprod_grad, + GPU, + ALL_LAYOUT, + phi::CumprodGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/cumprod_kernel.cu 
b/paddle/phi/kernels/gpu/cumprod_kernel.cu new file mode 100644 index 00000000000..1bbf8972a24 --- /dev/null +++ b/paddle/phi/kernels/gpu/cumprod_kernel.cu @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cumprod_kernel.h" + +#include "paddle/fluid/operators/math/inclusive_scan.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/cumprod.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" + +namespace phi { + +template +void CumprodKernel(const Context &dev_ctx, + const DenseTensor &input, + int dim, + DenseTensor *out) { + const auto *x = &input; + auto *y = out; + size_t outer_dim, mid_dim, inner_dim; + GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim); + + const auto *x_data = x->data(); + auto *y_data = dev_ctx.template Alloc(y); + paddle::operators::math::InclusiveScan(x_data, + y_data, + outer_dim, + mid_dim, + inner_dim, + static_cast(1), + funcs::MultiplyFunctor(), + /*reverse=*/false, + dev_ctx); +} + +} // namespace phi + +PD_REGISTER_KERNEL(cumprod, + GPU, + ALL_LAYOUT, + phi::CumprodKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/isclose_kernel.cu b/paddle/phi/kernels/gpu/isclose_kernel.cu new file mode 100644 index 00000000000..34774ec715c --- /dev/null +++ b/paddle/phi/kernels/gpu/isclose_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/isclose_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/isclose_kernel_impl.h" + +PD_REGISTER_KERNEL( + isclose, GPU, ALL_LAYOUT, phi::IscloseKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu new file mode 100644 index 00000000000..8ca53f021f0 --- /dev/null +++ b/paddle/phi/kernels/gpu/kldiv_loss_grad_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kldiv_loss_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h" +PD_REGISTER_KERNEL( + kldiv_loss_grad, GPU, ALL_LAYOUT, phi::KLDivLossGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu b/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu new file mode 100644 index 00000000000..9388ac7071c --- /dev/null +++ b/paddle/phi/kernels/gpu/kldiv_loss_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kldiv_loss_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h" +PD_REGISTER_KERNEL( + kldiv_loss, GPU, ALL_LAYOUT, phi::KLDivLossKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu b/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu new file mode 100644 index 00000000000..3e4cd21a658 --- /dev/null +++ b/paddle/phi/kernels/gpu/lgamma_grad_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/lgamma_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h" +PD_REGISTER_KERNEL( + lgamma_grad, GPU, ALL_LAYOUT, phi::LgammaGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/lgamma_kernel.cu b/paddle/phi/kernels/gpu/lgamma_kernel.cu new file mode 100644 index 00000000000..e94d67f4ce3 --- /dev/null +++ b/paddle/phi/kernels/gpu/lgamma_kernel.cu @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/lgamma_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" + +namespace phi { +template +struct CudaLgammaFunctor { + __device__ __forceinline__ T operator()(const T x) const { + return Eigen::numext::lgamma(x); + } +}; +template +void LgammaKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + // XKTODO( add gpu kernel implementation. ) + dev_ctx.template Alloc(out); + std::vector ins = {&x}; + std::vector outs = {out}; + auto functor = CudaLgammaFunctor(); + phi::funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); +} +} // namespace phi + +PD_REGISTER_KERNEL(lgamma, GPU, ALL_LAYOUT, phi::LgammaKernel, float, double) {} diff --git a/paddle/phi/kernels/impl/isclose_kernel_impl.h b/paddle/phi/kernels/impl/isclose_kernel_impl.h new file mode 100644 index 00000000000..25247ceaff6 --- /dev/null +++ b/paddle/phi/kernels/impl/isclose_kernel_impl.h @@ -0,0 +1,176 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" + +// TODO(xiongkun): remove the header when decouple the memcpy function in phi. 
+#include "paddle/fluid/memory/memcpy.h" + +namespace phi { +using Tensor = DenseTensor; +template +struct GetTensorValue { + T operator()(const DeviceContext& ctx, const DenseTensor& tensor) const; +}; + +template +struct IscloseFunctor { + void operator()(const DeviceContext& ctx, + const DenseTensor& in, + const DenseTensor& other, + const float rtol, + const float atol, + bool equal_nan, + DenseTensor* output); +}; + +template +struct GetTensorValue { + T operator()(const phi::CPUContext& dev_ctx, + const DenseTensor& tensor) const { + return *(tensor.data()); + } +}; + +template +struct GetTensorValue { + T operator()(const phi::GPUContext& dev_ctx, + const DenseTensor& tensor) const { + const T* data = tensor.data(); + T value; + const auto gpu_place = dev_ctx.GetPlace(); + paddle::memory::Copy( + phi::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream()); + return value; + } +}; + +template +struct IscloseFunctor { + void operator()(const phi::CPUContext& ctx, + const DenseTensor& in, + const DenseTensor& other, + const double rtol, + const double atol, + bool equal_nan, + DenseTensor* output) { + auto* in_a = in.data(); + auto* in_b = other.data(); + auto* out_data = ctx.template Alloc(output); + auto num = in.numel(); + // *out_data = true; + for (int i = 0; i < num; i++) { + out_data[i] = true; + } + for (int i = 0; i < num; i++) { + const T a = in_a[i], b = in_b[i]; + bool val; + if (std::isnan(a) || std::isnan(b)) { + val = equal_nan && std::isnan(a) == std::isnan(b); + } else { + T left = (a > b ? a - b : b - a); + T right = atol + (b > 0 ? rtol * b : (-rtol) * b); + T diff = (left > right ? left - right : right - left); + val = a == b || left <= right || diff <= 1e-15; + } + // *out_data &= val; + out_data[i] = val; + } + } +}; + +#if defined(__NVCC__) || defined(__HIPCC__) +template +__global__ void IscloseCUDAKernel(const T* in_data, + const T* other_data, + const double rtol, + const double atol, + bool equal_nan, + int num, + bool* out_data) { + unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; + bool val; + for (int i = idx; i < num; i += blockDim.x * gridDim.x) { + const T a = in_data[i], b = other_data[i]; + if (isnan(a) || isnan(b)) { + val = equal_nan && isnan(a) == isnan(b); + } else { + T left = (a > b ? a - b : b - a); + T right = atol + (b > 0 ? rtol * b : (-rtol) * b); + T diff = (left > right ? left - right : right - left); + val = a == b || left <= right || diff <= 1e-15; + } + out_data[i] = val; + // if (!val) *out_data = false; + } +} + +template +struct IscloseFunctor { + void operator()(const phi::GPUContext& dev_ctx, + const DenseTensor& in, + const DenseTensor& other, + const double rtol, + const double atol, + bool equal_nan, + DenseTensor* output) { + int num = in.numel(); + const T* in_data = in.data(); + const T* other_data = other.data(); + bool* out_data = dev_ctx.template Alloc(output); + int block = 1024; + int grid = (block - 1 + num) / block; + grid = (grid > block) ? 
block : grid; +#ifdef PADDLE_WITH_HIP + hipMemset(out_data, true, num * sizeof(bool)); +#else + cudaMemset(out_data, true, num * sizeof(bool)); +#endif + IscloseCUDAKernel<<>>( + in_data, other_data, rtol, atol, equal_nan, num, out_data); + } +}; +#endif + +template +void IscloseKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const Scalar& rtol, + const Scalar& atol, + bool equal_nan, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + atol.dtype(), + DataType::FLOAT64, + phi::errors::InvalidArgument("Input(Atol) type must be double")); + + PADDLE_ENFORCE_EQ( + rtol.dtype(), + DataType::FLOAT64, + phi::errors::InvalidArgument("Input(Rtol) type must be double")); + + IscloseFunctor()( + dev_ctx, x, y, rtol.to(), atol.to(), equal_nan, out); +} +} // namespace phi diff --git a/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h b/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h new file mode 100644 index 00000000000..1ae90960ef4 --- /dev/null +++ b/paddle/phi/kernels/impl/kldiv_loss_grad_kernel_impl.h @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { +using Array1 = Eigen::DSizes; +template +struct KLDivLossBackward { + HOSTDEVICE KLDivLossBackward() {} + + HOSTDEVICE T operator()(const T& target, const T& grad) const { + if (target <= 0) { + return 0; + } else { + return static_cast(-1.) * grad; + } + } +}; + +template +void KLDivLossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const DenseTensor& d_out, + const std::string& reduction, + DenseTensor* d_x) { + auto& place = *dev_ctx.eigen_device(); + auto* target = &label; + auto* input_grad = d_x; + auto* loss_grad = &d_out; + + const int n = input_grad->dims()[0]; + const int numel = input_grad->numel(); + const int expand = numel / loss_grad->numel(); + + dev_ctx.template Alloc(input_grad); + + auto target_t = phi::EigenVector::Flatten(*target); + + auto input_grad_t = phi::EigenVector::Flatten(*input_grad); + auto loss_grad_t = phi::EigenVector::Flatten(*loss_grad); + + auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand)); + auto grad_t = target_t * loss_grad_expand; + input_grad_t.device(place) = + target_t.binaryExpr(grad_t, KLDivLossBackward()); + + if ("mean" == reduction) { + input_grad_t.device(place) = input_grad_t / static_cast(numel); + } else if ("batchmean" == reduction) { + input_grad_t.device(place) = input_grad_t / static_cast(n); + } +} +} // namespace phi diff --git a/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h b/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h new file mode 100644 index 00000000000..ecd23bbfc1c --- /dev/null +++ b/paddle/phi/kernels/impl/kldiv_loss_kernel_impl.h @@ -0,0 +1,69 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
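For context, the element-wise check implemented by both the CPU and CUDA Isclose functors above boils down to the usual numpy-style predicate, with the kernels additionally tolerating a tiny 1e-15 discrepancy; a scalar sketch (illustrative only, not part of the patch):

#include <cmath>

// Scalar form of the isclose predicate used above (without the extra
// 1e-15 slack the kernels allow).
bool IsCloseScalar(double a, double b, double rtol, double atol,
                   bool equal_nan) {
  if (std::isnan(a) || std::isnan(b)) {
    // Two NaNs compare equal only when equal_nan is set.
    return equal_nan && std::isnan(a) && std::isnan(b);
  }
  return std::fabs(a - b) <= atol + rtol * std::fabs(b);
}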
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { +using Array1 = Eigen::DSizes; +template +struct KLDivLossForward { + HOSTDEVICE KLDivLossForward() {} + + HOSTDEVICE T operator()(const T& target, const T& input) const { + if (target <= 0) { + return 0; + } else { + return target * (std::log(target) - input); + } + } +}; +template +void KLDivLossKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const std::string& reduction, + DenseTensor* out) { + auto& place = *(dev_ctx.eigen_device()); + auto* input = &x; + auto* target = &label; + auto* loss = out; + + const int n = input->dims()[0]; + dev_ctx.template Alloc(loss); + + auto input_t = phi::EigenVector::Flatten(*input); + auto target_t = phi::EigenVector::Flatten(*target); + auto loss_t = phi::EigenVector::Flatten(*loss); + auto output = target_t.binaryExpr(input_t, KLDivLossForward()); + if ("none" == reduction) { + loss_t.device(place) = output; + } else if ("batchmean" == reduction) { + auto output_sum = output.sum(); + if (n > 0) { + loss_t.device(place) = output_sum / output_sum.constant(n); + } else { + loss_t.device(place) = output_sum; + } + } else if ("mean" == reduction) { + loss_t.device(place) = output.mean(); + } else if ("sum" == reduction) { + loss_t.device(place) = output.sum(); + } +} +} // namespace phi diff --git a/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h b/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h new file mode 100644 index 00000000000..a1b33f5a331 --- /dev/null +++ b/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
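For context on the implementation above, the loss is the standard pointwise KL-divergence term with `x` supplied as log-probabilities; a scalar reference (illustrative only, not part of the patch):

#include <cmath>

// Pointwise term applied by KLDivLossForward: `target` is a probability,
// `input_log_prob` a log-probability; non-positive targets contribute zero.
// Its derivative w.r.t. the input is simply -target, which is what the
// matching backward functor multiplies into the upstream gradient.
double KLDivTerm(double target, double input_log_prob) {
  return target > 0 ? target * (std::log(target) - input_log_prob) : 0.0;
}
// Reductions: "none" keeps per-element terms, "sum" adds them, "mean"
// divides the sum by the element count, and "batchmean" by the batch size.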
+ +#pragma once +#include "paddle/phi/kernels/funcs/for_range.h" +namespace phi { +template +struct LgammaGradFunctor { + LgammaGradFunctor(const T* dout, const T* x, T* output, int64_t numel) + : dout_(dout), x_(x), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = dout_[idx] * Eigen::numext::digamma(x_[idx]); + } + + private: + const T* dout_; + const T* x_; + T* output_; + int64_t numel_; +}; +template +void LgammaGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + DenseTensor* d_x) { + auto numel = d_out.numel(); + auto* dout_data = d_out.data(); + auto* x_data = x.data(); + auto* dx_data = + dev_ctx.template Alloc(d_x, static_cast(numel * sizeof(T))); + phi::funcs::ForRange for_range(dev_ctx, numel); + LgammaGradFunctor functor(dout_data, x_data, dx_data, numel); + for_range(functor); +} +} // namespace phi diff --git a/paddle/phi/kernels/isclose_kernel.h b/paddle/phi/kernels/isclose_kernel.h new file mode 100644 index 00000000000..8c468da0550 --- /dev/null +++ b/paddle/phi/kernels/isclose_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IscloseKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const Scalar& rtol, + const Scalar& atol, + bool equal_nan, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/kldiv_loss_grad_kernel.h b/paddle/phi/kernels/kldiv_loss_grad_kernel.h new file mode 100644 index 00000000000..8f53898fa68 --- /dev/null +++ b/paddle/phi/kernels/kldiv_loss_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +// XKTODO (change name) +void KLDivLossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const DenseTensor& d_out, + const std::string& reduction, + DenseTensor* d_x); +} // namespace phi diff --git a/paddle/phi/kernels/kldiv_loss_kernel.h b/paddle/phi/kernels/kldiv_loss_kernel.h new file mode 100644 index 00000000000..103780ab747 --- /dev/null +++ b/paddle/phi/kernels/kldiv_loss_kernel.h @@ -0,0 +1,29 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void KLDivLossKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const std::string& reduction, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/lgamma_grad_kernel.h b/paddle/phi/kernels/lgamma_grad_kernel.h new file mode 100644 index 00000000000..94173cc29c7 --- /dev/null +++ b/paddle/phi/kernels/lgamma_grad_kernel.h @@ -0,0 +1,27 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LgammaGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + DenseTensor* d_x); +} // namespace phi diff --git a/paddle/phi/kernels/lgamma_kernel.h b/paddle/phi/kernels/lgamma_kernel.h new file mode 100644 index 00000000000..f61b3a1ce85 --- /dev/null +++ b/paddle/phi/kernels/lgamma_kernel.h @@ -0,0 +1,26 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LgammaKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/ops/compat/cumprod_sig.cc b/paddle/phi/ops/compat/cumprod_sig.cc new file mode 100644 index 00000000000..59b4eabfa47 --- /dev/null +++ b/paddle/phi/ops/compat/cumprod_sig.cc @@ -0,0 +1,29 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature CumprodGradGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("cumprod_grad", + {"X", "Out", GradVarName("Out")}, + {"dim"}, + {GradVarName("X")}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(cumprod_grad, phi::CumprodGradGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/isclose_sig.cc b/paddle/phi/ops/compat/isclose_sig.cc new file mode 100644 index 00000000000..08632e99095 --- /dev/null +++ b/paddle/phi/ops/compat/isclose_sig.cc @@ -0,0 +1,50 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature IscloseOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("Rtol")) { + if (ctx.HasInput("Atol")) { + return KernelSignature("isclose", + {"Input", "Other"}, + {"Rtol", "Atol", "equal_nan"}, + {"Out"}); + + } else { + return KernelSignature("isclose", + {"Input", "Other"}, + {"Rtol", "atol", "equal_nan"}, + {"Out"}); + } + } else { + if (ctx.HasInput("Atol")) { + return KernelSignature("isclose", + {"Input", "Other"}, + {"rtol", "Atol", "equal_nan"}, + {"Out"}); + } else { + return KernelSignature("isclose", + {"Input", "Other"}, + {"rtol", "atol", "equal_nan"}, + {"Out"}); + } + } +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(isclose, phi::IscloseOpArgumentMapping); diff --git a/paddle/phi/ops/compat/kldiv_loss_sig.cc b/paddle/phi/ops/compat/kldiv_loss_sig.cc new file mode 100644 index 00000000000..22d2f074e9f --- /dev/null +++ b/paddle/phi/ops/compat/kldiv_loss_sig.cc @@ -0,0 +1,30 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
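A short note on the compat signatures above: each mapping translates the legacy operator's variable and attribute names into the argument order of the new phi kernel, preferring tensor inputs (e.g. "Rtol") over plain attributes (e.g. "rtol") when both could supply a value. As a hedged, hypothetical example of the same pattern for an op with no optional inputs and no attributes (not part of this patch; shown only to illustrate the shape of such a mapping):

#include "paddle/phi/core/compat/op_utils.h"

namespace phi {
// Hypothetical mapping, for illustration only: forward the op's "X" input
// and "Out" output straight to a phi kernel named "lgamma".
KernelSignature LgammaOpArgumentMapping(const ArgumentMappingContext& ctx) {
  return KernelSignature("lgamma", {"X"}, {}, {"Out"});
}
}  // namespace phi

PD_REGISTER_ARG_MAPPING_FN(lgamma, phi::LgammaOpArgumentMapping);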
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature KLDivLossGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("kldiv_loss_grad", + {"X", "Target", GradVarName("Loss")}, + {"reduction"}, + {GradVarName("X")}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(kldiv_loss_grad, + phi::KLDivLossGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/lgamma_sig.cc b/paddle/phi/ops/compat/lgamma_sig.cc new file mode 100644 index 00000000000..968ad4923ba --- /dev/null +++ b/paddle/phi/ops/compat/lgamma_sig.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature LgammaGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "lgamma_grad", {GradVarName("Out"), "X"}, {}, {GradVarName("X")}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(lgamma_grad, phi::LgammaGradOpArgumentMapping); -- GitLab From 6b7d484509a511b94e899f5874461ea5e187131e Mon Sep 17 00:00:00 2001 From: Thunderbrook <52529258+Thunderbrook@users.noreply.github.com> Date: Tue, 15 Mar 2022 15:24:47 +0800 Subject: [PATCH 069/176] add shard_id (#40261) * shard_id * format --- paddle/fluid/framework/fleet/ps_gpu_wrapper.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 31a30f72e3a..432e57107e8 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -148,7 +148,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } else { CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; @@ -182,7 +182,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; + VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } timeline.Start(); @@ -300,7 +300,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { int32_t cnt = 0; while (true) { auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - reinterpret_cast(local_ptr[i].data()), this->table_id_, + i, reinterpret_cast(local_ptr[i].data()), this->table_id_, local_keys[i].data(), key_size); bool flag = true; @@ -378,8 +378,8 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { int32_t cnt = 0; while (true) { auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_, - local_dim_keys[i][j].data(), key_size); + i, 
reinterpret_cast(local_dim_ptr[i][j].data()), + this->table_id_, local_dim_keys[i][j].data(), key_size); bool flag = true; tt.wait(); @@ -431,7 +431,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() + VLOG(0) << "pull sparse from CpuPS into GpuPS cost " << timeline.ElapsedSec() << " seconds."; if (multi_node_) { auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance(); @@ -603,7 +603,7 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { t.join(); } timeline.Pause(); - VLOG(1) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() + VLOG(0) << "GpuPs prepare for build hbm cost " << timeline.ElapsedSec() << " seconds."; } @@ -746,7 +746,7 @@ void PSGPUWrapper::BeginPass() { "[BeginPass] after build_task, current task is not null.")); } - VLOG(1) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; + VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::EndPass() { @@ -769,7 +769,7 @@ void PSGPUWrapper::EndPass() { current_task_ = nullptr; gpu_free_channel_->Put(current_task_); timer.Pause(); - VLOG(1) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; + VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, -- GitLab From 813f61d25dd5526899a6e8ef3d7e0e6fd669aa89 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Tue, 15 Mar 2022 15:46:25 +0800 Subject: [PATCH 070/176] change CUDA implementation of randperm OP (#40464) --- paddle/phi/kernels/gpu/randperm_kernel.cu | 160 ++++++++++++++++-- .../fluid/tests/unittests/test_randperm_op.py | 77 +++++++++ 2 files changed, 219 insertions(+), 18 deletions(-) diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index d4d90cac917..92948bf47c9 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -14,37 +14,161 @@ #include "paddle/phi/kernels/randperm_kernel.h" +#ifdef __NVCC__ +#include +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +#include +namespace cub = hipcub; +#endif +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/randint_kernel.h" // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/memory/memcpy.h" +DECLARE_bool(use_curand); + namespace phi { +template +__global__ void SwapRepeatKernel( + int* key, T* data, int n, uint64_t seed, uint64_t offset) { + size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < n) return; + + bool first_repeat = false; + if (data[idx] == data[idx + 1]) { + if (idx == 0) { + first_repeat = true; + } else if (data[idx] != data[idx - 1]) { + first_repeat = true; + } + } + + if (!first_repeat) return; + + int repeat_size = 1; + for (int i = idx; i < n; ++i) { + if (data[i] == data[i + 1]) { + ++repeat_size; + } else { + break; + } + } + +#ifdef __NVCC__ + curandStatePhilox4_32_10_t state; + curand_init(seed, idx, offset, &state); + for (int i = repeat_size - 1; i > 0; i--) { + uint32_t r = curand(&state) % (i + 1); +#elif __HIPCC__ + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, idx, offset, &state); + for (int i = repeat_size - 1; i > 0; i--) { + uint32_t r = hiprand(&state) % (i + 1); +#endif + if (r != i) { + T tmp = data[idx + i]; + data[idx + i] = data[idx + r]; + data[idx + r] = tmp; + } + } +} + template void RandpermRawKernel( const Context& dev_ctx, int n, DataType dtype, int seed, DenseTensor* out) { - DenseTensor tmp; - tmp.Resize(phi::make_ddim({n})); - T* tmp_data = dev_ctx.template HostAlloc(&tmp); - - std::shared_ptr engine; - if (seed) { - engine = std::make_shared(); - engine->seed(seed); + if (FLAGS_use_curand) { + DenseTensor key; + RandintKernel(dev_ctx, + std::numeric_limits::min(), + std::numeric_limits::max(), + ScalarArray({n}), + phi::DataType::INT32, + &key); + DenseTensor key_out = Empty(dev_ctx, ScalarArray({n})); + + DenseTensor range = Empty(dev_ctx, ScalarArray({n})); + T* range_data = range.data(); + funcs::ForRange for_range(dev_ctx, n); + for_range([range_data] __device__(size_t idx) { + range_data[idx] = static_cast(idx); + }); + + out->Resize(phi::make_ddim({n})); + T* out_data = dev_ctx.template Alloc(out); + + // Refer to [Algorithm of randperm] https://osf.io/af2hy/ to + // improve performance of radix sort. + double n_d = static_cast(n); + int begin_bit = 0; + int end_bit = + std::ceil(std::log2(n_d - (6 * n_d * n_d + 1) / (12 * std::log(0.9)))); + + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs(nullptr, + temp_storage_bytes, + key.data(), + key_out.data(), + range.data(), + out_data, + n, + begin_bit, + end_bit < 32 ? end_bit : 32, + dev_ctx.stream()); + + auto d_temp_storage = paddle::memory::Alloc(dev_ctx, temp_storage_bytes); + cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), + temp_storage_bytes, + key.data(), + key_out.data(), + range.data(), + out_data, + n, + begin_bit, + end_bit < 32 ? 
end_bit : 32, + dev_ctx.stream()); + + auto gen_cuda = dev_ctx.GetGenerator(); + auto seed_offset = gen_cuda->IncrementOffset(n); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n); + SwapRepeatKernel<<>>( + key_out.data(), out_data, n, seed, offset); } else { - engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); - } + DenseTensor tmp; + tmp.Resize(phi::make_ddim({n})); + T* tmp_data = dev_ctx.template HostAlloc(&tmp); - for (int i = 0; i < n; ++i) { - tmp_data[i] = static_cast(i); - } - std::shuffle(tmp_data, tmp_data + n, *engine); + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); + } - T* out_data = dev_ctx.template Alloc(out); - auto size = out->numel() * paddle::experimental::SizeOf(out->dtype()); - paddle::memory::Copy( - out->place(), out_data, tmp.place(), tmp_data, size, 0); + for (int i = 0; i < n; ++i) { + tmp_data[i] = static_cast(i); + } + std::shuffle(tmp_data, tmp_data + n, *engine); + + T* out_data = dev_ctx.template Alloc(out); + auto size = out->numel() * paddle::experimental::SizeOf(out->dtype()); + paddle::memory::Copy( + out->place(), out_data, tmp.place(), tmp_data, size, 0); + } } template diff --git a/python/paddle/fluid/tests/unittests/test_randperm_op.py b/python/paddle/fluid/tests/unittests/test_randperm_op.py index 4361a45f156..2380ccb14aa 100644 --- a/python/paddle/fluid/tests/unittests/test_randperm_op.py +++ b/python/paddle/fluid/tests/unittests/test_randperm_op.py @@ -18,6 +18,7 @@ from op_test import OpTest import paddle import paddle.fluid.core as core from paddle.static import program_guard, Program +import os def check_randperm_out(n, data_np): @@ -129,5 +130,81 @@ class TestRandpermImperative(unittest.TestCase): paddle.enable_static() +class TestRandomValue(unittest.TestCase): + def test_fixed_random_number(self): + # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' + if not paddle.is_compiled_with_cuda(): + return + + if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): + return + + print("Test Fixed Random number on GPU------>") + paddle.disable_static() + paddle.set_device('gpu') + paddle.seed(2021) + + x = paddle.randperm(30000, dtype='int32').numpy() + expect = [ + 24562, 8409, 9379, 10328, 20503, 18059, 9681, 21883, 11783, 27413 + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 29477, 27100, 9643, 16637, 8605, 16892, 27767, 2724, 1612, 13096 + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [ + 298, 4104, 16479, 22714, 28684, 7510, 14667, 9950, 15940, 28343 + ] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + + x = paddle.randperm(30000, dtype='int64').numpy() + expect = [ + 6587, 1909, 5525, 23001, 6488, 14981, 14355, 3083, 29561, 8171 + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 23460, 12394, 22501, 5427, 20185, 9100, 5127, 1651, 25806, 4818 + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [5829, 4508, 16193, 24836, 8526, 242, 9984, 9243, 1977, 11839] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + + x = paddle.randperm(30000, dtype='float32').numpy() + expect = [ + 5154., 10537., 14362., 29843., 27185., 28399., 27561., 4144., + 22906., 10705. 
+ ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 1958., 18414., 20090., 21910., 22746., 27346., 22347., 3002., 4564., + 26991. + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [ + 25580., 12606., 553., 16387., 29536., 4241., 20946., 16899., 16339., + 4662. + ] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + + x = paddle.randperm(30000, dtype='float64').numpy() + expect = [ + 19051., 2449., 21940., 11121., 282., 7330., 13747., 24321., 21147., + 9163. + ] + self.assertTrue(np.array_equal(x[0:10], expect)) + expect = [ + 15483., 1315., 5723., 20954., 13251., 25539., 5074., 1823., 14945., + 17624. + ] + self.assertTrue(np.array_equal(x[10000:10010], expect)) + expect = [ + 10516., 2552., 29970., 5941., 986., 8007., 24805., 26753., 12202., + 21404. + ] + self.assertTrue(np.array_equal(x[20000:20010], expect)) + paddle.enable_static() + + if __name__ == "__main__": unittest.main() -- GitLab From dde9cec0b606ee8ab5203c2c3bdffad23fc5dfc9 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Tue, 15 Mar 2022 08:48:18 +0100 Subject: [PATCH 071/176] oneDNN NHWC fixes (#40049) * - Prototype of third solution - fix - compilation fixes - fix - fixe - fix - fix - compilation fix - comment fix - lint update mkldnn conv_elementwise_add_fuse_pass ut - NHWC changes to prelu - alhpa dims - UT fix - fix to UT - lint - Some fixes - added to BWD of prelu NHWC support - reverted removal of resetting cu_layout in clearing of caching * - Small changes * - compilation fix * - fix * - fix * lint * - fixes after internal review * - compilation fix * - lint --- paddle/fluid/framework/executor.cc | 3 +- .../conv_elementwise_add_mkldnn_fuse_pass.cc | 2 +- paddle/fluid/framework/naive_executor.cc | 1 + paddle/fluid/operators/lrn_op.cc | 2 +- .../fluid/operators/mkldnn/prelu_mkldnn_op.cc | 9 +- .../operators/mkldnn/test_mkldnn_op_nhwc.cc | 2 +- paddle/fluid/operators/prelu_op.cc | 34 ++++- paddle/fluid/platform/mkldnn_helper.h | 28 ++++ ...t_mkldnn_conv_elementwise_add_fuse_pass.py | 125 ++++++++++++++++-- 9 files changed, 183 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 48850d4624a..f951b5d0f50 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -174,10 +174,11 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool force_disable_gc, bool keep_kid_scopes) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); + auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); #ifdef PADDLE_WITH_MKLDNN platform::AttachPointerHashToMKLDNNKey(this, place_); + platform::RegisterModelLayout(ctx->ops_, place_); #endif - auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars, keep_kid_scopes); } diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index 2403e60df39..0f3f37320b0 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -118,7 +118,7 @@ ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { .IsType>() .End() .AddAttr("data_format") - .IsStringIn({"NCHW", "AnyLayout"}) + .IsStringIn({"NHWC", "NCHW", "AnyLayout"}) .End(); AddOpCompat(OpCompat("elementwise_add")) diff --git 
a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index ece48158586..f30d1ea1b83 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -41,6 +41,7 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, void NaiveExecutor::Run() { #ifdef PADDLE_WITH_MKLDNN platform::AttachPointerHashToMKLDNNKey(this, place_); + platform::RegisterModelLayout(ops_, place_); #endif platform::ScopedFlushDenormal flush; for (auto &op : ops_) { diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 65297abe3e4..88d70d9bb7d 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -221,7 +221,7 @@ class LRNOp : public framework::OperatorWithKernel { auto ar = paddle::framework::AttrReader(attrs); const std::string data_format = ar.Get("data_format"); auto dl = framework::StringToDataLayout(data_format); - // Some models may have intentionally set "AnyLayout" for pool + // Some models may have intentionally set "AnyLayout" for lrn // op. Treat this as NCHW (default data_format value) if (dl != framework::DataLayout::kAnyLayout) { return framework::OpKernelType(expected_kernel_type.data_type_, diff --git a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc index bdb4fe1198a..86ecb01c89a 100644 --- a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc @@ -50,13 +50,8 @@ class PReluMKLDNNHandler if (weights->dims().size() != x->dims().size()) { auto new_weights_dims = std::vector(x->dims().size(), 1); if (mode == "channel") { - if (data_format == "NHWC") { - new_weights_dims[x->dims().size() - 1] = - *std::max_element(weights_dims.begin(), weights_dims.end()); - } else { - new_weights_dims[1] = - *std::max_element(weights_dims.begin(), weights_dims.end()); - } + new_weights_dims[1] = + *std::max_element(weights_dims.begin(), weights_dims.end()); } weights_dims = std::move(new_weights_dims); } diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 717af61b858..0e988557df6 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -98,7 +98,7 @@ TEST(test_pool2d_transpose_nhwc, cpu_place) { TEST(test_pool2d_relu_relu_nhwc, cpu_place) { framework::DDim dims({1, 4, 8, 512}); // NHWC shape - framework::DDim expected_dims({1, 512, 3, 7}); // NHWC expected shape + framework::DDim expected_dims({1, 512, 3, 7}); // NCHW expected shape platform::CPUPlace p; framework::Scope scope; diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 9bd6ae8bab8..4d2a2e23b3f 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -17,6 +17,26 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +framework::OpKernelType innerGetKernelTypeForVar( + const Tensor &tensor, const framework::OpKernelType &expected_kernel_type) { +#ifdef PADDLE_WITH_MKLDNN + auto isOneDNNKernelChosen = + (expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN); + auto isNotOneDNNTensor = (tensor.layout() != framework::DataLayout::kMKLDNN); + auto isModelNHWC = + (paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout() == framework::DataLayout::kNHWC); + // All inputs (including alpha) need shape rotating + if (isOneDNNKernelChosen && isNotOneDNNTensor && isModelNHWC) { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), + framework::DataLayout::kNHWC); + } +#endif + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); +} + class PReluOp : public framework::OperatorWithKernel { public: PReluOp(const std::string &type, const framework::VariableNameMap &inputs, @@ -53,7 +73,7 @@ class PReluOp : public framework::OperatorWithKernel { "For mode 'channel', data_format must be one of " "NCHW and NHWC. But recevied data_format: %s", data_format_str)); - if (data_format_str == "NCHW") { + if (data_format_str == "NCHW" || ctx->IsRunMKLDNNKernel()) { PADDLE_ENFORCE_EQ( product(ctx->GetInputDim("Alpha")) == x_dim[1], true, platform::errors::InvalidArgument( @@ -128,6 +148,12 @@ class PReluOp : public framework::OperatorWithKernel { #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + return innerGetKernelTypeForVar(tensor, expected_kernel_type); + } }; class PReluOpMaker : public framework::OpProtoAndCheckerMaker { @@ -212,6 +238,12 @@ class PReluGradOp : public framework::OperatorWithKernel { #endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + return innerGetKernelTypeForVar(tensor, expected_kernel_type); + } }; template diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index ce2dba4db02..4001fd744e6 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -559,6 +559,34 @@ inline void GetGroupConvWeightsTz(std::vector& weights_tz, // NOLINT } } +inline void RegisterModelLayout( + std::vector>& ops, + const platform::Place& place) { + if (platform::is_cpu_place(place)) { + auto check_attrib = [](std::unique_ptr& op, + const std::string& attrib_name) -> bool { + if (op->HasAttr(attrib_name)) { + auto data_format = op->Attr(attrib_name); + platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout( + data_format.compare("NHWC") == 0 ? 
framework::DataLayout::kNHWC + : framework::DataLayout::kNCHW); + return true; + } else { + return false; + } + }; + + for (auto& op : ops) { + if (check_attrib(op, std::string("data_format"))) { + return; + } + if (check_attrib(op, std::string("data_layout"))) { + return; + } + } + } +} + inline bool HasOpINT8DataType(const paddle::framework::OpDesc* op) { return (op->GetAttrIfExists("mkldnn_data_type") == "int8" || op->GetAttrIfExists("use_quantizer")); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py index 66c547de2c2..2e84607e2f5 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_elementwise_add_fuse_pass.py @@ -25,17 +25,120 @@ from hypothesis import given, settings, seed, example, assume import hypothesis.strategies as st -class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): +# the two inputs of elementwise_add are tensor +class TestConvElementwiseAddMkldnnFusePass1(PassAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] - # If the problem has been fixed, the judgment - # needs to be deleted!!! - if attrs[1]['data_format'] == "NHWC": + if attrs[1]['data_format'] == "NHWC" and attrs[3]['axis'] == 0: + return False + if attrs[1]['data_format'] == "NCHW" and attrs[3]['axis'] == -1: return False + return True + + def sample_program_config(self, draw): + data_format = draw(st.sampled_from(["NCHW", "NHWC"])) + dilations = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]])) + padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) + groups = draw(st.sampled_from([1, 2, 4])) + paddings = draw(st.sampled_from([[0, 3], [1, 1], [1, 2, 3, 4]])) + strides = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]])) + axis = draw(st.sampled_from([-1, 0])) + batch_size = draw(st.integers(min_value=1, max_value=4)) + + def generate_input(): + if data_format == "NCHW": + return np.random.random( + [batch_size, 48, 64, 64]).astype(np.float32) + else: + return np.random.random( + [batch_size, 64, 64, 48]).astype(np.float32) + + def generate_weight(): + return np.random.random( + [48, int(48 / groups), 3, 3]).astype(np.float32) + + relu_op = OpConfig( + type="relu", + inputs={"X": ["input_data"]}, + outputs={"Out": ["relu_out"]}, + attrs={}) + + conv2d_op1 = OpConfig( + type="conv2d", + inputs={"Input": ["relu_out"], + "Filter": ["conv_weight1"]}, + outputs={"Output": ["conv_output1"]}, + attrs={ + "data_format": data_format, + "dilations": dilations, + "padding_algorithm": padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides + }) + + conv2d_op2 = OpConfig( + type="conv2d", + inputs={"Input": ["input_data"], + "Filter": ["conv_weight2"]}, + outputs={"Output": ["conv_output2"]}, + attrs={ + "data_format": data_format, + "dilations": dilations, + "padding_algorithm": padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides + }) + + elt_op = OpConfig( + type="elementwise_add", + inputs={"X": ["conv_output1"], + "Y": ["conv_output2"]}, + outputs={"Out": ["elementwise_output"]}, + attrs={'axis': axis}) + model_net = [relu_op, conv2d_op1, conv2d_op2, elt_op] + + program_config = ProgramConfig( + ops=model_net, + weights={ + "conv_weight1": 
TensorConfig(data_gen=partial(generate_weight)), + "conv_weight2": TensorConfig(data_gen=partial(generate_weight)) + }, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input)) + }, + outputs=["elementwise_output"]) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ["relu", "conv2d", "conv2d"], (1e-5, 1e-5) + + def test(self): + self.run_and_statis( + quant=False, passes=["conv_elementwise_add_mkldnn_fuse_pass"]) + + +''' +class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + if "elementwise_weight" in program_config.weights: + if program_config.weights["elementwise_weight"].shape[0] == program_config.inputs["input_data1"].shape[1]: + if attrs[2]['axis'] != 1: + return False + if program_config.weights["elementwise_weight"].shape[0] == program_config.inputs["input_data1"].shape[3]: + if attrs[2]['axis'] != -1: + return False return True def sample_program_config(self, draw): @@ -101,7 +204,7 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): "strides": strides }) - if axis == -1 or axis == 0: + if axis == 0: elt_op = OpConfig( type="elementwise_add", inputs={"X": ["input_data1"], @@ -118,14 +221,12 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): model_net = [relu_op, conv2d_op, elt_op] - if axis == 1: + if axis == 0: program_config = ProgramConfig( ops=model_net, weights={ "conv_weight": - TensorConfig(data_gen=partial(generate_weight1)), - "elementwise_weight": - TensorConfig(data_gen=partial(generate_weight2)) + TensorConfig(data_gen=partial(generate_weight1)) }, inputs={ "input_data1": @@ -137,7 +238,9 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): ops=model_net, weights={ "conv_weight": - TensorConfig(data_gen=partial(generate_weight1)) + TensorConfig(data_gen=partial(generate_weight1)), + "elementwise_weight": + TensorConfig(data_gen=partial(generate_weight2)) }, inputs={ "input_data1": @@ -154,7 +257,7 @@ class TestConvElementwiseAddMkldnnFusePass(PassAutoScanTest): def test(self): self.run_and_statis( quant=False, passes=["conv_elementwise_add_mkldnn_fuse_pass"]) - +''' if __name__ == "__main__": unittest.main() -- GitLab From 0c703fe7592bf3d5a522ca4405c9d3c54eb065a5 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 15 Mar 2022 16:05:54 +0800 Subject: [PATCH 072/176] [Phi] Move gather op kernel into phi (#40500) * add phi gather kernel * update year * remove original gather opkernel * add gather grad phi kernels * remove origin gather grad kernel * fix failed npu and xpu * fix xpu compile failed --- paddle/fluid/operators/gather_op.cc | 14 +- paddle/fluid/operators/gather_op.cu | 152 ------------------- paddle/fluid/operators/gather_op.h | 133 ---------------- paddle/fluid/operators/gather_op_npu.cc | 3 +- paddle/fluid/operators/gather_op_npu_test.cc | 5 +- paddle/fluid/operators/gather_op_xpu.cc | 5 +- paddle/phi/kernels/cpu/gather_grad_kernel.cc | 82 ++++++++++ paddle/phi/kernels/cpu/gather_kernel.cc | 66 ++++++++ paddle/phi/kernels/gather_grad_kernel.h | 31 ++++ paddle/phi/kernels/gather_kernel.h | 29 ++++ paddle/phi/kernels/gpu/gather_grad_kernel.cu | 73 +++++++++ paddle/phi/kernels/gpu/gather_kernel.cu | 70 +++++++++ paddle/phi/ops/compat/gather_sig.cc | 44 ++++++ 13 files changed, 405 insertions(+), 302 deletions(-) delete mode 100644 
paddle/fluid/operators/gather_op.cu delete mode 100644 paddle/fluid/operators/gather_op.h create mode 100644 paddle/phi/kernels/cpu/gather_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/gather_kernel.cc create mode 100644 paddle/phi/kernels/gather_grad_kernel.h create mode 100644 paddle/phi/kernels/gather_kernel.h create mode 100644 paddle/phi/kernels/gpu/gather_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/gather_kernel.cu create mode 100644 paddle/phi/ops/compat/gather_sig.cc diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 8a405cc6fc1..7910d94298e 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather_op.h" #include #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" @@ -198,17 +198,7 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, ops::GatherGradOpMaker); REGISTER_OPERATOR(gather_grad, ops::GatherGradOp, ops::GatherGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel, - ops::GatherOpKernel, ops::GatherOpKernel, - ops::GatherOpKernel, - ops::GatherOpKernel, - ops::GatherOpKernel); -REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel); + REGISTER_OP_VERSION(gather) .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC", paddle::framework::compatible::OpVersionDesc().NewInput( diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu deleted file mode 100644 index e0db2f26d3e..00000000000 --- a/paddle/fluid/operators/gather_op.cu +++ /dev/null @@ -1,152 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather_op.h" -#include "paddle/phi/kernels/funcs/gather.cu.h" -#include "paddle/phi/kernels/funcs/scatter.cu.h" - -namespace paddle { -namespace operators { - -template -class GatherOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - int axis = ctx.Attr("axis"); - - // get axis from tensor - if (ctx.HasInput("Axis")) { - Tensor cpu_axis; - const Tensor *axis_tensor = ctx.Input("Axis"); - framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { - axis = static_cast(cpu_axis.data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { - axis = static_cast(cpu_axis.data()[0]); - } else if (axis_type == framework::proto::VarType::INT16) { - axis = static_cast(cpu_axis.data()[0]); - } - } - const auto &place = ctx.GetPlace(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - const auto &dev_ctx = ctx.cuda_device_context(); - if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GatherV2CUDAFunction(x, index, axis, output, - dev_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GatherV2CUDAFunction(x, index, axis, output, - dev_ctx); - } else if (index_type == framework::proto::VarType::INT16) { - phi::funcs::GatherV2CUDAFunction(x, index, axis, output, - dev_ctx); - } - return; - } - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GPUGather(dev_ctx, *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GPUGather(dev_ctx, *x, *index, output); - } else if (index_type == framework::proto::VarType::INT16) { - phi::funcs::GPUGather(dev_ctx, *x, *index, output); - } - } -}; - -template -class GatherGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - - int axis = ctx.Attr("axis"); - if (ctx.HasInput("Axis")) { - const Tensor *axis_tensor = ctx.Input("Axis"); - Tensor cpu_axis; - framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { - axis = static_cast(cpu_axis.data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { - axis = static_cast(cpu_axis.data()[0]); - } - } - - const auto &dev_ctx = ctx.cuda_device_context(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, - 
dev_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, - dev_ctx); - } - return; - } - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - if (index_type == framework::proto::VarType::INT32) { - phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, - ctx.Attr("overwrite")); - } else if (index_type == framework::proto::VarType::INT64) { - phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, - ctx.Attr("overwrite")); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h deleted file mode 100644 index 94de694b2f9..00000000000 --- a/paddle/fluid/operators/gather_op.h +++ /dev/null @@ -1,133 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/gather.h" -#include "paddle/phi/kernels/funcs/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class GatherOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - int axis = ctx.Attr("axis"); - // get axis from tensor - if (ctx.HasInput("Axis")) { - const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = axis_tensor->dtype(); - if (axis_type == phi::DataType::INT32) { - axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == phi::DataType::INT64) { - axis = static_cast(axis_tensor->data()[0]); - } - } - const auto &index_type = index->dtype(); - auto &dev_ctx = ctx.template device_context(); - if (axis != 0) { - if (index_type == phi::DataType::INT32) { - phi::funcs::GatherV2Function(dev_ctx, x, index, axis, - output); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::GatherV2Function(dev_ctx, x, index, axis, - output); - } - return; - } - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - if (index_type == phi::DataType::INT32) { - phi::funcs::CPUGather(dev_ctx, *x, *index, output); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::CPUGather(dev_ctx, *x, *index, output); - } - } -}; - -template -class GatherGradientOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - - int axis = ctx.Attr("axis"); - if (ctx.HasInput("Axis")) { - const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = axis_tensor->dtype(); - if (axis_type == phi::DataType::INT32) { - axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == phi::DataType::INT64) { - axis = static_cast(axis_tensor->data()[0]); - } - } - const auto &index_type = index->dtype(); - auto &dev_ctx = ctx.template device_context(); - - if (axis != 0) { - if (index_type == phi::DataType::INT32) { - phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, - dX); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, - dX); - } - return; - } - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *dev_ctx.eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - bool overwrite = ctx.Attr("overwrite"); - - if (index_type == phi::DataType::INT32) { - if (overwrite) { - phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); - } else { - phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); - } - } else if (index_type == phi::DataType::INT64) { - if (overwrite) { - phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); - } else { - phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, 
dX); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc index 21093f585b5..f996b1ede2f 100644 --- a/paddle/fluid/operators/gather_op_npu.cc +++ b/paddle/fluid/operators/gather_op_npu.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc index 3dce3803608..b42050eabe3 100644 --- a/paddle/fluid/operators/gather_op_npu_test.cc +++ b/paddle/fluid/operators/gather_op_npu_test.cc @@ -24,16 +24,15 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/gather_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(gather); +USE_OP_ITSELF(gather); USE_OP_DEVICE_KERNEL(gather, NPU); -USE_OP(gather_grad); +USE_OP_ITSELF(gather_grad); USE_OP_DEVICE_KERNEL(gather_grad, NPU); template diff --git a/paddle/fluid/operators/gather_op_xpu.cc b/paddle/fluid/operators/gather_op_xpu.cc index 28f2f7d473b..6c691aa14ae 100644 --- a/paddle/fluid/operators/gather_op_xpu.cc +++ b/paddle/fluid/operators/gather_op_xpu.cc @@ -13,15 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/gather_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class GatherOpXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; diff --git a/paddle/phi/kernels/cpu/gather_grad_kernel.cc b/paddle/phi/kernels/cpu/gather_grad_kernel.cc new file mode 100644 index 00000000000..f0a6948018a --- /dev/null +++ b/paddle/phi/kernels/cpu/gather_grad_kernel.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/gather_grad_kernel.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" + +namespace phi { + +template +void GatherGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + const Scalar& axis, + bool overwrite, + DenseTensor* x_grad) { + const auto& index_type = index.dtype(); + auto axis_v = axis.to(); + + if (axis_v != 0) { + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2GradFunction( + dev_ctx, &out_grad, &index, axis_v, x_grad); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2GradFunction( + dev_ctx, &out_grad, &index, axis_v, x_grad); + } + return; + } + + dev_ctx.template Alloc(x_grad); + + auto dxt = EigenVector::Flatten(*x_grad); + auto& place = *dev_ctx.eigen_device(); + dxt.device(place) = dxt.constant(static_cast(0)); + if (x_grad->numel() == 0) return; + + if (index_type == phi::DataType::INT32) { + if (overwrite) { + phi::funcs::ScatterAssign(dev_ctx, out_grad, index, x_grad); + } else { + phi::funcs::ScatterAssignAdd( + dev_ctx, out_grad, index, x_grad); + } + } else if (index_type == phi::DataType::INT64) { + if (overwrite) { + phi::funcs::ScatterAssign(dev_ctx, out_grad, index, x_grad); + } else { + phi::funcs::ScatterAssignAdd( + dev_ctx, out_grad, index, x_grad); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather_grad, + CPU, + ALL_LAYOUT, + phi::GatherGradKernel, + float, + double, + int, + uint8_t, + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/gather_kernel.cc b/paddle/phi/kernels/cpu/gather_kernel.cc new file mode 100644 index 00000000000..9207a05b9dc --- /dev/null +++ b/paddle/phi/kernels/cpu/gather_kernel.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/gather_kernel.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/gather.h" + +namespace phi { + +template +void GatherKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const Scalar& axis, + DenseTensor* out) { + const auto& index_type = index.dtype(); + auto axis_v = axis.to(); + if (axis_v != 0) { + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2Function( + dev_ctx, &x, &index, axis_v, out); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2Function( + dev_ctx, &x, &index, axis_v, out); + } + return; + } + + dev_ctx.template Alloc(out); + + if (x.numel() == 0) { + return; + } + + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGather(dev_ctx, x, index, out); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::CPUGather(dev_ctx, x, index, out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather, + CPU, + ALL_LAYOUT, + phi::GatherKernel, + float, + double, + int, + uint8_t, + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gather_grad_kernel.h b/paddle/phi/kernels/gather_grad_kernel.h new file mode 100644 index 00000000000..e53da7b471c --- /dev/null +++ b/paddle/phi/kernels/gather_grad_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GatherGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + const Scalar& axis, + bool overwrite, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/gather_kernel.h b/paddle/phi/kernels/gather_kernel.h new file mode 100644 index 00000000000..78ac09125b6 --- /dev/null +++ b/paddle/phi/kernels/gather_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GatherKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const Scalar& axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/gather_grad_kernel.cu b/paddle/phi/kernels/gpu/gather_grad_kernel.cu new file mode 100644 index 00000000000..04149a2f9ee --- /dev/null +++ b/paddle/phi/kernels/gpu/gather_grad_kernel.cu @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gather_kernel.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" + +namespace phi { + +template +void GatherGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + const Scalar& axis, + bool overwrite, + DenseTensor* x_grad) { + const auto& index_type = index.dtype(); + auto axis_v = axis.to(); + + if (axis_v != 0) { + if (index_type == DataType::INT32) { + phi::funcs::GatherV2GradCUDAFunction( + &out_grad, &index, axis_v, x_grad, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::GatherV2GradCUDAFunction( + &out_grad, &index, axis_v, x_grad, dev_ctx); + } + return; + } + + dev_ctx.template Alloc(x_grad); + auto dxt = EigenVector::Flatten(*x_grad); + auto& place = *dev_ctx.eigen_device(); + dxt.device(place) = dxt.constant(static_cast(0)); + if (out_grad.numel() == 0) return; + if (index_type == DataType::INT32) { + phi::funcs::GPUScatterAssign( + dev_ctx, out_grad, index, x_grad, overwrite); + } else if (index_type == DataType::INT64) { + phi::funcs::GPUScatterAssign( + dev_ctx, out_grad, index, x_grad, overwrite); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather_grad, + GPU, + ALL_LAYOUT, + phi::GatherGradKernel, + float, + double, + int64_t, + int, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/gather_kernel.cu b/paddle/phi/kernels/gpu/gather_kernel.cu new file mode 100644 index 00000000000..7e0c6cc1685 --- /dev/null +++ b/paddle/phi/kernels/gpu/gather_kernel.cu @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gather_kernel.h" + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" + +namespace phi { + +template +void GatherKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const Scalar& axis, + DenseTensor* out) { + const auto& index_type = index.dtype(); + auto axis_v = axis.to(); + if (axis_v != 0) { + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2CUDAFunction( + &x, &index, axis_v, out, dev_ctx); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2CUDAFunction( + &x, &index, axis_v, out, dev_ctx); + } else if (index_type == phi::DataType::INT16) { + phi::funcs::GatherV2CUDAFunction( + &x, &index, axis_v, out, dev_ctx); + } + return; + } + + dev_ctx.template Alloc(out); + + if (x.numel() == 0) return; + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUGather(dev_ctx, x, index, out); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GPUGather(dev_ctx, x, index, out); + } else if (index_type == phi::DataType::INT16) { + phi::funcs::GPUGather(dev_ctx, x, index, out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gather, + GPU, + ALL_LAYOUT, + phi::GatherKernel, + float, + double, + int64_t, + int, + int16_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/ops/compat/gather_sig.cc b/paddle/phi/ops/compat/gather_sig.cc new file mode 100644 index 00000000000..6c47bbe48b8 --- /dev/null +++ b/paddle/phi/ops/compat/gather_sig.cc @@ -0,0 +1,44 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GatherOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("Axis")) { + return KernelSignature("gather", {"X", "Index"}, {"Axis"}, {"Out"}); + } else { + return KernelSignature("gather", {"X", "Index"}, {"axis"}, {"Out"}); + } +} + +KernelSignature GatherGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("Axis")) { + return KernelSignature("gather_grad", + {"X", "Index", GradVarName("Out")}, + {"Axis", "overwrite"}, + {GradVarName("X")}); + } else { + return KernelSignature("gather_grad", + {"X", "Index", GradVarName("Out")}, + {"axis", "overwrite"}, + {GradVarName("X")}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(gather, phi::GatherOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gather_grad, phi::GatherGradOpArgumentMapping); -- GitLab From 2c5edb4f58a4102fdeebbb29a2e4fc6a64b67da6 Mon Sep 17 00:00:00 2001 From: Yulong Ao Date: Tue, 15 Mar 2022 16:25:36 +0800 Subject: [PATCH 073/176] [Auto Parallel] Add the recorder and trial class for the tuner (#40555) Add the recorder --- .../auto_parallel/tuner/recorder.py | 214 ++++++++++++++++++ .../distributed/auto_parallel/tuner/trial.py | 114 ++++++++++ .../unittests/auto_parallel/CMakeLists.txt | 5 + .../unittests/auto_parallel/test_recorder.py | 152 +++++++++++++ .../unittests/auto_parallel/test_trial.py | 53 +++++ 5 files changed, 538 insertions(+) create mode 100644 python/paddle/distributed/auto_parallel/tuner/recorder.py create mode 100644 python/paddle/distributed/auto_parallel/tuner/trial.py create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py diff --git a/python/paddle/distributed/auto_parallel/tuner/recorder.py b/python/paddle/distributed/auto_parallel/tuner/recorder.py new file mode 100644 index 00000000000..140336566a1 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/recorder.py @@ -0,0 +1,214 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + + +class MetricRecord(object): + """ + One record for a single metric at a given execution step. 
+ """ + + def __init__(self, value, step): + self._value = value + self._step = step + + @property + def value(self): + return self._value + + @value.setter + def value(self, value): + self._value = value + + @property + def step(self): + return self._step + + @step.setter + def step(self, step): + self._step = step + + def mean(self): + return np.mean(self.value) + + def get_state(self): + return {"value": self.value, "step": self.step} + + @classmethod + def from_state(cls, state): + return cls(**state) + + def __eq__(self, other): + if not isinstance(other, MetricRecord): + return False + return other.value == self.value and other.step == self.step + + def __repr__(self): + return "MetricRecord(value={}, step={})".format(self.value, self.step) + + +class MetricRecords(object): + """ + Records of a single metric across different executions. + """ + + def __init__(self, direction="min"): + if direction not in {"min", "max"}: + raise ValueError( + "direction should be one of {min, max}, but got: {}.".format( + direction)) + self._direction = direction + self._records = {} + + @property + def records(self): + return sorted(self._records.values(), key=lambda r: r.step) + + @records.setter + def records(self, records): + for r in records: + self.update(r.value, step=r.step) + + @property + def direction(self): + return self._direction + + @direction.setter + def direction(self, direction): + self._direction = direction + + def update(self, value, step=0): + if step in self._records: + self._records[step].set_value(value) + else: + self._records[step] = MetricRecord(value, step=step) + + def get_best_value(self): + values = list(r.mean() for r in self._records.values()) + if not values: + return None + if self._direction == "min": + return np.nanmin(values) + return np.nanmax(values) + + def get_best_step(self): + best_value = self.get_best_value() + if best_value is None: + return None + for r in self._records.values(): + if r.mean() == best_value: + return r.step + + def get_statistics(self): + records = self.records + records_values = [r.mean() for r in records] + if not len(records_values): + return {} + return { + "min": float(np.nanmin(records_values)), + "max": float(np.nanmax(records_values)), + "mean": float(np.nanmean(records_values)), + "median": float(np.nanmedian(records_values)), + "var": float(np.nanvar(records_values)), + "std": float(np.nanstd(records_values)), + } + + def get_state(self): + state = {} + state["direction"] = self._direction + state["records"] = [r.get_state() for r in self.records] + return state + + @classmethod + def from_state(cls, state): + records = cls(state["direction"]) + records.records = [MetricRecord.from_state(r) for r in state["records"]] + print("here 1", records.records) + return records + + +class MetricsRecorder(object): + """ + Record the values for all metrics. 
+ """ + + def __init__(self, metrics=None): + self._records = {} + self.register_metrics(metrics) + + @property + def records(self): + return self._records + + def exists(self, name): + return name in self._records + + def register_metrics(self, metrics=None): + metrics = metrics or [] + for metric in metrics: + self.register(metric.name) + + def register(self, name, direction=None): + if self.exists(name): + raise ValueError("Metric {} have been registered.".format(name)) + if direction is None: + direction = "min" + self._records[name] = MetricRecords(direction) + + def update(self, name, value, step=0): + value = float(value) + if not self.exists(name): + self.register(name) + + prev_best = self._records[name].get_best_value() + self._records[name].update(value, step=step) + new_best = self._records[name].get_best_value() + + improved = new_best != prev_best + return improved + + def get_records(self, name): + return self._records[name].records + + def set_records(self, name, records): + if not self.exists(name): + self.register(name) + self._records[name].records = records + + def get_best_value(self, name): + return self._records[name].get_best_value() + + def get_best_step(self, name): + return self._records[name].get_best_step() + + def get_statistics(self, name): + return self._records[name].get_statistics() + + def get_state(self): + return { + "metrics": { + name: metric_records.get_state() + for name, metric_records in self._records.items() + } + } + + @classmethod + def from_state(cls, state): + recorder = cls() + recorder._records = { + name: MetricRecords.from_state(metric_records) + for name, metric_records in state["metrics"].items() + } + return recorder diff --git a/python/paddle/distributed/auto_parallel/tuner/trial.py b/python/paddle/distributed/auto_parallel/tuner/trial.py new file mode 100644 index 00000000000..22a6638c5ca --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/trial.py @@ -0,0 +1,114 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
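[Editor's note, not part of the recorded patch: a minimal usage sketch of the MetricsRecorder added above, mirroring the unit tests included later in this commit; the import path is the one those tests use.]

    from paddle.distributed.auto_parallel.tuner import recorder as rd

    r = rd.MetricsRecorder()
    r.register("latency", direction="min")    # "min"/"max" decides what counts as best
    r.update("latency", 0.12, step=0)
    r.update("latency", 0.07, step=1)         # update() also auto-registers unknown metrics
    assert r.get_best_value("latency") == 0.07
    assert r.get_best_step("latency") == 1
    state = r.get_state()                     # plain dict, safe to serialize
    restored = rd.MetricsRecorder.from_state(state)
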
+ +import hashlib +import random +import time +from enum import Enum + +from .storable import Storable +from .recorder import MetricsRecorder +from .tunable_space import TunableSpace + + +class TrialStatus: + RUNNING = "RUNNING" + COMPLETED = "COMPLETED" + STOPPED = "STOPPED" + INVALID = "INVALID" + + +class Trial(Storable): + def __init__(self, tunable_space, trial_id=None, + status=TrialStatus.RUNNING): + self._id = _generate_trial_id() if trial_id is None else trial_id + self._space = tunable_space + self._recorder = MetricsRecorder() + self._score = None + self._best_step = None + self._status = status + + @property + def id(self): + return self._id + + @property + def space(self): + return self._space + + @property + def recorder(self): + return self._recorder + + @property + def score(self): + return self._score + + @score.setter + def score(self, score): + self._score = score + + @property + def best_step(self): + return self._best_step + + @best_step.setter + def best_step(self, best_step): + self._best_step = best_step + + @property + def status(self): + return self._status + + @status.setter + def status(self, status): + self._status = status + + def summary(self): + print("Tunable space:") + if self.space.values: + for tv, value in self.space.values.items(): + print(tv + ":", value) + + if self.score is not None: + print("Score: {}".format(self.score)) + + def get_state(self): + return { + "id": self.id, + "space": self.space.get_state(), + "recorder": self.recorder.get_state(), + "score": self.score, + "best_step": self.best_step, + "status": self.status, + } + + def set_state(self, state): + self._id = state["id"] + self._space = TunableSpace.from_state(state["space"]) + self._recorder = MetricsRecorder.from_state(state["recorder"]) + self._score = state["score"] + self._best_step = state["best_step"] + self._status = state["status"] + + @classmethod + def from_state(cls, state): + trial = cls(tunable_space=None) + trial.set_state(state) + return trial + + +def _generate_trial_id(): + s = str(time.time()) + str(random.randint(1, int(1e7))) + return hashlib.sha256(s.encode("utf-8")).hexdigest()[:32] diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 1f7ae53acdf..4a2fba70de4 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -11,4 +11,9 @@ if(WITH_DISTRIBUTE AND WITH_GPU) set_tests_properties(test_engine_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80) py_test_modules(test_converter MODULES test_converter ENVS ${dist_ENVS}) set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + + py_test_modules(test_tunable_variable MODULES test_tunable_variable ENVS ${dist_ENVS}) + py_test_modules(test_tunable_space MODULES test_tunable_space ENVS ${dist_ENVS}) + py_test_modules(test_recorder MODULES test_recorder ENVS ${dist_ENVS}) + py_test_modules(test_trial MODULES test_trial ENVS ${dist_ENVS}) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py new file mode 100644 index 00000000000..ab704a6a257 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_recorder.py @@ -0,0 +1,152 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +from paddle.distributed.auto_parallel.tuner import recorder as rd + + +class TestRecorder(unittest.TestCase): + def test_register(self): + recorder = rd.MetricsRecorder() + recorder.register("metric") + self.assertEqual(set(recorder.records.keys()), {"metric"}) + self.assertEqual(recorder.records["metric"].direction, "min") + + def test_exists(self): + recorder = rd.MetricsRecorder() + recorder.register("metric", direction="max") + self.assertTrue(recorder.exists("metric")) + + def test_update(self): + recorder = rd.MetricsRecorder() + recorder.update("metric", 4, 1000) + self.assertEqual(recorder.records["metric"].direction, "min") + self.assertEqual( + recorder.get_records("metric"), [rd.MetricRecord(4, 1000)]) + + def test_get_records(self): + recorder = rd.MetricsRecorder() + recorder.update("metric", 1, step=0) + recorder.update("metric", 2, step=1) + recorder.update("metric", 3, step=2) + recorder.update("metric", 4, step=3) + self.assertEqual( + recorder.get_records("metric"), [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ]) + + def test_set_records(self): + recorder = rd.MetricsRecorder() + recorder.set_records( + "metric", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual( + recorder.get_records("metric"), [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ]) + + def test_get_best_value(self): + recorder = rd.MetricsRecorder() + recorder.register("metric_min", "min") + recorder.register("metric_max", "max") + + recorder.set_records( + "metric_min", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual(recorder.get_best_value("metric_min"), 1) + + recorder.set_records( + "metric_max", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual(recorder.get_best_value("metric_max"), 4) + + def test_get_best_step(self): + recorder = rd.MetricsRecorder() + + recorder.register("metric_min", "min") + recorder.set_records( + "metric_min", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual(recorder.get_best_step("metric_min"), 0) + + recorder.register("metric_max", "max") + recorder.set_records( + "metric_max", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + self.assertEqual(recorder.get_best_step("metric_max"), 3) + + def test_get_statistics(self): + recorder = rd.MetricsRecorder() + records = [rd.MetricRecord(np.random.random(), i) for i in range(14)] + recorder.set_records("metric", records) + stats = recorder.get_statistics("metric") + records = [r.value for r in records] + self.assertEqual(stats["min"], np.min(records)) + 
self.assertEqual(stats["max"], np.max(records)) + self.assertEqual(stats["mean"], np.mean(records)) + self.assertEqual(stats["median"], np.median(records)) + self.assertEqual(stats["var"], np.var(records)) + self.assertEqual(stats["std"], np.std(records)) + + def test_serialization(self): + recorder = rd.MetricsRecorder() + recorder.register("metric") + recorder.set_records( + "metric", + [ + rd.MetricRecord(1, 0), + rd.MetricRecord(2, 1), + rd.MetricRecord(3, 2), + rd.MetricRecord(4, 3), + ], ) + print(recorder.get_state()) + new_recorder = rd.MetricsRecorder.from_state(recorder.get_state()) + self.assertEqual(new_recorder.records.keys(), recorder.records.keys()) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py new file mode 100644 index 00000000000..fc52d1c394e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_trial.py @@ -0,0 +1,53 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from paddle.distributed.auto_parallel.tuner import tunable_space as ts +from paddle.distributed.auto_parallel.tuner import trial as tr + + +class TestTiral(unittest.TestCase): + def test_trial(self): + space = ts.TunableSpace() + space.choice("choice", [0, 1, 2, 3], default=2) + trial = tr.Trial(space, trial_id="trial-1") + trial.recorder.register("latency", direction="min") + trial.recorder.update("latency", 0.1, step=0) + trial.recorder.update("latency", 0.2, step=1) + trial.best_step = 0 + + self.assertEqual(trial.id, "trial-1") + self.assertEqual(trial.space.get_value("choice"), 2) + self.assertEqual(trial.best_step, 0) + self.assertEqual(trial.status, "RUNNING") + + def test_serialization(self): + space = ts.TunableSpace() + space.int_range("int_range", start=1, stop=4, default=2) + trial = tr.Trial(space, trial_id="trial-2", status="COMPLETED") + trial.recorder.register("latency", direction="min") + trial.recorder.update("latency", 0.1, step=0) + trial.recorder.update("latency", 0.2, step=1) + trial.best_step = 0 + + new_trial = tr.Trial.from_state(trial.get_state()) + self.assertEqual(new_trial.id, "trial-2") + self.assertEqual(new_trial.space.get_value("int_range"), 2) + self.assertEqual(new_trial.best_step, 0) + self.assertEqual(new_trial.status, "COMPLETED") + + +if __name__ == "__main__": + unittest.main() -- GitLab From 69dd43d123c76bbeee16cccc4daa751349c4de80 Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Tue, 15 Mar 2022 16:26:51 +0800 Subject: [PATCH 074/176] [NPU] add AMP O1 support (#40362) * [NPU] add AMP O1 support * [NPU] fix NOTE and warnings --- paddle/fluid/imperative/amp_auto_cast.cc | 4 ++- paddle/fluid/pybind/op_function_generator.h | 1 + python/paddle/fluid/dygraph/amp/auto_cast.py | 11 ++++-- .../paddle/fluid/dygraph/amp/loss_scaler.py | 35 +++++++++++++------ 4 files changed, 37 
insertions(+), 14 deletions(-) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 149202468be..dd00b75666d 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -209,7 +209,9 @@ inline bool NeedCast(const std::shared_ptr& var) { auto data_type = GetDataType(var); if (paddle::platform::is_gpu_place(place) || paddle::platform::is_cuda_pinned_place(place) || - paddle::platform::is_xpu_place(place)) { + paddle::platform::is_xpu_place(place) || + paddle::platform::is_npu_place(place) || + paddle::platform::is_npu_pinned_place(place)) { // CudaPinndePlace is added for varbase created by dataloader if (data_type == paddle::framework::proto::VarType::FP32 || data_type == paddle::framework::proto::VarType::FP16 || diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 9e86e3df8a6..d8750c1d6c1 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -88,6 +88,7 @@ std::map> op_ins_map = { {"nce", {"Input", "Label", "Weight", "Bias", "SampleWeight", "CustomDistProbs", "CustomDistAlias", "CustomDistAliasProbs"}}, + {"check_finite_and_unscale", {"X", "Scale", "FloatStatus"}}, }; // NOTE(zhiqiu): Like op_ins_map. diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index 191661b7bf9..a449bdf0a18 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -271,14 +271,19 @@ def amp_guard(enable=True, "current_tracer is None, maybe it is not in imperative mode.") # check device_type: - # NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16. + # NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16, npu for float16. # Maybe we will support cpu for bfloat16. if enable and not (tracer._expected_place.is_gpu_place() or - tracer._expected_place.is_xpu_place()): + tracer._expected_place.is_xpu_place() or + tracer._expected_place.is_npu_place()): warnings.warn( - 'amp_guard can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.' + 'amp_guard can only be enabled on CUDAPlace, XPUPlace, and NPUPlace, current place is %s, so it makes no effect.' % tracer._expected_place) enable = False + # For npu: + if tracer._expected_place.is_npu_place() and (dtype == 'bfloat16'): + warnings.warn('NPUPlace only support float16 amp.') + enable = False # For xpu: if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'): warnings.warn('XPUPlace only support float16 amp.') diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index f7c2d6be574..3ca4c7dca76 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -105,9 +105,10 @@ class AmpScaler(object): "current_tracer is None, maybe it is not in imperative mode.") if enable and not (tracer._expected_place.is_gpu_place() or - tracer._expected_place.is_xpu_place()): + tracer._expected_place.is_xpu_place() or + tracer._expected_place.is_npu_place()): warnings.warn( - 'AmpScaler can only be enabled on CUDAPlace and XPUPlace, current place is %s, so it makes no effect.' + 'AmpScaler can only be enabled on CUDAPlace, XPUPlace and NPUPlace, current place is %s, so it makes no effect.' 
% tracer._expected_place) enable = False @@ -286,14 +287,28 @@ class AmpScaler(object): ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32 ) ] - if len(param_grads_fp16): - _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, - param_grads_fp16, - self._temp_found_inf_fp16) - if len(param_grads_fp32): - _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, - param_grads_fp32, - self._temp_found_inf_fp32) + if core.is_compiled_with_npu(): + float_status = _C_ops.alloc_float_status() + _C_ops.clear_float_status(float_status, float_status) + + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + float_status, param_grads_fp16, + self._temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + float_status, param_grads_fp32, + self._temp_found_inf_fp32) + else: + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + param_grads_fp16, + self._temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + param_grads_fp32, + self._temp_found_inf_fp32) + if len(param_grads_fp16) and len(param_grads_fp32): self._found_inf = self._temp_found_inf_fp16 or self._temp_found_inf_fp32 elif len(param_grads_fp16): -- GitLab From 187fcfa3f8a5d673f6328801fa4a346a320d64d7 Mon Sep 17 00:00:00 2001 From: Tongxin Bai Date: Tue, 15 Mar 2022 17:18:40 +0800 Subject: [PATCH 075/176] [einsum] refactored and supporting unknown shapes in static mode (#40360) * formatted. * Remove dead code. * Fix error message in the unit test. * polish formats. * [Einsum] fix bugs. --- .../fluid/tests/unittests/test_einsum.py | 57 +++- python/paddle/tensor/einsum.py | 260 ++++++------------ 2 files changed, 142 insertions(+), 175 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_einsum.py b/python/paddle/fluid/tests/unittests/test_einsum.py index 13e763bee63..43b5ce96a39 100644 --- a/python/paddle/fluid/tests/unittests/test_einsum.py +++ b/python/paddle/fluid/tests/unittests/test_einsum.py @@ -26,14 +26,14 @@ class TestErrors(unittest.TestCase): def test_diagonalize_errors(self): a = np.arange(4 * 3 * 4 * 4).reshape(4, 3, 4, 4).astype('float') a = paddle.to_tensor(a) - with self.assertRaisesRegex(AssertionError, ( - 'Diagonal and trace not implemented yet.')): + with self.assertRaisesRegex(AssertionError, + ('Duplicate labels are not supported.')): paddle.einsum('...ii->...i', a) - with self.assertRaisesRegex(AssertionError, ( - 'Diagonal and trace not implemented yet.')): + with self.assertRaisesRegex(AssertionError, + ('Duplicate labels are not supported.')): paddle.einsum('i...i', a) - with self.assertRaisesRegex(AssertionError, ( - 'Diagonal and trace not implemented yet.')): + with self.assertRaisesRegex(AssertionError, + ('Duplicate labels are not supported.')): paddle.einsum('i...i->i...', a) def test_param_errors(self): @@ -396,6 +396,51 @@ class TestNumpyTests(unittest.TestCase): self.check_output('a...b,b...c,c...a', a, a, a) self.check_output('...ab,...ba,...ab,...ab', a, a, a, a) + def test_static_graph(self): + paddle.enable_static() + fluid = paddle.fluid + if fluid.core.is_compiled_with_cuda(): + self.place = fluid.CUDAPlace(0) + else: + self.place = fluid.CPUPlace() + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + a = paddle.static.data( + name='a', shape=[3, None, None, None], dtype='float') + b = paddle.static.data( + name='b', shape=[2, 
None, None, None], dtype='float') + c = paddle.static.data( + name='c', shape=[None, None, 2, None], dtype='float') + d = paddle.static.data( + name='d', shape=[None, None, 5], dtype='float') + e = paddle.static.data( + name='e', shape=[None, 2, None], dtype='float') + + outs = [] + outs.append(paddle.einsum("ibnd,jbnd->bnij", a, b)) + outs.append(paddle.einsum('...ik, ...j', c, d)) + outs.append(paddle.einsum('...kj, ...ik', d, e)) + outs.append(paddle.einsum('ijk..., ikj', c, e)) + outs.append(paddle.einsum('ijk..., ikj->...ij', c, e)) + exe = fluid.Executor(self.place) + exe.run(startup) + a = np.arange(72).reshape(3, 2, 3, 4).astype('float') + b = np.arange(48).reshape(2, 2, 3, 4).astype('float') + c = np.arange(48).reshape(2, 3, 2, 4).astype('float') + d = np.arange(30).reshape(2, 3, 5).astype('float') + e = np.arange(12).reshape(2, 2, 3).astype('float') + feeds = {'a': a, 'b': b, 'c': c, 'd': d, 'e': e} + actual = exe.run(main, feed=feeds, fetch_list=[outs]) + expect = [] + expect.append(np.einsum("ibnd,jbnd->bnij", a, b)) + expect.append(np.einsum('...ik, ...j', c, d)) + expect.append(np.einsum('...kj, ...ik', d, e)) + expect.append(np.einsum('ijk..., ikj', c, e)) + expect.append(np.einsum('ijk..., ikj->...ij', c, e)) + for a, e in zip(actual, expect): + self.check_output_equal(a, e) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 040480c26fa..06c2a82fd69 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -13,9 +13,10 @@ # limitations under the License. import itertools +import numpy as np import re -from .linalg import matmul, transpose +from .linalg import dot, matmul, transpose from .manipulation import squeeze, unsqueeze, reshape from .math import multiply from .math import sum as paddle_sum @@ -111,36 +112,6 @@ def validate_rhs(rhs, input_labels, n_bcast_dims): f"Invalid equation: duplicate output labels are found.") -# ''' -# Tests if the two operands can perform a broadcast operation on the given ranges of dimensions. -# We follow the Numpy broadcasting convention which states that, by lining up the shape arrays -# starting from the right most dimension, all the aligned dimensions either have equal sizes or -# one of them is sized one. -# Parameters -# ---------- -# args: -# *args unpacks into operand one's axes range, shape, operand two's axes range, shape -# f: -# if available, is used as a callback for postprocessing the aligned operand dimensions. -# ''' -# xran, xshape, yran, yshape = args -# -# xran_inv, yran_inv = xran[::-1], yran[::-1] -# -# for xi, yi in zip(xran_inv, yran_inv): -# xs, ys = xshape[xi], yshape[yi] -# cond = xs == ys or xs == 1 or ys == 1 -# if not cond: -# return False -# -# if not f: -# return True -# -# # Apply the callback to each aligned dimension pair -# for xi, yi in zip(xran_inv, yran_inv): -# f(xi, yi) - - def build_view(in_labels, out_labels): ''' Build an inverse map of dimension indices. 
Three conditions must hold for @@ -291,39 +262,12 @@ def build_global_shape(g_view, g_labels, op_shapes): g_shape = [sizes.pop() if len(sizes) > 0 else 1 for sizes in g_shape] - g_masks = [[s > 1 for s in view_shape] for view_shape in view_shapes] + g_masks = [[s > 1 or s == -1 for s in view_shape] + for view_shape in view_shapes] return g_shape, g_masks -def dim_strides(shape): - ''' - Returns the dimension strides for a tensor shape - ''' - strides = [] - stride = 1 - for size in shape[::-1]: - strides.append(stride) - stride = stride * size - return strides - - -def create_view(operand, *view_def): - ''' - Create and materialize a view. - - Parameters - ---------- - operand: - the base tensor operand - view_def: - include two lists which define the view's dimension sizes and strides - ''' - assert False, f'Diagonal and trace not implemented yet.' - view_shape, view_strides = view_def - return operand.create_view(view_shape, view_strides) - - def has_duplicated_labels(labels): ''' Returns True if there is any duplicate label. @@ -337,46 +281,17 @@ def diagonalize(labels, operand): Merges dimensions with duplicate labels. For those dimensions with duplicate labels, merge them into one dimension - which represents the diagonal elements. That requires the duplicate labeled - dimensions equal sized. The order of dimensions is kept unchanged up to - the left-most appearance of each label. + which represents the diagonal elements. This requires the dimensions with + duplicate labels are equal sized. Examples -------- 'ijj...i' would be merged into 'ij...' ''' - if not has_duplicated_labels(labels): - return labels, operand - - strides = dim_strides(operand.shape) - shape = operand.shape - new_labels = [] - new_shape = [] - new_strides = [] - - for ax, l in enumerate(labels): - if l == '.' 
or l not in new_labels: - # not duplicate - new_labels.append(l) - new_strides.append(strides[ax]) - new_shape.append(shape[ax]) - else: - # duplicate label - diag_ax = new_labels.index(l) - new_strides[diag_ax] += strides[ax] + assert not has_duplicated_labels(labels), ( + f'Duplicate labels are not supported.') - # Call framework API to build a new tensor - new_op = create_view(operand, new_shape, new_strides) - return new_labels, new_op - - -def prod(iter, default=1): - if len(iter): - res = 1 - for s in iter: - res *= s - return res - return default + return labels, operand def plan_reduce(plan, op, reduce_dims, keepdim): @@ -408,102 +323,108 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): op1_view, op2_view = [g_view[op] for op in (op1, op2)] - # Note, I may index into -1 - I1_dims = [op1_view[ax] for ax in I if op1_view[ax] >= 0] - I2_dims = [op2_view[ax] for ax in I if op2_view[ax] >= 0] - J1_dims = [op1_view[ax] for ax in J1] - J2_dims = [op2_view[ax] for ax in J2] - K1_dims = [op1_view[ax] for ax in K] - K2_dims = [op2_view[ax] for ax in K] + I1 = [idx for idx in I if op1_view[idx] >= 0] + I2 = [idx for idx in I if op2_view[idx] >= 0] + op1_view = np.array(op1_view) + op1_dims = op1_view[I1 + J1 + K] - op1_mask, op2_mask = [g_supports[op] for op in (op1, op2)] - op1_vshape = [s if m else 1 for s, m in zip(g_shape, op1_mask)] - op2_vshape = [s if m else 1 for s, m in zip(g_shape, op2_mask)] - - I1_shape, J1_shape, K1_shape = [[op1_vshape[ax] for ax in axes] - for axes in (I, J1, K)] - I2_shape, J2_shape, K2_shape = [[op2_vshape[ax] for ax in axes] - for axes in (I, J2, K)] + op2_view = np.array(op2_view) + op2_dims = op2_view[I2 + J2 + K] - K1_size, J1_size, J2_size = prod(K1_shape), prod(J1_shape), prod(J2_shape) + op1_mask, op2_mask = [g_supports[op] for op in (op1, op2)] + op1_vshape = np.array([s if m else 1 for s, m in zip(g_shape, op1_mask)]) + op2_vshape = np.array([s if m else 1 for s, m in zip(g_shape, op2_mask)]) + vshape = np.maximum(op1_vshape, op2_vshape) - perm1 = I1_dims + J1_dims + K1_dims - perm2 = I2_dims + J2_dims + K2_dims + i1, i2, j1, j2, k = map(len, (I1, I2, J1, J2, K)) - if any(i != dim for i, dim in enumerate(perm1)): + if any(op1_dims != np.arange(len(op1_dims))): # print(f'perm1: {perm1}') - step = transpose, [var1], var1, perm1 + step = transpose, [var1], var1, list(op1_dims) plan.add_step(step) - if any(i != dim for i, dim in enumerate(perm2)): + if any(op2_dims != np.arange(len(op2_dims))): # print(f'perm2: {perm2}') - step = transpose, [var2], var2, perm2 + step = transpose, [var2], var2, list(op2_dims) plan.add_step(step) - # In case of no K... dimensions, do a broadcast - if not K: - # unsqueeze operands include J1...J2... dimensions - if J2: - fill_start = len(I2_dims) + len(J1) - fill_end = fill_start + len(J2) - fill = list(range(fill_start, fill_end)) - step = unsqueeze, [var1], var1, fill - plan.add_step(step) - if J1: - fill_start = len(I2_dims) - fill_end = fill_start + len(J1) - fill = list(range(fill_start, fill_end)) - step = unsqueeze, [var2], var2, fill - plan.add_step(step) - # make broadcast - step = multiply, [var1, var2], var2 - plan.add_step(step) - # K... are there, let's reason about I... and J... - # In case I... and J... 
are empty, do the vector-vector version of matmul - elif not I and not J1 and not J2: - # merge K dimensions - if len(K) > 1: - for var in var1, var2: - step = reshape, [var], var, [K1_size] - plan.add_step(step) - # Build vector-vector matmul - step = matmul, [var1, var2], var2 - plan.add_step(step) - # General case, there are K... and some I... and J..., the actual operation will be - # matrix-vector or matrix-matrix multiplies, depending on the operands' shapes. - else: - # Merge J dims and K dims by reshaping - merged_shape1 = I1_shape + [J1_size] + [K1_size] - merged_shape2 = I2_shape + [J2_size] + [K1_size] + # Check if conditions hold for turnning the operation into a matmul + if j1 + j2 > 0 and k > 0 and -1 not in np.concatenate( + (op1_vshape, op2_vshape)): + op1_shape = list(op1_vshape[I]) + [np.prod(op1_vshape[J1]) + ] + [np.prod(op1_vshape[K])] + op2_shape = list(op2_vshape[I]) + [np.prod(op2_vshape[J2]) + ] + [np.prod(op2_vshape[K])] - step = reshape, [var1], var1, merged_shape1 + # Merge J dims and K dims by reshaping + step = reshape, [var1], var1, op1_shape plan.add_step(step) - step = reshape, [var2], var2, merged_shape2 + step = reshape, [var2], var2, op2_shape plan.add_step(step) # Matmul step = matmul, [var1, var2], var2, False, True plan.add_step(step) - # The result shape is in I..., J1, J2. Let's reshape back to known dimensions - # Note, this is static deduction, not by reading the tensor shape at runtime - result_shape = [1] * len(I) - for i, ax in enumerate(I): - result_shape[i] = max(op1_vshape[ax], op2_vshape[ax]) - if J1: - result_shape += J1_shape - if J2: - result_shape += J2_shape - - # Need a scalar dimension somehow - if result_shape: - step = reshape, [var2], var2, result_shape + # Reshape back + shape = list(vshape[I + J1 + J2]) + step = reshape, [var2], var2, shape plan.add_step(step) + elif j1 == j2 == k == 1: + # Can still do matmul even unknown shapes are present + step = matmul, [var1, var2], var2, False, True + plan.add_step(step) + + # In the rest cases we opt for ops other than matmul + else: + # unsqueeze operands include J1...J2... 
dimensions + if j2: + fill = list(range(i1 + j1, i1 + j1 + j2)) + step = unsqueeze, [var1], var1, fill + plan.add_step(step) + if j1: + fill = list(range(i2, i2 + j1)) + step = unsqueeze, [var2], var2, fill + plan.add_step(step) + # In case of no dimensions to contract, do an elementwise multiply + if k == 0: + # make broadcast + step = multiply, [var1, var2], var2 + plan.add_step(step) + # Contract and no join, turn into a dot + elif j1 + j2 == 0 and k == 1: + step = unsqueeze, [var1], var1, [-2] + plan.add_step(step) + step = unsqueeze, [var2], var2, [-1] + plan.add_step(step) + step = matmul, [var1, var2], var2 + plan.add_step(step) + step = squeeze, [var2], var2, [-1, -2] + plan.add_step(step) + elif j1 + j2 == 0 and not-1 in np.concatenate( + (op1_vshape[K], op2_vshape[K])): + assert all(op1_vshape[K] == op2_vshape[K]) + step = reshape, [var1], var1, list(op1_vshape[ + I]) + [1] + [np.prod(op1_vshape[K])] + plan.add_step(step) + step = reshape, [var2], var2, list(op2_vshape[ + I]) + [1] + [np.prod(op2_vshape[K])] + plan.add_step(step) + step = matmul, [var1, var2], var2, False, True + plan.add_step(step) + step = squeeze, [var2], var2, [-1, -2] + plan.add_step(step) + else: + step = multiply, [var1, var2], var2 + plan.add_step(step) + reduce_dims = list(range(-k, 0)) + plan_reduce(plan, op2, reduce_dims, keepdim=False) + # Wrap up, updating auxiliary data # Updating g_mask for I and J axes - for i, ax in enumerate(I + J1 + J2): - op2_mask[ax] = (result_shape[i] > 1) + for ax in I + J1 + J2: + op2_mask[ax] = vshape[ax] > 1 or vshape[ax] == -1 for ax in K: op2_mask[ax] = False @@ -514,6 +435,8 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): for ax in I + J1 + J2: op2_view[ax], dim = dim, dim + 1 + g_view[op2] = list(op2_view) + def plan_summation(plan, g_view, op1, op2, g_supports, g_shape, g_count, n_bcast): @@ -737,7 +660,6 @@ def plan_einsum(operands, g_view, g_shape, g_supports, g_count, n_bcast): return plan -@dygraph_only def einsum(equation, *operands): r""" einsum(equation, *operands) -- GitLab From 3041799953b5ccec0565a3bd8ca03fff467dce01 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 15 Mar 2022 17:20:00 +0800 Subject: [PATCH 076/176] Fixed issues with generated scale operator (#40482) * Fixed issues with generated scale operator * Fixed minor issues --- .../auto_code_generator/eager_generator.cc | 49 +++++++++---------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 6a2e5e7ac6c..bf838b27615 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -56,23 +56,29 @@ static std::string LegalizeVariableName(const std::string& var_name) { return ret; } -static bool IgnoreGradAttribute(const std::string& op_type, - const std::string& attr_name) { - // Attributes in operators_with_attrs are created manually during code - // generation - // We should ignore these arbitrary attrs when setting up grad attribute map - if (operators_with_attrs.count(op_type)) { - if (operators_with_attrs[op_type].count(attr_name)) { - return true; - } - } +static std::string HandleDynamicGradAttributes(const std::string& fwd_op_type, + const std::string& attrs_name) { + std::string additional_grad_attrs_str = ""; + + if (fwd_op_type == "sum") { + const char* GRAD_ATTRS_TEMPLATE = " %s[\"%s\"] = %s;\n"; + additional_grad_attrs_str = 
paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "scale", "float(1.0)"); + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias", "float(0.0f)"); + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias_after_scale", "bool(true)"); - // Only allow SumOp - if (op_type != "sum") { - return true; + } else if (fwd_op_type == "scale") { + const char* GRAD_ATTRS_TEMPLATE = " %s[\"%s\"] = %s;\n"; + + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias", "float(0.0f)"); + additional_grad_attrs_str += paddle::string::Sprintf( + GRAD_ATTRS_TEMPLATE, attrs_name, "bias_after_scale", "bool(true)"); } - return false; + return additional_grad_attrs_str; } static void PrepareAttrMapForOps() { @@ -1866,18 +1872,9 @@ static std::string GenerateSingleOpBase( const char* ATTRS_TEMPLATE = " auto& %s = this->attr_map_;\n"; std::string grad_attrs_str = paddle::string::Sprintf(ATTRS_TEMPLATE, attrs_name); - for (const auto& iter : grad_attrs) { - if (IgnoreGradAttribute(fwd_op_type, iter.first)) continue; - std::pair type_val = - GetAttrType(iter.second, false /*is_arg*/); - const char* GRAD_ATTRS_TEMPLATE = - " %s %s = %s;\n" - " %s[\"%s\"] = %s;\n"; - std::string var_name = iter.first + std::to_string(*outs_size); - grad_attrs_str += paddle::string::Sprintf( - GRAD_ATTRS_TEMPLATE, type_val.first, var_name, type_val.second, - attrs_name, iter.first, var_name); - } + + // Handle dynamic grad attributes + grad_attrs_str += HandleDynamicGradAttributes(fwd_op_type, attrs_name); generated_grad_function_body += grad_attrs_str; const char* TRACE_OP_TEMPLATE = -- GitLab From 4d886f75c9ac22760549ad76e9ad0ba893bac856 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Tue, 15 Mar 2022 17:43:11 +0800 Subject: [PATCH 077/176] run python api in eager model and filter the out in argument list (#40523) * run python api in eager model and filter the out in argument list * fix code --- paddle/fluid/imperative/tracer.cc | 6 ++-- paddle/fluid/imperative/tracer.h | 4 +-- paddle/fluid/pybind/imperative.cc | 28 ++++++++++++++-- .../paddle/fluid/tests/unittests/op_test.py | 32 ++++++++++--------- 4 files changed, 48 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index c55599cc9aa..d18c8e96c49 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -390,8 +390,8 @@ bool Tracer::ComputeRequiredGrad(const NameTensorMap& ins, } phi::KernelSignature Tracer::GetExpectedKernelSignature( - const std::string& type, const NameVarBaseMap& ins, - const NameVarBaseMap& outs, framework::AttributeMap attrs) const { + const std::string& type, const NameTensorMap& ins, + const NameTensorMap& outs, framework::AttributeMap attrs) const { auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); framework::RuntimeContext ctx({}, {}); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -406,7 +406,7 @@ phi::KernelSignature Tracer::GetExpectedKernelSignature( attr_checker == nullptr ? 
empty_attrs_map : attr_checker->GetDefaultAttrMap(); auto dygraph_exe_ctx = - imperative::DygraphExecutionContext( + imperative::DygraphExecutionContext( *op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, default_attrs); auto* opbase_with_kernel = diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index fd13fce6a6e..f24961885c9 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -156,8 +156,8 @@ class Tracer { } phi::KernelSignature GetExpectedKernelSignature( - const std::string& type, const NameVarBaseMap& ins, - const NameVarBaseMap& outs, framework::AttributeMap attrs) const; + const std::string& type, const NameTensorMap& ins, + const NameTensorMap& outs, framework::AttributeMap attrs) const; paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists( const platform::Place& place); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 9b373a58181..85427a8455b 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -52,11 +52,13 @@ limitations under the License. */ #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/op_function.h" #include "paddle/fluid/pybind/pybind_boost_headers.h" #include "paddle/fluid/pybind/slice_utils.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/core/compat/arg_map_context.h" +#include "paddle/phi/core/compat/type_defs.h" namespace paddle { namespace pybind { @@ -436,6 +438,28 @@ static imperative::NameVarBaseMap ConvertToNameVarBaseMap( return result; } +paddle::imperative::NameTensorMap ConvertToNameTensorMap( + const PyNameVarBaseMap &map) { + paddle::imperative::NameTensorMap result; + for (auto &pair : map) { + auto var_vec = CastPyArg2VectorOfTensor(pair.second.ptr(), 0); + if (!var_vec.empty()) { + // change vector -> vector> + std::vector> dst_var_vec; + for (auto &v : var_vec) { + dst_var_vec.emplace_back( + std::make_shared(std::move(v))); + } + result.emplace(pair.first, std::move(dst_var_vec)); + } + } + + PADDLE_ENFORCE_EQ( + PyErr_Occurred(), nullptr, + platform::errors::InvalidArgument(py::str(py::handle(PyErr_Occurred())))); + return result; +} + template static void VarBaseCopy(std::shared_ptr &src, // NOLINT imperative::VarBase &dst, // NOLINT @@ -2079,8 +2103,8 @@ void BindImperative(py::module *m_ptr) { const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, framework::AttributeMap attrs) { // TODO(xiongkun): move this function outside of tracer. 
- auto ins_map = ConvertToNameVarBaseMap(ins); - auto outs_map = ConvertToNameVarBaseMap(outs); + auto ins_map = ConvertToNameTensorMap(ins); + auto outs_map = ConvertToNameTensorMap(outs); { auto to_vector = [](paddle::SmallVector &vec) { return std::vector(vec.begin(), vec.end()); diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 457f20ac5b0..530ea2838a7 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -715,10 +715,11 @@ class OpTest(unittest.TestCase): assert related_idx >= 0, "%d-th arguments don't have default value" % idx return defaults[related_idx] - def remove_name(x): - if isinstance(x, list): return [i for i in x if i != 'name'] + def filter_by_name(x): + names = set(['name', 'out', 'output']) + if isinstance(x, list): return [i for i in x if i not in names] if isinstance(x, dict): - return {k: v for k, v in x.items() if k != 'name'} + return {k: v for k, v in x.items() if k not in names} assert False, "Only support list or dict." def to_defaults_list(params, defaults): @@ -728,7 +729,7 @@ class OpTest(unittest.TestCase): # Because we don't know the python api name of each arguments. # using parse_arg_and_kwargs, we can get the all api information we need. api_params, api_defaults = [ - remove_name(item) for item in parse_arg_and_kwargs(api) + filter_by_name(item) for item in parse_arg_and_kwargs(api) ] api_defaults = to_defaults_list(api_params, api_defaults) inputs_sig, attrs_sig, outputs_sig = kernel_sig @@ -784,10 +785,10 @@ class OpTest(unittest.TestCase): block = fluid.default_main_program().global_block() op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) # prepare input variable - inputs = self.append_input_output_for_dygraph(op_proto, self.inputs, - True, False, block) + eager_tensor_inputs = self.append_input_output_for_dygraph( + op_proto, self.inputs, True, False, block) # prepare output variable - outputs = self.append_input_output_for_dygraph( + eager_tensor_outputs = self.append_input_output_for_dygraph( op_proto, self.outputs, False, False, block) # prepare attrbutes @@ -798,13 +799,14 @@ class OpTest(unittest.TestCase): attrs_outputs[attrs_name] = self.attrs[attrs_name] kernel_sig = _dygraph_tracer()._get_kernel_signature( - self.op_type, inputs, outputs, attrs_outputs) + self.op_type, eager_tensor_inputs, eager_tensor_outputs, + attrs_outputs) assert hasattr( self, "python_api" ), "Please set the `self.python_api` if you want to compare python api output." - args = prepare_python_api_arguments(self.python_api, inputs, - attrs_outputs, kernel_sig) + args = prepare_python_api_arguments( + self.python_api, eager_tensor_inputs, attrs_outputs, kernel_sig) """ we directly return the cal_python_api value because the value is already tensor. 
""" return cal_python_api(self.python_api, args, kernel_sig) @@ -1286,11 +1288,11 @@ class OpTest(unittest.TestCase): with _test_eager_guard(): eager_dygraph_outs = self._calc_dygraph_output( place, no_check_set=no_check_set) - # we only check end2end api when check_eager=True - if hasattr(self, "python_api"): - api_outs = self._calc_python_api_output(place) - self._check_api_outs_by_dygraph_outs(api_outs, dygraph_outs, - place) + # we only check end2end api when check_eager=True + if hasattr(self, "python_api"): + api_outs = self._calc_python_api_output(place) + self._check_api_outs_by_dygraph_outs(api_outs, dygraph_outs, + place) outs, fetch_list = self._calc_output(place, no_check_set=no_check_set) -- GitLab From 9bdee4372e2b5bc61753963bb8492b7817a5dc4d Mon Sep 17 00:00:00 2001 From: Roc <30228238+sljlp@users.noreply.github.com> Date: Tue, 15 Mar 2022 18:49:45 +0800 Subject: [PATCH 078/176] add number count op (#39224) * add expert count op add ut for expert_count * update UT only for cuda * fix for rocm * update ut * add moe module * add expert count op add ut for expert_count * update UT only for cuda * update ut * add moe module * make expert count private * rename expert count op Co-authored-by: hlygit66666 <2570058140@qq.com> --- paddle/fluid/operators/number_count_op.cc | 66 +++++++++++ paddle/fluid/operators/number_count_op.cu | 108 ++++++++++++++++++ paddle/fluid/operators/number_count_op.h | 37 ++++++ python/paddle/distributed/models/__init__.py | 13 +++ .../paddle/distributed/models/moe/__init__.py | 13 +++ python/paddle/distributed/models/moe/utils.py | 55 +++++++++ .../tests/unittests/test_number_count_op.py | 80 +++++++++++++ 7 files changed, 372 insertions(+) create mode 100644 paddle/fluid/operators/number_count_op.cc create mode 100644 paddle/fluid/operators/number_count_op.cu create mode 100644 paddle/fluid/operators/number_count_op.h create mode 100644 python/paddle/distributed/models/__init__.py create mode 100644 python/paddle/distributed/models/moe/__init__.py create mode 100644 python/paddle/distributed/models/moe/utils.py create mode 100644 python/paddle/fluid/tests/unittests/test_number_count_op.py diff --git a/paddle/fluid/operators/number_count_op.cc b/paddle/fluid/operators/number_count_op.cc new file mode 100644 index 00000000000..8f7a3b82acf --- /dev/null +++ b/paddle/fluid/operators/number_count_op.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/number_count_op.h" + +namespace paddle { +namespace operators { + +class NumberCountOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("gate_idx"), "Input", "gate_idx", + "NumberCount"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "number_count", + "NumberCount"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + // the dtype of the gate_idx should be same as int64 + auto gate_idx_dtype = + OperatorWithKernel::IndicateVarDataType(ctx, "gate_idx"); + + PADDLE_ENFORCE_EQ(gate_idx_dtype, framework::proto::VarType::INT64, + platform::errors::InvalidArgument( + "The dtype of the gate_idx_dtype should be int64")); + return framework::OpKernelType(gate_idx_dtype, ctx.GetPlace()); + } +}; + +class NumberCountOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("gate_idx", "(Tensor) The input gate index tensor."); + AddOutput("Out", "(Tensor) The output expert count tensor."); + AddAttr("upper_range", "(int), The number of experts."); + + AddComment(R"DOC(number_count Operator.count gate indices.)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CPU_KERNEL(number_count, ops::NumberCountOpCPUKernel, + ops::NumberCountOpCPUKernel); + +REGISTER_OP_WITHOUT_GRADIENT(number_count, ops::NumberCountOp, + ops::NumberCountOpMaker); diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu new file mode 100644 index 00000000000..97e4b4f2845 --- /dev/null +++ b/paddle/fluid/operators/number_count_op.cu @@ -0,0 +1,108 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/number_count_op.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +#define CEIL(_x_, _y_) (((_x_)-1) / (_y_) + 1) +#define PERTHREAD_EXPERTS 256 +#define WARP_SIZE 32 + +const int CUDA_NUM_THREADS = 512; +static inline int GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +__global__ void initialize_zero_kernel(T* data, const int length) { + CUDA_KERNEL_LOOP(idx, length) { data[idx] = static_cast(0); } +} + +template +__global__ void NumberCount(const T* gate_idx, T* number_count, + int64_t batch_size, int upper_range) { + int res_tmp[PERTHREAD_EXPERTS] = {0}; + int expert_min = blockIdx.x * PERTHREAD_EXPERTS; + int expert_max = expert_min + PERTHREAD_EXPERTS; + if (expert_max > upper_range) { + expert_max = upper_range; + } + for (int i = threadIdx.x; i < batch_size; i += blockDim.x) { + T idx = gate_idx[i]; + if (idx == -1) { + continue; + } + if (idx < expert_min || idx >= expert_max) { + continue; + } + res_tmp[idx - expert_min] += 1; + } + for (int i = expert_min; i < expert_max; ++i) { + int x = res_tmp[i - expert_min]; +#pragma unroll + for (int j = 1; j < WARP_SIZE; j <<= 1) { +#ifdef __HIPCC__ + x = x + __shfl_down(x, j); +#else + x = x + __shfl_down_sync(-1u, x, j); +#endif + } + if (threadIdx.x % WARP_SIZE == 0) { + platform::CudaAtomicAdd(number_count + i, x); + } + } +} + +template +class NumberCountOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto gate_idx = context.Input("gate_idx"); + auto upper_range = context.Attr("upper_range"); + auto number_count = context.Output("Out"); + + int64_t batch_size = gate_idx->numel(); + auto place = context.GetPlace(); + const auto& dev_ctx = + context.template device_context(); + + framework::DDim out_dims = phi::make_ddim({upper_range}); + auto out_data = number_count->mutable_data(out_dims, place); + const T* gate_data = gate_idx->data(); + + initialize_zero_kernel< + T><<>>( + out_data, upper_range); + + NumberCount< + T><<>>( + gate_data, out_data, batch_size, upper_range); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(number_count, ops::NumberCountOpCUDAKernel); diff --git a/paddle/fluid/operators/number_count_op.h b/paddle/fluid/operators/number_count_op.h new file mode 100644 index 00000000000..95e64946fb8 --- /dev/null +++ b/paddle/fluid/operators/number_count_op.h @@ -0,0 +1,37 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
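[Editor's note, not part of the recorded patch: the CUDA kernel above builds a histogram of gate indices. Each block owns PERTHREAD_EXPERTS consecutive expert ids, each thread accumulates private counts over the batch, and the partial sums are combined with warp shuffles plus an atomicAdd into the output buffer. A NumPy sketch of the same semantics, matching the count() helper used by the unit test at the end of this patch:]

    import numpy as np

    def number_count_reference(gate_idx, upper_range):
        # Count how many entries select each expert id in [0, upper_range);
        # negative or out-of-range indices are ignored, as in the kernel.
        out = np.zeros(upper_range, dtype=np.int64)
        for i in gate_idx.reshape(-1):
            if 0 <= i < upper_range:
                out[i] += 1
        return out
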
+ +#pragma once +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#if defined(PADDLE_WITH_GLOO) +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + +namespace paddle { +namespace operators { + +template +class NumberCountOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support expert count op for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/distributed/models/__init__.py b/python/paddle/distributed/models/__init__.py new file mode 100644 index 00000000000..e1663029ef1 --- /dev/null +++ b/python/paddle/distributed/models/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/distributed/models/moe/__init__.py b/python/paddle/distributed/models/moe/__init__.py new file mode 100644 index 00000000000..e1663029ef1 --- /dev/null +++ b/python/paddle/distributed/models/moe/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/distributed/models/moe/utils.py b/python/paddle/distributed/models/moe/utils.py new file mode 100644 index 00000000000..fd98c64318c --- /dev/null +++ b/python/paddle/distributed/models/moe/utils.py @@ -0,0 +1,55 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import in_dygraph_mode + + +def _number_count(gate_idx, upper_range): + """ + calculate the expert count according to the gate index. + Args: + gate_idx (Tensor): Tensor. 
The input gate index whose data type should be int32 or int64. + upper_range (int): The number of the experts. + Returns: + out (Tensor): The output expert count. + Examples: + .. code-block:: python + # required: distributed + import paddle + + gate_idx = [ + [0, 2], + [0, 2] + ] + upper_range = 6 + gate_idx = paddle.to_tensor(gate_idx, dtype="int32") + number_count = paddle.distributed.utils.number_count(gate_idx, upper_range) + print(number_count) # the result: [2, 0, 2, 0, 0, 0] + """ + if in_dygraph_mode(): + return core.ops.number_count(gate_idx, 'upper_range', upper_range) + else: + op_type = 'number_count' + + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=gate_idx.dtype) + + helper.append_op( + type=op_type, + inputs={'gate_idx': gate_idx}, + outputs={'Out': out}, + attrs={'upper_range': upper_range}) + return out diff --git a/python/paddle/fluid/tests/unittests/test_number_count_op.py b/python/paddle/fluid/tests/unittests/test_number_count_op.py new file mode 100644 index 00000000000..0df9d2a3a41 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_number_count_op.py @@ -0,0 +1,80 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import op_test +import numpy as np +import unittest +import paddle +import paddle.fluid.core as core +from paddle.fluid.op import Operator +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.backward import append_backward +from paddle.distributed.models.moe import utils + + +def count(x, upper_range): + res = np.zeros((upper_range, )).astype(int) + for i in x.reshape(-1): + if i >= 0 and i < len(res): + res[i] += 1 + return res + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestExpertCountOpInt64(op_test.OpTest): + def setUp(self): + expert_num = 16 + self.op_type = "number_count" + x = np.random.randint(-1, expert_num, size=(1000, 2)).astype('int64') + self.inputs = {'gate_idx': x} + self.outputs = {'Out': count(x, expert_num)} + self.attrs = {"upper_range": expert_num} + + def test_forward(self): + self.check_output_with_place(paddle.CUDAPlace(0)) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestExpertCountAPI(unittest.TestCase): + def setUp(self): + self.upper_range = 320 + self.x = np.random.randint( + -1, self.upper_range, size=(6000, 200)).astype('int64') + self.out = count(self.x, self.upper_range) + self.place = paddle.CUDAPlace(0) + + def test_api_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('x', self.x.shape, dtype="int64") + out = utils._number_count(x, self.upper_range) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'x': self.x}, fetch_list=[out]) + assert np.allclose(res, self.out) + + def test_api_dygraph(self): + paddle.disable_static() + x = paddle.to_tensor(self.x) + out = utils._number_count(x, self.upper_range) + assert np.allclose(out.numpy(), self.out) + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() -- GitLab From 31729a6283efac7e15e112de3f38114383e2380b Mon Sep 17 00:00:00 2001 From: Liu-xiandong <85323580+Liu-xiandong@users.noreply.github.com> Date: Tue, 15 Mar 2022 19:09:23 +0800 Subject: [PATCH 079/176] [phi] modify the shape OP and move inferMeta of shape,matrix_pow,multi_dot (#40506) * [phi] move matrix_power op * MatrixInverse fluid -> phi * modify the CMake to fix compile bug * delete useless comment * mutable memory -> phi Alloc * modify the include file * modify the include file * fix bug in CI compiler * [phi]modify the shape OP and move inferMeta of shape,matrix_pow,multi_dot * delete useless comment * fix bug in CI * modify after review --- paddle/fluid/operators/matrix_power_op.cc | 30 ++----- paddle/fluid/operators/multi_dot_op.cc | 88 +++---------------- paddle/fluid/operators/shape_op.cc | 21 ++--- paddle/phi/infermeta/multiary.cc | 73 +++++++++++++++ paddle/phi/infermeta/multiary.h | 2 + paddle/phi/infermeta/unary.cc | 28 ++++++ paddle/phi/infermeta/unary.h | 4 + paddle/phi/kernels/cpu/shape_kernel.cc | 33 ------- paddle/phi/kernels/impl/shape_kernel_impl.h | 36 -------- .../phi/kernels/selected_rows/shape_kernel.cc | 11 +-- .../{gpu/shape_kernel.cu => shape_kernel.cc} | 40 ++++++++- 11 files changed, 173 insertions(+), 193 deletions(-) delete mode 100644 paddle/phi/kernels/cpu/shape_kernel.cc delete mode 100644 paddle/phi/kernels/impl/shape_kernel_impl.h rename paddle/phi/kernels/{gpu/shape_kernel.cu => shape_kernel.cc} (53%) diff --git a/paddle/fluid/operators/matrix_power_op.cc b/paddle/fluid/operators/matrix_power_op.cc 
index cdf204628b6..56f65340ea9 100644 --- a/paddle/fluid/operators/matrix_power_op.cc +++ b/paddle/fluid/operators/matrix_power_op.cc @@ -14,8 +14,11 @@ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -23,26 +26,6 @@ namespace operators { class MatrixPowerOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "matrix_power"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "matrix_power"); - auto dims = ctx->GetInputDim("X"); - auto n_dim = dims.size(); - PADDLE_ENFORCE_GE(n_dim, 2, - platform::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions. But " - "received a %d dimension tensor.", - n_dim)); - PADDLE_ENFORCE_EQ(dims[n_dim - 2], dims[n_dim - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) all should " - "be square matrices " - "But received X's shape[-2] = %d and shape[-1] = %d.", - dims[n_dim - 2], dims[n_dim - 1])); - ctx->SetOutputDim("Out", dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class MatrixPowerOpMaker : public framework::OpProtoAndCheckerMaker { @@ -116,9 +99,14 @@ class MatrixPowerGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(matrix_power, MatrixPowerInferShapeFunctor, + PD_INFER_META(phi::MatrixPowerInferMeta)); + REGISTER_OPERATOR(matrix_power, ops::MatrixPowerOp, ops::MatrixPowerOpMaker, ops::MatrixPowerOpInferVarType, ops::MatrixPowerGradOpMaker, - ops::MatrixPowerGradOpMaker); + ops::MatrixPowerGradOpMaker, + MatrixPowerInferShapeFunctor); REGISTER_OPERATOR(matrix_power_grad, ops::MatrixPowerGradOp); diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index b309e1b87ef..5b107ce643d 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -16,77 +16,19 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -/** - * @brief compute the output shape and check the input shape valid or not - */ -inline framework::DDim ComputeAndCheckShape( - const bool is_runtime, const std::vector& inputs_dims) { - const size_t n = inputs_dims.size(); - auto first_dim = inputs_dims[0]; - - bool is_vector = false; - framework::DDim out_dim; - - PADDLE_ENFORCE_LT( - first_dim.size(), static_cast(3), - platform::errors::InvalidArgument( - "multi_dot: the first input tensor must be 1D or 2D but got[%d]!", - static_cast(first_dim.size()))); - - // If the first tensor is 1D of size n view it as a row vector (1, n) - if (first_dim.size() == 1) { - first_dim = phi::make_ddim({1, static_cast(first_dim[0])}); - is_vector = true; - } - - auto last_dim = inputs_dims[n - 1]; - PADDLE_ENFORCE_LT( - last_dim.size(), static_cast(3), - platform::errors::InvalidArgument( - "the last input tensor of multi_dot must be 1D or 2D but got[%d]!", - static_cast(first_dim.size()))); - - // If the last tensor is 1D of size n view it as a column vector (n, 1) - if (last_dim.size() == 1) { - last_dim = phi::make_ddim({static_cast(last_dim[0]), 1}); - out_dim = is_vector ? phi::make_ddim({1}) : phi::make_ddim({first_dim[0]}); - } else { - out_dim = is_vector ? phi::make_ddim({last_dim[1]}) - : phi::make_ddim({first_dim[0], last_dim[1]}); - } - - auto width = first_dim[1]; - for (size_t i = 1; i < n - 1; i++) { - PADDLE_ENFORCE_EQ(inputs_dims[i].size(), static_cast(2), - platform::errors::InvalidArgument( - "the input tensor of multi_dot op must be 2D.")); - - const auto& tmp_dim = inputs_dims[i]; - PADDLE_ENFORCE_EQ( - tmp_dim[0], width, - platform::errors::InvalidArgument( - "the input matrix does not meet the multiplication requirements.")); - width = tmp_dim[1]; - } - - PADDLE_ENFORCE_EQ( - last_dim[0], width, - platform::errors::InvalidArgument( - "the input matrix does not meet the multiplication requirements.")); - - return out_dim; -} - class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -105,22 +47,6 @@ If the first argument is 1-D it is treated as a row vector. 
If the last argument class MultiDotOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "multi_dot"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "multi_dot"); - - auto inputs_dims = ctx->GetInputsDim("X"); - - const size_t inputs_num = inputs_dims.size(); - PADDLE_ENFORCE_GT( - inputs_num, static_cast(1), - platform::errors::InvalidArgument( - "The number of input tensors in multi_dot op should > 1.")); - auto out_dims = ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", "Out"); - } }; class MultiDotOpGrad : public framework::OperatorWithKernel { @@ -171,9 +97,15 @@ class MultiDotOpDoubleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(multi_dot, MultiDotInferShapeFunctor, + PD_INFER_META(phi::MultiDotInferMeta)); + REGISTER_OPERATOR(multi_dot, ops::MultiDotOp, ops::MultiDotOpMaker, ops::MultiDotOpGradMaker, - ops::MultiDotOpGradMaker); + ops::MultiDotOpGradMaker, + MultiDotInferShapeFunctor); + REGISTER_OPERATOR(multi_dot_grad, ops::MultiDotOpGrad, ops::MultiDotOpDoubleGradMaker, ops::MultiDotOpDoubleGradMaker); diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index e2c8359beb1..9001ce5d51d 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -22,17 +25,6 @@ class ShapeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, - platform::errors::InvalidArgument( - "Input (Input) of get_shape op should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output (Out) of get_shape op should not be null.")); - auto in_dim = ctx->GetInputDim("Input"); - ctx->SetOutputDim("Out", {in_dim.size()}); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = @@ -89,7 +81,12 @@ Return the shape of the input. 
namespace ops = paddle::operators; namespace plat = paddle::platform; + +DECLARE_INFER_SHAPE_FUNCTOR(shape, ShapeInferShapeFunctor, + PD_INFER_META(phi::ShapeInferMeta)); + REGISTER_OPERATOR( shape, ops::ShapeOp, ops::ShapeOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + ShapeInferShapeFunctor); diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 84441ed8b74..ef75ab573c6 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -369,6 +369,79 @@ void ConcatInferMeta(const std::vector& x, out->share_lod(*x.at(0)); } +void MultiDotInferMeta(const std::vector& x, MetaTensor* out) { + auto inputs_dims = GetMetaTensorsDim(x); + + const size_t inputs_num = inputs_dims.size(); + PADDLE_ENFORCE_GT( + inputs_num, + static_cast(1), + phi::errors::InvalidArgument( + "The number of input tensors in multi_dot op should > 1.")); + + const size_t n = inputs_dims.size(); + auto first_dim = inputs_dims[0]; + + bool is_vector = false; + phi::DDim out_dim; + + PADDLE_ENFORCE_LT( + first_dim.size(), + static_cast(3), + phi::errors::InvalidArgument( + "multi_dot: the first input tensor must be 1D or 2D but got[%d]!", + static_cast(first_dim.size()))); + + // If the first tensor is 1D of size n view it as a row vector (1, n) + if (first_dim.size() == 1) { + first_dim = phi::make_ddim({1, static_cast(first_dim[0])}); + is_vector = true; + } + + auto last_dim = inputs_dims[n - 1]; + PADDLE_ENFORCE_LT( + last_dim.size(), + static_cast(3), + phi::errors::InvalidArgument( + "the last input tensor of multi_dot must be 1D or 2D but got[%d]!", + static_cast(first_dim.size()))); + + // If the last tensor is 1D of size n view it as a column vector (n, 1) + if (last_dim.size() == 1) { + last_dim = phi::make_ddim({static_cast(last_dim[0]), 1}); + out_dim = is_vector ? phi::make_ddim({1}) : phi::make_ddim({first_dim[0]}); + } else { + out_dim = is_vector ? 
phi::make_ddim({last_dim[1]}) + : phi::make_ddim({first_dim[0], last_dim[1]}); + } + + auto width = first_dim[1]; + for (size_t i = 1; i < n - 1; i++) { + PADDLE_ENFORCE_EQ(inputs_dims[i].size(), + static_cast(2), + phi::errors::InvalidArgument( + "the input tensor of multi_dot op must be 2D.")); + + const auto& tmp_dim = inputs_dims[i]; + PADDLE_ENFORCE_EQ( + tmp_dim[0], + width, + phi::errors::InvalidArgument( + "the input matrix does not meet the multiplication requirements.")); + width = tmp_dim[1]; + } + + PADDLE_ENFORCE_EQ( + last_dim[0], + width, + phi::errors::InvalidArgument( + "the input matrix does not meet the multiplication requirements.")); + + out->set_dims(out_dim); + out->set_dtype(x.at(0)->dtype()); + out->share_lod(*x.at(0)); +} + void PsroiPoolInferMeta(const MetaTensor& x, const MetaTensor& rois, paddle::optional rois_num, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index c11843212ed..0bdd35d5f58 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -70,6 +70,8 @@ void ConcatInferMeta(const std::vector& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void MultiDotInferMeta(const std::vector& x, MetaTensor* out); + void PsroiPoolInferMeta(const MetaTensor& x, const MetaTensor& rois, paddle::optional rois_num, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 4d1cb42bd59..752abae1b03 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -554,6 +554,28 @@ void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) { out->set_dtype(DataType::BOOL); } +void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out) { + auto dims = x.dims(); + auto n_dim = dims.size(); + PADDLE_ENFORCE_GE(n_dim, + 2, + phi::errors::InvalidArgument( + "The Input(X) should have at least 2 dimensions. 
But " + "received a %d dimension tensor.", + n_dim)); + PADDLE_ENFORCE_EQ(dims[n_dim - 2], + dims[n_dim - 1], + phi::errors::InvalidArgument( + "The inner-most 2 dimensions of Input(X) all should " + "be square matrices " + "But received X's shape[-2] = %d and shape[-1] = %d.", + dims[n_dim - 2], + dims[n_dim - 1])); + out->set_dims(dims); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + void MaxPoolWithIndexInferMeta(const MetaTensor& x, const std::vector& kernel_size, const std::vector& strides, @@ -994,6 +1016,12 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, ReshapeInferMeta(x, shape, out, config); } +void ShapeInferMeta(const MetaTensor& input, MetaTensor* out) { + auto in_dim = input.dims(); + out->set_dims(phi::make_ddim({in_dim.size()})); + out->set_dtype(DataType::INT32); +} + void ShardIndexInferMeta(const MetaTensor& in, int index_num, int nshards, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 75fb9fadf82..a9aefd1f12d 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -98,6 +98,8 @@ void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out); void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out); +void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out); + void MaxPoolWithIndexInferMeta(const MetaTensor& x, const std::vector& kernel_size, const std::vector& strides, @@ -162,6 +164,8 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void ShapeInferMeta(const MetaTensor& input, MetaTensor* out); + void ShardIndexInferMeta(const MetaTensor& in, int index_num, int nshards, diff --git a/paddle/phi/kernels/cpu/shape_kernel.cc b/paddle/phi/kernels/cpu/shape_kernel.cc deleted file mode 100644 index 073dc63b2a4..00000000000 --- a/paddle/phi/kernels/cpu/shape_kernel.cc +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/shape_kernel.h" -#include "paddle/phi/kernels/impl/shape_kernel_impl.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" - -PD_REGISTER_KERNEL(shape, - CPU, - ALL_LAYOUT, - phi::ShapeKernel, - bool, - int, - int8_t, - uint8_t, - int64_t, - float, - double, - phi::dtype::complex, - phi::dtype::complex) {} diff --git a/paddle/phi/kernels/impl/shape_kernel_impl.h b/paddle/phi/kernels/impl/shape_kernel_impl.h deleted file mode 100644 index 982cfb33f6b..00000000000 --- a/paddle/phi/kernels/impl/shape_kernel_impl.h +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" - -namespace phi { - -template -void ShapeKernel(const Context& ctx, - const DenseTensor& input, - DenseTensor* out) { - auto in_var = &input; - phi::DDim in_dims; - in_dims = in_var->dims(); - auto out_t = out; - out_t->Resize({in_dims.size()}); - auto out_data = ctx.template HostAlloc(out_t); - for (int i = 0; i < in_dims.size(); ++i) { - out_data[i] = in_dims[i]; - } -} - -} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/shape_kernel.cc b/paddle/phi/kernels/selected_rows/shape_kernel.cc index 9bcd5d8544e..67126d82042 100644 --- a/paddle/phi/kernels/selected_rows/shape_kernel.cc +++ b/paddle/phi/kernels/selected_rows/shape_kernel.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/shape_kernel.h" namespace phi { namespace sr { @@ -25,15 +26,7 @@ template void ShapeKernel(const Context& ctx, const SelectedRows& input, DenseTensor* out) { - auto in_var = input; - phi::DDim in_dims; - in_dims = in_var.value().dims(); - auto out_t = out; - out_t->Resize({in_dims.size()}); - auto out_data = ctx.template HostAlloc(out_t); - for (int i = 0; i < in_dims.size(); ++i) { - out_data[i] = in_dims[i]; - } + phi::ShapeKernel(ctx, input.value(), out); } } // namespace sr diff --git a/paddle/phi/kernels/gpu/shape_kernel.cu b/paddle/phi/kernels/shape_kernel.cc similarity index 53% rename from paddle/phi/kernels/gpu/shape_kernel.cu rename to paddle/phi/kernels/shape_kernel.cc index 39b6eaeaef2..dd26a7edc9c 100644 --- a/paddle/phi/kernels/gpu/shape_kernel.cu +++ b/paddle/phi/kernels/shape_kernel.cc @@ -13,12 +13,43 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/phi/kernels/shape_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/float16.h" +#include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/shape_kernel_impl.h" +namespace phi { + +template +void ShapeKernel(const Context& ctx, + const DenseTensor& input, + DenseTensor* out) { + auto in_var = &input; + phi::DDim in_dims; + in_dims = in_var->dims(); + auto out_t = out; + out_t->Resize({in_dims.size()}); + auto out_data = ctx.template HostAlloc(out_t); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(shape, + CPU, + ALL_LAYOUT, + phi::ShapeKernel, + bool, + int, + int8_t, + uint8_t, + int64_t, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(shape, GPU, ALL_LAYOUT, @@ -33,3 +64,4 @@ PD_REGISTER_KERNEL(shape, phi::dtype::complex, phi::dtype::complex, phi::dtype::float16) {} +#endif -- GitLab From 7ced301785f8a4467e4734904df7a180bbe08584 Mon Sep 17 00:00:00 2001 From: Guanghua Yu <742925032@qq.com> Date: Tue, 15 Mar 2022 20:38:00 +0800 Subject: [PATCH 080/176] Support some ops for full quantization (#40083) * add some op for full_quantization --- .../post_training_quantization.py | 2 - .../slim/quantization/quantization_pass.py | 193 +++++++++++++++++- 2 files changed, 182 insertions(+), 13 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 97b4116826a..d614630b3db 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -979,8 +979,6 @@ class PostTrainingQuantization(object): if op.type in ( self._quantizable_op_type + self._out_scale_op_list): out_var_names = _get_op_output_var_names(op) - assert len(out_var_names) == 1, "Post training " + \ - "quantization only support one output for " + op.type for var_name in out_var_names: analysis_and_save_info(op, var_name) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index efa000274d0..afca617b6dd 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -59,6 +59,7 @@ _out_scale_op_list = [ "tanh", "prelu", "swish", + "dropout", "softmax", "batch_norm", "layer_norm", @@ -68,6 +69,8 @@ _out_scale_op_list = [ "transpose2", "concat", "elementwise_mul", + "elementwise_pow", + "elementwise_sub", "scale", "slice", "hard_swish", @@ -81,8 +84,54 @@ _out_scale_op_list = [ "flatten2", "transpose", "pad2d", + "pad3d", "reshape", - "layer_norm", + "split", + "flatten_contiguous_range", + "squeeze", + "squeeze2", + "nearest_interp_v2", + "fill_constant_batch_size_like", + "bilinear_interp", + "bilinear_interp_v2", + "arg_max", + "abs", + "assign", + "cast", + "clip", + "box_coder", + "crop", + "cumsum", + "equal", + "expand_v2", + "fill_any_like", + "fill_constant", + "gelu", + "instance_norm", + "lookup_table", + "lookup_table_v2", + "norm", + "p_norm", + "pow", + "reduce_mean", + "stack", + "top_k_v2", + "unsqueeze", + "unsqueeze2", + "logical_and", + "logical_not", + "meshgrid", + "roi_align", + "strided_slice", + 
"where", + "grid_sampler", + "tile", + "group_norm", + "reduce_sum", + "square", + "softplus", + "gather", + "shuffle_channel", ] # list op real input and output names, to avoid processing input such as AxisTensor. @@ -119,7 +168,7 @@ _op_real_in_out_name = { "relu": [["X"], ["Out"]], "relu6": [["X"], ["Out"]], "leaky_relu": [["X"], ["Out"]], - "prelu": [["X"], ["Out"]], + "prelu": [["X", "Alpha"], ["Out"]], "tanh": [["X"], ["Out"]], "swish": [["X"], ["Out"]], "dropout": [["X"], ["Out"]], @@ -127,16 +176,59 @@ _op_real_in_out_name = { "layer_norm": [["X"], ["Y"]], "sigmoid": [["X"], ["Out"]], "elementwise_mul": [["X", "Y"], ["Out"]], + "elementwise_pow": [["X", "Y"], ["Out"]], "scale": [["X"], ["Out"]], "hard_swish": [["X"], ["Out"]], "hard_sigmoid": [["X"], ["Out"]], "gru": [["Input", "Weight"], ["Hidden"]], "lstm": [["Input", "Weight"], ["Hidden"]], "pad2d": [["X"], ["Out"]], + "pad3d": [["X"], ["Out"]], "flatten": [["X"], ["Out"]], "flatten2": [["X"], ["Out"]], "unsqueeze2": [["X"], ["Out"]], - "flatten_contiguous_range": [['X'], ["Out"]], + "unsqueeze2": [["X"], ["Out"]], + "flatten_contiguous_range": [["X"], ["Out"]], + "split": [["X"], ["Out"]], + "squeeze2": [["X"], ["Out"]], + "nearest_interp_v2": [["X"], ["Out"]], + "bilinear_interp": [["X"], ["Out"]], + "bilinear_interp_v2": [["X"], ["Out"]], + "fill_constant_batch_size_like": [["Input"], ["Out"]], + "arg_max": [["X"], ["Out"]], + "abs": [["X"], ["Out"]], + "assign": [["X"], ["Out"]], + "cast": [["X"], ["Out"]], + "clip": [["X"], ["Out"]], + "box_coder": [["PriorBox"], ["OutputBox"]], + "crop": [["X"], ["Out"]], + "cumsum": [["X"], ["Out"]], + "expand_v2": [["X"], ["Out"]], + "fill_any_like": [["X"], ["Out"]], + "fill_constant": [[], ["Out"]], + "gelu": [["X"], ["Out"]], + "instance_norm": [["X"], ["Out"]], + "lookup_table": [["W", "Ids"], ["Out"]], + "lookup_table_v2": [["W", "Ids"], ["Out"]], + "norm": [["X"], ["Norm"]], + "p_norm": [["X"], ["Out"]], + "pow": [["X"], ["Out"]], + "reduce_mean": [["X"], ["Out"]], + "stack": [["X"], ["Y"]], + "top_k_v2": [["X"], ["Out", "Indices"]], + "logical_and": [["X", "Y"], ["Out"]], + "logical_not": [["X"], ["Out"]], + "meshgrid": [["X"], ["Out"]], + "roi_align": [["X", "ROIs"], ["Out"]], + "strided_slice": [["Input"], ["Out"]], + "where": [["Condition", "X", "Y"], ["Out"]], + "grid_sampler": [["X", "Grid"], ["Output"]], + "tile": [["X"], ["Out"]], + "group_norm": [["X"], ["Y", "Mean", "Variance"]], + "reduce_sum": [["X"], ["Out"]], + "square": [["X"], ["Out"]], + "softplus": [["X"], ["Out"]], + "shuffle_channel": [["X"], ["Out"]], } _conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose'] @@ -1797,14 +1889,93 @@ class AddQuantDequantPass(object): quantized ops's inputs. 
""" _supported_quantizable_op_type = [ - "pool2d", "elementwise_add", "concat", "softmax", "argmax", "transpose", - "equal", "gather", "greater_equal", "greater_than", "less_equal", - "less_than", "mean", "not_equal", "reshape", "reshape2", - "bilinear_interp", "nearest_interp", "trilinear_interp", "slice", - "squeeze", "elementwise_sub", "mul", "matmul", "relu", "relu6", - "leaky_relu", "tanh", "swish", "scale", "transpose", "transpose2", - "sigmoid", "pad2d", "flatten", "flatten2", "batch_norm", "layer_norm", - "matmul_v2" + "pool2d", + "elementwise_add", + "concat", + "softmax", + "argmax", + "transpose", + "equal", + "gather", + "greater_equal", + "greater_than", + "less_equal", + "less_than", + "mean", + "not_equal", + "reshape", + "reshape2", + "dropout", + "bilinear_interp", + "nearest_interp", + "trilinear_interp", + "slice", + "squeeze", + "elementwise_sub", + "mul", + "matmul", + "relu", + "relu6", + "leaky_relu", + "tanh", + "swish", + "scale", + "transpose", + "transpose2", + "sigmoid", + "pad2d", + "flatten", + "flatten2", + "batch_norm", + "layer_norm", + "matmul_v2", + "split", + "flatten_contiguous_range", + "squeeze2", + "nearest_interp_v2", + "bilinear_interp", + "bilinear_interp_v2", + "fill_constant_batch_size_like", + "arg_max", + "abs", + "assign", + "cast", + "clip", + "box_coder", + "crop", + "cumsum", + "elementwise_mul", + "elementwise_pow", + "expand_v2", + "fill_any_like", + "fill_constant", + "gelu", + "hard_sigmoid", + "hard_swish", + "instance_norm", + "lookup_table", + "lookup_table_v2", + "norm", + "p_norm", + "pad3d", + "pow", + "prelu", + "reduce_mean", + "unsqueeze", + "unsqueeze2", + "logical_and", + "logical_not", + "meshgrid", + "roi_align", + "strided_slice", + "where", + "grid_sampler", + "tile", + "group_norm", + "reduce_sum", + "square", + "softplus", + "shuffle_channel", ] # To be compatible with PaddleSlim, not remove _activation_type for now -- GitLab From 0c0acbd763bfbbdfb988e09fe4b1753c32b3a6f2 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 15 Mar 2022 23:11:17 +0800 Subject: [PATCH 081/176] remove cmake kernel print info (#40550) --- paddle/phi/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 7b074d0ebb7..04e1bbcc9df 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -25,8 +25,6 @@ add_subdirectory(tests) # make an unity target for compile deps set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor) get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) -# keep this message for debug, remove it later if needless -message(STATUS "All standard phi kernels: ${phi_kernels}") set(PHI_DEPS ${PHI_DEPS} ${phi_kernels}) cc_library(phi DEPS ${PHI_DEPS}) -- GitLab From a04a6bd5bd692aa06150b90b276a3863bfb1eb37 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 15 Mar 2022 23:15:53 +0800 Subject: [PATCH 082/176] [Phi] Move determinant op kernel into phi (#40539) * add determinant phi kernel * remove original determinant op kernel * add determinant grad [hi kernel * fix determinant test failed * remove original determinant grad op kernel --- paddle/fluid/operators/determinant_op.cc | 8 - paddle/fluid/operators/determinant_op.cu | 8 - paddle/fluid/operators/determinant_op.h | 237 +----------------- paddle/phi/kernels/CMakeLists.txt | 7 +- .../kernels/cpu/determinant_grad_kernel.cc | 25 ++ paddle/phi/kernels/cpu/determinant_kernel.cc | 
21 ++ paddle/phi/kernels/determinant_grad_kernel.h | 28 +++ paddle/phi/kernels/determinant_kernel.h | 26 ++ .../kernels/gpu/determinant_grad_kernel.cu | 25 ++ paddle/phi/kernels/gpu/determinant_kernel.cu | 21 ++ .../impl/determinant_grad_kernel_impl.h | 159 ++++++++++++ .../kernels/impl/determinant_kernel_impl.h | 124 +++++++++ paddle/phi/ops/compat/determinant_sig.cc | 30 +++ 13 files changed, 473 insertions(+), 246 deletions(-) create mode 100644 paddle/phi/kernels/cpu/determinant_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/determinant_kernel.cc create mode 100644 paddle/phi/kernels/determinant_grad_kernel.h create mode 100644 paddle/phi/kernels/determinant_kernel.h create mode 100644 paddle/phi/kernels/gpu/determinant_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/determinant_kernel.cu create mode 100644 paddle/phi/kernels/impl/determinant_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/determinant_kernel_impl.h create mode 100644 paddle/phi/ops/compat/determinant_sig.cc diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc index 98247fbc862..68083c75985 100644 --- a/paddle/fluid/operators/determinant_op.cc +++ b/paddle/fluid/operators/determinant_op.cc @@ -168,14 +168,6 @@ REGISTER_OPERATOR(determinant, ops::DeterminantOp, ops::DeterminantOpMaker, REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp) -REGISTER_OP_CPU_KERNEL(determinant, - ops::DeterminantKernel, - ops::DeterminantKernel); - -REGISTER_OP_CPU_KERNEL( - determinant_grad, ops::DeterminantGradKernel, - ops::DeterminantGradKernel); - REGISTER_OPERATOR(slogdeterminant, ops::SlogDeterminantOp, ops::SlogDeterminantOpMaker, ops::SlogDeterminantGradOpMaker, diff --git a/paddle/fluid/operators/determinant_op.cu b/paddle/fluid/operators/determinant_op.cu index d19d4c3d093..d8237fa3004 100644 --- a/paddle/fluid/operators/determinant_op.cu +++ b/paddle/fluid/operators/determinant_op.cu @@ -17,14 +17,6 @@ limitations under the License. 
*/ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - determinant, ops::DeterminantKernel, - ops::DeterminantKernel); - -REGISTER_OP_CUDA_KERNEL( - determinant_grad, - ops::DeterminantGradKernel, - ops::DeterminantGradKernel); REGISTER_OP_CUDA_KERNEL( slogdeterminant, ops::SlogDeterminantKernel, diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index f89ecd37222..e6de0ee3548 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -23,10 +23,13 @@ #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/determinant_kernel_impl.h" #include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" @@ -40,232 +43,6 @@ T sign(T val) { return static_cast(T(0) < val) - (val < T(0)); } -template -class EigenMatrix {}; - -template <> -class EigenMatrix { - public: - using MatrixType = Eigen::MatrixXf; -}; - -template <> -class EigenMatrix { - public: - using MatrixType = Eigen::MatrixXd; -}; - -inline int64_t GetBatchCount(const framework::DDim dims) { - int64_t batch_count = 1; - auto dim_size = dims.size(); - PADDLE_ENFORCE_GE( - dim_size, 2, - platform::errors::InvalidArgument( - "the input matrix dimension size should greater than 2.")); - - // Cumulative multiplying each dimension until the last 2 to get the batch - // count, - // for example a tensor with shape [3,3,3,3], the batch count of matrices is - // 9. 
- for (int64_t i = 0; i < dims.size() - 2; i++) { - batch_count *= dims[i]; - } - - return batch_count; -} - -template -struct DeterminantFunctor { - void operator()(const Tensor& input, const framework::ExecutionContext ctx, - int64_t rank, int64_t batch_count, Tensor* output) { - std::vector input_vec; - std::vector output_vec; - framework::TensorToVector(input, ctx.device_context(), &input_vec); - for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel - auto begin_iter = input_vec.begin() + i * rank * rank; - auto end_iter = input_vec.begin() + (i + 1) * rank * rank; - std::vector sub_vec(begin_iter, - end_iter); // get every square matrix data - typename EigenMatrix::MatrixType matrix(rank, rank); - for (int64_t i = 0; i < rank; ++i) { - for (int64_t j = 0; j < rank; ++j) { - matrix(i, j) = sub_vec[rank * i + j]; - } - } - output_vec.push_back(matrix.determinant()); - } - framework::TensorFromVector(output_vec, output); - } -}; -template -class DeterminantKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("Input"); - auto input_dim = vectorize(input->dims()); - auto input_dim_size = input_dim.size(); - auto* output = context.Output("Out"); - - auto batch_count = GetBatchCount(input->dims()); - VLOG(2) << "input dim:" << input->dims(); - PADDLE_ENFORCE_GE( - input_dim_size, 2, - platform::errors::InvalidArgument( - "the input matrix dimension size should greater than 2.")); - PADDLE_ENFORCE_EQ(input_dim[input_dim_size - 1], - input_dim[input_dim_size - 2], - platform::errors::InvalidArgument( - "the input matrix should be square matrix.")); - auto rank = input_dim[input_dim_size - 1]; // square matrix length - DeterminantFunctor()(*input, context, rank, batch_count, output); - auto output_dims = phi::slice_ddim(input->dims(), 0, input_dim_size - 2); - if (input_dim_size > 2) { - output->Resize(output_dims); - } else { - // when input is a two-dimension matrix, The det value is a number. 
- output->Resize({1}); - } - VLOG(2) << "output dim:" << output->dims(); - } -}; - -template -struct FoundZeroFunctor { - FoundZeroFunctor(const T* x, int64_t numel, bool* res) - : x_(x), numel_(numel), res_(res) {} - HOSTDEVICE void operator()(size_t idx) const { - if (*res_ || idx >= static_cast(numel_)) { - // founded zero number - return; - } - *res_ = (x_[idx] == static_cast(0)); - } - const T* x_; - int64_t numel_; - bool* res_; -}; - -template -inline bool CheckMatrixInvertible(const framework::ExecutionContext& ctx, - const framework::Tensor* det) { - auto& dev_ctx = ctx.template device_context(); - auto numel = det->numel(); - - framework::Tensor dev_tensor; - auto* data = dev_tensor.mutable_data({1}, ctx.GetPlace()); - - // set false - phi::funcs::SetConstant zero; - zero(dev_ctx, &dev_tensor, false); - - // find whether zero - platform::ForRange for_range(dev_ctx, numel); - FoundZeroFunctor functor(det->data(), numel, data); - for_range(functor); - - // copy to host - dev_ctx.Wait(); - framework::Tensor cpu_tensor; - framework::TensorCopy(dev_tensor, platform::CPUPlace(), &cpu_tensor); - - // if founded zero, the matrix is not invertible - // else the matrix is invertible - auto* res = cpu_tensor.data(); - return !(*res); -} - -template -class DeterminantGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto& orig_dev_ctx = context.template device_context(); - const auto* input = context.Input("Input"); - const auto* det = context.Input("Out"); - const auto* grad = - context.Input(framework::GradVarName("Out")); - auto* ddet = - context.Output(framework::GradVarName("Input")); - - auto input_dims_size = input->dims().size(); - if (input_dims_size > 2) { - PADDLE_ENFORCE_EQ( - grad->dims().size() + 2, input_dims_size, - platform::errors::InvalidArgument( - "The grad tensor of det dims size should 2 less than" - " input tensor's, but here differ %d", - input_dims_size - grad->dims().size())); - } else if (input_dims_size == 2) { - // input dims size 2 and grad dims size 1 is possible - PADDLE_ENFORCE_EQ( - grad->dims().size(), 1, - platform::errors::InvalidArgument( - "The grad tensor of det dims size should 2 less than" - " input tensor's, but here differ %d", - input_dims_size - grad->dims().size())); - } else { - // checked in forward, pass - } - - auto& dev_ctx = static_cast< - const typename framework::ConvertToPhiContext::TYPE&>( - orig_dev_ctx); - - // Check Whether the matrix is invertible - // (matrix A not invertible) == (det(A)=0) - if (!CheckMatrixInvertible(context, det)) { - // The matrix is not invertible - VLOG(3) << "The input matrix not invertible!"; - ddet->Resize(input->dims()); - phi::Full(dev_ctx, phi::vectorize(input->dims()), static_cast(0.0f), - ddet); - return; - } - - // The matrix is invertible - // let |A| = Determinant(A) - // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf - // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, - // -1) - - // First: inverse(A) - framework::Tensor inverse_A; - // A must be square matrices! 
- inverse_A.Resize(input->dims()); - inverse_A.mutable_data(context.GetPlace()); - - phi::funcs::MatrixInverseFunctor mat_inv; - mat_inv(orig_dev_ctx, *input, &inverse_A); - - VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); - - // Second: inverse(A).transpose(-2, -1) - framework::Tensor transpose_inverse_A = - phi::TransposeLast2Dim(dev_ctx, inverse_A); - - VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " - << transpose_inverse_A.dims(); - - // Third: dA * |A| - auto mul_dA_detA = phi::Multiply(dev_ctx, *grad, *det); - VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); - - // Fourth: unsqueeze(dA * |A|, [-1, -2]) - auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1); - auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); - VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); - - // Finally: unsqueeze(dA * |A|) * inverse(A) - auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); - - VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); - - framework::TensorCopy(res, context.GetPlace(), ddet); - - ddet->Resize(input->dims()); - VLOG(3) << "d|A| dims: " << ddet->dims(); - } -}; - template struct SlogDeterminantFunctor { void operator()(const Tensor& input, const framework::ExecutionContext ctx, @@ -280,7 +57,7 @@ struct SlogDeterminantFunctor { auto end_iter = input_vec.begin() + (i + 1) * rank * rank; std::vector sub_vec(begin_iter, end_iter); // get every square matrix data - typename EigenMatrix::MatrixType matrix(rank, rank); + typename phi::detail::EigenMatrix::MatrixType matrix(rank, rank); for (int64_t i = 0; i < rank; ++i) { for (int64_t j = 0; j < rank; ++j) { matrix(i, j) = sub_vec[rank * i + j]; @@ -311,7 +88,7 @@ class SlogDeterminantKernel : public framework::OpKernel { auto input_dim_size = input_dim.size(); auto* output = context.Output("Out"); - auto batch_count = GetBatchCount(input->dims()); + auto batch_count = phi::detail::GetBatchCount(input->dims()); VLOG(2) << "input dim:" << input->dims(); PADDLE_ENFORCE_GE( input_dim_size, 2, @@ -370,7 +147,9 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // (matrix A not invertible) == (absslogdet(A)=0) auto slogdet_vec = slogdet->Split(1, 0); auto absslogdet_val = slogdet_vec[0]; - if (!CheckMatrixInvertible(context, &absslogdet_val)) { + if (!phi::detail::CheckMatrixInvertible< + T, typename framework::ConvertToPhiContext::TYPE>( + dev_ctx, &absslogdet_val)) { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; dslogdet->Resize(input->dims()); diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index d443b7bb2a0..d16f5f725df 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -27,7 +27,11 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. 
-set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel math_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel triangular_solve_grad_kernel) +set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel math_kernel + matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel + put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel + softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel + triangular_solve_grad_kernel determinant_grad_kernel) kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) @@ -46,6 +50,7 @@ kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) +kernel_library(determinant_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) # 4. auto parse and build kernel targets by cmake register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS} ) diff --git a/paddle/phi/kernels/cpu/determinant_grad_kernel.cc b/paddle/phi/kernels/cpu/determinant_grad_kernel.cc new file mode 100644 index 00000000000..e57d7263f88 --- /dev/null +++ b/paddle/phi/kernels/cpu/determinant_grad_kernel.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/determinant_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(determinant_grad, + CPU, + ALL_LAYOUT, + phi::DeterminantGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/determinant_kernel.cc b/paddle/phi/kernels/cpu/determinant_kernel.cc new file mode 100644 index 00000000000..5810e88e925 --- /dev/null +++ b/paddle/phi/kernels/cpu/determinant_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/determinant_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/determinant_kernel_impl.h" + +PD_REGISTER_KERNEL( + determinant, CPU, ALL_LAYOUT, phi::DeterminantKernel, float, double) {} diff --git a/paddle/phi/kernels/determinant_grad_kernel.h b/paddle/phi/kernels/determinant_grad_kernel.h new file mode 100644 index 00000000000..87228afc51b --- /dev/null +++ b/paddle/phi/kernels/determinant_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void DeterminantGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/determinant_kernel.h b/paddle/phi/kernels/determinant_kernel.h new file mode 100644 index 00000000000..abd5f5691b3 --- /dev/null +++ b/paddle/phi/kernels/determinant_kernel.h @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void DeterminantKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/determinant_grad_kernel.cu b/paddle/phi/kernels/gpu/determinant_grad_kernel.cu new file mode 100644 index 00000000000..cce12a87fac --- /dev/null +++ b/paddle/phi/kernels/gpu/determinant_grad_kernel.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/determinant_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(determinant_grad, + GPU, + ALL_LAYOUT, + phi::DeterminantGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/determinant_kernel.cu b/paddle/phi/kernels/gpu/determinant_kernel.cu new file mode 100644 index 00000000000..25184083873 --- /dev/null +++ b/paddle/phi/kernels/gpu/determinant_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/determinant_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/determinant_kernel_impl.h" + +PD_REGISTER_KERNEL( + determinant, GPU, ALL_LAYOUT, phi::DeterminantKernel, float, double) {} diff --git a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h new file mode 100644 index 00000000000..038ef0c214b --- /dev/null +++ b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h @@ -0,0 +1,159 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/kernels/determinant_grad_kernel.h" + +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" +#include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +namespace phi { +namespace detail { + +template +struct FoundZeroFunctor { + FoundZeroFunctor(const T* x, int64_t numel, bool* res) + : x_(x), numel_(numel), res_(res) {} + HOSTDEVICE void operator()(size_t idx) const { + if (*res_ || idx >= static_cast(numel_)) { + // founded zero number + return; + } + *res_ = (x_[idx] == static_cast(0)); + } + const T* x_; + int64_t numel_; + bool* res_; +}; + +template +inline bool CheckMatrixInvertible(const Context& dev_ctx, + const DenseTensor* det) { + auto numel = det->numel(); + + DenseTensor dev_tensor = phi::Empty(dev_ctx, {1}); + + // set false + phi::funcs::SetConstant zero; + zero(dev_ctx, &dev_tensor, false); + + // find whether zero + phi::funcs::ForRange for_range(dev_ctx, numel); + FoundZeroFunctor functor(det->data(), numel, dev_tensor.data()); + for_range(functor); + + // copy to host + DenseTensor cpu_tensor; + phi::Copy(dev_ctx, dev_tensor, phi::CPUPlace(), false, &cpu_tensor); + + // if founded zero, the matrix is not invertible + // else the matrix is invertible + auto* res = cpu_tensor.data(); + return !(*res); +} + +} // namespace detail + +template +void DeterminantGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + auto input_dims_size = x.dims().size(); + if (input_dims_size > 2) { + PADDLE_ENFORCE_EQ( + out_grad.dims().size() + 2, + input_dims_size, + phi::errors::InvalidArgument( + "The grad tensor of det dims size should be 2 less than" + " input tensor's, but here differ %d", + input_dims_size - out_grad.dims().size())); + } else if (input_dims_size == 2) { + // input dims size 2 and grad dims size 1 is possible + PADDLE_ENFORCE_EQ( + out_grad.dims().size(), + 1, + phi::errors::InvalidArgument( + "The grad tensor of det dims size should be 2 less than" + " input tensor's, but here differ %d", + input_dims_size - out_grad.dims().size())); + } else { + // checked in forward, pass + } + + // Check Whether the matrix is invertible + // (matrix A not invertible) == (det(A)=0) + if (!detail::CheckMatrixInvertible(dev_ctx, &out)) { + // The matrix is not invertible + VLOG(3) << "The input matrix not invertible!"; + x_grad->Resize(x.dims()); + phi::Full( + dev_ctx, phi::vectorize(x.dims()), static_cast(0.0f), x_grad); + return; + } + + // The matrix is invertible + // let |A| = Determinant(A) + // Ref to https://people.maths.ox.ac.uk/gilesm/files/NA-08-01.pdf + // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, + // -1) + + // First: inverse(A) + DenseTensor inverse_A; + // A must be square matrices! 
+ inverse_A.Resize(x.dims()); + dev_ctx.template Alloc(&inverse_A); + + phi::funcs::MatrixInverseFunctor mat_inv; + mat_inv(dev_ctx, x, &inverse_A); + + VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); + + // Second: inverse(A).transpose(-2, -1) + DenseTensor transpose_inverse_A = + phi::TransposeLast2Dim(dev_ctx, inverse_A); + + VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " + << transpose_inverse_A.dims(); + + // Third: dA * |A| + auto mul_dA_detA = phi::Multiply(dev_ctx, out_grad, out); + VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); + + // Fourth: unsqueeze(dA * |A|, [-1, -2]) + auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1); + auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); + VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); + + // Finally: unsqueeze(dA * |A|) * inverse(A) + auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); + + VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); + + x_grad->Resize(x.dims()); + VLOG(3) << "d|A| dims: " << x_grad->dims(); + + phi::Copy(dev_ctx, res, dev_ctx.GetPlace(), false, x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/determinant_kernel_impl.h b/paddle/phi/kernels/impl/determinant_kernel_impl.h new file mode 100644 index 00000000000..f3a611b89c9 --- /dev/null +++ b/paddle/phi/kernels/impl/determinant_kernel_impl.h @@ -0,0 +1,124 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/determinant_kernel.h" + +#include +#include +#include +#include +#include + +#include "paddle/phi/core/enforce.h" + +#include "paddle/fluid/framework/tensor_util.h" + +namespace phi { +namespace detail { +template +class EigenMatrix {}; + +template <> +class EigenMatrix { + public: + using MatrixType = Eigen::MatrixXf; +}; + +template <> +class EigenMatrix { + public: + using MatrixType = Eigen::MatrixXd; +}; + +inline int64_t GetBatchCount(const DDim dims) { + int64_t batch_count = 1; + auto dim_size = dims.size(); + PADDLE_ENFORCE_GE( + dim_size, + 2, + phi::errors::InvalidArgument( + "the input matrix dimension size should greater than 2.")); + + // Cumulative multiplying each dimension until the last 2 to get the batch + // count, + // for example a tensor with shape [3,3,3,3], the batch count of matrices is + // 9. 
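+  // i.e. batch_count = dims[0] * dims[1] * ... * dims[rank - 3].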
+ for (int64_t i = 0; i < dims.size() - 2; i++) { + batch_count *= dims[i]; + } + + return batch_count; +} +} // namespace detail + +template +struct DeterminantFunctor { + void operator()(const Context& dev_ctx, + const DenseTensor& input, + int64_t rank, + int64_t batch_count, + DenseTensor* output) { + std::vector input_vec; + std::vector output_vec; + paddle::framework::TensorToVector(input, dev_ctx, &input_vec); + for (int64_t i = 0; i < batch_count; ++i) { // maybe can be parallel + auto begin_iter = input_vec.begin() + i * rank * rank; + auto end_iter = input_vec.begin() + (i + 1) * rank * rank; + std::vector sub_vec(begin_iter, + end_iter); // get every square matrix data + typename detail::EigenMatrix::MatrixType matrix(rank, rank); + for (int64_t i = 0; i < rank; ++i) { + for (int64_t j = 0; j < rank; ++j) { + matrix(i, j) = sub_vec[rank * i + j]; + } + } + output_vec.push_back(matrix.determinant()); + } + paddle::framework::TensorFromVector(output_vec, output); + } +}; + +template +void DeterminantKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + auto input_dim = vectorize(x.dims()); + auto input_dim_size = input_dim.size(); + + auto batch_count = detail::GetBatchCount(x.dims()); + VLOG(10) << "input dim:" << x.dims(); + PADDLE_ENFORCE_GE( + input_dim_size, + 2, + phi::errors::InvalidArgument( + "the input matrix dimension size should greater than 2.")); + PADDLE_ENFORCE_EQ(input_dim[input_dim_size - 1], + input_dim[input_dim_size - 2], + phi::errors::InvalidArgument( + "the input matrix should be square matrix.")); + auto rank = input_dim[input_dim_size - 1]; // square matrix length + DeterminantFunctor()(dev_ctx, x, rank, batch_count, out); + auto output_dims = phi::slice_ddim(x.dims(), 0, input_dim_size - 2); + if (input_dim_size > 2) { + out->Resize(output_dims); + } else { + // when input is a two-dimension matrix, The det value is a number. + out->Resize({1}); + } + VLOG(10) << "output dim:" << out->dims(); +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/determinant_sig.cc b/paddle/phi/ops/compat/determinant_sig.cc new file mode 100644 index 00000000000..7bcd30ec5d7 --- /dev/null +++ b/paddle/phi/ops/compat/determinant_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
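+//
+// Maps the fluid determinant grad op arguments (Input, Out, Out@GRAD ->
+// Input@GRAD) onto the phi "determinant_grad" kernel signature registered
+// below.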
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature DeterminantGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("determinant_grad", + {"Input", "Out", GradVarName("Out")}, + {}, + {GradVarName("Input")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(determinant_grad, + phi::DeterminantGradOpArgumentMapping); -- GitLab From 927767ca228823a87729fb209a4b3fec6016d535 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Wed, 16 Mar 2022 07:26:55 +0800 Subject: [PATCH 083/176] [infrt]Refine phi dialect (#40505) * change some symbol names * add test * add phi to opt.cc * clean code * up * update * up * up * Update pten_pass.mlir * Update convolution_grad_kernel.cc * update * restore init_infrt_dialects * restore * up * up * up Co-authored-by: Superjomn --- paddle/infrt/dialect/infrt/ir/infrt_ops.td | 6 +- .../infrt/dialect/infrt/pass/infrt_op_fuse.td | 14 +- .../dialect/infrt/pass/infrt_op_fuse_pass.cc | 10 +- paddle/infrt/dialect/phi/CMakeLists.txt | 3 - paddle/infrt/dialect/phi/ir/phi_base.cc | 1 + paddle/infrt/dialect/phi/pass/CMakeLists.txt | 6 +- .../infrt/dialect/phi/pass/kernel_op_desc.cc | 13 +- .../infrt/dialect/phi/pass/kernel_op_desc.h | 8 +- .../dialect/phi/pass/kernel_op_desc_test.cc | 32 +++++ ..._op_cvt_pass.cc => phi_op_convert_pass.cc} | 128 ++++++++++++------ ...hi_op_cvt_pass.h => phi_op_convert_pass.h} | 5 +- paddle/infrt/host_context/mlir_exec.cc | 2 +- paddle/infrt/tests/CMakeLists.txt | 2 +- paddle/infrt/tests/dialect/phi/phi_pass.mlir | 10 +- paddle/scripts/infrt_build.sh | 2 +- 15 files changed, 170 insertions(+), 72 deletions(-) create mode 100644 paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc rename paddle/infrt/dialect/phi/pass/{phi_op_cvt_pass.cc => phi_op_convert_pass.cc} (62%) rename paddle/infrt/dialect/phi/pass/{phi_op_cvt_pass.h => phi_op_convert_pass.h} (86%) diff --git a/paddle/infrt/dialect/infrt/ir/infrt_ops.td b/paddle/infrt/dialect/infrt/ir/infrt_ops.td index f5430b03d0d..82eba2a1746 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_ops.td +++ b/paddle/infrt/dialect/infrt/ir/infrt_ops.td @@ -53,9 +53,9 @@ def Infrt_CallOp : Infrt_Op<"call"> { }]; } -def Infrt_CvtTensorOp : Infrt_Op<"cvt_tensor", [NoSideEffect]> { - let summary = "convert tensor type op"; - let description = [{convert tensor type op!}]; +def Infrt_TensorCastOp : Infrt_Op<"tensor_cast", [NoSideEffect]> { + let summary = "cast tensor type op"; + let description = [{cast tensor type op!}]; let arguments = (ins AnyType:$input); let results = (outs AnyType:$output); } diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td index 51addb4deb4..7ae0bbae627 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td @@ -5,17 +5,17 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "paddle/infrt/dialect/infrt/ir/infrt_ops.td" include "paddle/infrt/dialect/pd_ops.td" -def FuseCvtTensorPattern : Pat< - (Infrt_CvtTensorOp (Infrt_CvtTensorOp $arg)), - (Infrt_CvtTensorOp $arg)>; +def FuseTensorCastPattern : Pat< + (Infrt_TensorCastOp (Infrt_TensorCastOp $arg)), + (Infrt_TensorCastOp $arg)>; -def FuseFeedCvtTensorPattern : Pat< - (Infrt_CvtTensorOp (PD_FeedOp $name)), +def FuseFeedTensorCastPattern : Pat< + (Infrt_TensorCastOp (PD_FeedOp $name)), (PD_FeedOp $name)>; def TypesAreIdentical : Constraint>; -def 
RedundantCvtTensorOptPattern : Pat< - (Infrt_CvtTensorOp:$res $arg), (replaceWithValue $arg), +def RedundantTensorCastOptPattern : Pat< + (Infrt_TensorCastOp:$res $arg), (replaceWithValue $arg), [(TypesAreIdentical $res, $arg)]>; diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc index 25ecf2ae99d..9d8ce5d8dfe 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc @@ -27,8 +27,12 @@ struct InfrtOpFusePass : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "infrtOpFusePass"; } + + llvm::StringRef getArgument() const override { return "infrt-op-fuse"; } + void runOnFunction() override; }; + // Implementation of the InfrtOpFusePass. void InfrtOpFusePass::runOnFunction() { ::mlir::RewritePatternSet patterns(&getContext()); @@ -39,14 +43,18 @@ void InfrtOpFusePass::runOnFunction() { if (nullptr == terminator_op) return; for (auto operand : terminator_op->getOperands()) { auto *op1 = operand.getDefiningOp(); - auto cvt_op = ::llvm::dyn_cast<::infrt::CvtTensorOp>(op1); + auto cvt_op = ::llvm::dyn_cast<::infrt::TensorCastOp>(op1); if (!cvt_op) continue; mlir::Value value = cvt_op.input(); operand.replaceAllUsesWith(value); cvt_op.erase(); } } + } // namespace + std::unique_ptr infrt::createInfrtOpFusePass() { return std::make_unique(); } + +mlir::PassRegistration infrt_op_fuse_pass; diff --git a/paddle/infrt/dialect/phi/CMakeLists.txt b/paddle/infrt/dialect/phi/CMakeLists.txt index 4e73a533d99..67f6bb8a2d7 100644 --- a/paddle/infrt/dialect/phi/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/CMakeLists.txt @@ -5,9 +5,6 @@ endif() add_subdirectory(ir) add_subdirectory(pass) -add_executable(phi-ir-exec phi_ir_exec.cc) -target_link_libraries(phi-ir-exec infrt) - add_executable(phi-exec phi_exec.cc) target_link_libraries(phi-exec infrt) diff --git a/paddle/infrt/dialect/phi/ir/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc index d8095d7f3f1..f91381fe729 100644 --- a/paddle/infrt/dialect/phi/ir/phi_base.cc +++ b/paddle/infrt/dialect/phi/ir/phi_base.cc @@ -29,6 +29,7 @@ namespace infrt { namespace phi { void PHIDialect::initialize() { + LOG(INFO) << "PHI Dialect initalized"; addOperations< #define GET_OP_LIST #include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/phi/pass/CMakeLists.txt b/paddle/infrt/dialect/phi/pass/CMakeLists.txt index 5c55a6b0aca..dc60ecf63fe 100644 --- a/paddle/infrt/dialect/phi/pass/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/pass/CMakeLists.txt @@ -2,6 +2,8 @@ core_gather_headers() gather_srcs(infrt_src SRCS proto_arg_map_context.cc - phi_op_cvt_pass.cc + phi_op_convert_pass.cc kernel_op_desc.cc - ) + ) + +cc_test(test_kernel_op_desc SRCS kernel_op_desc_test.cc DEPS infrt) diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc index 353b1054e71..a26e8e2dca5 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -73,7 +73,7 @@ std::string getPhiLayoutSuffix(LayoutType layout) { } } -std::vector getCandidateKernels( +std::vector GetCandidateKernels( std::string name, const std::vector& valid_palces) { std::vector candidate_kernels; PhiKernelDesc phi_kernel_desc; @@ -88,19 +88,20 @@ std::vector getCandidateKernels( if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) continue; place.layout = LayoutType::ANY; } - 
phi_kernel_desc.kernelType = place; - phi_kernel_desc.inputsType.clear(); - phi_kernel_desc.outputsType.clear(); + phi_kernel_desc.kernel_type = place; + phi_kernel_desc.input_types.clear(); + phi_kernel_desc.output_types.clear(); phi::KernelArgsDef args_def = kernel_key_map.at(kernel_key).args_def(); const paddle::SmallVector& input_arg = args_def.input_defs(); const paddle::SmallVector& output_arg = args_def.output_defs(); for (auto tensor_arg : input_arg) { - phi_kernel_desc.inputsType.emplace_back(ConvertPlaceFromPhi(tensor_arg)); + phi_kernel_desc.input_types.emplace_back(ConvertPlaceFromPhi(tensor_arg)); } for (auto tensor_arg : output_arg) { - phi_kernel_desc.outputsType.emplace_back(ConvertPlaceFromPhi(tensor_arg)); + phi_kernel_desc.output_types.emplace_back( + ConvertPlaceFromPhi(tensor_arg)); } candidate_kernels.emplace_back(phi_kernel_desc); } diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h index b1f7c6c0811..cdc8f7cbff5 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.h +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h @@ -21,16 +21,16 @@ namespace infrt { struct PhiKernelDesc { - std::vector inputsType; // kernel input place - std::vector outputsType; // kernel output place - Place kernelType; // kernel place + std::vector input_types; // kernel input place + std::vector output_types; // kernel output place + Place kernel_type; // kernel place }; std::string getPhiTargetPrefix(TargetType target); std::string getPhiPrecisionSuffix(PrecisionType precision); std::string getPhiLayoutSuffix(LayoutType layout); -std::vector getCandidateKernels( +std::vector GetCandidateKernels( std::string name, const std::vector& valid_palces); } // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc new file mode 100644 index 00000000000..bd5f0799a60 --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc_test.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
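+//
+// Smoke test for GetCandidateKernels: with a CPU/FP32/NCHW place it is
+// expected to find at least one registered phi kernel for "addmm".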
+ +#include +#include +#include + +#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" +#include "paddle/phi/kernels/declarations.h" + +namespace infrt { + +TEST(phi, get_op_desc) { + std::vector places; + places.emplace_back( + TargetType::CPU, PrecisionType::FLOAT32, LayoutType::NCHW); + auto kernels = GetCandidateKernels("addmm", places); + ASSERT_GE(kernels.size(), 1UL); +} + +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc similarity index 62% rename from paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc rename to paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc index 485bf2a75d8..f9e124aba6c 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" #include #include @@ -24,35 +24,52 @@ #include #include +#include "paddle/infrt/common/string.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/phi_kernels.h" #include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" #include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" #include "paddle/phi/core/compat/op_utils.h" #include "paddle/phi/ops/compat/signatures.h" namespace { -class phiOpCvtPass - : public mlir::PassWrapper { +class PhiOpConvertPass + : public mlir::PassWrapper { public: - ::llvm::StringRef getName() const override { return "phiOpCvtPass"; } + ::llvm::StringRef getName() const override { return "PhiOpConvertPass"; } void runOnFunction() override; - explicit phiOpCvtPass( - std::vector valid_places = std::vector()) + PhiOpConvertPass(); + explicit PhiOpConvertPass(const std::vector &valid_places) : valid_places_(valid_places) {} + PhiOpConvertPass(const PhiOpConvertPass &other) + : mlir::PassWrapper(*this), + valid_places_(other.valid_places_) {} + + ::llvm::StringRef getArgument() const override { return "phi-op-convert"; } + void getDependentDialects(mlir::DialectRegistry ®istry) const override; + private: void convertStage(); - void diapatchStage(); + void dispatchStage(); + + // Force a specified data format for all layout sensitive operations. + Option valid_places_options_{ + *this, + "valid-targets", + llvm::cl::desc("Set the valid target, [CPU-FP32-NCHW]")}; + std::vector valid_places_; }; - -// Implementation of the phiOpCvtPass. -void phiOpCvtPass::runOnFunction() { +// Implementation of the PhiOpConvertPass. +void PhiOpConvertPass::runOnFunction() { convertStage(); - diapatchStage(); + dispatchStage(); } -void phiOpCvtPass::convertStage() { + +void PhiOpConvertPass::convertStage() { mlir::Block &body = getFunction().front(); std::vector worklist; for (auto &op : body.without_terminator()) { @@ -62,9 +79,9 @@ void phiOpCvtPass::convertStage() { while (!worklist.empty()) { auto *op = worklist.back(); worklist.pop_back(); - if (op == nullptr) continue; + if (!op) continue; - std::string op_name = op->getName().getIdentifier().str(); + auto op_name = op->getName().getIdentifier().str(); // only convert op in pd dialect. 
if (op_name.substr(0, 3) != "pd.") continue; @@ -73,6 +90,7 @@ void phiOpCvtPass::convertStage() { pd_dialect_inputs_info_map_.end() || pd_dialect_outputs_info_map_.find(op_name) == pd_dialect_outputs_info_map_.end()) { + LOG(WARNING) << "No op info found for " << op_name; // Todo: print log continue; } @@ -85,7 +103,8 @@ void phiOpCvtPass::convertStage() { ::llvm::SmallVector output_types; for (const std::string &str : std::get<0>(kernel_sign.args)) { if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) { - // Todo: print error log + LOG(ERROR) << "No input info for Op " << op_name << " and argument " + << str; return; } uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str); @@ -94,7 +113,8 @@ void phiOpCvtPass::convertStage() { for (const std::string &str : std::get<2>(kernel_sign.args)) { if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) { - // Todo: print error log + LOG(ERROR) << "No output info for Op " << op_name << " and argument " + << str; return; } uint8_t index = pd_dialect_outputs_info_map_.at(op_name).at(str); @@ -109,14 +129,13 @@ void phiOpCvtPass::convertStage() { for (size_t index = 0; index < ori_output.size(); ++index) { ori_output[index].replaceAllUsesWith(kernel_op.getResult(index)); } - if (!op->use_empty()) { - // Todo: print error log - return; - } + + CHECK(op->use_empty()); op->erase(); } } -void phiOpCvtPass::diapatchStage() { + +void PhiOpConvertPass::dispatchStage() { std::vector worklist; mlir::Block &block = getFunction().front(); for (auto &op : block) { @@ -129,7 +148,7 @@ void phiOpCvtPass::diapatchStage() { for (infrt::KernelOp kernel_op : worklist) { std::string kernel_name = kernel_op.name().str(); std::vector candidates = - getCandidateKernels(kernel_name, valid_places_); + GetCandidateKernels(kernel_name, valid_places_); if (candidates.empty()) { LOG(FATAL) << "No candidate kernels for op:" << kernel_name; continue; @@ -140,17 +159,17 @@ void phiOpCvtPass::diapatchStage() { const infrt::PhiKernelDesc &phi_kernel_desc = candidates.front(); kernel_name = - infrt::getPhiTargetPrefix(phi_kernel_desc.kernelType.target) + + infrt::getPhiTargetPrefix(phi_kernel_desc.kernel_type.target) + kernel_name + - infrt::getPhiPrecisionSuffix(phi_kernel_desc.kernelType.precision) + - infrt::getPhiLayoutSuffix(phi_kernel_desc.kernelType.layout); + infrt::getPhiPrecisionSuffix(phi_kernel_desc.kernel_type.precision) + + infrt::getPhiLayoutSuffix(phi_kernel_desc.kernel_type.layout); mlir::OperationName operation_name(kernel_name, kernel_op.getContext()); mlir::OperationState operation_state(kernel_op.getLoc(), operation_name); - if (phi_context.find(phi_kernel_desc.kernelType.target) == + if (phi_context.find(phi_kernel_desc.kernel_type.target) == phi_context.end()) { - switch (phi_kernel_desc.kernelType.target) { + switch (phi_kernel_desc.kernel_type.target) { case infrt::TargetType::CPU: { auto context_value = builder @@ -169,33 +188,36 @@ void phiOpCvtPass::diapatchStage() { } } operation_state.addOperands( - phi_context.at(phi_kernel_desc.kernelType.target)); - for (size_t index = 0; index < phi_kernel_desc.inputsType.size(); ++index) { + phi_context.at(phi_kernel_desc.kernel_type.target)); + + for (size_t index = 0; index < phi_kernel_desc.input_types.size(); + ++index) { mlir::Value input = kernel_op.getOperand(index); - auto cvt_tensor_type_op = builder.create( + auto cvt_tensor_type_op = builder.create( kernel_op.getLoc(), infrt::DenseTensorType::get( kernel_op.getContext(), - phi_kernel_desc.inputsType[index].target, - 
phi_kernel_desc.inputsType[index].precision, - phi_kernel_desc.inputsType[index].layout), + phi_kernel_desc.input_types[index].target, + phi_kernel_desc.input_types[index].precision, + phi_kernel_desc.input_types[index].layout), input); operation_state.addOperands(cvt_tensor_type_op.output()); } - for (size_t index = 0; index < phi_kernel_desc.outputsType.size(); + + for (size_t index = 0; index < phi_kernel_desc.output_types.size(); ++index) { operation_state.addTypes(infrt::DenseTensorType::get( kernel_op.getContext(), - phi_kernel_desc.outputsType[index].target, - phi_kernel_desc.outputsType[index].precision, - phi_kernel_desc.outputsType[index].layout)); + phi_kernel_desc.output_types[index].target, + phi_kernel_desc.output_types[index].precision, + phi_kernel_desc.output_types[index].layout)); } operation_state.addAttributes(kernel_op.attrsAttr().getValue()); mlir::Operation *phi_operation = builder.createOperation(operation_state); - for (size_t index = 0; index < phi_kernel_desc.outputsType.size(); + for (size_t index = 0; index < phi_kernel_desc.output_types.size(); ++index) { mlir::Value input = phi_operation->getResult(index); - auto cvt_tensor_type_op = builder.create( + auto cvt_tensor_type_op = builder.create( kernel_op.getLoc(), kernel_op.getResultTypes()[index], input); kernel_op.getResult(index).replaceAllUsesWith( cvt_tensor_type_op.output()); @@ -204,9 +226,35 @@ void phiOpCvtPass::diapatchStage() { } } +PhiOpConvertPass::PhiOpConvertPass() { + if (!valid_places_options_.hasValue()) { + valid_places_.emplace_back(infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW); + return; + } + + LOG(FATAL) << "To be done for specifying places in command line"; +} + +void PhiOpConvertPass::getDependentDialects( + mlir::DialectRegistry ®istry) const { + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); +} + } // namespace +mlir::PassRegistration phi_op_convert; + std::unique_ptr infrt::createPhiOpCvtPass( std::vector valid_places) { - return std::make_unique(valid_places); + return std::make_unique(valid_places); +} + +std::unique_ptr infrt::createPhiOpCvtPass() { + return std::make_unique(); } diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h similarity index 86% rename from paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h rename to paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h index 8b1944042aa..5a2c0ee96ed 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h @@ -21,7 +21,8 @@ namespace infrt { * phiOpCvtPass. * Convert the general operators from pd Dialect to phi dialect. 
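+ * The pass works in two stages: convertStage() rewrites pd.* ops into generic
+ * infrt KernelOps based on the phi op signatures, and dispatchStage() selects
+ * a concrete phi kernel (target/precision/layout), creates the needed context
+ * value and wraps kernel inputs/outputs with TensorCastOps.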
*/ -std::unique_ptr createPhiOpCvtPass( - std::vector valid_places = std::vector()); +std::unique_ptr createPhiOpCvtPass(std::vector valid_places); + +std::unique_ptr createPhiOpCvtPass(); } // namespace infrt diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index 1506282f626..319df90d3ee 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -30,7 +30,7 @@ #include "paddle/infrt/kernel/test_kernels.h" #ifdef INFRT_WITH_PHI #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" -#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" #include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" #include "paddle/infrt/kernel/phi/registry.h" #endif diff --git a/paddle/infrt/tests/CMakeLists.txt b/paddle/infrt/tests/CMakeLists.txt index 5ce6d867342..e5cc1ec1121 100644 --- a/paddle/infrt/tests/CMakeLists.txt +++ b/paddle/infrt/tests/CMakeLists.txt @@ -1,6 +1,6 @@ configure_file(lit.cfg.py.in "${CMAKE_SOURCE_DIR}/paddle/infrt/tests/lit.cfg.py") add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle/infrt/tests --filter-out \"disabled_*\"" - DEPENDS infrtopt infrtexec phi-ir-exec) + DEPENDS infrtopt infrtexec) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir.in ${CMAKE_CURRENT_SOURCE_DIR}/dialect/tensor/tensor_map.mlir) diff --git a/paddle/infrt/tests/dialect/phi/phi_pass.mlir b/paddle/infrt/tests/dialect/phi/phi_pass.mlir index 61a66cb3d71..47badd97d37 100644 --- a/paddle/infrt/tests/dialect/phi/phi_pass.mlir +++ b/paddle/infrt/tests/dialect/phi/phi_pass.mlir @@ -1,4 +1,5 @@ -// RUN: phi-ir-exec %s +// RUN: infrtopt -phi-op-convert -infrt-op-fuse %s + // CHECK-LABEL: @ops func @ops() { %a = pd.feed() {name="input0"} : !infrt.lod_tensor @@ -8,3 +9,10 @@ func @ops() { %h = "pd.abs"(%g):(tensor) -> tensor "pd.fetch"(%h) {name="output"} :(tensor)->() } + +// CHECK-LABEL: @op_execute +func @op_execute(%a:!infrt.lod_tensor, %b:!infrt.lod_tensor, %c:!infrt.lod_tensor) -> !infrt.lod_tensor { + %g = "pd.elementwise_add"(%a, %b) {axis=1:si32} : (!infrt.lod_tensor, !infrt.lod_tensor) -> tensor + %h = "pd.abs"(%g):(tensor) -> tensor + "pd.fetch"(%h) {name="output"} :(tensor)->() +} diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index 76b45ff89f1..3b2df68074a 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -93,7 +93,7 @@ function infrt_gen_and_build() { exit 7; fi - make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec phi-ir-exec phi-exec infrt_lib_dist paddle-mlir-convert;build_error=$? + make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec phi-exec infrt_lib_dist paddle-mlir-convert;build_error=$? if [ "$build_error" != 0 ];then exit 7; fi -- GitLab From a991b6a0869290c6ece7f0cd2348f86273146b3c Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Wed, 16 Mar 2022 09:39:11 +0800 Subject: [PATCH 084/176] fix IterableDataset may block model when num_workers > 0. 
test=develop (#40541) --- python/paddle/fluid/dataloader/dataloader_iter.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 706ec0d523b..5385ac28b90 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -564,6 +564,14 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._rcvd_idx += 1 self._batches_outstanding -= 1 else: + # NOTE: when _rcvd_idx catch up _send_idx, which means + # one of following: + # 1. all 2 * num_workers batches have been loaded + # and stored in _blocking_queue + # 2. all data drained + # we need to let _thread blocking at _data_queue + # get_data to inoccupy CPU, otherwise may occupy + # CPU time for model running # NOTE: in persistent workers mode, do not check data # drained here, simply let it go to _data_queue # reading to get _ResumeIteration @@ -573,7 +581,6 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): # may also be data in blocking queue if self._batches_outstanding < len(self._places): return None - continue if self._rcvd_idx in self._task_infos and \ len(self._task_infos[self._rcvd_idx]) == 3: -- GitLab From 3185826358a10c7d23eaae9b31d87d35045319b9 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Wed, 16 Mar 2022 09:52:32 +0800 Subject: [PATCH 085/176] lgamma tranfer make xpu ci failed. fix compile error in xpu CI (#40581) --- paddle/phi/kernels/cpu/lgamma_kernel.cc | 2 ++ paddle/phi/kernels/funcs/activation_functor.h | 4 ++++ paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h | 1 + 3 files changed, 7 insertions(+) diff --git a/paddle/phi/kernels/cpu/lgamma_kernel.cc b/paddle/phi/kernels/cpu/lgamma_kernel.cc index d0226894089..f849322174d 100644 --- a/paddle/phi/kernels/cpu/lgamma_kernel.cc +++ b/paddle/phi/kernels/cpu/lgamma_kernel.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "paddle/phi/kernels/lgamma_kernel.h" + +#include #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/for_range.h" diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index c8fb54bb102..7fe513a24ba 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -35,6 +35,10 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" +#ifdef PADDLE_WITH_XPU_KP +#define __forceinline__ __inline__ +#endif + namespace phi { namespace funcs { enum ActBwdOpFwdDeps { diff --git a/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h b/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h index a1b33f5a331..8fb1f1c4fa3 100644 --- a/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/lgamma_grad_kernel_impl.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once +#include #include "paddle/phi/kernels/funcs/for_range.h" namespace phi { template -- GitLab From ec6b8fbd4efaa21add75a8cf7af3f0478bd8a18d Mon Sep 17 00:00:00 2001 From: Yulong Ao Date: Wed, 16 Mar 2022 10:18:14 +0800 Subject: [PATCH 086/176] [Auto Parallel] Add the support for the auto completion of while_op (#39939) * [Auto Parallel] Support the auto completion of while_op * [Auto Parallel] Improve the completion algorithms * [Auto Parallel] Fix bugs for ernie inference * [Auto Parallel] Remove attrs which cannot be pickled * [Auto Parallel] make the dims_mappings of LodTensorArray vars empty * [Auto Parallel] Fix bugs for the ernie inference in the pipeline parallel * [Auto Parallel] Remove unncessary comments * [Auto Parallel] Fix a bug of the CMakeLists * [Auto Parallel] Use the newest APIs to write the unit test * [Auto Parallel] Remove unnecessary statements --- paddle/fluid/framework/ir/graph.cc | 1 + paddle/fluid/framework/ir/graph.h | 4 + paddle/fluid/framework/ir/node.h | 3 + paddle/fluid/pybind/ir.cc | 1 + .../distributed/auto_parallel/completion.py | 437 +++++++++++++++--- .../auto_parallel/dist_attribute.py | 15 + .../distributed/auto_parallel/dist_context.py | 114 +++-- .../distributed/auto_parallel/dist_op.py | 33 +- .../distributed/auto_parallel/dist_tensor.py | 8 +- .../distributed/auto_parallel/engine.py | 2 +- .../auto_parallel/operators/common.py | 4 +- .../auto_parallel/operators/dist_default.py | 49 +- .../auto_parallel/operators/dist_matmul.py | 1 - .../paddle/distributed/auto_parallel/utils.py | 1 - .../unittests/auto_parallel/CMakeLists.txt | 1 + .../auto_parallel/test_while_op_completion.py | 209 +++++++++ 16 files changed, 754 insertions(+), 129 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 036fde8fac6..f5f6f3ecb85 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -95,6 +95,7 @@ std::map> Graph::InitFromBlock( std::unordered_map> name_to_desc_block_id; + block_id_ = block.ID(); const BlockDesc *block_var_visible = █ while (block_var_visible != nullptr) { for (auto *var : block_var_visible->AllVars()) { diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 21e743e3587..10645f08dc3 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -230,6 +230,7 @@ class Graph { auto *x = AddNode(new ir::Node(var_desc, block_id == -1 ? block_id_ : block_id)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } @@ -245,6 +246,7 @@ class Graph { "The OpDesc used to create operator node is null.")); auto *x = AddNode(new ir::Node(op_desc)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } @@ -263,6 +265,7 @@ class Graph { num_node_created_); auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable, block_id_)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } @@ -276,6 +279,7 @@ class Graph { } auto *x = AddNode(new ir::Node(name, type, block_id_)); x->SetId(num_node_created_++); + x->SetGraphId(block_id_); return x; } diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 7e61d6ae424..8c51c278d48 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -125,6 +125,7 @@ class Node { // Only use this for auto parallel. 
// A node does not have original desc if the return is zero. uint64_t OriginalDescId() const { return original_desc_id_; } + int GraphId() const { return graph_id_; } bool IsOp() const { return type_ == Type::kOperation; } bool IsVar() const { return type_ == Type::kVariable; } @@ -246,10 +247,12 @@ class Node { // Store the original id of var desc or op desc. // Only use this for auto parallel. uint64_t original_desc_id_{0}; + int graph_id_{-1}; private: // ID can only set by a Graph. void SetId(int id) { id_ = id; } + void SetGraphId(int graph_id) { graph_id_ = graph_id; } // desc_order can only set by a Graph when constructing a Graph from a // BlockDesc. diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index bb45c1c4060..ecbacd37d56 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -143,6 +143,7 @@ void BindNode(py::module *m) { .def("var", &Node::Var, return_value_policy::reference) .def("op", &Node::Op, return_value_policy::reference) .def("id", &Node::id) + .def("graph_id", &Node::GraphId) .def("original_desc_id", &Node::OriginalDescId) .def("is_op", &Node::IsOp) .def("is_var", &Node::IsVar) diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index ae2d9163435..e303ce12168 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -21,11 +21,12 @@ from paddle.fluid import framework from .utils import print_program_with_dist_attr from .operators import find_best_compatible_distributed_operator_impl -from .dist_context import get_default_distributed_context +from .dist_context import get_default_distributed_context, _node_id from .dist_tensor import DistributedTensor from .dist_op import DistributedOperator from .dist_attribute import TensorDistributedAttribute from .dist_attribute import OperatorDistributedAttribute +from .process_mesh import ProcessMesh from paddle.distributed.fleet.meta_optimizers.common import OpRole @@ -108,6 +109,20 @@ def compute_compatible_dims_mapping(dims_mapping_list): return compatible_result +def merge_process_mesh_two(pm1, pm2): + process_set1 = set() + process_set2 = set() + if pm1 is None and pm2 is None: + return None + if pm1 is not None: + process_set1 = set(pm1.processes) + if pm2 is not None: + process_set2 = set(pm2.processes) + merged_process_set = process_set1.union(process_set2) + merged_process_mesh = ProcessMesh(list(merged_process_set)) + return merged_process_mesh + + class Completer: def __init__(self, dist_context): assert dist_context is not None @@ -119,7 +134,9 @@ class Completer: return False tensor_desc = tensor_node.var() # Skip reader tensor - if tensor_desc.type() == core.VarDesc.VarType.READER: + if tensor_desc.type() == core.VarDesc.VarType.READER \ + or tensor_desc.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or tensor_desc.type == core.VarDesc.VarType.STEP_SCOPES: return False tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( tensor_node) @@ -185,7 +202,7 @@ class Completer: op_dist_attr = dist_op.dist_attr if fwd: for tensor_node in op_node.inputs: - if tensor_node.var() is not None: + if tensor_node.is_var() and tensor_node.var() is not None: if tensor_node.var().type() == core.VarDesc.VarType.READER: continue tensor_desc = tensor_node.var() @@ -208,19 +225,19 @@ class Completer: # Find the most compatible implemenetations from the distributed operator op_dist_impl = find_best_compatible_distributed_operator_impl( 
dist_op, fwd=True) - assert op_dist_impl is not None, "Cannot find the dist op implementation." - dim_changed = op_dist_impl.update_dims_mapping(dist_op) - if dim_changed: - changed = True - if op_dist_impl.is_auto_compatible(dist_op): - if op_dist_impl.type == "elementwise": - op_dist_attr.impl_type = "default" - else: - op_dist_attr.impl_type = op_dist_impl.type - op_dist_attr.impl_idx = op_dist_impl.idx + if op_dist_impl is not None: + dim_changed = op_dist_impl.update_dims_mapping(dist_op) + if dim_changed: + changed = True + if op_dist_impl.is_auto_compatible(dist_op): + if op_dist_impl.type == "elementwise": + op_dist_attr.impl_type = "default" + else: + op_dist_attr.impl_type = op_dist_impl.type + op_dist_attr.impl_idx = op_dist_impl.idx else: for tensor_node in op_node.outputs: - if tensor_node.var() is not None: + if tensor_node.is_var() and tensor_node.var() is not None: if tensor_node.var().type() == core.VarDesc.VarType.READER: continue tensor_desc = tensor_node.var() @@ -243,61 +260,38 @@ class Completer: # Find the most compatible implemenetations from the distributed operator op_dist_impl = find_best_compatible_distributed_operator_impl( dist_op, fwd=False) - assert op_dist_impl is not None, "Cannot find the dist op implementation." - dim_changed = op_dist_impl.update_dims_mapping(dist_op) - if dim_changed: - changed = True - if op_dist_impl.is_auto_compatible(dist_op): - if op_dist_impl.type == "elementwise": - op_dist_attr.impl_type = "default" - else: - op_dist_attr.impl_type = op_dist_impl.type - op_dist_attr.impl_idx = op_dist_impl.idx + if op_dist_impl is not None: + dim_changed = op_dist_impl.update_dims_mapping(dist_op) + if dim_changed: + changed = True + if op_dist_impl.is_auto_compatible(dist_op): + if op_dist_impl.type == "elementwise": + op_dist_attr.impl_type = "default" + else: + op_dist_attr.impl_type = op_dist_impl.type + op_dist_attr.impl_idx = op_dist_impl.idx return changed - def _update_process_mesh(self): - def _find_nearset_node(nodes, idx): - for node in reversed(nodes[:idx]): - node_dist_attr = self._dist_context.get_dist_attr_for_graph( - node) - if node_dist_attr.process_mesh is not None: - return node - - total_reach_fix_point = False - while not total_reach_fix_point: - total_changed = False - for is_fwd in [True, False]: - all_nodes = self._dist_context.serial_ordered_nodes \ - if is_fwd else reversed(self._dist_context.serial_ordered_nodes) - reach_fix_point = False - while not reach_fix_point: - changed = False - for idx, node in enumerate(all_nodes): - nearest_node = _find_nearset_node( - self._dist_context.serial_ordered_nodes, idx) - if nearest_node is None: - continue - nearest_node_dis_attr = self._dist_context.get_dist_attr_for_graph( - nearest_node) - nearest_process_mesh = nearest_node_dis_attr.process_mesh - cur_node_dist_attr = self._dist_context.get_dist_attr_for_graph( - node) - cur_process_mesh = cur_node_dist_attr.process_mesh - compatible_process_mesh = compute_compatible_process_mesh( - [cur_process_mesh, nearest_process_mesh]) - if compatible_process_mesh is not None \ - and cur_process_mesh != compatible_process_mesh: - cur_node_dist_attr.process_mesh = compatible_process_mesh - changed = True - if changed: - reach_fix_point = False - total_changed = True - else: - reach_fix_point = True - if total_changed: - total_reach_fix_point = False - else: - total_reach_fix_point = True + def _update_dims_mapping_between_graphs(self): + changed = False + for parent_node, child_node in self._node_pairs_between_graphs: + 
parent_node_dist_attr = self._dist_context.get_dist_attr_for_graph( + parent_node) + child_node_dist_attr = self._dist_context.get_dist_attr_for_graph( + child_node) + parent_node_dims_mapping = parent_node_dist_attr.dims_mapping + child_node_dims_mapping = child_node_dist_attr.dims_mapping + compatible_dims_mapping = compute_compatible_dims_mapping( + [parent_node_dims_mapping, child_node_dims_mapping]) + if (compatible_dims_mapping is not None) \ + and (compatible_dims_mapping != parent_node_dims_mapping): + parent_node_dist_attr.dims_mapping = compatible_dims_mapping + changed = True + if (compatible_dims_mapping is not None) \ + and (compatible_dims_mapping != child_node_dims_mapping): + parent_node_dist_attr.dims_mapping = compatible_dims_mapping + changed = True + return changed def _update_dims_mapping(self): # Complete dims_mapping for each node @@ -318,11 +312,314 @@ class Completer: node, fwd=is_fwd) if op_changed: changed = True + graph_changed = self._update_dims_mapping_between_graphs() + if graph_changed: + changed = True if changed: reach_fix_point = False else: reach_fix_point = True + def _update_process_mesh_by_nearest(self, op_node, nearest_op_node): + op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) + # Set the process mesh of the op node by its nearest op node + if not op_dist_attr.is_annotated("process_mesh"): + process_mesh = op_dist_attr.process_mesh + nearest_op_dis_attr = self._dist_context.get_dist_attr_for_graph( + nearest_op_node) + nearest_process_mesh = nearest_op_dis_attr.process_mesh + compatible_process_mesh = compute_compatible_process_mesh( + [process_mesh, nearest_process_mesh]) + if compatible_process_mesh is not None \ + and process_mesh != compatible_process_mesh: + op_dist_attr.process_mesh = compatible_process_mesh + # Skip the process_mesh setting of inputs and outputs of while_op + if op_dist_attr.op_type == "while": + return + # Set the process mesh of the op node's leaf-inputs + for tensor_node in op_node.inputs: + if tensor_node.is_var() and tensor_node.var() is not None: + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + if tensor_dist_attr.is_annotated("process_mesh"): + continue + # Skip the non-leaf var node + if len(tensor_node.inputs) != 0: + continue + compatible_process_mesh = compute_compatible_process_mesh( + [tensor_dist_attr.process_mesh, op_dist_attr.process_mesh]) + if compatible_process_mesh is not None \ + and tensor_dist_attr.process_mesh != compatible_process_mesh: + tensor_dist_attr.process_mesh = compatible_process_mesh + # Set the process mesh of the op node's outputs + for tensor_node in op_node.outputs: + if tensor_node.is_var() and tensor_node.var() is not None: + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + if tensor_dist_attr.is_annotated("process_mesh"): + continue + compatible_process_mesh = compute_compatible_process_mesh( + [tensor_dist_attr.process_mesh, op_dist_attr.process_mesh]) + if compatible_process_mesh is not None \ + and tensor_dist_attr.process_mesh != compatible_process_mesh: + tensor_dist_attr.process_mesh = compatible_process_mesh + + def _update_process_mesh_for_specials(self): + def _find_nearest_tensor_node_before(nodes, idx, var_name): + for node in reversed(nodes[:idx]): + if node.is_var() and node.var() is not None \ + and node.var().name() == var_name: + return node + + def _find_nearest_tensor_node_after(nodes, idx, var_name): + for node in nodes[idx + 1:]: + if node.is_var() and node.var() is 
not None \ + and node.var().name() == var_name: + return node + + def _find_nodes_related_to_cond(source_node): + related_nodes = [] + visited = set() + frontier = list() + frontier.append(source_node) + # BFS + while len(frontier) != 0: + cur = frontier[0] + frontier = frontier[1:] + if _node_id(cur) in visited: + continue + # TODO: need more restrictions + for node in cur.inputs: + if node.is_var() and node.var() is not None: + if node.var().type() != core.VarDesc.VarType.READER \ + and len(node.var().shape()) == 1: + frontier.append(node) + related_nodes.append(node) + if node.is_op() and node.op() is not None: + flag = True + if node.op().type() == "create_py_reader" \ + or node.op().type() == "create_double_buffer_reader" \ + or node.op().type() == "read": + flag = False + for tensor_node in node.inputs: + if tensor_node.is_var() and tensor_node.var( + ) is not None: + if tensor_node.var().type() == core.VarDesc.VarType.READER \ + or len(tensor_node.var().shape()) != 1: + flag = False + break + for tensor_node in node.outputs: + if tensor_node.is_var() and tensor_node.var( + ) is not None: + if tensor_node.var().type() == core.VarDesc.VarType.READER \ + or len(tensor_node.var().shape()) != 1: + flag = False + break + if flag: + frontier.append(node) + related_nodes.append(node) + visited.add(_node_id(cur)) + return related_nodes + + # Amend the process meshes related to while_op + for while_op_node, while_op_node_idx in self._while_op_nodes.values(): + sub_graph_id = while_op_node.op()._block_attr_id("sub_block") + sub_graph = self._dist_context._serial_graph.get_sub_graph( + sub_graph_id) + sub_graph_nodes = list(sub_graph.all_nodes()) + while_dist_op = self._dist_context.get_dist_op_for_graph( + while_op_node) + while_op_dist_attr = while_dist_op.dist_attr + + # Step 1: set the process mesh of while_op to the merged process mesh of its subblock + merged_process_mesh = while_op_dist_attr.process_mesh + for node in sub_graph_nodes: + if (node.is_var() and node.var() is not None) \ + or (node.is_op() and node.op() is not None): + dist_attr = self._dist_context.get_dist_attr_for_graph(node) + merged_process_mesh = merge_process_mesh_two( + merged_process_mesh, dist_attr.process_mesh) + while_op_dist_attr.process_mesh = merged_process_mesh + + # Step 2: set the related nodes of while_op to the process mesh of while_op + # Step 2.1: Find related nodes of cond var the graph of while_op + cond_tensor_related_nodes = [] + cond_tensor_name = while_op_node.op().input("Condition")[0] + cond_tensor_node = None + for node in while_op_node.inputs: + if node.is_var() and node.var() is not None \ + and node.var().name() == cond_tensor_name: + cond_tensor_node = node + cond_tensor_related_nodes.append(cond_tensor_node) + break + + cond_tensor_related_nodes.extend( + _find_nodes_related_to_cond(cond_tensor_node)) + + # Step 2.2: Find related nodes of cond var in the subgraph of while_op + cond_tensor_node = None + for node in reversed(sub_graph_nodes): + if node.is_var() and node.var() is not None \ + and node.var().name() == cond_tensor_name \ + and len(node.outputs) == 0: + cond_tensor_node = node + break + + cond_tensor_related_nodes.extend( + _find_nodes_related_to_cond(cond_tensor_node)) + # Step 2.3: Add the StepScops output of while_op + stepscopes_tensor_name = while_op_node.op().output("StepScopes")[0] + stepscopes_tensor_node = None + for output_node in while_op_node.outputs: + if output_node.is_var() and output_node.var() is not None \ + and output_node.var().name() == 
stepscopes_tensor_name: + stepscopes_tensor_node = output_node + cond_tensor_related_nodes.append(stepscopes_tensor_node) + # Step 2.4: Set the process meshes of all nodes related to cond var to the process mesh of while op + for node in cond_tensor_related_nodes: + tensor_dist_attr = self._dist_context.get_dist_attr_for_graph( + node) + tensor_dist_attr.process_mesh = merged_process_mesh + + # Step 3: set the process meshes of the inputs in while_op to the process meshes of the outside input nodes + while_op_inputs_dist_attrs = while_op_dist_attr.inputs_dist_attrs + for tensor_name, tensor_dist_attr in while_op_inputs_dist_attrs.items( + ): + nearest_tensor_node = _find_nearest_tensor_node_before( + self._dist_context.serial_ordered_nodes, while_op_node_idx, + tensor_name) + nearest_tensor_dist_attr = self._dist_context.get_dist_attr_for_graph( + nearest_tensor_node) + tensor_dist_attr.process_mesh = nearest_tensor_dist_attr.process_mesh + + # Step 4: set the process meshes of the outputs in while_op to the process meshes of the outside output nodes + while_op_outputs_dist_attrs = while_op_dist_attr.outputs_dist_attrs + for tensor_name, tensor_dist_attr in while_op_outputs_dist_attrs.items( + ): + nearest_tensor_node = _find_nearest_tensor_node_before( + self._dist_context.serial_ordered_nodes, while_op_node_idx, + tensor_name) + if nearest_tensor_node is None: + nearest_tensor_node = _find_nearest_tensor_node_after( + self._dist_context.serial_ordered_nodes, + while_op_node_idx, tensor_name) + nearest_tensor_dist_attr = self._dist_context.get_dist_attr_for_graph( + nearest_tensor_node) + tensor_dist_attr.process_mesh = nearest_tensor_dist_attr.process_mesh + + # Amend the process meshes related to array + for array_node_list in self._array_nodes.values(): + merged_process_mesh = None + for array_node in array_node_list: + dist_attr = self._dist_context.get_dist_attr_for_graph( + array_node) + merged_process_mesh = merge_process_mesh_two( + merged_process_mesh, dist_attr.process_mesh) + for array_node in array_node_list: + dist_attr = self._dist_context.get_dist_attr_for_graph( + array_node) + dist_attr.process_mesh = merged_process_mesh + + def _update_process_mesh(self): + ordered_op_nodes = self._dist_context._serial_ordered_op_nodes + + # Step 1: Set the annotated process meshes from tensors to the first ops using them + ordered_tensor_nodes = self._dist_context._serial_ordered_tensor_nodes + for tensor_node in ordered_tensor_nodes: + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + if not tensor_dist_attr.is_annotated("process_mesh"): + continue + first_op_node = None + for op_node in ordered_op_nodes: + # TODO: Need a better rule for the control flow ops. 
+ # For now, do not set the process mesh of while_op from its inputs + if op_node.op().type() == "while": + continue + for input_tensor_node in op_node.inputs: + if _node_id(tensor_node) == _node_id(input_tensor_node): + first_op_node = op_node + break + if first_op_node is not None: + break + if first_op_node is None: + continue + op_dist_attr = self._dist_context.get_dist_attr_for_graph( + first_op_node) + if op_dist_attr is not None and not op_dist_attr.is_annotated( + "process_mesh"): + compatible_process_mesh = compute_compatible_process_mesh( + [tensor_dist_attr.process_mesh, op_dist_attr.process_mesh]) + if compatible_process_mesh is not None \ + and op_dist_attr.process_mesh != compatible_process_mesh: + op_dist_attr.process_mesh = compatible_process_mesh + + # Step 2: set the process meshes of ops with the nearest op before them + # Step 2.1: find the first op node which has the process mesh + idx_of_first_op_node_has_process_mesh = -1 + for idx, op_node in enumerate(ordered_op_nodes): + op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) + if op_dist_attr.process_mesh is not None \ + and idx_of_first_op_node_has_process_mesh == -1: + idx_of_first_op_node_has_process_mesh = idx + # Reuse the following method to set the related tensors for same op node + self._update_process_mesh_by_nearest(op_node, op_node) + # Step 2.2: set the process meshes of ops by the nearest op node after the first op node + if idx_of_first_op_node_has_process_mesh + 1 > len(ordered_op_nodes): + return None + for idx, op_node in enumerate(ordered_op_nodes[ + idx_of_first_op_node_has_process_mesh + 1:]): + original_idx = idx_of_first_op_node_has_process_mesh + +idx + 1 + nearest_op_node = ordered_op_nodes[original_idx - 1] + nearest_op_dist_attr = self._dist_context.get_dist_attr_for_graph( + nearest_op_node) + op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) + assert nearest_op_dist_attr.process_mesh is not None + self._update_process_mesh_by_nearest(op_node, nearest_op_node) + # Step 2.3: set the process meshes of ops by the nearest op node before the first op node + nearest_op_node = ordered_op_nodes[ + idx_of_first_op_node_has_process_mesh] + for op_node in ordered_op_nodes[:idx_of_first_op_node_has_process_mesh]: + self._update_process_mesh_by_nearest(op_node, nearest_op_node) + + # Step 3: adjust the process meshes for special ops + self._update_process_mesh_for_specials() + + def _prepare(self): + self._while_op_nodes = {} + self._array_nodes = {} + self._node_pairs_between_graphs = [] + all_nodes = self._dist_context.serial_ordered_nodes + for idx, node in enumerate(all_nodes): + if node.is_op(): + if node.op().type() == "while": + self._while_op_nodes[_node_id(node)] = (node, idx) + if node.op().type() == "read_from_array": + array_var_name = node.op().input("X")[0] + if self._array_nodes.get(array_var_name, None) is None: + self._array_nodes[array_var_name] = [] + self._array_nodes[array_var_name].append(node) + if node.op().type() == "write_to_array": + array_var_name = node.op().output("Out")[0] + if self._array_nodes.get(array_var_name, None) is None: + self._array_nodes[array_var_name] = [] + self._array_nodes[array_var_name].append(node) + self._array_nodes[array_var_name].append(node.outputs[0]) + if node.is_var() and node.var() is not None: + if node.node.graph_id() != 0: + for before_node in reversed(all_nodes[:idx]): + if before_node.is_var() and before_node.var() is not None \ + and before_node.node.graph_id() == node.node.graph_id() - 1 \ + and 
before_node.var().name() == node.var().name(): + self._node_pairs_between_graphs.append( + (before_node, node)) + for after_node in all_nodes[idx + 1:]: + if after_node.is_var() and after_node.var() is not None \ + and after_node.node.graph_id() == node.node.graph_id() - 1 \ + and after_node.var().name() == node.var().name(): + self._node_pairs_between_graphs.append( + (after_node, node)) + def complete_forward_annotation(self, serial_main_program): """ Complete annotation for the partial annotated serial_main_program. Arguments: @@ -336,24 +633,24 @@ class Completer: # Initialize distributed attributes for all var and op node in serial_main_program self._dist_context.init_dist_attr_for_program() + # print_program_with_dist_attr(serial_main_program, self._dist_context) # Initialize distributed attributes for all var and op node in graph self._dist_context.init_dist_attr_for_graph() + self._prepare() + self._update_process_mesh() - # Complete dims_mapping for each node self._update_dims_mapping() # Copy the corresponding distributed attribute from graph to serial_main_program self._dist_context.copy_dist_attr_from_graph_to_program() self._dist_context.clear_dist_info_for_graph() - # print_serial_main_program_with_dist_attr(serial_main_program, self._dist_context) # Do the validation check and amend some completion self._dist_context.amend_dist_attr_for_program() - # print_serial_main_program_with_dist_attr(serial_main_program, self._dist_context) self._dist_context.validate_dist_attr_for_program() return serial_main_program diff --git a/python/paddle/distributed/auto_parallel/dist_attribute.py b/python/paddle/distributed/auto_parallel/dist_attribute.py index b27cd7a37c9..8ec702ffcb0 100644 --- a/python/paddle/distributed/auto_parallel/dist_attribute.py +++ b/python/paddle/distributed/auto_parallel/dist_attribute.py @@ -175,6 +175,7 @@ class TensorDistributedAttribute: class OperatorDistributedAttribute: def __init__(self): self._process_mesh = None + self._op_type = None self._impl_type = None self._impl_idx = None self._inputs_dist_attrs = {} @@ -194,11 +195,23 @@ class OperatorDistributedAttribute: if isinstance(process_mesh, list): process_mesh = ProcessMesh(process_mesh) self._process_mesh = copy.deepcopy(process_mesh) + # In while op, the proess mesh is not shared by all inputs and outputs + if self._op_type == "while": + return None for dist_attr in self._inputs_dist_attrs.values(): dist_attr.process_mesh = process_mesh for dist_attr in self._outputs_dist_attrs.values(): dist_attr.process_mesh = process_mesh + @property + def op_type(self): + return self._op_type + + @op_type.setter + def op_type(self, op_type): + if op_type is not None: + self._op_type = op_type + @property def impl_type(self): return self._impl_type @@ -326,6 +339,8 @@ class OperatorDistributedAttribute: assert False, "No setter for {} in args {}.".format( key, dist_attr) # Make sure proscess_meshes in dist op be same + if self.op_type == "while": + return None process_meshes = [] process_meshes.append(self.process_mesh) for tensor_dist_attr in self.inputs_dist_attrs.values(): diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 573f23fdca5..2807c46540a 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -15,6 +15,7 @@ import copy from collections import defaultdict from paddle.fluid import framework +from paddle.fluid.framework import get_flags, set_flags 
from paddle.fluid import core from .dist_attribute import TensorDistributedAttribute from .dist_attribute import OperatorDistributedAttribute @@ -39,6 +40,10 @@ def set_default_distributed_context(dist_context): _g_default_distributed_context = dist_context +def _node_id(node): + return (node.node.graph_id(), node.node.id()) + + class DistributedContext: """ DistributedContext is used to collect related distributed information for program and graph. @@ -146,7 +151,7 @@ class DistributedContext: return None def get_dist_tensor_for_graph(self, serial_tensor_node): - serial_tensor_node_id = serial_tensor_node.id() + serial_tensor_node_id = _node_id(serial_tensor_node) return self._dist_tensors_for_graph.get(serial_tensor_node_id, None) def get_dist_op_for_program(self, serial_op): @@ -168,7 +173,7 @@ class DistributedContext: del self._dist_ops_for_program[serial_tensor_id] def get_dist_op_for_graph(self, serial_op_node): - serial_op_node_id = serial_op_node.id() + serial_op_node_id = _node_id(serial_op_node) return self._dist_ops_for_graph.get(serial_op_node_id, None) def get_tensor_dist_attr_for_program(self, serial_tensor): @@ -197,7 +202,7 @@ class DistributedContext: self.add_dist_tensor_for_program(dist_tensor) def get_tensor_dist_attr_for_graph(self, serial_tensor_node): - serial_tensor_node_id = serial_tensor_node.id() + serial_tensor_node_id = _node_id(serial_tensor_node) dist_tensor = self._dist_tensors_for_graph.get(serial_tensor_node_id, None) if dist_tensor: @@ -242,7 +247,7 @@ class DistributedContext: self.add_dist_op_for_program(dist_op) def get_op_dist_attr_for_graph(self, serial_op_node): - serial_op_node_id = serial_op_node.id() + serial_op_node_id = _node_id(serial_op_node) dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None) if dist_op: return dist_op.dist_attr @@ -262,7 +267,7 @@ class DistributedContext: def get_dist_attr_for_graph(self, serial_node): if serial_node.is_var() and serial_node.var() is not None: - serial_tensor_node_id = serial_node.id() + serial_tensor_node_id = _node_id(serial_node) dist_tensor = self._dist_tensors_for_graph.get( serial_tensor_node_id, None) if dist_tensor: @@ -270,7 +275,7 @@ class DistributedContext: else: return None if serial_node.is_op() and serial_node.op() is not None: - serial_op_node_id = serial_node.id() + serial_op_node_id = _node_id(serial_node) dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None) if dist_op: return dist_op.dist_attr @@ -311,40 +316,69 @@ class DistributedContext: def order_nodes_by_program_order(self): def _contains(nodes, target_node): for node in nodes: - if node.id() == target_node.id(): + if _node_id(node) == _node_id(target_node): return True return False - ordered_tensor_nodes = [] - ordered_op_nodes = [] - all_nodes = self._serial_graph.all_nodes() + serial_ordered_tensor_nodes = [] + serial_ordered_op_nodes = [] + all_nodes = [] + # for idx, graph in enumerate(self._serial_graph.all_sub_graphs()): + for idx, graph in enumerate(self._serial_graph.all_sub_graphs()): + for node in graph.all_nodes(): + all_nodes.append(node) for node in all_nodes: if node.is_var() and node.var() is not None: - ordered_tensor_nodes.append(node) + serial_ordered_tensor_nodes.append(node) if node.is_op() and node.op() is not None: - ordered_op_nodes.append(node) - ordered_tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) - ordered_op_nodes.sort(key=lambda node: node.node.original_desc_id()) - for op_node in ordered_op_nodes: + serial_ordered_op_nodes.append(node) + 
serial_ordered_tensor_nodes.sort( + key=lambda node: node.node.original_desc_id()) + serial_ordered_op_nodes.sort( + key=lambda node: node.node.original_desc_id()) + num_nodes_before = len(serial_ordered_tensor_nodes) + len( + serial_ordered_op_nodes) + + new_serial_ordered_tensor_nodes = [] + new_serial_ordered_op_nodes = [] + for op_node in serial_ordered_op_nodes: tensor_nodes = [] for tensor_node in op_node.inputs: if tensor_node.is_var() \ and tensor_node.var() is not None \ and not _contains(self._serial_ordered_nodes, tensor_node): tensor_nodes.append(tensor_node) + new_serial_ordered_tensor_nodes.append(tensor_node) tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) self._serial_ordered_nodes.extend(tensor_nodes) self._serial_ordered_nodes.append(op_node) + new_serial_ordered_op_nodes.append(op_node) tensor_nodes = [] for tensor_node in op_node.outputs: if tensor_node.is_var() \ and tensor_node.var() is not None \ and not _contains(self._serial_ordered_nodes, tensor_node): tensor_nodes.append(tensor_node) + new_serial_ordered_tensor_nodes.append(tensor_node) + tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) self._serial_ordered_nodes.extend(tensor_nodes) - num_nodes_before = len(ordered_tensor_nodes) + len(ordered_op_nodes) - assert len(self._serial_ordered_nodes) == num_nodes_before, \ - "The number of nodes before ordering is not the same after ordering." + new_serial_ordered_tensor_nodes.sort( + key=lambda node: node.node.original_desc_id()) + new_serial_ordered_op_nodes.sort( + key=lambda node: node.node.original_desc_id()) + self._serial_ordered_tensor_nodes = new_serial_ordered_tensor_nodes + self._serial_ordered_op_nodes = new_serial_ordered_op_nodes + assert len(self._serial_ordered_nodes) == len( + self._serial_ordered_tensor_nodes) + len( + self._serial_ordered_op_nodes) + self._serial_orphan_tensor_nodes = [] + for tensor_node in serial_ordered_tensor_nodes: + if not _contains(self._serial_ordered_tensor_nodes, tensor_node): + self._serial_orphan_tensor_nodes.append(tensor_node) + if len(self._serial_ordered_nodes) != num_nodes_before: + print( + "WARNING: there are some orphan tensors or ops which are not used in the execution." + ) def init_dist_attr_for_graph(self): assert self._is_initialized_for_program, \ @@ -352,9 +386,9 @@ class DistributedContext: if self._is_initialized_for_graph: return # Convert program to graph + set_flags({"FLAGS_convert_all_blocks": True}) self._serial_graph = framework.IrGraph( core.Graph(self._serial_program.desc)) - all_nodes = self._serial_graph.all_nodes() self.order_nodes_by_program_order() for node in self.serial_ordered_nodes: if node.is_var() and node.var() is not None: @@ -365,10 +399,11 @@ class DistributedContext: if tensor_id == cur_tensor_id \ or tensor_id == cur_dist_tensor.serial_tensor.desc.original_id(): dist_tensor = cur_dist_tensor - self._node_id_to_tensor_id[node.id()] = cur_tensor_id + self._node_id_to_tensor_id[_node_id( + node)] = cur_tensor_id assert dist_tensor is not None, \ "Tensor must have a distributed tensor after the initialization for program." 
- serial_tensor_node_id = node.id() + serial_tensor_node_id = _node_id(node) new_dist_tensor = DistributedTensor(dist_tensor.serial_tensor, dist_tensor.dist_attr) self._dist_tensors_for_graph[ @@ -381,10 +416,10 @@ class DistributedContext: if op_id == cur_op_id \ or op_id == cur_dist_op.serial_op.desc.original_id(): dist_op = cur_dist_op - self._node_id_to_op_id[node.id()] = cur_op_id + self._node_id_to_op_id[_node_id(node)] = cur_op_id assert dist_op is not None, \ "Operator must have a distributed operator after the initialization for program." - serial_op_node_id = node.id() + serial_op_node_id = _node_id(node) new_dist_op = DistributedOperator(dist_op.serial_op, dist_op.dist_attr) self._dist_ops_for_graph[serial_op_node_id] = new_dist_op @@ -402,10 +437,11 @@ class DistributedContext: assert self._is_initialized_for_program and self._is_initialized_for_graph, \ "Both program and graph must be initialized." updated_tensors = {} - all_nodes = self._serial_graph.all_nodes() + # all_nodes = self._serial_graph.all_nodes() + all_nodes = self._serial_ordered_nodes for node in all_nodes: if node.is_var() and node.var() is not None: - tensor_id = self._node_id_to_tensor_id[node.id()] + tensor_id = self._node_id_to_tensor_id[_node_id(node)] updated = updated_tensors.get(tensor_id, False) # If a var has multiples var nodes in graph, only use the first one for now if not updated: @@ -416,16 +452,31 @@ class DistributedContext: dist_tensor_for_program.dist_attr = tensor_dist_attr_for_graph updated_tensors[tensor_id] = True if node.is_op() and node.op() is not None: - op_id = self._node_id_to_op_id[node.id()] + op_id = self._node_id_to_op_id[_node_id(node)] op_dist_attr_for_graph = self.get_op_dist_attr_for_graph(node) dist_op_for_program = self._dist_ops_for_program[op_id] dist_op_for_program.dist_attr = op_dist_attr_for_graph + # TODO: the completion algorithm will skip orphan tensors, + # here we just set there process_mesh to the first one. 
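+        # An orphan tensor node is a var node that no ordered op node consumes or produces
+        # (see order_nodes_by_program_order above), so it is never used in the execution and
+        # simply inherits the first process_mesh.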
+ for orphan_node in self._serial_orphan_tensor_nodes: + serial_tensor_id = orphan_node.var().id() + dist_tensor = self._dist_tensors_for_program.get(serial_tensor_id, + None) + if dist_tensor: + dist_tensor.dist_attr.process_mesh = self._process_meshes[0] + else: + serial_tensor_id = orphan_node.var().original_id() + dist_tensor = self._dist_tensors_for_program.get( + serial_tensor_id, None) + dist_tensor.dist_attr.process_mesh = self._process_meshes[0] def amend_dist_attr_for_program(self): for dist_tensor in self._dist_tensors_for_program.values(): serial_tensor = dist_tensor.serial_tensor dist_attr = dist_tensor.dist_attr - if serial_tensor.type == core.VarDesc.VarType.READER: + if serial_tensor.type == core.VarDesc.VarType.READER \ + or serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES: tensor_shape = [] else: tensor_shape = serial_tensor.shape @@ -446,6 +497,7 @@ class DistributedContext: tensor_shape = [] else: if dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.READER \ + or dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ or dist_op.serial_op.type == "create_py_reader": tensor_shape = [] else: @@ -459,8 +511,9 @@ class DistributedContext: and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: dims_mapping[i] = -1 for arg_name in serial_op.output_arg_names: - if dist_op.get_serial_output( - arg_name).type == core.VarDesc.VarType.READER: + if dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.READER \ + or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.STEP_SCOPES: tensor_shape = [] else: tensor_shape = dist_op.get_serial_output(arg_name).shape @@ -498,7 +551,8 @@ class DistributedContext: for k, v in self.__dict__.items(): if k == "_serial_program" or k == "_serial_graph" \ or k == "_dist_main_programs" or k == "_dist_startup_programs" \ - or k == "_serial_ordered_nodes": + or k == "_serial_ordered_nodes" or k == "_serial_ordered_tensor_nodes" \ + or k == "_serial_ordered_op_nodes": setattr(result, k, v) else: setattr(result, k, copy.deepcopy(v, memo)) diff --git a/python/paddle/distributed/auto_parallel/dist_op.py b/python/paddle/distributed/auto_parallel/dist_op.py index 67de298564a..a2c2748a8ce 100644 --- a/python/paddle/distributed/auto_parallel/dist_op.py +++ b/python/paddle/distributed/auto_parallel/dist_op.py @@ -76,7 +76,8 @@ class DistributedOperator: if tensor is None: tensor_shape = [] else: - if tensor.type == core.VarDesc.VarType.READER: + if tensor.type == core.VarDesc.VarType.READER \ + or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY: tensor_shape = [] else: tensor_shape = tensor.shape @@ -86,7 +87,9 @@ class DistributedOperator: tensor_dims_mapping) for tensor_name in self._serial_op.output_arg_names: tensor = self._serial_op.block._var_recursive(tensor_name) - if tensor.type == core.VarDesc.VarType.READER or tensor.type == core.VarDesc.VarType.STEP_SCOPES: + if tensor.type == core.VarDesc.VarType.READER \ + or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or tensor.type == core.VarDesc.VarType.STEP_SCOPES: tensor_shape = [] else: tensor_shape = tensor.shape @@ -95,6 +98,8 @@ class DistributedOperator: tensor_dims_mapping = [-1 for _ in range(len(tensor_shape))] self._dist_attr.set_output_dims_mapping(tensor_name, tensor_dims_mapping) + if self._dist_attr.op_type is None: + self._dist_attr.op_type = 
self.serial_op.type if self._dist_attr.impl_type is None: self._dist_attr.impl_type = "default" if self._dist_attr.impl_idx is None: @@ -134,12 +139,16 @@ class DistributedOperator: return new_dist_attr def validate_dist_attr(self): - if "read" in self.serial_op.type: + if "read" in self.serial_op.type or "while" == self.serial_op.type: return True for name in self.serial_op.input_arg_names: input_dist_attr = self.dist_attr.get_input_dist_attr(name) dims_mapping = input_dist_attr.dims_mapping - shape = self.get_serial_input(name).shape + if self.get_serial_input( + name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY: + shape = [] + else: + shape = self.get_serial_input(name).shape if len(shape) != len(dims_mapping): return False for i in range(len(dims_mapping)): @@ -155,7 +164,11 @@ class DistributedOperator: for name in self.serial_op.output_arg_names: output_dist_attr = self.dist_attr.get_output_dist_attr(name) dims_mapping = output_dist_attr.dims_mapping - shape = self.get_serial_output(name).shape + if self.get_serial_output(name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY\ + or self.get_serial_output(name).type == core.VarDesc.VarType.STEP_SCOPES: + shape = [] + else: + shape = self.get_serial_output(name).shape if len(shape) != len(dims_mapping): return False for i in range(len(dims_mapping)): @@ -241,14 +254,14 @@ class DistributedModule: def __call__(self, *args, **kwargs): from .dist_context import get_default_distributed_context - main_prog = paddle.fluid.default_main_program() - main_block = main_prog.global_block() - op_size = len(main_block.ops) + default_prog = paddle.fluid.default_main_program() + cur_block = default_prog.current_block() + op_size = len(cur_block.ops) output = self._serial_module(*args, **kwargs) - new_op_size = len(main_block.ops) + new_op_size = len(cur_block.ops) default_dist_ctx = get_default_distributed_context() for idx in range(op_size, new_op_size): - op = main_block.ops[idx] + op = cur_block.ops[idx] dist_op = DistributedOperator(op, self._dist_attr) dist_op.dist_attr.mark_annotated_as(self._dist_attr) default_dist_ctx.add_dist_op_for_program(dist_op) diff --git a/python/paddle/distributed/auto_parallel/dist_tensor.py b/python/paddle/distributed/auto_parallel/dist_tensor.py index 5e3c852699a..a42ce863492 100644 --- a/python/paddle/distributed/auto_parallel/dist_tensor.py +++ b/python/paddle/distributed/auto_parallel/dist_tensor.py @@ -184,7 +184,9 @@ class DistributedTensor: def _init_default_dist_attr(self): if self._dist_attr.dims_mapping is None: - if self.serial_tensor.type == core.VarDesc.VarType.READER: + if self.serial_tensor.type == core.VarDesc.VarType.READER \ + or self.serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or self.serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES: tensor_shape = [] else: tensor_shape = self._serial_tensor.shape @@ -192,7 +194,9 @@ class DistributedTensor: self._dist_attr.dims_mapping = tensor_dims_mapping def validate_dist_attr(self): - if self.serial_tensor.type == core.VarDesc.VarType.READER: + if self.serial_tensor.type == core.VarDesc.VarType.READER \ + or self.serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or self.serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES: return True tensor_shape = self.serial_tensor.shape if len(tensor_shape) != len(self.dist_attr.dims_mapping): diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index 56beb895741..6bd1c5527a9 100644 --- 
a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -259,7 +259,7 @@ class Engine: "train_" + name: val for name, val in logs.items() } - self._logger.info(logs) + self._logger.info(train_logs) def _train_step(self, data): logs = {} diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index 4b079e7b6b5..47f76353e46 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -17,7 +17,9 @@ from ..dist_attribute import OperatorDistributedAttribute _g_distributed_operator_impl_containers = {} -_g_elementwise_ops = ["elementwise_add", "gelu", "dropout", "cast"] +_g_elementwise_ops = [ + "elementwise_add", "gelu", "dropout", "cast", "gather", "concat" +] BACKWARD_ONLY_DIST_OPS = {'check_finite_and_unscale', 'update_loss_scaling'} diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py index 4e977007261..de6d018d605 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -55,9 +55,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): op_dist_attr = dist_op.dist_attr for arg_name in op_desc.input_arg_names(): serial_tensor = dist_op.get_serial_input(arg_name) - if serial_tensor.is_parameter: - continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) + if serial_tensor.is_parameter: + for mapping in dims_mapping: + if mapping != -1: + return False + # continue + # if len(dims_mapping) < 1: + # continue if len(dims_mapping) > 1: for mapping in dims_mapping[1:]: if mapping != -1: @@ -73,9 +78,14 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): xshape_arg_names = op_desc.output("XShape") for arg_name in op_desc.output_arg_names(): serial_tensor = dist_op.get_serial_output(arg_name) - if serial_tensor.is_parameter: - continue dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) + if serial_tensor.is_parameter: + for mapping in dims_mapping: + if mapping != -1: + return False + # continue + # if len(dims_mapping) < 1: + # continue if arg_name not in xshape_arg_names: if len(dims_mapping) > 1: for mapping in dims_mapping[1:]: @@ -104,7 +114,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): for mapping in dims_mapping[1:]: if mapping != -1: return False - batch_dim_mappings.append(dims_mapping[0]) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) # Check output compatibility output_names = op_desc.output_names() @@ -121,7 +132,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): for mapping in dims_mapping[1:]: if mapping != -1: return False - batch_dim_mappings.append(dims_mapping[0]) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) else: if dims_mapping[0] != -1: return False @@ -129,7 +141,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): for mapping in dims_mapping[2:]: if mapping != -1: return False - batch_dim_mappings.append(dims_mapping[1]) + if len(dims_mapping) >= 2: + batch_dim_mappings.append(dims_mapping[1]) # Check batch dim mapping compatibility if not all(batch_dim_mappings[0] == dim_mapping @@ -143,7 +156,9 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr # The following statement will be 
replaced by a more elegent way - if op_desc.type() == "shape" or op_desc.type() == "slice": + if op_desc.type() == "shape" \ + or op_desc.type() == "slice" \ + or op_desc.type() == "while": return False output_names = op_desc.output_names() xshape_arg_names = [] @@ -155,17 +170,22 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - batch_dim_mappings.append(dims_mapping[0]) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) for arg_name in op_desc.output_arg_names(): serial_tensor = dist_op.get_serial_output(arg_name) if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) if arg_name not in xshape_arg_names: - batch_dim_mappings.append(dims_mapping[0]) + if len(dims_mapping) >= 1: + batch_dim_mappings.append(dims_mapping[0]) else: batch_dim_mappings.append(dims_mapping[1]) + if not batch_dim_mappings: + return changed + compatible_dim_mapping = compute_compatible_dim_mapping( batch_dim_mappings) assert compatible_dim_mapping is not None, "There is no compatible dim mapping." @@ -174,7 +194,8 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): if serial_tensor.is_parameter: continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if compatible_dim_mapping != dims_mapping[0]: + if len(dims_mapping + ) >= 1 and compatible_dim_mapping != dims_mapping[0]: dims_mapping[0] = compatible_dim_mapping changed = True for arg_name in op_desc.output_arg_names(): @@ -183,11 +204,13 @@ class DistributedDefaultImpl0(DistributedOperatorImpl): continue dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) if arg_name not in xshape_arg_names: - if compatible_dim_mapping != dims_mapping[0]: + if len(dims_mapping + ) >= 1 and compatible_dim_mapping != dims_mapping[0]: dims_mapping[0] = compatible_dim_mapping changed = True else: - if compatible_dim_mapping != dims_mapping[1]: + if len(dims_mapping + ) >= 2 and compatible_dim_mapping != dims_mapping[1]: dims_mapping[1] = compatible_dim_mapping changed = True diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 058ae1d0a9f..c92142cf738 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -1432,7 +1432,6 @@ class DistributedMatmulV2Impl2(DistributedOperatorImpl): if is_valid_list_index(y_dims_mapping, -2) and is_dim_shard(y_dims_mapping[-2]): return False - return True def is_output_compatible(self, dist_op): diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index 241eadcbace..86c274cb45c 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -1271,7 +1271,6 @@ def get_all_distributed_main_program(serial_program_info, dist_context, used_dist_context._dist_op_context = DistributedOperatorContext() _, _, dist_startup_program, dist_main_program, _ = copied_parallelizer._get_dist_program( rank_id, used_dist_context) - # print("dist_main_program: ", dist_main_program) all_dist_main_program.append(dist_main_program) return all_dist_main_program diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 4a2fba70de4..a730d21afa5 100644 --- 
a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -9,6 +9,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU) set_tests_properties(test_relaunch_with_gpt_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 240) py_test_modules(test_engine_api MODULES test_engine_api ENVS ${dist_ENVS}) set_tests_properties(test_engine_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80) + py_test_modules(test_while_op_completion MODULES test_while_op_completion ENVS ${dist_ENVS}) py_test_modules(test_converter MODULES test_converter ENVS ${dist_ENVS}) set_tests_properties(test_converter PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py new file mode 100644 index 00000000000..1179fd9a9f0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py @@ -0,0 +1,209 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import numpy as np +import paddle.nn as nn +import paddle.utils as utils +import paddle.static as static +import paddle.nn.functional as F +import paddle.distributed.auto_parallel as auto + +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.completion import Completer +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.utils import make_data_unshard +from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute +from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context +from paddle.distributed.auto_parallel.operators import find_best_compatible_distributed_operator_impl +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr + +paddle.enable_static() + +batch_size = 4 +epoch_num = 10 +hidden_size = 1024 +sequence_len = 512 +_g_process_mesh = [[0, 1], [2, 3]] + + +def get_random_inputs_and_labels(input_shape, label_shape): + input = np.random.random(size=input_shape).astype('float32') + label = np.random.random(size=label_shape).astype('float32') + return input, label + + +def batch_generator_creator(): + def __reader__(): + for _ in range(batch_size): + batch_input, batch_label = get_random_inputs_and_labels( + [batch_size, sequence_len, hidden_size], + [batch_size, sequence_len, 1]) + yield batch_input, batch_label + + return __reader__ + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + param_initializer = nn.initializer.Normal( + mean=0.0, std=initializer_range) + + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) 
+        self.linear0 = nn.Linear(
+            d_model,
+            dim_feedforward,
+            weight_attr=paddle.ParamAttr(initializer=param_initializer),
+            bias_attr=None)
+        self.linear1 = nn.Linear(
+            dim_feedforward,
+            d_model,
+            weight_attr=paddle.ParamAttr(initializer=param_initializer),
+            bias_attr=None)
+
+    def forward(self, input):
+        out = self.norm(input)
+        auto.shard_tensor(
+            self.linear0.weight,
+            dist_attr={
+                "process_mesh": _g_process_mesh[0],
+                "dims_mapping": [-1, 0]
+            })
+        out = self.linear0(out)
+        out = F.gelu(out, approximate=True)
+        auto.shard_tensor(
+            self.linear1.weight,
+            dist_attr={
+                "process_mesh": _g_process_mesh[1],
+                "dims_mapping": [0, -1]
+            })
+        out = self.linear1(out)
+
+        return out
+
+
+def loop_cond(i, loop_len, input_array):
+    return i < loop_len
+
+
+def loop_body(i, loop_len, input_array):
+    pre_input = paddle.tensor.array_read(array=input_array, i=i)
+    mlp_while0 = MLPLayer(
+        hidden_size=hidden_size,
+        intermediate_size=4 * hidden_size,
+        dropout_ratio=0.1,
+        initializer_range=0.02)
+
+    mlp_while1 = MLPLayer(
+        hidden_size=hidden_size,
+        intermediate_size=4 * hidden_size,
+        dropout_ratio=0.1,
+        initializer_range=0.02)
+
+    output = mlp_while0(pre_input)
+    cur_pred = mlp_while1(output)
+    # update the loop condition
+    i = paddle.increment(x=i, value=1)
+    paddle.tensor.array_write(cur_pred, array=input_array, i=i)
+    return i, loop_len, input_array
+
+
+def get_program():
+    dist_strategy = fleet.DistributedStrategy()
+    dist_strategy.semi_auto = True
+    # fleet.init(is_collective=True, strategy=dist_strategy)
+
+    train_program = static.Program()
+    start_program = static.Program()
+    with static.program_guard(train_program, start_program):
+
+        # loop counter
+        i = paddle.full(shape=[1], fill_value=0, dtype='int64')
+        # number of loop iterations
+        loop_len = paddle.full(shape=[1], fill_value=epoch_num, dtype='int64')
+
+        # input
+        input = static.data(
+            name="input",
+            shape=[batch_size, sequence_len, hidden_size],
+            dtype='float32')
+        label = static.data(
+            name="label", shape=[batch_size, sequence_len, 1], dtype='float32')
+        data_holder = [input, label]
+        # dataloader
+        dataloader = paddle.io.DataLoader.from_generator(
+            feed_list=data_holder, capacity=4 * batch_size, iterable=False)
+        dataloader.set_batch_generator(
+            batch_generator_creator(), places=paddle.static.cuda_places())
+        # data dist_attr
+        auto.shard_tensor(
+            input,
+            dist_attr={
+                "process_mesh": _g_process_mesh[0],
+                "dims_mapping": [-1, -1, -1]
+            })
+        auto.shard_tensor(
+            label,
+            dist_attr={
+                "process_mesh": _g_process_mesh[0],
+                "dims_mapping": [-1, -1, -1]
+            })
+
+        mlp_start = MLPLayer(
+            hidden_size=hidden_size,
+            intermediate_size=4 * hidden_size,
+            dropout_ratio=0.1,
+            initializer_range=0.02)
+        pred = mlp_start(input)
+
+        input_array = paddle.tensor.array_write(pred, i)
+        i, loop_len, input_array = static.nn.while_loop(
+            cond=loop_cond,
+            body=loop_body,
+            loop_vars=[i, loop_len, input_array])
+        end_pred = paddle.tensor.array_read(array=input_array, i=i)
+
+        mlp_end = MLPLayer(
+            hidden_size=hidden_size,
+            intermediate_size=4 * hidden_size,
+            dropout_ratio=0.1,
+            initializer_range=0.02)
+        pred = mlp_end(end_pred)
+
+        error_cost = paddle.nn.functional.square_error_cost(pred, label)
+        loss = paddle.mean(error_cost)
+
+    return train_program, start_program, dataloader, i, loss
+
+
+class TestMLP(unittest.TestCase):
+    def test_completer(self):
+        train_program, start_program, dataloader, i, loss = get_program()
+        dist_context = DistributedContext()
+        completer = Completer(dist_context)
+        complete_train_program = completer.complete_forward_annotation(
+            train_program)
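+        # A minimal inspection sketch (illustrative only, using the DistributedContext
+        # getters shown in dist_context.py above):
+        #   for block in complete_train_program.blocks:
+        #       for op in block.ops:
+        #           dist_op = dist_context.get_dist_op_for_program(op)
+        #           if dist_op is not None:
+        #               print(op.type, dist_op.dist_attr.process_mesh)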
# print_program_with_dist_attr(complete_train_program, dist_context) + + +if __name__ == "__main__": + unittest.main() -- GitLab From 59e5c49f850f9e94b49a0a75136efc2e19918a3c Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 16 Mar 2022 10:27:56 +0800 Subject: [PATCH 087/176] move gather infershape (#40594) --- paddle/fluid/operators/gather_op.cc | 72 ++++++----------------------- paddle/phi/infermeta/binary.cc | 49 ++++++++++++++++++++ paddle/phi/infermeta/binary.h | 6 +++ 3 files changed, 68 insertions(+), 59 deletions(-) diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 7910d94298e..9f2b48a24b4 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -15,9 +15,14 @@ limitations under the License. */ #include #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -26,58 +31,6 @@ class GatherOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of GatherOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of GatherOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of GatherOp should not be null.")); - - auto index_dims = ctx->GetInputDim("Index"); - - if (index_dims.size() == 2) { - PADDLE_ENFORCE_EQ( - index_dims[1], 1, - platform::errors::InvalidArgument( - "The last dim of index should be 1 when it is 2D, but we get %d", - index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - index_dims.size(), 1, - platform::errors::InvalidArgument( - "The index should be 1D, when it is not 2D, but we get %d", - index_dims.size())); - } - - auto axis = ctx->Attrs().Get("axis"); - auto input_dim = ctx->GetInputDim("X"); - if (ctx->HasInput("Axis") || axis == 0) { - // if HasInput("Axis"), we can not obtain correct shape of output - int batch_size = index_dims[0]; - framework::DDim output_dims(input_dim); - output_dims[0] = batch_size; - ctx->SetOutputDim("Out", output_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } else { - int index_size = index_dims[0]; - std::vector out_dim_vec; - for (int i = 0; i < axis; i++) { - out_dim_vec.push_back(input_dim[i]); - } - out_dim_vec.push_back(index_size); - for (int i = axis + 1; i < input_dim.size(); i++) { - out_dim_vec.push_back(input_dim[i]); - } - auto output_dims = phi::make_ddim(out_dim_vec); - ctx->SetOutputDim("Out", output_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -100,11 +53,6 @@ class GatherGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X")); - } - protected: framework::OpKernelType 
GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -193,11 +141,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(gather, GatherInferShapeFunctor, + PD_INFER_META(phi::GatherInferMeta)); REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, ops::GatherGradOpMaker, - ops::GatherGradOpMaker); + ops::GatherGradOpMaker, + GatherInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(gather_grad, GatherGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); REGISTER_OPERATOR(gather_grad, ops::GatherGradOp, - ops::GatherGradNoNeedBufferVarInferer); + ops::GatherGradNoNeedBufferVarInferer, + GatherGradInferShapeFunctor); REGISTER_OP_VERSION(gather) .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC", diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index ff2cf81a904..ffb1ed54502 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -431,6 +431,55 @@ void ElementwiseRawInferMeta(const MetaTensor& x, out->share_lod(x); } +void GatherInferMeta(const MetaTensor& x, + const MetaTensor& index, + const Scalar& axis, + MetaTensor* out) { + auto index_dims = index.dims(); + + if (index_dims.size() == 2) { + PADDLE_ENFORCE_EQ( + index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of index should be 1 when it is 2D, but we get %d", + index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + index_dims.size(), + 1, + phi::errors::InvalidArgument( + "The index should be 1D, when it is not 2D, but we get %d", + index_dims.size())); + } + + auto input_dim = x.dims(); + auto axis_v = axis.to(); + if (axis.FromTensor() || axis_v == 0) { + // if axis.FromTensor(), we can not obtain correct shape of output + int batch_size = index_dims[0]; + phi::DDim output_dims(input_dim); + output_dims[0] = batch_size; + out->set_dims(output_dims); + out->set_dtype(x.dtype()); + out->share_lod(x); + } else { + int index_size = index_dims[0]; + std::vector out_dim_vec; + for (int i = 0; i < axis_v; i++) { + out_dim_vec.push_back(input_dim[i]); + } + out_dim_vec.push_back(index_size); + for (int i = axis_v + 1; i < input_dim.size(); i++) { + out_dim_vec.push_back(input_dim[i]); + } + auto output_dims = phi::make_ddim(out_dim_vec); + out->set_dims(output_dims); + out->set_dtype(x.dtype()); + out->share_lod(x); + } +} + void GatherNdInferMeta(const MetaTensor& x, const MetaTensor& index, MetaTensor* out) { diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index cfae45cf04b..d852db7a846 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include "paddle/phi/common/scalar.h" #include "paddle/phi/core/meta_tensor.h" namespace phi { @@ -81,6 +82,11 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta, int axis, MetaTensor* out); +void GatherInferMeta(const MetaTensor& x, + const MetaTensor& index, + const Scalar& axis, + MetaTensor* out); + void GatherNdInferMeta(const MetaTensor& x, const MetaTensor& index, MetaTensor* out); -- GitLab From 80194bdea846b5c3b14a0437941b3780ebc1360b Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 16 Mar 2022 10:37:43 +0800 Subject: [PATCH 088/176] Polish reshape error message under @to_static (#40599) --- python/paddle/fluid/layers/nn.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index fd7226c4866..000f08b0a3e 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6299,7 +6299,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): if dim_size == -1: assert unk_dim_idx == -1, ( "Only one dimension value of 'shape' in reshape can " - "be -1. But received shape[%d] is also -1." % dim_idx) + "be -1. But received shape[%d] is also -1.\n" + "\n\t# N = x.shape()[2]\t\t# N is an int. " + "(NOT recommend under @to_static)\n\tN = paddle.shape(x)[2]\t\t" + "# N is a Tensor. (Recommend)\n\tz = paddle.reshape([N, -1, 4])" + "\t# z.shape is [-1, -1, 4]\n\n" + " If your target shape in Reshape represents dynamic shape, " + "please turn it into a Tensor under @to_static. See above example for details." + % dim_idx) unk_dim_idx = dim_idx elif dim_size == 0: assert dim_idx < len(x.shape), ( -- GitLab From f748b433850af9552094c3469b9d603795c6b1c4 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Wed, 16 Mar 2022 10:54:56 +0800 Subject: [PATCH 089/176] Fixed issue with default-valued attributes (#40368) --- .../auto_code_generator/final_state_generator/eager_gen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 656418a05ad..bc30f6aa03f 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -1245,7 +1245,7 @@ if __name__ == "__main__": # Node Definition Generation definition_declaration_pair = GenerateForwardDefinition( fwd_api_name, bwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, + forward_outputs_position_map, orig_forward_attrs_list, backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map, backward_attrs_list, optional_inputs, intermediate_outputs) @@ -1257,7 +1257,7 @@ if __name__ == "__main__": # For python-level API dispatch CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, forward_outputs_position_map, - forward_attrs_list) + orig_forward_attrs_list) if len(namespace) > 0: forward_definition_str += f"""namespace {namespace} {{ -- GitLab From ad81f22c38e81a43d2869e254a75fcfdc53be9a8 Mon Sep 17 00:00:00 2001 From: qipengh Date: Wed, 16 Mar 2022 11:18:48 +0800 Subject: [PATCH 090/176] [MLU] support amp O1 of mlu (#40461) --- .../fluid/framework/data_device_transform.cc | 8 +++++++ paddle/fluid/imperative/amp_auto_cast.cc | 12 +++++++++- paddle/fluid/operators/batch_norm_op_mlu.cc | 23 +++++++++++-------- .../contrib/mixed_precision/fp16_lists.py | 3 +++ 
python/paddle/fluid/dygraph/amp/auto_cast.py | 9 ++++++-- .../paddle/fluid/dygraph/amp/loss_scaler.py | 3 ++- 6 files changed, 45 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index 1a4f283f511..589d09bf81c 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -34,6 +34,14 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place, return; } + // NOTE(hqp): Special case for CPU->MLU, avoid stream sync. + if (platform::is_cpu_place(in.place()) && platform::is_mlu_place(dst_place)) { + paddle::framework::TensorCopy( + in, dst_place, *platform::DeviceContextPool::Instance().Get(dst_place), + out); + return; + } + // NOTE(yy): TransDataDevice should wait for computation of input. if (!platform::is_cuda_pinned_place(in.place())) { platform::DeviceContextPool::Instance().Get(in.place())->Wait(); diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index dd00b75666d..7d60b7d26f3 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -124,7 +124,7 @@ AmpOperators::AmpOperators() OpSupportedInfos("GPU", paddle::framework::proto::VarType::BF16)); unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(), unsupported_ops_gpu_bf16.end()); -// NOTE: GPU/NPU/XPU is compiled seperatly. +// NOTE: GPU/NPU/XPU/MLU is compiled seperatly. #elif defined(PADDLE_WITH_ASCEND_CL) auto unsupported_ops_npu_fp16 = std::get<2>( OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16)); @@ -143,6 +143,15 @@ AmpOperators::AmpOperators() OpSupportedInfos("XPU", paddle::framework::proto::VarType::BF16)); unsupported_bf16_ops_->insert(unsupported_ops_xpu_bf16.begin(), unsupported_ops_xpu_bf16.end()); +#elif defined(PADDLE_WITH_MLU) + auto unsupported_ops_mlu_fp16 = std::get<2>( + OpSupportedInfos("MLU", paddle::framework::proto::VarType::FP16)); + unsupported_fp16_ops_->insert(unsupported_ops_mlu_fp16.begin(), + unsupported_ops_mlu_fp16.end()); + auto unsupported_ops_mlu_bf16 = std::get<2>( + OpSupportedInfos("MLU", paddle::framework::proto::VarType::BF16)); + unsupported_bf16_ops_->insert(unsupported_ops_mlu_bf16.begin(), + unsupported_ops_mlu_bf16.end()); #endif VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " " << unsupported_fp16_ops_->size() << " " @@ -210,6 +219,7 @@ inline bool NeedCast(const std::shared_ptr& var) { if (paddle::platform::is_gpu_place(place) || paddle::platform::is_cuda_pinned_place(place) || paddle::platform::is_xpu_place(place) || + paddle::platform::is_mlu_place(place) || paddle::platform::is_npu_place(place) || paddle::platform::is_npu_pinned_place(place)) { // CudaPinndePlace is added for varbase created by dataloader diff --git a/paddle/fluid/operators/batch_norm_op_mlu.cc b/paddle/fluid/operators/batch_norm_op_mlu.cc index 0e64b461786..6507890a8b5 100644 --- a/paddle/fluid/operators/batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/batch_norm_op_mlu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { @@ -20,6 +21,8 @@ namespace operators { template class MLUBatchNormOpKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { const auto &place = ctx.GetPlace(); @@ -68,10 +71,10 @@ class MLUBatchNormOpKernel : public framework::OpKernel { // alloc memory y->mutable_data(place); - mean_out->mutable_data(place); - variance_out->mutable_data(place); - saved_mean->mutable_data(place); - saved_variance->mutable_data(place); + mean_out->mutable_data(place); + variance_out->mutable_data(place); + saved_mean->mutable_data(place); + saved_variance->mutable_data(place); Tensor transformed_x; Tensor transformed_y; @@ -132,6 +135,8 @@ class MLUBatchNormOpKernel : public framework::OpKernel { template class MLUBatchNormGradOpKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *x = ctx.Input("X"); @@ -154,10 +159,10 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); auto d_x_tmp = ctx.AllocateTmpTensor(x->dims(), dev_ctx); - auto scale_grad_tmp = - ctx.AllocateTmpTensor(scale->dims(), dev_ctx); + auto scale_grad_tmp = ctx.AllocateTmpTensor( + scale->dims(), dev_ctx); auto bias_grad_tmp = - ctx.AllocateTmpTensor(bias->dims(), dev_ctx); + ctx.AllocateTmpTensor(bias->dims(), dev_ctx); if (d_x == nullptr) { d_x = &d_x_tmp; @@ -171,8 +176,8 @@ class MLUBatchNormGradOpKernel : public framework::OpKernel { const auto &place = ctx.GetPlace(); d_x->mutable_data(place); - d_scale->mutable_data(place); - d_bias->mutable_data(place); + d_scale->mutable_data(place); + d_bias->mutable_data(place); use_global_stats = is_test || use_global_stats; diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index 80d2ccb0d5c..9dba5d658df 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -173,6 +173,9 @@ if core.is_compiled_with_xpu(): elif core.is_compiled_with_npu(): _, _, _sys_unsupported_fp16_list = core.op_supported_infos( 'NPU', core.VarDesc.VarType.FP16) +elif core.is_compiled_with_mlu(): + _, _, _sys_unsupported_fp16_list = core.op_supported_infos( + 'MLU', core.VarDesc.VarType.FP16) else: _, _, _sys_unsupported_fp16_list = core.op_supported_infos( 'GPU', core.VarDesc.VarType.FP16) diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index a449bdf0a18..4127f1e4449 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -271,13 +271,14 @@ def amp_guard(enable=True, "current_tracer is None, maybe it is not in imperative mode.") # check device_type: - # NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16, npu for float16. + # NOTE: Now, amp only support gpu for float16 and bfloat16, xpu for float16, mlu for float16, npu for float16. # Maybe we will support cpu for bfloat16. 
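    # Usage sketch (illustrative, assuming the public paddle.amp API): on an MLUPlace,
    #     with paddle.amp.auto_cast(enable=True, level='O1'):
    #         loss = model(data).mean()
    # now keeps float16-capable ops in float16 instead of warning and falling back to float32.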
if enable and not (tracer._expected_place.is_gpu_place() or tracer._expected_place.is_xpu_place() or + tracer._expected_place.is_mlu_place() or tracer._expected_place.is_npu_place()): warnings.warn( - 'amp_guard can only be enabled on CUDAPlace, XPUPlace, and NPUPlace, current place is %s, so it makes no effect.' + 'amp_guard can only be enabled on CUDAPlace, XPUPlace, MLUPlace, and NPUPlace, current place is %s, so it makes no effect.' % tracer._expected_place) enable = False # For npu: @@ -288,6 +289,10 @@ def amp_guard(enable=True, if tracer._expected_place.is_xpu_place() and (dtype == 'bfloat16'): warnings.warn('XPUPlace only support float16 amp.') enable = False + # For mlu: + if tracer._expected_place.is_mlu_place() and (dtype == 'bfloat16'): + warnings.warn('MLUPlace only support float16 amp.') + enable = False # For gpu float16: Compute Capability should >= 7. # For gpu bfloat16: Compute Capability should >= 8 & CUDA Version should >= 11. if tracer._expected_place.is_gpu_place(): diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index 3ca4c7dca76..c5729086194 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -106,9 +106,10 @@ class AmpScaler(object): if enable and not (tracer._expected_place.is_gpu_place() or tracer._expected_place.is_xpu_place() or + tracer._expected_place.is_mlu_place() or tracer._expected_place.is_npu_place()): warnings.warn( - 'AmpScaler can only be enabled on CUDAPlace, XPUPlace and NPUPlace, current place is %s, so it makes no effect.' + 'AmpScaler can only be enabled on CUDAPlace, XPUPlace, MLUPlace and NPUPlace, current place is %s, so it makes no effect.' % tracer._expected_place) enable = False -- GitLab From 8fd20b5bab6c438150eaa9d27b75590ce9e75527 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 16 Mar 2022 11:38:04 +0800 Subject: [PATCH 091/176] [Phi] Move grid sample op kernel into phi (#40585) * add grid sample phi kernel * add grid sample phi kernel and remove original kernel * replace mutable_data by alloc --- paddle/fluid/operators/grid_sampler_op.cc | 11 +- paddle/fluid/operators/grid_sampler_op.cu | 492 -------------- paddle/fluid/operators/grid_sampler_op.h | 600 ------------------ .../kernels/cpu/grid_sample_grad_kernel.cc | 357 +++++++++++ paddle/phi/kernels/cpu/grid_sample_kernel.cc | 184 ++++++ paddle/phi/kernels/cpu/grid_sample_utils.h | 160 +++++ .../kernels/gpu/grid_sample_grad_kernel.cu | 324 ++++++++++ paddle/phi/kernels/gpu/grid_sample_kernel.cu | 233 +++++++ paddle/phi/kernels/gpu/grid_sample_utils.h | 30 + paddle/phi/kernels/grid_sample_grad_kernel.h | 34 + paddle/phi/kernels/grid_sample_kernel.h | 32 + paddle/phi/ops/compat/grid_sampler_sig.cc | 43 ++ 12 files changed, 1398 insertions(+), 1102 deletions(-) delete mode 100644 paddle/fluid/operators/grid_sampler_op.cu delete mode 100644 paddle/fluid/operators/grid_sampler_op.h create mode 100644 paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/grid_sample_kernel.cc create mode 100644 paddle/phi/kernels/cpu/grid_sample_utils.h create mode 100644 paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/grid_sample_kernel.cu create mode 100644 paddle/phi/kernels/gpu/grid_sample_utils.h create mode 100644 paddle/phi/kernels/grid_sample_grad_kernel.h create mode 100644 paddle/phi/kernels/grid_sample_kernel.h create mode 100644 paddle/phi/ops/compat/grid_sampler_sig.cc diff --git 
a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 04aa6a3e10f..6ee9582dacd 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/grid_sampler_op.h" #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" @@ -229,15 +229,6 @@ REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker, ops::GridSampleGradMaker); REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad); -REGISTER_OP_CPU_KERNEL( - grid_sampler, - ops::GridSampleOpKernel, - ops::GridSampleOpKernel); -REGISTER_OP_CPU_KERNEL( - grid_sampler_grad, - ops::GridSampleGradOpKernel, - ops::GridSampleGradOpKernel); - REGISTER_OP_VERSION(grid_sampler) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu deleted file mode 100644 index a227a8e3127..00000000000 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ /dev/null @@ -1,492 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/grid_sampler_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -static __forceinline__ __device__ bool in_bounds(int h, int w, int H, int W) { - return h >= 0 && h < H && w >= 0 && w < W; -} - -template -static __forceinline__ __device__ void atomic_add(T* data, int h, int w, int sH, - int sW, int H, int W, - T delta) { - if (in_bounds(h, w, H, W)) { - platform::CudaAtomicAdd(data + h * sH + w * sW, delta); - } -} - -template -static __forceinline__ __device__ T _unnormalize(T coord, int size, - bool align_corners) { - if (align_corners) { - return ((coord + 1.f) / 2) * (size - 1); - } else { - return ((coord + 1.f) * size - 1) / 2; - } -} - -template -static __forceinline__ __device__ T clip_indexes(T in, int max_value) { - return min(static_cast(max_value), max(in, static_cast(0))); -} - -template -static __forceinline__ __device__ T reflect_indexes(T in, int twice_low, - int twice_high) { - if (twice_low == twice_high) { - return static_cast(0); - } - T min = static_cast(twice_low) / 2; - T span = static_cast(twice_high - twice_low) / 2; - in = fabs(in - min); - T extra = fmod(in, span); - int flips = static_cast(floor(in / span)); - if (flips % 2 == 0) { - return extra + min; - } else { - return span - extra + min; - } -} - -template -static __forceinline__ __device__ T compute_positions(T coord, int size, - PaddingMode padding_mode, - bool align_corners) { - coord = _unnormalize(coord, size, align_corners); - if (padding_mode == PaddingMode::border) { - coord = clip_indexes(coord, size - 1); - } else if (padding_mode == PaddingMode::reflect) { - if (align_corners) { - coord = reflect_indexes(coord, 0, 2 * (size - 1)); - } else { - coord = reflect_indexes(coord, -1, 2 * size - 1); - } - coord = clip_indexes(coord, size - 1); - } - return coord; -} - -template -static __forceinline__ __device__ T _unnormalize_with_mask(T coord, int size, - bool align_corners, - T* grad_in) { - if (align_corners) { - *grad_in = static_cast(size - 1) / 2; - return ((coord + 1.f) / 2) * (size - 1); - } else { - *grad_in = static_cast(size) / 2; - return ((coord + 1.f) * size - 1) / 2; - } -} - -template -static __forceinline__ __device__ T clip_indexes_with_mask(T in, int clip_limit, - T* grad_in) { - if (in <= static_cast(0)) { - *grad_in = static_cast(0); - return static_cast(0); - } else { - T max = static_cast(clip_limit - 1); - if (in >= max) { - *grad_in = static_cast(0); - return max; - } else { - *grad_in = static_cast(1); - return in; - } - } -} - -template -static __forceinline__ __device__ T -reflect_indexes_with_mask(T in, int twice_low, int twice_high, T* grad_in) { - if (twice_low == twice_high) { - *grad_in = static_cast(0); - return static_cast(0); - } - int grad_in_mult_; - T min = static_cast(twice_low) / 2; - T span = static_cast(twice_high - twice_low) / 2; - in = in - min; - if (in < static_cast(0)) { - grad_in_mult_ = -1; - in = -in; - } else { - grad_in_mult_ = 1; - } - T extra = fmod(in, span); - int flips = static_cast(floor(in / span)); - if (flips % 2 == 0) { - *grad_in = static_cast(grad_in_mult_); - return extra + min; - } else { - *grad_in = static_cast(-grad_in_mult_); - return span - extra + min; - } -} - -template -static 
__forceinline__ __device__ T -compute_positions_with_mask(T coord, int size, PaddingMode padding_mode, - bool align_corners, T* grad_in) { - T grad_clip, grad_refl; - coord = _unnormalize_with_mask(coord, size, align_corners, grad_in); - if (padding_mode == PaddingMode::border) { - coord = clip_indexes_with_mask(coord, size, &grad_clip); - *grad_in = (*grad_in) * grad_clip; - } else if (padding_mode == PaddingMode::reflect) { - if (align_corners) { - coord = reflect_indexes_with_mask(coord, 0, 2 * (size - 1), &grad_refl); - } else { - coord = reflect_indexes_with_mask(coord, -1, 2 * size - 1, &grad_refl); - } - coord = clip_indexes_with_mask(coord, size, &grad_clip); - *grad_in = (*grad_in) * grad_refl * grad_clip; - } - - return coord; -} - -template -__global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c, - int out_h, int out_w, int in_h, - int in_w, const T* input, const T* grid, - T* output, const Mode mode, - const PaddingMode padding_mode, - bool align_corners) { - int inp_sN = out_c * in_h * in_w; - - int inp_sC = in_h * in_w; - int inp_sH = in_w; - int inp_sW = 1; - int grid_sN = out_h * out_w * 2; - int grid_sH = out_w * 2; - int grid_sW = 2; - int grid_sCoor = 1; - int out_sN = out_c * out_h * out_w; - int out_sC = out_h * out_w; - int out_sH = out_w; - int out_sW = 1; - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_w; - const int h = (index / out_w) % out_h; - const int n = index / (out_h * out_w); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - T ix = grid[grid_offset]; - T iy = grid[grid_offset + grid_sCoor]; - - ix = compute_positions(ix, in_w, padding_mode, align_corners); - iy = compute_positions(iy, in_h, padding_mode, align_corners); - if (mode == Mode::bilinear) { - int ix_nw = static_cast(floor(ix)); - int iy_nw = static_cast(floor(iy)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - T nw = (ix_se - ix) * (iy_se - iy); - T ne = (ix - ix_sw) * (iy_sw - iy); - T sw = (ix_ne - ix) * (iy - iy_ne); - T se = (ix - ix_nw) * (iy - iy_nw); - - auto inp_offset_NC = n * inp_sN; - - auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < out_c; - ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { - *out_ptr_NCHW = static_cast(0); - if (in_bounds(iy_nw, ix_nw, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (in_bounds(iy_ne, ix_ne, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne; - } - if (in_bounds(iy_sw, ix_sw, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw; - } - if (in_bounds(iy_se, ix_se, in_h, in_w)) { - *out_ptr_NCHW += - input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se; - } - } - } else if (mode == Mode::nearest) { - int ix_nearest = static_cast(std::nearbyint(ix)); - int iy_nearest = static_cast(std::nearbyint(iy)); - auto inp_offset_NC = n * inp_sN; - auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < out_c; - ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { - if (in_bounds(iy_nearest, ix_nearest, in_h, in_w)) { - *out_ptr_NCHW = - input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCHW = static_cast(0); - } - } - } - } -} - -template -class GridSampleOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const 
framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.cuda_device_context(); - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode_s = ctx.Attr("padding_mode"); - auto mode_s = ctx.Attr("mode"); - PaddingMode padding_mode; - Mode mode; - if (padding_mode_s == "border") { - padding_mode = PaddingMode::border; - } else if (padding_mode_s == "reflection") { - padding_mode = PaddingMode::reflect; - } else { - padding_mode = PaddingMode::zeros; - } - - if (mode_s == "nearest") { - mode = Mode::nearest; - } else { - mode = Mode::bilinear; - } - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h - << "; out_w: " << out_w; - auto* output = ctx.Output("Output"); - auto* output_data = output->mutable_data(ctx.GetPlace()); - VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1] - << "; " << output->dims()[2] << "; " << output->dims()[3]; - int count = static_cast(n * out_h * out_w); - auto cu_stream = dev_ctx.stream(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, count); - grid_sample_cuda_kernel< - T><<>>( - count, n, c, out_h, out_w, in_h, in_w, input->data(), - grid->data(), output_data, mode, padding_mode, align_corners); - } -}; - -template -__global__ void grid_sampler_cuda_backward_kernel( - const int nthreads, const T* grad_output, const T* input, const T* grid, - int n, int out_c, int out_h, int out_w, int in_h, int in_w, T* grad_input, - T* grad_grid, const Mode mode, const PaddingMode padding_mode, - bool align_corners) { - int inp_sN = out_c * in_h * in_w; - int inp_sC = in_h * in_w; - int inp_sH = in_w; - int inp_sW = 1; - int grid_sN = out_h * out_w * 2; - int grid_sH = out_w * 2; - int grid_sW = 2; - int grid_sCoor = 1; - - int gOut_sN = out_c * out_h * out_w; - int gOut_sC = out_h * out_w; - int gOut_sH = out_w; - int gOut_sW = 1; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_w; - const int h = (index / out_w) % out_h; - const int n = index / (out_h * out_w); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - T ix = grid[grid_offset]; - T iy = grid[grid_offset + grid_sCoor]; - - T gix_mult, giy_mult; - ix = compute_positions_with_mask(ix, in_w, padding_mode, align_corners, - &gix_mult); - iy = compute_positions_with_mask(iy, in_h, padding_mode, align_corners, - &giy_mult); - - if (mode == Mode::bilinear) { - int ix_nw = static_cast(floor(ix)); - int iy_nw = static_cast(floor(iy)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - T nw = (ix_se - ix) * (iy_se - iy); - T ne = (ix - ix_sw) * (iy_sw - iy); - T sw = (ix_ne - ix) * (iy - iy_ne); - T se = (ix - ix_nw) * (iy - iy_nw); - - T gix = static_cast(0), giy = static_cast(0); - int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; - T* gInp_ptr_NC = grad_input + n * inp_sN; - int inp_offset_NC = n * inp_sN; - for (int c = 0; c < out_c; ++c, inp_offset_NC += inp_sC, - gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { - T gOut = grad_output[gOut_offset]; - - atomic_add(gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w, - nw * gOut); - atomic_add(gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w, - ne * gOut); - 
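
The nw/ne/sw/se terms in the forward kernel and in the atomic_add calls of the backward kernel are the standard bilinear weights: each corner is weighted by the area of the rectangle between the sample point and the opposite corner, so the forward pass gathers the four corner values with these weights and the backward pass scatters the output gradient onto the same four corners with the same weights. A minimal standalone sketch of that relationship, with a made-up 4x4 image and sample point (not code from this patch):

    #include <cmath>
    #include <cstdio>

    int main() {
      const int in_h = 4, in_w = 4;
      float img[in_h][in_w];
      for (int y = 0; y < in_h; ++y)
        for (int x = 0; x < in_w; ++x) img[y][x] = static_cast<float>(y * in_w + x);

      float ix = 1.3f, iy = 2.6f;  // already unnormalized pixel coordinates
      int ix_nw = static_cast<int>(std::floor(ix));
      int iy_nw = static_cast<int>(std::floor(iy));
      int ix_se = ix_nw + 1, iy_se = iy_nw + 1;

      // Each weight is the area of the rectangle opposite its corner.
      float nw = (ix_se - ix) * (iy_se - iy);
      float ne = (ix - ix_nw) * (iy_se - iy);
      float sw = (ix_se - ix) * (iy - iy_nw);
      float se = (ix - ix_nw) * (iy - iy_nw);

      // Forward: gather the 4 corners with these weights.
      float out = img[iy_nw][ix_nw] * nw + img[iy_nw][ix_se] * ne +
                  img[iy_se][ix_nw] * sw + img[iy_se][ix_se] * se;

      // Backward: the same weights scatter an incoming gradient g_out back
      // onto the 4 corners (this is what the atomic_add calls do per channel).
      float g_out = 1.0f;
      float g_img[in_h][in_w] = {};
      g_img[iy_nw][ix_nw] += nw * g_out;
      g_img[iy_nw][ix_se] += ne * g_out;
      g_img[iy_se][ix_nw] += sw * g_out;
      g_img[iy_se][ix_se] += se * g_out;

      std::printf("out = %f, weights sum to %f, g at se corner = %f\n",
                  out, nw + ne + sw + se, g_img[iy_se][ix_se]);
      return 0;
    }

The in_bounds guards around each gather/scatter are what implement "zeros" padding: a corner that falls outside the image simply contributes nothing.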
atomic_add(gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w, - sw * gOut); - atomic_add(gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w, - se * gOut); - - if (in_bounds(iy_nw, ix_nw, in_h, in_w)) { - T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW]; - gix -= nw_val * (iy_se - iy) * gOut; - giy -= nw_val * (ix_se - ix) * gOut; - } - if (in_bounds(iy_ne, ix_ne, in_h, in_w)) { - T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW]; - gix += ne_val * (iy_sw - iy) * gOut; - giy -= ne_val * (ix - ix_sw) * gOut; - } - if (in_bounds(iy_sw, ix_sw, in_h, in_w)) { - T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW]; - gix -= sw_val * (iy - iy_ne) * gOut; - giy += sw_val * (ix_ne - ix) * gOut; - } - if (in_bounds(iy_se, ix_se, in_h, in_w)) { - T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW]; - gix += se_val * (iy - iy_nw) * gOut; - giy += se_val * (ix - ix_nw) * gOut; - } - } - - if (grad_grid != nullptr) { - T* gGrid_ptr_NHW = grad_grid + index * grid_sW; - gGrid_ptr_NHW[0] = gix_mult * gix; - gGrid_ptr_NHW[1] = giy_mult * giy; - } - } else if (mode == Mode::nearest) { - int ix_nearest = static_cast(std::nearbyint(ix)); - int iy_nearest = static_cast(std::nearbyint(iy)); - - int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; - T* gInp_ptr_NC = grad_input + n * inp_sN; - for (int c = 0; c < out_c; - ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { - atomic_add(gInp_ptr_NC, iy_nearest, ix_nearest, inp_sH, inp_sW, in_h, - in_w, grad_output[gOut_offset]); - } - - if (grad_grid != nullptr) { - T* gGrid_ptr_NHW = grad_grid + index * grid_sW; - gGrid_ptr_NHW[0] = static_cast(0); - gGrid_ptr_NHW[1] = static_cast(0); - } - } - } -} - -template -class GridSampleGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.cuda_device_context(); - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode_s = ctx.Attr("padding_mode"); - auto mode_s = ctx.Attr("mode"); - - PaddingMode padding_mode; - Mode mode; - if (padding_mode_s == "border") { - padding_mode = PaddingMode::border; - } else if (padding_mode_s == "reflection") { - padding_mode = PaddingMode::reflect; - } else { - padding_mode = PaddingMode::zeros; - } - - if (mode_s == "nearest") { - mode = Mode::nearest; - } else { - mode = Mode::bilinear; - } - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - auto* output_grad = ctx.Input(framework::GradVarName("Output")); - - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - auto* input_grad = ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), - input_grad, static_cast(0)); - - T* grid_grad_data = nullptr; - if (ctx.HasOutput(framework::GradVarName("Grid"))) { - auto* grid_grad = ctx.Output(framework::GradVarName("Grid")); - grid_grad_data = grid_grad->mutable_data(ctx.GetPlace()); - } - - int count = static_cast(n * out_h * out_w); - auto cu_stream = dev_ctx.stream(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(dev_ctx, count); - grid_sampler_cuda_backward_kernel< - T><<>>( - count, output_grad->data(), input->data(), grid->data(), n, c, - out_h, out_w, in_h, in_w, input_grad->data(), grid_grad_data, mode, - padding_mode, 
align_corners); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(grid_sampler, ops::GridSampleOpCUDAKernel, - ops::GridSampleOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(grid_sampler_grad, - ops::GridSampleGradOpCUDAKernel, - ops::GridSampleGradOpCUDAKernel); diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h deleted file mode 100644 index 93e96694270..00000000000 --- a/paddle/fluid/operators/grid_sampler_op.h +++ /dev/null @@ -1,600 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -enum class Mode { - bilinear, - nearest, -}; - -enum class PaddingMode { zeros, border, reflect }; - -using Tensor = framework::Tensor; -template -using EigenTensor = framework::EigenTensor; - -using Array3 = Eigen::DSizes; -using Array4 = Eigen::DSizes; - -template -static inline bool isInBound(T x, T y, T x_max, T y_max) { - if (x < 0 || x > x_max || y < 0 || y > y_max) { - return false; - } - return true; -} - -template -static inline void unnormalize(const platform::CPUDeviceContext& ctx, - Tensor* grid_slice, - const int max_val, // height-1 or width-1 - bool align_corners) { - auto& place = *ctx.eigen_device(); - auto grid_slice_t = EigenTensor::From(*grid_slice); - - if (!align_corners) { - auto factor = static_cast((max_val + 1) * 0.5); - grid_slice_t.device(place) = - (grid_slice_t + static_cast(1)) * factor - static_cast(0.5); - } else { - auto factor = static_cast(max_val * 0.5); - grid_slice_t.device(place) = (grid_slice_t + static_cast(1)) * factor; - } -} - -template -static inline void clip(const platform::CPUDeviceContext& ctx, - Tensor* grid_slice, - const int max_val, // height-1 or width-1 - bool align_corners, std::string padding_mode) { - auto& place = *ctx.eigen_device(); - auto grid_slice_t = EigenTensor::From(*grid_slice); - if (padding_mode == "border") { - grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - } else if (padding_mode == "reflection") { - if (align_corners) { - auto double_range = static_cast(max_val * 2); - auto grid_abs = grid_slice_t.abs(); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); - if (max_val == 0) { - grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); - } - } else { - auto double_range = static_cast((max_val + 1) * 2); - auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - grid_slice_t.device(place) = - extra.cwiseMin(double_range - extra) - 
static_cast(0.5); - grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - } - } -} - -template -static inline void clipWithMask(const platform::CPUDeviceContext& ctx, - const int max_val, // height-1 or width-1 - bool align_corners, std::string padding_mode, - Tensor* grid_slice, Tensor* grid_scale) { - auto& place = *ctx.eigen_device(); - grid_scale->mutable_data(grid_slice->dims(), ctx.GetPlace()); - - auto grid_slice_t = EigenTensor::From(*grid_slice); - auto factor = static_cast(max_val * 0.5); - if (!align_corners) { - factor = static_cast((max_val + 1) * 0.5); - } - auto grid_scale_t = EigenTensor::From(*grid_scale).setConstant(factor); - - if (padding_mode == "border") { - // auto bounded_lo = grid_slice_t.cwiseMax(static_cast(0)); - auto res = grid_slice_t.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - - auto in_bound = (res == grid_slice_t); - grid_scale_t.device(place) = grid_scale_t * in_bound.template cast(); - grid_slice_t.device(place) = res; - } else if (padding_mode == "reflection") { - if (align_corners) { - auto double_range = static_cast(max_val * 2); - auto is_neg = (grid_slice_t < static_cast(0)); - auto grid_abs = grid_slice_t.abs(); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - auto one_more_flip = (extra > (double_range - extra)); - grid_scale_t.device(place) = - grid_scale_t * ((is_neg == one_more_flip).template cast() - - (is_neg != one_more_flip).template cast()); - grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); - if (max_val == 0) { - grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); - } - } else { - auto double_range = static_cast((max_val + 1) * 2); - auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); - auto is_neg = ((grid_slice_t + static_cast(0.5)) < static_cast(0)); - auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; - auto one_more_flip = (extra > (double_range - extra)); - auto reflected = - extra.cwiseMin(double_range - extra) - static_cast(0.5); - auto clipped = reflected.cwiseMax(static_cast(0)) - .cwiseMin(static_cast(max_val)); - auto in_bound = (clipped == reflected).template cast(); - grid_scale_t.device(place) = - grid_scale_t * ((is_neg == one_more_flip).template cast() - - (is_neg != one_more_flip).template cast()) * - in_bound; - grid_slice_t.device(place) = clipped; - } - } -} - -template -static void calcGridLocations(const platform::CPUDeviceContext& ctx, - const Tensor& grid, const int in_h, - const int in_w, bool align_corners, - std::string padding_mode, Tensor* grid_x, - Tensor* grid_y) { - const int n = grid.dims()[0]; - const int out_h = grid.dims()[1]; - const int out_w = grid.dims()[2]; - - // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim - T* grid_x_data = grid_x->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - T* grid_y_data = grid_y->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - const T* grid_data = grid.data(); - for (int i = 0; i < n * out_h * out_w; i++) { - grid_x_data[i] = grid_data[2 * i]; - grid_y_data[i] = grid_data[(2 * i) + 1]; - } - - unnormalize(ctx, grid_x, in_w - 1, align_corners); - unnormalize(ctx, grid_y, in_h - 1, align_corners); - - clip(ctx, grid_x, in_w - 1, align_corners, padding_mode); - clip(ctx, grid_y, in_h - 1, align_corners, padding_mode); -} - -template -static void calcGridLocationsWithGrad(const platform::CPUDeviceContext& ctx, - const Tensor& grid, const int in_h, - const int in_w, bool align_corners, - 
std::string padding_mode, Tensor* grid_x, - Tensor* grid_y, Tensor* grid_x_scale, - Tensor* grid_y_scale) { - const int n = grid.dims()[0]; - const int out_h = grid.dims()[1]; - const int out_w = grid.dims()[2]; - - // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim - T* grid_x_data = grid_x->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - T* grid_y_data = grid_y->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - - const T* grid_data = grid.data(); - for (int i = 0; i < n * out_h * out_w; i++) { - grid_x_data[i] = grid_data[2 * i]; - grid_y_data[i] = grid_data[(2 * i) + 1]; - } - - unnormalize(ctx, grid_x, in_w - 1, align_corners); - unnormalize(ctx, grid_y, in_h - 1, align_corners); - - clipWithMask(ctx, in_w - 1, align_corners, padding_mode, grid_x, - grid_x_scale); - clipWithMask(ctx, in_h - 1, align_corners, padding_mode, grid_y, - grid_y_scale); -} - -template -static void getGridPointValue(const Tensor& input, Tensor* output, - const Tensor& x, const Tensor& y) { - const int n = input.dims()[0]; - const int c = input.dims()[1]; - const int in_h = input.dims()[2]; - const int in_w = input.dims()[3]; - const int out_h = x.dims()[1]; - const int out_w = x.dims()[2]; - auto x_t = EigenTensor::From(x); - auto y_t = EigenTensor::From(y); - auto output_t = EigenTensor::From(*output).setConstant((T)0); - auto input_t = EigenTensor::From(input); - - for (int i = 0; i < n; i++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), - (T)(in_h - 1))) { - for (int j = 0; j < c; j++) { - output_t(i, j, k, l) = - input_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))); - } - } - } - } - } -} - -template -static void allNeigbors(const platform::CPUDeviceContext& ctx, - const Tensor& input, Tensor* grid_x, Tensor* grid_y, - Tensor* x_w, Tensor* x_e, Tensor* y_n, - Tensor* y_s, // positions - Tensor* d_w, Tensor* d_e, Tensor* d_n, - Tensor* d_s, // distance - Tensor* v_wn, Tensor* v_en, Tensor* v_ws, - Tensor* v_es) { // values - auto& place = *ctx.eigen_device(); - - const int c = input.dims()[1]; - const int n = grid_x->dims()[0]; - const int out_h = grid_x->dims()[1]; - const int out_w = grid_x->dims()[2]; - // calculate coords of 4 corner points - x_w->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - x_e->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - y_n->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - y_s->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - auto x_w_t = EigenTensor::From(*x_w); - auto x_e_t = EigenTensor::From(*x_e); - auto y_n_t = EigenTensor::From(*y_n); - auto y_s_t = EigenTensor::From(*y_s); - - auto grid_x_t = EigenTensor::From(*grid_x); - auto grid_y_t = EigenTensor::From(*grid_y); - - x_w_t.device(place) = grid_x_t.floor(); - x_e_t.device(place) = x_w_t + static_cast(1); - y_n_t.device(place) = grid_y_t.floor(); - y_s_t.device(place) = y_n_t + static_cast(1); - - // calculate distances to 4 sides - d_w->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - d_e->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - d_n->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - d_s->mutable_data({n, out_h, out_w}, ctx.GetPlace()); - auto d_w_t = EigenTensor::From(*d_w); - auto d_e_t = EigenTensor::From(*d_e); - auto d_n_t = EigenTensor::From(*d_n); - auto d_s_t = EigenTensor::From(*d_s); - d_w_t.device(place) = grid_x_t - x_w_t; - d_e_t.device(place) = x_e_t - grid_x_t; - d_n_t.device(place) = grid_y_t - y_n_t; - d_s_t.device(place) = y_s_t - grid_y_t; - - 
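
For "reflection" padding, the clip/clipWithMask helpers above fold an out-of-range coordinate back into [0, max_val] by mirroring it with period 2*max_val when align_corners is set, or with period 2*(max_val+1) and a half-pixel shift otherwise. A small standalone sketch of the same arithmetic for one coordinate, with made-up inputs:

    #include <cmath>
    #include <cstdio>

    float reflect(float x, int max_val, bool align_corners) {
      if (align_corners) {
        if (max_val == 0) return 0.f;
        float range = 2.f * max_val;  // period of the reflection
        float extra = std::fabs(x) - std::floor(std::fabs(x) / range) * range;
        return std::fmin(extra, range - extra);
      }
      // align_corners = false: pixel edges (not centers) are the mirror lines.
      float range = 2.f * (max_val + 1);
      float ax = std::fabs(x + 0.5f);
      float extra = ax - std::floor(ax / range) * range;
      float r = std::fmin(extra, range - extra) - 0.5f;
      return std::fmin(std::fmax(r, 0.f), static_cast<float>(max_val));
    }

    int main() {
      // With a width of 5 (max_val = 4), 5.5 reflects to 2.5 and -1.0 to 1.0.
      std::printf("%.2f %.2f\n", reflect(5.5f, 4, true), reflect(-1.0f, 4, true));
      return 0;
    }

clipWithMask computes the same folded coordinate but additionally records, per element, the sign of the mapping's slope (plus a zero where the value was clamped), which is what grid_x_scale/grid_y_scale later multiply into the grid gradient.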
// calc 4 corner points value - v_wn->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - v_en->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - v_ws->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - v_es->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - getGridPointValue(input, v_wn, *x_w, *y_n); - getGridPointValue(input, v_en, *x_e, *y_n); - getGridPointValue(input, v_ws, *x_w, *y_s); - getGridPointValue(input, v_es, *x_e, *y_s); -} - -template -static void bilinearInter(const platform::CPUDeviceContext& ctx, - const Tensor& input, Tensor* grid_x, Tensor* grid_y, - Tensor* out) { - auto& place = *ctx.eigen_device(); - const int n = grid_x->dims()[0]; - const int out_h = grid_x->dims()[1]; - const int out_w = grid_x->dims()[2]; - const int c = input.dims()[1]; - - Tensor x_w, x_e, y_n, y_s; - Tensor d_w, d_e, d_n, d_s; - Tensor v_wn, v_en, v_ws, v_es; - - allNeigbors(ctx, input, grid_x, grid_y, &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, - &d_n, &d_s, &v_wn, &v_en, &v_ws, &v_es); - - auto d_w_t = EigenTensor::From(d_w); - auto d_e_t = EigenTensor::From(d_e); - auto d_n_t = EigenTensor::From(d_n); - auto d_s_t = EigenTensor::From(d_s); - - auto d_w_scaled_t = - d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto d_e_scaled_t = - d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto d_n_scaled_t = - d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto d_s_scaled_t = - d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); - auto v_wn_t = EigenTensor::From(v_wn); - auto v_en_t = EigenTensor::From(v_en); - auto v_ws_t = EigenTensor::From(v_ws); - auto v_es_t = EigenTensor::From(v_es); - auto output_t = EigenTensor::From(*out); - // bilinear interpolaetion by 4 corner points - output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t + - v_en_t * d_w_scaled_t * d_s_scaled_t + - v_ws_t * d_e_scaled_t * d_n_scaled_t + - v_es_t * d_w_scaled_t * d_n_scaled_t; -} - -template -static void nearestInter(const platform::CPUDeviceContext& ctx, - const Tensor& input, Tensor* grid_x, Tensor* grid_y, - Tensor* out) { - auto& place = *ctx.eigen_device(); - - auto grid_x_t = EigenTensor::From(*grid_x); - auto grid_y_t = EigenTensor::From(*grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - getGridPointValue(input, out, *grid_x, *grid_y); -} - -template -static void gatherOutputGradToInputGrad(const Tensor& output_grad, - Tensor* input_grad, const Tensor& x, - const Tensor& y, const Tensor& d1, - const Tensor& d2) { - const int n = output_grad.dims()[0]; - const int c = output_grad.dims()[1]; - const int out_h = output_grad.dims()[2]; - const int out_w = output_grad.dims()[3]; - const int in_h = input_grad->dims()[2]; - const int in_w = input_grad->dims()[3]; - auto x_t = EigenTensor::From(x); - auto y_t = EigenTensor::From(y); - auto d1_t = EigenTensor::From(d1); - auto d2_t = EigenTensor::From(d2); - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - - for (int i = 0; i < n; i++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), - (T)(in_h - 1))) { - for (int j = 0; j < c; j++) { - input_grad_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))) += - output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l); - } - } - } - } - } -} - -template -static void gatherOutputGradToInputGrad(const Tensor& output_grad, - 
Tensor* input_grad, const Tensor& x, - const Tensor& y) { - const int n = output_grad.dims()[0]; - const int c = output_grad.dims()[1]; - const int out_h = output_grad.dims()[2]; - const int out_w = output_grad.dims()[3]; - const int in_h = input_grad->dims()[2]; - const int in_w = input_grad->dims()[3]; - auto x_t = EigenTensor::From(x); - auto y_t = EigenTensor::From(y); - auto input_grad_t = EigenTensor::From(*input_grad); - auto output_grad_t = EigenTensor::From(output_grad); - for (int i = 0; i < n; i++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - if (isInBound(x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), - (T)(in_h - 1))) { - for (int j = 0; j < c; j++) { - input_grad_t(i, j, static_cast(round(y_t(i, k, l))), - static_cast(round(x_t(i, k, l)))) += - output_grad_t(i, j, k, l); - } - } - } - } - } -} - -template -static void gatherBilinearGrad(const platform::CPUDeviceContext& ctx, - const Tensor& input, const Tensor& output_grad, - Tensor* grid_x, Tensor* grid_y, - Tensor* grid_x_scale, Tensor* grid_y_scale, - Tensor* input_grad, Tensor* grid_grad) { - const int n = grid_x->dims()[0]; - const int out_h = grid_x->dims()[1]; - const int out_w = grid_x->dims()[2]; - const int c = input.dims()[1]; - - Tensor x_w, x_e, y_n, y_s; - Tensor d_w, d_e, d_n, d_s; - Tensor v_wn, v_en, v_ws, v_es; - - allNeigbors(ctx, input, - grid_x, // grid_x - grid_y, // grid_y - &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s, &v_wn, &v_en, - &v_ws, &v_es); - - // gather output grad value to input grad by corner point coords and weight - gatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_n, d_e, d_s); - gatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_s, d_e, d_n); - gatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_n, d_w, d_s); - gatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_s, d_w, d_n); - - auto v_wn_t = EigenTensor::From(v_wn); - auto v_en_t = EigenTensor::From(v_en); - auto v_ws_t = EigenTensor::From(v_ws); - auto v_es_t = EigenTensor::From(v_es); - - auto d_w_t = EigenTensor::From(d_w); - auto d_e_t = EigenTensor::From(d_e); - auto d_n_t = EigenTensor::From(d_n); - auto d_s_t = EigenTensor::From(d_s); - - auto output_grad_t = EigenTensor::From(output_grad); - - if (grid_grad != nullptr) { - Tensor grid_grad_x, grid_grad_y; - grid_grad_x.mutable_data({n, out_h, out_w}, ctx.GetPlace()); - grid_grad_y.mutable_data({n, out_h, out_w}, ctx.GetPlace()); - auto grid_grad_x_t = - EigenTensor::From(grid_grad_x).setConstant(static_cast(0.0)); - auto grid_grad_y_t = - EigenTensor::From(grid_grad_y).setConstant(static_cast(0.0)); - for (int i = 0; i < n; i++) { - for (int j = 0; j < c; j++) { - for (int k = 0; k < out_h; k++) { - for (int l = 0; l < out_w; l++) { - grid_grad_x_t(i, k, l) += - ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) + - (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) * - output_grad_t(i, j, k, l); - grid_grad_y_t(i, k, l) += - ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) + - (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) * - output_grad_t(i, j, k, l); - } - } - } - } - - // const T x_max = static_cast(in_w - 1); - // const T y_max = static_cast(in_h - 1); - - auto grid_x_scale_t = EigenTensor::From(*grid_x_scale); - auto grid_y_scale_t = EigenTensor::From(*grid_y_scale); - grid_grad_x_t = grid_grad_x_t * grid_x_scale_t; - grid_grad_y_t = grid_grad_y_t * grid_y_scale_t; - - // gather grid_grad [x, y] in 3rd Dim - T* grid_grad_data = grid_grad->data(); - T* 
grid_grad_x_data = grid_grad_x.data(); - T* grid_grad_y_data = grid_grad_y.data(); - for (int i = 0; i < n * out_h * out_w; i++) { - grid_grad_data[2 * i] = grid_grad_x_data[i]; - grid_grad_data[2 * i + 1] = grid_grad_y_data[i]; - } - } -} - -template -class GridSampleOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode = ctx.Attr("padding_mode"); - auto mode = ctx.Attr("mode"); - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - auto* output = ctx.Output("Output"); - output->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), output, - static_cast(0)); - - Tensor grid_x, grid_y; - calcGridLocations( - ctx.template device_context(), *grid, in_h, - in_w, align_corners, padding_mode, &grid_x, &grid_y); - if (mode == "bilinear") { - bilinearInter( - ctx.template device_context(), *input, - &grid_x, &grid_y, output); - } else if (mode == "nearest") { - auto grid_x_t = EigenTensor::From(grid_x); - auto grid_y_t = EigenTensor::From(grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - getGridPointValue(*input, output, grid_x, grid_y); - } - } -}; - -template -class GridSampleGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto align_corners = ctx.Attr("align_corners"); - auto padding_mode = ctx.Attr("padding_mode"); - auto mode = ctx.Attr("mode"); - - auto* input = ctx.Input("X"); - auto* grid = ctx.Input("Grid"); - auto* output_grad = ctx.Input(framework::GradVarName("Output")); - - const int n = grid->dims()[0]; - const int out_h = grid->dims()[1]; - const int out_w = grid->dims()[2]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - auto* input_grad = ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data({n, c, in_h, in_w}, ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), input_grad, - static_cast(0)); - - Tensor* grid_grad = nullptr; - if (ctx.HasOutput(framework::GradVarName("Grid"))) { - grid_grad = ctx.Output(framework::GradVarName("Grid")); - grid_grad->mutable_data({n, out_h, out_w, 2}, ctx.GetPlace()); - phi::funcs::SetConstant()( - ctx.template device_context(), grid_grad, - static_cast(0)); - } - - Tensor grid_x, grid_y; - Tensor grid_x_scale, grid_y_scale; - calcGridLocationsWithGrad( - ctx.template device_context(), *grid, in_h, - in_w, align_corners, padding_mode, &grid_x, &grid_y, &grid_x_scale, - &grid_y_scale); - if (mode == "bilinear") { - gatherBilinearGrad(ctx.template device_context(), - *input, *output_grad, &grid_x, &grid_y, - &grid_x_scale, &grid_y_scale, input_grad, - grid_grad); - } else { - auto grid_x_t = EigenTensor::From(grid_x); - auto grid_y_t = EigenTensor::From(grid_y); - grid_x_t = grid_x_t.round(); - grid_y_t = grid_y_t.round(); - gatherOutputGradToInputGrad(*output_grad, input_grad, grid_x, grid_y); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc new file mode 100644 index 00000000000..923cb842411 
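
The grid_grad_x/grid_grad_y accumulation in gatherBilinearGrad above (and the gix/giy terms in the deleted CUDA backward kernel) is the derivative of the 4-corner weighted sum with respect to the unnormalized sampling coordinates; the per-pixel scale from unnormalize/clip is applied afterwards through grid_x_scale/grid_y_scale. A standalone check of those expressions against a central finite difference, on one made-up cell (not code from this patch):

    #include <cstdio>

    // Bilinear value inside a unit cell with corner values v_wn (top-left),
    // v_en (top-right), v_ws (bottom-left), v_es (bottom-right); (x, y) are
    // measured from the top-left corner, both in [0, 1].
    static float bilinear(float v_wn, float v_en, float v_ws, float v_es,
                          float x, float y) {
      float d_w = x, d_e = 1.f - x, d_n = y, d_s = 1.f - y;
      return v_wn * d_e * d_s + v_en * d_w * d_s + v_ws * d_e * d_n +
             v_es * d_w * d_n;
    }

    int main() {
      float v_wn = 0.2f, v_en = 1.0f, v_ws = -0.5f, v_es = 2.0f;
      float x = 0.3f, y = 0.6f;
      float d_w = x, d_e = 1.f - x, d_n = y, d_s = 1.f - y;

      // Same expressions as the grid_grad_x / grid_grad_y accumulation above.
      float gx = (v_en - v_wn) * d_s + (v_es - v_ws) * d_n;
      float gy = (v_ws - v_wn) * d_e + (v_es - v_en) * d_w;

      const float eps = 1e-3f;
      float gx_fd = (bilinear(v_wn, v_en, v_ws, v_es, x + eps, y) -
                     bilinear(v_wn, v_en, v_ws, v_es, x - eps, y)) / (2 * eps);
      float gy_fd = (bilinear(v_wn, v_en, v_ws, v_es, x, y + eps) -
                     bilinear(v_wn, v_en, v_ws, v_es, x, y - eps)) / (2 * eps);

      std::printf("analytic (%f, %f) vs finite diff (%f, %f)\n",
                  gx, gy, gx_fd, gy_fd);
      return 0;
    }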
--- /dev/null +++ b/paddle/phi/kernels/cpu/grid_sample_grad_kernel.cc @@ -0,0 +1,357 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/grid_sample_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/grid_sample_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +static inline void ClipWithMask(const CPUContext& ctx, + const int max_val, // height-1 or width-1 + bool align_corners, + std::string padding_mode, + DenseTensor* grid_slice, + DenseTensor* grid_scale) { + auto& place = *ctx.eigen_device(); + grid_scale->Resize(grid_slice->dims()); + ctx.Alloc(grid_scale); + + auto grid_slice_t = EigenTensor::From(*grid_slice); + auto factor = static_cast(max_val * 0.5); + if (!align_corners) { + factor = static_cast((max_val + 1) * 0.5); + } + auto grid_scale_t = EigenTensor::From(*grid_scale).setConstant(factor); + + if (padding_mode == "border") { + // auto bounded_lo = grid_slice_t.cwiseMax(static_cast(0)); + auto res = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + + auto in_bound = (res == grid_slice_t); + grid_scale_t.device(place) = grid_scale_t * in_bound.template cast(); + grid_slice_t.device(place) = res; + } else if (padding_mode == "reflection") { + if (align_corners) { + auto double_range = static_cast(max_val * 2); + auto is_neg = (grid_slice_t < static_cast(0)); + auto grid_abs = grid_slice_t.abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + auto one_more_flip = (extra > (double_range - extra)); + grid_scale_t.device(place) = + grid_scale_t * ((is_neg == one_more_flip).template cast() - + (is_neg != one_more_flip).template cast()); + grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); + if (max_val == 0) { + grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); + } + } else { + auto double_range = static_cast((max_val + 1) * 2); + auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); + auto is_neg = ((grid_slice_t + static_cast(0.5)) < static_cast(0)); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + auto one_more_flip = (extra > (double_range - extra)); + auto reflected = + extra.cwiseMin(double_range - extra) - static_cast(0.5); + auto clipped = reflected.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + auto in_bound = (clipped == reflected).template cast(); + grid_scale_t.device(place) = + grid_scale_t * ((is_neg == one_more_flip).template cast() - + (is_neg != one_more_flip).template cast()) * + in_bound; + grid_slice_t.device(place) = clipped; + } + } +} + +template +static void CalcGridLocationsWithGrad(const CPUContext& ctx, + const DenseTensor& grid, + const int in_h, + const int in_w, + bool align_corners, + std::string padding_mode, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* 
grid_x_scale, + DenseTensor* grid_y_scale) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + + // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim + grid_x->Resize({n, out_h, out_w}); + grid_y->Resize({n, out_h, out_w}); + T* grid_x_data = ctx.Alloc(grid_x); + T* grid_y_data = ctx.Alloc(grid_y); + + const T* grid_data = grid.data(); + for (int i = 0; i < n * out_h * out_w; i++) { + grid_x_data[i] = grid_data[2 * i]; + grid_y_data[i] = grid_data[(2 * i) + 1]; + } + + Unnormalize(ctx, grid_x, in_w - 1, align_corners); + Unnormalize(ctx, grid_y, in_h - 1, align_corners); + + ClipWithMask( + ctx, in_w - 1, align_corners, padding_mode, grid_x, grid_x_scale); + ClipWithMask( + ctx, in_h - 1, align_corners, padding_mode, grid_y, grid_y_scale); +} + +template +static void GatherOutputGradToInputGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& d1, + const DenseTensor& d2) { + const int n = output_grad.dims()[0]; + const int c = output_grad.dims()[1]; + const int out_h = output_grad.dims()[2]; + const int out_w = output_grad.dims()[3]; + const int in_h = input_grad->dims()[2]; + const int in_w = input_grad->dims()[3]; + auto x_t = EigenTensor::From(x); + auto y_t = EigenTensor::From(y); + auto d1_t = EigenTensor::From(d1); + auto d2_t = EigenTensor::From(d2); + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + + for (int i = 0; i < n; i++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (IsInBound( + x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) { + for (int j = 0; j < c; j++) { + input_grad_t(i, + j, + static_cast(round(y_t(i, k, l))), + static_cast(round(x_t(i, k, l)))) += + output_grad_t(i, j, k, l) * d1_t(i, k, l) * d2_t(i, k, l); + } + } + } + } + } +} + +template +static void GatherBilinearGrad(const CPUContext& ctx, + const DenseTensor& input, + const DenseTensor& output_grad, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* grid_x_scale, + DenseTensor* grid_y_scale, + DenseTensor* input_grad, + DenseTensor* grid_grad) { + const int n = grid_x->dims()[0]; + const int out_h = grid_x->dims()[1]; + const int out_w = grid_x->dims()[2]; + const int c = input.dims()[1]; + + DenseTensor x_w, x_e, y_n, y_s; + DenseTensor d_w, d_e, d_n, d_s; + DenseTensor v_wn, v_en, v_ws, v_es; + + AllNeigbors(ctx, + input, + grid_x, // grid_x + grid_y, // grid_y + &x_w, + &x_e, + &y_n, + &y_s, + &d_w, + &d_e, + &d_n, + &d_s, + &v_wn, + &v_en, + &v_ws, + &v_es); + + // gather output grad value to input grad by corner point coords and weight + GatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_n, d_e, d_s); + GatherOutputGradToInputGrad(output_grad, input_grad, x_w, y_s, d_e, d_n); + GatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_n, d_w, d_s); + GatherOutputGradToInputGrad(output_grad, input_grad, x_e, y_s, d_w, d_n); + + auto v_wn_t = EigenTensor::From(v_wn); + auto v_en_t = EigenTensor::From(v_en); + auto v_ws_t = EigenTensor::From(v_ws); + auto v_es_t = EigenTensor::From(v_es); + + auto d_w_t = EigenTensor::From(d_w); + auto d_e_t = EigenTensor::From(d_e); + auto d_n_t = EigenTensor::From(d_n); + auto d_s_t = EigenTensor::From(d_s); + + auto output_grad_t = EigenTensor::From(output_grad); + + if (grid_grad != nullptr) { + DenseTensor grid_grad_x, grid_grad_y; + grid_grad_x.Resize({n, out_h, out_w}); + grid_grad_y.Resize({n, 
out_h, out_w}); + ctx.Alloc(&grid_grad_x); + ctx.Alloc(&grid_grad_y); + auto grid_grad_x_t = + EigenTensor::From(grid_grad_x).setConstant(static_cast(0.0)); + auto grid_grad_y_t = + EigenTensor::From(grid_grad_y).setConstant(static_cast(0.0)); + for (int i = 0; i < n; i++) { + for (int j = 0; j < c; j++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + grid_grad_x_t(i, k, l) += + ((v_en_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_s_t(i, k, l) + + (v_es_t(i, j, k, l) - v_ws_t(i, j, k, l)) * d_n_t(i, k, l)) * + output_grad_t(i, j, k, l); + grid_grad_y_t(i, k, l) += + ((v_ws_t(i, j, k, l) - v_wn_t(i, j, k, l)) * d_e_t(i, k, l) + + (v_es_t(i, j, k, l) - v_en_t(i, j, k, l)) * d_w_t(i, k, l)) * + output_grad_t(i, j, k, l); + } + } + } + } + + // const T x_max = static_cast(in_w - 1); + // const T y_max = static_cast(in_h - 1); + + auto grid_x_scale_t = EigenTensor::From(*grid_x_scale); + auto grid_y_scale_t = EigenTensor::From(*grid_y_scale); + grid_grad_x_t = grid_grad_x_t * grid_x_scale_t; + grid_grad_y_t = grid_grad_y_t * grid_y_scale_t; + + // gather grid_grad [x, y] in 3rd Dim + T* grid_grad_data = grid_grad->data(); + T* grid_grad_x_data = grid_grad_x.data(); + T* grid_grad_y_data = grid_grad_y.data(); + for (int i = 0; i < n * out_h * out_w; i++) { + grid_grad_data[2 * i] = grid_grad_x_data[i]; + grid_grad_data[2 * i + 1] = grid_grad_y_data[i]; + } + } +} + +template +static void GatherOutputGradToInputGrad(const DenseTensor& output_grad, + DenseTensor* input_grad, + const DenseTensor& x, + const DenseTensor& y) { + const int n = output_grad.dims()[0]; + const int c = output_grad.dims()[1]; + const int out_h = output_grad.dims()[2]; + const int out_w = output_grad.dims()[3]; + const int in_h = input_grad->dims()[2]; + const int in_w = input_grad->dims()[3]; + auto x_t = EigenTensor::From(x); + auto y_t = EigenTensor::From(y); + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + for (int i = 0; i < n; i++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (IsInBound( + x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) { + for (int j = 0; j < c; j++) { + input_grad_t(i, + j, + static_cast(round(y_t(i, k, l))), + static_cast(round(x_t(i, k, l)))) += + output_grad_t(i, j, k, l); + } + } + } + } + } +} + +template +void GridSampleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const DenseTensor& out_grid, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* x_grad, + DenseTensor* grid_grad) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + const int c = x.dims()[1]; + const int in_h = x.dims()[2]; + const int in_w = x.dims()[3]; + + x_grad->Resize({n, c, in_h, in_w}); + dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + if (grid_grad != nullptr) { + grid_grad->Resize({n, out_h, out_w, 2}); + dev_ctx.template Alloc(grid_grad); + phi::funcs::SetConstant()( + dev_ctx, grid_grad, static_cast(0)); + } + + DenseTensor grid_x, grid_y; + DenseTensor grid_x_scale, grid_y_scale; + CalcGridLocationsWithGrad(dev_ctx, + grid, + in_h, + in_w, + align_corners, + padding_mode, + &grid_x, + &grid_y, + &grid_x_scale, + &grid_y_scale); + if (mode == "bilinear") { + GatherBilinearGrad(dev_ctx, + x, + out_grid, + &grid_x, + &grid_y, + &grid_x_scale, + &grid_y_scale, + x_grad, + grid_grad); + } else { + auto 
grid_x_t = EigenTensor::From(grid_x); + auto grid_y_t = EigenTensor::From(grid_y); + grid_x_t = grid_x_t.round(); + grid_y_t = grid_y_t.round(); + GatherOutputGradToInputGrad(out_grid, x_grad, grid_x, grid_y); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(grid_sample_grad, + CPU, + ALL_LAYOUT, + phi::GridSampleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/grid_sample_kernel.cc b/paddle/phi/kernels/cpu/grid_sample_kernel.cc new file mode 100644 index 00000000000..92a528cdda9 --- /dev/null +++ b/paddle/phi/kernels/cpu/grid_sample_kernel.cc @@ -0,0 +1,184 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/grid_sample_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/grid_sample_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +using Array4 = Eigen::DSizes; + +template +static inline void Clip(const CPUContext& ctx, + DenseTensor* grid_slice, + const int max_val, // height-1 or width-1 + bool align_corners, + std::string padding_mode) { + auto& place = *ctx.eigen_device(); + auto grid_slice_t = EigenTensor::From(*grid_slice); + if (padding_mode == "border") { + grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + } else if (padding_mode == "reflection") { + if (align_corners) { + auto double_range = static_cast(max_val * 2); + auto grid_abs = grid_slice_t.abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); + if (max_val == 0) { + grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); + } + } else { + auto double_range = static_cast((max_val + 1) * 2); + auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); + auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; + grid_slice_t.device(place) = + extra.cwiseMin(double_range - extra) - static_cast(0.5); + grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) + .cwiseMin(static_cast(max_val)); + } + } +} + +template +static void CalcGridLocations(const CPUContext& ctx, + const DenseTensor& grid, + const int in_h, + const int in_w, + bool align_corners, + std::string padding_mode, + DenseTensor* grid_x, + DenseTensor* grid_y) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + + // split grid with shape (n, h, w, 2) into (x, y) by the 3rd Dim + grid_x->Resize({n, out_h, out_w}); + grid_y->Resize({n, out_h, out_w}); + T* grid_x_data = ctx.Alloc(grid_x); + T* grid_y_data = ctx.Alloc(grid_y); + const T* grid_data = grid.data(); + for (int i = 0; i < n * out_h * out_w; i++) { + grid_x_data[i] = grid_data[2 * i]; + grid_y_data[i] = grid_data[(2 * i) + 1]; + } + + Unnormalize(ctx, grid_x, in_w - 1, align_corners); + Unnormalize(ctx, grid_y, in_h 
- 1, align_corners); + + Clip(ctx, grid_x, in_w - 1, align_corners, padding_mode); + Clip(ctx, grid_y, in_h - 1, align_corners, padding_mode); +} + +template +static void BilinearInter(const CPUContext& ctx, + const DenseTensor& input, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* out) { + auto& place = *ctx.eigen_device(); + const int n = grid_x->dims()[0]; + const int out_h = grid_x->dims()[1]; + const int out_w = grid_x->dims()[2]; + const int c = input.dims()[1]; + + DenseTensor x_w, x_e, y_n, y_s; + DenseTensor d_w, d_e, d_n, d_s; + DenseTensor v_wn, v_en, v_ws, v_es; + + AllNeigbors(ctx, + input, + grid_x, + grid_y, + &x_w, + &x_e, + &y_n, + &y_s, + &d_w, + &d_e, + &d_n, + &d_s, + &v_wn, + &v_en, + &v_ws, + &v_es); + + auto d_w_t = EigenTensor::From(d_w); + auto d_e_t = EigenTensor::From(d_e); + auto d_n_t = EigenTensor::From(d_n); + auto d_s_t = EigenTensor::From(d_s); + + auto d_w_scaled_t = + d_w_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto d_e_scaled_t = + d_e_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto d_n_scaled_t = + d_n_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto d_s_scaled_t = + d_s_t.reshape(Array4(n, 1, out_h, out_w)).broadcast(Array4(1, c, 1, 1)); + auto v_wn_t = EigenTensor::From(v_wn); + auto v_en_t = EigenTensor::From(v_en); + auto v_ws_t = EigenTensor::From(v_ws); + auto v_es_t = EigenTensor::From(v_es); + auto output_t = EigenTensor::From(*out); + // bilinear interpolaetion by 4 corner points + output_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t + + v_en_t * d_w_scaled_t * d_s_scaled_t + + v_ws_t * d_e_scaled_t * d_n_scaled_t + + v_es_t * d_w_scaled_t * d_n_scaled_t; +} + +template +void GridSampleKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* out) { + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + const int c = x.dims()[1]; + const int in_h = x.dims()[2]; + const int in_w = x.dims()[3]; + + out->Resize(phi::make_ddim({n, c, out_h, out_w})); + dev_ctx.template Alloc(out); + phi::funcs::SetConstant()(dev_ctx, out, static_cast(0)); + + DenseTensor grid_x, grid_y; + CalcGridLocations( + dev_ctx, grid, in_h, in_w, align_corners, padding_mode, &grid_x, &grid_y); + + if (mode == "bilinear") { + BilinearInter(dev_ctx, x, &grid_x, &grid_y, out); + } else if (mode == "nearest") { + auto grid_x_t = EigenTensor::From(grid_x); + auto grid_y_t = EigenTensor::From(grid_y); + grid_x_t = grid_x_t.round(); + grid_y_t = grid_y_t.round(); + GetGridPointValue(x, out, grid_x, grid_y); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + grid_sample, CPU, ALL_LAYOUT, phi::GridSampleKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/grid_sample_utils.h b/paddle/phi/kernels/cpu/grid_sample_utils.h new file mode 100644 index 00000000000..53a16446d7e --- /dev/null +++ b/paddle/phi/kernels/cpu/grid_sample_utils.h @@ -0,0 +1,160 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void Unnormalize(const CPUContext& ctx, + DenseTensor* grid_slice, + const int max_val, // height-1 or width-1 + bool align_corners) { + auto& place = *ctx.eigen_device(); + auto grid_slice_t = EigenTensor::From(*grid_slice); + + if (!align_corners) { + auto factor = static_cast((max_val + 1) * 0.5); + grid_slice_t.device(place) = + (grid_slice_t + static_cast(1)) * factor - static_cast(0.5); + } else { + auto factor = static_cast(max_val * 0.5); + grid_slice_t.device(place) = (grid_slice_t + static_cast(1)) * factor; + } +} + +template +inline bool IsInBound(T x, T y, T x_max, T y_max) { + if (x < 0 || x > x_max || y < 0 || y > y_max) { + return false; + } + return true; +} + +template +void GetGridPointValue(const DenseTensor& input, + DenseTensor* output, + const DenseTensor& x, + const DenseTensor& y) { + const int n = input.dims()[0]; + const int c = input.dims()[1]; + const int in_h = input.dims()[2]; + const int in_w = input.dims()[3]; + const int out_h = x.dims()[1]; + const int out_w = x.dims()[2]; + auto x_t = EigenTensor::From(x); + auto y_t = EigenTensor::From(y); + auto output_t = EigenTensor::From(*output).setConstant((T)0); + auto input_t = EigenTensor::From(input); + + for (int i = 0; i < n; i++) { + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + if (IsInBound( + x_t(i, k, l), y_t(i, k, l), (T)(in_w - 1), (T)(in_h - 1))) { + for (int j = 0; j < c; j++) { + output_t(i, j, k, l) = + input_t(i, + j, + static_cast(round(y_t(i, k, l))), + static_cast(round(x_t(i, k, l)))); + } + } + } + } + } +} + +template +void AllNeigbors(const CPUContext& ctx, + const DenseTensor& input, + DenseTensor* grid_x, + DenseTensor* grid_y, + DenseTensor* x_w, + DenseTensor* x_e, + DenseTensor* y_n, + DenseTensor* y_s, // positions + DenseTensor* d_w, + DenseTensor* d_e, + DenseTensor* d_n, + DenseTensor* d_s, // distance + DenseTensor* v_wn, + DenseTensor* v_en, + DenseTensor* v_ws, + DenseTensor* v_es) { // values + auto& place = *ctx.eigen_device(); + + const int c = input.dims()[1]; + const int n = grid_x->dims()[0]; + const int out_h = grid_x->dims()[1]; + const int out_w = grid_x->dims()[2]; + // calculate coords of 4 corner points + x_w->Resize({n, out_h, out_w}); + x_e->Resize({n, out_h, out_w}); + y_n->Resize({n, out_h, out_w}); + y_s->Resize({n, out_h, out_w}); + ctx.Alloc(x_w); + ctx.Alloc(x_e); + ctx.Alloc(y_n); + ctx.Alloc(y_s); + auto x_w_t = EigenTensor::From(*x_w); + auto x_e_t = EigenTensor::From(*x_e); + auto y_n_t = EigenTensor::From(*y_n); + auto y_s_t = EigenTensor::From(*y_s); + + auto grid_x_t = EigenTensor::From(*grid_x); + auto grid_y_t = EigenTensor::From(*grid_y); + + x_w_t.device(place) = grid_x_t.floor(); + x_e_t.device(place) = x_w_t + static_cast(1); + y_n_t.device(place) = grid_y_t.floor(); + y_s_t.device(place) = y_n_t + static_cast(1); + + // calculate distances to 4 sides + d_w->Resize({n, out_h, out_w}); + d_e->Resize({n, out_h, out_w}); + d_n->Resize({n, out_h, out_w}); + 
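
Unnormalize above maps grid values from [-1, 1] onto pixel indices; the two align_corners conventions differ in whether -1/+1 land on the centers of the border pixels or on their outer edges. A small sketch of both mappings for a made-up width-5 input (max_val = 4):

    #include <cstdio>

    float unnormalize(float coord, int max_val, bool align_corners) {
      if (align_corners) {
        // -1 -> 0, +1 -> max_val: corner pixel centers line up exactly.
        return (coord + 1.f) * 0.5f * max_val;
      }
      // -1 -> -0.5, +1 -> max_val + 0.5: the outer edges of the border
      // pixels line up instead.
      return (coord + 1.f) * 0.5f * (max_val + 1) - 0.5f;
    }

    int main() {
      const float coords[] = {-1.f, 0.f, 1.f};
      for (float c : coords) {
        std::printf("coord % .1f -> %5.2f (align_corners) / %5.2f (not)\n",
                    c, unnormalize(c, 4, true), unnormalize(c, 4, false));
      }
      return 0;
    }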
d_s->Resize({n, out_h, out_w}); + ctx.Alloc(d_w); + ctx.Alloc(d_e); + ctx.Alloc(d_n); + ctx.Alloc(d_s); + auto d_w_t = EigenTensor::From(*d_w); + auto d_e_t = EigenTensor::From(*d_e); + auto d_n_t = EigenTensor::From(*d_n); + auto d_s_t = EigenTensor::From(*d_s); + d_w_t.device(place) = grid_x_t - x_w_t; + d_e_t.device(place) = x_e_t - grid_x_t; + d_n_t.device(place) = grid_y_t - y_n_t; + d_s_t.device(place) = y_s_t - grid_y_t; + + // calc 4 corner points value + v_wn->Resize({n, c, out_h, out_w}); + v_en->Resize({n, c, out_h, out_w}); + v_ws->Resize({n, c, out_h, out_w}); + v_es->Resize({n, c, out_h, out_w}); + ctx.Alloc(v_wn); + ctx.Alloc(v_en); + ctx.Alloc(v_ws); + ctx.Alloc(v_es); + GetGridPointValue(input, v_wn, *x_w, *y_n); + GetGridPointValue(input, v_en, *x_e, *y_n); + GetGridPointValue(input, v_ws, *x_w, *y_s); + GetGridPointValue(input, v_es, *x_e, *y_s); +} + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu new file mode 100644 index 00000000000..457a348be83 --- /dev/null +++ b/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu @@ -0,0 +1,324 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
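
The GPU kernels in this patch address contiguous NCHW buffers through precomputed strides (inp_sN, inp_sC, inp_sH, inp_sW and their gOut counterparts) rather than multi-dimensional indexing. A trivial standalone sketch of that offset arithmetic, with made-up shapes:

    #include <cstdio>

    int main() {
      const int N = 2, C = 3, H = 4, W = 5;
      const int sN = C * H * W, sC = H * W, sH = W, sW = 1;

      // Offset of element (n, c, h, w) in a contiguous NCHW buffer.
      int n = 1, c = 2, h = 3, w = 4;
      int offset = n * sN + c * sC + h * sH + w * sW;

      std::printf("offset(%d,%d,%d,%d) = %d of %d elements\n",
                  n, c, h, w, offset, N * sN);
      return 0;
    }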
+ +#include "paddle/phi/kernels/grid_sample_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/grid_sample_utils.h" + +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" + +namespace phi { + +template +static __forceinline__ __device__ void AtomicAdd( + T* data, int h, int w, int sH, int sW, int H, int W, T delta) { + if (InBounds(h, w, H, W)) { + paddle::platform::CudaAtomicAdd(data + h * sH + w * sW, delta); + } +} + +template +static __forceinline__ __device__ T +UnnormalizeWithMask(T coord, int size, bool align_corners, T* grad_in) { + if (align_corners) { + *grad_in = static_cast(size - 1) / 2; + return ((coord + 1.f) / 2) * (size - 1); + } else { + *grad_in = static_cast(size) / 2; + return ((coord + 1.f) * size - 1) / 2; + } +} + +template +static __forceinline__ __device__ T ClipIndexesWithMask(T in, + int clip_limit, + T* grad_in) { + if (in <= static_cast(0)) { + *grad_in = static_cast(0); + return static_cast(0); + } else { + T max = static_cast(clip_limit - 1); + if (in >= max) { + *grad_in = static_cast(0); + return max; + } else { + *grad_in = static_cast(1); + return in; + } + } +} + +template +static __forceinline__ __device__ T +ReflectIndexesWithMask(T in, int twice_low, int twice_high, T* grad_in) { + if (twice_low == twice_high) { + *grad_in = static_cast(0); + return static_cast(0); + } + int grad_in_mult_; + T min = static_cast(twice_low) / 2; + T span = static_cast(twice_high - twice_low) / 2; + in = in - min; + if (in < static_cast(0)) { + grad_in_mult_ = -1; + in = -in; + } else { + grad_in_mult_ = 1; + } + T extra = fmod(in, span); + int flips = static_cast(floor(in / span)); + if (flips % 2 == 0) { + *grad_in = static_cast(grad_in_mult_); + return extra + min; + } else { + *grad_in = static_cast(-grad_in_mult_); + return span - extra + min; + } +} + +template +static __forceinline__ __device__ T +ComputePositionsWithMask(T coord, + int size, + PaddingMode padding_mode, + bool align_corners, + T* grad_in) { + T grad_clip, grad_refl; + coord = UnnormalizeWithMask(coord, size, align_corners, grad_in); + if (padding_mode == PaddingMode::border) { + coord = ClipIndexesWithMask(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_clip; + } else if (padding_mode == PaddingMode::reflect) { + if (align_corners) { + coord = ReflectIndexesWithMask(coord, 0, 2 * (size - 1), &grad_refl); + } else { + coord = ReflectIndexesWithMask(coord, -1, 2 * size - 1, &grad_refl); + } + coord = ClipIndexesWithMask(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_refl * grad_clip; + } + + return coord; +} + +template +__global__ void GridSamplerCudaBackwardKernel(const int nthreads, + const T* grad_output, + const T* input, + const T* grid, + int n, + int out_c, + int out_h, + int out_w, + int in_h, + int in_w, + T* grad_input, + T* grad_grid, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + int inp_sN = out_c * in_h * in_w; + int inp_sC = in_h * in_w; + int inp_sH = in_w; + int inp_sW = 1; + int grid_sN = out_h * out_w * 2; + int grid_sH = out_w * 2; + int grid_sW = 2; + int grid_sCoor = 1; + + int gOut_sN = out_c * out_h * out_w; + int gOut_sC = out_h * out_w; + int gOut_sH = out_w; + int gOut_sW = 1; + + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % out_w; + 
const int h = (index / out_w) % out_h; + const int n = index / (out_h * out_w); + const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + + T gix_mult, giy_mult; + ix = ComputePositionsWithMask( + ix, in_w, padding_mode, align_corners, &gix_mult); + iy = ComputePositionsWithMask( + iy, in_h, padding_mode, align_corners, &giy_mult); + + if (mode == Mode::bilinear) { + int ix_nw = static_cast(floor(ix)); + int iy_nw = static_cast(floor(iy)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + T nw = (ix_se - ix) * (iy_se - iy); + T ne = (ix - ix_sw) * (iy_sw - iy); + T sw = (ix_ne - ix) * (iy - iy_ne); + T se = (ix - ix_nw) * (iy - iy_nw); + + T gix = static_cast(0), giy = static_cast(0); + int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + int inp_offset_NC = n * inp_sN; + for (int c = 0; c < out_c; ++c, + inp_offset_NC += inp_sC, + gInp_ptr_NC += inp_sC, + gOut_offset += gOut_sC) { + T gOut = grad_output[gOut_offset]; + + AtomicAdd( + gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w, nw * gOut); + AtomicAdd( + gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w, ne * gOut); + AtomicAdd( + gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w, sw * gOut); + AtomicAdd( + gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w, se * gOut); + + if (InBounds(iy_nw, ix_nw, in_h, in_w)) { + T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW]; + gix -= nw_val * (iy_se - iy) * gOut; + giy -= nw_val * (ix_se - ix) * gOut; + } + if (InBounds(iy_ne, ix_ne, in_h, in_w)) { + T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW]; + gix += ne_val * (iy_sw - iy) * gOut; + giy -= ne_val * (ix - ix_sw) * gOut; + } + if (InBounds(iy_sw, ix_sw, in_h, in_w)) { + T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW]; + gix -= sw_val * (iy - iy_ne) * gOut; + giy += sw_val * (ix_ne - ix) * gOut; + } + if (InBounds(iy_se, ix_se, in_h, in_w)) { + T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW]; + gix += se_val * (iy - iy_nw) * gOut; + giy += se_val * (ix - ix_nw) * gOut; + } + } + + if (grad_grid != nullptr) { + T* gGrid_ptr_NHW = grad_grid + index * grid_sW; + gGrid_ptr_NHW[0] = gix_mult * gix; + gGrid_ptr_NHW[1] = giy_mult * giy; + } + } else if (mode == Mode::nearest) { + int ix_nearest = static_cast(std::nearbyint(ix)); + int iy_nearest = static_cast(std::nearbyint(iy)); + + int gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + for (int c = 0; c < out_c; + ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { + AtomicAdd(gInp_ptr_NC, + iy_nearest, + ix_nearest, + inp_sH, + inp_sW, + in_h, + in_w, + grad_output[gOut_offset]); + } + + if (grad_grid != nullptr) { + T* gGrid_ptr_NHW = grad_grid + index * grid_sW; + gGrid_ptr_NHW[0] = static_cast(0); + gGrid_ptr_NHW[1] = static_cast(0); + } + } + } +} + +template +void GridSampleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* x_grad, + DenseTensor* grid_grad) { + PaddingMode enum_padding_mode; + Mode enum_mode; + if (padding_mode == "border") { + enum_padding_mode = PaddingMode::border; + } else if (padding_mode == "reflection") { + enum_padding_mode = 
PaddingMode::reflect; + } else { + enum_padding_mode = PaddingMode::zeros; + } + + if (mode == "nearest") { + enum_mode = Mode::nearest; + } else { + enum_mode = Mode::bilinear; + } + + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + const int c = x.dims()[1]; + const int in_h = x.dims()[2]; + const int in_w = x.dims()[3]; + + dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + T* grid_grad_data = nullptr; + if (grid_grad != nullptr) { + grid_grad_data = dev_ctx.template Alloc(grid_grad); + } + + int count = static_cast(n * out_h * out_w); + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + GridSamplerCudaBackwardKernel< + T><<>>( + count, + out_grad.data(), + x.data(), + grid.data(), + n, + c, + out_h, + out_w, + in_h, + in_w, + x_grad->data(), + grid_grad_data, + enum_mode, + enum_padding_mode, + align_corners); +} + +} // namespace phi + +PD_REGISTER_KERNEL(grid_sample_grad, + GPU, + ALL_LAYOUT, + phi::GridSampleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/grid_sample_kernel.cu b/paddle/phi/kernels/gpu/grid_sample_kernel.cu new file mode 100644 index 00000000000..f611b46911c --- /dev/null +++ b/paddle/phi/kernels/gpu/grid_sample_kernel.cu @@ -0,0 +1,233 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
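Both the backward kernel registered above and the forward kernel that follows rely on the same coordinate mapping: the grid holds values in [-1, 1], and Unnormalize / UnnormalizeWithMask convert them to pixel space before padding is applied and the four bilinear neighbour weights are formed. The snippet below is a minimal host-side illustration of that arithmetic only; it is plain C++, not part of this patch, and UnnormalizeRef is an illustrative name rather than a Paddle symbol.

#include <cmath>
#include <cstdio>

// Re-statement of the unnormalize rule used by Unnormalize/UnnormalizeWithMask:
//   align_corners == true : x = (coord + 1) / 2 * (size - 1)
//   align_corners == false: x = ((coord + 1) * size - 1) / 2
static float UnnormalizeRef(float coord, int size, bool align_corners) {
  return align_corners ? (coord + 1.f) / 2.f * (size - 1)
                       : ((coord + 1.f) * size - 1.f) / 2.f;
}

int main() {
  // A grid value of 0 lands on the centre column of a width-5 image.
  float ix = UnnormalizeRef(0.f, /*size=*/5, /*align_corners=*/true);    // 2.0
  float iy = UnnormalizeRef(0.25f, /*size=*/5, /*align_corners=*/true);  // 2.5
  // Two of the four bilinear weights, exactly as built in the kernels:
  // nw = (ix_se - ix) * (iy_se - iy), se = (ix - ix_nw) * (iy - iy_nw).
  int ix_nw = static_cast<int>(std::floor(ix));
  int iy_nw = static_cast<int>(std::floor(iy));
  float nw = (ix_nw + 1 - ix) * (iy_nw + 1 - iy);
  float se = (ix - ix_nw) * (iy - iy_nw);
  std::printf("ix=%.2f iy=%.2f nw=%.2f se=%.2f\n", ix, iy, nw, se);
  return 0;
}

In the backward kernel the same weights multiply the incoming gradient, and the mask multipliers returned through grad_in zero out the grid gradient wherever clipping or reflection made the mapping locally constant.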
+ +#include "paddle/phi/kernels/grid_sample_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/grid_sample_utils.h" + +namespace phi { + +template +static __forceinline__ __device__ T Unnormalize(T coord, + int size, + bool align_corners) { + if (align_corners) { + return ((coord + 1.f) / 2) * (size - 1); + } else { + return ((coord + 1.f) * size - 1) / 2; + } +} + +template +static __forceinline__ __device__ T ClipIndexes(T in, int max_value) { + return min(static_cast(max_value), max(in, static_cast(0))); +} + +template +static __forceinline__ __device__ T ReflectIndexes(T in, + int twice_low, + int twice_high) { + if (twice_low == twice_high) { + return static_cast(0); + } + T min = static_cast(twice_low) / 2; + T span = static_cast(twice_high - twice_low) / 2; + in = fabs(in - min); + T extra = fmod(in, span); + int flips = static_cast(floor(in / span)); + if (flips % 2 == 0) { + return extra + min; + } else { + return span - extra + min; + } +} + +template +static __forceinline__ __device__ T ComputePositions(T coord, + int size, + PaddingMode padding_mode, + bool align_corners) { + coord = Unnormalize(coord, size, align_corners); + if (padding_mode == PaddingMode::border) { + coord = ClipIndexes(coord, size - 1); + } else if (padding_mode == PaddingMode::reflect) { + if (align_corners) { + coord = ReflectIndexes(coord, 0, 2 * (size - 1)); + } else { + coord = ReflectIndexes(coord, -1, 2 * size - 1); + } + coord = ClipIndexes(coord, size - 1); + } + return coord; +} + +template +__global__ void GridSampleCudaKernel(const int nthreads, + int n, + int out_c, + int out_h, + int out_w, + int in_h, + int in_w, + const T* input, + const T* grid, + T* output, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + int inp_sN = out_c * in_h * in_w; + + int inp_sC = in_h * in_w; + int inp_sH = in_w; + int inp_sW = 1; + int grid_sN = out_h * out_w * 2; + int grid_sH = out_w * 2; + int grid_sW = 2; + int grid_sCoor = 1; + int out_sN = out_c * out_h * out_w; + int out_sC = out_h * out_w; + int out_sH = out_w; + int out_sW = 1; + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % out_w; + const int h = (index / out_w) % out_h; + const int n = index / (out_h * out_w); + const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + + ix = ComputePositions(ix, in_w, padding_mode, align_corners); + iy = ComputePositions(iy, in_h, padding_mode, align_corners); + if (mode == Mode::bilinear) { + int ix_nw = static_cast(floor(ix)); + int iy_nw = static_cast(floor(iy)); + int ix_ne = ix_nw + 1; + int iy_ne = iy_nw; + int ix_sw = ix_nw; + int iy_sw = iy_nw + 1; + int ix_se = ix_nw + 1; + int iy_se = iy_nw + 1; + + T nw = (ix_se - ix) * (iy_se - iy); + T ne = (ix - ix_sw) * (iy_sw - iy); + T sw = (ix_ne - ix) * (iy - iy_ne); + T se = (ix - ix_nw) * (iy - iy_nw); + + auto inp_offset_NC = n * inp_sN; + + auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; + for (int c = 0; c < out_c; + ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { + *out_ptr_NCHW = static_cast(0); + if (InBounds(iy_nw, ix_nw, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw; + } + if (InBounds(iy_ne, ix_ne, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne; + } + if 
(InBounds(iy_sw, ix_sw, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw; + } + if (InBounds(iy_se, ix_se, in_h, in_w)) { + *out_ptr_NCHW += + input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se; + } + } + } else if (mode == Mode::nearest) { + int ix_nearest = static_cast(std::nearbyint(ix)); + int iy_nearest = static_cast(std::nearbyint(iy)); + auto inp_offset_NC = n * inp_sN; + auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; + for (int c = 0; c < out_c; + ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { + if (InBounds(iy_nearest, ix_nearest, in_h, in_w)) { + *out_ptr_NCHW = + input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW]; + } else { + *out_ptr_NCHW = static_cast(0); + } + } + } + } +} + +template +void GridSampleKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* out) { + PaddingMode enum_padding_mode; + Mode enum_mode; + if (padding_mode == "border") { + enum_padding_mode = PaddingMode::border; + } else if (padding_mode == "reflection") { + enum_padding_mode = PaddingMode::reflect; + } else { + enum_padding_mode = PaddingMode::zeros; + } + + if (mode == "nearest") { + enum_mode = Mode::nearest; + } else { + enum_mode = Mode::bilinear; + } + + const int n = grid.dims()[0]; + const int out_h = grid.dims()[1]; + const int out_w = grid.dims()[2]; + const int c = x.dims()[1]; + const int in_h = x.dims()[2]; + const int in_w = x.dims()[3]; + VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h + << "; out_w: " << out_w; + + auto* output_data = dev_ctx.template Alloc(out); + VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; " + << out->dims()[2] << "; " << out->dims()[3]; + + int count = static_cast(n * out_h * out_w); + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + GridSampleCudaKernel< + T><<>>( + count, + n, + c, + out_h, + out_w, + in_h, + in_w, + x.data(), + grid.data(), + output_data, + enum_mode, + enum_padding_mode, + align_corners); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + grid_sample, GPU, ALL_LAYOUT, phi::GridSampleKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/grid_sample_utils.h b/paddle/phi/kernels/gpu/grid_sample_utils.h new file mode 100644 index 00000000000..098eb9defb5 --- /dev/null +++ b/paddle/phi/kernels/gpu/grid_sample_utils.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
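Reflection padding in both the forward and backward kernels folds an out-of-range coordinate back across the image border; the twice_low / twice_high arguments carry the doubled interval so one routine serves both the align_corners cases. A standalone sketch of that rule, assuming the same fmod/floor semantics as the device code (ReflectRef is an illustrative name, not a Paddle symbol):

#include <cmath>
#include <cstdio>

// Mirrors ReflectIndexes: fold `in` back into [twice_low / 2, twice_high / 2].
static float ReflectRef(float in, int twice_low, int twice_high) {
  if (twice_low == twice_high) return 0.f;
  float low = twice_low / 2.f;
  float span = (twice_high - twice_low) / 2.f;
  in = std::fabs(in - low);
  float extra = std::fmod(in, span);
  int flips = static_cast<int>(std::floor(in / span));
  return (flips % 2 == 0) ? extra + low : span - extra + low;
}

int main() {
  // align_corners == true, size == 5: valid coordinates are [0, 4],
  // passed in doubled as twice_low = 0, twice_high = 8.
  std::printf("%.1f\n", ReflectRef(5.5f, 0, 8));   // 2.5 -- bounced off the right edge
  std::printf("%.1f\n", ReflectRef(-1.0f, 0, 8));  // 1.0 -- bounced off the left edge
  return 0;
}

An even number of flips leaves the coordinate on the original orientation, an odd number mirrors it, which is also why the backward variant flips the sign of its gradient multiplier.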
+ +#pragma once + +namespace phi { + +enum class Mode { + bilinear, + nearest, +}; + +enum class PaddingMode { zeros, border, reflect }; + +static __forceinline__ __device__ bool InBounds(int h, int w, int H, int W) { + return h >= 0 && h < H && w >= 0 && w < W; +} + +} // namespace phi diff --git a/paddle/phi/kernels/grid_sample_grad_kernel.h b/paddle/phi/kernels/grid_sample_grad_kernel.h new file mode 100644 index 00000000000..50a8d5be260 --- /dev/null +++ b/paddle/phi/kernels/grid_sample_grad_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GridSampleGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &grid, + const DenseTensor &out_grid, + const std::string &mode, + const std::string &padding_mode, + bool align_corners, + DenseTensor *x_grad, + DenseTensor *grid_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/grid_sample_kernel.h b/paddle/phi/kernels/grid_sample_kernel.h new file mode 100644 index 00000000000..2e1e9b50864 --- /dev/null +++ b/paddle/phi/kernels/grid_sample_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GridSampleKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &grid, + const std::string &mode, + const std::string &padding_mode, + bool align_corners, + DenseTensor *out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/grid_sampler_sig.cc b/paddle/phi/ops/compat/grid_sampler_sig.cc new file mode 100644 index 00000000000..b76a9770d4d --- /dev/null +++ b/paddle/phi/ops/compat/grid_sampler_sig.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GridSamplerOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("grid_sample", + {"X", "Grid"}, + {"mode", "padding_mode", "align_corners"}, + {"Output"}); +} + +KernelSignature GridSamplerGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("grid_sample_grad", + {"X", "Grid", GradVarName("Output")}, + {"mode", "padding_mode", "align_corners"}, + {GradVarName("X"), GradVarName("Grid")}); +} + +} // namespace phi + +// use Python API name as kernel name +PD_REGISTER_BASE_KERNEL_NAME(grid_sampler, grid_sample); +PD_REGISTER_BASE_KERNEL_NAME(grid_sampler_grad, grid_sample_grad); + +PD_REGISTER_ARG_MAPPING_FN(grid_sampler, phi::GridSamplerOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(grid_sampler_grad, + phi::GridSamplerGradOpArgumentMapping); -- GitLab From c76377005be429ad12e42e26cce39fbc16229521 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 16 Mar 2022 11:39:57 +0800 Subject: [PATCH 092/176] move isclose infershape (#40595) --- paddle/fluid/operators/isclose_op.cc | 41 ++++------------------------ paddle/phi/infermeta/binary.cc | 10 +++++++ paddle/phi/infermeta/binary.h | 5 ++++ 3 files changed, 21 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/operators/isclose_op.cc b/paddle/fluid/operators/isclose_op.cc index 8668de4d3a6..1c79213757f 100644 --- a/paddle/fluid/operators/isclose_op.cc +++ b/paddle/fluid/operators/isclose_op.cc @@ -14,10 +14,13 @@ #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -60,40 +63,6 @@ class IscloseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Isclose"); - OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Isclose"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Isclose"); - - auto input_dim = ctx->GetInputDim("Input"); - auto other_dim = ctx->GetInputDim("Other"); - PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(), - platform::errors::PreconditionNotMet( - "Input(Input) and Input(Other) must have the same " - "dimension size.")); - int n = input_dim.size(); - bool is_runtime = ctx->IsRuntime(); - for (int i = 0; i < n; i++) { - if (is_runtime) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } else { - if (!(input_dim[i] < 0 || other_dim[i] < 0)) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } - } - } - - ctx->SetOutputDim("Out", input_dim); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -115,8 +84,10 @@ class IscloseOpVarTypeInference : public framework::VarTypeInference { 
namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(isclose, IscloseInferShapeFunctor, + PD_INFER_META(phi::ValueCompareInferMeta)); REGISTER_OPERATOR( isclose, ops::IscloseOp, ops::IscloseOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, - ops::IscloseOpVarTypeInference); + ops::IscloseOpVarTypeInference, IscloseInferShapeFunctor); diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index ffb1ed54502..d103bef2d9e 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -862,6 +862,16 @@ void TriangularSolveInferMeta(const MetaTensor& x, out->share_lod(y); } +void ValueCompareInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config) { + detail::BinarySameInputDimsCheck(x, y, config); + + out->set_dims(x.dims()); + out->set_dtype(DataType::BOOL); +} + } // namespace phi PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta); diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index d852db7a846..5d93bae3162 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -142,4 +142,9 @@ void TriangularSolveInferMeta(const MetaTensor& x, bool unitriangular, MetaTensor* out); +void ValueCompareInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config = MetaConfig()); + } // namespace phi -- GitLab From 6d205516e1b8cb0aca0e5382f28d82041c122d7d Mon Sep 17 00:00:00 2001 From: xiongkun Date: Wed, 16 Mar 2022 11:55:40 +0800 Subject: [PATCH 093/176] tranfer cumprod and kldiv_loss infershape to phi (#40575) --- paddle/fluid/operators/cumprod_op.cc | 15 ++++---- paddle/fluid/operators/kldiv_loss_op.cc | 46 ++++--------------------- paddle/phi/infermeta/binary.cc | 45 ++++++++++++++++++++++++ paddle/phi/infermeta/binary.h | 6 ++++ 4 files changed, 64 insertions(+), 48 deletions(-) diff --git a/paddle/fluid/operators/cumprod_op.cc b/paddle/fluid/operators/cumprod_op.cc index 90910bbbb20..889cdac8f68 100644 --- a/paddle/fluid/operators/cumprod_op.cc +++ b/paddle/fluid/operators/cumprod_op.cc @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -21,14 +23,6 @@ namespace operators { class CumprodOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Cumprod"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Cumprod"); - - ctx->ShareDim("X", "Out"); - ctx->ShareLoD("X", "Out"); - } }; class CumprodOpMaker : public framework::OpProtoAndCheckerMaker { @@ -82,9 +76,12 @@ class CumprodGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(cumprod, CumprodInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(cumprod, ops::CumprodOp, ops::CumprodOpMaker, ops::CumprodGradOpMaker, - ops::CumprodGradOpMaker); + ops::CumprodGradOpMaker, + CumprodInferShapeFunctor); REGISTER_OPERATOR(cumprod_grad, ops::CumprodGradOp); diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc index dcd98054b05..67c1942ea0b 100644 --- a/paddle/fluid/operators/kldiv_loss_op.cc +++ b/paddle/fluid/operators/kldiv_loss_op.cc @@ -11,7 +11,9 @@ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,44 +23,6 @@ using framework::Tensor; class KLDivLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "KLDivLoss"); - OP_INOUT_CHECK(ctx->HasInput("Target"), "Input", "Target", "KLDivLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss", "KLDivLoss"); - - auto dim_x = ctx->GetInputDim("X"); - auto dim_target = ctx->GetInputDim("Target"); - PADDLE_ENFORCE_EQ(dim_x.size(), dim_target.size(), - platform::errors::InvalidArgument( - "Input(X) rank and Input(Target) rank should be " - "same, but received X rank(%d) != Target rank(%d)", - dim_x.size(), dim_target.size())); - for (int i = 0; i < dim_x.size(); i++) { - if (ctx->IsRuntime() || (dim_x[i] > 0 && dim_target[i] > 0)) { - PADDLE_ENFORCE_EQ( - dim_x[i], dim_target[i], - platform::errors::InvalidArgument( - "Input(X) and Input(Target) should in same shape. 
but received " - "X dimension[%d](%d) != Target dimension[%d](%d)", - i, dim_x[i], i, dim_target[i])); - } - } - - auto reduction = ctx->Attrs().Get("reduction"); - - auto reduction_valid = "mean" == reduction || "sum" == reduction || - "batchmean" == reduction || "none" == reduction; - PADDLE_ENFORCE_EQ( - reduction_valid, true, - platform::errors::InvalidArgument( - "Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'.")); - - if ("none" == reduction) { - ctx->SetOutputDim("Loss", dim_x); - } else { - ctx->SetOutputDim("Loss", {1}); - } - } protected: framework::OpKernelType GetExpectedKernelType( @@ -171,8 +135,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(KLDivLossGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(kldiv_loss, KLDivInferShapeFunctor, + PD_INFER_META(phi::KLDivInferMeta)); + REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker, ops::KLDivLossOpGradMaker, - ops::KLDivLossOpGradMaker); + ops::KLDivLossOpGradMaker, + KLDivInferShapeFunctor); REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad, ops::KLDivLossGradNoNeedBufferVarInferer); diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index d103bef2d9e..4c1d169615b 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -73,6 +73,51 @@ void AllValueCompareInferMeta(const MetaTensor& x, out->set_dtype(DataType::BOOL); } +void KLDivInferMeta(const MetaTensor& x, + const MetaTensor& label, + const std::string& reduction, + MetaTensor* out, + MetaConfig config) { + auto dim_x = x.dims(); + auto dim_target = label.dims(); + PADDLE_ENFORCE_EQ(dim_x.size(), + dim_target.size(), + phi::errors::InvalidArgument( + "Input(X) rank and Input(Target) rank should be " + "same, but received X rank(%d) != Target rank(%d)", + dim_x.size(), + dim_target.size())); + for (int i = 0; i < dim_x.size(); i++) { + if (config.is_runtime || (dim_x[i] > 0 && dim_target[i] > 0)) { + PADDLE_ENFORCE_EQ( + dim_x[i], + dim_target[i], + phi::errors::InvalidArgument( + "Input(X) and Input(Target) should in same shape. 
but received " + "X dimension[%d](%d) != Target dimension[%d](%d)", + i, + dim_x[i], + i, + dim_target[i])); + } + } + + auto reduction_valid = "mean" == reduction || "sum" == reduction || + "batchmean" == reduction || "none" == reduction; + PADDLE_ENFORCE_EQ( + reduction_valid, + true, + phi::errors::InvalidArgument( + "Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'.")); + + if ("none" == reduction) { + out->set_dims(dim_x); + } else { + out->set_dims({1}); + } + out->set_dtype(x.dtype()); +} + void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { out->share_meta(x); } diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 5d93bae3162..40641ea4858 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -35,6 +35,12 @@ void AllValueCompareInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void KLDivInferMeta(const MetaTensor& x, + const MetaTensor& label, + const std::string& reduction, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); void BCELossInferMeta(const MetaTensor& input, -- GitLab From 84e17a31a88dbcf1c4adf1adcadf38b20fa8cbee Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Wed, 16 Mar 2022 13:15:05 +0800 Subject: [PATCH 094/176] [Phi]move reduce kernels into one file (#40584) * move reduce kernels into one file * rename reduce_prod to prod * move reduce sum/mean from math_kernel into reduce_kernel * rm comment --- paddle/phi/kernels/cpu/math_kernel.cc | 43 ----- .../phi/kernels/cpu/matrix_rank_tol_kernel.cc | 2 +- paddle/phi/kernels/cpu/reduce_all_kernel.cc | 37 ---- paddle/phi/kernels/cpu/reduce_any_kernel.cc | 37 ---- paddle/phi/kernels/cpu/reduce_kernel.cc | 145 +++++++++++++++ paddle/phi/kernels/cpu/reduce_max_kernel.cc | 39 ----- paddle/phi/kernels/cpu/reduce_min_kernel.cc | 39 ----- paddle/phi/kernels/cpu/reduce_prod_kernel.cc | 44 ----- paddle/phi/kernels/gpu/math_kernel.cu | 51 ------ .../phi/kernels/gpu/matrix_rank_tol_kernel.cu | 2 +- paddle/phi/kernels/gpu/reduce_all_kernel.cu | 36 ---- paddle/phi/kernels/gpu/reduce_any_kernel.cu | 36 ---- paddle/phi/kernels/gpu/reduce_kernel.cu | 158 +++++++++++++++++ paddle/phi/kernels/gpu/reduce_max_kernel.cu | 2 +- paddle/phi/kernels/gpu/reduce_min_kernel.cu | 37 ---- paddle/phi/kernels/gpu/reduce_prod_kernel.cu | 43 ----- paddle/phi/kernels/math_kernel.cc | 67 +------ paddle/phi/kernels/math_kernel.h | 60 ------- paddle/phi/kernels/reduce_all_kernel.cc | 37 ---- paddle/phi/kernels/reduce_all_kernel.h | 35 ---- paddle/phi/kernels/reduce_any_kernel.cc | 37 ---- paddle/phi/kernels/reduce_any_kernel.h | 35 ---- paddle/phi/kernels/reduce_kernel.cc | 165 ++++++++++++++++++ paddle/phi/kernels/reduce_kernel.h | 154 ++++++++++++++++ paddle/phi/kernels/reduce_max_kernel.cc | 39 ----- paddle/phi/kernels/reduce_max_kernel.h | 35 ---- paddle/phi/kernels/reduce_min_kernel.cc | 39 ----- paddle/phi/kernels/reduce_min_kernel.h | 35 ---- paddle/phi/ops/compat/reduce_sig.cc | 20 ++- paddle/phi/tests/kernels/test_mean_dev_api.cc | 2 +- paddle/phi/tests/kernels/test_sum_dev_api.cc | 2 +- 31 files changed, 642 insertions(+), 871 deletions(-) delete mode 100644 paddle/phi/kernels/cpu/reduce_all_kernel.cc delete mode 100644 paddle/phi/kernels/cpu/reduce_any_kernel.cc create mode 100644 paddle/phi/kernels/cpu/reduce_kernel.cc delete mode 100644 paddle/phi/kernels/cpu/reduce_max_kernel.cc delete mode 100644 
paddle/phi/kernels/cpu/reduce_min_kernel.cc delete mode 100644 paddle/phi/kernels/cpu/reduce_prod_kernel.cc delete mode 100644 paddle/phi/kernels/gpu/reduce_all_kernel.cu delete mode 100644 paddle/phi/kernels/gpu/reduce_any_kernel.cu create mode 100644 paddle/phi/kernels/gpu/reduce_kernel.cu delete mode 100644 paddle/phi/kernels/gpu/reduce_min_kernel.cu delete mode 100644 paddle/phi/kernels/gpu/reduce_prod_kernel.cu delete mode 100644 paddle/phi/kernels/reduce_all_kernel.cc delete mode 100644 paddle/phi/kernels/reduce_all_kernel.h delete mode 100644 paddle/phi/kernels/reduce_any_kernel.cc delete mode 100644 paddle/phi/kernels/reduce_any_kernel.h create mode 100644 paddle/phi/kernels/reduce_kernel.cc create mode 100644 paddle/phi/kernels/reduce_kernel.h delete mode 100644 paddle/phi/kernels/reduce_max_kernel.cc delete mode 100644 paddle/phi/kernels/reduce_max_kernel.h delete mode 100644 paddle/phi/kernels/reduce_min_kernel.cc delete mode 100644 paddle/phi/kernels/reduce_min_kernel.h diff --git a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/math_kernel.cc index 250f656926c..0047940fd17 100644 --- a/paddle/phi/kernels/cpu/math_kernel.cc +++ b/paddle/phi/kernels/cpu/math_kernel.cc @@ -19,10 +19,8 @@ #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/elementwise.h" -#include "paddle/phi/kernels/cpu/reduce.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/framework/eigen.h" @@ -55,30 +53,6 @@ namespace phi { } \ } -template -void MeanRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void SumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out) { - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - template void DivideRawKernel(const Context& dev_ctx, const DenseTensor& x, @@ -164,20 +138,3 @@ PD_REGISTER_KERNEL(multiply_raw, complex64, complex128, phi::dtype::bfloat16) {} -PD_REGISTER_KERNEL(sum_raw, - CPU, - ALL_LAYOUT, - phi::SumRawKernel, - bool, - float, - double, - phi::dtype::float16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} -PD_REGISTER_KERNEL( - mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {} diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc index 70b6316e104..636018ffa68 100644 --- a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -23,7 +23,7 @@ #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" #include "paddle/phi/kernels/math_kernel.h" -#include "paddle/phi/kernels/reduce_max_kernel.h" +#include "paddle/phi/kernels/reduce_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/reduce_all_kernel.cc b/paddle/phi/kernels/cpu/reduce_all_kernel.cc deleted file mode 100644 index 3e8e38ee444..00000000000 --- a/paddle/phi/kernels/cpu/reduce_all_kernel.cc +++ /dev/null 
@@ -1,37 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_all_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/reduce.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" - -namespace phi { - -template -void AllRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - phi::BoolReduceKernel( - dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL(all_raw, CPU, ALL_LAYOUT, phi::AllRawKernel, bool) {} diff --git a/paddle/phi/kernels/cpu/reduce_any_kernel.cc b/paddle/phi/kernels/cpu/reduce_any_kernel.cc deleted file mode 100644 index 4fd71f1d0b1..00000000000 --- a/paddle/phi/kernels/cpu/reduce_any_kernel.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_any_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/reduce.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" - -namespace phi { - -template -void AnyRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - phi::BoolReduceKernel( - dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL(any_raw, CPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} diff --git a/paddle/phi/kernels/cpu/reduce_kernel.cc b/paddle/phi/kernels/cpu/reduce_kernel.cc new file mode 100644 index 00000000000..bc99e2cb39a --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_kernel.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void MeanRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void SumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out) { + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void ProdRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void MinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void AllRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + phi::BoolReduceKernel( + dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void AnyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + phi::BoolReduceKernel( + dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(sum_raw, + CPU, + ALL_LAYOUT, + phi::SumRawKernel, + bool, + float, + double, + phi::dtype::float16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} +PD_REGISTER_KERNEL( + mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {} + +PD_REGISTER_KERNEL(prod_raw, + CPU, + ALL_LAYOUT, + phi::ProdRawKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL( + max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL( + min_raw, CPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL(all_raw, CPU, ALL_LAYOUT, phi::AllRawKernel, bool) {} +PD_REGISTER_KERNEL(any_raw, CPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} diff --git a/paddle/phi/kernels/cpu/reduce_max_kernel.cc b/paddle/phi/kernels/cpu/reduce_max_kernel.cc deleted file mode 100644 index f9ea0aa0faf..00000000000 --- a/paddle/phi/kernels/cpu/reduce_max_kernel.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_max_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/reduce.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" - -namespace phi { - -template -void MaxRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL( - max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_min_kernel.cc b/paddle/phi/kernels/cpu/reduce_min_kernel.cc deleted file mode 100644 index 0a241c81dbe..00000000000 --- a/paddle/phi/kernels/cpu/reduce_min_kernel.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_min_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/reduce.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" - -namespace phi { - -template -void MinRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL( - min_raw, CPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc deleted file mode 100644 index 9a9bf46e948..00000000000 --- a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_prod_kernel.h" - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/reduce.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" - -namespace phi { - -template -void ReduceProdKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL(reduce_prod, - CPU, - ALL_LAYOUT, - phi::ReduceProdKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu index af9d5574aa9..d33f2164682 100644 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ b/paddle/phi/kernels/gpu/math_kernel.cu @@ -56,30 +56,6 @@ namespace phi { * Kernels */ -template -void MeanRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void SumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out) { - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - // Create the definition of Add DEFINE_CUDA_ELEMENTWISE_OP(Add) // Create the definition of Subtract @@ -147,30 +123,3 @@ PD_REGISTER_KERNEL(multiply_raw, complex64, complex128, bfloat16) {} -PD_REGISTER_KERNEL(sum_raw, - GPU, - ALL_LAYOUT, - phi::SumRawKernel, - bool, - float, - double, - float16, - bfloat16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} - -PD_REGISTER_KERNEL(mean_raw, - GPU, - ALL_LAYOUT, - phi::MeanRawKernel, - float, - double, - bool, - float16, - int, - int64_t) {} diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu index 2009547fc8d..7796132ec07 100644 --- a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu +++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu @@ -28,7 +28,7 @@ #include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" #include "paddle/phi/kernels/math_kernel.h" -#include "paddle/phi/kernels/reduce_max_kernel.h" +#include "paddle/phi/kernels/reduce_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/reduce_all_kernel.cu b/paddle/phi/kernels/gpu/reduce_all_kernel.cu deleted file mode 100644 index 2963d3f206c..00000000000 --- a/paddle/phi/kernels/gpu/reduce_all_kernel.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/kernels/reduce_all_kernel.h" - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gpu/reduce.h" - -namespace phi { - -template -void AllRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL(all_raw, GPU, ALL_LAYOUT, phi::AllRawKernel, bool) {} diff --git a/paddle/phi/kernels/gpu/reduce_any_kernel.cu b/paddle/phi/kernels/gpu/reduce_any_kernel.cu deleted file mode 100644 index 39c8cbe442c..00000000000 --- a/paddle/phi/kernels/gpu/reduce_any_kernel.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_any_kernel.h" - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gpu/reduce.h" - -namespace phi { - -template -void AnyRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL(any_raw, GPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} diff --git a/paddle/phi/kernels/gpu/reduce_kernel.cu b/paddle/phi/kernels/gpu/reduce_kernel.cu new file mode 100644 index 00000000000..6cbe699e8e0 --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_kernel.cu @@ -0,0 +1,158 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/reduce_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +template +void MeanRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void SumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out) { + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void ProdRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void MinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void AllRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void AnyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL(sum_raw, + GPU, + ALL_LAYOUT, + phi::SumRawKernel, + bool, + float, + double, + float16, + bfloat16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} + +PD_REGISTER_KERNEL(mean_raw, + GPU, + ALL_LAYOUT, + phi::MeanRawKernel, + float, + double, + bool, + float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(prod_raw, + GPU, + ALL_LAYOUT, + phi::ProdRawKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL( + max_raw, GPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL( + min_raw, GPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL(all_raw, GPU, ALL_LAYOUT, phi::AllRawKernel, bool) {} + +PD_REGISTER_KERNEL(any_raw, GPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} diff --git a/paddle/phi/kernels/gpu/reduce_max_kernel.cu b/paddle/phi/kernels/gpu/reduce_max_kernel.cu index 98c3986c51d..ddbc08b06c8 100644 --- a/paddle/phi/kernels/gpu/reduce_max_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_max_kernel.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/reduce_max_kernel.h" +#include "paddle/phi/kernels/reduce_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/gpu/reduce.h" diff --git a/paddle/phi/kernels/gpu/reduce_min_kernel.cu b/paddle/phi/kernels/gpu/reduce_min_kernel.cu deleted file mode 100644 index ba37d54895d..00000000000 --- a/paddle/phi/kernels/gpu/reduce_min_kernel.cu +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_min_kernel.h" - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gpu/reduce.h" - -namespace phi { - -template -void MinRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL( - min_raw, GPU, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu deleted file mode 100644 index 278d4a6e5ab..00000000000 --- a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/kernels/reduce_prod_kernel.h" - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gpu/reduce.h" - -namespace phi { - -template -void ReduceProdKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - phi::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL(reduce_prod, - GPU, - ALL_LAYOUT, - phi::ReduceProdKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/math_kernel.cc index a5d3f51e544..5aad2375ebb 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/math_kernel.cc @@ -19,27 +19,6 @@ namespace phi { -template -void MeanKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - MeanRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -template -void SumKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - DataType out_dtype, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - SumRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out); -} - template void AddKernel(const Context& dev_ctx, const DenseTensor& x, @@ -81,25 +60,6 @@ void MultiplyKernel(const Context& dev_ctx, using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; -PD_REGISTER_KERNEL( - mean, CPU, ALL_LAYOUT, phi::MeanKernel, float, double, bool) {} - -PD_REGISTER_KERNEL(sum, - CPU, - ALL_LAYOUT, - phi::SumKernel, - bool, - float, - double, - phi::dtype::float16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} - PD_REGISTER_KERNEL(add, CPU, ALL_LAYOUT, @@ -147,32 +107,7 @@ PD_REGISTER_KERNEL(multiply, phi::dtype::bfloat16) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(mean, - GPU, - ALL_LAYOUT, - phi::MeanKernel, - float, - double, - bool, - int, - int64_t, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(sum, - GPU, - ALL_LAYOUT, - phi::SumKernel, - bool, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16, - int16_t, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} + PD_REGISTER_KERNEL(add, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/math_kernel.h b/paddle/phi/kernels/math_kernel.h index 7569cbcff08..ddc3a46e989 100644 --- a/paddle/phi/kernels/math_kernel.h +++ b/paddle/phi/kernels/math_kernel.h @@ -16,43 +16,8 @@ limitations under the License. 
*/ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/infermeta/binary.h" -#include "paddle/phi/infermeta/unary.h" -#include "paddle/phi/kernels/empty_kernel.h" - namespace phi { -template -void MeanRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); - -template -void MeanKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); - -template -void SumRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out); - -template -void SumKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - DataType out_dtype, - bool keep_dim, - DenseTensor* out); - template void AddRawKernel(const Context& dev_ctx, const DenseTensor& x, @@ -149,29 +114,4 @@ DenseTensor Multiply(const Context& dev_ctx, return dense_out; } -template -DenseTensor Mean(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& axis, - bool keep_dim) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out); - MeanKernel(dev_ctx, x, axis, keep_dim, &dense_out); - return dense_out; -} - -template -DenseTensor Sum(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& axis, - DataType dtype, - bool keep_dim) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - SumInferMeta(x, axis, dtype, keep_dim, &meta_out); - SumKernel(dev_ctx, x, axis, dtype, keep_dim, &dense_out); - return dense_out; -} - } // namespace phi diff --git a/paddle/phi/kernels/reduce_all_kernel.cc b/paddle/phi/kernels/reduce_all_kernel.cc deleted file mode 100644 index 3cbd0976ad8..00000000000 --- a/paddle/phi/kernels/reduce_all_kernel.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_all_kernel.h" - -#include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { - -template -void AllKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - AllRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(all, GPU, ALL_LAYOUT, phi::AllKernel, bool) {} -#endif diff --git a/paddle/phi/kernels/reduce_all_kernel.h b/paddle/phi/kernels/reduce_all_kernel.h deleted file mode 100644 index 8d7a9ab3faf..00000000000 --- a/paddle/phi/kernels/reduce_all_kernel.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" - -namespace phi { - -template -void AllRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); - -template -void AllKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); -} // namespace phi diff --git a/paddle/phi/kernels/reduce_any_kernel.cc b/paddle/phi/kernels/reduce_any_kernel.cc deleted file mode 100644 index 371dd972129..00000000000 --- a/paddle/phi/kernels/reduce_any_kernel.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_any_kernel.h" - -#include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { - -template -void AnyKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - AnyRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {} -#endif diff --git a/paddle/phi/kernels/reduce_any_kernel.h b/paddle/phi/kernels/reduce_any_kernel.h deleted file mode 100644 index 0f505817084..00000000000 --- a/paddle/phi/kernels/reduce_any_kernel.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/phi/core/dense_tensor.h" - -namespace phi { - -template -void AnyRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); - -template -void AnyKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); -} // namespace phi diff --git a/paddle/phi/kernels/reduce_kernel.cc b/paddle/phi/kernels/reduce_kernel.cc new file mode 100644 index 00000000000..7638c782d54 --- /dev/null +++ b/paddle/phi/kernels/reduce_kernel.cc @@ -0,0 +1,165 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void SumKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + DataType out_dtype, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + SumRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out); +} + +template +void MeanKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + MeanRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void ProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + ProdRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void MaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + MaxRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void MinKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + MinRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void AllKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + AllRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void AnyKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + AnyRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +PD_REGISTER_KERNEL( + mean, CPU, ALL_LAYOUT, phi::MeanKernel, float, double, bool) {} + +PD_REGISTER_KERNEL(sum, + CPU, + ALL_LAYOUT, + phi::SumKernel, + bool, + float, + double, + phi::dtype::float16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} + +PD_REGISTER_KERNEL( 
+ prod, CPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL( + max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL( + min, CPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {} +PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +PD_REGISTER_KERNEL(mean, + GPU, + ALL_LAYOUT, + phi::MeanKernel, + float, + double, + bool, + int, + int64_t, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(sum, + GPU, + ALL_LAYOUT, + phi::SumKernel, + bool, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16, + int16_t, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} + +PD_REGISTER_KERNEL( + prod, GPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {} + +PD_REGISTER_KERNEL( + max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL( + min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(all, GPU, ALL_LAYOUT, phi::AllKernel, bool) {} +PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {} +#endif diff --git a/paddle/phi/kernels/reduce_kernel.h b/paddle/phi/kernels/reduce_kernel.h new file mode 100644 index 00000000000..75f52c36beb --- /dev/null +++ b/paddle/phi/kernels/reduce_kernel.h @@ -0,0 +1,154 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +template +void SumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out); + +template +void MeanRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void ProdRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void MinRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void AnyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void AllRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void SumKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + DataType out_dtype, + bool keep_dim, + DenseTensor* out); + +template +void MeanKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void ProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void MaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void MinKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void AnyKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +void AllKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out); + +template +DenseTensor Mean(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + bool keep_dim) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out); + MeanKernel(dev_ctx, x, axis, keep_dim, &dense_out); + return dense_out; +} + +template +DenseTensor Sum(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DataType dtype, + bool keep_dim) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + SumInferMeta(x, axis, dtype, keep_dim, &meta_out); + SumKernel(dev_ctx, x, axis, dtype, keep_dim, &dense_out); + return dense_out; +} + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_max_kernel.cc b/paddle/phi/kernels/reduce_max_kernel.cc deleted file mode 100644 index de172a12d72..00000000000 --- a/paddle/phi/kernels/reduce_max_kernel.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/reduce_max_kernel.h" - -#include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { - -template -void MaxKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - MaxRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL( - max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL( - max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} -#endif diff --git a/paddle/phi/kernels/reduce_max_kernel.h b/paddle/phi/kernels/reduce_max_kernel.h deleted file mode 100644 index 49a350519c5..00000000000 --- a/paddle/phi/kernels/reduce_max_kernel.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" - -namespace phi { - -template -void MaxRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); - -template -void MaxKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); -} // namespace phi diff --git a/paddle/phi/kernels/reduce_min_kernel.cc b/paddle/phi/kernels/reduce_min_kernel.cc deleted file mode 100644 index c8ec6b3678c..00000000000 --- a/paddle/phi/kernels/reduce_min_kernel.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/kernels/reduce_min_kernel.h" - -#include "paddle/phi/backends/all_context.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { - -template -void MinKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out) { - bool reduce_all = false; - MinRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); -} - -} // namespace phi - -PD_REGISTER_KERNEL( - min, CPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL( - min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} -#endif diff --git a/paddle/phi/kernels/reduce_min_kernel.h b/paddle/phi/kernels/reduce_min_kernel.h deleted file mode 100644 index 3227ec00e64..00000000000 --- a/paddle/phi/kernels/reduce_min_kernel.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" - -namespace phi { - -template -void MinRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); - -template -void MinKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - DenseTensor* out); -} // namespace phi diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index dcb00fe1b0c..789496ccbd0 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -52,8 +52,19 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { } KernelSignature ReduceProdOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "reduce_prod", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + if (ctx.IsDenseTensorInput("X")) { + bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); + // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in + // InferShape, so we must return the "max_raw" KernelSignature. + // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with + // the "max_raw" KernelSignature + if (ctx.IsForInferShape() || reduce_all) { + return KernelSignature( + "prod_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + } + return KernelSignature("prod", {"X"}, {"dim", "keep_dim"}, {"Out"}); + } + return KernelSignature("unregistered", {}, {}, {}); } KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) { @@ -107,10 +118,6 @@ KernelSignature ReduceAnyOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature ReduceAllOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.IsDenseTensorInput("X")) { bool reduce_all = paddle::any_cast(ctx.Attr("reduce_all")); - // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in - // InferShape, so we must return the "all_raw" KernelSignature. 
- // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with - // the "all_raw" KernelSignature if (ctx.IsForInferShape() || reduce_all) { return KernelSignature( "all_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); @@ -135,6 +142,7 @@ PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); PD_REGISTER_BASE_KERNEL_NAME(reduce_max, max); PD_REGISTER_BASE_KERNEL_NAME(reduce_min, min); +PD_REGISTER_BASE_KERNEL_NAME(reduce_prod, prod); PD_REGISTER_BASE_KERNEL_NAME(reduce_all, all); PD_REGISTER_BASE_KERNEL_NAME(reduce_any, any); diff --git a/paddle/phi/tests/kernels/test_mean_dev_api.cc b/paddle/phi/tests/kernels/test_mean_dev_api.cc index 23edfeacaf8..ce31b2021e0 100644 --- a/paddle/phi/tests/kernels/test_mean_dev_api.cc +++ b/paddle/phi/tests/kernels/test_mean_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/reduce_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/api/lib/utils/allocator.h" diff --git a/paddle/phi/tests/kernels/test_sum_dev_api.cc b/paddle/phi/tests/kernels/test_sum_dev_api.cc index dfec291bc07..82fa90c1574 100644 --- a/paddle/phi/tests/kernels/test_sum_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sum_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/reduce_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/api/lib/utils/allocator.h" -- GitLab From 517b1a7cea5cc4f353f8cb61342ed0994bc80c48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Wed, 16 Mar 2022 13:17:43 +0800 Subject: [PATCH 095/176] [infrt] add parse for infrt.dense_tensor_type. 
test=develop (#40592) --- paddle/infrt/dialect/infrt/ir/infrt_dialect.cc | 12 +++++++++++- paddle/infrt/dialect/phi/phi_ir_exec.cc | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc index 42de08ebc41..867d854ba3c 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc @@ -134,6 +134,10 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { return DenseTensorType::get( parser.getContext(), *targetType, *precisionType, *layoutType); } + + if (keyword == "dense_tensor_map") { + return DenseTensorMapType::get(parser.getContext()); + } // Todo: parse other type return mlir::Type(); } @@ -156,7 +160,7 @@ void InfrtDialect::printType(::mlir::Type type, } // print DenseTensorType, for example: !infrt.dense_tensor - if (type.isa()) { + if (type.isa()) { auto dense_tensor_type = type.cast(); os << "dense_tensor<" << dense_tensor_type.getTarget() << ", " << dense_tensor_type.getPrecision() << ", " @@ -164,6 +168,12 @@ void InfrtDialect::printType(::mlir::Type type, return; } + // print DenseTensorType, for example: !infrt.dense_tensor + if (type.isa()) { + os << "dense_tensor_map"; + return; + } + llvm_unreachable("unknown infrt type."); } diff --git a/paddle/infrt/dialect/phi/phi_ir_exec.cc b/paddle/infrt/dialect/phi/phi_ir_exec.cc index de61dba8e74..0beb5bff29f 100644 --- a/paddle/infrt/dialect/phi/phi_ir_exec.cc +++ b/paddle/infrt/dialect/phi/phi_ir_exec.cc @@ -18,7 +18,7 @@ #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" #include "paddle/infrt/dialect/mlir_loader.h" -#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" int main(int argc, char** argv) { static llvm::cl::opt input_file( -- GitLab From dce87e3d668235e845918f100057c3e3c17069d5 Mon Sep 17 00:00:00 2001 From: caozhou <48191911+Caozhou1995@users.noreply.github.com> Date: Wed, 16 Mar 2022 14:21:36 +0800 Subject: [PATCH 096/176] [Phi] Migrate multiplex, qr, tril_triu op kernel to phi (#40007) * migrate multiplex op kernel * migrate qr cpu kernel * migrate tril_triu op kernel * fix multiplex kernel * add kernel sig * fix dependence and bug * fix multiplex error * fix npu include error * fix conflict * fix conflict and delete tril_triu * fix date and multiplex input * adapt header file order * fix header file include * fix conflict * delete cholesky_solve_op.h * delete triangular_solve_op.h --- paddle/fluid/operators/lu_op.h | 30 ++--- paddle/fluid/operators/lu_unpack_op.h | 9 +- paddle/fluid/operators/multiplex_op.cc | 14 +-- paddle/fluid/operators/multiplex_op.cu | 117 ------------------ paddle/fluid/operators/multiplex_op.h | 96 -------------- paddle/fluid/operators/qr_op.cc | 2 - paddle/fluid/operators/qr_op.h | 79 ------------ paddle/fluid/operators/tril_triu_op.cc | 18 +-- paddle/fluid/operators/tril_triu_op.cu | 35 ------ paddle/fluid/operators/tril_triu_op.h | 102 --------------- paddle/fluid/operators/tril_triu_op_npu.cc | 2 +- paddle/fluid/operators/tril_triu_op_xpu.cc | 2 +- .../phi/kernels/cpu/multiplex_grad_kernel.cc | 65 ++++++++++ paddle/phi/kernels/cpu/multiplex_kernel.cc | 65 ++++++++++ paddle/phi/kernels/cpu/qr_kernel.cc | 116 +++++++++++++++++ .../phi/kernels/cpu/tril_triu_grad_kernel.cc | 29 +++++ paddle/phi/kernels/cpu/tril_triu_kernel.cc | 29 +++++ paddle/phi/kernels/funcs/tril_triu_compute.h | 48 
+++++++ .../phi/kernels/gpu/multiplex_grad_kernel.cu | 68 ++++++++++ paddle/phi/kernels/gpu/multiplex_kernel.cu | 70 +++++++++++ .../phi/kernels/gpu/tril_triu_grad_kernel.cu | 29 +++++ paddle/phi/kernels/gpu/tril_triu_kernel.cu | 29 +++++ .../impl/cholesky_solve_grad_kernel_impl.h | 7 +- .../impl/triangular_solve_grad_kernel_impl.h | 7 +- .../kernels/impl/tril_triu_grad_kernel_impl.h | 44 +++++++ .../phi/kernels/impl/tril_triu_kernel_impl.h | 43 +++++++ paddle/phi/kernels/multiplex_grad_kernel.h | 27 ++++ paddle/phi/kernels/multiplex_kernel.h | 27 ++++ paddle/phi/kernels/qr_kernel.h | 28 +++++ paddle/phi/kernels/tril_triu_grad_kernel.h | 28 +++++ paddle/phi/kernels/tril_triu_kernel.h | 28 +++++ paddle/phi/ops/compat/multiplex_sig.cc | 32 +++++ paddle/phi/ops/compat/qr_sig.cc | 25 ++++ paddle/phi/ops/compat/tril_triu_sig.cc | 34 +++++ 34 files changed, 896 insertions(+), 488 deletions(-) delete mode 100644 paddle/fluid/operators/multiplex_op.cu delete mode 100644 paddle/fluid/operators/multiplex_op.h delete mode 100644 paddle/fluid/operators/tril_triu_op.cu delete mode 100644 paddle/fluid/operators/tril_triu_op.h create mode 100644 paddle/phi/kernels/cpu/multiplex_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/multiplex_kernel.cc create mode 100644 paddle/phi/kernels/cpu/qr_kernel.cc create mode 100644 paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/tril_triu_kernel.cc create mode 100644 paddle/phi/kernels/funcs/tril_triu_compute.h create mode 100644 paddle/phi/kernels/gpu/multiplex_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/multiplex_kernel.cu create mode 100644 paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/tril_triu_kernel.cu create mode 100644 paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/tril_triu_kernel_impl.h create mode 100644 paddle/phi/kernels/multiplex_grad_kernel.h create mode 100644 paddle/phi/kernels/multiplex_kernel.h create mode 100644 paddle/phi/kernels/qr_kernel.h create mode 100644 paddle/phi/kernels/tril_triu_grad_kernel.h create mode 100644 paddle/phi/kernels/tril_triu_kernel.h create mode 100644 paddle/phi/ops/compat/multiplex_sig.cc create mode 100644 paddle/phi/ops/compat/qr_sig.cc create mode 100644 paddle/phi/ops/compat/tril_triu_sig.cc diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index 214b2eccae9..6e2ac4617da 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -18,8 +18,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/set_value_op.h" #include "paddle/fluid/operators/svd_helper.h" -#include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" #include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/triangular_solve_kernel.h" @@ -404,11 +404,12 @@ void LU_Unpack(const DeviceContext& dev_ctx, const framework::Tensor* LU, const auto W = udims[udims.size() - 1]; auto L_dataptr = L->mutable_data(dev_ctx.GetPlace()); platform::ForRange x_for_range(dev_ctx, LU->numel()); - TrilTriuCompute tril_computer(LU->data(), -1, true, H, W, L_dataptr); + phi::funcs::TrilTriuCompute tril_computer(LU->data(), -1, true, H, W, + L_dataptr); x_for_range(tril_computer); - TrilTriuCompute triu_computer(LU->data(), 0, false, H, W, - U->mutable_data(dev_ctx.GetPlace())); + phi::funcs::TrilTriuCompute triu_computer( + LU->data(), 0, false, H, W, U->mutable_data(dev_ctx.GetPlace())); x_for_range(triu_computer); // set L's diagonal 1 @@ -532,15 +533,15 @@ class LUGradKernel : public framework::OpKernel { auto phil_rank = LmHdims.size(); auto phiu_rank = UmHdims.size(); platform::ForRange l_for_range(dev_ctx, phi_L.numel()); - TrilTriuCompute tril_computer(phi_L.data(), -1, true, - LmHdims[phil_rank - 2], - LmHdims[phil_rank - 1], phi_L.data()); + phi::funcs::TrilTriuCompute tril_computer( + phi_L.data(), -1, true, LmHdims[phil_rank - 2], + LmHdims[phil_rank - 1], phi_L.data()); l_for_range(tril_computer); platform::ForRange u_for_range(dev_ctx, phi_U.numel()); - TrilTriuCompute triu_computer(phi_U.data(), 0, false, - UmHdims[phiu_rank - 2], - UmHdims[phiu_rank - 1], phi_U.data()); + phi::funcs::TrilTriuCompute triu_computer( + phi_U.data(), 0, false, UmHdims[phiu_rank - 2], + UmHdims[phiu_rank - 1], phi_U.data()); u_for_range(triu_computer); Tensor_Add(dev_ctx, phi_L, phi_U, &phi); @@ -591,8 +592,9 @@ class LUGradKernel : public framework::OpKernel { const auto W = phidims[phidims.size() - 1]; platform::ForRange x_for_range(dev_ctx, phi_complement.numel()); - TrilTriuCompute tril_computer(phi_complement.data(), -1, true, H, - W, phi_complement_l.data()); + phi::funcs::TrilTriuCompute tril_computer( + phi_complement.data(), -1, true, H, W, + phi_complement_l.data()); x_for_range(tril_computer); Tensor_Sub(dev_ctx, phi, phi_complement_l, &phi); @@ -664,8 +666,8 @@ class LUGradKernel : public framework::OpKernel { const auto W = phidims[phidims.size() - 1]; platform::ForRange x_for_range(dev_ctx, phi_complement.numel()); - TrilTriuCompute triu_computer(phi_complement.data(), 0, false, H, W, - phi_complement_u.data()); + phi::funcs::TrilTriuCompute triu_computer( + phi_complement.data(), 0, false, H, W, phi_complement_u.data()); x_for_range(triu_computer); Tensor_Sub(dev_ctx, phi, phi_complement_u, &phi); diff --git a/paddle/fluid/operators/lu_unpack_op.h b/paddle/fluid/operators/lu_unpack_op.h index d2303f2c08d..e4100867dc6 100644 --- a/paddle/fluid/operators/lu_unpack_op.h +++ b/paddle/fluid/operators/lu_unpack_op.h @@ -16,7 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/lu_op.h" -#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" namespace paddle { namespace operators { @@ -87,7 +88,8 @@ class LU_UnpackGradKernel : public framework::OpKernel { auto W = ldims[ldims.size() - 1]; auto L_dataptr = dl_tril.mutable_data(dev_ctx.GetPlace()); platform::ForRange l_for_range(dev_ctx, dl->numel()); - TrilTriuCompute tril_computer(dl->data(), -1, true, H, W, L_dataptr); + phi::funcs::TrilTriuCompute tril_computer(dl->data(), -1, true, H, W, + L_dataptr); l_for_range(tril_computer); const auto udims = du->dims(); @@ -96,7 +98,8 @@ class LU_UnpackGradKernel : public framework::OpKernel { W = udims[udims.size() - 1]; auto U_dataptr = du_triu.mutable_data(dev_ctx.GetPlace()); platform::ForRange u_for_range(dev_ctx, du->numel()); - TrilTriuCompute triu_computer(du->data(), 0, false, H, W, U_dataptr); + phi::funcs::TrilTriuCompute triu_computer(du->data(), 0, false, H, W, + U_dataptr); u_for_range(triu_computer); auto xdims = dx->dims(); diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index 313a479ea30..8771a6573cb 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/multiplex_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -169,15 +169,3 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, ops::MultiplexGradMaker, ops::MultiplexGradMaker); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); -REGISTER_OP_CPU_KERNEL( - multiplex, - ops::MultiplexCPUKernel, - ops::MultiplexCPUKernel, - ops::MultiplexCPUKernel, - ops::MultiplexCPUKernel); -REGISTER_OP_CPU_KERNEL( - multiplex_grad, - ops::MultiplexGradCPUKernel, - ops::MultiplexGradCPUKernel, - ops::MultiplexGradCPUKernel, - ops::MultiplexGradCPUKernel); diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu deleted file mode 100644 index 0a32ee96fb6..00000000000 --- a/paddle/fluid/operators/multiplex_op.cu +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/multiplex_op.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MultiplexGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto ins = ctx.MultiInput("X"); - auto* ids = ctx.Input("Ids"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - for (size_t i = 0; i < ins.size(); ++i) { - PADDLE_ENFORCE_GT( - ins[i]->numel(), 0, - platform::errors::OutOfRange( - "indexing will be out of bounds with size 0 for the %d-th input.", - i)); - } - - auto rows = ins[0]->dims()[0]; - auto cols = ins[0]->numel() / rows; - // copy index to cpu - Tensor index_t_cpu; - paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); - auto* index = index_t_cpu.data(); - auto stream = ctx.cuda_device_context().stream(); - platform::CUDAPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - int32_t k = index[i]; - PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet( - "index must be nonnegative.")); - PADDLE_ENFORCE_LT(static_cast(k), ins.size(), - platform::errors::PreconditionNotMet( - "index exceeds the number of candidate tensors.")); - memory::Copy(place, out->data() + i * cols, place, - ins[k]->data() + i * cols, cols * sizeof(T), stream); - } - } -}; - -template -class MultiplexGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* ids = ctx.Input("Ids"); - auto d_ins = ctx.MultiOutput(framework::GradVarName("X")); - - size_t idx = -1UL; - for (size_t i = 0; i < d_ins.size(); i++) { - if (d_ins[i]) { - d_ins[i]->mutable_data(ctx.GetPlace()); - auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(*ctx.template device_context().eigen_device()) = - t.constant(static_cast(0)); - - idx = i; - } - } - - if (idx == -1UL) return; - - auto rows = d_ins[idx]->dims()[0]; - auto cols = d_ins[idx]->numel() / rows; - // copy index to cpu - Tensor index_t_cpu; - paddle::framework::TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); - auto* index = index_t_cpu.data(); - - auto stream = ctx.cuda_device_context().stream(); - platform::CUDAPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - size_t k = static_cast(index[i]); - if (d_ins[k]) { - memory::Copy(place, d_ins[k]->data() + i * cols, place, - d_out->data() + i * cols, cols * sizeof(T), stream); - } - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - multiplex, - ops::MultiplexGPUKernel, - ops::MultiplexGPUKernel, - ops::MultiplexGPUKernel, - ops::MultiplexGPUKernel); -REGISTER_OP_CUDA_KERNEL( - multiplex_grad, - ops::MultiplexGradGPUKernel, - ops::MultiplexGradGPUKernel, - ops::MultiplexGradGPUKernel, - ops::MultiplexGradGPUKernel); diff --git a/paddle/fluid/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h deleted file mode 100644 index 1d0a009edee..00000000000 --- a/paddle/fluid/operators/multiplex_op.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" - -namespace paddle { -namespace operators { - -template -class MultiplexCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto ins = ctx.MultiInput("X"); - auto ids = ctx.Input("Ids"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - for (size_t i = 0; i < ins.size(); ++i) { - PADDLE_ENFORCE_GT( - ins[i]->numel(), 0, - platform::errors::OutOfRange( - "indexing will be out of bounds with size 0 for the %d-th input.", - i)); - } - - auto rows = ins[0]->dims()[0]; - auto cols = ins[0]->numel() / rows; - auto index = ids->data(); - platform::CPUPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - int32_t k = index[i]; - PADDLE_ENFORCE_GE(k, 0, platform::errors::PreconditionNotMet( - "index must be nonnegative.")); - PADDLE_ENFORCE_LT(static_cast(k), ins.size(), - platform::errors::PreconditionNotMet( - "index exceeds the number of candidate tensors.")); - memory::Copy(place, out->data() + i * cols, place, - ins[k]->data() + i * cols, cols * sizeof(T)); - } - } -}; - -template -class MultiplexGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* ids = ctx.Input("Ids"); - auto d_ins = - ctx.MultiOutput(framework::GradVarName("X")); - - size_t idx = -1UL; - for (size_t i = 0; i < d_ins.size(); i++) { - if (d_ins[i]) { - d_ins[i]->mutable_data(ctx.GetPlace()); - auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(*ctx.template device_context().eigen_device()) = - t.constant(static_cast(0)); - - idx = i; - } - } - - if (idx == -1UL) return; - - auto rows = d_ins[idx]->dims()[0]; - auto cols = d_ins[idx]->numel() / rows; - auto* index = ids->data(); - platform::CPUPlace place = ctx.GetPlace(); - for (auto i = 0; i < rows; i++) { - size_t k = static_cast(index[i]); - if (d_ins[k]) { - memory::Copy(place, d_ins[k]->data() + i * cols, place, - d_out->data() + i * cols, cols * sizeof(T)); - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/qr_op.cc b/paddle/fluid/operators/qr_op.cc index 40e3cbde3b0..82fc9ef1b78 100644 --- a/paddle/fluid/operators/qr_op.cc +++ b/paddle/fluid/operators/qr_op.cc @@ -145,8 +145,6 @@ REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker, REGISTER_OPERATOR(qr_grad, ops::QrGradOp); -REGISTER_OP_CPU_KERNEL(qr, ops::QrCPUKernel, ops::QrCPUKernel); - REGISTER_OP_CPU_KERNEL( qr_grad, ops::QrGradKernel, ops::QrGradKernel); diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index f09a07e96cd..5ef02d89427 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -48,85 +48,6 @@ static inline std::tuple _parse_qr_mode(std::string mode) { return std::make_tuple(compute_q, reduced); } -template -class QrCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& 
context) const override { - bool compute_q; - bool reduced_mode; - const Tensor& x = *context.Input("X"); - Tensor& q = *context.Output("Q"); - Tensor& r = *context.Output("R"); - std::string mode = context.Attr("mode"); - std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode); - - auto numel = x.numel(); - PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet( - "The input of QR is empty.")); - auto x_dims = x.dims(); - int x_rank = x_dims.size(); - int m = x_dims[x_rank - 2]; - int n = x_dims[x_rank - 1]; - int min_mn = std::min(m, n); - int k = reduced_mode ? min_mn : m; - int batch_size = numel / (m * n); - int x_stride = m * n; - int q_stride = m * k; - int r_stride = k * n; - - auto* x_data = x.data>(); - T* q_data = nullptr; - if (compute_q) { - q_data = q.mutable_data>( - context.GetPlace(), - size_t(batch_size * m * k * sizeof(phi::dtype::Real))); - memset(q_data, 0, - size_t(batch_size * m * k * sizeof(phi::dtype::Real))); - } - auto* r_data = r.mutable_data>( - context.GetPlace(), - size_t(batch_size * k * n * sizeof(phi::dtype::Real))); - memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::dtype::Real))); - - // Implement QR by calling Eigen - for (int i = 0; i < batch_size; ++i) { - const T* x_matrix_ptr = x_data + i * x_stride; - T* r_matrix_ptr = r_data + i * r_stride; - using EigenDynamicMatrix = - Eigen::Matrix; - auto x_matrix = Eigen::Map(x_matrix_ptr, m, n); - Eigen::HouseholderQR qr(x_matrix); - if (reduced_mode) { - auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n); - auto r_matrix_view = - qr_top_matrix.template triangularView(); - auto r_matrix = EigenDynamicMatrix(r_matrix_view); - memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); - } else { - auto r_matrix_view = - qr.matrixQR().template triangularView(); - auto r_matrix = EigenDynamicMatrix(r_matrix_view); - memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); - } - - if (compute_q) { - T* q_matrix_ptr = q_data + i * q_stride; - if (reduced_mode) { - auto q_matrix = - qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn); - q_matrix.transposeInPlace(); - memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); - } else { - auto q_matrix = - qr.householderQ() * EigenDynamicMatrix::Identity(m, m); - q_matrix.transposeInPlace(); - memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); - } - } - } - } -}; - template class QrGradKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/tril_triu_op.cc b/paddle/fluid/operators/tril_triu_op.cc index 3e943c62e1c..c8010e8a128 100644 --- a/paddle/fluid/operators/tril_triu_op.cc +++ b/paddle/fluid/operators/tril_triu_op.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/tril_triu_op.h" #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -104,19 +104,3 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker, ops::TrilTriuGradOpMaker, ops::TrilTriuGradOpMaker); REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp); -REGISTER_OP_CPU_KERNEL( - tril_triu, ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel); -REGISTER_OP_CPU_KERNEL( - tril_triu_grad, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel); diff --git a/paddle/fluid/operators/tril_triu_op.cu b/paddle/fluid/operators/tril_triu_op.cu deleted file mode 100644 index 9cbbdeeb2ce..00000000000 --- a/paddle/fluid/operators/tril_triu_op.cu +++ /dev/null @@ -1,35 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/tril_triu_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - tril_triu, ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel, - ops::TrilTriuOpKernel); -REGISTER_OP_CUDA_KERNEL( - tril_triu_grad, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel, - ops::TrilTriuGradOpKernel); diff --git a/paddle/fluid/operators/tril_triu_op.h b/paddle/fluid/operators/tril_triu_op.h deleted file mode 100644 index 3150b7617d1..00000000000 --- a/paddle/fluid/operators/tril_triu_op.h +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -class TrilTriuCompute { - public: - HOSTDEVICE TrilTriuCompute(const T* in, const int diagonal, const bool lower, - const int64_t H, const int64_t W, T* out) - : in_(in), diagonal_(diagonal), lower_(lower), H_(H), W_(W), out_(out) {} - - HOSTDEVICE void operator()(int64_t idx) { - const int64_t row = (idx / W_) % H_; - const int64_t col = idx % W_; - const bool mask = - lower_ ? 
(col - row > diagonal_) : (col - row < diagonal_); - out_[idx] = mask ? static_cast(0) : in_[idx]; - } - - private: - const T* in_; - const int diagonal_; - const bool lower_; - const int64_t H_; - const int64_t W_; - T* out_; -}; - -template -class TrilTriuOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto* x = context.Input("X"); - const auto* x_data = x->data(); - auto* out = context.Output("Out"); - auto* out_data = out->mutable_data(context.GetPlace()); - - const int diagonal = context.Attr("diagonal"); - const bool lower = context.Attr("lower"); - - const auto& dims = x->dims(); - const auto H = dims[dims.size() - 2]; - const auto W = dims[dims.size() - 1]; - - platform::ForRange for_range( - context.template device_context(), - static_cast(x->numel())); - - paddle::operators::TrilTriuCompute tril_triu_computer( - x_data, diagonal, lower, H, W, out_data); - for_range(tril_triu_computer); - } -}; - -template -class TrilTriuGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto* d_out = - context.Input(framework::GradVarName("Out")); - const auto* dout_data = d_out->data(); - auto* d_x = context.Output(framework::GradVarName("X")); - auto* dx_data = d_x->mutable_data(context.GetPlace()); - - const int diagonal = context.Attr("diagonal"); - const bool lower = context.Attr("lower"); - - const auto& dims = d_out->dims(); - const auto H = dims[dims.size() - 2]; - const auto W = dims[dims.size() - 1]; - - platform::ForRange for_range( - context.template device_context(), - static_cast(d_out->numel())); - - paddle::operators::TrilTriuCompute tril_triu_grad_computer( - dout_data, diagonal, lower, H, W, dx_data); - for_range(tril_triu_grad_computer); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc index ad1c1814c05..4145730357d 100644 --- a/paddle/fluid/operators/tril_triu_op_npu.cc +++ b/paddle/fluid/operators/tril_triu_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/tril_triu_op_xpu.cc b/paddle/fluid/operators/tril_triu_op_xpu.cc index e36cbcf228c..a44ea8ff689 100644 --- a/paddle/fluid/operators/tril_triu_op_xpu.cc +++ b/paddle/fluid/operators/tril_triu_op_xpu.cc @@ -11,7 +11,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { diff --git a/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc b/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc new file mode 100644 index 00000000000..f5a426e93db --- /dev/null +++ b/paddle/phi/kernels/cpu/multiplex_grad_kernel.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/multiplex_grad_kernel.h" + +#include "paddle/fluid/memory/memcpy.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void MultiplexGradKernel(const Context& ctx, + const DenseTensor& ids, + const DenseTensor& out_grad, + std::vector ins_grad) { + size_t idx = -1UL; + for (size_t i = 0; i < ins_grad.size(); i++) { + if (ins_grad[i]) { + ctx.template Alloc(ins_grad[i]); + auto t = phi::EigenVector::Flatten(*ins_grad[i]); + t.device(*ctx.eigen_device()) = t.constant(static_cast(0)); + idx = i; + } + } + if (idx == -1UL) return; + + auto rows = ins_grad[idx]->dims()[0]; + auto cols = ins_grad[idx]->numel() / rows; + auto* index = ids.data(); + for (auto i = 0; i < rows; i++) { + size_t k = static_cast(index[i]); + if (ins_grad[k]) { + paddle::memory::Copy(ctx.GetPlace(), + ins_grad[k]->data() + i * cols, + ctx.GetPlace(), + out_grad.data() + i * cols, + cols * sizeof(T)); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiplex_grad, + CPU, + ALL_LAYOUT, + phi::MultiplexGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/multiplex_kernel.cc b/paddle/phi/kernels/cpu/multiplex_kernel.cc new file mode 100644 index 00000000000..2d9f4c51a98 --- /dev/null +++ b/paddle/phi/kernels/cpu/multiplex_kernel.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/multiplex_kernel.h" + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void MultiplexKernel(const Context& ctx, + const std::vector& ins, + const DenseTensor& ids, + DenseTensor* out) { + ctx.template Alloc(out); + for (size_t i = 0; i < ins.size(); ++i) { + PADDLE_ENFORCE_GT( + ins[i]->numel(), + 0, + errors::OutOfRange( + "indexing will be out of bounds with size 0 for the %d-th input.", + i)); + } + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + auto index = ids.data(); + for (auto i = 0; i < rows; i++) { + int32_t k = index[i]; + PADDLE_ENFORCE_GE( + k, 0, errors::PreconditionNotMet("index must be nonnegative.")); + PADDLE_ENFORCE_LT(static_cast(k), + ins.size(), + errors::PreconditionNotMet( + "index exceeds the number of candidate tensors.")); + paddle::memory::Copy(ctx.GetPlace(), + out->data() + i * cols, + ctx.GetPlace(), + ins[k]->data() + i * cols, + cols * sizeof(T)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiplex, + CPU, + ALL_LAYOUT, + phi::MultiplexKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/qr_kernel.cc b/paddle/phi/kernels/cpu/qr_kernel.cc new file mode 100644 index 00000000000..e2e32567441 --- /dev/null +++ b/paddle/phi/kernels/cpu/qr_kernel.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/phi/kernels/qr_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" + +namespace phi { + +static inline std::tuple ParseQrMode(const std::string& mode) { + bool compute_q; + bool reduced; + if (mode == "reduced") { + compute_q = true; + reduced = true; + } else if (mode == "complete") { + compute_q = true; + reduced = false; + } else if (mode == "r") { + compute_q = false; + reduced = true; + } else { + PADDLE_THROW(errors::InvalidArgument( + "QR received unrecognized mode '%s'" + " but expected one of 'reduced' (default), 'r', or 'complete'", + mode)); + } + return std::make_tuple(compute_q, reduced); +} + +template +void QrKernel(const Context& ctx, + const DenseTensor& x, + const std::string& mode, + DenseTensor* q, + DenseTensor* r) { + bool compute_q; + bool reduced_mode; + std::tie(compute_q, reduced_mode) = ParseQrMode(mode); + auto numel = x.numel(); + PADDLE_ENFORCE_GT( + numel, 0, errors::PreconditionNotMet("The input of QR is empty.")); + auto x_dims = x.dims(); + int x_rank = x_dims.size(); + int m = x_dims[x_rank - 2]; + int n = x_dims[x_rank - 1]; + int min_mn = std::min(m, n); + int k = reduced_mode ? 
min_mn : m; + int batch_size = numel / (m * n); + int x_stride = m * n; + int q_stride = m * k; + int r_stride = k * n; + auto* x_data = x.data>(); + T* q_data = nullptr; + if (compute_q) { + q_data = ctx.template Alloc>( + q, batch_size * m * k * sizeof(phi::dtype::Real)); + } + auto* r_data = ctx.template Alloc>( + r, batch_size * k * n * sizeof(phi::dtype::Real)); + + // Implement QR by calling Eigen + for (int i = 0; i < batch_size; ++i) { + const T* x_matrix_ptr = x_data + i * x_stride; + T* r_matrix_ptr = r_data + i * r_stride; + using EigenDynamicMatrix = + Eigen::Matrix; + auto x_matrix = Eigen::Map(x_matrix_ptr, m, n); + Eigen::HouseholderQR qr(x_matrix); + if (reduced_mode) { + auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n); + auto r_matrix_view = + qr_top_matrix.template triangularView(); + auto r_matrix = EigenDynamicMatrix(r_matrix_view); + memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); + } else { + auto r_matrix_view = + qr.matrixQR().template triangularView(); + auto r_matrix = EigenDynamicMatrix(r_matrix_view); + memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); + } + + if (compute_q) { + T* q_matrix_ptr = q_data + i * q_stride; + if (reduced_mode) { + auto q_matrix = + qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn); + q_matrix.transposeInPlace(); + memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); + } else { + auto q_matrix = qr.householderQ() * EigenDynamicMatrix::Identity(m, m); + q_matrix.transposeInPlace(); + memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(qr, CPU, ALL_LAYOUT, phi::QrKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc b/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc new file mode 100644 index 00000000000..14aca258a2c --- /dev/null +++ b/paddle/phi/kernels/cpu/tril_triu_grad_kernel.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(tril_triu_grad, + CPU, + ALL_LAYOUT, + phi::TrilTriuGradKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/tril_triu_kernel.cc b/paddle/phi/kernels/cpu/tril_triu_kernel.cc new file mode 100644 index 00000000000..a3d20e55e21 --- /dev/null +++ b/paddle/phi/kernels/cpu/tril_triu_kernel.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(tril_triu, + CPU, + ALL_LAYOUT, + phi::TrilTriuKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/funcs/tril_triu_compute.h b/paddle/phi/kernels/funcs/tril_triu_compute.h new file mode 100644 index 00000000000..d2b6f1e559d --- /dev/null +++ b/paddle/phi/kernels/funcs/tril_triu_compute.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { +namespace funcs { + +template +class TrilTriuCompute { + public: + HOSTDEVICE TrilTriuCompute(const T* in, + const int diagonal, + const bool lower, + const int64_t H, + const int64_t W, + T* out) + : in_(in), diagonal_(diagonal), lower_(lower), H_(H), W_(W), out_(out) {} + + HOSTDEVICE void operator()(int64_t idx) { + const int64_t row = (idx / W_) % H_; + const int64_t col = idx % W_; + const bool mask = + lower_ ? (col - row > diagonal_) : (col - row < diagonal_); + out_[idx] = mask ? static_cast(0) : in_[idx]; + } + + private: + const T* in_; + const int diagonal_; + const bool lower_; + const int64_t H_; + const int64_t W_; + T* out_; +}; +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu b/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu new file mode 100644 index 00000000000..21576ab608d --- /dev/null +++ b/paddle/phi/kernels/gpu/multiplex_grad_kernel.cu @@ -0,0 +1,68 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
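The TrilTriuCompute functor above zeroes an element whenever col - row falls on the excluded side of diagonal_, and keeps it otherwise. A minimal NumPy sketch of that element-wise rule for a single (H, W) matrix, cross-checked against np.tril/np.triu (used here only for illustration):

import numpy as np

def tril_triu_compute(x, diagonal, lower):
    # Mirrors TrilTriuCompute: zero out elements on the excluded side of the diagonal.
    h, w = x.shape
    row = np.arange(h)[:, None]
    col = np.arange(w)[None, :]
    mask = (col - row > diagonal) if lower else (col - row < diagonal)
    return np.where(mask, 0.0, x)

x = np.arange(1, 13, dtype=np.float32).reshape(3, 4)
np.testing.assert_array_equal(tril_triu_compute(x, 0, True), np.tril(x))
np.testing.assert_array_equal(tril_triu_compute(x, 1, False), np.triu(x, k=1))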
+ +#include "paddle/phi/kernels/multiplex_grad_kernel.h" + +#include "paddle/phi/api/lib/utils/tensor_utils.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void MultiplexGradKernel(const Context& ctx, + const DenseTensor& ids, + const DenseTensor& out_grad, + std::vector ins_grad) { + size_t idx = -1UL; + for (size_t i = 0; i < ins_grad.size(); i++) { + if (ins_grad[i]) { + ctx.template Alloc(ins_grad[i]); + auto t = phi::EigenVector::Flatten(*ins_grad[i]); + t.device(*ctx.eigen_device()) = t.constant(static_cast(0)); + idx = i; + } + } + if (idx == -1UL) return; + + auto rows = ins_grad[idx]->dims()[0]; + auto cols = ins_grad[idx]->numel() / rows; + DenseTensor index_t_cpu; + paddle::framework::TensorCopySync(ids, phi::CPUPlace(), &index_t_cpu); + auto* index = index_t_cpu.data(); + auto stream = ctx.stream(); + for (auto i = 0; i < rows; i++) { + size_t k = static_cast(index[i]); + if (ins_grad[k]) { + paddle::memory::Copy(ctx.GetPlace(), + ins_grad[k]->data() + i * cols, + ctx.GetPlace(), + out_grad.data() + i * cols, + cols * sizeof(T), + stream); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiplex_grad, + GPU, + ALL_LAYOUT, + phi::MultiplexGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/multiplex_kernel.cu b/paddle/phi/kernels/gpu/multiplex_kernel.cu new file mode 100644 index 00000000000..743448a4686 --- /dev/null +++ b/paddle/phi/kernels/gpu/multiplex_kernel.cu @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/multiplex_kernel.h" + +#include "paddle/phi/api/lib/utils/tensor_utils.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void MultiplexKernel(const Context& ctx, + const std::vector& ins, + const DenseTensor& ids, + DenseTensor* out) { + ctx.template Alloc(out); + for (size_t i = 0; i < ins.size(); ++i) { + PADDLE_ENFORCE_GT( + ins[i]->numel(), + 0, + errors::OutOfRange( + "indexing will be out of bounds with size 0 for the %d-th input.", + i)); + } + + auto rows = ins[0]->dims()[0]; + auto cols = ins[0]->numel() / rows; + DenseTensor index_t_cpu; + paddle::framework::TensorCopySync(ids, phi::CPUPlace(), &index_t_cpu); + auto* index = index_t_cpu.data(); + auto stream = ctx.stream(); + for (auto i = 0; i < rows; i++) { + int32_t k = index[i]; + PADDLE_ENFORCE_GE( + k, 0, errors::PreconditionNotMet("index must be nonnegative.")); + PADDLE_ENFORCE_LT(static_cast(k), + ins.size(), + errors::PreconditionNotMet( + "index exceeds the number of candidate tensors.")); + paddle::memory::Copy(ctx.GetPlace(), + out->data() + i * cols, + ctx.GetPlace(), + ins[k]->data() + i * cols, + cols * sizeof(T), + stream); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(multiplex, + GPU, + ALL_LAYOUT, + phi::MultiplexKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu b/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu new file mode 100644 index 00000000000..bc3ef1bc623 --- /dev/null +++ b/paddle/phi/kernels/gpu/tril_triu_grad_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(tril_triu_grad, + GPU, + ALL_LAYOUT, + phi::TrilTriuGradKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/tril_triu_kernel.cu b/paddle/phi/kernels/gpu/tril_triu_kernel.cu new file mode 100644 index 00000000000..8c48edf9eff --- /dev/null +++ b/paddle/phi/kernels/gpu/tril_triu_kernel.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(tril_triu, + GPU, + ALL_LAYOUT, + phi::TrilTriuKernel, + bool, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h index 9f557e74637..72741e6d3a0 100644 --- a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h @@ -24,13 +24,12 @@ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_reduce.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" #include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/tril_triu_op.h" - namespace phi { template @@ -115,7 +114,7 @@ void CholeskySolveGradKernel(const Context& dev_ctx, const auto H = y_bst_dims_vec[y_bst_ndim - 2]; const auto W = y_bst_dims_vec[y_bst_ndim - 1]; phi::funcs::ForRange y_for_range(dev_ctx, dy_bst.numel()); - paddle::operators::TrilTriuCompute tril_triu_functor( + phi::funcs::TrilTriuCompute tril_triu_functor( dy_bst.data(), 0, !upper, H, W, dy_bst_upper.data()); y_for_range(tril_triu_functor); diff --git a/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h index 9b1e4b1d3a6..044adb0230c 100644 --- a/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h @@ -21,12 +21,11 @@ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_reduce.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" #include "paddle/phi/kernels/triangular_solve_kernel.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/tril_triu_op.h" - namespace phi { template @@ -119,7 +118,7 @@ void TriangularSolveGradKernel(const Context& dev_ctx, const auto H = dims[dims.size() - 2]; const auto W = dims[dims.size() - 1]; phi::funcs::ForRange x_for_range(dev_ctx, dx_bst.numel()); - paddle::operators::TrilTriuCompute tril_triu_functor( + phi::funcs::TrilTriuCompute tril_triu_functor( dx_bst.data(), unitriangular, !upper, H, W, dx_bst_upper.data()); x_for_range(tril_triu_functor); diff --git a/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h b/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h new file mode 100644 index 00000000000..dcc7224b507 --- /dev/null +++ b/paddle/phi/kernels/impl/tril_triu_grad_kernel_impl.h @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/tril_triu_grad_kernel.h" + +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" + +namespace phi { + +template +void TrilTriuGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int diagonal, + bool lower, + DenseTensor* x_grad) { + const auto* dout_data = out_grad.data(); + auto* dx_data = ctx.template Alloc(x_grad); + + const auto& dims = out_grad.dims(); + const auto H = dims[dims.size() - 2]; + const auto W = dims[dims.size() - 1]; + + phi::funcs::ForRange for_range( + ctx, static_cast(out_grad.numel())); + phi::funcs::TrilTriuCompute tril_triu_grad_computer( + dout_data, diagonal, lower, H, W, dx_data); + for_range(tril_triu_grad_computer); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/tril_triu_kernel_impl.h b/paddle/phi/kernels/impl/tril_triu_kernel_impl.h new file mode 100644 index 00000000000..959169d87ce --- /dev/null +++ b/paddle/phi/kernels/impl/tril_triu_kernel_impl.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/tril_triu_kernel.h" + +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/tril_triu_compute.h" + +namespace phi { + +template +void TrilTriuKernel(const Context& ctx, + const DenseTensor& x, + int diagonal, + bool lower, + DenseTensor* out) { + const auto* x_data = x.data(); + auto* out_data = ctx.template Alloc(out); + + const auto& dims = x.dims(); + const auto H = dims[dims.size() - 2]; + const auto W = dims[dims.size() - 1]; + phi::funcs::ForRange for_range(ctx, static_cast(x.numel())); + + phi::funcs::TrilTriuCompute tril_triu_computer( + x_data, diagonal, lower, H, W, out_data); + for_range(tril_triu_computer); +} + +} // namespace phi diff --git a/paddle/phi/kernels/multiplex_grad_kernel.h b/paddle/phi/kernels/multiplex_grad_kernel.h new file mode 100644 index 00000000000..b32c9dbe100 --- /dev/null +++ b/paddle/phi/kernels/multiplex_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MultiplexGradKernel(const Context& ctx, + const DenseTensor& ids, + const DenseTensor& out_grad, + std::vector ins_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/multiplex_kernel.h b/paddle/phi/kernels/multiplex_kernel.h new file mode 100644 index 00000000000..341c6d5cabb --- /dev/null +++ b/paddle/phi/kernels/multiplex_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MultiplexKernel(const Context& ctx, + const std::vector& ins, + const DenseTensor& ids, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/qr_kernel.h b/paddle/phi/kernels/qr_kernel.h new file mode 100644 index 00000000000..9c3dfb16601 --- /dev/null +++ b/paddle/phi/kernels/qr_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void QrKernel(const Context& ctx, + const DenseTensor& x, + const std::string& mode, + DenseTensor* q, + DenseTensor* r); + +} // namespace phi diff --git a/paddle/phi/kernels/tril_triu_grad_kernel.h b/paddle/phi/kernels/tril_triu_grad_kernel.h new file mode 100644 index 00000000000..10faf5c48d5 --- /dev/null +++ b/paddle/phi/kernels/tril_triu_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TrilTriuGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int diagonal, + bool lower, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/tril_triu_kernel.h b/paddle/phi/kernels/tril_triu_kernel.h new file mode 100644 index 00000000000..4daa84e25c3 --- /dev/null +++ b/paddle/phi/kernels/tril_triu_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TrilTriuKernel(const Context& ctx, + const DenseTensor& x, + int diagonal, + bool lower, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/multiplex_sig.cc b/paddle/phi/ops/compat/multiplex_sig.cc new file mode 100644 index 00000000000..9dab4655d17 --- /dev/null +++ b/paddle/phi/ops/compat/multiplex_sig.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature MultiplexOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("multiplex", {"X", "Ids"}, {}, {"Out"}); +} + +KernelSignature MultiplexGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "multiplex_grad", {"Ids", GradVarName("Out")}, {}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(multiplex, phi::MultiplexOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(multiplex_grad, phi::MultiplexGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/qr_sig.cc b/paddle/phi/ops/compat/qr_sig.cc new file mode 100644 index 00000000000..dd424d590ee --- /dev/null +++ b/paddle/phi/ops/compat/qr_sig.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature QrOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("qr", {"X"}, {"mode"}, {"Q", "R"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(qr, phi::QrOpArgumentMapping); diff --git a/paddle/phi/ops/compat/tril_triu_sig.cc b/paddle/phi/ops/compat/tril_triu_sig.cc new file mode 100644 index 00000000000..4f79f8650de --- /dev/null +++ b/paddle/phi/ops/compat/tril_triu_sig.cc @@ -0,0 +1,34 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature TrilTriuOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("tril_triu", {"X"}, {"diagonal", "lower"}, {"Out"}); +} + +KernelSignature TrilTriuGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("tril_triu_grad", + {GradVarName("Out")}, + {"diagonal", "lower"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(tril_triu, phi::TrilTriuOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tril_triu_grad, phi::TrilTriuGradOpArgumentMapping); -- GitLab From d1a98f0bb72822f44c88bec8a82e4e7fc031ba9f Mon Sep 17 00:00:00 2001 From: TTerror Date: Wed, 16 Mar 2022 15:18:18 +0800 Subject: [PATCH 097/176] fix xpu op test, *test=kunlun (#40409) --- .../fluid/tests/unittests/op_test_xpu.py | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/python/paddle/fluid/tests/unittests/op_test_xpu.py index 50ea0652094..6c964a828ee 100644 --- a/python/paddle/fluid/tests/unittests/op_test_xpu.py +++ b/python/paddle/fluid/tests/unittests/op_test_xpu.py @@ -123,17 +123,26 @@ class XPUOpTest(OpTest): return super().check_grad_with_place( place, inputs_to_check, output_names, no_grad_set, numeric_grad_delta, in_place, max_relative_error, - user_defined_grads, user_defined_grads, check_dygraph) + user_defined_grads, user_defined_grad_outputs, check_dygraph) a1 = self.get_grad_with_place( - place, inputs_to_check, output_names, no_grad_set=no_grad_set) + place, + inputs_to_check, + output_names, + no_grad_set=no_grad_set, + user_defined_grad_outputs=user_defined_grad_outputs) a2 = self.get_grad_with_place( - place, inputs_to_check, output_names, no_grad_set=no_grad_set) + place, + inputs_to_check, + output_names, + no_grad_set=no_grad_set, + 
user_defined_grad_outputs=user_defined_grad_outputs) a3 = self.get_grad_with_place( paddle.CPUPlace(), inputs_to_check, output_names, - no_grad_set=no_grad_set) + no_grad_set=no_grad_set, + user_defined_grad_outputs=user_defined_grad_outputs) self._assert_is_close(a1, a2, inputs_to_check, 0.00000001, "Gradient Check On two xpu") self._assert_is_close(a1, a3, inputs_to_check, max_relative_error, @@ -147,7 +156,7 @@ class XPUOpTest(OpTest): numeric_grad_delta=0.005, in_place=False, max_relative_error=0.005, - user_defined_grads=None, + user_defined_grad_outputs=None, check_dygraph=True): self.scope = core.Scope() op_inputs = self.inputs if hasattr(self, "inputs") else dict() @@ -197,6 +206,10 @@ class XPUOpTest(OpTest): if not type(output_names) is list: output_names = [output_names] - analytic_grads = self._get_gradient(inputs_to_check, place, - output_names, no_grad_set) + analytic_grads = self._get_gradient( + inputs_to_check, + place, + output_names, + no_grad_set, + user_defined_grad_outputs=user_defined_grad_outputs) return analytic_grads -- GitLab From c040bbd7b3c4df056ef2107982d0fbd8489dfa2f Mon Sep 17 00:00:00 2001 From: Ming-Xu Huang Date: Wed, 16 Mar 2022 15:41:11 +0800 Subject: [PATCH 098/176] Add Support Layer List to ASP (#40253) --- .../paddle/fluid/contrib/sparsity/__init__.py | 3 +- python/paddle/fluid/contrib/sparsity/asp.py | 79 +++++--- .../contrib/sparsity/supported_layer_list.py | 86 +++++++++ .../asp/test_asp_customized_pruning.py | 179 ++++++++++++++++++ .../test_fleet_sharding_meta_optimizer.py | 6 +- 5 files changed, 320 insertions(+), 33 deletions(-) create mode 100644 python/paddle/fluid/contrib/sparsity/supported_layer_list.py create mode 100644 python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py diff --git a/python/paddle/fluid/contrib/sparsity/__init__.py b/python/paddle/fluid/contrib/sparsity/__init__.py index 9bf45f42727..ec288a12871 100644 --- a/python/paddle/fluid/contrib/sparsity/__init__.py +++ b/python/paddle/fluid/contrib/sparsity/__init__.py @@ -29,10 +29,11 @@ from .asp import decorate from .asp import prune_model from .asp import set_excluded_layers from .asp import reset_excluded_layers +from .supported_layer_list import add_supported_layer __all__ = [ 'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', 'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity', 'MaskAlgo', 'CheckMethod', 'decorate', 'prune_model', 'set_excluded_layers', - 'reset_excluded_layers' + 'reset_excluded_layers', 'add_supported_layer' ] diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py index ffa12ac7046..30439ad736d 100644 --- a/python/paddle/fluid/contrib/sparsity/asp.py +++ b/python/paddle/fluid/contrib/sparsity/asp.py @@ -23,6 +23,8 @@ import paddle from paddle.fluid import global_scope, program_guard, layers from paddle.fluid.initializer import ConstantInitializer from paddle.fluid.contrib import sparsity +from paddle.fluid.contrib.sparsity.supported_layer_list import supported_layers_and_prune_func_map +from paddle.fluid.contrib.sparsity.supported_layer_list import _default_pruning from paddle.fluid import core OpRole = core.op_proto_and_checker_maker.OpRole @@ -292,8 +294,8 @@ class ASPHelper(object): 2. pruning well-trained models into 2:4 sparse pattern on FP16 or 1:2 sparse pattern on FP32 for fine-tuning. 
""" - MASK_APPENDDED_NAME = '_asp_mask' - SUPPORTED_LAYERS = {'fc': 'w_0', 'linear': 'w_0', 'conv2d': 'w_0'} + MASK_APPENDDED_NAME = 'asp_mask' + PADDLE_WEIGHT_SUFFIX = "w_" __asp_info = {} @@ -334,7 +336,6 @@ class ASPHelper(object): r""" This is the implementation of `sparsity.prune_model`, for details please see explanation in `sparsity.prune_model`. """ - checked_func_name = sparsity.CheckMethod.get_checking_method(mask_algo) if main_program is None: main_program = paddle.static.default_main_program() @@ -345,33 +346,27 @@ class ASPHelper(object): weight_tensor = global_scope().find_var(param.name).get_tensor() weight_nparray = np.array(weight_tensor) - # The double transpose ops here make sure pruning direction consistent with cuSparseLt. - # SPMMA in cuSparseLt: D = (AxB) + C, where matrix A (mxk) is sparse matrix. - # cuSparseLt would prune matrix A along k dimension. - # In sparse training, layer weight matriices is viewed sparse matrix A, so - # the math fomula should be 'Act(WX + b)'. However, default fomula in PaddlePaddle - # is 'Act(XW + b)'. For enabling SPMMA, weights and inputs should be transposed - # for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune alog k dimension - # of W^T, which is m dimension of W. Moreove, all mask generating functions in - # sparsity/utils is row-major pruning. That is the reason we have to transpose weight - # matrices beforce invoking create_mask. Then we transpose the result maks to make - # sure its shape to be the same as the input weight. - weight_sparse_mask = sparsity.create_mask( - weight_nparray.T, func_name=mask_algo, n=n, m=m).T - weight_pruned_nparray = np.multiply(weight_nparray, - weight_sparse_mask) + prune_func = ASPHelper._get_prune_func_by_name(param.name) + + weight_pruned_nparray, weight_sparse_mask = \ + prune_func(weight_nparray, m, n, mask_algo, param.name) + weight_pruned_nparray = weight_pruned_nparray.astype( + weight_nparray.dtype) weight_tensor.set(weight_pruned_nparray, place) - assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \ - 'Pruning {} weight matrix failure!!!'.format(param.name) + if with_mask: weight_mask_param = global_scope().find_var( ASPHelper._get_mask_name(param.name)) assert weight_mask_param is not None, \ - 'Cannot find {} variable, please call ASPHelper.minimize' \ + 'Cannot find {} variable, please call optimizer.minimize (' \ + 'paddle.sparsity.decorate(optimizer).minimize(loss)' \ ' and initialization (exe.run(startup_program)) first!'.format(ASPHelper._get_mask_name(param.name)) weight_mask_tensor = weight_mask_param.get_tensor() + weight_sparse_mask = weight_sparse_mask.astype( + np.array(weight_mask_tensor).dtype) weight_mask_tensor.set(weight_sparse_mask, place) asp_info.update_masks(param.name, weight_sparse_mask) + return asp_info.masks.copy() @staticmethod @@ -384,7 +379,7 @@ class ASPHelper(object): Returns: string: The mask name of :attr:`param_name`. """ - return param_name + ASPHelper.MASK_APPENDDED_NAME + return param_name + "." 
+ ASPHelper.MASK_APPENDDED_NAME @staticmethod def _get_not_ASP_relevant_vars(main_program): @@ -434,19 +429,46 @@ class ASPHelper(object): # fc_0.w_0 -> True # fc_0.b_0 -> False """ - if ASPHelper.MASK_APPENDDED_NAME in param_name: + param_name_list = param_name.split('.') + + if ASPHelper.MASK_APPENDDED_NAME in param_name_list: return False for layer in cls._get_program_asp_info(main_program).excluded_layers: if layer in param_name: return False - for name in ASPHelper.SUPPORTED_LAYERS: - if name in param_name and \ - ASPHelper.SUPPORTED_LAYERS[name] in param_name: - return True + if param_name in supported_layers_and_prune_func_map: + return True + + param_name_no_weight_suffix = param_name_list[0] + param_type_suffix = param_name_list[1] + layer_name = param_name_no_weight_suffix[:param_name_no_weight_suffix. + rfind('_')] + if ASPHelper.PADDLE_WEIGHT_SUFFIX not in param_type_suffix: + return False + + if param_name_no_weight_suffix in supported_layers_and_prune_func_map or \ + layer_name in supported_layers_and_prune_func_map: + return True + return False + @classmethod + def _get_prune_func_by_name(cls, param_name): + func = supported_layers_and_prune_func_map.get(param_name, None) + param_name_no_weight_suffix = param_name.split('.')[0] + if func is None: + func = supported_layers_and_prune_func_map.get( + param_name_no_weight_suffix, None) + if func is None: + layer_name = param_name_no_weight_suffix[: + param_name_no_weight_suffix. + rfind('_')] + func = supported_layers_and_prune_func_map.get(layer_name, + _default_pruning) + return func + @classmethod def _minimize(cls, optimizer, @@ -509,8 +531,7 @@ class ASPHelper(object): if ASPHelper._is_supported_layer(main_program, param_and_grad[0].name): mask_param = layers.create_parameter( - name=param_and_grad[0].name + - ASPHelper.MASK_APPENDDED_NAME, + name=ASPHelper._get_mask_name(param_and_grad[0].name), shape=param_and_grad[0].shape, dtype=param_and_grad[0].dtype, default_initializer=ConstantInitializer(value=1.0)) diff --git a/python/paddle/fluid/contrib/sparsity/supported_layer_list.py b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py new file mode 100644 index 00000000000..105c2ded9ee --- /dev/null +++ b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py @@ -0,0 +1,86 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +from paddle.fluid.contrib import sparsity +import threading + +__all__ = ['add_supported_layer'] + + +def _default_pruning(weight_nparray, m, n, func_name, param_name): + + checked_func_name = sparsity.CheckMethod.get_checking_method(func_name) + + # The double transpose ops here make sure pruning direction consistent with cuSparseLt. + # SPMMA in cuSparseLt: D = (AxB) + C, where matrix A (mxk) is sparse matrix. + # cuSparseLt would prune matrix A along k dimension. 
+ # In sparse training, layer weight matrices is viewed sparse matrix A, so + # the math fomula should be 'Act(WX + b)'. However, default fomula in PaddlePaddle + # is 'Act(XW + b)'. For enabling SPMMA, weights and inputs should be transposed + # for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune alog k dimension + # of W^T, which is m dimension of W. Moreove, all mask generating functions in + # sparsity/utils is row-major pruning. That is the reason we have to transpose weight + # matrices beforce invoking create_mask. Then we transpose the result mask to make + # sure its shape to be the same as the input weight. + weight_sparse_mask = sparsity.create_mask( + weight_nparray.T, func_name=func_name, n=n, m=m).T + weight_pruned_nparray = np.multiply(weight_nparray, weight_sparse_mask) + assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \ + 'Pruning {} weight matrix failure!!!'.format(param_name) + return weight_pruned_nparray, weight_sparse_mask + + +# When value of given key in this DICT is None, +# ASP will call default pruning function in pruning stage. +_supported_layers_and_prune_func_map_lock = threading.Lock() +supported_layers_and_prune_func_map = {} + + +def add_supported_layer(layer, pruning_func=None): + r""" + Add supported layers and its corresponding pruning function. + + Args: + name (string|Layer): The name or type of layer, needed to support. If layer is `Layer` then + it would be turn to string internally. ASP would use this name to match parameter's name and call + its the corresponding pruning function. + pruning_func (function, optional): a function type which receives five argument (weight_nparray, + m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight, + m, n, and func_name, please see `prune_model` for details. + """ + name = None + if isinstance(layer, str): + name = layer + elif isinstance(layer, paddle.fluid.dygraph.layers.Layer): + name = paddle.fluid.dygraph.layers._convert_camel_to_snake( + type(layer).__name__) + elif issubclass(layer, paddle.fluid.dygraph.layers.Layer): + name = paddle.fluid.dygraph.layers._convert_camel_to_snake( + layer.__name__) + else: + assert "The type of layer should be string of Layer, but got {}!".format( + type(layer)) + if pruning_func is None: + pruning_func = _default_pruning + _supported_layers_and_prune_func_map_lock.acquire() + supported_layers_and_prune_func_map.update({name: pruning_func}) + _supported_layers_and_prune_func_map_lock.release() + + +add_supported_layer('fc') +add_supported_layer('linear') +add_supported_layer('conv2d') diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py new file mode 100644 index 00000000000..a2b499a9e01 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py @@ -0,0 +1,179 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
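add_supported_layer above takes a layer name (or Layer class) plus an optional pruning function with the (weight_nparray, m, n, func_name, param_name) signature used by _default_pruning, returning the pruned weights and the mask. A minimal sketch of registering a hypothetical custom layer, following the same contract exercised by the unit test below; the layer name "my_custom_fc" and the magnitude-based n:m rule are illustrative assumptions, not part of this patch:

import numpy as np
from paddle.fluid.contrib import sparsity

def keep_largest_n_of_m(weight, m, n, func_name, param_name):
    # Illustrative n:m rule: in every group of m consecutive values (row-major),
    # keep the n largest magnitudes and zero the rest.
    # Assumes weight.size is a multiple of m.
    flat = weight.reshape(-1, m)
    mask = np.zeros_like(flat)
    top = np.argsort(-np.abs(flat), axis=1)[:, :n]
    np.put_along_axis(mask, top, 1.0, axis=1)
    mask = mask.reshape(weight.shape)
    return weight * mask, mask

# "my_custom_fc" is a hypothetical layer name; parameters whose names match it
# will be pruned with the function above when sparsity.prune_model(...) runs.
sparsity.add_supported_layer("my_custom_fc", keep_largest_n_of_m)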
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.contrib import sparsity +from paddle.fluid.contrib.sparsity.supported_layer_list import supported_layers_and_prune_func_map +from paddle.fluid.dygraph.layers import Layer, _convert_camel_to_snake + + +class MyOwnLayer(Layer): + def __init__(self): + super(MyOwnLayer, self).__init__() + + def forward(self, x): + return x + + +static_tensor = None +static_tensor_mask = None + + +def my_own_pruning(tensor, m, n, mask_algo, param_name): + global static_tensor + global static_tensor_mask + if static_tensor is None: + static_tensor = np.random.rand(*tensor.shape).astype(np.float32) + if static_tensor_mask is None: + static_tensor_mask = np.random.rand(*tensor.shape).astype(np.float32) + return static_tensor, static_tensor_mask + + +class TestASPAddSupportedLayer(unittest.TestCase): + def test_add_supported_layer_via_name(self): + sparsity.add_supported_layer("test_supported_1") + sparsity.add_supported_layer("test_supported_2", my_own_pruning) + sparsity.add_supported_layer(MyOwnLayer) + my_own_layer_name = _convert_camel_to_snake(MyOwnLayer.__name__) + + self.assertTrue( + "test_supported_1" in supported_layers_and_prune_func_map) + self.assertTrue( + "test_supported_2" in supported_layers_and_prune_func_map) + self.assertTrue( + "test_supported_2" in supported_layers_and_prune_func_map) + self.assertTrue(supported_layers_and_prune_func_map["test_supported_2"] + == my_own_pruning) + self.assertTrue( + my_own_layer_name in supported_layers_and_prune_func_map) + + +class TestASPStaticCustomerizedPruneFunc(unittest.TestCase): + def setUp(self): + paddle.enable_static() + + self.main_program = fluid.Program() + self.startup_program = fluid.Program() + + self.customer_prefix = "customer_layer" + + def build_model(): + img = fluid.data( + name='img', shape=[None, 3, 32, 32], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + hidden = fluid.layers.conv2d( + input=img, num_filters=4, filter_size=3, padding=2, act="relu") + hidden = fluid.layers.fc(input=hidden, + size=32, + act='relu', + name=self.customer_prefix) + hidden = fluid.layers.fc(input=hidden, + size=32, + act='relu', + name=self.customer_prefix) + hidden = fluid.layers.fc(input=hidden, size=32, act='relu') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + return img, label, prediction + + with fluid.program_guard(self.main_program, self.startup_program): + self.img, self.label, self.predict = build_model() + self.supported_layer_count_ref = 5 + + self.place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + self.place = paddle.CUDAPlace(0) + self.exe = fluid.Executor(self.place) + + sparsity.add_supported_layer(self.customer_prefix, my_own_pruning) + + def test_inference_pruning(self): + self.exe.run(self.startup_program) + + sparsity.prune_model( + self.main_program, mask_algo="mask_1d", with_mask=False) + + supported_layer_count = 0 + for param in self.main_program.global_block().all_parameters(): + mat = np.array(fluid.global_scope().find_var(param.name).get_tensor( + )) + if sparsity.asp.ASPHelper._is_supported_layer(self.main_program, + param.name): + supported_layer_count += 1 + if (self.customer_prefix in param.name): + self.assertLessEqual( + np.sum(mat.flatten() - static_tensor.flatten()), 
1e-4) + else: + self.assertTrue( + sparsity.check_sparsity( + mat.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertEqual(supported_layer_count, self.supported_layer_count_ref) + + def test_training_pruning(self): + with fluid.program_guard(self.main_program, self.startup_program): + loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=self.predict, label=self.label)) + optimizer = sparsity.decorate( + fluid.optimizer.SGD(learning_rate=0.01)) + optimizer.minimize(loss, self.startup_program) + + self.exe.run(self.startup_program) + + sparsity.prune_model( + self.main_program, mask_algo="mask_1d", with_mask=True) + + supported_layer_count = 0 + for param in self.main_program.global_block().all_parameters(): + mat = np.array(fluid.global_scope().find_var(param.name).get_tensor( + )) + if sparsity.asp.ASPHelper._is_supported_layer(self.main_program, + param.name): + mat_mask = np.array(fluid.global_scope().find_var( + sparsity.asp.ASPHelper._get_mask_name(param.name)) + .get_tensor()) + supported_layer_count += 1 + if (self.customer_prefix in param.name): + self.assertLessEqual( + np.sum(mat.flatten() - static_tensor.flatten()), 1e-4) + self.assertLessEqual( + np.sum(mat_mask.flatten() - static_tensor_mask.flatten( + )), 1e-4) + else: + self.assertTrue( + sparsity.check_sparsity( + mat.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertTrue( + sparsity.check_sparsity( + mat_mask.T, + func_name=sparsity.CheckMethod.CHECK_1D, + n=2, + m=4)) + self.assertEqual(supported_layer_count, self.supported_layer_count_ref) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index d2bffbe074f..0ae005430e0 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -213,9 +213,9 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): set(parameters), set([ 'fc_2.b_0', 'num_good_steps_0', 'fc_2.w_0', 'loss_scaling_0', - 'num_bad_steps_0', 'fc_2.w_0_velocity_0', 'fc_2.w_0_asp_mask', - 'learning_rate_0', 'fc_1.b_0', 'fc_1.w_0_asp_mask', - 'fc_0.w_0_asp_mask', 'fc_1.b_0_velocity_0', + 'num_bad_steps_0', 'fc_2.w_0_velocity_0', 'fc_2.w_0.asp_mask', + 'learning_rate_0', 'fc_1.b_0', 'fc_1.w_0.asp_mask', + 'fc_0.w_0.asp_mask', 'fc_1.b_0_velocity_0', 'fc_2.b_0_velocity_0' ])) self.assertEqual(ops, [ -- GitLab From 603f84255cdb023d12b22d946fc5ecb09ba6865b Mon Sep 17 00:00:00 2001 From: Liu-xiandong <85323580+Liu-xiandong@users.noreply.github.com> Date: Wed, 16 Mar 2022 15:51:26 +0800 Subject: [PATCH 099/176] [KP]fix bug that cannot fallback to CPU normally in XPU KP (#40576) * [kp]fix bug that cannot fallback to CPU normally in XPU KP * fix bug in static graph --- paddle/fluid/framework/operator.cc | 44 +++++++++++++----- paddle/fluid/imperative/prepared_operator.cc | 49 +++++++++++--------- 2 files changed, 60 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f23a266ef03..ad01adf1a25 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1456,7 +1456,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { kernel_iter = kernels.find(expected_kernel_key); } #endif -#ifdef PADDLE_WITH_XPU + +#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if 
(platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || !paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || @@ -1470,17 +1471,36 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { #endif #ifdef PADDLE_WITH_XPU_KP - bool use_xpu_kp_kernel_rt = - FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key); - bool use_xpu_kp_kernel_debug = - paddle::platform::is_in_xpu_kpwhite_list(type_); - if (platform::is_xpu_place(expected_kernel_key.place_) && - (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) { - expected_kernel_key.library_type_ = LibraryType::kKP; - kernel_iter = kernels.find(expected_kernel_key); - VLOG(3) << "using XPU KP kernel: " << type_ - << ", using_kernel_key:" << expected_kernel_key; + if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(type_, expected_kernel_key); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(type_); + if (use_xpu_kp_kernel_rt) { + VLOG(3) << "xpu_kp using rt mode "; + } + if (use_xpu_kp_kernel_debug) { + VLOG(3) << "xpu_kp using debug mode "; + } + bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); + if (is_xpu_kp_support) { + expected_kernel_key.library_type_ = LibraryType::kKP; + kernel_iter = kernels.find(expected_kernel_key); + VLOG(3) << "using XPU KP kernel: " << type_ + << ", using_kernel_key:" << expected_kernel_key; + } + bool is_xpu_unsupport = + (!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) || + paddle::platform::is_in_xpu_black_list(type_)); + if (!is_xpu_kp_support && + (kernel_iter == kernels.end() || is_xpu_unsupport)) { + VLOG(3) << "missing XPU kernel: " << type_ + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } } #endif diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index bae49fb381a..a427b9b8199 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -234,7 +234,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, auto& kernels = kernels_iter->second; auto kernel_iter = kernels.find(expected_kernel_key); -#ifdef PADDLE_WITH_XPU +#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && (kernel_iter == kernels.end() || is_xpu_unsupport)) { VLOG(3) << "missing XPU kernel: " << op.Type() @@ -243,29 +243,36 @@ PreparedOp PrepareImpl(const NameVarMap& ins, expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } - #endif #ifdef PADDLE_WITH_XPU_KP - expected_kernel_key.place_ = platform::XPUPlace(); - bool use_xpu_kp_kernel_rt = - FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); - bool use_xpu_kp_kernel_debug = - paddle::platform::is_in_xpu_kpwhite_list(op.Type()); - if (use_xpu_kp_kernel_rt) { - VLOG(3) << "xpu_kp using rt mode "; - } - if (use_xpu_kp_kernel_debug) { - VLOG(3) << "xpu_kp using debug mode "; - } - if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && - (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug)) { - expected_kernel_key.place_ = platform::XPUPlace(); - expected_kernel_key.library_type_ = 
paddle::framework::LibraryType::kKP; - kernel_iter = kernels.find(expected_kernel_key); - VLOG(3) << "using XPU KP kernel: " << op.Type() - << ", using_kernel_key:" << expected_kernel_key; + if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(op.Type()); + if (use_xpu_kp_kernel_rt) { + VLOG(3) << "xpu_kp using rt mode "; + } + if (use_xpu_kp_kernel_debug) { + VLOG(3) << "xpu_kp using debug mode "; + } + bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); + if (is_xpu_kp_support) { + expected_kernel_key.library_type_ = paddle::framework::LibraryType::kKP; + kernel_iter = kernels.find(expected_kernel_key); + VLOG(3) << "using XPU KP kernel: " << op.Type() + << ", using_kernel_key:" << expected_kernel_key; + } + if (!is_xpu_kp_support && + (kernel_iter == kernels.end() || is_xpu_unsupport)) { + VLOG(3) << "missing XPU kernel: " << op.Type() + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } } #endif -- GitLab From 57f54d3b6c95dc9c65a8318685df17d4039530a9 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Wed, 16 Mar 2022 16:04:13 +0800 Subject: [PATCH 100/176] move activation kernel (#40565) --- paddle/fluid/operators/activation_op.cc | 23 +- paddle/fluid/operators/activation_op.h | 258 ++--------- paddle/fluid/operators/activation_op.kps | 260 +---------- paddle/phi/kernels/activation_grad_kernel.h | 70 ++- paddle/phi/kernels/activation_kernel.h | 42 +- .../phi/kernels/cpu/activation_grad_kernel.cc | 183 +++++--- paddle/phi/kernels/cpu/activation_kernel.cc | 137 +++--- paddle/phi/kernels/funcs/activation_functor.h | 435 ++++++++++++++++++ .../phi/kernels/gpu/activation_grad_kernel.cu | 80 +++- paddle/phi/kernels/gpu/activation_kernel.cu | 51 +- .../phi/kernels/impl/activation_grad_impl.h | 20 + paddle/phi/ops/compat/activation_sig.cc | 100 ++-- 12 files changed, 919 insertions(+), 740 deletions(-) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 4205f2253a6..c835cf8ea14 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1485,6 +1485,13 @@ REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor); REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor, ThresholdedReluGradFunctor); +REGISTER_ACTIVATION_OP(hard_shrink, HardShrink, HardShrinkFunctor, + HardShrinkGradFunctor); +REGISTER_ACTIVATION_OP(softshrink, SoftShrink, SoftShrinkFunctor, + SoftShrinkGradFunctor); +REGISTER_ACTIVATION_OP(tanh_shrink, TanhShrink, TanhShrinkFunctor, + TanhShrinkGradFunctor); +REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor); /* ========================== sigmoid register ============================= */ @@ -1626,22 +1633,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL(elu, - ops::ActivationKernel>, - ops::ActivationKernel>); -REGISTER_OP_CPU_KERNEL( - elu_grad, ops::ELUGradKernel, - ops::ELUGradKernel); -REGISTER_OP_CPU_KERNEL( - elu_grad_grad, ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>, - 
ops::ELUDoubleGradKernel>); - /* ========================================================================== */ /* ======================== logit register ============================ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index b076db01c22..4f197b95b21 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -279,6 +279,15 @@ USE_PHI_FUNCTOR(BRelu) USE_PHI_FUNCTOR(ThresholdedRelu) USE_PHI_FUNCTOR(LeakyRelu) USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu) +USE_PHI_FUNCTOR(HardShrink) +USE_PHI_FUNCTOR(SoftShrink) +USE_PHI_FUNCTOR(TanhShrink) +USE_PHI_FUNCTOR(Silu) +USE_PHI_FUNCTOR(ELU) +USE_PHI_DOUBLE_GRAD_FUNCTOR(ELU) + +template +using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor; template struct SigmoidGradFunctor : public BaseActivationFunctor { @@ -392,31 +401,6 @@ struct SigmoidTripleGradFunctor : public BaseActivationFunctor { } }; -// silu(x) = x / (1 + exp(-x)) -template -struct SiluFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - auto temp = static_cast(1) / (static_cast(1) + (-x).exp()); - out.device(d) = x * temp; - } -}; - -// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})) -template -struct SiluGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = static_cast(1) + (-x).exp(); // 1+e^(-x) - auto temp2 = x * (-x).exp(); // x*e^(-x) - dx.device(d) = dout * ((static_cast(1) / temp1) * - (static_cast(1) + (temp2 / temp1))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - // Originally: logsigmoid(x) = -log (1 + exp(-x)) // For numerical stability, we can use the log-sum-exp trick: // https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ @@ -512,99 +496,6 @@ using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor; template using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor; -// tanhshrink(x) = x - tanh(x) -// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) -template -struct TanhShrinkFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x - x.tanh(); - } -}; - -template -struct TanhShrinkGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (x.tanh() * x.tanh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -// tanhshrink(x) = x - tanh(x) -// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) -template -struct HardShrinkFunctor : public BaseActivationFunctor { - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - template - void operator()(Device d, X x, Out out) const { - auto temp1 = x < static_cast(threshold * -1.f); - auto temp2 = x > static_cast(threshold); - out.device(d) = x * (temp1 || temp2).template cast(); - } -}; - -template -struct HardShrinkGradFunctor : public BaseActivationFunctor { - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = x < static_cast(threshold * -1.f); - auto temp2 = x > static_cast(threshold); - dx.device(d) = dout * (temp1 || temp2).template cast(); - } - - static constexpr ActBwdOpFwdDeps 
FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 -// otherwise -template -struct SoftShrinkFunctor : public BaseActivationFunctor { - float lambda; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - - template - void operator()(Device d, X x, Out out) const { - auto lambdaT = static_cast(lambda); - auto temp1 = (x > lambdaT).template cast(); - auto temp2 = (x < -lambdaT).template cast(); - out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT); - } -}; - -template -struct SoftShrinkGradFunctor : public BaseActivationFunctor { - float lambda; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto lambdaT = static_cast(lambda); - auto temp1 = (x > lambdaT).template cast(); - auto temp2 = (x < -lambdaT).template cast(); - dx.device(d) = dout * (temp1 + temp2).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - // sqrt(x) = x^(1/2) template struct SqrtFunctor : public BaseActivationFunctor { @@ -1036,59 +927,6 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { } }; -template -struct ELUFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - template - void operator()(Device d, X x, Out out) const { - out.device(d) = - (x < static_cast(0)) - .select(static_cast(alpha) * (x.exp() - static_cast(1)), x); - } -}; - -template -struct ELUGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - // case 1: alpha >= 0 - // dx = dout, if out > 0 - // dx = dout * (out + alpha), if out <= 0 - dx.device(d) = (out > static_cast(0)) - .select(dout, dout * (out + static_cast(alpha))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - // case 2: alpha < 0 - // dx = dout, if x > 0 - // dx = dout * (out + alpha), if x <=0 - dx.device(d) = (x > static_cast(0)) - .select(dout, dout * static_cast(alpha) * x.exp()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template class ELUGradKernel : public framework::OpKernel { public: @@ -1354,44 +1192,6 @@ struct AbsGradGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct ELUGradGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* ddX, framework::Tensor* ddOut, - const framework::Tensor* dOut, framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad")); - auto x = framework::EigenVector::Flatten( - GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad")); - - if (dX) { 
- auto dx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Output", "DOut", "ELUGradGrad")); - dx.device(*d) = ddx * dout * static_cast(alpha) * x.exp() * - (x <= static_cast(0)).template cast(); - } - - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad")); - ddout.device(*d) = ddx * - ((x > static_cast(0)).template cast() + - static_cast(alpha) * x.exp() * - (x <= static_cast(0)).template cast()) - .template cast(); - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CELUGradGradFunctor : public BaseActivationFunctor { float alpha; @@ -2151,26 +1951,22 @@ struct LogGradGradFunctor : public BaseActivationFunctor { } // namespace operators } // namespace paddle -#define FOR_EACH_ACTIVATION_OP(__macro) \ - __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ - __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ - __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ - __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ - __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ - __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ - __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ - __macro(log2, Log2, Log2Functor, Log2GradFunctor); \ - __macro(log10, Log10, Log10Functor, Log10GradFunctor); \ - __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ - __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ - __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ - __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \ - __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); \ - __macro(tanh_shrink, TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \ - __macro(hard_shrink, HardShrink, HardShrinkFunctor, HardShrinkGradFunctor); \ - __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \ - HardSigmoidGradFunctor); \ - __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ - __macro(mish, Mish, MishFunctor, MishGradFunctor); \ +#define FOR_EACH_ACTIVATION_OP(__macro) \ + __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ + __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ + __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ + __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ + __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ + __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ + __macro(log2, Log2, Log2Functor, Log2GradFunctor); \ + __macro(log10, Log10, Log10Functor, Log10GradFunctor); \ + __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ + __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ + __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ + __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \ + __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); \ + __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \ + HardSigmoidGradFunctor); \ + __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ + __macro(mish, Mish, MishFunctor, MishGradFunctor); \ __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor); diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 
256f20db084..22613cbe2a2 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -44,35 +44,6 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor { } }; -template -struct CudaSiluFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // silu(x) = x / (1 + exp(-x)) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(x / (one + exp(-x))); - } -}; - -template -struct CudaSiluGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - MPType temp = one / (one + exp(-x)); - return static_cast(dout * (temp * (one + x * (one - temp)))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaLogSigmoidFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -110,43 +81,6 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaSoftShrinkFunctor : public BaseActivationFunctor { - float lambda; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - - // softshrink(x) = x - lambda, if x > lambda; - // x + lambda, if x < -lambda; - // 0, otherwise. - __device__ __forceinline__ T operator()(const T x) const { - T l = static_cast(lambda); - T temp1 = static_cast(x > l); - T temp2 = static_cast(x < -l); - return temp1 * (x - l) + temp2 * (x + l); - } -}; - -template -struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float lambda; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"lambda", &lambda}}; - } - - // dx = dout, if x > lambda or x < -lambda else 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - T l = static_cast(lambda); - return (x >= -l && x <= l) ? 
zero : dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaCeilFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -615,66 +549,6 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { } }; -template -struct CudaTanhShrinkFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tanhshrink(x) = x - tanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(x - tanh(x)); - } -}; - -template -struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * tanh(x)^2 - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * tanh(x) * tanh(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -struct CudaHardShrinkFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x - __device__ __forceinline__ T operator()(const T x) const { - T t = static_cast(threshold); - return (x > -t && x < t) ? zero : x; - } -}; - -template -struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // dx = (x > -threshold && x < threshold) ? 0 : dout - __device__ __forceinline__ T operator()(const T dout, const T x) const { - T t = static_cast(threshold); - return (x > -t && x < t) ? zero : dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaHardSigmoidFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); @@ -863,110 +737,6 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct CudaELUFunctor : public BaseActivationFunctor { - using CT = typename details::MPTypeTrait::Type; - CT zero = static_cast(0.0f); - CT one = static_cast(1.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // elu(x) = x, if x > 0 - // elu(x) = alpha * (e^x - 1), if x <= 0 - __device__ __forceinline__ T operator()(const T arg_x) const { - CT x = static_cast(arg_x); - CT temp = static_cast(alpha) * (exp(x) - one); - CT res = x > zero ? 
x : temp; - return static_cast(res); - } -}; - -template -struct CudaELUGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // case 1: alpha >= 0 - // dx = dout, if out > 0 - // dx = dout * (out + alpha), if out <= 0 - __device__ __forceinline__ T operator()(T arg_dout, T arg_out) const { - MPType dout = static_cast(arg_dout); - MPType out = static_cast(arg_out); - MPType a = static_cast(alpha); - MPType out_pos = static_cast(out > zero); - MPType out_neg = static_cast(out <= zero); - return static_cast(dout * (out_pos + out_neg * (out + a))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // case 2: alpha < 0 - // dx = dout, if x > 0 - // dx = dout * (out + alpha), if x <=0 - __device__ __forceinline__ T operator()(const T arg_dout, const T arg_out, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType out = static_cast(arg_out); - MPType x = static_cast(arg_x); - MPType a = static_cast(alpha); - MPType x_pos = static_cast(x > zero); - MPType x_neg = static_cast(x <= zero); - return static_cast(dout * (x_pos + x_neg * (out + a))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - -template -class ELUGradCudaKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); - auto* x = ctx.Input("X"); - auto* d_x = ctx.Output(framework::GradVarName("X")); - d_x->mutable_data(ctx.GetPlace()); - const float alpha = ctx.Attr("alpha"); - - auto& dev_ctx = ctx.device_context(); - std::vector ins = {d_out, out}; - std::vector outs = {d_x}; - if (alpha > 0) { - CudaELUGradFunctor functor; - functor.alpha = alpha; - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } else { - CudaELUGradNegativeAlphaFunctor functor; - functor.alpha = alpha; - ins.push_back(x); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } - } -}; - template struct CudaCELUFunctor : public BaseActivationFunctor { using CT = typename details::MPTypeTrait::Type; @@ -1099,6 +869,15 @@ USE_PHI_FUNCTOR(CudaTanh) USE_PHI_FUNCTOR(CudaBRelu) USE_PHI_FUNCTOR(CudaLeakyRelu) USE_PHI_FUNCTOR(CudaThresholdedRelu) +USE_PHI_FUNCTOR(CudaHardShrink) +USE_PHI_FUNCTOR(CudaSoftShrink) +USE_PHI_FUNCTOR(CudaTanhShrink) +USE_PHI_FUNCTOR(CudaSilu) +USE_PHI_FUNCTOR(CudaELU) + +template +using CudaELUGradNegativeAlphaFunctor = + phi::funcs::CudaELUGradNegativeAlphaFunctor; } // namespace operators } // namespace paddle @@ -1158,26 +937,6 @@ namespace plat = paddle::platform; ops::ActivationGradCudaKernel>); -/* ======================== elu register ============================ */ -REGISTER_OP_CUDA_KERNEL( - elu, ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - elu_grad, ops::ELUGradCudaKernel, - ops::ELUGradCudaKernel, - ops::ELUGradCudaKernel); - -REGISTER_OP_CUDA_KERNEL( - elu_grad_grad, 
ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>, - ops::ELUDoubleGradKernel>); /* ========================================================================== */ /* ======================== celu register ============================ */ @@ -1359,7 +1118,6 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ #define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ - __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \ CudaLogSigmoidGradFunctor); \ __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \ diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index a5b737b28c2..e0dfca756e1 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -26,6 +26,23 @@ namespace phi { const DenseTensor& dout, \ DenseTensor* dx); +#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(name, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx); + +#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(name, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx); + #define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \ template \ void name##GradKernel(const Context& dev_ctx, \ @@ -33,6 +50,14 @@ namespace phi { const DenseTensor& dout, \ DenseTensor* dx); +#define DECLARE_ACTIVATION_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut(name, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx); + template void ReluDoubleGradKernel(const Context& dev_ctx, const DenseTensor& out, @@ -58,21 +83,6 @@ void TanhTripleGradKernel(const Context& dev_ctx, DenseTensor* d_dout, DenseTensor* d_ddx); -template -void BReluGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& dout, - float t_min, - float t_max, - DenseTensor* dx); - -template -void LeakyReluGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& dout, - float alpha, - DenseTensor* dx); - template void LeakyReluDoubleGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -81,11 +91,21 @@ void LeakyReluDoubleGradKernel(const Context& dev_ctx, DenseTensor* ddout); template -void ThresholdedReluGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& dout, - float threshold, - DenseTensor* dx); +void EluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + DenseTensor* dx); + +template +void EluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + const DenseTensor& ddx, + float alpha, + DenseTensor* dx, + DenseTensor* ddout); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan); @@ -98,7 +118,17 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh); DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(TanhShrink); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Silu); + DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu); DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, 
alpha) + DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, threshold) + DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(SoftShrink, lambda) + DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(HardShrink, threshold) + + DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, t_min, t_max) + } // namespace phi diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h index 885dccad8e3..0762ce43ff8 100644 --- a/paddle/phi/kernels/activation_kernel.h +++ b/paddle/phi/kernels/activation_kernel.h @@ -24,6 +24,21 @@ namespace phi { void name##Kernel( \ const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); +#define DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(name, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr, \ + DenseTensor* out); + +#define DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(name, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr1, \ + float attr2, \ + DenseTensor* out); + DECLARE_ACTIVATION_KERNEL(Cos) DECLARE_ACTIVATION_KERNEL(Tan) DECLARE_ACTIVATION_KERNEL(Acos) @@ -37,24 +52,15 @@ DECLARE_ACTIVATION_KERNEL(Acosh) DECLARE_ACTIVATION_KERNEL(Atanh) DECLARE_ACTIVATION_KERNEL(Relu) DECLARE_ACTIVATION_KERNEL(Tanh) +DECLARE_ACTIVATION_KERNEL(TanhShrink) +DECLARE_ACTIVATION_KERNEL(Silu) + +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha) -template -void BReluKernel(const Context& dev_ctx, - const DenseTensor& x, - float t_min, - float t_max, - DenseTensor* out); - -template -void LeakyReluKernel(const Context& dev_ctx, - const DenseTensor& x, - float alpha, - DenseTensor* out); - -template -void ThresholdedReluKernel(const Context& dev_ctx, - const DenseTensor& x, - float threshold, - DenseTensor* out); +DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max) } // namespace phi diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index f9af50f6832..11b396a84d0 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -21,101 +21,140 @@ limitations under the License. 
*/ namespace phi { -#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \ +#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradImpl>( \ + funcs::functor_class functor; \ + ActivationGradImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \ - name, functor_class, attr) \ - template \ - void name##GradKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& dout, \ - float attr, \ - DenseTensor* dx) { \ - functor_class functor; \ - auto attrs = functor.GetAttrs(); \ - *(attrs[0].second) = attr; \ - ActivationGradImpl>( \ - dev_ctx, &x, nullptr, &dout, dx, functor); \ +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \ - name, functor_class, attr1, attr2) \ - template \ - void name##GradKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& dout, \ - float attr1, \ - float attr2, \ - DenseTensor* dx) { \ - functor_class functor; \ - auto attrs = functor.GetAttrs(); \ - *(attrs[0].second) = attr1; \ - *(attrs[1].second) = attr2; \ - ActivationGradImpl>( \ - dev_ctx, &x, nullptr, &dout, dx, functor); \ +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ +#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ const DenseTensor& dout, \ DenseTensor* dx) { \ - functor_class functor; \ - ActivationGradImpl>( \ + funcs::functor_class functor; \ + ActivationGradImpl>( \ dev_ctx, nullptr, &out, &dout, dx, functor); \ } -#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \ - name, functor_class, attr) \ - template \ - void name##GradKernel(const Context& dev_ctx, \ - const DenseTensor& out, \ - const DenseTensor& dout, \ - float attr, \ - DenseTensor* dx) { \ - functor_class functor; \ - auto attrs = functor.GetAttrs(); \ - *(attrs[0].second) = attr; \ - ActivationGradImpl>( \ - dev_ctx, nullptr, &out, &dout, dx, functor); \ +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ } 
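For context, the DEFINE_*_WITH_ONE_ATTRS macros above depend on the functor publishing name-to-pointer attribute pairs and on the generated kernel writing the runtime attribute through that pointer. The following standalone sketch shows the mechanism with deliberately simplified types; MiniSoftShrink and the concrete values are illustrative only, not code from this patch.

// Minimal sketch of the GetAttrs() / "*(attrs[0].second) = attr" mechanism,
// with simplified (non-templated) types. Illustrative only; the real functors
// are templated on T and operate on Eigen expressions.
#include <cmath>
#include <cstdio>
#include <initializer_list>
#include <string>
#include <utility>
#include <vector>

struct MiniSoftShrink {
  float lambda = 0.0f;

  // Mirrors the BaseActivationFunctor idea: attribute names mapped to member pointers.
  std::vector<std::pair<std::string, float*>> GetAttrs() {
    return {{"lambda", &lambda}};
  }

  float operator()(float x) const {
    if (x > lambda) return x - lambda;
    if (x < -lambda) return x + lambda;
    return 0.0f;
  }
};

int main() {
  MiniSoftShrink functor;
  auto attrs = functor.GetAttrs();
  *(attrs[0].second) = 0.5f;  // what the macro-generated kernels do with the attr argument
  for (float x : {-1.0f, -0.3f, 0.2f, 1.5f}) {
    std::printf("softshrink(%.1f) = %.2f\n", x, functor(x));
  }
  return 0;
}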
-DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor); - -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor); -DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, funcs::TanhGradFunctor); - -DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, - funcs::LeakyReluGradFunctor, +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CosGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, TanGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, AcosGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Sin, SinGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Asin, AsinGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atan, AtanGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Sinh, SinhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cosh, CoshGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Asinh, AsinhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, AcoshGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor); + +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor); + +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, + LeakyReluGradFunctor, alpha); -DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( - ThresholdedRelu, funcs::ThresholdedReluGradFunctor, threshold); - -DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, - funcs::BReluGradFunctor, +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, + ThresholdedReluGradFunctor, + threshold); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, + SoftShrinkGradFunctor, + lambda); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, + HardShrinkGradFunctor, + threshold); + +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, + BReluGradFunctor, t_min, t_max); +template +void EluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + DenseTensor* dx) { + dev_ctx.template Alloc(dx); + + auto x_flatten = + EigenVector::Flatten(GET_DATA_SAFELY(&x, "Input", "X", "elu_grad")); + auto out_flatten = EigenVector::Flatten( + GET_DATA_SAFELY(&out, "Input", "Out", "elu_grad")); + auto dout_flatten = EigenVector::Flatten( + GET_DATA_SAFELY(&dout, "Input", "dOut", "elu_grad")); + auto dx_flatten = + EigenVector::Flatten(GET_DATA_SAFELY(dx, "Output", "dX", "elu_grad")); + auto* place = dev_ctx.eigen_device(); + + if (alpha > 0) { + funcs::ELUGradFunctor functor; + functor.alpha = alpha; + functor(*place, x_flatten, out_flatten, dout_flatten, dx_flatten); + } else { + funcs::ELUGradNegativeAlphaFunctor functor; + functor.alpha = alpha; + functor(*place, 
x_flatten, out_flatten, dout_flatten, dx_flatten); + } +} + } // namespace phi PD_REGISTER_KERNEL( @@ -144,6 +183,11 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(brelu_grad, BReluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, ThresholdedReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel) PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(relu_double_grad, ReluDoubleGradKernel) @@ -151,6 +195,7 @@ PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(tanh_double_grad, TanhDoubleGradKernel) PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(leaky_relu_double_grad, LeakyReluDoubleGradKernel) +PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel) PD_REGISTER_KERNEL(tanh_triple_grad, CPU, diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index 0d13429c8f6..59ce18a11cc 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -19,78 +19,93 @@ limitations under the License. */ namespace phi { -#define DEFINE_CPU_ACTIVATION_KERNEL(name, functor_class) \ - template \ - void name##Kernel( \ - const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ - functor_class functor; \ - ActivationImpl(dev_ctx, x, out, functor); \ +#define DEFINE_CPU_ACTIVATION_KERNEL(name, functor_class) \ + template \ + void name##Kernel( \ + const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ + funcs::functor_class functor; \ + ActivationImpl>( \ + dev_ctx, x, out, functor); \ } -#define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ - template \ - void name##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - float attr, \ - DenseTensor* out) { \ - functor_class functor; \ - auto attrs = functor.GetAttrs(); \ - *(attrs[0].second) = attr; \ - ActivationImpl>(dev_ctx, x, out, functor); \ +#define DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationImpl>( \ + dev_ctx, x, out, functor); \ } -#define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS( \ - name, functor_class, attr1, attr2) \ - template \ - void name##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - float attr1, \ - float attr2, \ - DenseTensor* out) { \ - functor_class functor; \ - auto attrs = functor.GetAttrs(); \ - *(attrs[0].second) = attr1; \ - *(attrs[1].second) = attr2; \ - ActivationImpl>(dev_ctx, x, out, functor); \ +#define DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS( \ + name, functor_class, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + float attr1, \ + float attr2, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationImpl>( \ + dev_ctx, x, out, functor); \ } -DEFINE_CPU_ACTIVATION_KERNEL(Sin, funcs::SinFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Cos, funcs::CosFunctor) 
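The alpha branch in the EluGradKernel above exists because, when alpha is negative, elu maps non-positive inputs to non-negative outputs, so the sign of out no longer identifies which branch produced it and the gradient must be recomputed from x. A standalone numeric sketch of that distinction (double precision, dout fixed to 1, illustrative only, not code from the patch):

// Why elu_grad needs a separate negative-alpha path. With alpha < 0,
// out = alpha * (e^x - 1) is >= 0 even for x <= 0, so a gradient derived
// from `out` alone picks the wrong branch; the correct value there is
// dout * alpha * e^x, computed from x.
#include <cmath>
#include <cstdio>
#include <initializer_list>

static double elu(double x, double alpha) {
  return x > 0.0 ? x : alpha * (std::exp(x) - 1.0);
}

// Gradient in the style of ELUGradFunctor (valid for alpha >= 0).
static double elu_grad_from_out(double out, double dout, double alpha) {
  return out > 0.0 ? dout : dout * (out + alpha);
}

// Gradient in the style of ELUGradNegativeAlphaFunctor (needs x).
static double elu_grad_from_x(double x, double dout, double alpha) {
  return x > 0.0 ? dout : dout * alpha * std::exp(x);
}

int main() {
  const double dout = 1.0;
  for (double alpha : {1.0, -0.5}) {
    for (double x : {-2.0, -0.5, 0.5}) {
      double out = elu(x, alpha);
      std::printf("alpha=%5.2f x=%5.2f out=%8.4f grad(out)=%8.4f grad(x)=%8.4f\n",
                  alpha, x, out, elu_grad_from_out(out, dout, alpha),
                  elu_grad_from_x(x, dout, alpha));
    }
  }
  return 0;
}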
-DEFINE_CPU_ACTIVATION_KERNEL(Tan, funcs::TanFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Asin, funcs::AsinFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Atan, funcs::AtanFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Acos, funcs::AcosFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Sinh, funcs::SinhFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Cosh, funcs::CoshFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Asinh, funcs::AsinhFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Acosh, funcs::AcoshFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Atanh, funcs::AtanhFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Relu, funcs::ReluCPUFunctor) -DEFINE_CPU_ACTIVATION_KERNEL(Tanh, funcs::TanhFunctor) -DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, funcs::LeakyReluFunctor, alpha) +DEFINE_CPU_ACTIVATION_KERNEL(Sin, SinFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Cos, CosFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Tan, TanFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Asin, AsinFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Atan, AtanFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Acos, AcosFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Sinh, SinhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Cosh, CoshFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Asinh, AsinhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Acosh, AcoshFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Atanh, AtanhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Relu, ReluCPUFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Tanh, TanhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor) + +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, - funcs::ThresholdedReluFunctor, + ThresholdedReluFunctor, threshold) -DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, funcs::BReluFunctor, t_min, t_max) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, HardShrinkFunctor, threshold) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda) +DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha) + +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max) } // namespace phi PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {} #define PD_REGISTER_ACTIVATION_KERNEL(name, func) \ - PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func##Kernel, float, double) {} + PD_REGISTER_KERNEL(name, CPU, ALL_LAYOUT, phi::func, float, double) {} -PD_REGISTER_ACTIVATION_KERNEL(sin, Sin) -PD_REGISTER_ACTIVATION_KERNEL(cos, Cos) -PD_REGISTER_ACTIVATION_KERNEL(tan, Tan) -PD_REGISTER_ACTIVATION_KERNEL(acos, Acos) -PD_REGISTER_ACTIVATION_KERNEL(asin, Asin) -PD_REGISTER_ACTIVATION_KERNEL(atan, Atan) -PD_REGISTER_ACTIVATION_KERNEL(sinh, Sinh) -PD_REGISTER_ACTIVATION_KERNEL(cosh, Cosh) -PD_REGISTER_ACTIVATION_KERNEL(asinh, Asinh) -PD_REGISTER_ACTIVATION_KERNEL(acosh, Acosh) -PD_REGISTER_ACTIVATION_KERNEL(atanh, Atanh) -PD_REGISTER_ACTIVATION_KERNEL(tanh, Tanh) -PD_REGISTER_ACTIVATION_KERNEL(brelu, BRelu) -PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyRelu) -PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedRelu) +PD_REGISTER_ACTIVATION_KERNEL(sin, SinKernel) +PD_REGISTER_ACTIVATION_KERNEL(cos, CosKernel) +PD_REGISTER_ACTIVATION_KERNEL(tan, TanKernel) +PD_REGISTER_ACTIVATION_KERNEL(acos, AcosKernel) +PD_REGISTER_ACTIVATION_KERNEL(asin, AsinKernel) +PD_REGISTER_ACTIVATION_KERNEL(atan, AtanKernel) +PD_REGISTER_ACTIVATION_KERNEL(sinh, SinhKernel) +PD_REGISTER_ACTIVATION_KERNEL(cosh, CoshKernel) +PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel) +PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel) +PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel) 
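For reference, the shrink-family and silu forward definitions instantiated here reduce to the scalar formulas below. This is a standalone transcription for illustration, with arbitrary threshold/lambda values, not the templated Eigen code itself.

// Plain scalar transcriptions of the formulas implemented by the functors above.
// Standalone illustration only; the kernels themselves work on Eigen/CUDA tensors.
#include <cmath>
#include <cstdio>
#include <initializer_list>

double hard_shrink(double x, double threshold) {  // keep x outside (-threshold, threshold), else 0
  return (x > -threshold && x < threshold) ? 0.0 : x;
}

double soft_shrink(double x, double lambda) {     // shrink magnitude by lambda, small values go to 0
  if (x > lambda) return x - lambda;
  if (x < -lambda) return x + lambda;
  return 0.0;
}

double tanh_shrink(double x) { return x - std::tanh(x); }

double silu(double x) { return x / (1.0 + std::exp(-x)); }

int main() {
  for (double x : {-2.0, -0.4, 0.0, 0.4, 2.0}) {
    std::printf("x=%5.1f hard=%7.3f soft=%7.3f tanhshrink=%7.3f silu=%7.3f\n",
                x, hard_shrink(x, 0.5), soft_shrink(x, 0.5), tanh_shrink(x), silu(x));
  }
  return 0;
}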
+PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel) +PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel) +PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 7fe513a24ba..663258fa560 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -29,11 +29,13 @@ #include #include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" #ifdef PADDLE_WITH_XPU_KP #define __forceinline__ __inline__ @@ -780,6 +782,236 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +// tanhshrink(x) = x - tanh(x) +// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct TanhShrinkFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x - x.tanh(); + } +}; + +template +struct TanhShrinkGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (x.tanh() * x.tanh()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +// tanhshrink(x) = x - tanh(x) +// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct HardShrinkFunctor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + template + void operator()(Device d, X x, Out out) const { + auto temp1 = x < static_cast(threshold * -1.f); + auto temp2 = x > static_cast(threshold); + out.device(d) = x * (temp1 || temp2).template cast(); + } +}; + +template +struct HardShrinkGradFunctor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = x < static_cast(threshold * -1.f); + auto temp2 = x > static_cast(threshold); + dx.device(d) = dout * (temp1 || temp2).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 +// otherwise +template +struct SoftShrinkFunctor : public BaseActivationFunctor { + float lambda; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto lambdaT = static_cast(lambda); + auto temp1 = (x > lambdaT).template cast(); + auto temp2 = (x < -lambdaT).template cast(); + out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT); + } +}; + +template +struct SoftShrinkGradFunctor : public 
BaseActivationFunctor { + float lambda; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto lambdaT = static_cast(lambda); + auto temp1 = (x > lambdaT).template cast(); + auto temp2 = (x < -lambdaT).template cast(); + dx.device(d) = dout * (temp1 + temp2).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct ELUFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + (x < static_cast(0)) + .select(static_cast(alpha) * (x.exp() - static_cast(1)), x); + } +}; + +template +struct ELUGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + // case 1: alpha >= 0 + // dx = dout, if out > 0 + // dx = dout * (out + alpha), if out <= 0 + dx.device(d) = (out > static_cast(0)) + .select(dout, dout * (out + static_cast(alpha))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + // case 2: alpha < 0 + // dx = dout, if x > 0 + // dx = dout * (out + alpha), if x <=0 + dx.device(d) = (x > static_cast(0)) + .select(dout, dout * static_cast(alpha) * x.exp()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct ELUGradGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(const Device& dev, + const DenseTensor* X, + const DenseTensor* ddX, + DenseTensor* ddOut, + const DenseTensor* dOut, + DenseTensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad")); + auto x = EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad")); + + if (dX) { + auto dx = EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad")); + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Output", "DOut", "ELUGradGrad")); + dx.device(*d) = ddx * dout * static_cast(alpha) * x.exp() * + (x <= static_cast(0)).template cast(); + } + + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad")); + ddout.device(*d) = ddx * + ((x > static_cast(0)).template cast() + + static_cast(alpha) * x.exp() * + (x <= static_cast(0)).template cast()) + .template cast(); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +// silu(x) = x / (1 + exp(-x)) +template +struct SiluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = static_cast(1) / (static_cast(1) + (-x).exp()); + out.device(d) = x * temp; + } +}; + +// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})) +template +struct SiluGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X 
x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(1) + (-x).exp(); // 1+e^(-x) + auto temp2 = x * (-x).exp(); // x*e^(-x) + dx.device(d) = dout * ((static_cast(1) / temp1) * + (static_cast(1) + (temp2 / temp1))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) template struct CudaReluFunctor : public BaseActivationFunctor { @@ -1218,6 +1450,209 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; + +template +struct CudaSoftShrinkFunctor : public BaseActivationFunctor { + float lambda; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + // softshrink(x) = x - lambda, if x > lambda; + // x + lambda, if x < -lambda; + // 0, otherwise. + __device__ __forceinline__ T operator()(const T x) const { + T l = static_cast(lambda); + T temp1 = static_cast(x > l); + T temp2 = static_cast(x < -l); + return temp1 * (x - l) + temp2 * (x + l); + } +}; + +template +struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float lambda; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + // dx = dout, if x > lambda or x < -lambda else 0 + __device__ __forceinline__ T operator()(const T dout, const T x) const { + T l = static_cast(lambda); + return (x >= -l && x <= l) ? zero : dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaTanhShrinkFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // tanhshrink(x) = x - tanh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(x - tanh(x)); + } +}; + +template +struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout * tanh(x)^2 + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * tanh(x) * tanh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaHardShrinkFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x + __device__ __forceinline__ T operator()(const T x) const { + T t = static_cast(threshold); + return (x > -t && x < t) ? zero : x; + } +}; + +template +struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = (x > -threshold && x < threshold) ? 0 : dout + __device__ __forceinline__ T operator()(const T dout, const T x) const { + T t = static_cast(threshold); + return (x > -t && x < t) ? 
zero : dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaELUFunctor : public BaseActivationFunctor { + using CT = typename phi::dtype::MPTypeTrait::Type; + CT zero = static_cast(0.0f); + CT one = static_cast(1.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // elu(x) = x, if x > 0 + // elu(x) = alpha * (e^x - 1), if x <= 0 + __device__ __forceinline__ T operator()(const T arg_x) const { + CT x = static_cast(arg_x); + CT temp = static_cast(alpha) * (exp(x) - one); + CT res = x > zero ? x : temp; + return static_cast(res); + } +}; + +template +struct CudaELUGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // case 1: alpha >= 0 + // dx = dout, if out > 0 + // dx = dout * (out + alpha), if out <= 0 + __device__ __forceinline__ T operator()(T arg_dout, T arg_out) const { + MPType dout = static_cast(arg_dout); + MPType out = static_cast(arg_out); + MPType a = static_cast(alpha); + MPType out_pos = static_cast(out > zero); + MPType out_neg = static_cast(out <= zero); + return static_cast(dout * (out_pos + out_neg * (out + a))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // case 2: alpha < 0 + // dx = dout, if x > 0 + // dx = dout * (out + alpha), if x <=0 + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_out, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType out = static_cast(arg_out); + MPType x = static_cast(arg_x); + MPType a = static_cast(alpha); + MPType x_pos = static_cast(x > zero); + MPType x_neg = static_cast(x <= zero); + return static_cast(dout * (x_pos + x_neg * (out + a))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaSiluFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // silu(x) = x / (1 + exp(-x)) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(x / (one + exp(-x))); + } +}; + +template +struct CudaSiluGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + MPType temp = one / (one + exp(-x)); + return static_cast(dout * (temp * (one + x * (one - temp)))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + #endif } // namespace funcs diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index 00792b8ab60..b12fc6975b3 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -73,7 +73,7 @@ 
void ActivationGradGPUImpl(const Context& dev_ctx, } } -#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \ +#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ @@ -84,7 +84,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX( \ +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX( \ name, functor_class, attr) \ template \ void name##GradKernel(const Context& dev_ctx, \ @@ -99,7 +99,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX( \ +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX( \ name, functor_class, attr1, attr2) \ template \ void name##GradKernel(const Context& dev_ctx, \ @@ -116,7 +116,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, &x, nullptr, &dout, dx, functor); \ } -#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ +#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ @@ -127,7 +127,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, nullptr, &out, &dout, dx, functor); \ } -#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut( \ +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT( \ name, functor_class, attr) \ template \ void name##GradKernel(const Context& dev_ctx, \ @@ -142,32 +142,62 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, nullptr, &out, &dout, dx, functor); \ } -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, CudaReluGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Tanh, CudaTanhGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, CudaCosGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, CudaTanGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, CudaAcosGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, CudaSinGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, CudaAsinGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, CudaAtanGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, CudaSinhGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, CudaCoshGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, CudaAsinhGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, CudaAcoshGradFunctor); -DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, CudaAtanhGradFunctor); - -DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, CudaReluGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, CudaTanhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CudaCosGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, CudaTanGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, CudaAcosGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Sin, CudaSinGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Asin, CudaAsinGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atan, CudaAtanGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Sinh, CudaSinhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cosh, CudaCoshGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Asinh, CudaAsinhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, CudaAcoshGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, 
CudaAtanhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor); + +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, CudaLeakyReluGradFunctor, alpha); -DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, CudaThresholdedReluGradFunctor, threshold); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, + CudaSoftShrinkGradFunctor, + lambda); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, + CudaHardShrinkGradFunctor, + threshold); -DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, CudaBReluGradFunctor, t_min, t_max); +template +void EluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + DenseTensor* dx) { + dev_ctx.template Alloc(dx); + std::vector ins = {&dout, &out}; + std::vector outs = {dx}; + if (alpha > 0) { + funcs::CudaELUGradFunctor functor; + functor.alpha = alpha; + funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); + } else { + funcs::CudaELUGradNegativeAlphaFunctor functor; + functor.alpha = alpha; + ins.push_back(&x); + funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); + } +} + } // namespace phi #ifdef PADDLE_WITH_HIP @@ -234,3 +264,9 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_double_grad, LeakyReluDoubleGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(thresholded_relu_grad, ThresholdedReluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(soft_shrink_grad, SoftShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel) diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index 3c340a89f57..cd9330ead84 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -38,12 +38,13 @@ void ActivationGPUImpl(const Context& dev_ctx, funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); } -#define DEFINE_GPU_ACTIVATION_KERNEL(name, functor_class) \ - template \ - void name##Kernel( \ - const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ - functor_class functor; \ - ActivationGPUImpl(dev_ctx, x, out, functor); \ +#define DEFINE_GPU_ACTIVATION_KERNEL(name, functor_class) \ + template \ + void name##Kernel( \ + const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ + funcs::functor_class functor; \ + ActivationGPUImpl>( \ + dev_ctx, x, out, functor); \ } #define DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(name, functor_class, attr) \ @@ -75,24 +76,31 @@ void ActivationGPUImpl(const Context& dev_ctx, dev_ctx, x, out, functor); \ } -DEFINE_GPU_ACTIVATION_KERNEL(Cos, funcs::CudaCosFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Tan, funcs::CudaTanFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Acos, funcs::CudaAcosFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Sin, funcs::CudaSinFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Asin, funcs::CudaAsinFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Atan, funcs::CudaAtanFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Sinh, funcs::CudaSinhFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Cosh, funcs::CudaCoshFunctor) 
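The silu gradient used by CudaSiluGradFunctor is stated tersely in its comment; the closed form is sigma(x) * (1 + x * (1 - sigma(x))), where sigma is the logistic sigmoid. A standalone finite-difference check of that identity, illustrative only and not part of the patch:

// Sanity check of the silu gradient: d/dx [x * sigma(x)] = sigma(x) * (1 + x * (1 - sigma(x))).
// Compares the analytic form against a central finite difference.
#include <cmath>
#include <cstdio>
#include <initializer_list>

double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }
double silu(double x) { return x * sigmoid(x); }
double silu_grad(double x) {
  double s = sigmoid(x);
  return s * (1.0 + x * (1.0 - s));
}

int main() {
  const double eps = 1e-6;
  for (double x : {-3.0, -1.0, 0.0, 1.0, 3.0}) {
    double numeric = (silu(x + eps) - silu(x - eps)) / (2.0 * eps);
    std::printf("x=%5.1f analytic=%10.6f numeric=%10.6f\n", x, silu_grad(x), numeric);
  }
  return 0;
}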
-DEFINE_GPU_ACTIVATION_KERNEL(Asinh, funcs::CudaAsinhFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Acosh, funcs::CudaAcoshFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Atanh, funcs::CudaAtanhFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Relu, funcs::CudaReluFunctor) -DEFINE_GPU_ACTIVATION_KERNEL(Tanh, funcs::CudaTanhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Cos, CudaCosFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Tan, CudaTanFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Acos, CudaAcosFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Sin, CudaSinFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Asin, CudaAsinFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Atan, CudaAtanFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Sinh, CudaSinhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Cosh, CudaCoshFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Asinh, CudaAsinhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Acosh, CudaAcoshFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Atanh, CudaAtanhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Relu, CudaReluFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Tanh, CudaTanhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, CudaThresholdedReluFunctor, threshold) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, + CudaHardShrinkFunctor, + threshold) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, CudaSoftShrinkFunctor, lambda) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, CudaELUFunctor, alpha) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max) @@ -142,3 +150,8 @@ PD_REGISTER_ACTIVATION_KERNEL(tanh, TanhKernel) PD_REGISTER_ACTIVATION_KERNEL(brelu, BReluKernel) PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel) PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel) +PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel) +PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel) diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h index a48a6226f23..a95f49c0e7c 100644 --- a/paddle/phi/kernels/impl/activation_grad_impl.h +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -202,4 +202,24 @@ void TanhTripleGradKernel(const Context& dev_ctx, d_ddx); // output } +template +void EluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + const DenseTensor& ddx, + float alpha, + DenseTensor* dx, + DenseTensor* ddout) { + if (dx) { + dx->Resize(x.dims()); + dev_ctx.template Alloc(dx); + } + if (ddout) { + dev_ctx.template Alloc(ddout); + } + funcs::ELUGradGradFunctor functor; + functor.alpha = alpha; + functor(dev_ctx, &x, &ddx, ddout, &dout, dx); +} + } // namespace phi diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index cbfca5b17ae..890dbadf17c 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -16,45 +16,49 @@ limitations under the License. 
*/ namespace phi { -#define DefineActGradDepXOpArgMap(func_name, op_name, attrs) \ - KernelSignature func_name##GradOpArgumentMapping( \ - const ArgumentMappingContext& ctx) { \ - return KernelSignature(op_name "_grad", \ - {"X", GradVarName("Out")}, \ - {attrs}, \ - {GradVarName("X")}); \ +#define DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(func_name, op_name, attrs) \ + KernelSignature func_name##GradOpArgumentMapping( \ + const ArgumentMappingContext& ctx) { \ + return KernelSignature(op_name "_grad", \ + {"X", GradVarName("Out")}, \ + {attrs}, \ + {GradVarName("X")}); \ } -#define DefineActGradDepOutOpArgMap(func_name, op_name, attrs) \ - KernelSignature func_name##GradOpArgumentMapping( \ - const ArgumentMappingContext& ctx) { \ - return KernelSignature(op_name "_grad", \ - {"Out", GradVarName("Out")}, \ - {attrs}, \ - {GradVarName("X")}); \ +#define DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(func_name, op_name, attrs) \ + KernelSignature func_name##GradOpArgumentMapping( \ + const ArgumentMappingContext& ctx) { \ + return KernelSignature(op_name "_grad", \ + {"Out", GradVarName("Out")}, \ + {attrs}, \ + {GradVarName("X")}); \ } #define comma , -DefineActGradDepXOpArgMap(Cos, "cos", ); // NOLINT -DefineActGradDepXOpArgMap(Tan, "tan", ); // NOLINT -DefineActGradDepXOpArgMap(Acos, "acos", ); // NOLINT -DefineActGradDepXOpArgMap(Sin, "sin", ); // NOLINT -DefineActGradDepXOpArgMap(Asin, "asin", ); // NOLINT -DefineActGradDepXOpArgMap(Atan, "atan", ); // NOLINT -DefineActGradDepXOpArgMap(Sinh, "sinh", ); // NOLINT -DefineActGradDepXOpArgMap(Cosh, "cosh", ); // NOLINT -DefineActGradDepXOpArgMap(Asinh, "asinh", ); // NOLINT -DefineActGradDepXOpArgMap(Acosh, "acosh", ); // NOLINT -DefineActGradDepXOpArgMap(Atanh, "atanh", ); // NOLINT -DefineActGradDepXOpArgMap(BRelu, "brelu", "t_min" comma "t_max"); // NOLINT -DefineActGradDepXOpArgMap(LeakyRelu, "leaky_relu", "alpha"); // NOLINT -DefineActGradDepXOpArgMap(ThresholdedRelu, - "thresholded_relu", - "threshold"); // NOLINT - -DefineActGradDepOutOpArgMap(Relu, "relu", ); // NOLINT -DefineActGradDepOutOpArgMap(Tanh, "tanh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cos, "cos", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Tan, "tan", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acos, "acos", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sin, "sin", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asin, "asin", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atan, "atan", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Sinh, "sinh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Cosh, "cosh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Asinh, "asinh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Acosh, "acosh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Atanh, "atanh", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(BRelu, "brelu", "t_min" comma "t_max"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LeakyRelu, "leaky_relu", "alpha"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(ThresholdedRelu, + "thresholded_relu", + "threshold"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(SoftShrink, "soft_shrink", "lambda"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold"); +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", ); // NOLINT + +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT KernelSignature ReluDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { @@ -85,11 +89,31 @@ KernelSignature LeakyReluOpArgumentMapping(const ArgumentMappingContext& ctx) { 
return KernelSignature("leaky_relu", {"X"}, {"alpha"}, {"Out"}); } +KernelSignature EluOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("elu", {"X"}, {"alpha"}, {"Out"}); +} + +KernelSignature EluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("elu_grad", + {"X", "Out", GradVarName("Out")}, + {"alpha"}, + {GradVarName("X")}); +} + +KernelSignature EluDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "elu_double_grad", {"X", "DOut", "DDX"}, {"alpha"}, {"DX", "DDOut"}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad); PD_REGISTER_BASE_KERNEL_NAME(tanh_grad_grad, tanh_double_grad); PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(softshrink, soft_shrink); +PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad); +PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad); PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping); @@ -118,3 +142,13 @@ PD_REGISTER_ARG_MAPPING_FN(leaky_relu_grad_grad, phi::LeakyReluDoubleGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(thresholded_relu_grad, phi::ThresholdedReluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(softshrink_grad, + phi::SoftShrinkGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(hard_shrink_grad, + phi::HardShrinkGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tanh_shrink_grad, + phi::TanhShrinkGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elu, phi::EluOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elu_grad, phi::EluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elu_grad_grad, phi::EluDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(silu_grad, phi::SiluGradOpArgumentMapping); -- GitLab From 99452af73c4b0438233daf3c8d6359c7c6d72d6d Mon Sep 17 00:00:00 2001 From: chenenquan Date: Wed, 16 Mar 2022 16:21:01 +0800 Subject: [PATCH 101/176] [PHI] Migrate index_select op (#40260) * [PHI] Migrate index_select op * [PHI] Fix bug in test_variable * [PHI] migrate index_select op --- paddle/fluid/operators/index_select_op.cc | 68 +----- paddle/fluid/operators/index_select_op.cu | 209 ------------------ paddle/fluid/operators/index_select_op.h | 73 ------ paddle/fluid/operators/index_select_op_npu.cc | 5 +- paddle/phi/infermeta/binary.cc | 43 ++++ paddle/phi/infermeta/binary.h | 5 + .../kernels/cpu/index_select_grad_kernel.cc | 63 ++++++ paddle/phi/kernels/cpu/index_select_impl.h | 178 +++++++++++++++ paddle/phi/kernels/cpu/index_select_kernel.cc | 62 ++++++ .../kernels/gpu/index_select_grad_kernel.cu | 141 ++++++++++++ paddle/phi/kernels/gpu/index_select_kernel.cu | 109 +++++++++ paddle/phi/kernels/index_select_grad_kernel.h | 29 +++ paddle/phi/kernels/index_select_kernel.h | 28 +++ paddle/phi/ops/compat/index_select_sig.cc | 30 +++ .../fluid/tests/unittests/test_variable.py | 3 +- 15 files changed, 703 insertions(+), 343 deletions(-) delete mode 100644 paddle/fluid/operators/index_select_op.cu create mode 100644 paddle/phi/kernels/cpu/index_select_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/index_select_impl.h create mode 100644 paddle/phi/kernels/cpu/index_select_kernel.cc create mode 100644 paddle/phi/kernels/gpu/index_select_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/index_select_kernel.cu create mode 100644 paddle/phi/kernels/index_select_grad_kernel.h create mode 100644 
paddle/phi/kernels/index_select_kernel.h create mode 100644 paddle/phi/ops/compat/index_select_sig.cc diff --git a/paddle/fluid/operators/index_select_op.cc b/paddle/fluid/operators/index_select_op.cc index fea71edf413..069cc9416a6 100644 --- a/paddle/fluid/operators/index_select_op.cc +++ b/paddle/fluid/operators/index_select_op.cc @@ -13,8 +13,13 @@ // limitations under the License. #include "paddle/fluid/operators/index_select_op.h" + #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -24,52 +29,6 @@ class IndexSelectOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of IndexSelectOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of IndexSelectOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of IndexSelectOp should not be null.")); - - auto input_dim = ctx->GetInputDim("X"); - auto index_dim = ctx->GetInputDim("Index"); - auto dim = ctx->Attrs().Get("dim"); - - PADDLE_ENFORCE_EQ( - dim < input_dim.size() && dim >= (0 - input_dim.size()), true, - platform::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - input_dim.size(), input_dim.size() - 1, dim)); - - PADDLE_ENFORCE_EQ( - index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1), - true, platform::errors::InvalidArgument( - "The 'shape' of Input(Index) must be 1-D tensor. 
" - "But received: the 'shape' of Input(Index) is [%s], " - "the dimension of Input(Index) is [%d].", - index_dim, index_dim.size())); - - PADDLE_ENFORCE_EQ(index_dim[0] != 0, true, - platform::errors::InvalidArgument( - "The length of Input(Index) can't be 0.")); - - auto output_dim = phi::vectorize(input_dim); - if (dim < 0) { - dim += input_dim.size(); - } - output_dim[dim] = index_dim[0]; - ctx->SetOutputDim("Out", phi::make_ddim(output_dim)); - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -148,20 +107,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSelectGradNoNeedBufferVarsInferer, } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(index_select, IndexSelectInferShapeFunctor, + PD_INFER_META(phi::IndexSelectInferMeta)); REGISTER_OPERATOR(index_select, ops::IndexSelectOp, ops::IndexSelectOpMaker, ops::IndexSelectGradMaker, - ops::IndexSelectGradMaker); + ops::IndexSelectGradMaker, + IndexSelectInferShapeFunctor); REGISTER_OPERATOR(index_select_grad, ops::IndexSelectGradOp, ops::IndexSelectGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - index_select, - ops::IndexSelectKernel, - ops::IndexSelectKernel, - ops::IndexSelectKernel, - ops::IndexSelectKernel); -REGISTER_OP_CPU_KERNEL( - index_select_grad, - ops::IndexSelectGradKernel, - ops::IndexSelectGradKernel, - ops::IndexSelectGradKernel, - ops::IndexSelectGradKernel); diff --git a/paddle/fluid/operators/index_select_op.cu b/paddle/fluid/operators/index_select_op.cu deleted file mode 100644 index f810aee2ade..00000000000 --- a/paddle/fluid/operators/index_select_op.cu +++ /dev/null @@ -1,209 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/index_select_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -__global__ void index_select_cuda_kernel(const T* input, T* output, - const IndexT* index, int64_t N, - int64_t stride, int64_t size, - int64_t delta) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t pre_idx = idx / (stride * size); - int64_t dim_idx = idx % (stride * size) / stride; - IndexT src_dim_idx = index[dim_idx]; - int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - output[idx] = input[input_idx]; -} - -template -__global__ void index_select_grad_cuda_kernel(const T* output_grad, - T* input_grad, - const IndexT* index, int64_t nums, - int64_t N, int64_t stride, - int64_t size, int64_t delta) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t pre_idx = idx / (stride * size); - int64_t dim_idx = idx % (stride * size) / stride; - IndexT src_dim_idx = index[dim_idx]; - int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; - paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); -} - -template -__global__ void index_select_grad_init(T* input_grad, int64_t N) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - input_grad[idx] = 0.0; -} - -template -class IndexSelectCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* index = context.Input("Index"); - auto* out = context.Output("Out"); - int dim = context.Attr("dim"); - auto input_dim = in->dims(); - auto output_dim = out->dims(); - dim = dim >= 0 ? 
dim : dim + input_dim.size(); - auto stride_dim = phi::stride(input_dim); - int64_t stride = stride_dim[dim]; - int64_t size = output_dim[dim]; - int64_t delta = input_dim[dim] - size; - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = out->numel(); - - auto stream = - context.template device_context().stream(); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - index_select_cuda_kernel<<< - (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, index_data, - numel, stride, size, delta); - platform::GpuStreamSync(stream); - } else { - const int* index_data = index->data(); - index_select_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - in_data, out_data, index_data, numel, stride, size, delta); - platform::GpuStreamSync(stream); - } - } -}; - -template -class IndexSelectGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* output_grad = context.Input(framework::GradVarName("Out")); - auto* in_grad = context.Output(framework::GradVarName("X")); - auto* index = context.Input("Index"); - - auto* output_grad_data = output_grad->data(); - auto* in_grad_data = in_grad->mutable_data(context.GetPlace()); - - int dim = context.Attr("dim"); - auto input_dim = in_grad->dims(); - auto output_dim = output_grad->dims(); - dim = dim >= 0 ? 
dim : dim + input_dim.size(); - auto stride_dim = phi::stride(input_dim); - int64_t stride = stride_dim[dim]; - int64_t size = output_dim[dim]; - int64_t delta = input_dim[dim] - size; - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - int64_t numel = in_grad->numel(); - int64_t index_nums = index->numel(); - int64_t out_nums = output_grad->numel(); - - auto stream = - context.template device_context().stream(); - - index_select_grad_init< - T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_grad_data, numel); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - index_select_grad_cuda_kernel<<< - (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, - index_data, index_nums, - out_nums, stride, size, delta); - platform::GpuStreamSync(stream); - } else { - const int* index_data = index->data(); - index_select_grad_cuda_kernel<<< - (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(output_grad_data, in_grad_data, - index_data, index_nums, - out_nums, stride, size, delta); - platform::GpuStreamSync(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - index_select, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel, - ops::IndexSelectCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - index_select_grad, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel, - ops::IndexSelectGradCUDAKernel); diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h index 04b4f69add7..684829be269 100644 --- a/paddle/fluid/operators/index_select_op.h +++ b/paddle/fluid/operators/index_select_op.h @@ -91,41 +91,6 @@ void IndexSelectInner(const framework::ExecutionContext& context, output->Resize(output_dim); } -template -class IndexSelectKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto inputs = *context.Input("X"); - auto* index = context.Input("Index"); - auto* output = context.Output("Out"); - - int dim = context.Attr("dim"); - if (dim < 0) { - dim += inputs.dims().size(); - } - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - 
framework::proto::VarType::INT64))); - - if (index_type == framework::proto::VarType::INT32) { - IndexSelectInner(context, &inputs, *index, output, - dim); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSelectInner(context, &inputs, *index, - output, dim); - } - } -}; - template struct IndexSelectAdd { void operator()(const framework::ExecutionContext& ctx, int slice_size, @@ -197,43 +162,5 @@ void IndexSelectGradInner(const framework::ExecutionContext& context, x_grad->Resize(output_dim); } -template -class IndexSelectGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x_grad = - context.Output(framework::GradVarName("X")); - auto* index = context.Input("Index"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - - int dim = context.Attr("dim"); - if (dim < 0) { - dim += out_grad->dims().size(); - } - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - if (index_type == framework::proto::VarType::INT32) { - IndexSelectGradInner(context, *out_grad, *index, - x_grad, dim); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSelectGradInner(context, *out_grad, - *index, x_grad, dim); - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc index bce7a3c1caa..a232fba7e28 100644 --- a/paddle/fluid/operators/index_select_op_npu.cc +++ b/paddle/fluid/operators/index_select_op_npu.cc @@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/index_select_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class IndexSelectNPUKernel : public framework::OpKernel { public: diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 4c1d169615b..38dce0dc69d 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -643,6 +643,49 @@ void IndexSampleInferMeta(const MetaTensor& x, out->share_lod(y); } +void IndexSelectInferMeta(const MetaTensor& x, + const MetaTensor& index, + int dim, + MetaTensor* output) { + auto input_dim = x.dims(); + auto index_dim = index.dims(); + + PADDLE_ENFORCE_EQ( + dim < input_dim.size() && dim >= (0 - input_dim.size()), + true, + phi::errors::OutOfRange( + "Attr(dim) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(dim) = %d.", + input_dim.size(), + input_dim.size() - 1, + dim)); + + PADDLE_ENFORCE_EQ( + index_dim.size() == 1 || (index_dim.size() == 2 && index_dim[1] == 1), + true, + phi::errors::InvalidArgument( + "The 'shape' of Input(Index) must be 1-D tensor. 
" + "But received: the 'shape' of Input(Index) is [%s], " + "the dimension of Input(Index) is [%d].", + index_dim, + index_dim.size())); + + PADDLE_ENFORCE_EQ( + index_dim[0] != 0, + true, + phi::errors::InvalidArgument("The length of Input(Index) can't be 0.")); + + auto output_dim = phi::vectorize(input_dim); + if (dim < 0) { + dim += input_dim.size(); + } + output_dim[dim] = index_dim[0]; + output->set_dims(phi::make_ddim(output_dim)); + output->set_dtype(x.dtype()); + output->set_layout(x.layout()); + output->share_lod(x); +} + void LogLossInferMeta(const MetaTensor& input, const MetaTensor& label, float epsilon, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 40641ea4858..1727e85b1d5 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -113,6 +113,11 @@ void IndexSampleInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void IndexSelectInferMeta(const MetaTensor& x, + const MetaTensor& index, + int dim, + MetaTensor* output); + void LogLossInferMeta(const MetaTensor& input, const MetaTensor& label, float epsilon, diff --git a/paddle/phi/kernels/cpu/index_select_grad_kernel.cc b/paddle/phi/kernels/cpu/index_select_grad_kernel.cc new file mode 100644 index 00000000000..9dd50e7df8f --- /dev/null +++ b/paddle/phi/kernels/cpu/index_select_grad_kernel.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_select_grad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/cpu/index_select_impl.h" + +namespace phi { + +template +void IndexSelectGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int dim, + DenseTensor* x_grad) { + if (dim < 0) { + dim += out_grad.dims().size(); + } + const auto& index_type = index.dtype(); + + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + if (index_type == phi::DataType::INT32) { + IndexSelectGradInner(ctx, out_grad, index, x_grad, dim); + } else if (index_type == phi::DataType::INT64) { + IndexSelectGradInner( + ctx, out_grad, index, x_grad, dim); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_select_grad, + CPU, + ALL_LAYOUT, + phi::IndexSelectGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/index_select_impl.h b/paddle/phi/kernels/cpu/index_select_impl.h new file mode 100644 index 00000000000..163174580ff --- /dev/null +++ b/paddle/phi/kernels/cpu/index_select_impl.h @@ -0,0 +1,178 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +struct IndexSelectAdd { + void operator()(const Context& ctx, + int slice_size, + const T* src_pointer, + const T* p_pointer, + T* dist_pointer) { + for (int i = 0; i < slice_size; i++) { + dist_pointer[i] = src_pointer[i] + p_pointer[i]; + } + } +}; + +template +struct IndexSelectAdd< + Context, + T, + typename std::enable_if::value>::type> { + void operator()(const Context& ctx, + int slice_size, + const T* src_pointer, + const T* p_pointer, + T* dist_pointer) { + auto blas = phi::funcs::GetBlas(ctx); + blas.VADD(slice_size, src_pointer, p_pointer, dist_pointer); + } +}; + +template +void IndexSelectInner(const Context& ctx, + DenseTensor* input, + const DenseTensor& index, + DenseTensor* output, + int dim) { + auto input_dim = input->dims(); + auto input_dim_size = input_dim.size(); + auto output_dim = output->dims(); + auto index_size = index.dims()[0]; + + DenseTensor index_cpu_copy; + if (!paddle::platform::is_cpu_place(index.place())) { + phi::Copy(ctx, index, phi::CPUPlace(), true, &index_cpu_copy); + } + const IndexT* index_data = paddle::platform::is_cpu_place(index.place()) + ? index.data() + : index_cpu_copy.data(); + ctx.template Alloc(output); + + auto slice_size = 1; + for (auto i = dim + 1; i < input_dim_size; i++) { + slice_size *= input_dim[i]; + } + + auto outer_nums = 1; + for (auto i = 0; i < dim; i++) { + outer_nums *= input_dim[i]; + } + + for (int i = 0; i < index_size; i++) { + PADDLE_ENFORCE_GE( + index_data[i], + 0, + phi::errors::InvalidArgument( + "Variable value (index) of OP(index_select) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + input_dim[dim], + index_data[i])); + PADDLE_ENFORCE_LT( + index_data[i], + input_dim[dim], + phi::errors::InvalidArgument( + "Variable value (index) of OP(index_select) " + "expected >= 0 and < %ld, but got %ld. 
Please check input " + "value.", + input_dim[dim], + index_data[i])); + } + + VLOG(3) << "Index_Select_Debug; outer_nums: " << outer_nums + << "; slice_size: " << slice_size << "; index_size: " << index_size; + + input->Resize(phi::make_ddim({outer_nums, input_dim[dim], slice_size})); + output->Resize(phi::make_ddim({outer_nums, index_size, slice_size})); + + auto input_tensor = EigenTensor::From(*input); + auto output_tensor = EigenTensor::From(*output); + + auto& place = *ctx.eigen_device(); + + for (auto j = 0; j < index_size; j++) { + IndexT index_value = index_data[j]; + auto output_t = output_tensor.chip(j, 1); + output_t.device(place) = input_tensor.chip(index_value, 1); + } + input->Resize(input_dim); + output->Resize(output_dim); +} + +template +void IndexSelectGradInner(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& index, + DenseTensor* x_grad, + int dim) { + const T* input_data = out_grad.data(); + const IndexT* index_data = index.data(); + + const T* p_output = ctx.template Alloc(x_grad); + T* out_data = ctx.template Alloc(x_grad); + + auto input_dim = out_grad.dims(); + auto input_dim_size = input_dim.size(); + auto output_dim = x_grad->dims(); + + phi::funcs::SetConstant set_constant; + set_constant(ctx, x_grad, static_cast(0.0)); + + auto slice_size = 1; + for (auto i = dim + 1; i < input_dim_size; i++) { + slice_size *= input_dim[i]; + } + + auto input_width = slice_size * input_dim[dim]; + auto output_width = slice_size * output_dim[dim]; + + auto outer_nums = 1; + for (auto i = 0; i < dim; i++) { + outer_nums *= input_dim[i]; + } + + auto index_size = index.dims()[0]; + VLOG(3) << "Index_Select_Grad_Debug; outer_nums: " << outer_nums + << "; slice_size: " << slice_size << "; input_width: " << input_width + << "; output_width: " << output_width + << "; index_size: " << index_size; + + for (auto i = 0; i < outer_nums; i++) { + auto input_start_offset = i * input_width; + auto output_start_offset = i * output_width; + + for (auto j = 0; j < index_size; j++) { + IndexT index_value = index_data[j]; + auto src = input_data + input_start_offset + j * slice_size; + auto p_out = p_output + output_start_offset + index_value * slice_size; + auto dst = out_data + output_start_offset + index_value * slice_size; + IndexSelectAdd index_select_add; + index_select_add(ctx, slice_size, src, p_out, dst); + } + } + x_grad->Resize(output_dim); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/index_select_kernel.cc b/paddle/phi/kernels/cpu/index_select_kernel.cc new file mode 100644 index 00000000000..5341ede6b2f --- /dev/null +++ b/paddle/phi/kernels/cpu/index_select_kernel.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
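+//
+// CPU forward kernel for index_select: checks that Index holds int32 or
+// int64 data and gathers slices of X along dimension `dim` through
+// IndexSelectInner from cpu/index_select_impl.h. For example, with X of
+// shape [3, 4], index = [0, 2] and dim = 0, the output has shape [2, 4]
+// and contains rows 0 and 2 of X.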
+ +#include "paddle/phi/kernels/index_select_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/cpu/index_select_impl.h" + +namespace phi { + +template +void IndexSelectKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + int dim, + DenseTensor* output) { + auto inputs = x; + if (dim < 0) { + dim += inputs.dims().size(); + } + const auto& index_type = index.dtype(); + + bool index_type_match = + index_type == phi::DataType::INT32 || index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + if (index_type == phi::DataType::INT32) { + IndexSelectInner(ctx, &inputs, index, output, dim); + } else if (index_type == phi::DataType::INT64) { + IndexSelectInner(ctx, &inputs, index, output, dim); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_select, + CPU, + ALL_LAYOUT, + phi::IndexSelectKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu new file mode 100644 index 00000000000..a393eecd512 --- /dev/null +++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu @@ -0,0 +1,141 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_select_grad_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void index_select_grad_cuda_kernel(const T* output_grad, + T* input_grad, + const IndexT* index, + int64_t nums, + int64_t N, + int64_t stride, + int64_t size, + int64_t delta) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + + int64_t pre_idx = idx / (stride * size); + int64_t dim_idx = idx % (stride * size) / stride; + IndexT src_dim_idx = index[dim_idx]; + int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; + paddle::platform::CudaAtomicAdd(&input_grad[input_idx], output_grad[idx]); +} + +template +__global__ void index_select_grad_init(T* input_grad, int64_t N) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + input_grad[idx] = 0.0; +} + +template +void IndexSelectGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int dim, + DenseTensor* x_grad) { + auto* output_grad_data = out_grad.data(); + auto* in_grad_data = ctx.template Alloc(x_grad); + + auto input_dim = x_grad->dims(); + auto output_dim = out_grad.dims(); + dim = dim >= 0 ? 
dim : dim + input_dim.size(); + auto stride_dim = phi::stride(input_dim); + int64_t stride = stride_dim[dim]; + int64_t size = output_dim[dim]; + int64_t delta = input_dim[dim] - size; + const auto& index_type = index.dtype(); + + bool index_type_match = + index_type == phi::DataType::INT64 || index_type == phi::DataType::INT32; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + int64_t numel = x_grad->numel(); + int64_t index_nums = index.numel(); + int64_t out_nums = out_grad.numel(); + + auto stream = ctx.stream(); + + index_select_grad_init< + T><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(in_grad_data, numel); + + if (index_type == phi::DataType::INT64) { + const int64_t* index_data = index.data(); + index_select_grad_cuda_kernel<<< + (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(output_grad_data, + in_grad_data, + index_data, + index_nums, + out_nums, + stride, + size, + delta); + phi::backends::gpu::GpuStreamSync(stream); + } else { + const int* index_data = index.data(); + index_select_grad_cuda_kernel<<< + (out_nums + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(output_grad_data, + in_grad_data, + index_data, + index_nums, + out_nums, + stride, + size, + delta); + phi::backends::gpu::GpuStreamSync(stream); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_select_grad, + GPU, + ALL_LAYOUT, + phi::IndexSelectGradKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/index_select_kernel.cu b/paddle/phi/kernels/gpu/index_select_kernel.cu new file mode 100644 index 00000000000..f774522318a --- /dev/null +++ b/paddle/phi/kernels/gpu/index_select_kernel.cu @@ -0,0 +1,109 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
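+//
+// GPU forward kernel for index_select: index_select_cuda_kernel launches one
+// thread per output element and maps its flat offset back to the source
+// position selected by Index (int32 or int64), using the stride of the
+// indexed dimension.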
+ +#include "paddle/phi/kernels/index_select_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void index_select_cuda_kernel(const T* input, + T* output, + const IndexT* index, + int64_t N, + int64_t stride, + int64_t size, + int64_t delta) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + + int64_t pre_idx = idx / (stride * size); + int64_t dim_idx = idx % (stride * size) / stride; + IndexT src_dim_idx = index[dim_idx]; + int64_t input_idx = idx + (delta * pre_idx + src_dim_idx - dim_idx) * stride; + output[idx] = input[input_idx]; +} + +template +void IndexSelectKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + int dim, + DenseTensor* output) { + auto input_dim = x.dims(); + auto output_dim = output->dims(); + dim = dim >= 0 ? dim : dim + input_dim.size(); + auto stride_dim = phi::stride(input_dim); + int64_t stride = stride_dim[dim]; + int64_t size = output_dim[dim]; + int64_t delta = input_dim[dim] - size; + const auto& index_type = index.dtype(); + + bool index_type_match = + index_type == phi::DataType::INT64 || index_type == phi::DataType::INT32; + PADDLE_ENFORCE_EQ(index_type_match, + true, + phi::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + index_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + auto* in_data = x.data(); + T* out_data = ctx.template Alloc(output); + + int64_t numel = output->numel(); + auto stream = ctx.stream(); + + if (index_type == phi::DataType::INT64) { + const int64_t* index_data = index.data(); + index_select_cuda_kernel<<< + (numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>(in_data, out_data, index_data, numel, stride, size, delta); + phi::backends::gpu::GpuStreamSync(stream); + } else { + const int* index_data = index.data(); + index_select_cuda_kernel< + T, + int><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>( + in_data, out_data, index_data, numel, stride, size, delta); + phi::backends::gpu::GpuStreamSync(stream); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_select, + GPU, + ALL_LAYOUT, + phi::IndexSelectKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} diff --git a/paddle/phi/kernels/index_select_grad_kernel.h b/paddle/phi/kernels/index_select_grad_kernel.h new file mode 100644 index 00000000000..c3dc1595989 --- /dev/null +++ b/paddle/phi/kernels/index_select_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IndexSelectGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int dim, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/index_select_kernel.h b/paddle/phi/kernels/index_select_kernel.h new file mode 100644 index 00000000000..124b6897311 --- /dev/null +++ b/paddle/phi/kernels/index_select_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IndexSelectKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + int dim, + DenseTensor* output); + +} // namespace phi diff --git a/paddle/phi/ops/compat/index_select_sig.cc b/paddle/phi/ops/compat/index_select_sig.cc new file mode 100644 index 00000000000..53eff1bbcd7 --- /dev/null +++ b/paddle/phi/ops/compat/index_select_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
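+//
+// Argument mapping for the fluid index_select_grad op onto the phi kernel:
+// inputs {X, Index, Out@GRAD}, attribute {dim}, output {X@GRAD}.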
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature IndexSelectGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("index_select_grad", + {"X", "Index", GradVarName("Out")}, + {"dim"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(index_select_grad, + phi::IndexSelectGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index a3bfe3864a2..beaf361379b 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -333,7 +333,8 @@ class TestVariable(unittest.TestCase): with self.assertRaises(IndexError): res = x[[True, False, False]] with self.assertRaises(ValueError): - res = x[[False, False]] + with paddle.static.program_guard(prog): + res = x[[False, False]] def test_slice(self): places = [fluid.CPUPlace()] -- GitLab From 44d46d0396e2514478b79b2de326816dc0f04041 Mon Sep 17 00:00:00 2001 From: chenenquan Date: Wed, 16 Mar 2022 16:24:51 +0800 Subject: [PATCH 102/176] [PHI] Migrate roll op (#40257) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [PHI] Migrate roll op * 【phi】migrate eigh op to phi (#40213) * migrate eigh to phi * optimize code * modify code according to comment * conflict resolution * [PHI] Migrate roll op * [PHI] Fix converage of roll_sig * [PHI] Fix infermate of roll_sig * [Phi] Fix unittest coverage of roll op * [PHI] Fix infermeta in unary * [PHI] Fix parameter type of roll op * [PHI] Fix parameter type of roll op * [PHI] Fix parameter of roll op Co-authored-by: crystal <62974595+Zjq9409@users.noreply.github.com> --- paddle/fluid/operators/roll_op.cc | 70 +------ paddle/fluid/operators/roll_op.cu | 225 --------------------- paddle/fluid/operators/roll_op.h | 169 ---------------- paddle/phi/infermeta/unary.cc | 31 +++ paddle/phi/infermeta/unary.h | 5 + paddle/phi/kernels/cpu/roll_grad_kernel.cc | 64 ++++++ paddle/phi/kernels/cpu/roll_kernel.cc | 75 +++++++ paddle/phi/kernels/cpu/roll_kernel_impl.h | 76 +++++++ paddle/phi/kernels/gpu/roll_grad_kernel.cu | 88 ++++++++ paddle/phi/kernels/gpu/roll_kernel.cu | 90 +++++++++ paddle/phi/kernels/gpu/roll_kernel_impl.h | 71 +++++++ paddle/phi/kernels/roll_grad_kernel.h | 30 +++ paddle/phi/kernels/roll_kernel.h | 29 +++ paddle/phi/ops/compat/roll_sig.cc | 36 ++++ 14 files changed, 606 insertions(+), 453 deletions(-) delete mode 100644 paddle/fluid/operators/roll_op.cu delete mode 100644 paddle/fluid/operators/roll_op.h create mode 100644 paddle/phi/kernels/cpu/roll_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/roll_kernel.cc create mode 100644 paddle/phi/kernels/cpu/roll_kernel_impl.h create mode 100644 paddle/phi/kernels/gpu/roll_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/roll_kernel.cu create mode 100644 paddle/phi/kernels/gpu/roll_kernel_impl.h create mode 100644 paddle/phi/kernels/roll_grad_kernel.h create mode 100644 paddle/phi/kernels/roll_kernel.h create mode 100644 paddle/phi/ops/compat/roll_sig.cc diff --git a/paddle/fluid/operators/roll_op.cc b/paddle/fluid/operators/roll_op.cc index f82510556fd..898db4c22fe 100644 --- a/paddle/fluid/operators/roll_op.cc +++ b/paddle/fluid/operators/roll_op.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/roll_op.h" - #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,43 +32,6 @@ class RollOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of RollOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of RollOp should not be null.")); - - auto dims = ctx->Attrs().Get>("axis"); - auto shifts = ctx->Attrs().Get>("shifts"); - - if (!ctx->HasInput("ShiftsTensor")) { - if (dims.size() != 0) { - PADDLE_ENFORCE_EQ(dims.size(), shifts.size(), - platform::errors::InvalidArgument( - "When dims.size() != 0, dims.size() " - "should be equal to " - "shifts.size(). But received " - "dims.size() = %d, shifts.size() = %d", - dims.size(), shifts.size())); - } else { - PADDLE_ENFORCE_EQ(shifts.size(), 1, - platform::errors::InvalidArgument( - "When dims.size() == 0, shifts.size() " - "should be equal to 1, But received " - "shifts.size() = %d", - shifts.size())); - } - } - - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -149,29 +115,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RollGradNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(roll, RollInferShapeFunctor, + PD_INFER_META(phi::RollInferMeta)); + REGISTER_OPERATOR(roll, ops::RollOp, ops::RollOpMaker, ops::RollGradMaker, - ops::RollGradMaker); + ops::RollGradMaker, + RollInferShapeFunctor); REGISTER_OPERATOR(roll_grad, ops::RollGradOp, ops::RollGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - roll, ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel>, - ops::RollKernel>); -REGISTER_OP_CPU_KERNEL( - roll_grad, ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel>, - ops::RollGradKernel>); REGISTER_OP_VERSION(roll) .AddCheckpoint( diff --git a/paddle/fluid/operators/roll_op.cu b/paddle/fluid/operators/roll_op.cu deleted file mode 100644 index b9064c5450f..00000000000 --- a/paddle/fluid/operators/roll_op.cu +++ /dev/null @@ -1,225 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/roll_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/utils/array.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -__global__ void RollCudaKernel(const T* input, T* output, int64_t N, - phi::Array shifts, - phi::Array strides, - phi::Array sizes) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int64_t output_idx = idx; - int64_t new_dim_idx = 0; - -#pragma unroll - for (size_t i = 0; i < Rank; i++) { - new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i]; - if (new_dim_idx >= sizes[i]) { - output_idx += (shifts[i] - sizes[i]) * strides[i]; - } else { - output_idx += shifts[i] * strides[i]; - } - } - output[output_idx] = input[idx]; -} - -template -class RollKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - PADDLE_ENFORCE_EQ( - shifts_tensor->dims().size(), 1, - platform::errors::InvalidArgument( - "The rank of ShiftsTensor is expected to be 1, got %s", - shifts_tensor->dims().size())); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - auto stream = - context.template device_context().stream(); - - size_t nums = shifts.size(); - auto input_dim = in->dims(); - auto stride_dim = phi::stride(input_dim); - - std::vector strides(nums), sizes(nums); - if (dims.size() == 0) { - strides[0] = 1; - sizes[0] = numel; - shifts[0] = (shifts[0] % numel + numel) % numel; - } else { - for (size_t i = 0; i < nums; i++) { - int dim = dims[i] >= 0 ? 
dims[i] : dims[i] + input_dim.size(); - int64_t size = input_dim[dim]; - - if (size != 0) { - shifts[i] = (shifts[i] % size + size) % size; - strides[i] = stride_dim[dim]; - sizes[i] = size; - } - } - } - -#define CALL_ROLL_CUDA_KERNEL(N) \ - case N: { \ - phi::Array _strides; \ - phi::Array _shifts; \ - phi::Array _sizes; \ - for (size_t idx = 0; idx < N; ++idx) { \ - _strides[idx] = strides[idx]; \ - _shifts[idx] = shifts[idx]; \ - _sizes[idx] = sizes[idx]; \ - } \ - RollCudaKernel< \ - T, \ - N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \ - PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, numel, \ - _shifts, _strides, _sizes); \ - break; \ - } - - switch (nums) { - CALL_ROLL_CUDA_KERNEL(1); - CALL_ROLL_CUDA_KERNEL(2); - CALL_ROLL_CUDA_KERNEL(3); - CALL_ROLL_CUDA_KERNEL(4); - CALL_ROLL_CUDA_KERNEL(5); - CALL_ROLL_CUDA_KERNEL(6); - CALL_ROLL_CUDA_KERNEL(7); - CALL_ROLL_CUDA_KERNEL(8); - CALL_ROLL_CUDA_KERNEL(9); - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "shifts.size() should be less than 10, But received shifts.size() " - "= %d", - shifts.size())); - } - } -}; - -template -class RollGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input(framework::GradVarName("Out")); - auto* out = context.Output(framework::GradVarName("X")); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - PADDLE_ENFORCE_EQ( - shifts_tensor->dims().size(), 1, - platform::errors::InvalidArgument( - "The rank of ShiftsTensor is expected to be 1, got %s", - shifts_tensor->dims().size())); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - auto stream = - context.template device_context().stream(); - size_t nums = shifts.size(); - auto input_dim = in->dims(); - auto stride_dim = phi::stride(input_dim); - - std::vector strides(nums), sizes(nums); - if (dims.size() == 0) { - strides[0] = 1; - sizes[0] = numel; - shifts[0] = ((-shifts[0]) % numel + numel) % numel; - } else { - for (size_t i = 0; i < nums; i++) { - int dim = dims[i] >= 0 ? 
dims[i] : dims[i] + input_dim.size(); - int64_t size = input_dim[dim]; - if (size != 0) { - shifts[i] = ((-shifts[i]) % size + size) % size; - strides[i] = stride_dim[dim]; - sizes[i] = size; - } - } - } - - switch (nums) { - CALL_ROLL_CUDA_KERNEL(1); - CALL_ROLL_CUDA_KERNEL(2); - CALL_ROLL_CUDA_KERNEL(3); - CALL_ROLL_CUDA_KERNEL(4); - CALL_ROLL_CUDA_KERNEL(5); - CALL_ROLL_CUDA_KERNEL(6); - CALL_ROLL_CUDA_KERNEL(7); - CALL_ROLL_CUDA_KERNEL(8); - CALL_ROLL_CUDA_KERNEL(9); - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "shifts.size() should be less than 10, But received shifts.size() " - "= %d", - shifts.size())); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roll, ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel, - ops::RollKernel>, - ops::RollKernel>); -REGISTER_OP_CUDA_KERNEL( - roll_grad, ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel, - ops::RollGradKernel>, - ops::RollGradKernel>); diff --git a/paddle/fluid/operators/roll_op.h b/paddle/fluid/operators/roll_op.h deleted file mode 100644 index 413c7bcfc15..00000000000 --- a/paddle/fluid/operators/roll_op.h +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DDim = framework::DDim; - -template -inline void shift_along_dim(T* data, const DDim& input_dim, int64_t dim, - int64_t shift) { - if (dim < 0) { - dim += input_dim.size(); - } - if (input_dim[dim] == 0) { - return; - } - shift = shift % input_dim[dim]; - if (shift < 0) { - shift += input_dim[dim]; - } - - auto outer_loops = 1; - for (auto i = 0; i < dim; i++) { - outer_loops *= input_dim[i]; - } - auto slice_width = 1; - for (auto i = dim + 1; i < input_dim.size(); i++) { - slice_width *= input_dim[i]; - } - - VLOG(3) << "shift_along_dim_debug: input_dim: " << input_dim - << "; dim: " << dim << "; shift: " << shift - << "; outer_loops: " << outer_loops - << "; slice_width: " << slice_width; - if (shift == 0) { - return; - } - - std::vector head; - auto head_size = slice_width * (input_dim[dim] - shift); - head.resize(head_size); - - for (auto i = 0; i < outer_loops; i++) { - for (auto j = 0; j < head_size; j++) { - head[j] = data[i * input_dim[dim] * slice_width + j]; - } - for (auto j = input_dim[dim] - shift; j < input_dim[dim]; j++) { - auto dst_pos = j - input_dim[dim] + shift; - for (auto k = 0; k < slice_width; k++) { - data[(i * input_dim[dim] + dst_pos) * slice_width + k] = - data[(i * input_dim[dim] + j) * slice_width + k]; - } - } - for (auto j = 0; j < head_size; j++) { - data[(i * input_dim[dim] + shift) * slice_width + j] = head[j]; - } - } -} - -template -class RollKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input_var = context.InputVar("X"); - auto* output_var = context.OutputVar("Out"); - auto& input = input_var->Get(); - auto* output = output_var->GetMutable(); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - PADDLE_ENFORCE_EQ( - shifts_tensor->dims().size(), 1, - platform::errors::InvalidArgument( - "The rank of ShiftsTensor is expected to be 1, got %s", - shifts_tensor->dims().size())); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - std::vector out_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &out_vec); - - size_t nums = shifts.size(); - DDim input_dim = input.dims(); - - // axis = none, reshape to 1-D tensor - if (dims.size() == 0) { - dims.push_back(0l); - input_dim = framework::Dim<1>(out_vec.size()); - } - - for (size_t i = 0; i < nums; i++) { - PADDLE_ENFORCE_EQ( - dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()), true, - platform::errors::OutOfRange( - "Attr(axis[%d]) is out of range, It's expected " - "to be in range of [-%d, %d]. 
But received Attr(axis[%d]) = %d.", - i, input_dim.size(), input_dim.size() - 1, i, dims[i])); - shift_along_dim(out_vec.data(), input_dim, dims[i], shifts[i]); - } - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(input.dims()); - } -}; - -template -class RollGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input_var = context.InputVar(framework::GradVarName("Out")); - auto* output_var = context.OutputVar(framework::GradVarName("X")); - auto& input = input_var->Get(); - auto* output = output_var->GetMutable(); - std::vector shifts = context.Attr>("shifts"); - if (context.HasInput("ShiftsTensor")) { - const auto* shifts_tensor = - context.Input("ShiftsTensor"); - shifts = GetDataFromTensor(shifts_tensor); - } - std::vector dims = context.Attr>("axis"); - - std::vector out_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &out_vec); - - size_t nums = shifts.size(); - DDim input_dim = input.dims(); - - // axis = none, reshape to 1-D tensor - if (dims.size() == 0) { - dims.push_back(0l); - input_dim = framework::Dim<1>(out_vec.size()); - } - - for (size_t i = 0; i < nums; i++) { - shift_along_dim(out_vec.data(), input_dim, dims[i], 0 - shifts[i]); - } - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(input.dims()); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 752abae1b03..262ada3eaf3 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1016,6 +1016,37 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, ReshapeInferMeta(x, shape, out, config); } +void RollInferMeta(const MetaTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + MetaTensor* out) { + auto shifts_data = shifts.GetData(); + + if (axis.size() != 0) { + PADDLE_ENFORCE_EQ( + axis.size(), + shifts_data.size(), + phi::errors::InvalidArgument("When dims.size() != 0, dims.size() " + "should be equal to " + "shifts.size(). 
But received " + "dims.size() = %d, shifts.size() = %d", + axis.size(), + shifts_data.size())); + } else { + PADDLE_ENFORCE_EQ( + shifts_data.size(), + 1, + phi::errors::InvalidArgument("When dims.size() == 0, shifts.size() " + "should be equal to 1, But received " + "shifts.size() = %d", + shifts_data.size())); + } + + out->set_dims(x.dims()); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + void ShapeInferMeta(const MetaTensor& input, MetaTensor* out) { auto in_dim = input.dims(); out->set_dims(phi::make_ddim({in_dim.size()})); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index a9aefd1f12d..5447c9a573f 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -164,6 +164,11 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void RollInferMeta(const MetaTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + MetaTensor* out); + void ShapeInferMeta(const MetaTensor& input, MetaTensor* out); void ShardIndexInferMeta(const MetaTensor& in, diff --git a/paddle/phi/kernels/cpu/roll_grad_kernel.cc b/paddle/phi/kernels/cpu/roll_grad_kernel.cc new file mode 100644 index 00000000000..b0d0c0663e4 --- /dev/null +++ b/paddle/phi/kernels/cpu/roll_grad_kernel.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roll_grad_kernel.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/roll_kernel_impl.h" + +namespace phi { + +template +void RollGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* x_grad) { + std::vector out_vec; + paddle::framework::TensorToVector(out_grad, dev_ctx, &out_vec); + + auto shifts_data = shifts.GetData(); + size_t nums = shifts_data.size(); + DDim input_dim = out_grad.dims(); + auto dims = axis; + + // axis = none, reshape to 1-D tensor + if (dims.size() == 0) { + dims.push_back(0l); + input_dim = phi::Dim<1>(out_vec.size()); + } + + for (size_t i = 0; i < nums; i++) { + ShiftAlongDim(out_vec.data(), input_dim, dims[i], 0 - shifts_data[i]); + } + + dev_ctx.template Alloc(x_grad); + paddle::framework::TensorFromVector(out_vec, dev_ctx, x_grad); + x_grad->Resize(out_grad.dims()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(roll_grad, + CPU, + ALL_LAYOUT, + phi::RollGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/roll_kernel.cc b/paddle/phi/kernels/cpu/roll_kernel.cc new file mode 100644 index 00000000000..25b64ef257d --- /dev/null +++ b/paddle/phi/kernels/cpu/roll_kernel.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roll_kernel.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/roll_kernel_impl.h" + +namespace phi { + +template +void RollKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* out) { + std::vector out_vec; + paddle::framework::TensorToVector(x, dev_ctx, &out_vec); + + auto shifts_data = shifts.GetData(); + size_t nums = shifts_data.size(); + DDim input_dim = x.dims(); + auto dims = axis; + + // axis = none, reshape to 1-D tensor + if (dims.size() == 0) { + dims.push_back(0l); + input_dim = phi::Dim<1>(out_vec.size()); + } + + for (size_t i = 0; i < nums; i++) { + PADDLE_ENFORCE_EQ( + dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()), + true, + phi::errors::OutOfRange( + "Attr(axis[%d]) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(axis[%d]) = %d.", + i, + input_dim.size(), + input_dim.size() - 1, + i, + dims[i])); + ShiftAlongDim(out_vec.data(), input_dim, dims[i], shifts_data[i]); + } + dev_ctx.template Alloc(out); + paddle::framework::TensorFromVector(out_vec, dev_ctx, out); + out->Resize(x.dims()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(roll, + CPU, + ALL_LAYOUT, + phi::RollKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/roll_kernel_impl.h b/paddle/phi/kernels/cpu/roll_kernel_impl.h new file mode 100644 index 00000000000..924e71aff31 --- /dev/null +++ b/paddle/phi/kernels/cpu/roll_kernel_impl.h @@ -0,0 +1,76 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
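The ShiftAlongDim helper introduced below rotates each slice along the chosen axis in place by buffering the head of the slice. The same per-slice rotation can also be written with std::rotate over blocks of slice_width contiguous elements; the standalone sketch below is not part of the patch, assumes a dense row-major buffer, and uses the hypothetical name RollAxisWithRotate, but it shows the equivalent formulation.

// Standalone sketch: roll a row-major buffer along one axis using std::rotate.
// Equivalent in effect to the head-buffer copy used by ShiftAlongDim below.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

void RollAxisWithRotate(std::vector<float>* data,
                        const std::vector<int64_t>& dims,
                        int64_t dim,
                        int64_t shift) {
  const int64_t size = dims[dim];
  if (size == 0) return;
  int64_t outer = 1, slice = 1;
  for (int64_t i = 0; i < dim; ++i) outer *= dims[i];
  for (int64_t i = dim + 1; i < static_cast<int64_t>(dims.size()); ++i) {
    slice *= dims[i];
  }
  // Normalize to a right-roll amount in [0, size).
  const int64_t s = ((shift % size) + size) % size;
  for (int64_t o = 0; o < outer; ++o) {
    float* begin = data->data() + o * size * slice;
    // Rotating left by (size - s) blocks of `slice` elements rolls right by s.
    std::rotate(begin, begin + (size - s) * slice, begin + size * slice);
  }
}

int main() {
  // A 2 x 3 matrix rolled by 1 along axis 1.
  std::vector<float> m = {0, 1, 2, 3, 4, 5};
  RollAxisWithRotate(&m, {2, 3}, /*dim=*/1, /*shift=*/1);
  for (float v : m) std::cout << v << ' ';  // 2 0 1 5 3 4
  std::cout << '\n';
  return 0;
}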
+ +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +inline void ShiftAlongDim(T* data, + const DDim& input_dim, + int64_t dim, + int64_t shift) { + if (dim < 0) { + dim += input_dim.size(); + } + if (input_dim[dim] == 0) { + return; + } + shift = shift % input_dim[dim]; + if (shift < 0) { + shift += input_dim[dim]; + } + + auto outer_loops = 1; + for (auto i = 0; i < dim; i++) { + outer_loops *= input_dim[i]; + } + auto slice_width = 1; + for (auto i = dim + 1; i < input_dim.size(); i++) { + slice_width *= input_dim[i]; + } + + VLOG(3) << "shift_along_dim_debug: input_dim: " << input_dim + << "; dim: " << dim << "; shift: " << shift + << "; outer_loops: " << outer_loops + << "; slice_width: " << slice_width; + if (shift == 0) { + return; + } + + std::vector head; + auto head_size = slice_width * (input_dim[dim] - shift); + head.resize(head_size); + + for (auto i = 0; i < outer_loops; i++) { + for (auto j = 0; j < head_size; j++) { + head[j] = data[i * input_dim[dim] * slice_width + j]; + } + for (auto j = input_dim[dim] - shift; j < input_dim[dim]; j++) { + auto dst_pos = j - input_dim[dim] + shift; + for (auto k = 0; k < slice_width; k++) { + data[(i * input_dim[dim] + dst_pos) * slice_width + k] = + data[(i * input_dim[dim] + j) * slice_width + k]; + } + } + for (auto j = 0; j < head_size; j++) { + data[(i * input_dim[dim] + shift) * slice_width + j] = head[j]; + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/roll_grad_kernel.cu b/paddle/phi/kernels/gpu/roll_grad_kernel.cu new file mode 100644 index 00000000000..93e9e81882c --- /dev/null +++ b/paddle/phi/kernels/gpu/roll_grad_kernel.cu @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roll_grad_kernel.h" + +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/roll_kernel_impl.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +void RollGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* x_grad) { + auto* in_data = out_grad.data(); + T* out_data = dev_ctx.template Alloc(x_grad); + int64_t numel = out_grad.numel(); + auto stream = dev_ctx.stream(); + + auto shifts_data = shifts.GetData(); + size_t nums = shifts_data.size(); + auto input_dim = out_grad.dims(); + auto stride_dim = phi::stride(input_dim); + + std::vector strides(nums), sizes(nums); + if (axis.size() == 0) { + strides[0] = 1; + sizes[0] = numel; + shifts_data[0] = ((-shifts_data[0]) % numel + numel) % numel; + } else { + for (size_t i = 0; i < nums; i++) { + int dim = axis[i] >= 0 ? 
axis[i] : axis[i] + input_dim.size(); + int64_t size = input_dim[dim]; + if (size != 0) { + shifts_data[i] = ((-shifts_data[i]) % size + size) % size; + strides[i] = stride_dim[dim]; + sizes[i] = size; + } + } + } + + switch (nums) { + CALL_ROLL_CUDA_KERNEL(1); + CALL_ROLL_CUDA_KERNEL(2); + CALL_ROLL_CUDA_KERNEL(3); + CALL_ROLL_CUDA_KERNEL(4); + CALL_ROLL_CUDA_KERNEL(5); + CALL_ROLL_CUDA_KERNEL(6); + CALL_ROLL_CUDA_KERNEL(7); + CALL_ROLL_CUDA_KERNEL(8); + CALL_ROLL_CUDA_KERNEL(9); + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "shifts.size() should be less than 10, But received shifts.size() " + "= %d", + shifts_data.size())); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(roll_grad, + GPU, + ALL_LAYOUT, + phi::RollGradKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/roll_kernel.cu b/paddle/phi/kernels/gpu/roll_kernel.cu new file mode 100644 index 00000000000..1543335d3a0 --- /dev/null +++ b/paddle/phi/kernels/gpu/roll_kernel.cu @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roll_kernel.h" + +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/array.h" +#include "paddle/phi/kernels/gpu/roll_kernel_impl.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +void RollKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* out) { + auto* in_data = x.data(); + T* out_data = dev_ctx.template Alloc(out); + int64_t numel = x.numel(); + auto stream = dev_ctx.stream(); + + auto shifts_data = shifts.GetData(); + + size_t nums = shifts_data.size(); + auto input_dim = x.dims(); + auto stride_dim = phi::stride(input_dim); + + std::vector strides(nums), sizes(nums); + if (axis.size() == 0) { + strides[0] = 1; + sizes[0] = numel; + shifts_data[0] = (shifts_data[0] % numel + numel) % numel; + } else { + for (size_t i = 0; i < nums; i++) { + int dim = axis[i] >= 0 ? 
axis[i] : axis[i] + input_dim.size(); + int64_t size = input_dim[dim]; + + if (size != 0) { + shifts_data[i] = (shifts_data[i] % size + size) % size; + strides[i] = stride_dim[dim]; + sizes[i] = size; + } + } + } + + switch (nums) { + CALL_ROLL_CUDA_KERNEL(1); + CALL_ROLL_CUDA_KERNEL(2); + CALL_ROLL_CUDA_KERNEL(3); + CALL_ROLL_CUDA_KERNEL(4); + CALL_ROLL_CUDA_KERNEL(5); + CALL_ROLL_CUDA_KERNEL(6); + CALL_ROLL_CUDA_KERNEL(7); + CALL_ROLL_CUDA_KERNEL(8); + CALL_ROLL_CUDA_KERNEL(9); + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "shifts.size() should be less than 10, But received shifts.size() " + "= %d", + shifts_data.size())); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(roll, + GPU, + ALL_LAYOUT, + phi::RollKernel, + float, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/roll_kernel_impl.h b/paddle/phi/kernels/gpu/roll_kernel_impl.h new file mode 100644 index 00000000000..abe3ee470b4 --- /dev/null +++ b/paddle/phi/kernels/gpu/roll_kernel_impl.h @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/core/utils/array.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void RollCudaKernel(const T* input, + T* output, + int64_t N, + phi::Array shifts, + phi::Array strides, + phi::Array sizes) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + + int64_t output_idx = idx; + int64_t new_dim_idx = 0; + +#pragma unroll + for (size_t i = 0; i < Rank; i++) { + new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i]; + if (new_dim_idx >= sizes[i]) { + output_idx += (shifts[i] - sizes[i]) * strides[i]; + } else { + output_idx += shifts[i] * strides[i]; + } + } + output[output_idx] = input[idx]; +} + +#define CALL_ROLL_CUDA_KERNEL(N) \ + case N: { \ + phi::Array _strides; \ + phi::Array _shifts; \ + phi::Array _sizes; \ + for (size_t idx = 0; idx < N; ++idx) { \ + _strides[idx] = strides[idx]; \ + _shifts[idx] = shifts_data[idx]; \ + _sizes[idx] = sizes[idx]; \ + } \ + RollCudaKernel< \ + T, \ + N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \ + PADDLE_CUDA_NUM_THREADS, \ + 0, \ + stream>>>(in_data, out_data, numel, _shifts, _strides, _sizes); \ + break; \ + } + +} // namespace phi diff --git a/paddle/phi/kernels/roll_grad_kernel.h b/paddle/phi/kernels/roll_grad_kernel.h new file mode 100644 index 00000000000..331f3626e56 --- /dev/null +++ b/paddle/phi/kernels/roll_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RollGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/roll_kernel.h b/paddle/phi/kernels/roll_kernel.h new file mode 100644 index 00000000000..56f32174a4c --- /dev/null +++ b/paddle/phi/kernels/roll_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RollKernel(const Context& dev_ctx, + const DenseTensor& x, + const ScalarArray& shifts, + const std::vector& axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/roll_sig.cc b/paddle/phi/ops/compat/roll_sig.cc new file mode 100644 index 00000000000..a144f0e8e8a --- /dev/null +++ b/paddle/phi/ops/compat/roll_sig.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
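The grad kernels above reuse the forward machinery with the shifts negated, and the CUDA path maps every flat input index to its rolled output index from the (stride, size, shift) triple of each rolled axis. The standalone host-side sketch below reproduces that index arithmetic so it can be checked without a GPU; it is not part of the patch and RolledOutputIndex is a hypothetical name.

// Standalone sketch of the index arithmetic used by RollCudaKernel above.
// The strides/sizes/shifts vectors carry only the rolled axes, with shifts
// already normalized into [0, size), mirroring the kernel's setup code.
#include <cstdint>
#include <iostream>
#include <vector>

int64_t RolledOutputIndex(int64_t idx,
                          const std::vector<int64_t>& strides,
                          const std::vector<int64_t>& sizes,
                          const std::vector<int64_t>& shifts) {
  int64_t output_idx = idx;
  for (size_t i = 0; i < strides.size(); ++i) {
    // Coordinate of idx along axis i, shifted with wraparound.
    const int64_t dim_idx = (idx / strides[i]) % sizes[i];
    if (dim_idx + shifts[i] >= sizes[i]) {
      output_idx += (shifts[i] - sizes[i]) * strides[i];
    } else {
      output_idx += shifts[i] * strides[i];
    }
  }
  return output_idx;
}

int main() {
  // A 2 x 3 row-major tensor where only the last axis (stride 1, size 3)
  // is rolled by 1, so only that axis appears in the vectors.
  const std::vector<int64_t> strides = {1};
  const std::vector<int64_t> sizes = {3};
  const std::vector<int64_t> shifts = {1};
  std::vector<int> in = {0, 1, 2, 3, 4, 5}, out(6);
  for (int64_t idx = 0; idx < 6; ++idx) {
    out[RolledOutputIndex(idx, strides, sizes, shifts)] = in[idx];
  }
  for (int v : out) std::cout << v << ' ';  // 2 0 1 5 3 4
  std::cout << '\n';
  return 0;
}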
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature RollOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("ShiftsTensor")) { + return KernelSignature("roll", {"X"}, {"ShiftsTensor", "axis"}, {"Out"}); + } + return KernelSignature("roll", {"X"}, {"shifts", "axis"}, {"Out"}); +} + +KernelSignature RollGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("roll_grad", + {"X", GradVarName("Out")}, + {"shifts", "axis"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(roll, phi::RollOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roll_grad, phi::RollGradOpArgumentMapping); -- GitLab From 3898080ebccd27d2517df4e3bcf468cd8e04af12 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 16 Mar 2022 16:25:54 +0800 Subject: [PATCH 103/176] [Phi] Move roi_align grad kernel and infershape from fuild to phi (#40556) * move roi_align_grad kernel * move roi_align grad kernel and infershape to phi * remove roi_align infershape --- paddle/fluid/imperative/prepared_operator.h | 1 - paddle/fluid/operators/roi_align_op.cc | 89 +----- paddle/fluid/operators/roi_align_op.cu | 227 --------------- paddle/fluid/operators/roi_align_op.h | 196 ------------- paddle/phi/infermeta/backward.h | 4 + paddle/phi/infermeta/binary.h | 2 + paddle/phi/infermeta/multiary.h | 17 ++ paddle/phi/infermeta/nullary.h | 2 + paddle/phi/infermeta/ternary.cc | 77 ++++++ paddle/phi/infermeta/ternary.h | 13 + paddle/phi/infermeta/unary.h | 2 + .../phi/kernels/cpu/roi_align_grad_kernel.cc | 203 ++++++++++++++ paddle/phi/kernels/cpu/roi_align_kernel.cc | 4 +- .../phi/kernels/gpu/roi_align_grad_kernel.cu | 260 ++++++++++++++++++ paddle/phi/kernels/gpu/roi_align_kernel.cu | 8 +- paddle/phi/kernels/roi_align_grad_kernel.h | 35 +++ paddle/phi/kernels/roi_align_kernel.h | 2 +- paddle/phi/ops/compat/roi_align_sig.cc | 17 +- 18 files changed, 646 insertions(+), 513 deletions(-) delete mode 100644 paddle/fluid/operators/roi_align_op.cu delete mode 100644 paddle/fluid/operators/roi_align_op.h create mode 100644 paddle/phi/kernels/cpu/roi_align_grad_kernel.cc create mode 100644 paddle/phi/kernels/gpu/roi_align_grad_kernel.cu create mode 100644 paddle/phi/kernels/roi_align_grad_kernel.h diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 8deb3b93e9c..16f2df79246 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -341,7 +341,6 @@ void BuildDygraphPhiKernelContext( } for (size_t i = 0; i < attr_names.size(); ++i) { - VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i]; if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) { if (attrs.find(attr_names[i]) != attrs.end()) { // shape is in the attribute diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index ac0cd75237b..bf78b6a6965 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -9,9 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/roi_align_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,79 +26,6 @@ class ROIAlignOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of ROIAlignOp " - "is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true, - platform::errors::NotFound("Input(ROIs) of ROIAlignOp " - "is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of ROIAlignOp " - "is not found.")); - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ( - rois_num_dims.size(), 1, - platform::errors::InvalidArgument("The size of RoisNum should be 1" - ", but received size = %d", - rois_num_dims.size())); - } - PADDLE_ENFORCE_EQ( - input_dims.size(), 4, - platform::errors::InvalidArgument( - "The format of Input(X) in" - "RoIAlignOp is NCHW. And the rank of input must be 4. " - "But received rank = %d", - input_dims.size())); - PADDLE_ENFORCE_EQ(rois_dims.size(), 2, platform::errors::InvalidArgument( - "The rank of Input(ROIs) " - "in RoIAlignOp should be 2. " - "But the rank of RoIs is %d", - rois_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(rois_dims[1], 4, - platform::errors::InvalidArgument( - "The second dimension " - "of Input(ROIs) should be 4. But received the " - "dimension = %d", - rois_dims[1])); - } - int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::InvalidArgument( - "The 'pooled_height' attribute in RoIAlignOp is " - "invalid. The height must be greater than 0. But " - "received 'pooled_height' = %d", - pooled_height)); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::InvalidArgument( - "The 'pooled_width' attribute in RoIAlignOp is " - "invalid. The width must be greater than 0. But " - "received 'pooled_width' = %d", - pooled_width)); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::InvalidArgument( - "The 'spatial_scale' attribute in RoIAlignOp is " - "invalid. The scale must be greater than 0. 
But " - "received 'spatial_scale' = %f", - spatial_scale)); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = input_dims[1]; - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - - ctx->SetOutputDim("Out", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -221,17 +151,16 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(RoiAlignGradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(roi_align, RoiAlignInferShapeFunctor, + PD_INFER_META(phi::RoiAlignInferMeta)); + REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker, ops::ROIAlignGradMaker, - ops::ROIAlignGradMaker); + ops::ROIAlignGradMaker, + RoiAlignInferShapeFunctor); REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp, ops::RoiAlignGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - roi_align_grad, - ops::CPUROIAlignGradOpKernel, - ops::CPUROIAlignGradOpKernel, - ops::CPUROIAlignGradOpKernel); REGISTER_OP_VERSION(roi_align) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu deleted file mode 100644 index 1a2e64cd45c..00000000000 --- a/paddle/fluid/operators/roi_align_op.cu +++ /dev/null @@ -1,227 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/roi_align_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaxinumNumBlocks = 4096; -static constexpr int kROISize = 4; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -__device__ void BilinearInterpolateGradient(const int height, const int width, - T y, T x, T* w1, T* w2, T* w3, - T* w4, int* x_low, int* x_high, - int* y_low, int* y_high) { - if (y < -1.0 || y > height || x < -1.0 || x > width) { - return; - } - - y = y <= 0 ? 0 : y; - x = x <= 0 ? 0 : x; - *y_low = static_cast(y); - *x_low = static_cast(x); - if (*y_low >= height - 1) { - *y_high = *y_low = height - 1; - y = static_cast(*y_low); - } else { - *y_high = *y_low + 1; - } - if (*x_low >= width - 1) { - *x_high = *x_low = width - 1; - x = static_cast(*x_low); - } else { - *x_high = *x_low + 1; - } - T ly = y - *y_low, lx = x - *x_low; - T hy = 1. - ly, hx = 1. 
- lx; - *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; - - return; -} - -template -__global__ void GPUROIAlignBackward( - const int nthreads, const T* input_rois, const T* out_grad, - const int num_rois, const float spatial_scale, const int channels, - const int height, const int width, const int pooled_height, - const int pooled_width, const int sampling_ratio, int* roi_batch_id_data, - T* input_grad, const bool continuous_coordinate) { - CUDA_KERNEL_LOOP(i, nthreads) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - const T* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = roi_batch_id_data[n]; - - T roi_offset = continuous_coordinate ? T(0.5) : 0; - T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; - T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; - T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; - T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - if (!continuous_coordinate) { - roi_width = max(roi_width, static_cast(1.)); - roi_height = max(roi_height, static_cast(1.)); - } - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - T* offset_input_grad = - input_grad + (roi_batch_ind * channels + c) * height * width; - - const T* offset_out_grad = - out_grad + (n * channels + c) * pooled_height * pooled_width; - const T out_grad_this_bin = offset_out_grad[ph * pooled_width + pw]; - - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = - (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); - - const T count = roi_bin_grid_h * roi_bin_grid_w; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_ymin + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_xmin + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - T w1 = 0, w2 = 0, w3 = 0, w4 = 0; - int x_low = -1, x_high = -1, y_low = -1, y_high = -1; - BilinearInterpolateGradient(height, width, y, x, &w1, &w2, &w3, &w4, - &x_low, &x_high, &y_low, &y_high); - T diff1 = out_grad_this_bin * w1 / count; - T diff2 = out_grad_this_bin * w2 / count; - T diff3 = out_grad_this_bin * w3 / count; - T diff4 = out_grad_this_bin * w4 / count; - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_low, - diff1); - platform::CudaAtomicAdd(offset_input_grad + y_low * width + x_high, - diff2); - platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_low, - diff3); - platform::CudaAtomicAdd(offset_input_grad + y_high * width + x_high, - diff4); - } - } - } - } -} - -template -class GPUROIAlignGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - int rois_num = rois->dims()[0]; - int channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (!in_grad) { - return; - } - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - auto roi_ptr = - memory::Alloc(dev_ctx, roi_batch_id_list.numel() * sizeof(int)); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - int bytes = roi_batch_id_list.numel() * sizeof(int); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - in_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, in_grad, static_cast(0)); - - int output_grad_size = out_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - GPUROIAlignBackward<<>>( - output_grad_size, rois->data(), out_grad->data(), rois_num, 
- spatial_scale, channels, height, width, pooled_height, pooled_width, - sampling_ratio, roi_id_data, in_grad->mutable_data(ctx.GetPlace()), - aligned); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roi_align_grad, - ops::GPUROIAlignGradOpKernel, - ops::GPUROIAlignGradOpKernel); diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h deleted file mode 100644 index 589e35e4ab7..00000000000 --- a/paddle/fluid/operators/roi_align_op.h +++ /dev/null @@ -1,196 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -void bilinear_interpolate_gradient(const int height, const int width, T y, T x, - const T out_grad_this_bin, const T count, - T* batch_grad_data) { - int x_low, y_low, x_high, y_high; - T w1, w2, w3, w4; - if (y < -1.0 || y > height || x < -1.0 || x > width) { - w1 = w2 = w3 = w4 = 0; - x_low = x_high = y_low = y_high = -1; - return; - } - y = y <= 0 ? 0 : y; - x = x <= 0 ? 0 : x; - y_low = static_cast(y); - x_low = static_cast(x); - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = static_cast(y_low); - } else { - y_high = y_low + 1; - } - - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = static_cast(x_low); - } else { - x_high = x_low + 1; - } - - T ly = y - y_low, lx = x - x_low; - T hy = 1. - ly, hx = 1. 
- lx; - w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - T diff1 = out_grad_this_bin * w1 / count; - T diff2 = out_grad_this_bin * w2 / count; - T diff3 = out_grad_this_bin * w3 / count; - T diff4 = out_grad_this_bin * w4 / count; - if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { - *(batch_grad_data + y_low * width + x_low) += diff1; - *(batch_grad_data + y_low * width + x_high) += diff2; - *(batch_grad_data + y_high * width + x_low) += diff3; - *(batch_grad_data + y_high * width + x_high) += diff4; - } -} - -template -class CPUROIAlignGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto in_dims = in->dims(); - auto aligned = ctx.Attr("aligned"); - - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - if (!in_grad) { - return; - } - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - in_grad->mutable_data(ctx.GetPlace()); - auto& dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, in_grad, static_cast(0)); - - int output_grad_size = out_grad->numel(); - - if ((!out_grad->IsInitialized()) || (output_grad_size <= 0)) { - return; - } - - const T* rois_data = rois->data(); - const T* out_grad_data = out_grad->data(); - T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - - auto in_stride = phi::stride(in->dims()); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out_grad->dims()); - - T roi_offset = aligned ? 
T(0.5) : 0; - for (int n = 0; n < rois_num; ++n) { - int roi_batch_idx = roi_batch_id_data[n]; - T roi_xmin = rois_data[0] * spatial_scale - roi_offset; - T roi_ymin = rois_data[1] * spatial_scale - roi_offset; - T roi_xmax = rois_data[2] * spatial_scale - roi_offset; - T roi_ymax = rois_data[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); - if (!aligned) { - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); - } - - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - for (int c = 0; c < channels; ++c) { - T* batch_grad_data = - in_grad_data + roi_batch_idx * in_stride[0] + c * in_stride[1]; - const T* batch_out_grad_data = - out_grad_data + n * out_stride[0] + c * out_stride[1]; - for (int ph = 0; ph < pooled_height; ++ph) { - for (int pw = 0; pw < pooled_width; ++pw) { - int pool_index = ph * pooled_width + pw; - T out_grad_this_bin = batch_out_grad_data[pool_index]; - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_width / pooled_width); - T count = roi_bin_grid_h * roi_bin_grid_w; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_ymin + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_xmin + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - bilinear_interpolate_gradient(height, width, y, x, - out_grad_this_bin, count, - batch_grad_data); - } - } - } - } - } - rois_data += roi_stride[0]; - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 06ee5a205d7..260fbfe7197 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -21,6 +21,10 @@ limitations under the License. */ namespace phi { +// Common InferMeta Functions for backward operators. +// +// NOTE: The InferMeta Functions in this file are arranged in alphabetic order. + void BilinearTensorProductGradInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 1727e85b1d5..8cf7ce3930e 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -29,6 +29,8 @@ namespace phi { // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. +// +// The InferMeta Functions in this file are arranged in alphabetic order. void AllValueCompareInferMeta(const MetaTensor& x, const MetaTensor& y, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 0bdd35d5f58..6de95386dd9 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -18,6 +18,23 @@ limitations under the License. */ #include "paddle/phi/core/meta_tensor.h" namespace phi { +// Common InferMeta Functions for multiary operators, The format like: +// +// 1. 
The number of input MetaTensor is more than 3: +// void [FunctionDesc|OpName]InferMeta(const MetaTensor& x, +// const MetaTensor& y, +// const MetaTensor& z, +// const MetaTensor& w, +// ..., +// MetaTensor* out) {} +// +// 2. There are `const vector&` in params: +// void [FunctionDesc|OpName]InferMeta(const vector& x, +// ..., +// MetaTensor* out) {} +// +// NOTE: The InferMeta Functions in this file are arranged in alphabetic order. + std::vector GetMetaTensorsDim(const std::vector& tensors); void AdadeltaInferMeta(const MetaTensor& param, diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h index 38eaa636f8c..55e59b27e71 100644 --- a/paddle/phi/infermeta/nullary.h +++ b/paddle/phi/infermeta/nullary.h @@ -27,6 +27,8 @@ namespace phi { // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. +// +// The InferMeta Functions in this file are arranged in alphabetic order. void CreateInferMeta(const ScalarArray& shape, DataType dtype, MetaTensor* out); diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 235cfe368c1..837750710c9 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -322,6 +322,83 @@ void NllLossRawInferMeta(const MetaTensor& input, total_weight->set_dtype(input.dtype()); } +void RoiAlignInferMeta(const MetaTensor& x, + const MetaTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + MetaTensor* out, + MetaConfig config) { + auto input_dims = x.dims(); + auto boxes_dims = boxes.dims(); + + if (boxes_num) { + auto boxes_num_dims = boxes_num->dims(); + PADDLE_ENFORCE_EQ( + boxes_num_dims.size(), + 1, + phi::errors::InvalidArgument("The size of RoisNum should be 1" + ", but received size = %d", + boxes_num_dims.size())); + } + PADDLE_ENFORCE_EQ(input_dims.size(), + 4, + phi::errors::InvalidArgument( + "The format of Input(X) in" + "RoIAlignOp is NCHW. And the rank of input must be 4. " + "But received rank = %d", + input_dims.size())); + PADDLE_ENFORCE_EQ(boxes_dims.size(), + 2, + phi::errors::InvalidArgument("The rank of Input(ROIs) " + "in RoIAlignOp should be 2. " + "But the rank of RoIs is %d", + boxes_dims.size())); + if (config.is_runtime) { + PADDLE_ENFORCE_EQ(boxes_dims[1], + 4, + phi::errors::InvalidArgument( + "The second dimension " + "of Input(ROIs) should be 4. But received the " + "dimension = %d", + boxes_dims[1])); + } + + PADDLE_ENFORCE_GT(pooled_height, + 0, + phi::errors::InvalidArgument( + "The 'pooled_height' attribute in RoIAlignOp is " + "invalid. The height must be greater than 0. But " + "received 'pooled_height' = %d", + pooled_height)); + PADDLE_ENFORCE_GT(pooled_width, + 0, + phi::errors::InvalidArgument( + "The 'pooled_width' attribute in RoIAlignOp is " + "invalid. The width must be greater than 0. But " + "received 'pooled_width' = %d", + pooled_width)); + PADDLE_ENFORCE_GT(spatial_scale, + 0.0f, + phi::errors::InvalidArgument( + "The 'spatial_scale' attribute in RoIAlignOp is " + "invalid. The scale must be greater than 0. 
But " + "received 'spatial_scale' = %f", + spatial_scale)); + + auto out_dims = input_dims; + out_dims[0] = boxes_dims[0]; + out_dims[1] = input_dims[1]; + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + + out->set_dims(out_dims); + out->set_dtype(x.dtype()); +} + void ScatterInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& updates, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 209a07db18b..0e7b9cb12a4 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -30,6 +30,8 @@ namespace phi { // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. // +// The InferMeta Functions in this file are arranged in alphabetic order. + void AccuracyInferMeta(const MetaTensor& out, const MetaTensor& indice, const MetaTensor& label, @@ -71,6 +73,17 @@ void NllLossRawInferMeta(const MetaTensor& input, MetaTensor* total_weight, MetaConfig config = MetaConfig()); +void RoiAlignInferMeta(const MetaTensor& x, + const MetaTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void ScatterInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& updates, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 5447c9a573f..3dfc9b797c0 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -31,6 +31,8 @@ class MetaConfig; // NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. // Because functions in this file not only can infer shape, but also need // infer lod or other useful data. +// +// The InferMeta Functions in this file are arranged in alphabetic order. void ArgMinMaxInferMeta(const MetaTensor& x, int64_t axis, diff --git a/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc new file mode 100644 index 00000000000..a91b8b6c1fc --- /dev/null +++ b/paddle/phi/kernels/cpu/roi_align_grad_kernel.cc @@ -0,0 +1,203 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_align_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void bilinear_interpolate_gradient(const int height, + const int width, + T y, + T x, + const T out_grad_this_bin, + const T count, + T* batch_grad_data) { + int x_low, y_low, x_high, y_high; + T w1, w2, w3, w4; + if (y < -1.0 || y > height || x < -1.0 || x > width) { + w1 = w2 = w3 = w4 = 0; + x_low = x_high = y_low = y_high = -1; + return; + } + y = y <= 0 ? 0 : y; + x = x <= 0 ? 
0 : x; + y_low = static_cast(y); + x_low = static_cast(x); + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + + T ly = y - y_low, lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + T diff1 = out_grad_this_bin * w1 / count; + T diff2 = out_grad_this_bin * w2 / count; + T diff3 = out_grad_this_bin * w3 / count; + T diff4 = out_grad_this_bin * w4 / count; + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + *(batch_grad_data + y_low * width + x_low) += diff1; + *(batch_grad_data + y_low * width + x_high) += diff2; + *(batch_grad_data + y_high * width + x_low) += diff3; + *(batch_grad_data + y_high * width + x_high) += diff4; + } +} + +template +void RoiAlignGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* dx) { + auto in_dims = x.dims(); + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = boxes.dims()[0]; + + if (!dx) { + return; + } + + DenseTensor roi_batch_id_list = Empty(dev_ctx, {rois_num}); + int* box_batch_id_data = roi_batch_id_list.data(); + + int boxes_batch_size; + if (boxes_num) { + boxes_batch_size = boxes_num->numel(); + auto* boxes_num_data = boxes_num->data(); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_data[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_data[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + boxes_batch_size = boxes_lod.size() - 1; + for (int n = 0; n < boxes_batch_size; ++n) { + for (std::size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + dev_ctx.template Alloc(dx); + + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, dx, static_cast(0)); + + int output_grad_size = out_grad.numel(); + + if ((!out_grad.IsInitialized()) || (output_grad_size <= 0)) { + return; + } + + const T* boxes_data = boxes.data(); + const T* out_grad_data = out_grad.data(); + T* dx_data = dev_ctx.template Alloc(dx); + + auto in_stride = phi::stride(x.dims()); + auto roi_stride = phi::stride(boxes.dims()); + auto out_stride = phi::stride(out_grad.dims()); + + T roi_offset = aligned ? 
T(0.5) : 0; + for (int n = 0; n < rois_num; ++n) { + int box_batch_idx = box_batch_id_data[n]; + T roi_xmin = boxes_data[0] * spatial_scale - roi_offset; + T roi_ymin = boxes_data[1] * spatial_scale - roi_offset; + T roi_xmax = boxes_data[2] * spatial_scale - roi_offset; + T roi_ymax = boxes_data[3] * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + if (!aligned) { + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + for (int c = 0; c < channels; ++c) { + T* batch_grad_data = + dx_data + box_batch_idx * in_stride[0] + c * in_stride[1]; + const T* batch_out_grad_data = + out_grad_data + n * out_stride[0] + c * out_stride[1]; + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int pool_index = ph * pooled_width + pw; + T out_grad_this_bin = batch_out_grad_data[pool_index]; + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + int roi_bin_grid_w = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_width / pooled_width); + T count = roi_bin_grid_h * roi_bin_grid_w; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + bilinear_interpolate_gradient(height, + width, + y, + x, + out_grad_this_bin, + count, + batch_grad_data); + } + } + } + } + } + boxes_data += roi_stride[0]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(roi_align_grad, + CPU, + ALL_LAYOUT, + phi::RoiAlignGradKernel, + float, + double, + int) {} diff --git a/paddle/phi/kernels/cpu/roi_align_kernel.cc b/paddle/phi/kernels/cpu/roi_align_kernel.cc index 35ab99a98eb..4752a9b3a48 100644 --- a/paddle/phi/kernels/cpu/roi_align_kernel.cc +++ b/paddle/phi/kernels/cpu/roi_align_kernel.cc @@ -179,7 +179,7 @@ void AvgPool(const std::vector& interpolated_values, } template -void ROIAlignKernel(const Context& dev_ctx, +void RoiAlignKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, paddle::optional boxes_num, @@ -315,4 +315,4 @@ void ROIAlignKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL( - roi_align, CPU, ALL_LAYOUT, phi::ROIAlignKernel, float, double, int) {} + roi_align, CPU, ALL_LAYOUT, phi::RoiAlignKernel, float, double, int) {} diff --git a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu new file mode 100644 index 00000000000..cf076128b69 --- /dev/null +++ b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu @@ -0,0 +1,260 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_align_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; +static constexpr int kROISize = 4; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__device__ void BilinearInterpolateGradient(const int height, + const int width, + T y, + T x, + T* w1, + T* w2, + T* w3, + T* w4, + int* x_low, + int* x_high, + int* y_low, + int* y_high) { + if (y < -1.0 || y > height || x < -1.0 || x > width) { + return; + } + + y = y <= 0 ? 0 : y; + x = x <= 0 ? 0 : x; + *y_low = static_cast(y); + *x_low = static_cast(x); + if (*y_low >= height - 1) { + *y_high = *y_low = height - 1; + y = static_cast(*y_low); + } else { + *y_high = *y_low + 1; + } + if (*x_low >= width - 1) { + *x_high = *x_low = width - 1; + x = static_cast(*x_low); + } else { + *x_high = *x_low + 1; + } + T ly = y - *y_low, lx = x - *x_low; + T hy = 1. - ly, hx = 1. - lx; + *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; + + return; +} + +template +__global__ void GPURoiAlignBackward(const int nthreads, + const T* input_rois, + const T* out_grad, + const int num_rois, + const float spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + int* roi_batch_id_data, + T* input_grad, + const bool continuous_coordinate) { + CUDA_KERNEL_LOOP(i, nthreads) { + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + const T* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = roi_batch_id_data[n]; + + T roi_offset = continuous_coordinate ? 
T(0.5) : 0; + T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; + T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; + T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; + T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + if (!continuous_coordinate) { + roi_width = max(roi_width, static_cast(1.)); + roi_height = max(roi_height, static_cast(1.)); + } + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T* offset_input_grad = + input_grad + (roi_batch_ind * channels + c) * height * width; + + const T* offset_out_grad = + out_grad + (n * channels + c) * pooled_height * pooled_width; + const T out_grad_this_bin = offset_out_grad[ph * pooled_width + pw]; + + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + const T count = roi_bin_grid_h * roi_bin_grid_w; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T w1 = 0, w2 = 0, w3 = 0, w4 = 0; + int x_low = -1, x_high = -1, y_low = -1, y_high = -1; + BilinearInterpolateGradient(height, + width, + y, + x, + &w1, + &w2, + &w3, + &w4, + &x_low, + &x_high, + &y_low, + &y_high); + T diff1 = out_grad_this_bin * w1 / count; + T diff2 = out_grad_this_bin * w2 / count; + T diff3 = out_grad_this_bin * w3 / count; + T diff4 = out_grad_this_bin * w4 / count; + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + paddle::platform::CudaAtomicAdd( + offset_input_grad + y_low * width + x_low, diff1); + paddle::platform::CudaAtomicAdd( + offset_input_grad + y_low * width + x_high, diff2); + paddle::platform::CudaAtomicAdd( + offset_input_grad + y_high * width + x_low, diff3); + paddle::platform::CudaAtomicAdd( + offset_input_grad + y_high * width + x_high, diff4); + } + } + } + } +} + +template +void RoiAlignGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* dx) { + int rois_num = boxes.dims()[0]; + int channels = x.dims()[1]; + int height = x.dims()[2]; + int width = x.dims()[3]; + + if (!dx) { + return; + } + + DenseTensor box_batch_id_list; + box_batch_id_list.Resize({rois_num}); + int* box_batch_size = dev_ctx.template HostAlloc(&box_batch_id_list); + + auto cplace = phi::CPUPlace(); + auto gplace = dev_ctx.GetPlace(); + if (boxes_num) { + int boxes_batch_size = boxes_num->numel(); + std::vector boxes_num_list(boxes_batch_size); + paddle::memory::Copy(cplace, + boxes_num_list.data(), + gplace, + boxes_num->data(), + sizeof(int) * boxes_batch_size, + 0); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = start; i < start + boxes_num_list[n]; ++i) { + box_batch_size[i] = n; + } + start += boxes_num_list[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + int boxes_batch_size = boxes_lod.size() - 1; + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = 
boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_size[i] = n; + } + } + } + auto roi_ptr = + paddle::memory::Alloc(dev_ctx, box_batch_id_list.numel() * sizeof(int)); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + int bytes = box_batch_id_list.numel() * sizeof(int); + paddle::memory::Copy( + gplace, roi_id_data, cplace, box_batch_size, bytes, dev_ctx.stream()); + dev_ctx.template Alloc(dx); + + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, dx, static_cast(0)); + + int output_grad_size = out_grad.numel(); + int blocks = NumBlocks(output_grad_size); + int threads = kNumCUDAThreads; + + if (output_grad_size > 0) { + GPURoiAlignBackward<<>>( + output_grad_size, + boxes.data(), + out_grad.data(), + rois_num, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + roi_id_data, + dx->data(), + aligned); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_align_grad, GPU, ALL_LAYOUT, phi::RoiAlignGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/roi_align_kernel.cu b/paddle/phi/kernels/gpu/roi_align_kernel.cu index 2f906fa4f66..cd4ed29cdd1 100644 --- a/paddle/phi/kernels/gpu/roi_align_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu @@ -71,7 +71,7 @@ __device__ T BilinearInterpolate( } template -__global__ void GPUROIAlignForward(const int nthreads, +__global__ void GPURoiAlignForward(const int nthreads, const T* input_data, const T* input_rois, const float spatial_scale, @@ -137,7 +137,7 @@ __global__ void GPUROIAlignForward(const int nthreads, } template -void ROIAlignKernel(const Context& dev_ctx, +void RoiAlignKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, paddle::optional boxes_num, @@ -233,7 +233,7 @@ void ROIAlignKernel(const Context& dev_ctx, int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); paddle::memory::Copy( gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream()); - GPUROIAlignForward<<>>( + GPURoiAlignForward<<>>( output_size, x.data(), boxes.data(), @@ -252,4 +252,4 @@ void ROIAlignKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL( - roi_align, GPU, ALL_LAYOUT, phi::ROIAlignKernel, float, double) {} + roi_align, GPU, ALL_LAYOUT, phi::RoiAlignKernel, float, double) {} diff --git a/paddle/phi/kernels/roi_align_grad_kernel.h b/paddle/phi/kernels/roi_align_grad_kernel.h new file mode 100644 index 00000000000..eea1fa03886 --- /dev/null +++ b/paddle/phi/kernels/roi_align_grad_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
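For reference, a standalone sketch of the neighbour/weight computation that BilinearInterpolateGradient performs for a single sample point; the sample coordinates are made-up values and the snippet is illustrative, not part of the patch. Because the four weights sum to one, the atomicAdd calls above deposit out_grad_this_bin / count exactly once per sample:

// Illustrative only: the four bilinear weights and neighbour indices for one
// sample location, mirroring the GPU/CPU gradient kernels in this commit.
#include <cstdio>

int main() {
  const int height = 8, width = 8;        // made-up feature-map size
  float y = 2.3f, x = 5.7f;               // made-up sample location
  if (y < -1.f || y > height || x < -1.f || x > width) return 0;  // sample ignored
  y = y <= 0.f ? 0.f : y;
  x = x <= 0.f ? 0.f : x;
  int y_low = static_cast<int>(y);
  int x_low = static_cast<int>(x);
  int y_high, x_high;
  if (y_low >= height - 1) { y_high = y_low = height - 1; y = static_cast<float>(y_low); }
  else                     { y_high = y_low + 1; }
  if (x_low >= width - 1)  { x_high = x_low = width - 1; x = static_cast<float>(x_low); }
  else                     { x_high = x_low + 1; }
  float ly = y - y_low, lx = x - x_low, hy = 1.f - ly, hx = 1.f - lx;
  float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
  std::printf("neighbours (%d,%d) (%d,%d) (%d,%d) (%d,%d)\n",
              y_low, x_low, y_low, x_high, y_high, x_low, y_high, x_high);
  std::printf("weights %.2f %.2f %.2f %.2f (sum %.2f)\n",
              w1, w2, w3, w4, w1 + w2 + w3 + w4);
  return 0;
}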
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void RoiAlignGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/roi_align_kernel.h b/paddle/phi/kernels/roi_align_kernel.h index 16b52c563a5..9734da53b7f 100644 --- a/paddle/phi/kernels/roi_align_kernel.h +++ b/paddle/phi/kernels/roi_align_kernel.h @@ -20,7 +20,7 @@ namespace phi { template -void ROIAlignKernel(const Context& dev_ctx, +void RoiAlignKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& boxes, paddle::optional boxes_num, diff --git a/paddle/phi/ops/compat/roi_align_sig.cc b/paddle/phi/ops/compat/roi_align_sig.cc index 0549103b6fb..1717ec8f788 100644 --- a/paddle/phi/ops/compat/roi_align_sig.cc +++ b/paddle/phi/ops/compat/roi_align_sig.cc @@ -16,7 +16,7 @@ namespace phi { -KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { +KernelSignature RoiAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("roi_align", {"X", "ROIs", "RoisNum"}, {"pooled_height", @@ -27,6 +27,19 @@ KernelSignature ROIAlignOpArgumentMapping(const ArgumentMappingContext& ctx) { {"Out"}); } +KernelSignature RoiAlignGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("roi_align_grad", + {"X", "ROIs", "RoisNum", GradVarName("Out")}, + {"pooled_height", + "pooled_width", + "spatial_scale", + "sampling_ratio", + "aligned"}, + {GradVarName("X")}); +} + } // namespace phi -PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::ROIAlignOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roi_align, phi::RoiAlignOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roi_align_grad, phi::RoiAlignGradOpArgumentMapping); -- GitLab From 2f5fb031f96bc3c91da704b7fc8471ff6f124398 Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Wed, 16 Mar 2022 16:34:34 +0800 Subject: [PATCH 104/176] Restructure sparse conv (#40570) restructure conv --- paddle/phi/kernels/funcs/sparse/convolution.h | 20 +- .../kernels/sparse/convolution_grad_kernel.h | 4 +- paddle/phi/kernels/sparse/cpu/convolution.h | 14 +- .../sparse/cpu/convolution_grad_kernel.cc | 4 +- .../kernels/sparse/cpu/convolution_kernel.cc | 9 +- .../phi/kernels/sparse/gpu/convolution.cu.h | 493 +++++++++++++++++ .../sparse/gpu/convolution_grad_kernel.cu | 13 +- .../kernels/sparse/gpu/convolution_kernel.cu | 508 +----------------- .../kernels/test_sparse_conv3d_dev_api.cc | 42 +- 9 files changed, 555 insertions(+), 552 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h index 68fe8880a97..d82d793e534 100644 --- a/paddle/phi/kernels/funcs/sparse/convolution.h +++ b/paddle/phi/kernels/funcs/sparse/convolution.h @@ -93,7 +93,7 @@ inline HOSTDEVICE void IndexToPoint( } inline void GetOutShape(const DDim& x_dims, - const DDim& kernel_dims, + const std::vector& kernel_sizes, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -102,17 +102,17 @@ inline void GetOutShape(const DDim& x_dims, x_dims.size(), 5, phi::errors::InvalidArgument("the shape of x should be (N, D, H, W, C)")); - PADDLE_ENFORCE_EQ(kernel_dims.size(), + PADDLE_ENFORCE_EQ(kernel_sizes.size(), 5, phi::errors::InvalidArgument( "the shape 
of kernel should be (D, H, W, C, OC)")); // infer out shape (*out_dims)[0] = x_dims[0]; - (*out_dims)[4] = kernel_dims[4]; + (*out_dims)[4] = kernel_sizes[4]; for (int i = 1; i < 4; i++) { (*out_dims)[i] = (x_dims[i] + 2 * paddings[i - 1] - - dilations[i - 1] * (kernel_dims[i - 1] - 1) - 1) / + dilations[i - 1] * (kernel_sizes[i - 1] - 1) - 1) / strides[i - 1] + 1; } @@ -131,7 +131,7 @@ template inline void SubmPreProcess(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const int in_channels, const int out_channels, const int half_kernel_size, @@ -142,11 +142,11 @@ inline void SubmPreProcess(const Context& dev_ctx, blas.GEMM(CblasTrans, CblasNoTrans, x.non_zero_elements().dims()[1], - out_grad.non_zero_elements().dims()[1], + out_grad.dims()[1], x.non_zero_elements().dims()[0], static_cast(1), x.non_zero_elements().data(), - out_grad.non_zero_elements().data(), + out_grad.data(), static_cast(0), d_kernel_ptr + half_kernel_size * in_channels * out_channels); @@ -155,11 +155,11 @@ inline void SubmPreProcess(const Context& dev_ctx, T* x_grad_ptr = x_grad->data(); blas.GEMM(CblasNoTrans, CblasTrans, - out_grad.non_zero_elements().dims()[0], + out_grad.dims()[0], in_channels, - out_grad.non_zero_elements().dims()[1], + out_grad.dims()[1], static_cast(1), - out_grad.non_zero_elements().data(), + out_grad.data(), kernel.data() + half_kernel_size * in_channels * out_channels, static_cast(0), x_grad_ptr); diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h index 42bde442e1e..23e059c72e7 100644 --- a/paddle/phi/kernels/sparse/convolution_grad_kernel.h +++ b/paddle/phi/kernels/sparse/convolution_grad_kernel.h @@ -27,7 +27,7 @@ void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -41,7 +41,7 @@ std::vector Conv3dGrad(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index 64c32df1897..93a335e2f1c 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -34,7 +34,7 @@ using Dims4D = phi::funcs::sparse::Dims4D; template void ProductRuleBook(const Context& dev_ctx, const SparseCooTensor& x, - const DenseTensor& kernel, + const std::vector& kernel_sizes, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -42,19 +42,19 @@ void ProductRuleBook(const Context& dev_ctx, const bool subm, DenseTensor* rulebook, DenseTensor* counter_per_kernel) { - const auto& kernel_dims = kernel.dims(); const int64_t non_zero_num = x.nnz(); const auto& non_zero_indices = x.non_zero_indices(); const int* indices_ptr = non_zero_indices.data(); int* counter_ptr = counter_per_kernel->data(); - int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; memset(counter_ptr, 0, kernel_size * sizeof(int)); int rulebook_len = 0; // calc the rulebook_len const auto& x_dims = 
x.dims(); const Dims4D c_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); - const Dims4D c_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]); + const Dims4D c_kernel_dims( + 1, kernel_sizes[2], kernel_sizes[1], kernel_sizes[0]); const Dims4D c_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]); const Dims4D c_paddings(1, paddings[2], paddings[1], paddings[0]); const Dims4D c_strides(1, strides[2], strides[1], strides[0]); @@ -75,9 +75,9 @@ void ProductRuleBook(const Context& dev_ctx, auto f_calc_rulebook = [&](int* rulebook_ptr) { int kernel_index = 0, rulebook_index = 0; - for (int kz = 0; kz < kernel_dims[0]; kz++) { - for (int ky = 0; ky < kernel_dims[1]; ky++) { - for (int kx = 0; kx < kernel_dims[2]; kx++) { + for (int kz = 0; kz < kernel_sizes[0]; kz++) { + for (int ky = 0; ky < kernel_sizes[1]; ky++) { + for (int kx = 0; kx < kernel_sizes[2]; kx++) { ++kernel_index; for (int64_t i = 0; i < non_zero_num; i++) { int batch = indices_ptr[i]; diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc index 5d7b381b7cb..3348d81cf6b 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -33,7 +33,7 @@ void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -113,7 +113,7 @@ void Conv3dGradKernel(const Context& dev_ctx, rulebook_len, in_channels, in_features_ptr); - Gather(out_grad.non_zero_elements().data(), + Gather(out_grad.data(), rulebook_ptr + rulebook_len * 2, rulebook_len, out_channels, diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc index 746ca04a826..f022e4ef4bb 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc @@ -44,8 +44,13 @@ void Conv3dKernel(const Context& dev_ctx, const auto& kernel_dims = kernel.dims(); int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; DDim out_dims = {1, 1, 1, 1, 1}; + std::vector kernel_sizes(kernel_dims.size()); + for (int i = 0; i < kernel_dims.size(); i++) { + kernel_sizes[i] = kernel_dims[i]; + } + phi::funcs::sparse::GetOutShape( - x_dims, kernel_dims, paddings, dilations, strides, &out_dims); + x_dims, kernel_sizes, paddings, dilations, strides, &out_dims); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; @@ -63,7 +68,7 @@ void Conv3dKernel(const Context& dev_ctx, ProductRuleBook(dev_ctx, x, - kernel, + kernel_sizes, subm_paddings, dilations, subm_strides, diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 8826fd7cf87..5b928817f64 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -23,11 +23,15 @@ limitations under the License. 
*/ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" namespace phi { namespace sparse { +using Dims4D = phi::funcs::sparse::Dims4D; + // TODO(zhangkaihuo): After the GatherCUDAKernel is migrated to phi, replace // this kernel with phi::GatherCUDAKernel; // Vectorization can be used to improve read and write bandwidth @@ -139,5 +143,494 @@ inline int* SortedAndUniqueIndex(const Context& dev_ctx, return new_end.first; } +template +__global__ void SetFlagAndUpdateCounterKernel(const int* indexs, + const int n, + const int rulebook_len, + const int kernel_size, + T* rulebook_ptr, + int* counter_ptr) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ int cache_count[]; // kernel_size + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + cache_count[i] = 0; + } + __syncthreads(); + + for (int i = tid; i < n; i += gridDim.x * blockDim.x) { + int index = indexs[i]; + int kernel_index = rulebook_ptr[index]; + rulebook_ptr[index + rulebook_len] = -1; + rulebook_ptr[index + 2 * rulebook_len] = -1; + rulebook_ptr[index] = -1; + atomicAdd(&cache_count[kernel_index], 1); + } + __syncthreads(); + + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + atomicSub(&counter_ptr[i], cache_count[i]); + } +} + +/** + * @brief: update the out index and indices + * unique_keys: save the index of the output feature list + * unique_values: indiates the index of key before deduplication + * out_indexs: indicates the position of the output index in the rulebook + * rulebook_len: indicates the length of rulebook + * out_dims: indicates the output dims + * out_indices: the indices of output, out_indices = IndexToPoint(unique_keys) + * rulebook_out_indexs: the output index in rulebook +**/ +template +__global__ void UpdateIndexKernel(const int* unique_keys, + const int* unique_values, + const int* out_indexs, + const int non_zero_num, + const int rulebook_len, + const Dims4D out_dims, + T* out_indices, + T* rulebook_out_indexs) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { + const int index = unique_keys[i]; + int batch, x, y, z; + phi::funcs::sparse::IndexToPoint( + index, out_dims, &batch, &x, &y, &z); + // get out indices + out_indices[i] = batch; + out_indices[i + non_zero_num] = z; + out_indices[i + non_zero_num * 2] = y; + out_indices[i + non_zero_num * 3] = x; + + // update rulebook + int start = unique_values[i]; + int end = i == non_zero_num - 1 ? 
rulebook_len : unique_values[i + 1]; + // max(end-start) = kernel_size + for (int j = start; j < end; j++) { + rulebook_out_indexs[out_indexs[j]] = i; + } + } +} + +// brief: calculation the distance between start and end +template +__global__ void DistanceKernel(const T* start, const T* end, int* distance) { + if (threadIdx.x == 0) { + *distance = end - start; + } +} + +/** + * @brief product rulebook + * for input_i in x_indices: + * if input_i participate in the convolution calculation: + * infer the output_i by input_i and kernel_i + * save output_i + * + * x_indices: the indices of input features + * x_dims: the input dims + * kernel_dims: the kernel dims + * out_dims: the output dims + * non_zero_num: the number of input features + * rulebook: the rulebook to save the kernel index, input index and output index + * counter: save the number of times each location in the kernel participates in + *the caculation +**/ +template +__global__ void ProductRuleBookKernel(const T* x_indices, + const Dims4D x_dims, + const Dims4D kernel_dims, + const Dims4D out_dims, + const int64_t non_zero_num, + const Dims4D paddings, + const Dims4D dilations, + const Dims4D strides, + const bool subm, + T* rulebook, + int* counter, + int* in_indexs) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + extern __shared__ int counter_buf[]; // kernel_size + const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; + const int offset = kernel_size * non_zero_num; + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + counter_buf[i] = 0; + } + __syncthreads(); + + for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { + int kernel_index = 0; + int batch = x_indices[i]; + int in_z = x_indices[i + non_zero_num]; + int in_y = x_indices[i + 2 * non_zero_num]; + int in_x = x_indices[i + 3 * non_zero_num]; + if (subm) { + in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); + } + for (int kz = 0; kz < kernel_dims[1]; kz++) { + for (int ky = 0; ky < kernel_dims[2]; ky++) { + for (int kx = 0; kx < kernel_dims[3]; kx++) { + int in_i = -1, out_index = -1, kernel_i = -1; + if (phi::funcs::sparse::Check(x_dims, + kernel_dims, + paddings, + dilations, + strides, + in_x, + in_y, + in_z, + kx, + ky, + kz)) { + int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1]; + int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2]; + int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3]; + in_i = i; + out_index = phi::funcs::sparse::PointToIndex( + batch, out_x, out_y, out_z, out_dims); + atomicAdd(&counter_buf[kernel_index], 1); + kernel_i = kernel_index; + } + rulebook[kernel_index * non_zero_num + i] = kernel_i; + rulebook[kernel_index * non_zero_num + offset + i] = in_i; + rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index; + ++kernel_index; + } + } + } + } + __syncthreads(); + for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { + atomicAdd(&counter[i], counter_buf[i]); + } +} + +// the basic algorithm can refer to convolution_kernel.cc or +// the second paper +// example: +// 1. the rulebook: +// the kernel_index: 0, 0, 0, 1, 1, 1, 2, 2, .... +// the out_index(key): 20, 30, 33, 30, 33, 20, 25 +// 2. mark the index of out_index(value): 0, 1, 2, 3, 4, 5, 6, .... +// 3. sorted the (key, value) +// 4. unique the (key, value): +// unique_key: 20, 25, 30, 33 +// unique_values: 0, 2, 3, 5 +// the index of unique_values is: 0, 1, 2, 3 +// 5. 
update the out_index by unique_key, uniqe_value and the index of +// unique_value: +// the new out_index: 0, 2, 3, 2, 3, 0, 1 +template +int ProductRuleBook(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const DDim& out_dims, + const bool subm, + DenseTensor* rulebook, + DenseTensor* counter_per_kernel, + DenseTensor* offsets_per_kernel, + DenseTensor* out_index, + DenseTensor* unique_key, + DenseTensor* unique_value, + SparseCooTensor* out, + std::vector* h_counter, + std::vector* h_offsets) { + const int64_t non_zero_num = x.nnz(); + const auto& non_zero_indices = x.non_zero_indices(); + const int* indices_ptr = non_zero_indices.data(); + DenseTensor in_indexs = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); + int* counter_ptr = counter_per_kernel->data(); + int* offsets_ptr = offsets_per_kernel->data(); + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const int rulebook_rows = 3; + const int rulebook_cols = kernel_size * non_zero_num; + rulebook->ResizeAndAllocate({rulebook_rows, rulebook_cols}); + int* rulebook_ptr = rulebook->data(); + + const auto x_dims = x.dims(); + Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); + Dims4D d_kernel_dims(1, kernel_sizes[2], kernel_sizes[1], kernel_sizes[0]); + Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]); + Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]); + Dims4D d_strides(1, strides[2], strides[1], strides[0]); + Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]); + + // 1. product rule book + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, counter_per_kernel, 0); + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); + + ProductRuleBookKernel<<>>(indices_ptr, + d_x_dims, + d_kernel_dims, + d_out_dims, + non_zero_num, + d_paddings, + d_dilations, + d_strides, + subm, + rulebook_ptr, + counter_ptr, + in_indexs.data()); + +// 2. remove -1 +#ifdef PADDLE_WITH_HIP + int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#else + int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), +#endif + rulebook_ptr, + rulebook_ptr + rulebook_rows * rulebook_cols, + -1); + + DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); + int rulebook_len = 0; + phi::backends::gpu::GpuMemcpyAsync( + &rulebook_len, + rulebook_ptr + 3 * kernel_size * non_zero_num - 1, + sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + rulebook_len /= 3; + dev_ctx.Wait(); + + if (subm) { + // At present, hashtable is not used to map the input and output indexes. + // At present, the intermediate output index is generated by normal + // convolution, + // and then the intermediate output index is subtracted from the input index + // to obain the rulebook. 
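For reference, a standalone sketch that replays steps 2-5 of the comment above on the same toy data; it is illustrative only, and the lower_bound lookup at the end stands in for the scatter that SortedAndUniqueIndex plus UpdateIndexKernel perform through the rulebook on the GPU:

// Illustrative only: sort/unique walk-through of the rulebook compaction
// example given in the comment above.
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  std::vector<int> out_index = {20, 30, 33, 30, 33, 20, 25};  // keys from the comment
  // 2. mark the index of each out_index entry: 0, 1, 2, ...
  std::vector<std::pair<int, int>> kv;
  for (size_t i = 0; i < out_index.size(); ++i)
    kv.emplace_back(out_index[i], static_cast<int>(i));
  // 3. sort the (key, value) pairs by key
  std::sort(kv.begin(), kv.end());
  // 4. unique by key: keep the first sorted position of every key
  std::vector<int> unique_key, unique_value;
  for (size_t i = 0; i < kv.size(); ++i) {
    if (i == 0 || kv[i].first != kv[i - 1].first) {
      unique_key.push_back(kv[i].first);              // 20, 25, 30, 33
      unique_value.push_back(static_cast<int>(i));    // 0, 2, 3, 5
    }
  }
  // 5. the compacted out_index is the rank of each original key
  for (int key : out_index) {
    int rank = static_cast<int>(
        std::lower_bound(unique_key.begin(), unique_key.end(), key) -
        unique_key.begin());
    std::printf("%d ", rank);  // prints: 0 2 3 2 3 0 1
  }
  std::printf("\n");
  return 0;
}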
+ // get difference + int32_t* A_key_ptr = rulebook_ptr + 2 * rulebook_len; + int32_t* B_key_ptr = in_indexs.data(); + DenseTensor A_val = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); + DenseTensor B_val = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); + phi::IndexKernel>( + dev_ctx, &A_val, kps::IdentityFunctor()); + phi::IndexKernel>( + dev_ctx, &B_val, kps::IdentityFunctor()); + DenseTensor key_result = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT32, {rulebook_len + 1}, DataLayout::NCHW)); + DenseTensor val_result = phi::Empty( + dev_ctx, + DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); + +#ifdef PADDLE_WITH_HIP + thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), +#endif + counter_ptr, + counter_ptr + kernel_size, + offsets_ptr); + std::vector offsets(kernel_size, 0); + // TODO(zhangkaihuo): used unified memcpy interface + phi::backends::gpu::GpuMemcpyAsync(offsets.data(), + offsets_ptr, + kernel_size * sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + dev_ctx.Wait(); + + thrust::pair end; + // Because set_diff does not support duplicate data, set_diff is performed + // separately for each segment of data. + // TODO(zhangkaihuo): Using hashtable here may get better performance, + // further tests ared needed. + for (int i = 0; i < kernel_size; i++) { + int start = offsets[i]; + int stop = i == kernel_size - 1 ? rulebook_len : offsets[i + 1]; + int* key_result_start = (i == 0 ? key_result.data() : end.first); + int* val_result_start = i == 0 ? val_result.data() : end.second; + end = +#ifdef PADDLE_WITH_HIP + thrust::set_difference_by_key(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::set_difference_by_key(thrust::cuda::par.on(dev_ctx.stream()), +#endif + A_key_ptr + start, + A_key_ptr + stop, + B_key_ptr, + B_key_ptr + x.nnz(), + A_val.data() + start, + B_val.data(), + key_result_start, + val_result_start); + } + + DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + key_result.data(), + end.first, + key_result.data() + rulebook_len); + int len = 0; + phi::backends::gpu::GpuMemcpyAsync(&len, + key_result.data() + rulebook_len, + sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + dev_ctx.Wait(); + // set the diff value = -1, and update counter + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len, 1); + SetFlagAndUpdateCounterKernel<<>>( + val_result.data(), + len, + rulebook_len, + kernel_size, + rulebook_ptr, + counter_ptr); +// remove -1 +#ifdef PADDLE_WITH_HIP + int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), +#else + int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), +#endif + rulebook_ptr, + rulebook_ptr + 3 * rulebook_len, + -1); + DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + rulebook_ptr, last, key_result.data() + rulebook_len); + phi::backends::gpu::GpuMemcpyAsync(&rulebook_len, + key_result.data() + rulebook_len, + sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + dev_ctx.Wait(); + rulebook_len /= 3; + } + +#ifdef PADDLE_WITH_HIP + thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), +#endif + counter_ptr, + 
counter_ptr + kernel_size, + offsets_ptr); + +#ifdef PADDLE_WITH_HIP + phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], + counter_ptr, + kernel_size * sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], + offsets_ptr, + kernel_size * sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); +#else + phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], + counter_ptr, + kernel_size * sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); + phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], + offsets_ptr, + kernel_size * sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); +#endif + rulebook->Resize({rulebook_rows, rulebook_len}); + + // 3. sorted or merge the out index + out_index->ResizeAndAllocate({rulebook_len}); + unique_value->ResizeAndAllocate({rulebook_len}); + unique_key->ResizeAndAllocate({rulebook_len}); + int* out_index_ptr = out_index->data(); + int* unique_value_ptr = unique_value->data(); + int* unique_key_ptr = unique_key->data(); + + int* new_end = SortedAndUniqueIndex(dev_ctx, + rulebook_ptr + 2 * rulebook_len, + rulebook_len, + out_index, + unique_key, + unique_value); + // thrust::distance doesn't support stream parameters + // const int out_non_zero_num = thrust::distance(unique_key_ptr, + // new_end.first); + DistanceKernel<<<1, 1>>>( + unique_key_ptr, + new_end, + rulebook_ptr + rulebook_rows * rulebook_cols - 1); + int out_non_zero_num = 0; +#ifdef PADDLE_WITH_HIP + phi::backends::gpu::GpuMemcpyAsync( + &out_non_zero_num, + rulebook_ptr + rulebook_rows * rulebook_cols - 1, + sizeof(int), + hipMemcpyDeviceToHost, + dev_ctx.stream()); +#else + phi::backends::gpu::GpuMemcpyAsync( + &out_non_zero_num, + rulebook_ptr + rulebook_rows * rulebook_cols - 1, + sizeof(int), + cudaMemcpyDeviceToHost, + dev_ctx.stream()); +#endif + dev_ctx.Wait(); + + // 5. 
update out_indices and rulebook by unique_value_ptr + const int64_t sparse_dim = 4; + DenseTensorMeta indices_meta( + DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); + DenseTensorMeta values_meta( + x.dtype(), {out_non_zero_num, kernel_sizes[4]}, x.layout()); + phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); + phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); + + int* out_indices_ptr = out_indices.data(); + + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); + UpdateIndexKernel<<>>(unique_key_ptr, + unique_value_ptr, + out_index_ptr, + out_non_zero_num, + rulebook_len, + d_out_dims, + out_indices_ptr, + rulebook_ptr + 2 * rulebook_len); + out->SetMember(out_indices, out_values, out_dims, true); + return rulebook_len; +} + } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index d6d992d0f4b..4db0a0b0011 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -38,7 +38,7 @@ void Conv3dGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& rulebook, const DenseTensor& kernel, - const SparseCooTensor& out_grad, + const DenseTensor& out_grad, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -140,12 +140,11 @@ void Conv3dGradKernel(const Context& dev_ctx, GatherKernel<<>>( - out_grad.non_zero_elements().data(), - rulebook_ptr + rulebook_len * 2, - out_grad_features_ptr, - rulebook_len, - out_channels); + dev_ctx.stream()>>>(out_grad.data(), + rulebook_ptr + rulebook_len * 2, + out_grad_features_ptr, + rulebook_len, + out_channels); const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 1a0c7e9b972..214e689e937 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -12,515 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include -#include -#include - -#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/backends/gpu/gpu_info.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/index_impl.cu.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" namespace phi { namespace sparse { -using Dims4D = phi::funcs::sparse::Dims4D; - -__global__ void SetFlagAndUpdateCounterKernel(const int* indexs, - const int n, - const int rulebook_len, - const int kernel_size, - int* rulebook_ptr, - int* counter_ptr) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - extern __shared__ int cache_count[]; // kernel_size - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - cache_count[i] = 0; - } - __syncthreads(); - - for (int i = tid; i < n; i += gridDim.x * blockDim.x) { - int index = indexs[i]; - int kernel_index = rulebook_ptr[index]; - rulebook_ptr[index + rulebook_len] = -1; - rulebook_ptr[index + 2 * rulebook_len] = -1; - rulebook_ptr[index] = -1; - atomicAdd(&cache_count[kernel_index], 1); - } - __syncthreads(); - - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - atomicSub(&counter_ptr[i], cache_count[i]); - } -} - -/** - * @brief: update the out index and indices - * unique_keys: save the index of the output feature list - * unique_values: indiates the index of key before deduplication - * out_indexs: indicates the position of the output index in the rulebook - * rulebook_len: indicates the length of rulebook - * out_dims: indicates the output dims - * out_indices: the indices of output, out_indices = IndexToPoint(unique_keys) - * rulebook_out_indexs: the output index in rulebook -**/ -__global__ void UpdateIndexKernel(const int* unique_keys, - const int* unique_values, - const int* out_indexs, - const int non_zero_num, - const int rulebook_len, - const Dims4D out_dims, - int* out_indices, - int* rulebook_out_indexs) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { - const int index = unique_keys[i]; - int batch, x, y, z; - phi::funcs::sparse::IndexToPoint( - index, out_dims, &batch, &x, &y, &z); - // get out indices - out_indices[i] = batch; - out_indices[i + non_zero_num] = z; - out_indices[i + non_zero_num * 2] = y; - out_indices[i + non_zero_num * 3] = x; - - // update rulebook - int start = unique_values[i]; - int end = i == non_zero_num - 1 ? 
rulebook_len : unique_values[i + 1]; - // max(end-start) = kernel_size - for (int j = start; j < end; j++) { - rulebook_out_indexs[out_indexs[j]] = i; - } - } -} - -/** - * @brief product rulebook - * for input_i in x_indices: - * if input_i participate in the convolution calculation: - * infer the output_i by input_i and kernel_i - * save output_i - * - * x_indices: the indices of input features - * x_dims: the input dims - * kernel_dims: the kernel dims - * out_dims: the output dims - * non_zero_num: the number of input features - * rulebook: the rulebook to save the kernel index, input index and output index - * counter: save the number of times each location in the kernel participates in - *the caculation -**/ -__global__ void ProductRuleBookKernel(const int* x_indices, - const Dims4D x_dims, - const Dims4D kernel_dims, - const Dims4D out_dims, - const int64_t non_zero_num, - const Dims4D paddings, - const Dims4D dilations, - const Dims4D strides, - const bool subm, - int* rulebook, - int* counter, - int* in_indexs) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - extern __shared__ int counter_buf[]; // kernel_size - const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; - const int offset = kernel_size * non_zero_num; - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - counter_buf[i] = 0; - } - __syncthreads(); - - for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { - int kernel_index = 0; - int batch = x_indices[i]; - int in_z = x_indices[i + non_zero_num]; - int in_y = x_indices[i + 2 * non_zero_num]; - int in_x = x_indices[i + 3 * non_zero_num]; - if (subm) { - in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); - } - for (int kz = 0; kz < kernel_dims[1]; kz++) { - for (int ky = 0; ky < kernel_dims[2]; ky++) { - for (int kx = 0; kx < kernel_dims[3]; kx++) { - int in_i = -1, out_index = -1, kernel_i = -1; - if (phi::funcs::sparse::Check(x_dims, - kernel_dims, - paddings, - dilations, - strides, - in_x, - in_y, - in_z, - kx, - ky, - kz)) { - int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1]; - int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2]; - int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3]; - in_i = i; - out_index = phi::funcs::sparse::PointToIndex( - batch, out_x, out_y, out_z, out_dims); - atomicAdd(&counter_buf[kernel_index], 1); - kernel_i = kernel_index; - } - rulebook[kernel_index * non_zero_num + i] = kernel_i; - rulebook[kernel_index * non_zero_num + offset + i] = in_i; - rulebook[kernel_index * non_zero_num + offset * 2 + i] = out_index; - ++kernel_index; - } - } - } - } - __syncthreads(); - for (int i = threadIdx.x; i < kernel_size; i += blockDim.x) { - atomicAdd(&counter[i], counter_buf[i]); - } -} - -// brief: calculation the distance between start and end -__global__ void DistanceKernel(const int* start, - const int* end, - int* distance) { - if (threadIdx.x == 0) { - *distance = end - start; - } -} - -// the basic algorithm can refer to convolution_kernel.cc or -// the second paper -// example: -// 1. the rulebook: -// the kernel_index: 0, 0, 0, 1, 1, 1, 2, 2, .... -// the out_index(key): 20, 30, 33, 30, 33, 20, 25 -// 2. mark the index of out_index(value): 0, 1, 2, 3, 4, 5, 6, .... -// 3. sorted the (key, value) -// 4. unique the (key, value): -// unique_key: 20, 25, 30, 33 -// unique_values: 0, 2, 3, 5 -// the index of unique_values is: 0, 1, 2, 3 -// 5. 
update the out_index by unique_key, uniqe_value and the index of -// unique_value: -// the new out_index: 0, 2, 3, 2, 3, 0, 1 -template -int ProductRuleBook(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const DDim& out_dims, - const bool subm, - DenseTensor* rulebook, - DenseTensor* counter_per_kernel, - DenseTensor* offsets_per_kernel, - DenseTensor* out_index, - DenseTensor* unique_key, - DenseTensor* unique_value, - SparseCooTensor* out, - std::vector* h_counter, - std::vector* h_offsets) { - const auto& kernel_dims = kernel.dims(); - const int64_t non_zero_num = x.nnz(); - const auto& non_zero_indices = x.non_zero_indices(); - const int* indices_ptr = non_zero_indices.data(); - DenseTensor in_indexs = phi::Empty( - dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); - int* counter_ptr = counter_per_kernel->data(); - int* offsets_ptr = offsets_per_kernel->data(); - int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; - const int rulebook_rows = 3; - const int rulebook_cols = kernel_size * non_zero_num; - rulebook->ResizeAndAllocate({rulebook_rows, rulebook_cols}); - int* rulebook_ptr = rulebook->data(); - - const auto x_dims = x.dims(); - Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); - Dims4D d_kernel_dims(1, kernel_dims[2], kernel_dims[1], kernel_dims[0]); - Dims4D d_out_dims(out_dims[0], out_dims[3], out_dims[2], out_dims[1]); - Dims4D d_paddings(1, paddings[2], paddings[1], paddings[0]); - Dims4D d_strides(1, strides[2], strides[1], strides[0]); - Dims4D d_dilations(1, dilations[2], dilations[1], dilations[0]); - - // 1. product rule book - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, counter_per_kernel, 0); - auto config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); - - ProductRuleBookKernel<<>>(indices_ptr, - d_x_dims, - d_kernel_dims, - d_out_dims, - non_zero_num, - d_paddings, - d_dilations, - d_strides, - subm, - rulebook_ptr, - counter_ptr, - in_indexs.data()); - -// 2. remove -1 -#ifdef PADDLE_WITH_HIP - int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), -#else - int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), -#endif - rulebook_ptr, - rulebook_ptr + rulebook_rows * rulebook_cols, - -1); - - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); - int rulebook_len = 0; - phi::backends::gpu::GpuMemcpyAsync( - &rulebook_len, - rulebook_ptr + 3 * kernel_size * non_zero_num - 1, - sizeof(int), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif - dev_ctx.stream()); - rulebook_len /= 3; - dev_ctx.Wait(); - - if (subm) { - // At present, hashtable is not used to map the input and output indexes. - // At present, the intermediate output index is generated by normal - // convolution, - // and then the intermediate output index is subtracted from the input index - // to obain the rulebook. 
- // get difference - int32_t* A_key_ptr = rulebook_ptr + 2 * rulebook_len; - int32_t* B_key_ptr = in_indexs.data(); - DenseTensor A_val = phi::Empty( - dev_ctx, - DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); - DenseTensor B_val = phi::Empty( - dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); - phi::IndexKernel>( - dev_ctx, &A_val, kps::IdentityFunctor()); - phi::IndexKernel>( - dev_ctx, &B_val, kps::IdentityFunctor()); - DenseTensor key_result = phi::Empty( - dev_ctx, - DenseTensorMeta(DataType::INT32, {rulebook_len + 1}, DataLayout::NCHW)); - DenseTensor val_result = phi::Empty( - dev_ctx, - DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW)); - -#ifdef PADDLE_WITH_HIP - thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), -#endif - counter_ptr, - counter_ptr + kernel_size, - offsets_ptr); - std::vector offsets(kernel_size, 0); - // TODO(zhangkaihuo): used unified memcpy interface - phi::backends::gpu::GpuMemcpyAsync(offsets.data(), - offsets_ptr, - kernel_size * sizeof(int), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif - dev_ctx.stream()); - dev_ctx.Wait(); - - thrust::pair end; - // Because set_diff does not support duplicate data, set_diff is performed - // separately for each segment of data. - // TODO(zhangkaihuo): Using hashtable here may get better performance, - // further tests ared needed. - for (int i = 0; i < kernel_size; i++) { - int start = offsets[i]; - int stop = i == kernel_size - 1 ? rulebook_len : offsets[i + 1]; - int* key_result_start = (i == 0 ? key_result.data() : end.first); - int* val_result_start = i == 0 ? val_result.data() : end.second; - end = -#ifdef PADDLE_WITH_HIP - thrust::set_difference_by_key(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::set_difference_by_key(thrust::cuda::par.on(dev_ctx.stream()), -#endif - A_key_ptr + start, - A_key_ptr + stop, - B_key_ptr, - B_key_ptr + x.nnz(), - A_val.data() + start, - B_val.data(), - key_result_start, - val_result_start); - } - - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - key_result.data(), - end.first, - key_result.data() + rulebook_len); - int len = 0; - phi::backends::gpu::GpuMemcpyAsync(&len, - key_result.data() + rulebook_len, - sizeof(int), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif - dev_ctx.stream()); - dev_ctx.Wait(); - // set the diff value = -1, and update counter - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len, 1); - SetFlagAndUpdateCounterKernel<<>>(val_result.data(), - len, - rulebook_len, - kernel_size, - rulebook_ptr, - counter_ptr); -// remove -1 -#ifdef PADDLE_WITH_HIP - int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), -#else - int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), -#endif - rulebook_ptr, - rulebook_ptr + 3 * rulebook_len, - -1); - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - rulebook_ptr, last, key_result.data() + rulebook_len); - phi::backends::gpu::GpuMemcpyAsync(&rulebook_len, - key_result.data() + rulebook_len, - sizeof(int), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif - dev_ctx.stream()); - dev_ctx.Wait(); - rulebook_len /= 3; - } - -#ifdef PADDLE_WITH_HIP - thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()), -#endif - counter_ptr, - counter_ptr 
+ kernel_size, - offsets_ptr); - -#ifdef PADDLE_WITH_HIP - phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], - counter_ptr, - kernel_size * sizeof(int), - hipMemcpyDeviceToHost, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], - offsets_ptr, - kernel_size * sizeof(int), - hipMemcpyDeviceToHost, - dev_ctx.stream()); -#else - phi::backends::gpu::GpuMemcpyAsync(&(*h_counter)[0], - counter_ptr, - kernel_size * sizeof(int), - cudaMemcpyDeviceToHost, - dev_ctx.stream()); - phi::backends::gpu::GpuMemcpyAsync(&(*h_offsets)[0], - offsets_ptr, - kernel_size * sizeof(int), - cudaMemcpyDeviceToHost, - dev_ctx.stream()); -#endif - rulebook->Resize({rulebook_rows, rulebook_len}); - - // 3. sorted or merge the out index - out_index->ResizeAndAllocate({rulebook_len}); - unique_value->ResizeAndAllocate({rulebook_len}); - unique_key->ResizeAndAllocate({rulebook_len}); - int* out_index_ptr = out_index->data(); - int* unique_value_ptr = unique_value->data(); - int* unique_key_ptr = unique_key->data(); - - int* new_end = SortedAndUniqueIndex(dev_ctx, - rulebook_ptr + 2 * rulebook_len, - rulebook_len, - out_index, - unique_key, - unique_value); - // thrust::distance doesn't support stream parameters - // const int out_non_zero_num = thrust::distance(unique_key_ptr, - // new_end.first); - DistanceKernel<<<1, 1>>>(unique_key_ptr, - new_end, - rulebook_ptr + rulebook_rows * rulebook_cols - 1); - int out_non_zero_num = 0; -#ifdef PADDLE_WITH_HIP - phi::backends::gpu::GpuMemcpyAsync( - &out_non_zero_num, - rulebook_ptr + rulebook_rows * rulebook_cols - 1, - sizeof(int), - hipMemcpyDeviceToHost, - dev_ctx.stream()); -#else - phi::backends::gpu::GpuMemcpyAsync( - &out_non_zero_num, - rulebook_ptr + rulebook_rows * rulebook_cols - 1, - sizeof(int), - cudaMemcpyDeviceToHost, - dev_ctx.stream()); -#endif - dev_ctx.Wait(); - - // 5. 
update out_indices and rulebook by unique_value_ptr - const int64_t sparse_dim = 4; - DenseTensorMeta indices_meta( - DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); - DenseTensorMeta values_meta( - x.dtype(), {out_non_zero_num, kernel_dims[4]}, x.layout()); - phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); - phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); - - int* out_indices_ptr = out_indices.data(); - - config = - phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); - UpdateIndexKernel<<>>(unique_key_ptr, - unique_value_ptr, - out_index_ptr, - out_non_zero_num, - rulebook_len, - d_out_dims, - out_indices_ptr, - rulebook_ptr + 2 * rulebook_len); - out->SetMember(out_indices, out_values, out_dims, true); - return rulebook_len; -} - /** * x: (N, D, H, W, C) * kernel: (D, H, W, C, OC) @@ -545,9 +46,12 @@ void Conv3dKernel(const Context& dev_ctx, const auto& kernel_dims = kernel.dims(); int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; DDim out_dims = {1, 1, 1, 1, 1}; + std::vector kernel_sizes(kernel_dims.size()); + for (int i = 0; i < kernel_dims.size(); i++) { + kernel_sizes[i] = kernel_dims[i]; + } phi::funcs::sparse::GetOutShape( - x_dims, kernel_dims, paddings, dilations, strides, &out_dims); - out->set_dims(out_dims); + x_dims, kernel_sizes, paddings, dilations, strides, &out_dims); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; std::vector offsets(kernel_size + 1), h_counter(kernel_size); @@ -574,7 +78,7 @@ void Conv3dKernel(const Context& dev_ctx, int n = ProductRuleBook(dev_ctx, x, - kernel, + kernel_sizes, subm_paddings, dilations, subm_strides, diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 37a69a176c6..4800e1402ba 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -132,16 +132,17 @@ void TestConv3dBase(const std::vector& indices, f_verify(out.non_zero_elements().data(), correct_out_features); if (backward) { - std::vector grads = sparse::Conv3dGrad(dev_ctx_cpu, - x_tensor, - rulebook, - kernel_tensor, - out, - paddings, - dilations, - strides, - 1, - subm); + std::vector grads = + sparse::Conv3dGrad(dev_ctx_cpu, + x_tensor, + rulebook, + kernel_tensor, + out.non_zero_elements(), + paddings, + dilations, + strides, + 1, + subm); f_verify(grads[0].data(), features_grad); f_verify(grads[1].data(), kernel_grad); } @@ -231,16 +232,17 @@ void TestConv3dBase(const std::vector& indices, f_verify(h_features_tensor.data(), correct_out_features); if (backward) { - std::vector grads = sparse::Conv3dGrad(dev_ctx_gpu, - d_x_tensor, - d_rulebook, - d_kernel_tensor, - d_out, - paddings, - dilations, - strides, - 1, - subm); + std::vector grads = + sparse::Conv3dGrad(dev_ctx_gpu, + d_x_tensor, + d_rulebook, + d_kernel_tensor, + d_out.non_zero_elements(), + paddings, + dilations, + strides, + 1, + subm); DenseTensor h_features_grad = phi::Empty( dev_ctx_cpu, DenseTensorMeta(grads[0].dtype(), grads[0].dims(), grads[0].layout())); -- GitLab From 849bfbbf420115707871b05211dcf77487d99286 Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Wed, 16 Mar 2022 16:48:10 +0800 Subject: [PATCH 105/176] Add tensor desc size check (#40518) --- paddle/fluid/framework/tensor_util.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/tensor_util.cc 
b/paddle/fluid/framework/tensor_util.cc index 10ceae62dcc..5de86123546 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -1224,8 +1224,12 @@ void TensorFromStream(std::istream& is, Tensor* tensor, proto::VarType::TensorDesc desc; { // int32_t size // proto buffer - int32_t size; + int32_t size = -1; is.read(reinterpret_cast(&size), sizeof(size)); + PADDLE_ENFORCE_EQ(is.good(), true, platform::errors::Unavailable( + "Cannot read tensor desc size")); + PADDLE_ENFORCE_GE(size, 0, platform::errors::InvalidArgument( + "Tensor desc size should >= 0")); std::unique_ptr buf(new char[size]); is.read(reinterpret_cast(buf.get()), size); PADDLE_ENFORCE_EQ( -- GitLab From 8e631715eb18feddb95cb7b87bbb6352cc734006 Mon Sep 17 00:00:00 2001 From: Nyakku Shigure Date: Wed, 16 Mar 2022 17:00:11 +0800 Subject: [PATCH 106/176] fix paddle.optimizer.SGD en docs (#40479) * align to cn docs * add parameter `weight_decay` --- python/paddle/optimizer/sgd.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py index 5167c18de17..6c575b4b997 100644 --- a/python/paddle/optimizer/sgd.py +++ b/python/paddle/optimizer/sgd.py @@ -59,16 +59,14 @@ class SGD(Optimizer): .. code-block:: python import paddle - import numpy as np - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + inp = paddle.uniform(min=-0.1, max=0.1, shape=[10, 10], dtype='float32') linear = paddle.nn.Linear(10, 10) inp = paddle.to_tensor(inp) out = linear(inp) loss = paddle.mean(out) - beta1 = paddle.to_tensor([0.9], dtype="float32") - beta2 = paddle.to_tensor([0.99], dtype="float32") sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01) - back = out.backward() + out.backward() sgd.step() sgd.clear_grad() -- GitLab From f5bf46e6a3da6c780931db7aefad8addd40f4ff0 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 16 Mar 2022 17:07:37 +0800 Subject: [PATCH 107/176] Fix tile_op inferShape (#40589) * Fix tile_op inferShape * fix style --- paddle/phi/ops/compat/tile_sig.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/paddle/phi/ops/compat/tile_sig.cc b/paddle/phi/ops/compat/tile_sig.cc index 49a6d02225d..ca3fa5fe1f8 100644 --- a/paddle/phi/ops/compat/tile_sig.cc +++ b/paddle/phi/ops/compat/tile_sig.cc @@ -20,6 +20,11 @@ KernelSignature TileOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.HasInput("RepeatTimes")) { return KernelSignature("tile", {"X"}, {"RepeatTimes"}, {"Out"}); } else if (ctx.InputSize("repeat_times_tensor") > 0) { + const auto& repeat_times = + paddle::any_cast>(ctx.Attr("repeat_times")); + if (!ctx.IsRuntime() && !repeat_times.empty()) { + return KernelSignature("tile", {"X"}, {"repeat_times"}, {"Out"}); + } return KernelSignature("tile", {"X"}, {"repeat_times_tensor"}, {"Out"}); } else { return KernelSignature("tile", {"X"}, {"repeat_times"}, {"Out"}); -- GitLab From 23c036d62192bc76c7657ac87ab75ea682760eb5 Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Wed, 16 Mar 2022 17:33:56 +0800 Subject: [PATCH 108/176] clean up DeviceManager in advance manually (#40504) --- paddle/fluid/pybind/pybind.cc | 6 ++++++ paddle/phi/backends/device_manager.cc | 5 +++++ paddle/phi/backends/device_manager.h | 2 ++ python/paddle/fluid/__init__.py | 2 ++ 4 files changed, 15 insertions(+) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 21bbc7f3e36..ed42d0792ea 100644 --- a/paddle/fluid/pybind/pybind.cc 
+++ b/paddle/fluid/pybind/pybind.cc @@ -114,6 +114,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/metrics_py.h" #include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" #include "paddle/fluid/pybind/pybind_boost_headers.h" +#include "paddle/phi/backends/device_manager.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/pybind/nccl_wrapper_py.h" @@ -742,6 +743,11 @@ PYBIND11_MODULE(core_noavx, m) { // stored in this static instance to avoid illegal memory access. m.def("clear_kernel_factory", []() { phi::KernelFactory::Instance().kernels().clear(); }); + m.def("clear_device_manager", []() { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + phi::DeviceManager::Clear(); +#endif + }); // NOTE(zjl): ctest would load environment variables at the beginning even // though we have not `import paddle.fluid as fluid`. So we add this API diff --git a/paddle/phi/backends/device_manager.cc b/paddle/phi/backends/device_manager.cc index 1ffe38d8e1f..35339aed0f3 100644 --- a/paddle/phi/backends/device_manager.cc +++ b/paddle/phi/backends/device_manager.cc @@ -393,6 +393,11 @@ DeviceManager& DeviceManager::Instance() { return platform_manager; } +void DeviceManager::Clear() { + Instance().device_map_.clear(); + Instance().device_impl_map_.clear(); +} + std::vector ListAllLibraries(const std::string& library_dir) { std::vector libraries; std::regex express(".*\\.so"); diff --git a/paddle/phi/backends/device_manager.h b/paddle/phi/backends/device_manager.h index c0911a0f8d5..39eef27b4a6 100644 --- a/paddle/phi/backends/device_manager.h +++ b/paddle/phi/backends/device_manager.h @@ -158,6 +158,8 @@ class DeviceManager { static std::vector GetDeviceList(const std::string& device_type); + static void Clear(); + private: DISABLE_COPY_AND_ASSIGN(DeviceManager); DeviceManager() {} diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 7480909a2d8..fb9e8d8ece1 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -228,3 +228,5 @@ if core.is_compiled_with_npu(): atexit.register(core.clear_executor_cache) # NOTE(Aganlengzi): clean up KernelFactory in advance manually. atexit.register(core.clear_kernel_factory) +# NOTE(wangran16): clean up DeviceManger in advance manually. +atexit.register(core.clear_device_manager) -- GitLab From dec2b1cad5a95dd9c5b065c33fa758c0564b0880 Mon Sep 17 00:00:00 2001 From: "joanna.wozna.intel" Date: Wed, 16 Mar 2022 11:40:02 +0100 Subject: [PATCH 109/176] Modify save_quant_model to support different input and output filenames (#40542) * Modify save_quant_model.py to support differnet input and output filenames * Correct wrong order of arguments --- .../contrib/slim/tests/save_quant_model.py | 61 ++++++++++++++++--- 1 file changed, 52 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py index 3fadf25150f..f97c2778c09 100644 --- a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py +++ b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py @@ -52,6 +52,30 @@ def parse_args(): '--debug', action='store_true', help='If used, the graph of Quant model is drawn.') + parser.add_argument( + '--quant_model_filename', + type=str, + default="", + help='The input model`s file name. If empty, search default `__model__` and separate parameter files and use them or in case if not found, attempt loading `model` and `params` files.' 
+ ) + parser.add_argument( + '--quant_params_filename', + type=str, + default="", + help='If quant_model_filename is empty, this field is ignored. The input model`s all parameters file name. If empty load parameters from separate files.' + ) + parser.add_argument( + '--save_model_filename', + type=str, + default="__model__", + help='The name of file to save the inference program itself. If is set None, a default filename __model__ will be used.' + ) + parser.add_argument( + '--save_params_filename', + type=str, + default=None, + help='The name of file to save all related parameters. If it is set None, parameters will be saved in separate files' + ) test_args, args = parser.parse_known_args(namespace=unittest) return test_args, sys.argv[:1] + args @@ -61,18 +85,29 @@ def transform_and_save_int8_model(original_path, save_path, ops_to_quantize='', op_ids_to_skip='', - debug=False): + debug=False, + quant_model_filename='', + quant_params_filename='', + save_model_filename='', + save_params_filename=''): place = fluid.CPUPlace() exe = fluid.Executor(place) inference_scope = fluid.executor.global_scope() with fluid.scope_guard(inference_scope): - if os.path.exists(os.path.join(original_path, '__model__')): - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(original_path, exe) + if not quant_model_filename: + if os.path.exists(os.path.join(original_path, '__model__')): + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(original_path, + exe) + else: + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + original_path, exe, 'model', 'params') else: [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model(original_path, exe, - 'model', 'params') + fetch_targets] = fluid.io.load_inference_model( + original_path, exe, quant_model_filename, + quant_params_filename) ops_to_quantize_set = set() print(ops_to_quantize) @@ -97,8 +132,14 @@ def transform_and_save_int8_model(original_path, graph = transform_to_mkldnn_int8_pass.apply(graph) inference_program = graph.to_program() with fluid.scope_guard(inference_scope): - fluid.io.save_inference_model(save_path, feed_target_names, - fetch_targets, exe, inference_program) + fluid.io.save_inference_model( + save_path, + feed_target_names, + fetch_targets, + exe, + inference_program, + model_filename=save_model_filename, + params_filename=save_params_filename) print( "Success! 
INT8 model obtained from the Quant model can be found at {}\n" .format(save_path)) @@ -109,4 +150,6 @@ if __name__ == '__main__': test_args, remaining_args = parse_args() transform_and_save_int8_model( test_args.quant_model_path, test_args.int8_model_save_path, - test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug) + test_args.ops_to_quantize, test_args.op_ids_to_skip, test_args.debug, + test_args.quant_model_filename, test_args.quant_params_filename, + test_args.save_model_filename, test_args.save_params_filename) -- GitLab From 2def79bcdc73f2f0d444fc2bc27535a98337937e Mon Sep 17 00:00:00 2001 From: Zuza Date: Wed, 16 Mar 2022 11:42:57 +0100 Subject: [PATCH 110/176] Quantize elementwise mul (#40546) * Quantize elementwise mul op * Parametrize elementwise functions * Fix code formatting --- .../framework/ir/graph_pattern_detector.cc | 21 ++--- .../framework/ir/graph_pattern_detector.h | 28 +++--- .../conv_elementwise_add_mkldnn_fuse_pass.cc | 90 +++++++++---------- .../framework/ir/mkldnn/cpu_quantize_pass.cc | 71 +++++++-------- .../framework/ir/mkldnn/cpu_quantize_pass.h | 3 +- .../ir/mkldnn/cpu_quantize_pass_tester.cc | 70 +++++++++++---- .../ir/mkldnn/cpu_quantize_placement_pass.cc | 8 +- 7 files changed, 163 insertions(+), 128 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 18068e22b7f..164a13d1560 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2052,18 +2052,19 @@ PDNode *patterns::Pool::operator()() { return output_var; } -PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { - auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) - ->assert_is_op("elementwise_add"); - - x_var->AsInput()->assert_is_op_input("elementwise_add", "X"); - y_var->AsInput()->assert_is_op_input("elementwise_add", "Y"); - auto out_var = pattern->NewNode(elementwise_add_out_repr()) +PDNode *patterns::Elementwise::operator()(PDNode *x_var, PDNode *y_var, + const std::string elementwise_type) { + auto elementwise_op = + pattern->NewNode(elementwise_op_repr())->assert_is_op(elementwise_type); + + x_var->AsInput()->assert_is_op_input(elementwise_type, "X"); + y_var->AsInput()->assert_is_op_input(elementwise_type, "Y"); + auto out_var = pattern->NewNode(elementwise_out_repr()) ->AsOutput() - ->assert_is_op_output("elementwise_add", "Out"); + ->assert_is_op_output(elementwise_type, "Out"); - elementwise_add_op->LinksFrom({x_var, y_var}); - elementwise_add_op->LinksTo({out_var}); + elementwise_op->LinksFrom({x_var, y_var}); + elementwise_op->LinksTo({out_var}); return out_var; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 062d2f9dedc..17c70ace301 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1016,20 +1016,20 @@ struct Pool : public PatternBase { PATTERN_DECL_NODE(pool_output); }; -// ElementwiseAdd used in residual connections. -// y_var is used and convolution output. -// The operator is removed, when residual -// connection fusion is on. 
-struct ElementwiseAdd : public PatternBase { - ElementwiseAdd(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "elementwise_add") {} - - PDNode* operator()(PDNode* x_var, PDNode* y_var); - - PATTERN_DECL_NODE(elementwise_add_op); - PATTERN_DECL_NODE(elementwise_add_x); - PATTERN_DECL_NODE(elementwise_add_y); - PATTERN_DECL_NODE(elementwise_add_out); +// Elementwise ops +// Forward pass for element-wise operators (add, mul) +// elementwise_mul_out is the result of the operator +struct Elementwise : public PatternBase { + Elementwise(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elementwise") {} + + PDNode* operator()(PDNode* x_var, PDNode* y_var, + const std::string elementwise_type); + + PATTERN_DECL_NODE(elementwise_op); + PATTERN_DECL_NODE(elementwise_x); + PATTERN_DECL_NODE(elementwise_y); + PATTERN_DECL_NODE(elementwise_out); }; // Transpose op diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index 0f3f37320b0..fc2758c2734 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -145,10 +145,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( patterns::Conv conv_pattern{pattern, name_scope}; auto conv_output = conv_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern( - conv_output, - pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern( + conv_output, pattern->NewNode(elementwise_pattern.elementwise_y_repr()), + "elementwise_add"); conv_output->AsIntermediate(); int found_conv_as_x_count = 0; @@ -160,16 +160,16 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_identity, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_identity, elementwise_y, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); - if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return; - if (!IsReachable(g, elementwise_add_identity, conv_output)) return; + if (!IsReachable(g, elementwise_identity, conv_output)) return; if (HasFusedActivation(conv_op)) return; @@ -179,14 +179,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( return; } - conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); - conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + conv_op->Op()->SetInput("ResidualData", {elementwise_identity->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); conv_op->Op()->SetAttr("fuse_residual_connection", true); - GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op}); + GraphSafeRemoveNodes(g, {conv_output, 
elementwise_op}); - IR_NODE_LINK_TO(elementwise_add_identity, conv_op); - IR_NODE_LINK_TO(conv_op, elementwise_add_out); + IR_NODE_LINK_TO(elementwise_identity, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_out); found_conv_as_x_count++; }; @@ -212,10 +212,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( patterns::Conv conv_pattern{pattern, name_scope}; auto conv_output = conv_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern( - pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()), - conv_output); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern( + pattern->NewNode(elementwise_pattern.elementwise_x_repr()), conv_output, + "elementwise_add"); conv_output->AsIntermediate(); int found_conv_as_y_count = 0; @@ -227,16 +227,16 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); - if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_op, *elementwise_op) != FUSE_MKLDNN) return; - if (!IsReachable(g, elementwise_add_x, conv_output)) return; + if (!IsReachable(g, elementwise_x, conv_output)) return; if (HasFusedActivation(conv_op)) return; @@ -246,14 +246,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( return; } - conv_op->Op()->SetInput("ResidualData", {elementwise_add_x->Name()}); - conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + conv_op->Op()->SetInput("ResidualData", {elementwise_x->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); conv_op->Op()->SetAttr("fuse_residual_connection", true); - GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op}); + GraphSafeRemoveNodes(g, {conv_output, elementwise_op}); - IR_NODE_LINK_TO(elementwise_add_x, conv_op); - IR_NODE_LINK_TO(conv_op, elementwise_add_out); + IR_NODE_LINK_TO(elementwise_x, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_out); found_conv_as_y_count++; }; @@ -282,8 +282,8 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( patterns::Conv conv_y_pattern{pattern, name_scope}; auto conv_y_output = conv_y_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; - elementwise_add_pattern(conv_x_output, conv_y_output); + patterns::Elementwise elementwise_pattern{pattern, name_scope}; + elementwise_pattern(conv_x_output, conv_y_output, "elementwise_add"); conv_x_output->AsIntermediate(); conv_y_output->AsIntermediate(); @@ -301,10 +301,10 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( GET_IR_NODE_FROM_SUBGRAPH(conv_y_filter, conv_filter, conv_y_pattern); GET_IR_NODE_FROM_SUBGRAPH(conv_y_output, conv_output, conv_y_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - 
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); if (!IsCompat(subgraph, g)) { LOG(WARNING) @@ -312,8 +312,8 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( return; } - if (FindFuseOption(*conv_x_op, *elementwise_add_op) != FUSE_MKLDNN) return; - if (FindFuseOption(*conv_y_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_x_op, *elementwise_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_y_op, *elementwise_op) != FUSE_MKLDNN) return; Node* projection_node; Node* residual_conv_op; @@ -333,14 +333,14 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( if (HasFusedActivation(residual_conv_op)) return; residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); - residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + residual_conv_op->Op()->SetOutput("Output", {elementwise_out->Name()}); residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); - GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_add_op}); + GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_op}); IR_NODE_LINK_TO(projection_node, residual_conv_op); - IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out); + IR_NODE_LINK_TO(residual_conv_op, elementwise_out); found_projection_conv_count++; }; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 371482b5343..f4358fb243f 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -807,74 +807,74 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const { PrettyLogDetail("--- quantized %d matmul ops", quantize_matmul_count); } -void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const { +void CPUQuantizePass::QuantizeElementwise( + Graph* graph, const std::string elementwise_type) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); - patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_}; + patterns::Elementwise elementwise_pattern{pattern, name_scope_}; - elementwise_add_pattern( - pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()), - pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); + elementwise_pattern( + pattern->NewNode(elementwise_pattern.elementwise_x_repr()), + pattern->NewNode(elementwise_pattern.elementwise_y_repr()), + elementwise_type); - int quantize_elementwise_add_count = 0; + int quantize_elementwise_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "Quantize elementwise_add op"; - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); + VLOG(4) << "Quantize " + elementwise_type + " op"; + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, + elementwise_pattern); // skip if should not be quantized - if (!platform::HasOpINT8DataType(elementwise_add_op->Op())) { - LogQuantizationDisabled(elementwise_add_op); + if (!platform::HasOpINT8DataType(elementwise_op->Op())) { + LogQuantizationDisabled(elementwise_op); return; } - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y, - 
elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_x, elementwise_x, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_y, elementwise_y, + elementwise_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_pattern); if (!AreScalesPresentForNodes( - {elementwise_add_x, elementwise_add_y, elementwise_add_out})) { - LogCannotQuantizeOp(elementwise_add_op, + {elementwise_x, elementwise_y, elementwise_out})) { + LogCannotQuantizeOp(elementwise_op, "No scale available for the operator"); return; } bool is_x_unsigned{false}, is_y_unsigned{false}; - auto input_x_scale = - GetScaleValueForNode(elementwise_add_x, &is_x_unsigned); - auto input_y_scale = - GetScaleValueForNode(elementwise_add_y, &is_y_unsigned); + auto input_x_scale = GetScaleValueForNode(elementwise_x, &is_x_unsigned); + auto input_y_scale = GetScaleValueForNode(elementwise_y, &is_y_unsigned); // TODO(sfraczek): add support for different signness if (is_x_unsigned != is_y_unsigned) { - LogCannotQuantizeOp(elementwise_add_op, - "ElementwiseAdd inputs must be of the same type."); + LogCannotQuantizeOp(elementwise_op, + "Elementwise inputs must be of the same type."); return; } - QuantizeInput(g, elementwise_add_op, elementwise_add_x, "X", input_x_scale, + QuantizeInput(g, elementwise_op, elementwise_x, "X", input_x_scale, is_x_unsigned, "Scale_x"); - QuantizeInput(g, elementwise_add_op, elementwise_add_y, "Y", input_y_scale, + QuantizeInput(g, elementwise_op, elementwise_y, "Y", input_y_scale, is_y_unsigned, "Scale_y"); bool is_output_unsigned{false}; auto output_scale = - GetScaleValueForNode(elementwise_add_out, &is_output_unsigned); + GetScaleValueForNode(elementwise_out, &is_output_unsigned); - DequantizeOutput(g, elementwise_add_op, elementwise_add_out, "Out", - output_scale, is_output_unsigned, "Scale_out"); + DequantizeOutput(g, elementwise_op, elementwise_out, "Out", output_scale, + is_output_unsigned, "Scale_out"); - ++quantize_elementwise_add_count; + ++quantize_elementwise_count; }; gpd(graph, handler); - AddStatis(quantize_elementwise_add_count); + AddStatis(quantize_elementwise_count); - PrettyLogDetail("--- quantized %d elementwise_add ops", - quantize_elementwise_add_count); + PrettyLogDetail("--- quantized %d %s ops", quantize_elementwise_count, + elementwise_type); } void CPUQuantizePass::QuantizeFusionGru(Graph* graph) const { @@ -1146,7 +1146,8 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeFc(graph); QuantizeReshape(graph); QuantizeMatmul(graph); - QuantizeElementwiseAdd(graph); + QuantizeElementwise(graph, "elementwise_add"); + QuantizeElementwise(graph, "elementwise_mul"); QuantizeFusionGru(graph); QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 412c4e40a01..3a286264e41 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -57,7 +57,8 @@ class CPUQuantizePass : public FusePassBase { void QuantizeTranspose(Graph* graph) const; void QuantizeReshape(Graph* graph) const; void QuantizeMatmul(Graph* graph) const; - void QuantizeElementwiseAdd(Graph* graph) const; + void QuantizeElementwise(Graph* graph, + const std::string elementwise_type) const; void QuantizeFusionGru(Graph* graph) const; void QuantizeMultiGru(Graph* graph) const; void 
QuantizeFusionLSTM(Graph* graph) const; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 889417b78c8..22000865948 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -90,7 +90,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetAttr("Scale_x", 1.0f); op->SetAttr("Scale_y", 1.0f); op->SetAttr("Scale_out", 1.0f); - } else if (type == "elementwise_add") { + } else if (type == "elementwise_add" || type == "elementwise_mul") { op->SetInput("X", {inputs[0]}); if (inputs.size() > 1) op->SetInput("Y", {inputs[1]}); op->SetOutput("Out", {outputs[0]}); @@ -167,7 +167,8 @@ void CheckScales(const OpDesc* op, float scale, float shift) { scale); scale_names.push_back("Scale_in"); scale_names.push_back("Scale_out"); - } else if (type == "matmul" || type == "elementwise_add") { + } else if (type == "matmul" || type == "elementwise_add" || + type == "elementwise_mul") { scale_names.push_back("Scale_x"); scale_names.push_back("Scale_y"); scale_names.push_back("Scale_out"); @@ -546,46 +547,77 @@ TEST(CpuQuantizePass, matmul_not_quantized) { expected_operators, added_nodes, 1.0f); } -static const std::initializer_list variable_names_elementwise_add = - {"a", "b", "c", "d", "e", "f"}; +static const std::initializer_list variable_names_elementwise = { + "a", "b", "c", "d", "e", "f"}; -ProgramDesc BuildProgramDescElementwiseAdd() { +ProgramDesc BuildProgramDescElementwise(const std::string elementwise_type, + const std::string elementwise_name) { ProgramDesc prog; - for (auto& v : variable_names_elementwise_add) { + for (auto& v : variable_names_elementwise) { prog.MutableBlock(0)->Var(v); } SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true); - SetOp(&prog, "elementwise_add", "ElementwiseAdd", {"b", "d"}, {"e"}, true, + SetOp(&prog, elementwise_type, elementwise_name, {"b", "d"}, {"e"}, true, "int8"); SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, "float32"); return prog; } -TEST(CpuQuantizePass, elementwise_add) { +void TestElementwise(const std::string elementwise_type, + const std::string elementwise_name) { // 2 Quant + 2 IN + 1 DeQuant + 1 OUT int added_nodes = 6; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 2}, {"dequantize", 3}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, SCALE * S8_MAX); + {elementwise_type, 1}, {"quantize", 2}, {"dequantize", 3}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, + SCALE * S8_MAX); } -TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { +void TestElementwiseOutputScaleMissing(const std::string elementwise_type, + const std::string elementwise_name) { int added_nodes = 0; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, 1.f, 1.f, "e"); + {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, 1.f, + 1.f, "e"); } -TEST(CpuQuantizePass, 
elementwise_add_unsigned_and_signed_input) { +void TestElementwiseUnsignedAndSignedInput(const std::string elementwise_type, + const std::string elementwise_name) { int added_nodes = 0; std::unordered_map expected_operators = { - {"elementwise_add", 1}, {"quantize", 0}, {"dequantize", 2}}; - MainTest(BuildProgramDescElementwiseAdd(), variable_names_elementwise_add, - expected_operators, added_nodes, 1.f, 1.f, "", "b"); + {elementwise_type, 1}, {"quantize", 0}, {"dequantize", 2}}; + MainTest(BuildProgramDescElementwise(elementwise_type, elementwise_name), + variable_names_elementwise, expected_operators, added_nodes, 1.f, + 1.f, "", "b"); +} + +TEST(CpuQuantizePass, elementwise_add) { + TestElementwise("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_add_output_scale_missing) { + TestElementwiseOutputScaleMissing("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) { + TestElementwiseUnsignedAndSignedInput("elementwise_add", "ElementwiseAdd"); +} + +TEST(CpuQuantizePass, elementwise_mul) { + TestElementwise("elementwise_mul", "ElementwiseMul"); +} + +TEST(CpuQuantizePass, elementwise_mul_output_scale_missing) { + TestElementwiseOutputScaleMissing("elementwise_mul", "ElementwiseMul"); +} + +TEST(CpuQuantizePass, elementwise_mul_unsigned_and_signed_input) { + TestElementwiseUnsignedAndSignedInput("elementwise_mul", "ElementwiseMul"); } const std::vector churn_out_vars(ProgramDesc* prog, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 5f74b61ee86..3b883dac978 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -26,10 +26,10 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Marks operators which are to be quantized."; std::unordered_set supported_op_types = std::unordered_set( - {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", "fc", - "matmul", "nearest_interp", "nearest_interp_v2", "pool2d", - "prior_box", "reshape2", "transpose2", "fusion_gru", "fusion_lstm", - "multi_gru", "slice"}); + {"concat", "conv2d", "depthwise_conv2d", "elementwise_add", + "elementwise_mul", "fc", "matmul", "nearest_interp", + "nearest_interp_v2", "pool2d", "prior_box", "reshape2", "transpose2", + "fusion_gru", "fusion_lstm", "multi_gru", "slice"}); const auto& excluded_ids_list = Get>("quantize_excluded_op_ids"); const auto& op_types_list = -- GitLab From 7004f65c53acf3cdeef99cd3c53e20b22fa0f1ac Mon Sep 17 00:00:00 2001 From: piotrekobi <48731682+piotrekobi@users.noreply.github.com> Date: Wed, 16 Mar 2022 11:46:03 +0100 Subject: [PATCH 111/176] Refactor elementwise op grad classes (#40187) * Refactor elementwise op grad classes * Add more refactor changes * Revert set layout and format deletion * Fix failing elementwise test --- .../mkldnn/elementwise_add_mkldnn_op.cc | 102 +------- .../mkldnn/elementwise_div_mkldnn_op.cc | 174 +++----------- .../mkldnn/elementwise_mkldnn_op.h | 223 ++++++++++++++++-- .../mkldnn/elementwise_mul_mkldnn_op.cc | 142 ++--------- .../mkldnn/elementwise_sub_mkldnn_op.cc | 117 +-------- .../unittests/test_elementwise_add_op.py | 2 +- .../unittests/test_elementwise_mul_op.py | 2 +- 7 files changed, 266 insertions(+), 496 deletions(-) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc 
b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 838df2e1625..f9347d28104 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,100 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto tz = phi::vectorize(dout->dims()); - memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - platform::ReorderMKLDNNHandler handler( - tz, framework::TransToProtoVarType(dout->dtype()), dout_type, - onednn_engine); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - auto reorder_src_memory_p = handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); - - if (dx) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - dx->set_layout(DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } - - if (dy) { - // Direct copy - if (dout->dims() == dy->dims()) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, - *reorder_dst_memory_p); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } else { - // Broadcasting - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - reduction_p->execute(astream, {{DNNL_ARG_SRC, *reorder_src_memory_p}, - {DNNL_ARG_DST, *dy_memory_p}}); 
- astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - } - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_OP_KERNEL( @@ -116,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) -REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseAddMKLDNNGradKernel, - ops::EltwiseAddMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc index 367d602f590..c68aa8d3d1b 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_div_mkldnn_op.cc @@ -1,146 +1,28 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseDivMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (dx) { - // dx = dout / y - - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), - dout, y, dx, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_dx_memory = handler.AcquireDstMemory(dx); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_y_memory}, - {DNNL_ARG_DST, *dst_dx_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory)); - } - - if (dy) { - // dy = -dout * out / y - - platform::BinaryMKLDNNHandler y_handler( - dnnl::algorithm::binary_div, axis, mkldnn_engine, ctx.GetPlace(), y, - y, nullptr, 1.0f, 1.0f, 1.0f); - - const auto 
y_memory = y_handler.AcquireSrcMemory(y); - - dnnl::post_ops po; - po.append_binary(dnnl::algorithm::binary_div, y_memory->get_desc()); - - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, out, nullptr, -1.0f, 1.0f, 1.0f, po); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_out_memory = handler.AcquireSecondSrcMemory(out); - - // If broadcasting is in use then let's write to temporary - // buffer allocated by oneDNN - const auto dst_dy_memory = (dout->dims() == dy->dims()) - ? handler.AcquireDstMemory(dy) - : handler.AcquireDstMemory(); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_out_memory}, - {DNNL_ARG_DST, *dst_dy_memory}, - {DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, *y_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dy->set_layout(framework::DataLayout::kMKLDNN); - - // Reduction is needed for broadcasting scenario - if (dout->dims() != dy->dims()) { - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - - // As source we use mem object with results from binary operation - reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - - } else { - dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory)); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -// TODO(piotrekobi) add int8, uint8 support -REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace, - ops::EltwiseMKLDNNKernel, - ops::EltwiseMKLDNNKernel) - -REGISTER_OP_KERNEL(elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace, - ops::EltwiseDivMKLDNNGradKernel, - ops::EltwiseDivMKLDNNGradKernel) +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(elementwise_div, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNKernel, + ops::EltwiseMKLDNNKernel) + +REGISTER_OP_KERNEL( + elementwise_div_grad, MKLDNN, paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index ad8fd317013..761b401ca9a 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -15,20 +15,35 @@ #pragma once #include #include -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { -using framework::DataLayout; -using framework::Tensor; using dnnl::memory; using dnnl::primitive; using dnnl::stream; +using framework::DataLayout; +using framework::Tensor; + +inline std::vector CalculateBroadcastedDims(const Tensor* x, + const Tensor* y) { + const auto src_tz = phi::vectorize(x->dims()); + const auto dst_tz = phi::vectorize(y->dims()); + + size_t j = 0; + std::vector dst_tz_ex(src_tz.size(), 1); + for (size_t i = 0; i < src_tz.size(); ++i) { + dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++]; + if (j == dst_tz.size()) break; + } + + return dst_tz_ex; +} template class EltwiseMKLDNNKernel : public framework::OpKernel { @@ -103,7 +118,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { // operation. const bool reuse_x_memopry = x->numel() == z->numel() && x->IsSharedBufferWith(*z); - std::shared_ptr dst_memory = nullptr; + std::shared_ptr dst_memory; if (reuse_x_memopry) { dst_memory = src_x_memory; // NOTE(chenfeiyu): when the output reuses memory from other tensor rather @@ -135,19 +150,193 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { } }; -inline std::vector CalculateBroadcastedDims(const Tensor* x, - const Tensor* y) { - const auto src_tz = phi::vectorize(x->dims()); - const auto dst_tz = phi::vectorize(y->dims()); +template +class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); + using Tensor = framework::Tensor; - size_t j = 0; - std::vector dst_tz_ex(src_tz.size(), 1); - for (size_t i = 0; i < src_tz.size(); ++i) { - dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 
1 : dst_tz[j++]; - if (j == dst_tz.size()) break; - } + auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); - return dst_tz_ex; -} + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Input("Out"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + int axis = ctx.Attr("axis"); + + auto tz = phi::vectorize(dout->dims()); + auto proto_type_dout = framework::TransToProtoVarType(dout->dtype()); + + platform::ReorderMKLDNNHandler reorder_handler( + tz, proto_type_dout, framework::ToMKLDNNDataType(proto_type_dout), + onednn_engine); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + dout->format(), platform::to_void_cast(dout->data())); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + if (dx) { + std::shared_ptr dst_memory; + + // elementwise_add & elementwise_sub + if (BINARY_OP == dnnl::algorithm::binary_add || + BINARY_OP == dnnl::algorithm::binary_sub) { + dst_memory = reorder_handler.AcquireDstMemory(dx, dout->format(), + ctx.GetPlace()); + auto reorder_p = + reorder_handler.AcquireReorder(dst_memory, reorder_src_memory_p); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); + + reorder_p->execute(astream, *reorder_src_memory_p, *dst_memory); + } + + // elementwise_mul & elementwise_div + else { + platform::BinaryMKLDNNHandler binary_handler( + BINARY_OP, axis, onednn_engine, ctx.GetPlace(), dout, y, dx, 1.0f, + 1.0f, 1.0f); + + const auto src_dout_memory = binary_handler.AcquireSrcMemory(dout); + const auto src_y_memory = binary_handler.AcquireSecondSrcMemory(y); + dst_memory = binary_handler.AcquireDstMemory(dx); + + const auto binary_prim = binary_handler.AcquireForwardPrimitive(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_dout_memory}, + {DNNL_ARG_SRC_1, *src_y_memory}, + {DNNL_ARG_DST, *dst_memory}}; + + binary_prim->execute(astream, args); + } + astream.wait(); + + dx->set_layout(framework::DataLayout::kMKLDNN); + dx->set_format(platform::GetMKLDNNFormat(*dst_memory)); + } + + if (dy) { + dnnl::primitive_attr broadcast_reduction_attr; + std::shared_ptr broadcast_src_memory; + std::shared_ptr dst_memory; + + // elementwise_add & elementwise_sub + if (BINARY_OP == dnnl::algorithm::binary_add || + BINARY_OP == dnnl::algorithm::binary_sub) { + if (dout->dims() == dy->dims()) { + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + dy, dout->format(), ctx.GetPlace()); + + dnnl::primitive_attr reorder_attr; + std::vector scales(1); + scales[0] = (BINARY_OP == dnnl::algorithm::binary_add) ? 
1 : -1; + reorder_attr.set_output_scales(0, scales); + auto reorder_p = std::make_shared( + *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); + platform::RecordEvent record_reorder( + "int_reorder", platform::TracerEventType::UserDefined, 2, + platform::EventRole::kUniqueOp); + reorder_p->execute(astream, *reorder_src_memory_p, + *reorder_dst_memory_p); + + dst_memory = reorder_dst_memory_p; + } else { + broadcast_src_memory = reorder_src_memory_p; + } + } + + // elementwise_mul & elementwise_div + else { + std::unordered_map args; + std::shared_ptr binary_prim; + std::shared_ptr post_op_memory; + std::shared_ptr src_0_memory; + std::shared_ptr src_1_memory; + + platform::BinaryMKLDNNHandler binary_handler( + dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(), + dout, x, nullptr, 1.0f, 1.0f, 1.0f); + + src_1_memory = binary_handler.AcquireSecondSrcMemory(x); + + if (BINARY_OP == dnnl::algorithm::binary_div) { + platform::BinaryMKLDNNHandler post_op_binary_handler( + dnnl::algorithm::binary_div, axis, onednn_engine, ctx.GetPlace(), + y, y, nullptr, 1.0f, 1.0f, 1.0f); + + post_op_memory = post_op_binary_handler.AcquireSrcMemory(y); + + dnnl::post_ops po; + po.append_binary(dnnl::algorithm::binary_div, + post_op_memory->get_desc()); + + binary_handler = platform::BinaryMKLDNNHandler( + dnnl::algorithm::binary_mul, axis, onednn_engine, ctx.GetPlace(), + dout, out, nullptr, -1.0f, 1.0f, 1.0f, po); + + src_1_memory = binary_handler.AcquireSecondSrcMemory(out); + } + + src_0_memory = binary_handler.AcquireSrcMemory(dout); + + const auto dst_dy_memory = (dout->dims() == dy->dims()) + ? binary_handler.AcquireDstMemory(dy) + : binary_handler.AcquireDstMemory(); + + binary_prim = binary_handler.AcquireForwardPrimitive(); + args = {{DNNL_ARG_SRC_0, *src_0_memory}, + {DNNL_ARG_SRC_1, *src_1_memory}, + {DNNL_ARG_DST, *dst_dy_memory}}; + + if (BINARY_OP == dnnl::algorithm::binary_div) + args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, + *post_op_memory}); + + binary_prim->execute(astream, args); + broadcast_src_memory = dst_dy_memory; + dst_memory = dst_dy_memory; + } + astream.wait(); + dy->set_layout(DataLayout::kMKLDNN); + + if (dout->dims() != dy->dims()) { + // Broadcasting + if (BINARY_OP == dnnl::algorithm::binary_sub) { + dnnl::post_ops po; + po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); + broadcast_reduction_attr.set_post_ops(po); + } + + platform::ReductionMKLDNNHandler reduction_handler( + dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, + ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), + broadcast_reduction_attr); + dst_memory = reduction_handler.AcquireDstMemory(dy); + + auto reduction_p = reduction_handler.AcquireForwardPrimitive(); + + reduction_p->execute(astream, { + {DNNL_ARG_SRC, *broadcast_src_memory}, + {DNNL_ARG_DST, *dst_memory}, + }); + astream.wait(); + dy->set_format(platform::GetMKLDNNFormat(dst_memory->get_desc().reshape( + phi::vectorize(dy->dims())))); + } else { + dy->set_format(platform::GetMKLDNNFormat(*dst_memory)); + } + } + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index c03794012ff..0ef5c5e628c 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -1,127 +1,19 @@ -/* Copyright (c) 2020 PaddlePaddle 
Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (dx) { - // dx = dout*y - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, y, dx, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - const auto dst_dx_memory = handler.AcquireDstMemory(dx); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_y_memory}, - {DNNL_ARG_DST, *dst_dx_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory)); - } - - if (dy) { - // dy = dout*x - // Handler is having nullptr passed instead of output tensor as - // we want Dst buffer to be allocated by oneDNN not to use Tensor - platform::BinaryMKLDNNHandler handler( - dnnl::algorithm::binary_mul, axis, mkldnn_engine, ctx.GetPlace(), - dout, x, nullptr, 1.0f, 1.0f, 1.0f); - - const auto src_dout_memory = handler.AcquireSrcMemory(dout); - const auto src_x_memory = handler.AcquireSecondSrcMemory(x); - - // If broadcasting is in use then let's write to temporary - // buffer allocated by oneDNN - const auto dst_dy_memory = (dout->dims() == dy->dims()) - ? 
handler.AcquireDstMemory(dy) - : handler.AcquireDstMemory(); - - const auto binary_prim = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *src_dout_memory}, - {DNNL_ARG_SRC_1, *src_x_memory}, - {DNNL_ARG_DST, *dst_dy_memory}}; - - binary_prim->execute(astream, args); - astream.wait(); - - dy->set_layout(framework::DataLayout::kMKLDNN); - - // Reduction is needed for broadcasting scenario - if (dout->dims() != dy->dims()) { - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, mkldnn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy)); - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - // As source we use mem object with results from binary operation - reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory}, - {DNNL_ARG_DST, *dy_memory_p}}); - astream.wait(); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - - } else { - dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory)); - } - } - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_OP_KERNEL( @@ -132,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) -REGISTER_OP_KERNEL(elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseMulMKLDNNGradKernel, - ops::EltwiseMulMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc index 3c799008a2a..510373831eb 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_sub_mkldnn_op.cc @@ -1,5 +1,4 @@ - -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,113 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" -namespace paddle { -namespace framework { -class ExecutionContext; -} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -template -class EltwiseSubMKLDNNGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto tz = phi::vectorize(dout->dims()); - memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - platform::ReorderMKLDNNHandler handler( - tz, framework::TransToProtoVarType(dout->dtype()), dout_type, - onednn_engine); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - auto reorder_src_memory_p = handler.AcquireSrcMemory( - dout->format(), platform::to_void_cast(dout->data())); - - if (dx) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dx, dout->format(), ctx.GetPlace()); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - dx->set_layout(DataLayout::kMKLDNN); - dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } - - if (dy) { - // Direct copy - if (dout->dims() == dy->dims()) { - auto reorder_dst_memory_p = - handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace()); - - dnnl::primitive_attr reorder_attr; - std::vector scales = {-1}; - reorder_attr.set_output_scales(0, scales); - auto reorder_p = std::make_shared( - *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); - platform::RecordEvent record_reorder( - "int_reorder", platform::TracerEventType::UserDefined, 2, - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, *reorder_src_memory_p, - *reorder_dst_memory_p); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); - } else { - // Broadcasting - - dnnl::post_ops po; - po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); - dnnl::primitive_attr attr; - attr.set_post_ops(po); - - platform::ReductionMKLDNNHandler handler_sum( - dnnl::algorithm::reduction_sum, 0.0f, 0.0f, onednn_engine, - ctx.GetPlace(), dout, dy, CalculateBroadcastedDims(dout, dy), attr); - - auto dy_memory_p = handler_sum.AcquireDstMemory(dy); - auto reduction_p = handler_sum.AcquireForwardPrimitive(); - - reduction_p->execute(astream, { - {DNNL_ARG_SRC, *reorder_src_memory_p}, - {DNNL_ARG_DST, *dy_memory_p}, - }); - astream.wait(); - - dy->set_layout(DataLayout::kMKLDNN); - dy->set_format( - platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( - phi::vectorize(dy->dims())))); - } - } - } -}; - -} // namespace operators -} // namespace paddle namespace ops = paddle::operators; @@ -131,6 +24,8 @@ REGISTER_OP_KERNEL( ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) 
-REGISTER_OP_KERNEL(elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::EltwiseSubMKLDNNGradKernel, - ops::EltwiseSubMKLDNNGradKernel) +REGISTER_OP_KERNEL( + elementwise_sub_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMKLDNNGradKernel, + ops::EltwiseMKLDNNGradKernel) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index d1d391a3949..318e826058f 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -17,7 +17,7 @@ import unittest import numpy as np import paddle import paddle.fluid.core as core -from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index 00967cb503f..b35b2840ed3 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -23,7 +23,7 @@ import paddle.fluid.core as core from paddle.fluid import Program, compiler, program_guard from paddle.fluid.op import Operator -from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 class ElementwiseMulOp(OpTest): -- GitLab From 00183a93d61aad84183d662e91125b69f67de72e Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 16 Mar 2022 19:11:31 +0800 Subject: [PATCH 112/176] [Phi] Migrate mode_op and mode_grad_op into Phi (#40571) * [Phi] Migrate mode_op and mode_grad_op into Phi * fix omp * add ifdef * migrate infershape * modify according reviewer --- paddle/fluid/operators/mode_op.cc | 60 +--- paddle/fluid/operators/mode_op.cu | 232 --------------- paddle/fluid/operators/mode_op.h | 317 --------------------- paddle/phi/infermeta/unary.cc | 43 +++ paddle/phi/infermeta/unary.h | 6 + paddle/phi/kernels/cpu/mode_grad_kernel.cc | 170 +++++++++++ paddle/phi/kernels/cpu/mode_kernel.cc | 121 ++++++++ paddle/phi/kernels/funcs/mode.h | 197 +++++++++++++ paddle/phi/kernels/gpu/mode_grad_kernel.cu | 85 ++++++ paddle/phi/kernels/gpu/mode_kernel.cu | 119 ++++++++ paddle/phi/kernels/mode_grad_kernel.h | 30 ++ paddle/phi/kernels/mode_kernel.h | 29 ++ paddle/phi/ops/compat/mode_sig.cc | 34 +++ 13 files changed, 844 insertions(+), 599 deletions(-) delete mode 100644 paddle/fluid/operators/mode_op.cu delete mode 100644 paddle/fluid/operators/mode_op.h create mode 100644 paddle/phi/kernels/cpu/mode_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/mode_kernel.cc create mode 100644 paddle/phi/kernels/funcs/mode.h create mode 100644 paddle/phi/kernels/gpu/mode_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/mode_kernel.cu create mode 100644 paddle/phi/kernels/mode_grad_kernel.h create mode 100644 paddle/phi/kernels/mode_kernel.h create mode 100644 paddle/phi/ops/compat/mode_sig.cc diff --git a/paddle/fluid/operators/mode_op.cc b/paddle/fluid/operators/mode_op.cc index c7fb92cd510..9c16ccb138f 100644 --- a/paddle/fluid/operators/mode_op.cc +++ b/paddle/fluid/operators/mode_op.cc @@ -12,10 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/mode_op.h" #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -23,43 +27,6 @@ class ModeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mode"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "mode"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "mode"); - - auto input_dims = ctx->GetInputDim("X"); - const int& dim_size = input_dims.size(); - int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_EQ( - (axis < dim_size) && (axis >= (-1 * dim_size)), true, - paddle::platform::errors::InvalidArgument( - "the axis of ModeOp must be [-%d, %d), but you set axis is %d", - dim_size, dim_size, axis)); - PADDLE_ENFORCE_GE(input_dims.size(), 1, - paddle::platform::errors::InvalidArgument( - "input of ModeOp must have >= 1d shape")); - if (axis < 0) axis += dim_size; - bool keepdim = ctx->Attrs().Get("keepdim"); - std::vector dimvec; - for (int64_t i = 0; i < axis; i++) { - dimvec.emplace_back(input_dims[i]); - } - if (keepdim) { - dimvec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < dim_size; i++) { - dimvec.emplace_back(input_dims[i]); - } - framework::DDim dims = phi::make_ddim(dimvec); - PADDLE_ENFORCE_GE(input_dims.size(), 1, platform::errors::InvalidArgument( - "input shape should >= 1d")); - ctx->SetOutputDim("Out", dims); - ctx->SetOutputDim("Indices", dims); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -138,18 +105,11 @@ class ModeGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(mode, ModeInferShapeFunctor, + PD_INFER_META(phi::ModeInferMeta)); REGISTER_OPERATOR(mode, ops::ModeOp, ops::ModeOpMaker, ops::ModeGradOpMaker, - ops::ModeGradOpMaker); -REGISTER_OP_CPU_KERNEL(mode, - ops::ModeCPUKernel, - ops::ModeCPUKernel, - ops::ModeCPUKernel, - ops::ModeCPUKernel); - + ops::ModeGradOpMaker, + ModeInferShapeFunctor); REGISTER_OPERATOR(mode_grad, ops::ModeOpGrad); -REGISTER_OP_CPU_KERNEL( - mode_grad, ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel, - ops::ModeGradCPUKernel); diff --git a/paddle/fluid/operators/mode_op.cu b/paddle/fluid/operators/mode_op.cu deleted file mode 100644 index 2bacda8afb0..00000000000 --- a/paddle/fluid/operators/mode_op.cu +++ /dev/null @@ -1,232 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/mode_op.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" - -namespace paddle { -namespace operators { - -int ComputeBlockSize(int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; -} - -template -void getModebySort(const platform::CUDADeviceContext& ctx, - const framework::Tensor* input_tensor, - const int64_t num_cols, const int64_t num_rows, - T* out_tensor, int64_t* indices_tensor) { - framework::Tensor input_tmp; - framework::TensorCopy(*input_tensor, ctx.GetPlace(), &input_tmp); - T* input_tmp_data = input_tmp.mutable_data(ctx.GetPlace()); - input_tmp.Resize(phi::make_ddim({num_rows, num_cols})); - thrust::device_ptr out_tensor_ptr(out_tensor); - thrust::device_ptr indices_tensor_ptr(indices_tensor); - - for (int64_t i = 0; i < num_rows; ++i) { - T* begin = input_tmp_data + num_cols * i; - T* end = input_tmp_data + num_cols * (i + 1); - thrust::device_vector indices_data(num_cols); - thrust::sequence(thrust::device, indices_data.begin(), - indices_data.begin() + num_cols); - thrust::sort_by_key(thrust::device, begin, end, indices_data.begin()); - int unique = 1 + thrust::inner_product(thrust::device, begin, end - 1, - begin + 1, 0, thrust::plus(), - thrust::not_equal_to()); - thrust::device_vector keys_data(unique); - thrust::device_vector cnts_data(unique); - thrust::reduce_by_key(thrust::device, begin, end, - thrust::constant_iterator(1), keys_data.begin(), - cnts_data.begin()); - auto it = thrust::max_element(thrust::device, cnts_data.begin(), - cnts_data.begin() + unique); - T mode = keys_data[it - cnts_data.begin()]; - int64_t counts = cnts_data[it - cnts_data.begin()]; - auto pos = thrust::find(thrust::device, begin, end, mode); - int64_t index = indices_data[pos - begin + counts - 1]; - out_tensor_ptr[i] = static_cast(mode); - indices_tensor_ptr[i] = static_cast(index); - } -} - -template -class ModeOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = static_cast(ctx.Attr("axis")); - bool keepdim = static_cast(ctx.Attr("keepdim")); - - // get the input dims - const auto& in_dims = input->dims(); - // calcluate the real axis - if (axis < 0) axis += in_dims.size(); - - auto out_dims = output->dims(); - - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - 
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - getModebySort(dev_ctx, input, input_width, input_height, output_data, - indices_data); - } else { - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dim); - indices->Resize(tmp_out_dim); - } - - framework::DDim trans_shape(in_dims); - framework::DDim trans_out_shape(in_dims); - for (int i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = in_dims[trans_axis[i]]; - trans_out_shape[i] = in_dims[trans_axis[i]]; - } - trans_out_shape[in_dims.size() - 1] = 1; - - // second step, tranpose the input - framework::Tensor trans_input; - trans_input.mutable_data(trans_shape, ctx.GetPlace()); - int ndims = trans_axis.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans_axis); - framework::Tensor trans_ind; - int64_t* trans_ind_data = - trans_ind.mutable_data(trans_out_shape, ctx.GetPlace()); - framework::Tensor trans_out; - T* trans_out_data = - trans_out.mutable_data(trans_out_shape, ctx.GetPlace()); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); - const int64_t input_width = trans_shape[trans_shape.size() - 1]; - getModebySort(dev_ctx, &trans_input, input_width, input_height, - trans_out_data, trans_ind_data); - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans_axis); - TransCompute(ndims, dev_ctx, trans_out, - output, trans_axis); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class ModeOpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - - const auto& in_dims = x->dims(); - auto out_dims = indices->dims(); - - if (axis < 0) axis += in_dims.size(); - // allocate the cuda memory for the x_grad - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - - // calcluate the block and grid num - auto& dev_ctx = context.cuda_device_context(); - int block_size = ComputeBlockSize(post); - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - AssignGradWithAxis<<>>( - 
out_grad_data, indices_data, x_grad_data, pre, post, n, 1); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - mode, ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel, - ops::ModeOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - mode_grad, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel, - ops::ModeOpGradCUDAKernel); diff --git a/paddle/fluid/operators/mode_op.h b/paddle/fluid/operators/mode_op.h deleted file mode 100644 index 76d356ed16e..00000000000 --- a/paddle/fluid/operators/mode_op.h +++ /dev/null @@ -1,317 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -template -static void getMode(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - std::sort(col_vec.begin(), col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - T mode = 0; - int64_t indice = 0; - int64_t cur_freq = 0; - int64_t max_freq = 0; - for (int64_t i = 0; i < input_width; ++i) { - ++cur_freq; - if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) { - if (cur_freq > max_freq) { - max_freq = cur_freq; - mode = col_vec[i].first; - indice = col_vec[i].second; - } - cur_freq = 0; - } - } - t_out[i] = mode; - t_indices[i] = indice; - } -} - -template -static void ModeAssign(const Type& input_height, const Type& input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto e_indices = framework::EigenVector::Flatten(*indices); - output_data[i * input_width + e_indices(0)] = e_input(0); - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); - } - } -} - -template -class 
ModeCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& in_dims = input->dims(); - bool keepdim = static_cast(context.Attr("keepdim")); - - // axis < 0, cacluate the real axis - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - auto out_dims = output->dims(); - // if axis is not the last dim, transpose it to the last dim, do the - // calculation, - // then tranpose it back to orginal axis. - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - getMode(input_height, input_width, in_dims.size(), input, - output_data, indices_data); - } else { - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dim); - indices->Resize(tmp_out_dim); - } - - // get the trans input_dims, out_dims - framework::DDim trans_shape(in_dims); - framework::DDim trans_out_shape(in_dims); - - for (size_t i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = in_dims[trans_axis[i]]; - trans_out_shape[i] = in_dims[trans_axis[i]]; - } - trans_out_shape[in_dims.size() - 1] = 1; - - framework::Tensor trans_input; - trans_input.mutable_data(trans_shape, context.GetPlace()); - int ndims = trans_axis.size(); - auto& dev_context = - context.template device_context(); - - // transpose the input value - TransCompute(ndims, dev_context, *input, - &trans_input, trans_axis); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); - const int64_t input_width = trans_shape[trans_shape.size() - 1]; - framework::Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_out_shape, context.GetPlace()); - framework::Tensor tmp_indices; - auto* t_ind = tmp_indices.mutable_data(trans_out_shape, - context.GetPlace()); - - getMode(input_height, input_width, in_dims.size(), - &trans_input, t_out, t_ind); - // transpose back - TransCompute( - ndims, dev_context, tmp_indices, indices, trans_axis); - TransCompute(ndims, dev_context, tmp_out, - output, trans_axis); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class ModeGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - bool keepdim = static_cast(context.Attr("keepdim")); - - auto in_dims = x->dims(); - auto 
out_dims = indices->dims(); - - // axis < 0, get the real axis - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(out_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(out_dims[i - 1]); - } - out_dims = phi::make_ddim(tmp_out_shape); - } - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis == in_dims.size() - 1) { - // allocate the memory for the input_grad - // assign the out_grad to input_grad directly - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - // init the output grad with 0, because some input elements has no grad - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - // Assign the output_grad to input_grad - if (keepdim) { - ModeAssign(input_height, input_width, in_dims.size(), out_grad, indices, - x_grad_data); - } else { - auto& dev_context = - context.template device_context(); - framework::Tensor out_grad_tmp; - framework::Tensor indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - ModeAssign(input_height, input_width, in_dims.size(), &out_grad_tmp, - &indices_tmp, x_grad_data); - } - } else { - // can not assign grad to input_grad, must do the transpose - std::vector trans_axis; - for (int i = 0; i < axis; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans_axis.emplace_back(i); - } - trans_axis.emplace_back(axis); - framework::DDim trans_shape(out_dims); - framework::DDim trans_in_shape(in_dims); - for (size_t i = 0; i < trans_axis.size(); i++) { - trans_shape[i] = out_dims[trans_axis[i]]; - trans_in_shape[i] = in_dims[trans_axis[i]]; - } - // transpose the out_grad, indices - framework::Tensor trans_dO; - trans_dO.mutable_data(trans_shape, context.GetPlace()); - framework::Tensor trans_ind; - trans_ind.mutable_data(trans_shape, context.GetPlace()); - int ndims = trans_axis.size(); - auto& dev_context = - context.template device_context(); - - if (keepdim) { - // Do transpose - TransCompute( - ndims, dev_context, *out_grad, &trans_dO, trans_axis); - TransCompute( - ndims, dev_context, *indices, &trans_ind, trans_axis); - } else { - framework::Tensor out_grad_tmp; - framework::Tensor indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - // Do transpose - TransCompute( - ndims, dev_context, out_grad_tmp, &trans_dO, trans_axis); - TransCompute( - ndims, dev_context, indices_tmp, &trans_ind, trans_axis); - } - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1)); - const int64_t input_width = 
trans_in_shape[trans_in_shape.size() - 1]; - - // Assign the out_grad to tranpose input_grad - framework::Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_shape, context.GetPlace()); - memset(t_out, 0, x_grad->numel() * sizeof(T)); - - ModeAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - - // Transpose back - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans_axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 262ada3eaf3..f81f4a1b7c7 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -648,6 +648,49 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x, mask->set_dtype(paddle::experimental::CppTypeToDataType::Type()); } +void ModeInferMeta(const MetaTensor& x, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices) { + auto input_dims = x.dims(); + const int& dim_size = input_dims.size(); + PADDLE_ENFORCE_EQ( + (axis < dim_size) && (axis >= (-1 * dim_size)), + true, + errors::InvalidArgument( + "the axis of ModeOp must be [-%d, %d), but you set axis is %d", + dim_size, + dim_size, + axis)); + PADDLE_ENFORCE_GE( + input_dims.size(), + 1, + errors::InvalidArgument("input of ModeOp must have >= 1d shape")); + if (axis < 0) axis += dim_size; + std::vector dimvec; + for (int64_t i = 0; i < axis; i++) { + dimvec.emplace_back(input_dims[i]); + } + if (keepdim) { + dimvec.emplace_back(static_cast(1)); + } + for (int64_t i = axis + 1; i < dim_size; i++) { + dimvec.emplace_back(input_dims[i]); + } + DDim dims = phi::make_ddim(dimvec); + PADDLE_ENFORCE_GE(input_dims.size(), + 1, + errors::InvalidArgument("input shape should >= 1d")); + out->set_dims(dims); + out->share_lod(x); + out->set_dtype(x.dtype()); + + indices->set_dims(dims); + indices->share_lod(x); + indices->set_dtype(x.dtype()); +} + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 3dfc9b797c0..eb894003e53 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -112,6 +112,12 @@ void MaxPoolWithIndexInferMeta(const MetaTensor& x, MetaTensor* mask, MetaConfig config = MetaConfig()); +void ModeInferMeta(const MetaTensor& x, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices); + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, diff --git a/paddle/phi/kernels/cpu/mode_grad_kernel.cc b/paddle/phi/kernels/cpu/mode_grad_kernel.cc new file mode 100644 index 00000000000..ca813c1757e --- /dev/null +++ b/paddle/phi/kernels/cpu/mode_grad_kernel.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/mode_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template +void ModeGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out_grad, + int axis, + bool keepdim, + DenseTensor* x_grad) { + auto in_dims = x.dims(); + auto out_dims = indices.dims(); + + // axis < 0, get the real axis + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(out_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(out_dims[i - 1]); + } + out_dims = phi::make_ddim(tmp_out_shape); + } + T* x_grad_data = dev_ctx.template Alloc(x_grad); + + if (axis == in_dims.size() - 1) { + // allocate the memory for the input_grad + // assign the out_grad to input_grad directly + const int64_t input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + + // init the output grad with 0, because some input elements has no grad + memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); + // Assign the output_grad to input_grad + if (keepdim) { + funcs::ModeAssign(input_height, + input_width, + in_dims.size(), + &out_grad, + &indices, + x_grad_data); + } else { + DenseTensor out_grad_tmp; + dev_ctx.template Alloc(&out_grad_tmp); + DenseTensor indices_tmp; + dev_ctx.template Alloc(&indices_tmp); + + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, &out_grad_tmp); + phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + + funcs::ModeAssign(input_height, + input_width, + in_dims.size(), + &out_grad_tmp, + &indices_tmp, + x_grad_data); + } + } else { + // can not assign grad to input_grad, must do the transpose + std::vector trans_axis; + for (int i = 0; i < axis; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(out_dims.size() - 1); + for (int i = axis + 1; i < out_dims.size() - 1; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(axis); + DDim trans_shape(out_dims); + DDim trans_in_shape(in_dims); + for (size_t i = 0; i < trans_axis.size(); i++) { + trans_shape[i] = out_dims[trans_axis[i]]; + trans_in_shape[i] = in_dims[trans_axis[i]]; + } + // transpose the out_grad, indices + DenseTensor trans_dO; + trans_dO.Resize(trans_shape); + dev_ctx.template Alloc(&trans_dO); + + DenseTensor trans_ind; + trans_ind.Resize(trans_shape); + dev_ctx.template Alloc(&trans_ind); + + int ndims = trans_axis.size(); + + if (keepdim) { + // Do transpose + funcs::TransCompute( + ndims, dev_ctx, out_grad, &trans_dO, trans_axis); + funcs::TransCompute( + ndims, dev_ctx, indices, &trans_ind, trans_axis); + } else { + DenseTensor out_grad_tmp; + dev_ctx.template Alloc(&out_grad_tmp); + + DenseTensor indices_tmp; + dev_ctx.template Alloc(&indices_tmp); + + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, &out_grad_tmp); + phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + // Do transpose + funcs::TransCompute( + ndims, dev_ctx, out_grad_tmp, &trans_dO, trans_axis); + funcs::TransCompute( + ndims, dev_ctx, indices_tmp, 
&trans_ind, trans_axis); + } + const int64_t input_height = phi::product( + phi::slice_ddim(trans_in_shape, 0, trans_in_shape.size() - 1)); + const int64_t input_width = trans_in_shape[trans_in_shape.size() - 1]; + + // Assign the out_grad to tranpose input_grad + DenseTensor tmp_out; + tmp_out.Resize(trans_in_shape); + T* t_out = dev_ctx.template Alloc(&tmp_out); + memset(t_out, 0, x_grad->numel() * sizeof(T)); + + funcs::ModeAssign(input_height, + input_width, + in_dims.size(), + &trans_dO, + &trans_ind, + t_out); + + // Transpose back + funcs::TransCompute( + ndims, dev_ctx, tmp_out, x_grad, trans_axis); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(mode_grad, + CPU, + ALL_LAYOUT, + phi::ModeGradKernel, + float, + double, + int32_t, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/mode_kernel.cc b/paddle/phi/kernels/cpu/mode_kernel.cc new file mode 100644 index 00000000000..6535d1b89af --- /dev/null +++ b/paddle/phi/kernels/cpu/mode_kernel.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mode_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template +void ModeKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool keepdim, + DenseTensor* out, + DenseTensor* indices) { + const auto& in_dims = x.dims(); + auto out_dims = out->dims(); + // axis < 0, cacluate the real axis + if (axis < 0) axis += in_dims.size(); + + T* output_data = dev_ctx.template Alloc(out); + int64_t* indices_data = dev_ctx.template Alloc(indices); + // if axis is not the last dim, transpose it to the last dim, do the + // calculation, then tranpose it back to original axis. 
+ if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + funcs::GetMode(input_height, + input_width, + in_dims.size(), + &x, + output_data, + indices_data); + } else { + std::vector trans_axis; + for (int i = 0; i < axis; i++) { + trans_axis.emplace_back(i); + } + trans_axis.push_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(axis); + + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); + out->Resize(tmp_out_dim); + indices->Resize(tmp_out_dim); + } + + // get the trans input_dims, out_dims + DDim trans_shape(in_dims); + DDim trans_out_shape(in_dims); + + for (size_t i = 0; i < trans_axis.size(); i++) { + trans_shape[i] = in_dims[trans_axis[i]]; + trans_out_shape[i] = in_dims[trans_axis[i]]; + } + trans_out_shape[in_dims.size() - 1] = 1; + + DenseTensor trans_input; + trans_input.Resize(trans_shape); + dev_ctx.template Alloc(&trans_input); + int ndims = trans_axis.size(); + + // transpose the input value + funcs::TransCompute( + ndims, dev_ctx, x, &trans_input, trans_axis); + + const int64_t input_height = + phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); + const int64_t input_width = trans_shape[trans_shape.size() - 1]; + DenseTensor tmp_out; + tmp_out.Resize(trans_out_shape); + T* t_out = dev_ctx.template Alloc(&tmp_out); + + DenseTensor tmp_indices; + tmp_indices.Resize(trans_out_shape); + int64_t* t_ind = dev_ctx.template Alloc(&tmp_indices); + + funcs::GetMode( + input_height, input_width, in_dims.size(), &trans_input, t_out, t_ind); + // transpose back + funcs::TransCompute( + ndims, dev_ctx, tmp_indices, indices, trans_axis); + funcs::TransCompute( + ndims, dev_ctx, tmp_out, out, trans_axis); + if (!keepdim) { + out->Resize(out_dims); + indices->Resize(out_dims); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + mode, CPU, ALL_LAYOUT, phi::ModeKernel, float, double, int32_t, int64_t) {} diff --git a/paddle/phi/kernels/funcs/mode.h b/paddle/phi/kernels/funcs/mode.h new file mode 100644 index 00000000000..1b7641762e2 --- /dev/null +++ b/paddle/phi/kernels/funcs/mode.h @@ -0,0 +1,197 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include +#include +#include +#include +#include +#include +#include +#include +#endif + +#include +#include +#include +#include +#ifdef PADDLE_WITH_MKLML +#include +#endif + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +namespace funcs { + +static int ComputeBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +static inline void GetDims( + const phi::DDim& dim, int axis, int* pre, int* n, int* post) { + *pre = 1; + *post = 1; + *n = dim[axis]; + for (int i = 0; i < axis; ++i) { + (*pre) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*post) *= dim[i]; + } +} + +template +static void GetMode(Type input_height, + Type input_width, + int input_dim, + const DenseTensor* input, + T* t_out, + Type* t_indices) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + std::vector> col_vec; + col_vec.reserve(input_width); + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(j), j)); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(i, j), j)); + } + } + std::sort(col_vec.begin(), + col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + T mode = 0; + int64_t indice = 0; + int64_t cur_freq = 0; + int64_t max_freq = 0; + for (int64_t i = 0; i < input_width; ++i) { + ++cur_freq; + if (i == input_width - 1 || (col_vec[i + 1].first != col_vec[i].first)) { + if (cur_freq > max_freq) { + max_freq = cur_freq; + mode = col_vec[i].first; + indice = col_vec[i].second; + } + cur_freq = 0; + } + } + t_out[i] = mode; + t_indices[i] = indice; + } +} + +template +static void ModeAssign(const Type& input_height, + const Type& input_width, + const int& input_dim, + const DenseTensor* input, + const DenseTensor* indices, + T* output_data) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + auto e_indices = EigenVector::Flatten(*indices); + output_data[i * input_width + e_indices(0)] = e_input(0); + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); + output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); + } + } +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +static void GetModebySort(const phi::GPUContext& dev_ctx, + const DenseTensor* input_tensor, + const int64_t num_cols, + const int64_t num_rows, + T* out_tensor, + int64_t* indices_tensor) { + DenseTensor input_tmp; + input_tmp.Resize(phi::make_ddim({num_rows, num_cols})); + T* input_tmp_data = dev_ctx.Alloc(&input_tmp); + phi::Copy(dev_ctx, *input_tensor, dev_ctx.GetPlace(), false, &input_tmp); + + 
thrust::device_ptr out_tensor_ptr(out_tensor); + thrust::device_ptr indices_tensor_ptr(indices_tensor); + + for (int64_t i = 0; i < num_rows; ++i) { + T* begin = input_tmp_data + num_cols * i; + T* end = input_tmp_data + num_cols * (i + 1); + thrust::device_vector indices_data(num_cols); + thrust::sequence( + thrust::device, indices_data.begin(), indices_data.begin() + num_cols); + thrust::sort_by_key(thrust::device, begin, end, indices_data.begin()); + int unique = 1 + thrust::inner_product(thrust::device, + begin, + end - 1, + begin + 1, + 0, + thrust::plus(), + thrust::not_equal_to()); + thrust::device_vector keys_data(unique); + thrust::device_vector cnts_data(unique); + thrust::reduce_by_key(thrust::device, + begin, + end, + thrust::constant_iterator(1), + keys_data.begin(), + cnts_data.begin()); + auto it = thrust::max_element( + thrust::device, cnts_data.begin(), cnts_data.begin() + unique); + T mode = keys_data[it - cnts_data.begin()]; + int64_t counts = cnts_data[it - cnts_data.begin()]; + auto pos = thrust::find(thrust::device, begin, end, mode); + int64_t index = indices_data[pos - begin + counts - 1]; + out_tensor_ptr[i] = static_cast(mode); + indices_tensor_ptr[i] = static_cast(index); + } +} +#endif + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/mode_grad_kernel.cu b/paddle/phi/kernels/gpu/mode_grad_kernel.cu new file mode 100644 index 00000000000..43502621c2d --- /dev/null +++ b/paddle/phi/kernels/gpu/mode_grad_kernel.cu @@ -0,0 +1,85 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
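
For reference, the mode computation that funcs::GetMode and funcs::GetModebySort implement above follows a sort-then-scan scheme: sort each row while keeping the original element indices alongside the values, then walk the runs of equal values in the sorted order and keep the value whose run is longest. The standalone C++ sketch below illustrates that scheme for a single row. It is illustrative only: the function and variable names are invented for this example, it omits the NaN-aware comparator the real kernel uses, and it is not part of this patch.

// Illustrative sketch of the "mode by sort" idea used by funcs::GetMode.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Returns {mode value, an original index of that value}. As in the kernel,
// which occurrence is reported depends on how the sort orders equal values.
static std::pair<float, int64_t> ModeOfRow(const std::vector<float>& row) {
  std::vector<std::pair<float, int64_t>> col_vec;
  col_vec.reserve(row.size());
  for (int64_t j = 0; j < static_cast<int64_t>(row.size()); ++j) {
    col_vec.emplace_back(row[j], j);
  }
  // Sort by value so equal elements become contiguous runs.
  std::sort(col_vec.begin(), col_vec.end(),
            [](const auto& l, const auto& r) { return l.first < r.first; });
  float mode = 0.f;
  int64_t indice = 0, cur_freq = 0, max_freq = 0;
  for (size_t i = 0; i < col_vec.size(); ++i) {
    ++cur_freq;
    if (i + 1 == col_vec.size() || col_vec[i + 1].first != col_vec[i].first) {
      if (cur_freq > max_freq) {  // keep the longest run seen so far
        max_freq = cur_freq;
        mode = col_vec[i].first;
        indice = col_vec[i].second;
      }
      cur_freq = 0;
    }
  }
  return {mode, indice};
}

int main() {
  auto result = ModeOfRow({2.f, 1.f, 2.f, 3.f, 2.f});
  std::cout << "mode=" << result.first << " index=" << result.second << "\n";
  // Prints mode=2; the reported index is one of 0, 2 or 4, depending on how
  // the (non-stable) sort ordered the equal values.
  return 0;
}

The GPU path in GetModebySort reaches the same result with thrust primitives: sort_by_key orders the row, inner_product with not_equal_to counts the distinct values, reduce_by_key produces per-value counts, and max_element selects the most frequent one.
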
+ +#include "paddle/phi/kernels/mode_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template +__global__ void AssignGradWithAxis(const T* grad_out, + const int64_t* indices, + T* grad_in, + int pre, + int post, + int raw_height, + int k) { + // raw_height is the length of topk axis + for (int i = blockIdx.x; i < pre; i += gridDim.x) { + int base_index = i * post * k; + int base_grad = i * post * raw_height; + for (int j = threadIdx.x; j < raw_height * post; j += blockDim.x) { + grad_in[base_grad + j] = static_cast(0); + } + __syncthreads(); + for (int j = threadIdx.x; j < k * post; j += blockDim.x) { + int64_t idx_ij = indices[base_index + j]; + int64_t in_ij = base_grad + (idx_ij * post) + (j % post); + grad_in[in_ij] = grad_out[base_index + j]; + } + } +} + +template +void ModeGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out_grad, + int axis, + bool keepdim, + DenseTensor* x_grad) { + const auto& in_dims = x.dims(); + auto out_dims = indices.dims(); + + if (axis < 0) axis += in_dims.size(); + // allocate the cuda memory for the x_grad + T* x_grad_data = dev_ctx.template Alloc(x_grad); + const T* out_grad_data = out_grad.data(); + const int64_t* indices_data = indices.data(); + + int pre, n, post; + funcs::GetDims(in_dims, axis, &pre, &n, &post); + + // calcluate the block and grid num + int block_size = funcs::ComputeBlockSize(post); + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); + int grid_size = std::min(max_blocks, pre); + AssignGradWithAxis<<>>( + out_grad_data, indices_data, x_grad_data, pre, post, n, 1); +} + +} // namespace phi + +PD_REGISTER_KERNEL(mode_grad, + GPU, + ALL_LAYOUT, + phi::ModeGradKernel, + float, + double, + int32_t, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/mode_kernel.cu b/paddle/phi/kernels/gpu/mode_kernel.cu new file mode 100644 index 00000000000..629b9722cd6 --- /dev/null +++ b/paddle/phi/kernels/gpu/mode_kernel.cu @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/mode_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/mode.h" + +namespace phi { + +template +void ModeKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool keepdim, + DenseTensor* out, + DenseTensor* indices) { + // get the input dims + const auto& in_dims = x.dims(); + // calcluate the real axis + if (axis < 0) axis += in_dims.size(); + + auto out_dims = out->dims(); + + const T* input_data = x.data(); + T* output_data = dev_ctx.template Alloc(out); + int64_t* indices_data = dev_ctx.template Alloc(indices); + + if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + funcs::GetModebySort( + dev_ctx, &x, input_width, input_height, output_data, indices_data); + } else { + std::vector trans_axis; + for (int i = 0; i < axis; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans_axis.emplace_back(i); + } + trans_axis.emplace_back(axis); + + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dim = phi::make_ddim(tmp_out_shape); + out->Resize(tmp_out_dim); + indices->Resize(tmp_out_dim); + } + + DDim trans_shape(in_dims); + DDim trans_out_shape(in_dims); + for (int i = 0; i < trans_axis.size(); i++) { + trans_shape[i] = in_dims[trans_axis[i]]; + trans_out_shape[i] = in_dims[trans_axis[i]]; + } + trans_out_shape[in_dims.size() - 1] = 1; + + // second step, tranpose the input + DenseTensor trans_input; + trans_input.Resize(trans_shape); + dev_ctx.template Alloc(&trans_input); + + int ndims = trans_axis.size(); + funcs::TransCompute( + ndims, dev_ctx, x, &trans_input, trans_axis); + DenseTensor trans_ind; + trans_ind.Resize(trans_out_shape); + int64_t* trans_ind_data = dev_ctx.template Alloc(&trans_ind); + + DenseTensor trans_out; + trans_out.Resize(trans_out_shape); + T* trans_out_data = dev_ctx.template Alloc(&trans_out); + + const int64_t input_height = + phi::product(phi::slice_ddim(trans_shape, 0, trans_shape.size() - 1)); + const int64_t input_width = trans_shape[trans_shape.size() - 1]; + funcs::GetModebySort(dev_ctx, + &trans_input, + input_width, + input_height, + trans_out_data, + trans_ind_data); + // last step, tranpose back the indices and output + funcs::TransCompute( + ndims, dev_ctx, trans_ind, indices, trans_axis); + funcs::TransCompute(ndims, dev_ctx, trans_out, out, trans_axis); + if (!keepdim) { + out->Resize(out_dims); + indices->Resize(out_dims); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + mode, GPU, ALL_LAYOUT, phi::ModeKernel, float, double, int32_t, int64_t) {} diff --git a/paddle/phi/kernels/mode_grad_kernel.h b/paddle/phi/kernels/mode_grad_kernel.h new file mode 100644 index 00000000000..ccde8c3648f --- /dev/null +++ b/paddle/phi/kernels/mode_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ModeGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& indices, + const DenseTensor& out_grad, + int axis, + bool keepdim, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/mode_kernel.h b/paddle/phi/kernels/mode_kernel.h new file mode 100644 index 00000000000..831c4369304 --- /dev/null +++ b/paddle/phi/kernels/mode_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ModeKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + bool keepdim, + DenseTensor* out, + DenseTensor* indices); + +} // namespace phi diff --git a/paddle/phi/ops/compat/mode_sig.cc b/paddle/phi/ops/compat/mode_sig.cc new file mode 100644 index 00000000000..20994c08aa7 --- /dev/null +++ b/paddle/phi/ops/compat/mode_sig.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature ModeOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "mode", {"X"}, {"axis", "keepdim"}, {"Out", "Indices"}); +} + +KernelSignature ModeGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("mode_grad", + {"X", "Indices", GradVarName("Out")}, + {"axis", "keepdim"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(mode, phi::ModeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(mode_grad, phi::ModeGradOpArgumentMapping); -- GitLab From 8ffcf596fe70bfa26509b84127179c118c1588f0 Mon Sep 17 00:00:00 2001 From: From00 Date: Wed, 16 Mar 2022 19:14:18 +0800 Subject: [PATCH 113/176] Fix Jetson compilation error in pooling (#40586) --- paddle/phi/kernels/funcs/pooling.cu | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu index 4cf5e1c02c5..417c1cd2347 100644 --- a/paddle/phi/kernels/funcs/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -392,7 +392,7 @@ void Pool2dDirectCUDAFunctor::operator()( int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - // paddle::platform::ChangeThreadNum(context, &thread_num); + // backends::gpu::ChangeThreadNum(context, &thread_num); thread_num = 512; #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -460,7 +460,7 @@ class Pool2dFunctor { int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -527,7 +527,7 @@ class Pool2dFunctor { int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -1293,7 +1293,7 @@ class Pool3dFunctor { output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -1369,7 +1369,7 @@ class Pool3dFunctor { output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; dim3 threads(thread_num, 1); @@ -1906,7 +1906,7 @@ class MaxPool2dWithIndexFunctor { int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; @@ -2205,7 +2205,7 @@ class MaxPool3dWithIndexFunctor { output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON - paddle::platform::ChangeThreadNum(context, &thread_num); + backends::gpu::ChangeThreadNum(context, &thread_num); #endif int blocks = (nthreads + thread_num - 1) / thread_num; -- GitLab From 
ac5cc136458c450161fe1d946aece11723844b80 Mon Sep 17 00:00:00 2001 From: From00 Date: Wed, 16 Mar 2022 19:14:42 +0800 Subject: [PATCH 114/176] Add yaml config for pool2d (#40563) * Add yaml config for pool2d * Fix CI error * Fix code format error --- .../final_state_generator/eager_gen.py | 7 +++++-- .../final_state_generator/python_c_gen.py | 2 +- python/paddle/fluid/dygraph/tracer.py | 6 ++++++ python/paddle/utils/code_gen/api.yaml | 8 ++++++++ 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index bc30f6aa03f..d2d699e154f 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -28,6 +28,7 @@ namespace = "" yaml_types_mapping = { 'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \ 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ + 'str' : 'std::string', \ 'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ 'int64[]' : 'std::vector', 'int[]' : 'std::vector', 'Tensor' : 'Tensor', @@ -212,7 +213,8 @@ def ParseYamlArgs(string): default_value = m.group(3).split("=")[1].strip() if len( m.group(3).split("=")) > 1 else None - assert arg_type in yaml_types_mapping.keys() + assert arg_type in yaml_types_mapping.keys( + ), f"The argument type {arg_type} in yaml config is not supported in yaml_types_mapping." arg_type = yaml_types_mapping[arg_type] arg_name = RemoveSpecialSymbolsInName(arg_name) @@ -247,7 +249,8 @@ def ParseYamlReturns(string): else: ret_type = ret.strip() - assert ret_type in yaml_types_mapping.keys() + assert ret_type in yaml_types_mapping.keys( + ), f"The return type {ret_type} in yaml config is not supported in yaml_types_mapping." 
ret_type = yaml_types_mapping[ret_type] assert "Tensor" in ret_type diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 9b77f0449e0..c0ed77ecdc4 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -24,7 +24,7 @@ atype_to_parsing_function = { "long": "CastPyArg2Long", "int64_t": "CastPyArg2Long", "float": "CastPyArg2Float", - "string": "CastPyArg2String", + "std::string": "CastPyArg2String", "std::vector": "CastPyArg2Booleans", "std::vector": "CastPyArg2Ints", "std::vector": "CastPyArg2Longs", diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index d0552ca41f0..d8b1883fc62 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -35,6 +35,12 @@ final_state_name_mapping = { "x": "X", "out": "Out", }, + "pool2d": { + "final_op_name": "final_state_pool2d", + "x": "X", + "kernel_size": "ksize", + "out": "Out", + }, "abs": { "final_op_name": "final_state_abs", "x": "X", diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index d24b64bf661..70dea65b769 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -141,6 +141,14 @@ output : Tensor invoke : full_like(x, 1, dtype, place) +- api : pool2d + args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) + output : Tensor(out) + infer_meta : + func : PoolInferMeta + kernel: + func : pool2d + - api : reshape args : (Tensor x, ScalarArray shape) output : Tensor(out) -- GitLab From 9fc89b34623adc7aca69e5ea2f67bcb07cae4434 Mon Sep 17 00:00:00 2001 From: huzhiqiang <912790387@qq.com> Date: Wed, 16 Mar 2022 06:20:29 -0500 Subject: [PATCH 115/176] Add model check (#40398) --- .../infrt/dialect/infrt/ir/infrt_dialect.cc | 7 + paddle/infrt/host_context/paddle_mlir.cc | 16 ++- paddle/infrt/tests/CMakeLists.txt | 2 + paddle/infrt/tests/model/abs_model.py | 38 ++++++ paddle/infrt/tests/model/test_abs.cc | 126 ++++++++++++++++++ paddle/scripts/infrt_build.sh | 2 + 6 files changed, 186 insertions(+), 5 deletions(-) create mode 100644 paddle/infrt/tests/model/abs_model.py create mode 100644 paddle/infrt/tests/model/test_abs.cc diff --git a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc index 867d854ba3c..3a1b45d3a20 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc @@ -90,6 +90,9 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { return LoDTensorType::get( parser.getContext(), shape, elementType, lod_level); } + if (keyword == "dense_tensor_map") { + return DenseTensorMapType::get(parser.getContext()); + } if (keyword == "dense_tensor") { // parse DenseTensor, for example: !i=Infrt.tensor llvm::StringRef target; @@ -158,6 +161,10 @@ void InfrtDialect::printType(::mlir::Type type, << lod_tensor_type.getLod_level() << ">"; return; } + if (type.isa()) { + os << "dense_tensor_map"; + return; + } // print DenseTensorType, for example: !infrt.dense_tensor if (type.isa()) { diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc index 
18c25827b8e..96aecb75589 100644 --- a/paddle/infrt/host_context/paddle_mlir.cc +++ b/paddle/infrt/host_context/paddle_mlir.cc @@ -13,15 +13,17 @@ // limitations under the License. #include "paddle/infrt/host_context/paddle_mlir.h" +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" #include "paddle/infrt/dialect/pd_ops_info.h" MLIRModelGenImpl::MLIRModelGenImpl() : context_(infrt::Global::getMLIRContext()), builder_(context_) { - context_->allowUnregisteredDialects(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); + context_->getOrLoadDialect<::infrt::InfrtDialect>(); module_ = mlir::ModuleOp::create(mlir::UnknownLoc::get(context_)); } @@ -55,7 +57,6 @@ mlir::ModuleOp MLIRModelGenImpl::ImportPaddleModel( UpdateModelParams(program, &mainFunc); UpdateModelOps(program); UpdateModelOutputs(program); - return module_; } @@ -171,7 +172,11 @@ void MLIRModelGenImpl::UpdateModelParams( ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), builder_, &precision_); - mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); + mlir::Type type_ = + infrt::DenseTensorType::get(context_, + infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW); auto op = builder_.create( mlir::UnknownLoc::get(context_), type_, map, name); params_map_.insert(std::pair( @@ -197,8 +202,9 @@ void MLIRModelGenImpl::UpdateModelOutputs( llvm::SmallVector resultTypes; llvm::SmallVector attrs; + mlir::OperationState state(loc, - mlir::ReturnOp::getOperationName(), + ::infrt::ReturnOp::getOperationName(), operands, resultTypes, attrs); @@ -321,7 +327,7 @@ llvm::SmallVector MLIRModelGenImpl::GetOpAttributes( switch (type) { ATTR_IMPL_CASE(FLOAT, f, getF32FloatAttr); ATTR_IMPL_CASE(BOOLEAN, b, getBoolAttr); - ATTR_IMPL_CASE(INT, i, getI32IntegerAttr); + ATTR_IMPL_CASE(INT, i, getSI32IntegerAttr); ATTR_IMPL_CASE(LONG, l, getI64IntegerAttr); ATTR_IMPL_CASE(STRING, s, getStringAttr); diff --git a/paddle/infrt/tests/CMakeLists.txt b/paddle/infrt/tests/CMakeLists.txt index e5cc1ec1121..58543a68642 100644 --- a/paddle/infrt/tests/CMakeLists.txt +++ b/paddle/infrt/tests/CMakeLists.txt @@ -1,3 +1,5 @@ +cc_test_tiny(test_abs_model SRCS model/test_abs.cc DEPS infrt ${MLIR_IR_LIBS}) + configure_file(lit.cfg.py.in "${CMAKE_SOURCE_DIR}/paddle/infrt/tests/lit.cfg.py") add_test(NAME test_infrt_by_lit COMMAND sh -c "lit -v ${CMAKE_SOURCE_DIR}/paddle/infrt/tests --filter-out \"disabled_*\"" diff --git a/paddle/infrt/tests/model/abs_model.py b/paddle/infrt/tests/model/abs_model.py new file mode 100644 index 00000000000..dd1632bc9d4 --- /dev/null +++ b/paddle/infrt/tests/model/abs_model.py @@ -0,0 +1,38 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +from paddle.nn import Layer +from paddle.static import InputSpec +from paddle.jit import to_static +import sys + + +class AbsNet(paddle.nn.Layer): + def __init__(self): + super(AbsNet, self).__init__() + + def forward(self, x): + x = paddle.abs(x) + return x + + +if __name__ == '__main__': + # build network + model = AbsNet() + # save inferencing format model + net = to_static( + model, input_spec=[InputSpec( + shape=[None, 1, 28, 28], name='x')]) + paddle.jit.save(net, sys.argv[1]) diff --git a/paddle/infrt/tests/model/test_abs.cc b/paddle/infrt/tests/model/test_abs.cc new file mode 100644 index 00000000000..5de159b86fc --- /dev/null +++ b/paddle/infrt/tests/model/test_abs.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "llvm/Support/DynamicLibrary.h" +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/host_context/core_runtime.h" +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/mlir_to_runtime_translate.h" +#include "paddle/infrt/kernel/basic_kernels.h" +#include "paddle/infrt/kernel/control_flow_kernels.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" +#include "paddle/infrt/kernel/phi/registry.h" +#include "paddle/infrt/kernel/tensor_kernels.h" +#include "paddle/infrt/kernel/tensor_shape_kernels.h" +#include "paddle/infrt/kernel/test_kernels.h" + +#include "paddle/infrt/kernel/phi/infershaped/infershaped_utils.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/meta_tensor.h" + +#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" + +#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" +#include "paddle/infrt/host_context/paddle_mlir.h" + +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/phi_kernels.h" + +static llvm::cl::list cl_shared_libs( // NOLINT + "shared_libs", + llvm::cl::desc("Specify shared library with kernels."), + llvm::cl::ZeroOrMore, + llvm::cl::MiscFlags::CommaSeparated); + +TEST(ABS_MODEL, convert_and_execute) { + std::string model_file_name = "./abs.pdmodel"; + std::string params_file_name = "./abs.pdiparams"; + // convert model + MLIRModelGenImpl myGen; + auto module_ = myGen.ImportPaddleModel(model_file_name, params_file_name); + module_.dump(); + // pick kernel + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + context->allowUnregisteredDialects(); + context->getOrLoadDialect(); + + context->getOrLoadDialect(); + context->getOrLoadDialect(); + 
context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + context->getOrLoadDialect(); + + context->loadAllAvailableDialects(); + mlir::PassManager pm(context); + + mlir::OpPassManager& phi_pass_manager = pm.nest(); + std::vector valid_places = {{infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW}}; + phi_pass_manager.addPass(infrt::createPhiOpCvtPass(valid_places)); + phi_pass_manager.addPass(infrt::createInfrtOpFusePass()); + + if (mlir::failed(pm.run(module_))) { + std::cout << "\npass failed!\n" << std::endl; + } + module_.dump(); + + // executate + infrt::host_context::KernelRegistry registry; + infrt::kernel::RegisterBasicKernels(®istry); + infrt::kernel::RegisterTestKernels(®istry); + infrt::kernel::RegisterTensorShapeKernels(®istry); + infrt::kernel::RegisterTensorKernels(®istry); + infrt::kernel::RegisterControlFlowKernels(®istry); + infrt::kernel::RegisterPhiKernels(®istry); + infrt::kernel::RegisterInferShapeLaunchers(®istry); + // load extra shared library + for (const auto& lib_path : cl_shared_libs) { + std::string err; + llvm::sys::DynamicLibrary dynLib = + llvm::sys::DynamicLibrary::getPermanentLibrary(lib_path.c_str(), &err); + if (!dynLib.isValid()) { + llvm::errs() << "Load shared library failed. Error: " << err << "\n"; + break; + } + if (auto reg_sym = dynLib.SearchForAddressOfSymbol("RegisterKernels")) { + auto reg_func = + reinterpret_cast( + reg_sym); + reg_func(®istry); + } else { + llvm::outs() << "Symbol \"RegisterKernels\" not found in \"" << lib_path + << "\". Skip.\n"; + } + } + infrt::host_context::TestMlir(module_, ®istry); +} diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index 3b2df68074a..850d4015abf 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -44,6 +44,8 @@ function update_pd_ops() { cd ${PADDLE_ROOT}/tools/infrt/ python3 generate_pd_op_dialect_from_paddle_op_maker.py python3 generate_phi_kernel_dialect.py + # generate test model + python3 paddle/infrt/tests/model/abs_model.py ${PADDLE_ROOT}/build/paddle/infrt/tests/abs } function init() { -- GitLab From bef6f2e1e24821cdf654c6f4daf389c746dc1c3f Mon Sep 17 00:00:00 2001 From: Lijunhui <1578034415@qq.com> Date: Wed, 16 Mar 2022 21:42:09 +0800 Subject: [PATCH 116/176] [KP] Fix registry and add UT for thresholded_relu & softshrink (#40524) * init commit * correct namespace --- paddle/fluid/operators/activation_op.kps | 276 ++++++++++++++---- .../unittests/xpu/test_activation_op_xpu.py | 63 ++++ 2 files changed, 281 insertions(+), 58 deletions(-) diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 22613cbe2a2..865943696c3 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -15,6 +15,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" + namespace paddle { namespace operators { @@ -1148,63 +1150,221 @@ REGISTER_OP_CUDA_KERNEL( FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) #ifdef PADDLE_WITH_XPU_KP -#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, op_name, functor, \ - grad_functor) \ - REGISTER_OP_KERNEL( \ - act_type, KP, plat::XPUPlace, \ - ops::ActivationCudaKernel>); \ - REGISTER_OP_KERNEL(act_type##_grad, KP, plat::XPUPlace, \ - ops::ActivationGradCudaKernel>); - -REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, - CudaLeakyReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, - CudaSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(exp, Exp, CudaExpFunctor, CudaExpGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor, - CudaReciprocalGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softplus, Softplus, CudaSoftplusFunctor, - CudaSoftplusGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, HardSwish, CudaHardSwishFunctor, - CudaHardSwishGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(elu, Elu, CudaELUFunctor, CudaELUGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(celu, Celu, CudaCELUFunctor, - CudaCELUGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, - CudaSqrtGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(square, Square, CudaSquareFunctor, - CudaSquareGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(silu, Silu, CudaSiluFunctor, - CudaSiluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, - CudaLogSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor, - CudaSoftShrinkGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(ceil, Ceil, CudaCeilFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(floor, Floor, CudaFloorFunctor, - CudaZeroGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(log1p, Log1p, CudaLog1pFunctor, - CudaLog1pGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(brelu, BRelu, CudaBReluFunctor, - CudaBReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor, - CudaSoftReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(softsign, Softsign, CudaSoftsignFunctor, - CudaSoftsignGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(relu6, Relu6, CudaRelu6Functor, - CudaRelu6GradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor, - CudaHardShrinkGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(hard_sigmoid, HardSigmoid, - CudaHardSigmoidFunctor, - CudaHardSigmoidGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(swish, Swish, CudaSwishFunctor, - CudaSwishGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(thresholded_relu, ThresholdedRelu, - CudaThresholdedReluFunctor, - CudaThresholdedReluGradFunctor); +REGISTER_OP_KERNEL( + brelu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + brelu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(ceil, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + ceil_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(celu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + celu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + 
+REGISTER_OP_KERNEL(elu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + elu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(exp, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + exp_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(floor, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + floor_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + hard_shrink, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_shrink_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + hard_sigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_sigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(hard_swish, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + hard_swish_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + leaky_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + leaky_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(log, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + log_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(log1p, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + log1p_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + logsigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + logsigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + reciprocal, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + reciprocal_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(relu6, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + relu6_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(sigmoid, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + sigmoid_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(silu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + silu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(soft_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + soft_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(softplus, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softplus_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + softshrink, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softshrink_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(softsign, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + softsign_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(sqrt, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + sqrt_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(square, KP, plat::XPUPlace, + 
ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + square_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL(swish, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + swish_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); + +REGISTER_OP_KERNEL( + thresholded_relu, KP, plat::XPUPlace, + ops::ActivationCudaKernel>); +REGISTER_OP_KERNEL( + thresholded_relu_grad, KP, plat::XPUPlace, + ops::ActivationGradCudaKernel>); #endif // PADDLE_WITH_XPU_KP diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py index 69bca8dd9ef..66f2e871dac 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py @@ -849,6 +849,38 @@ def ref_softsign(x): return out +class XPUTestSoftshrinkOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'softshrink' + self.use_dynamic_create_class = False + + class XPUTestSoftshrink(TestActivationOPBase): + def set_case(self): + self.op_type = "softshrink" + self.dtype = self.in_type + + threshold = 0.5 + np.random.seed(1023) + x = np.random.uniform(0.25, 10, [10, 12]).astype(self.dtype) + out = ref_softshrink(x, threshold) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('softshrink') +for stype in support_types: + create_test_class(globals(), XPUTestSoftshrinkOP, stype) + + +def ref_softshrink(x, threshold=0.5): + out = np.copy(x) + out = (out < -threshold) * (out + threshold) + (out > threshold) * ( + out - threshold) + return out + + class XPUTestSwishOP(XPUOpTestWrapper): def __init__(self): self.op_name = 'swish' @@ -879,5 +911,36 @@ def ref_swish(x): return out +class XPUTestThresholdedReluOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'thresholded_relu' + self.use_dynamic_create_class = False + + class XPUTestThresholdedRelu(TestActivationOPBase): + def set_case(self): + self.op_type = "thresholded_relu" + self.dtype = self.in_type + + threshold = 1.0 + np.random.seed(1024) + x = np.random.uniform(-20, 20, [10, 12]).astype(self.dtype) + x[np.abs(x) < 0.005] = 0.02 + out = ref_thresholded_relu(x, threshold) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True} + + +support_types = get_xpu_op_support_types('thresholded_relu') +for stype in support_types: + create_test_class(globals(), XPUTestThresholdedReluOP, stype) + + +def ref_thresholded_relu(x, threshold=1.0): + out = (x > threshold) * x + return out + + if __name__ == "__main__": unittest.main() -- GitLab From a09a93a177b29bee75c1eaa99f96500d3d2087f2 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 16 Mar 2022 21:51:56 +0800 Subject: [PATCH 117/176] move determinant op infershape (#40624) --- paddle/fluid/operators/determinant_op.cc | 32 +++++++++--------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc index 68083c75985..6959b5cf811 100644 --- a/paddle/fluid/operators/determinant_op.cc +++ b/paddle/fluid/operators/determinant_op.cc @@ -13,6 +13,10 @@ // limitations under the License. 
#include "paddle/fluid/operators/determinant_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,11 +24,6 @@ namespace operators { class DeterminantOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "determinant"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "determinant"); - } }; class DeterminantOpMaker : public framework::OpProtoAndCheckerMaker { @@ -44,19 +43,6 @@ class DeterminantGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", - "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", - framework::GradVarName("Out"), "DeterminantGradOp"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), "Output", - framework::GradVarName("Input"), "DeterminantGradOp"); - - ctx->SetOutputDim(framework::GradVarName("Input"), - ctx->GetInputDim("Input")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -162,11 +148,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SlogDeterminantGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(determinant, DeterminantInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(determinant, ops::DeterminantOp, ops::DeterminantOpMaker, ops::DeterminantGradOpMaker, - ops::DeterminantGradOpMaker); + ops::DeterminantGradOpMaker, + DeterminantInferShapeFunctor); -REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp) +DECLARE_INFER_SHAPE_FUNCTOR(determinant_grad, DeterminantGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); +REGISTER_OPERATOR(determinant_grad, ops::DeterminantGradOp, + DeterminantGradInferShapeFunctor); REGISTER_OPERATOR(slogdeterminant, ops::SlogDeterminantOp, ops::SlogDeterminantOpMaker, -- GitLab From 2dec25dba21da5eb79bb364cb5eb58ece561a433 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Wed, 16 Mar 2022 22:10:13 +0800 Subject: [PATCH 118/176] Optimize the computation of log_softmax (#40612) * Optimize the computation of log_softmax * modify the var name --- paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 43 ++++++++++------------ 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 2b2dd511896..77159bfc876 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -121,17 +121,10 @@ struct ReduceMaxFunctor { }; template -struct ExpSubFunctor { - HOSTDEVICE inline ExpSubFunctor() { y = static_cast(0.0f); } - - HOSTDEVICE explicit inline ExpSubFunctor(Tx y) : y((Tx)(y)) {} - +struct ExpFunctor { HOSTDEVICE inline Ty operator()(const Tx& x) const { - return static_cast(std::exp(x - y)); + return static_cast(std::exp(x)); } - - private: - Tx y; }; template @@ -293,10 +286,14 @@ 
__global__ void WarpSoftmaxForward(T* softmax, } // data src - AccT srcdata[kBatchSize][kLoopsV][kVSize]; - T src_tmp[kBatchSize][kLoopsV][kVSize]; - kps::Init(&srcdata[0][0][0], kLowInf); - kps::Init(&src_tmp[0][0][0], -std::numeric_limits::infinity()); + // src_data: the raw data form global memory + // sub_data: store the data obtained by (src_data - max), used by log_softmax + // exp_data: store the data obtained by (exp(sub_data)), used by softmax + T src_data[kBatchSize][kLoopsV][kVSize]; + AccT sub_data[kBatchSize][kLoopsV][kVSize]; + AccT exp_data[kBatchSize][kLoopsV][kVSize]; + kps::Init(&sub_data[0][0][0], kLowInf); + kps::Init(&src_data[0][0][0], -std::numeric_limits::infinity()); // data dst T out_tmp[kBatchSize][kLoopsV][kVSize]; @@ -313,11 +310,11 @@ __global__ void WarpSoftmaxForward(T* softmax, for (int i = 0; i < kBatchSize; ++i) { const VecT* src_v = reinterpret_cast(&src[(first_batch + i) * stride]); - VecT* reg_v = reinterpret_cast(&src_tmp[i][0][0]); + VecT* reg_v = reinterpret_cast(&src_data[i][0][0]); kps::ReadData( ®_v[0], &src_v[0], idx_max_v[i], 0, kWarpSize, 1); kps::ElementwiseUnary>( - &srcdata[i][0][0], &src_tmp[i][0][0], DataTransFunctor()); + &sub_data[i][0][0], &src_data[i][0][0], DataTransFunctor()); } // compute max @@ -327,14 +324,16 @@ __global__ void WarpSoftmaxForward(T* softmax, 1, ReduceMaxFunctor, kMode::kLocalMode>( - &max[0], &srcdata[0][0][0], ReduceMaxFunctor(), true); + &max[0], &sub_data[0][0][0], ReduceMaxFunctor(), true); WarpReduceMax(max); // compute sum #pragma unroll for (int i = 0; i < kBatchSize; ++i) { - kps::ElementwiseUnary>( - &srcdata[i][0][0], &srcdata[i][0][0], ExpSubFunctor(max[i])); + kps::ElementwiseUnary>( + &sub_data[i][0][0], &sub_data[i][0][0], UnarySubFunctor(max[i])); + kps::ElementwiseUnary>( + &exp_data[i][0][0], &sub_data[i][0][0], ExpFunctor()); } kps::Reduce, kMode::kLocalMode>( - &sum[0], &srcdata[0][0][0], kps::AddFunctor(), true); + &sum[0], &exp_data[0][0][0], kps::AddFunctor(), true); WarpReduceSum(sum); // write data to global memory @@ -352,15 +351,13 @@ __global__ void WarpSoftmaxForward(T* softmax, reinterpret_cast(&softmax[(first_batch + i) * stride]); VecT* reg_v = reinterpret_cast(&out_tmp[i][0][0]); if (LogMode) { - kps::ElementwiseUnary>( - &srcdata[i][0][0], &srcdata[i][0][0], UnaryLogFunctor()); kps::ElementwiseUnary>( &out_tmp[i][0][0], - &srcdata[i][0][0], + &sub_data[i][0][0], UnarySubFunctor(std::log(sum[i]))); } else { kps::ElementwiseUnary>( - &out_tmp[i][0][0], &srcdata[i][0][0], UnaryDivFunctor(sum[i])); + &out_tmp[i][0][0], &exp_data[i][0][0], UnaryDivFunctor(sum[i])); } kps::WriteData( &softmax_v[0], ®_v[0], idx_max_v[i], 0, kWarpSize, 1); -- GitLab From 6849d33b62cacccb27797375a212e37a47ca9484 Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Wed, 16 Mar 2022 23:05:54 +0800 Subject: [PATCH 119/176] [Ops] segment pool op support for int int64 kernel. (#40577) * segment pool support for int int64 kernel. 
* add support in python api --- .../kernels/cpu/segment_pool_grad_kernel.cc | 4 +++- paddle/phi/kernels/cpu/segment_pool_kernel.cc | 10 ++++++++-- paddle/phi/kernels/funcs/segment_pooling.cc | 9 +++++++++ paddle/phi/kernels/funcs/segment_pooling.cu | 9 +++++++++ .../kernels/gpu/segment_pool_grad_kernel.cu | 4 +++- paddle/phi/kernels/gpu/segment_pool_kernel.cu | 10 ++++++++-- python/paddle/incubate/tensor/math.py | 20 +++++++++++-------- 7 files changed, 52 insertions(+), 14 deletions(-) diff --git a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc index 585c27bdcec..a5c9dc4c55e 100644 --- a/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/segment_pool_grad_kernel.cc @@ -23,4 +23,6 @@ PD_REGISTER_KERNEL(segment_pool_grad, ALL_LAYOUT, phi::SegmentPoolGradKernel, float, - double) {} + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/segment_pool_kernel.cc b/paddle/phi/kernels/cpu/segment_pool_kernel.cc index d0413457f81..ad76a7a86bc 100644 --- a/paddle/phi/kernels/cpu/segment_pool_kernel.cc +++ b/paddle/phi/kernels/cpu/segment_pool_kernel.cc @@ -18,5 +18,11 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -PD_REGISTER_KERNEL( - segment_pool, CPU, ALL_LAYOUT, phi::SegmentPoolKernel, float, double) {} +PD_REGISTER_KERNEL(segment_pool, + CPU, + ALL_LAYOUT, + phi::SegmentPoolKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/funcs/segment_pooling.cc b/paddle/phi/kernels/funcs/segment_pooling.cc index bf4a21f3722..fbd744430aa 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.cc +++ b/paddle/phi/kernels/funcs/segment_pooling.cc @@ -149,10 +149,19 @@ template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; + template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/segment_pooling.cu b/paddle/phi/kernels/funcs/segment_pooling.cu index 305cd39f077..95606b15267 100644 --- a/paddle/phi/kernels/funcs/segment_pooling.cu +++ b/paddle/phi/kernels/funcs/segment_pooling.cu @@ -453,10 +453,19 @@ template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; + template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu index d9618dc159a..9d1769e18b4 100644 --- a/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu +++ 
b/paddle/phi/kernels/gpu/segment_pool_grad_kernel.cu @@ -24,4 +24,6 @@ PD_REGISTER_KERNEL(segment_pool_grad, ALL_LAYOUT, phi::SegmentPoolGradKernel, float, - double) {} + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/segment_pool_kernel.cu b/paddle/phi/kernels/gpu/segment_pool_kernel.cu index c38e935adf8..3128e534166 100644 --- a/paddle/phi/kernels/gpu/segment_pool_kernel.cu +++ b/paddle/phi/kernels/gpu/segment_pool_kernel.cu @@ -19,5 +19,11 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" -PD_REGISTER_KERNEL( - segment_pool, GPU, ALL_LAYOUT, phi::SegmentPoolKernel, float, double) {} +PD_REGISTER_KERNEL(segment_pool, + GPU, + ALL_LAYOUT, + phi::SegmentPoolKernel, + float, + double, + int, + int64_t) {} diff --git a/python/paddle/incubate/tensor/math.py b/python/paddle/incubate/tensor/math.py index 9f577d5ff38..2d0b079ee92 100644 --- a/python/paddle/incubate/tensor/math.py +++ b/python/paddle/incubate/tensor/math.py @@ -29,7 +29,7 @@ def segment_sum(data, segment_ids, name=None): where sum is over j such that `segment_ids[j] == i`. Args: - data (Tensor): A tensor, available data type float32, float64. + data (Tensor): A tensor, available data type float32, float64, int32, int64. segment_ids (Tensor): A 1-D tensor, which have the same size with the first dimension of input data. Available data type is int32, int64. @@ -54,7 +54,8 @@ def segment_sum(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "SUM") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") @@ -82,7 +83,7 @@ def segment_mean(data, segment_ids, name=None): of all index 'segment_ids[j] == i'. Args: - data (tensor): a tensor, available data type float32, float64. + data (tensor): a tensor, available data type float32, float64, int32, int64. segment_ids (tensor): a 1-d tensor, which have the same size with the first dimension of input data. available data type is int32, int64. @@ -107,7 +108,8 @@ def segment_mean(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MEAN") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") @@ -134,7 +136,7 @@ def segment_min(data, segment_ids, name=None): where min is over j such that `segment_ids[j] == i`. Args: - data (tensor): a tensor, available data type float32, float64. + data (tensor): a tensor, available data type float32, float64, int32, int64. segment_ids (tensor): a 1-d tensor, which have the same size with the first dimension of input data. available data type is int32, int64. @@ -159,7 +161,8 @@ def segment_min(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MIN") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") @@ -186,7 +189,7 @@ def segment_max(data, segment_ids, name=None): where max is over j such that `segment_ids[j] == i`. 
Args: - data (tensor): a tensor, available data type float32, float64. + data (tensor): a tensor, available data type float32, float64, int32, int64. segment_ids (tensor): a 1-d tensor, which have the same size with the first dimension of input data. available data type is int32, int64. @@ -211,7 +214,8 @@ def segment_max(data, segment_ids, name=None): out, tmp = _C_ops.segment_pool(data, segment_ids, 'pooltype', "MAX") return out - check_variable_and_dtype(data, "X", ("float32", "float64"), "segment_pool") + check_variable_and_dtype(data, "X", ("float32", "float64", "int32", + "int64"), "segment_pool") check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool") -- GitLab From 3a2566370eee2075f3b430082cd9bd7c725d07cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Thu, 17 Mar 2022 06:34:46 +0800 Subject: [PATCH 120/176] [infrt] move pd dialect position. test=develop (#40616) --- paddle/infrt/dialect/CMakeLists.txt | 2 +- .../dialect/infrt/pass/infrt_op_fuse_pass.cc | 2 +- paddle/infrt/dialect/init_dialects.cc | 2 +- paddle/infrt/dialect/pd/CMakeLists.txt | 3 ++ paddle/infrt/dialect/pd/common/CMakeLists.txt | 4 ++ paddle/infrt/dialect/pd/ir/CMakeLists.txt | 5 +++ paddle/infrt/dialect/{ => pd/ir}/pd_ops.cc | 2 +- paddle/infrt/dialect/{ => pd/ir}/pd_ops.h | 0 paddle/infrt/dialect/pd/pass/CMakeLists.txt | 5 +++ .../infrt/dialect/pd/pass/pd_op_fuse_pass.cc | 43 +++++++++++++++++++ .../dialect/phi/pass/proto_arg_map_context.h | 2 +- .../dialect/tensorrt/trt_graph_fuse_pass.cc | 3 +- .../dialect/tensorrt/trt_graph_split_pass.cc | 2 +- .../dialect/tensorrt/trt_op_converter_pass.cc | 2 +- .../dialect/tensorrt/trt_op_teller_pass.cc | 2 +- paddle/infrt/dialect/tensorrt/trt_ops.h | 2 +- paddle/infrt/host_context/paddle_mlir.cc | 2 +- paddle/infrt/host_context/paddle_mlir.h | 2 +- ...rate_pd_op_dialect_from_paddle_op_maker.py | 2 +- 19 files changed, 74 insertions(+), 13 deletions(-) create mode 100644 paddle/infrt/dialect/pd/CMakeLists.txt create mode 100644 paddle/infrt/dialect/pd/common/CMakeLists.txt create mode 100644 paddle/infrt/dialect/pd/ir/CMakeLists.txt rename paddle/infrt/dialect/{ => pd/ir}/pd_ops.cc (99%) rename paddle/infrt/dialect/{ => pd/ir}/pd_ops.h (100%) create mode 100644 paddle/infrt/dialect/pd/pass/CMakeLists.txt create mode 100644 paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index a3f2d0afafc..353a9c67952 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -7,7 +7,6 @@ gather_srcs(infrt_src SRCS dense_tensor.cc mlir_loader.cc diagnostic_utils.cc - pd_ops.cc ) mlir_tablegen_on(tensor_shape DIALECT ts) @@ -28,6 +27,7 @@ add_dependencies(print-ir pd_ops_inc) cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_IR_LIBS}) add_subdirectory(infrt) +add_subdirectory(pd) add_subdirectory(tensorrt) if (INFRT_WITH_PHI) diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc index 9d8ce5d8dfe..eec0e0bc7c5 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc @@ -16,7 +16,7 @@ #include #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace { #include 
"paddle/infrt/dialect/infrt/pass/infrt_op_fuse.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/init_dialects.cc b/paddle/infrt/dialect/init_dialects.cc index 0c5944ebf84..55f6de62523 100644 --- a/paddle/infrt/dialect/init_dialects.cc +++ b/paddle/infrt/dialect/init_dialects.cc @@ -20,7 +20,7 @@ #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/phi/ir/phi_kernels.h" diff --git a/paddle/infrt/dialect/pd/CMakeLists.txt b/paddle/infrt/dialect/pd/CMakeLists.txt new file mode 100644 index 00000000000..5f65336453f --- /dev/null +++ b/paddle/infrt/dialect/pd/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory(common) +add_subdirectory(ir) +add_subdirectory(pass) diff --git a/paddle/infrt/dialect/pd/common/CMakeLists.txt b/paddle/infrt/dialect/pd/common/CMakeLists.txt new file mode 100644 index 00000000000..ee1b0d4c30d --- /dev/null +++ b/paddle/infrt/dialect/pd/common/CMakeLists.txt @@ -0,0 +1,4 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + ) diff --git a/paddle/infrt/dialect/pd/ir/CMakeLists.txt b/paddle/infrt/dialect/pd/ir/CMakeLists.txt new file mode 100644 index 00000000000..0787a612d48 --- /dev/null +++ b/paddle/infrt/dialect/pd/ir/CMakeLists.txt @@ -0,0 +1,5 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + pd_ops.cc + ) diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd/ir/pd_ops.cc similarity index 99% rename from paddle/infrt/dialect/pd_ops.cc rename to paddle/infrt/dialect/pd/ir/pd_ops.cc index 96e9e307f2f..5abf7d1a1b9 100644 --- a/paddle/infrt/dialect/pd_ops.cc +++ b/paddle/infrt/dialect/pd/ir/pd_ops.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include #include diff --git a/paddle/infrt/dialect/pd_ops.h b/paddle/infrt/dialect/pd/ir/pd_ops.h similarity index 100% rename from paddle/infrt/dialect/pd_ops.h rename to paddle/infrt/dialect/pd/ir/pd_ops.h diff --git a/paddle/infrt/dialect/pd/pass/CMakeLists.txt b/paddle/infrt/dialect/pd/pass/CMakeLists.txt new file mode 100644 index 00000000000..59640e7e625 --- /dev/null +++ b/paddle/infrt/dialect/pd/pass/CMakeLists.txt @@ -0,0 +1,5 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + pd_op_fuse_pass.cc + ) diff --git a/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc new file mode 100644 index 00000000000..620c8594234 --- /dev/null +++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" +namespace { +#include "paddle/infrt/dialect/rewrite.cpp.inc" // NOLINT + +/* + * PdOpFusePass. + */ +struct PdOpFusePass + : public mlir::PassWrapper { + public: + ::llvm::StringRef getName() const override { return "PdOpFusePass"; } + + llvm::StringRef getArgument() const override { return "pd-op-fuse"; } + + void runOnFunction() override; +}; + +// Implementation of the PdOpFusePass. +void PdOpFusePass::runOnFunction() { + ::mlir::RewritePatternSet patterns(&getContext()); + populateWithGenerated(patterns); + (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); +} + +} // namespace + +mlir::PassRegistration infrt_op_fuse_pass; diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h index e4e9b5c3ff8..7d08c32161b 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include -#include "paddle/infrt/dialect/pd_ops_info.h" +#include "paddle/infrt/dialect/pd/common/pd_ops_info.h" #include "paddle/phi/core/compat/arg_map_context.h" namespace infrt { diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index ad6b136463a..e22a2309cbe 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -17,11 +17,12 @@ #include #include #include -#include #include #include #include +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" + namespace infrt { namespace trt { namespace { diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index e3a7b455024..f81179e548f 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -15,7 +15,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" #include -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace infrt { namespace trt { diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc index 83bebdb6bf1..1e6a3e13805 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc @@ -14,7 +14,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h" #include #include -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" namespace infrt { diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 9f348b4122f..2c6f08277c8 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -17,7 +17,7 @@ #include #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace infrt { namespace trt { diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h index 78d960b5120..76768037dbd 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.h +++ b/paddle/infrt/dialect/tensorrt/trt_ops.h @@ -30,7 +30,7 @@ #include 
#include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" namespace infrt { namespace trt { diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc index 96aecb75589..48999a23ef3 100644 --- a/paddle/infrt/host_context/paddle_mlir.cc +++ b/paddle/infrt/host_context/paddle_mlir.cc @@ -15,7 +15,7 @@ #include "paddle/infrt/host_context/paddle_mlir.h" #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" -#include "paddle/infrt/dialect/pd_ops_info.h" +#include "paddle/infrt/dialect/pd/common/pd_ops_info.h" MLIRModelGenImpl::MLIRModelGenImpl() : context_(infrt::Global::getMLIRContext()), builder_(context_) { diff --git a/paddle/infrt/host_context/paddle_mlir.h b/paddle/infrt/host_context/paddle_mlir.h index e825cbb5a11..1fb3f7b7349 100644 --- a/paddle/infrt/host_context/paddle_mlir.h +++ b/paddle/infrt/host_context/paddle_mlir.h @@ -29,7 +29,7 @@ #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" #include "paddle/infrt/dialect/init_dialects.h" -#include "paddle/infrt/dialect/pd_ops.h" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/tensor_shape.h" #include "paddle/infrt/paddle/model_parser.h" diff --git a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py index 027dfe4328a..528d61daf3b 100644 --- a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py +++ b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py @@ -186,7 +186,7 @@ def generate_all_ops_inputs_outputs_map(op_descs): cpp_style_ops_outputs_map_str = start_ + ops_outputs_str + "\n};" # 3. Write to header file - dst_head_file = "../../paddle/infrt/dialect/pd_ops_info.h" + dst_head_file = "../../paddle/infrt/dialect/pd/common/pd_ops_info.h" with open(dst_head_file, 'w') as ops_inputs_outputs_head_file: ops_inputs_outputs_head_file.write(cpp_style_ops_inputs_map_str) ops_inputs_outputs_head_file.write("\n\n") -- GitLab From 46abe798d8ca7edc72f76f117878b5b7edc7b6d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Thu, 17 Mar 2022 06:35:16 +0800 Subject: [PATCH 121/176] [infrt] add default kernel argument remap feature in phi_op_convert_pass. 
(#40633) --- .../dialect/phi/pass/phi_op_convert_pass.cc | 72 ++++++++++--------- 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc index f9e124aba6c..13cba6eeabb 100644 --- a/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc +++ b/paddle/infrt/dialect/phi/pass/phi_op_convert_pass.cc @@ -32,6 +32,7 @@ #include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" #include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" #include "paddle/phi/core/compat/op_utils.h" +#include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/ops/compat/signatures.h" namespace { @@ -94,42 +95,49 @@ void PhiOpConvertPass::convertStage() { // Todo: print log continue; } - - ::phi::KernelSignature kernel_sign = - ::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( - infrt::ProtoArgumentMappingContext(op)); - // resort input&output according to kernel_sign - ::llvm::SmallVector inputs, ori_output; - ::llvm::SmallVector output_types; - for (const std::string &str : std::get<0>(kernel_sign.args)) { - if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) { - LOG(ERROR) << "No input info for Op " << op_name << " and argument " - << str; - return; + auto loc = getFunction().getLoc(); + builder.setInsertionPoint(op); + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_name)) { + std::string kernel_name = phi::TransToPhiKernelName(op_name); + auto kernel_op = builder.create(loc, + op->getResultTypes(), + op->getOperands(), + kernel_name, + op->getAttrDictionary()); + op->replaceAllUsesWith(kernel_op.getResults()); + } else { + ::phi::KernelSignature kernel_sign = + ::phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( + infrt::ProtoArgumentMappingContext(op)); + // resort input&output according to kernel_sign + ::llvm::SmallVector inputs, ori_output; + ::llvm::SmallVector output_types; + for (const std::string &str : std::get<0>(kernel_sign.args)) { + if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) { + LOG(ERROR) << "No input info for Op " << op_name << " and argument " + << str; + return; + } + uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str); + inputs.push_back(op->getOperands()[index]); } - uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str); - inputs.push_back(op->getOperands()[index]); - } - for (const std::string &str : std::get<2>(kernel_sign.args)) { - if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) { - LOG(ERROR) << "No output info for Op " << op_name << " and argument " - << str; - return; + for (const std::string &str : std::get<2>(kernel_sign.args)) { + if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) { + LOG(ERROR) << "No output info for Op " << op_name << " and argument " + << str; + return; + } + uint8_t index = pd_dialect_outputs_info_map_.at(op_name).at(str); + output_types.push_back(op->getResultTypes()[index]); + ori_output.push_back(op->getResult(index)); + } + auto kernel_op = builder.create( + loc, output_types, inputs, kernel_sign.name, op->getAttrDictionary()); + for (size_t index = 0; index < ori_output.size(); ++index) { + ori_output[index].replaceAllUsesWith(kernel_op.getResult(index)); } - uint8_t index = pd_dialect_outputs_info_map_.at(op_name).at(str); - output_types.push_back(op->getResultTypes()[index]); - ori_output.push_back(op->getResult(index)); - } - - auto loc = getFunction().getLoc(); - builder.setInsertionPoint(op); - auto kernel_op = builder.create( - 
loc, output_types, inputs, kernel_sign.name, op->getAttrDictionary()); - for (size_t index = 0; index < ori_output.size(); ++index) { - ori_output[index].replaceAllUsesWith(kernel_op.getResult(index)); } - CHECK(op->use_empty()); op->erase(); } -- GitLab From 3082ed460670be3d9b02fc025690816ee37e28d6 Mon Sep 17 00:00:00 2001 From: Wilber Date: Thu, 17 Mar 2022 10:05:47 +0800 Subject: [PATCH 122/176] Trt engine. (#40532) * infrt add trt engine * fix register * file generate * fix ci error * fix conflict * add copyright * update * update * update * update engine name * refactor trt code * update * update * update * update * fix conflict * update * fix compile with cuda --- paddle/infrt/CMakeLists.txt | 15 +- paddle/infrt/backends/host/phi_allocator.h | 21 +++ paddle/infrt/backends/host/phi_context.h | 12 ++ .../backends/tensorrt/test_trt_engine.cc | 35 ++-- paddle/infrt/backends/tensorrt/trt_engine.cc | 21 ++- paddle/infrt/backends/tensorrt/trt_engine.h | 11 +- paddle/infrt/dialect/dense_tensor.td | 28 ++- paddle/infrt/dialect/infrt/ir/infrt_base.td | 7 + .../infrt/dialect/infrt/ir/infrt_dialect.cc | 7 + paddle/infrt/dialect/init_dialects.cc | 4 +- .../infrt/dialect/phi/ir/infrt_phi_tensor.td | 8 +- paddle/infrt/dialect/tensorrt/trt_ops.cc | 4 + paddle/infrt/dialect/tensorrt/trt_ops.td | 41 +++-- paddle/infrt/host_context/mlir_exec.cc | 8 +- .../host_context/mlir_to_runtime_translate.cc | 134 +++++++++----- paddle/infrt/host_context/value.h | 30 ++- paddle/infrt/kernel/CMakeLists.txt | 1 + paddle/infrt/kernel/phi/context_kernels.cc | 10 + paddle/infrt/kernel/phi/context_kernels.h | 4 + .../infrt/kernel/phi/dense_tensor_kernels.cc | 87 +++++++-- .../infrt/kernel/phi/dense_tensor_kernels.h | 7 + paddle/infrt/kernel/phi/registry.cc | 11 +- paddle/infrt/kernel/tensor_kernels.cc | 26 +++ paddle/infrt/kernel/tensorrt/CMakeLists.txt | 10 + paddle/infrt/kernel/tensorrt/registry.cc | 33 ++++ paddle/infrt/kernel/tensorrt/registry.h | 35 ++++ paddle/infrt/kernel/tensorrt/trt_kernels.cc | 172 ++++++++++++++++++ paddle/infrt/kernel/tensorrt/trt_kernels.h | 49 +++++ paddle/infrt/tests/dialect/disabled_trt.mlir | 37 ++++ .../infrt/tests/dialect/phi/dense_tensor.mlir | 2 +- paddle/infrt/tests/dialect/phi/phi_test.mlir | 2 +- paddle/infrt/tests/dialect/trt_ops.mlir | 20 +- paddle/phi/backends/gpu/gpu_context.cc | 4 + paddle/phi/backends/gpu/gpu_context.h | 2 + 34 files changed, 780 insertions(+), 118 deletions(-) create mode 100644 paddle/infrt/kernel/tensorrt/CMakeLists.txt create mode 100644 paddle/infrt/kernel/tensorrt/registry.cc create mode 100644 paddle/infrt/kernel/tensorrt/registry.h create mode 100644 paddle/infrt/kernel/tensorrt/trt_kernels.cc create mode 100644 paddle/infrt/kernel/tensorrt/trt_kernels.h create mode 100644 paddle/infrt/tests/dialect/disabled_trt.mlir diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 4e273f6d551..f394b754a8e 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -3,12 +3,22 @@ if (NOT WITH_INFRT) endif() option(INFRT_WITH_PHI "Compile INFRT with PHI" ON) +option(INFRT_WITH_GPU "Compile INFRT with GPU" OFF) +option(INFRT_WITH_TRT "Compile INFRT with TensorRT" OFF) #TODO(xiaowei) remove fluid include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) if (INFRT_WITH_PHI) - add_definitions("-DINFRT_WITH_PHI") + add_definitions("-DINFRT_WITH_PHI") + + # TODO(wilber): Now Infrt gpu/trt depends on phi's components, Modify compile dependency options later. 
+ if (INFRT_WITH_GPU) + add_definitions("-DINFRT_WITH_GPU") + if (INFRT_WITH_TRT) + add_definitions("-DINFRT_WITH_TRT") + endif() + endif() endif() # compile flags @@ -106,6 +116,9 @@ if (INFRT_WITH_PHI) endif() cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive) +if (INFRT_WITH_TRT) + target_link_libraries(infrt infrt_trt) +endif() cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto) add_dependencies(infrt ${infrt_mlir_incs} mlir-headers) diff --git a/paddle/infrt/backends/host/phi_allocator.h b/paddle/infrt/backends/host/phi_allocator.h index c8f97e04a1b..6e3bef92991 100644 --- a/paddle/infrt/backends/host/phi_allocator.h +++ b/paddle/infrt/backends/host/phi_allocator.h @@ -13,6 +13,10 @@ limitations under the License. */ #include "paddle/phi/core/allocator.h" +#ifdef INFRT_WITH_GPU +#include +#endif + namespace infrt { namespace backends { @@ -29,5 +33,22 @@ class CpuPhiAllocator : public phi::Allocator { } }; +#ifdef INFRT_WITH_GPU +// TODO(wilber): Just for demo test. we need a more efficient gpu allocator. +class GpuPhiAllocator : public phi::Allocator { + public: + static void deleter(phi::Allocation* ptr) { cudaFree(ptr->ptr()); } + + AllocationPtr Allocate(size_t bytes_size) { + void* ptr; + cudaMalloc(&ptr, bytes_size); + return AllocationPtr( + new phi::Allocation( + ptr, bytes_size, phi::Place(phi::AllocationType::GPU)), + deleter); + } +}; +#endif + } // namespace backends } // namespace infrt diff --git a/paddle/infrt/backends/host/phi_context.h b/paddle/infrt/backends/host/phi_context.h index 5713fdbbaf8..bcd63dbb39f 100644 --- a/paddle/infrt/backends/host/phi_context.h +++ b/paddle/infrt/backends/host/phi_context.h @@ -13,6 +13,7 @@ limitations under the License. 
*/ #include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace infrt { namespace backends { @@ -31,5 +32,16 @@ class CpuPhiContext : public phi::CPUContext { std::unique_ptr alloc_{std::make_unique()}; }; +class GpuPhiContext : public phi::GPUContext { + public: + using Base = phi::GPUContext; + using phi::GPUContext::SetStream; + using phi::GPUContext::SetEigenDevice; + using phi::GPUContext::SetBlasHandle; + using phi::GPUContext::SetDnnHandle; + using phi::GPUContext::SetSolverHandle; + using phi::GPUContext::SetSparseHandle; +}; + } // namespace backends } // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/test_trt_engine.cc b/paddle/infrt/backends/tensorrt/test_trt_engine.cc index 12cf14060e2..0ab64dd51c8 100644 --- a/paddle/infrt/backends/tensorrt/test_trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/test_trt_engine.cc @@ -37,9 +37,9 @@ namespace infrt { namespace backends { namespace tensorrt { -const char* model_input = "model_input"; -const char* model_output = "model_output1"; -const char* model_output2 = "model_output2"; +const char* model_input = "input_0"; +const char* model_output = "output_0"; +const char* model_output2 = "output_1"; TrtUniquePtr ConstructNetwork( nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) { @@ -122,27 +122,26 @@ TEST(trt, run_static) { std::unordered_map inputs; inputs.emplace(std::make_pair(model_input, &input)); - phi::DenseTensor output, output2; - std::unordered_map outputs; - outputs.emplace(std::make_pair(model_output, &output)); - outputs.emplace(std::make_pair(model_output2, &output2)); - - static_trt_engine.SetUpInference(inference_options, inputs, &outputs); + static_trt_engine.PrepareOutputHandle("output_0"); + static_trt_engine.PrepareOutputHandle("output_1"); + static_trt_engine.SetUpInference(inference_options, inputs); static_trt_engine.GetEngineInfo(); static_trt_engine.Run(context); + phi::DenseTensor* output0 = static_trt_engine.GetOutput("output_0"); + phi::DenseTensor* output1 = static_trt_engine.GetOutput("output_1"); std::vector output_data1(inference_options.batch * 1 * 28 * 28, 0); std::vector output_data2(inference_options.batch * 2 * 28 * 28, 0); paddle::memory::Copy(phi::CPUPlace(), output_data1.data(), place, - output.data(), + output0->data(), sizeof(float) * output_data1.size(), context.stream()); paddle::memory::Copy(phi::CPUPlace(), output_data2.data(), place, - output2.data(), + output1->data(), sizeof(float) * output_data2.size(), context.stream()); cudaStreamSynchronize(context.stream()); @@ -208,27 +207,27 @@ TEST(trt, run_dynamic) { context.stream()); std::unordered_map inputs; - std::unordered_map outputs; inputs.emplace(std::make_pair(model_input, &input)); - outputs.emplace(std::make_pair(model_output, &output)); - outputs.emplace(std::make_pair(model_output2, &output2)); - - engine.SetUpInference(inference_options, inputs, &outputs); + engine.PrepareOutputHandle("output_0"); + engine.PrepareOutputHandle("output_1"); + engine.SetUpInference(inference_options, inputs); engine.GetEngineInfo(); engine.Run(context); + phi::DenseTensor* output0 = engine.GetOutput("output_0"); + phi::DenseTensor* output1 = engine.GetOutput("output_1"); std::vector output_data1(inference_options.batch * 1 * 16 * 16, 0); std::vector output_data2(inference_options.batch * 2 * 16 * 16, 0); paddle::memory::Copy(phi::CPUPlace(), output_data1.data(), place, - output.data(), + output0->data(), 
sizeof(float) * output_data1.size(), context.stream()); paddle::memory::Copy(phi::CPUPlace(), output_data2.data(), place, - output2.data(), + output1->data(), sizeof(float) * output_data2.size(), context.stream()); cudaStreamSynchronize(context.stream()); diff --git a/paddle/infrt/backends/tensorrt/trt_engine.cc b/paddle/infrt/backends/tensorrt/trt_engine.cc index 232653e8c41..43d356b6d69 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/trt_engine.cc @@ -21,6 +21,7 @@ #include "paddle/phi/backends/dynload/tensorrt.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" namespace infrt { namespace backends { @@ -235,10 +236,20 @@ bool TrtEngine::SetupNetworkAndConfig(const BuildOptions& build, return true; } +void TrtEngine::PrepareOutputHandle(const std::string& out_name) { + phi::DenseTensor t; + outputs_.emplace(out_name, t); +} + +phi::DenseTensor* TrtEngine::GetOutput(const std::string& name) { + return &outputs_[name]; +} + +size_t TrtEngine::GetOutputNum() const { return outputs_.size(); } + bool TrtEngine::SetUpInference( const InferenceOptions& inference, - const std::unordered_map& inputs, - std::unordered_map* outputs) { + const std::unordered_map& inputs) { // TODO(wilber): now only create one exec_context FreshDeviceId(); CHECK(engine_ != nullptr); @@ -252,10 +263,10 @@ bool TrtEngine::SetUpInference( bindings_.front()->AddBinding( bind_index, it.first, true, it.second, nvinfer1::DataType::kFLOAT); } - for (auto& it : *outputs) { + for (auto& it : outputs_) { const int bind_index = engine_->getBindingIndex(it.first.c_str()); bindings_.front()->AddBinding( - bind_index, it.first, false, it.second, nvinfer1::DataType::kFLOAT); + bind_index, it.first, false, &it.second, nvinfer1::DataType::kFLOAT); } return true; @@ -290,11 +301,13 @@ void TrtEngine::StaticRun(const phi::GPUContext& ctx) { const int bind_index = engine_->getBindingIndex(bind.name.c_str()); std::vector ddim; auto dims = engine_->getBindingDimensions(bind_index); + CHECK_NE(runtime_batch, -1) << "runtime_batch should not be -1."; ddim.push_back(runtime_batch); for (int i = 0; i < dims.nbDims; ++i) { ddim.push_back(dims.d[i]); } bind.buffer->Resize(phi::make_ddim(ddim)); + // TODO(wilber): now only support float output. ctx.Alloc(bind.buffer, sizeof(float) * bind.buffer->numel()); buffers[bind_index] = static_cast(bind.buffer->data()); } diff --git a/paddle/infrt/backends/tensorrt/trt_engine.h b/paddle/infrt/backends/tensorrt/trt_engine.h index 3c8243e3c38..a26474f8cbb 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.h +++ b/paddle/infrt/backends/tensorrt/trt_engine.h @@ -81,11 +81,17 @@ class TrtEngine { // TODO(wilber): How to support multiple execution contexts? bool SetUpInference( const InferenceOptions& inference, - const std::unordered_map& inputs, - std::unordered_map* outputs); + const std::unordered_map& inputs); void GetEngineInfo(); + void PrepareOutputHandle(const std::string& out_name); + + // TODO(wilber): The output tensor names are: output_0, output_1, ... 
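+  // Call order used by this patch's tests (test_trt_engine.cc) and by
+  // CreateTrtEngine in trt_kernels.cc: PrepareOutputHandle() is called once
+  // per engine output before SetUpInference(), so outputs_ owns a
+  // phi::DenseTensor for each output binding; after Run(), GetOutput()
+  // returns a pointer into outputs_ that stays valid while the engine lives.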
+ phi::DenseTensor* GetOutput(const std::string&); + + size_t GetOutputNum() const; + private: void FreshDeviceId(); @@ -112,6 +118,7 @@ class TrtEngine { std::vector> bindings_; int device_id_{0}; bool is_dynamic_shape_{false}; + std::unordered_map outputs_; }; } // namespace tensorrt diff --git a/paddle/infrt/dialect/dense_tensor.td b/paddle/infrt/dialect/dense_tensor.td index 666c7b300af..59df4e96973 100644 --- a/paddle/infrt/dialect/dense_tensor.td +++ b/paddle/infrt/dialect/dense_tensor.td @@ -130,7 +130,7 @@ def TensorMapGetTensorOp : DT_Op<"tensor_map_get_tensor", [NoSideEffect]> { } def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> { - let summary = "ddt.tensor_map_get_size operation"; + let summary = "dt.tensor_map_get_size operation"; let description = [{ An operation that get the size of a TensorMap. @@ -141,6 +141,32 @@ def TensorMapGetSizeOp : DT_Op<"tensor_map_get_size", [NoSideEffect]> { let assemblyFormat = "`(` $map `)` attr-dict `->` type($size)"; } +def Infrt_TensorListGetTensorOp : DT_Op<"tensor_list_get_tensor", [NoSideEffect]> { + let summary = "dt.tensor_list_get_tensor operation"; + + let description = [{ + An operation that can get a tensor from a TensorList. + }]; + + let arguments = (ins + DenseTensorList:$l, + I32Attr:$id + ); + let results = (outs DenseTensor:$output); + let verifier = ?; +} + +def TensorListGetSizeOp : DT_Op<"tensor_list_get_size", [NoSideEffect]> { + let summary = "dt.tensor_list_get_size operation"; + + let description = [{ + An operation that get the size of a TensorList. + }]; + + let arguments = (ins DenseTensorList:$map); + let results = (outs I32:$size); +} + def GetTensorShapeOp : DT_Op<"get_tensor_shape", [NoSideEffect]> { let summary = "dt.get_tensor_shape operation"; diff --git a/paddle/infrt/dialect/infrt/ir/infrt_base.td b/paddle/infrt/dialect/infrt/ir/infrt_base.td index c5130e89bb1..86cfc375330 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_base.td +++ b/paddle/infrt/dialect/infrt/ir/infrt_base.td @@ -89,6 +89,13 @@ def DenseTensorMap : Infrt_Type<"DenseTensorMap"> { let parameters = (ins); } +// TODO(wilber): Add !infrt.vec type. +def DenseTensorList : Infrt_Type<"DenseTensorList"> { + let summary = "infrt dense tensor map"; + let description = [{dense_tensor map}]; + let parameters = (ins); +} + // Type Constrait for concrete DenseTensor type. 
class DenseTensor : Type, diff --git a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc index 3a1b45d3a20..8966ca13c2b 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc @@ -138,6 +138,10 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { parser.getContext(), *targetType, *precisionType, *layoutType); } + if (keyword == "tensor_list") { + return infrt::DenseTensorListType::get(parser.getContext()); + } + if (keyword == "dense_tensor_map") { return DenseTensorMapType::get(parser.getContext()); } @@ -175,6 +179,9 @@ void InfrtDialect::printType(::mlir::Type type, return; } + if (type.isa()) { + os << "tensor_list"; + } // print DenseTensorType, for example: !infrt.dense_tensor if (type.isa()) { os << "dense_tensor_map"; diff --git a/paddle/infrt/dialect/init_dialects.cc b/paddle/infrt/dialect/init_dialects.cc index 55f6de62523..6183295cafb 100644 --- a/paddle/infrt/dialect/init_dialects.cc +++ b/paddle/infrt/dialect/init_dialects.cc @@ -26,6 +26,7 @@ #include "paddle/infrt/dialect/phi/ir/phi_kernels.h" #include "paddle/infrt/dialect/tensor_shape.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT @@ -37,7 +38,8 @@ void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT phi::PHIDenseTensorDialect, phi::PHICPUKernelDialect, phi::PHIGPUKernelDialect, - phi::PHIDialect + phi::PHIDialect, + infrt::trt::TensorRTDialect #endif >(); } diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index 8c3a79498d7..1fda2d9d888 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -21,8 +21,8 @@ def PHI_DenseTensorDialect : Dialect { class PDT_Op traits = []> : Op {} -class CreateDenseTensorOp - : PDT_Op<"create_dense_tensor", [NoSideEffect]> { +class CreateDenseTensorOp + : PDT_Op<"create_dense_tensor." 
# target, [NoSideEffect]> { let arguments = (ins Context:$context, I64ArrayAttr:$dims, LayoutAttr:$layout, I64ArrayAttr:$lod, PrecisionAttr:$precision); let results = (outs DenseTensor:$output); @@ -51,9 +51,11 @@ class CreateContextOp let results = (outs Context:$output); } -def PDT_CreateDenseTensorOp : CreateDenseTensorOp; +def PDT_CreateCPUDenseTensorOp : CreateDenseTensorOp<"cpu">; +def PDT_CreateGPUDenseTensorOp : CreateDenseTensorOp<"gpu">; def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp; def PDT_CreateCPUContextOp : CreateContextOp<"cpu">; +def PDT_CreateGPUContextOp : CreateContextOp<"gpu">; def PDT_PrintDenseTensor : PrintDenseTensorOp; def FakeKernelOp : PDT_Op<"fake_phi_kernel"> { diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc index d5222976625..415a78a6967 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.cc +++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc @@ -21,6 +21,10 @@ #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/tensorrt/trt_dialect_types.h" +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" + namespace infrt { namespace trt { diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.td b/paddle/infrt/dialect/tensorrt/trt_ops.td index 132a1d7805b..31b28a38e7c 100755 --- a/paddle/infrt/dialect/tensorrt/trt_ops.td +++ b/paddle/infrt/dialect/tensorrt/trt_ops.td @@ -7,6 +7,8 @@ include "mlir/Interfaces/CallInterfaces.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/tensorrt/trt_op_base.td" +include "paddle/infrt/dialect/infrt/ir/infrt_base.td" +include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator<"::infrt::ReturnOp">]> { let summary = "trt CreateEngine Op"; @@ -14,8 +16,8 @@ def TRT_CreateEngineOp : TRT_Op<"create_engine", [SingleBlockImplicitTerminator< Describe a tensorrt subgraph. }]; let regions = (region SizedRegion<1>:$body); - let arguments = (ins Variadic:$inputs, DefaultValuedAttr:$run_once); - let results = (outs TRT_EngineType:$output); + let arguments = (ins Variadic:$inputs, DefaultValuedAttr:$run_once); + let results = (outs TRT_EngineType:$engine); } def TRT_ExecuteOp : TRT_Op<"execute", [NoSideEffect]> { @@ -23,8 +25,25 @@ def TRT_ExecuteOp : TRT_Op<"execute", [NoSideEffect]> { let description = [{ Describe a tensorrt runtime. }]; - let arguments = (ins TRT_EngineType:$engine, Variadic:$inputs); - let results = (outs Variadic:$output); + let arguments = (ins TRT_EngineType:$engine, Variadic:$inputs); + let results = (outs Variadic:$output); +} + +def TRT_EngineComputeOp : TRT_Op<"compute", [NoSideEffect]> { + let summary = "trt compute engine"; + let description = [{ + execute engine + }]; + let arguments = (ins TRT_EngineType:$engine, Context:$context); + let results = (outs DenseTensorList:$outputs); +} + +def TRT_InspectEngineOp : TRT_Op<"inspect_engine", [NoSideEffect]> { + let summary = "trt inspect engine"; + let description = [{ + Show engine + }]; + let arguments = (ins TRT_EngineType:$engine); } def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> { @@ -34,11 +53,11 @@ def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> { TensorRT IActivationLayer. 
}]; - let arguments = (ins TRT_Tensor:$input, SI32Attr:$activation_type, + let arguments = (ins DenseTensor:$input, SI32Attr:$activation_type, DefaultValuedAttr:$alpha, DefaultValuedAttr:$beta); - let results = (outs TRT_Tensor:$output); + let results = (outs DenseTensor:$output); } def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> { @@ -48,9 +67,9 @@ def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> { TensorRT IElementWiseLayer. }]; - let arguments = (ins TRT_Tensor:$input1, TRT_Tensor:$input2, SI32Attr:$elementwise_operation); + let arguments = (ins DenseTensor:$input1, DenseTensor:$input2, SI32Attr:$elementwise_operation); - let results = (outs TRT_Tensor:$output); + let results = (outs DenseTensor:$output); } def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> { @@ -60,10 +79,10 @@ def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> { TensorRT IMatrixMultiplyLayer. }]; - let arguments = (ins TRT_Tensor:$input1, BoolAttr:$transpose1, - TRT_Tensor:$input2, BoolAttr:$transpose2); + let arguments = (ins DenseTensor:$input1, BoolAttr:$transpose1, + DenseTensor:$input2, BoolAttr:$transpose2); - let results = (outs TRT_Tensor:$output); + let results = (outs DenseTensor:$output); } #endif // TRT_OPS diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index 319df90d3ee..81bf873ddf0 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -33,7 +33,10 @@ #include "paddle/infrt/dialect/phi/pass/phi_op_convert_pass.h" #include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" #include "paddle/infrt/kernel/phi/registry.h" -#endif +#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT) +#include "paddle/infrt/kernel/tensorrt/registry.h" +#endif // INFRT_WITH_GPU && INFRT_WITH_TRT +#endif // INFRT_WITH_PHI static llvm::cl::list cl_shared_libs( // NOLINT "shared_libs", @@ -62,6 +65,9 @@ int main(int argc, char** argv) { #ifdef INFRT_WITH_PHI kernel::RegisterPhiKernels(®istry); kernel::RegisterInferShapeLaunchers(®istry); +#if defined(INFRT_WITH_GPU) && defined(INFRT_WITH_TRT) + kernel::RegisterTrtKernels(®istry); +#endif // INFRT_WITH_GPU && INFRT_WITH_TRT #endif // load extra shared library diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index c613843cd17..3d5cccb5c32 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -16,12 +16,14 @@ #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -42,6 +44,13 @@ #include "paddle/infrt/host_context/value.h" #include "paddle/infrt/tensor/tensor_shape.h" +#ifdef INFRT_WITH_PHI +#ifdef INFRT_WITH_TRT +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" +#endif +#include "paddle/phi/core/dense_tensor.h" +#endif + namespace infrt { namespace host_context { @@ -277,33 +286,58 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); VLOG(3) << "processing general op : " << op->getName().getStringRef().str(); + // TODO(wilber): Find a more appropriate way to handle special cases. 
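+  // trt.create_engine is handled specially because its kernel consumes the
+  // raw mlir::Operation plus the runtime symbol table (so it can walk the
+  // op's single-block region and build the TensorRT network), rather than
+  // ordinary translated operand Values; block-argument DenseTensor operands
+  // are instead fed to the runtime via FeedInArgs, keyed by operand index.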
+ if (op->getName().getStringRef() == "trt.create_engine") { +#ifdef INFRT_WITH_TRT + auto* symbols = impl_->runtime->symbol_table(); + ::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol mlir_operation; + mlir_operation.operation = op; + mlir_operation.symbol_table = symbols; + impl_->cur_op->AppendArgument(new Value(mlir_operation)); + // TODO(wilber): how to pass DenseTensor to create_engine op? temporialiy + // add a naive implement. + for (int i = 0, e = op->getNumOperands(); i < e; ++i) { + auto operand = op->getOperand(i); + if (operand.isa()) { + mlir::BlockArgument arg = operand.dyn_cast(); + Value* arg_value = GetValue(arg); + if (arg_value->is_type()) { + impl_->runtime->FeedInArgs( + std::make_pair(std::to_string(i), ValueRef(arg_value))); + } + } + } +#else + CHECK(false) << "should not reach here"; +#endif + } else { + // process operands + for (int i = 0, e = op->getNumOperands(); i < e; i++) { + // function argument as value + auto operand = op->getOperand(i); + /// if (operand.getKind() == mlir::Value::Kind::BlockArgument) { + if (operand.isa()) { + mlir::BlockArgument arg = operand.dyn_cast(); + Value* arg_value = GetValue(arg); + impl_->cur_op->AppendArgument(arg_value); + VLOG(3) << "* op mlir operand: " << DumpToString(arg) << " " + << GetValue(arg); + continue; + } - // process operands - for (int i = 0, e = op->getNumOperands(); i < e; i++) { - // function argument as value - auto operand = op->getOperand(i); - /// if (operand.getKind() == mlir::Value::Kind::BlockArgument) { - if (operand.isa()) { - mlir::BlockArgument arg = operand.dyn_cast(); - Value* arg_value = GetValue(arg); + // normal value + Value* arg_value = GetValue(operand); + if (!arg_value) { + auto upstream_op = operand.getDefiningOp(); + arg_value = GetOpResult(upstream_op); + } + CHECK(arg_value) << "No-exist argument value found: " + << DumpToString(operand); impl_->cur_op->AppendArgument(arg_value); - VLOG(3) << "* op mlir operand: " << DumpToString(arg) << " " - << GetValue(arg); - continue; - } - // normal value - Value* arg_value = GetValue(operand); - if (!arg_value) { - auto upstream_op = operand.getDefiningOp(); - arg_value = GetOpResult(upstream_op); + VLOG(3) << "* op mlir operand: " << DumpToString(operand) << " " + << GetValue(operand) << " vs " << arg_value; } - CHECK(arg_value) << "No-exist argument value found: " - << DumpToString(operand); - impl_->cur_op->AppendArgument(arg_value); - - VLOG(3) << "* op mlir operand: " << DumpToString(operand) << " " - << GetValue(operand) << " vs " << arg_value; } // process attributes @@ -383,33 +417,6 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( impl_->cur_op->AppendAttribute(tmp[i]); } - // process results - llvm::SmallVector res_values; - for (int i = 0, e = op->getNumResults(); i < e; i++) { - auto res = op->getResult(i); - if (res.getType().isa<::infrt::DenseTensorType>()) { - auto r = impl_->value_map.try_emplace( - res, ValueRef(new Value{::phi::DenseTensor()})); - CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res) - << "]"; - res_values.push_back(r.first->second.get()); - } else { - res_values.push_back(AddValue(res)); - } - - VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); - } - impl_->cur_op->SetResults(res_values); - -#ifdef INFRT_DEBUG - { - VLOG(3) << "check result"; - for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { - VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; - } - } -#endif - // process regions, we treat regions as attribute. 
auto num_regions = op->getNumRegions(); if (num_regions > 0) { @@ -438,6 +445,33 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( impl_->cur_op->AppendAttribute(new Value(function)); } + // process results + llvm::SmallVector res_values; + for (int i = 0, e = op->getNumResults(); i < e; i++) { + auto res = op->getResult(i); + if (res.getType().isa<::infrt::DenseTensorType>()) { + auto r = impl_->value_map.try_emplace( + res, ValueRef(new Value{::phi::DenseTensor()})); + CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res) + << "]"; + res_values.push_back(r.first->second.get()); + } else { + res_values.push_back(AddValue(res)); + } + + VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); + } + impl_->cur_op->SetResults(res_values); + +#ifdef INFRT_DEBUG + { + VLOG(3) << "check result"; + for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { + VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; + } + } +#endif + return true; } diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index 957d852442b..1f0b1dabd94 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -24,6 +24,7 @@ #include "paddle/infrt/common/shared.h" #include "paddle/infrt/dialect/infrt/common/types.h" #include "paddle/infrt/host_context/function.h" +#include "paddle/infrt/host_context/symbol_table.h" #include "paddle/infrt/support/variant.h" #include "paddle/infrt/tensor/dense_host_tensor.h" #include "paddle/infrt/tensor/dense_tensor_view.h" @@ -41,7 +42,15 @@ #include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/meta_tensor.h" -#endif + +#ifdef INFRT_WITH_GPU +#include "paddle/phi/backends/gpu/gpu_context.h" +#endif // INFRT_WITH_GPU +#ifdef INFRT_WITH_TRT +#include "paddle/infrt/backends/tensorrt/trt_engine.h" +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" +#endif // INFRT_WITH_TRT +#endif // INFRT_WITH_PHI namespace infrt { namespace host_context { @@ -72,8 +81,13 @@ using ValueVariantType = ::phi::MetaTensor, ::phi::DenseTensor, backends::CpuPhiContext, +#ifdef INFRT_WITH_GPU + backends::GpuPhiContext, + ::phi::GPUContext, +#endif ::phi::CPUContext, std::vector, + std::vector, paddle::experimental::ScalarBase, paddle::experimental::ScalarArrayBase, std::vector, @@ -81,6 +95,10 @@ using ValueVariantType = paddle::experimental::Backend, paddle::experimental::DataLayout, paddle::experimental::DataType, +#ifdef INFRT_WITH_TRT + ::infrt::backends::tensorrt::TrtEngine, + ::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol, +#endif // INFRT_WITH_TRT #endif std::vector, std::vector, @@ -120,8 +138,18 @@ class Value : public common::Object { #ifdef INFRT_WITH_PHI explicit Value(::phi::CPUContext&& x) : data(std::move(x)) {} explicit Value(backends::CpuPhiContext&& x) : data(std::move(x)) {} +#ifdef INFRT_WITH_GPU + explicit Value(::phi::GPUContext&& x) : data(std::move(x)) {} + explicit Value(backends::GpuPhiContext&& x) : data(std::move(x)) {} +#endif explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {} explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {} +#ifdef INFRT_WITH_TRT + explicit Value(::infrt::backends::tensorrt::TrtEngine&& x) + : data(std::move(x)) {} + explicit Value(::infrt::kernel::tensorrt::MlirOperationWithInfrtSymbol x) + : data(x) {} +#endif // INFRT_WITH_TRT #endif template diff --git a/paddle/infrt/kernel/CMakeLists.txt b/paddle/infrt/kernel/CMakeLists.txt index f1cbfba1c46..f20344f6f6b 
100644 --- a/paddle/infrt/kernel/CMakeLists.txt +++ b/paddle/infrt/kernel/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(phi) +add_subdirectory(tensorrt) core_gather_headers() diff --git a/paddle/infrt/kernel/phi/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc index 39ef172fade..b27eacf9e52 100644 --- a/paddle/infrt/kernel/phi/context_kernels.cc +++ b/paddle/infrt/kernel/phi/context_kernels.cc @@ -25,6 +25,16 @@ namespace phi { return ctx; } +#ifdef INFRT_WITH_GPU +::phi::GPUContext CreateGPUContext() { + ::phi::GPUContext context; + context.PartialInitWithoutAllocator(); + context.SetAllocator(new ::infrt::backends::GpuPhiAllocator{}); + context.PartialInitWithAllocator(); + return context; +} +#endif + } // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/phi/context_kernels.h b/paddle/infrt/kernel/phi/context_kernels.h index 3e9580b91da..ae3f76c8fe5 100644 --- a/paddle/infrt/kernel/phi/context_kernels.h +++ b/paddle/infrt/kernel/phi/context_kernels.h @@ -25,6 +25,10 @@ namespace phi { ::phi::CPUContext CreateCPUContext(); +#ifdef INFRT_WITH_GPU +::phi::GPUContext CreateGPUContext(); +#endif + } // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index 777fb29ac60..6d16b814c6b 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -15,6 +15,12 @@ #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" #include "paddle/infrt/dialect/phi/data_type.h" #include "paddle/infrt/kernel/phi/context_kernels.h" +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/common/place.h" + +#ifdef INFRT_WITH_GPU +#include +#endif namespace infrt { namespace kernel { @@ -34,26 +40,83 @@ namespace phi { {})); } +::phi::DenseTensor CreateGPUDenseTensor( + const ::phi::GPUContext& context, + host_context::Attribute> dims, + host_context::Attribute> lod, + host_context::Attribute<::infrt::LayoutType> layout, + host_context::Attribute<::infrt::PrecisionType> precision) { + return ::phi::DenseTensor( + const_cast<::phi::Allocator*>(&context.GetAllocator()), + ::phi::DenseTensorMeta(ConvertPrecisionToPhi(precision.get()), + ::phi::make_ddim(dims.get()), + ConvertLayoutToPhi(layout.get()), + {})); +} + void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, host_context::Attribute> value) { - auto place = ::phi::CPUPlace(); + auto place = dense_tensor->place(); float* a_data = dense_tensor->mutable_data(place); - for (int64_t i = 0; i < dense_tensor->numel(); ++i) { - a_data[i] = (value.get())[i]; + if (place.GetType() == ::phi::AllocationType::CPU) { + for (int64_t i = 0; i < dense_tensor->numel(); ++i) { + a_data[i] = (value.get())[i]; + } + } else if (place.GetType() == ::phi::AllocationType::GPU) { +#ifdef INFRT_WITH_GPU + // TODO(wilber): how to set the stream parameter to copy with stream. 
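+    // The copy below is host-to-device on the default stream; cudaMemcpy is
+    // synchronous, so it blocks the calling thread until the transfer
+    // finishes. Making it stream-aware (cudaMemcpyAsync on the context's
+    // stream) is what the TODO above refers to.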
+ cudaMemcpy(a_data, + value.get().data(), + sizeof(float) * value.get().size(), + cudaMemcpyHostToDevice); +#endif + } else { + llvm_unreachable("temporarily not support other target."); } } void PrintDenseTensor(::phi::DenseTensor* dense_tensor) { -#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ - case ::phi::DataType::PHI_DATATYPE: { \ - DTYPE* data = dense_tensor->data(); \ - if (dense_tensor->numel() == 0) break; \ - std::cout << data[0]; \ - for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ - std::cout << "," << data[i]; \ - } \ - break; \ +#ifndef INFRT_WITH_GPU +#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ + case ::phi::DataType::PHI_DATATYPE: { \ + auto place = dense_tensor->place(); \ + if (place.GetType() == ::phi::AllocationType::CPU) { \ + DTYPE* data = dense_tensor->data(); \ + if (dense_tensor->numel() == 0) break; \ + std::cout << data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << data[i]; \ + } \ + } \ + break; \ + } +#else +#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ + case ::phi::DataType::PHI_DATATYPE: { \ + auto place = dense_tensor->place(); \ + DTYPE* data = dense_tensor->data(); \ + if (dense_tensor->numel() == 0) break; \ + if (place.GetType() == ::phi::AllocationType::CPU) { \ + std::cout << data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << data[i]; \ + } \ + } else if (place.GetType() == ::phi::AllocationType::GPU) { \ + std::vector host_data(dense_tensor->numel(), 0); \ + cudaMemcpy(host_data.data(), \ + data, \ + sizeof(DTYPE) * dense_tensor->numel(), \ + cudaMemcpyDeviceToHost); \ + std::cout << host_data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << host_data[i]; \ + } \ + } else { \ + llvm_unreachable("temporarily not support other target."); \ + } \ + break; \ } +#endif ::phi::DDim dims = dense_tensor->dims(); std::cout << "dense_tensor: shape=shape" << dims.to_str() << "," diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h index 8cc0e39e0e4..47d89506e2a 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -30,6 +30,13 @@ namespace phi { host_context::Attribute<::infrt::LayoutType> layout, host_context::Attribute<::infrt::PrecisionType> precision); +::phi::DenseTensor CreateGPUDenseTensor( + const ::phi::GPUContext& context, + host_context::Attribute> dims, + host_context::Attribute> lod, + host_context::Attribute<::infrt::LayoutType> layout, + host_context::Attribute<::infrt::PrecisionType> precision); + void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, host_context::Attribute> values); void PrintDenseTensor(::phi::DenseTensor* dense_tensor); diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc index 0e071418603..36d40118f16 100644 --- a/paddle/infrt/kernel/phi/registry.cc +++ b/paddle/infrt/kernel/phi/registry.cc @@ -35,7 +35,7 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { registry->AddKernel("phi_dt.create_context.cpu", INFRT_KERNEL(infrt::kernel::phi::CreateCPUContext)); registry->AddKernelWithAttrs( - "phi_dt.create_dense_tensor", + "phi_dt.create_dense_tensor.cpu", INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensor), {"dims", "lod", "layout", "precision"}); registry->AddKernelWithAttrs( @@ -44,6 +44,15 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { {"value"}); registry->AddKernel("phi_dt.print_tensor", 
INFRT_KERNEL(infrt::kernel::phi::PrintDenseTensor)); + +#ifdef INFRT_WITH_GPU + registry->AddKernel("phi_dt.create_context.gpu", + INFRT_KERNEL(infrt::kernel::phi::CreateGPUContext)); + registry->AddKernelWithAttrs( + "phi_dt.create_dense_tensor.gpu", + INFRT_KERNEL(infrt::kernel::phi::CreateGPUDenseTensor), + {"dims", "lod", "layout", "precision"}); +#endif } } // namespace kernel diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index b7503aa4ef3..79502f9fdfd 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -25,6 +25,10 @@ #include "paddle/infrt/tensor/tensor_map.h" #include "paddle/infrt/tensor/tensor_shape.h" +#ifdef INFRT_WITH_PHI +#include "paddle/phi/core/dense_tensor.h" +#endif + namespace infrt { namespace kernel { using namespace host_context; // NOLINT @@ -62,6 +66,20 @@ DenseHostTensor TensorMapGetTensor(TensorMap map, Attribute name) { int32_t TensorMapGetSize(TensorMap map) { return map.size(); } +// TODO(wilber): Maybe we should place TensorList type in dt dialect. +#ifdef INFRT_WITH_PHI +phi::DenseTensor TensorListGetTensor(std::vector list, + Attribute idx) { + CHECK_LT(idx.get(), static_cast(list.size())) + << "idx should less than list size"; + return *list[idx.get()]; +} + +int32_t TensorListGetSize(const std::vector &list) { + return list.size(); +} +#endif + DenseHostTensor ShallowCopyTensor(DenseHostTensor v) { return v; } template @@ -126,6 +144,14 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) { INFRT_KERNEL(TensorMapGetTensor)); registry->AddKernel("dt.tensor_map_get_size", INFRT_KERNEL(TensorMapGetSize)); +// TensorList related methods. +#ifdef INFRT_WITH_PHI + registry->AddKernel("dt.tensor_list_get_tensor", + INFRT_KERNEL(TensorListGetTensor)); + registry->AddKernel("dt.tensor_list_get_size", + INFRT_KERNEL(TensorListGetSize)); +#endif + registry->AddKernel("dt.shallow_copy_tensor", INFRT_KERNEL(ShallowCopyTensor)); diff --git a/paddle/infrt/kernel/tensorrt/CMakeLists.txt b/paddle/infrt/kernel/tensorrt/CMakeLists.txt new file mode 100644 index 00000000000..cd35fccbe2a --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/CMakeLists.txt @@ -0,0 +1,10 @@ +if (NOT (INFRT_WITH_PHI AND INFRT_WITH_GPU AND INFRT_WITH_TRT)) + return() +endif() + +core_gather_headers() + +gather_srcs(infrt_src SRCS + registry.cc + trt_kernels.cc +) diff --git a/paddle/infrt/kernel/tensorrt/registry.cc b/paddle/infrt/kernel/tensorrt/registry.cc new file mode 100644 index 00000000000..a37e3c0f7f2 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/registry.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
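+//
+// Registers the TensorRT host kernels so the infrt host context can dispatch
+// the trt.create_engine, trt.inspect_engine and trt.compute MLIR ops to
+// CreateTrtEngine, PrintTrtLayer and TrtEngineCompute respectively.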
+ +#include "paddle/infrt/kernel/tensorrt/registry.h" + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" + +namespace infrt { +namespace kernel { + +void RegisterTrtKernels(host_context::KernelRegistry* registry) { + registry->AddKernel("trt.create_engine", + INFRT_KERNEL(tensorrt::CreateTrtEngine)); + registry->AddKernel("trt.inspect_engine", + INFRT_KERNEL(tensorrt::PrintTrtLayer)); + registry->AddKernel("trt.compute", INFRT_KERNEL(tensorrt::TrtEngineCompute)); +} + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/registry.h b/paddle/infrt/kernel/tensorrt/registry.h new file mode 100644 index 00000000000..762329ca61d --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/registry.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +namespace infrt { +namespace host_context { + +struct KernelRegistry; + +} // namespace host_context +} // namespace infrt + +namespace infrt { +namespace kernel { + +/** + * Register all the trt kernels to registry. + */ +void RegisterTrtKernels(host_context::KernelRegistry* registry); + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc new file mode 100644 index 00000000000..04847ac8982 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/kernel/tensorrt/trt_kernels.h" +#include +#include "NvInfer.h" +#include "NvInferRuntime.h" +#include "NvInferRuntimeCommon.h" +#include "glog/logging.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Value.h" +#include "paddle/infrt/backends/tensorrt/trt_engine.h" +#include "paddle/infrt/backends/tensorrt/trt_options.h" +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" +#include "paddle/infrt/host_context/symbol_table.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace kernel { +namespace tensorrt { + +::infrt::backends::tensorrt::TrtEngine CreateTrtEngine( + MlirOperationWithInfrtSymbol + create_engine_op /*, input_tensors, output_tensors, weights*/) { + // TODO(wilber): The device_id needs to get from mlir. + int device_id = 0; + backends::tensorrt::TrtEngine engine(device_id); + + auto* builder = engine.GetTrtBuilder(); + // TODO(wilber): How to process weights? + backends::tensorrt::TrtUniquePtr network; + // TODO(wilber): static_shape or dynamic_shape network? The code is just + // static_shape test. + network.reset(builder->createNetworkV2(0)); + + // TODO(wilber): The build option shoule be fiiled from mlir info. + backends::tensorrt::BuildOptions options; + options.max_batch = 4; + + // Parse mlir Region which only has one block. + mlir::Operation& operation = *create_engine_op.operation; + auto* symbol_table = create_engine_op.symbol_table; + CHECK_NOTNULL(symbol_table); + + unsigned int num_regions = operation.getNumRegions(); + CHECK_EQ(num_regions, 1U) << "only support one region case."; + auto& region = operation.getRegion(0); + auto& block = region.getBlocks().front(); + + llvm::DenseMap map_info; + std::unordered_map trt_bind_inputs; + + for (auto index_operand : llvm::enumerate(operation.getOperands())) { + mlir::Value operand = index_operand.value(); + size_t idx = index_operand.index(); + + const std::string input_name = "input_" + std::to_string(idx); + auto* v = symbol_table->GetValue(std::to_string(idx)); + CHECK_NOTNULL(v); + auto* t = &v->get(); + trt_bind_inputs[input_name] = t; + // TODO(wilber): get input info from mlir. + // TODO(wilber): input dims, now only support static_shape, and just remove + // the first dimension. + // TODO(wilber): now only suppot float input. + nvinfer1::Dims dims; + dims.nbDims = t->dims().size() - 1; + for (int i = 0; i < dims.nbDims; ++i) { + dims.d[i] = t->dims()[i + 1]; + } + auto* in = + network->addInput(input_name.c_str(), nvinfer1::DataType::kFLOAT, dims); + map_info[operand] = in; + } + + // TODO(wilber): Find a way to add layer. 
+ for (auto& inner_op : block.without_terminator()) { + if (inner_op.getName().getStringRef() == "trt.Activation") { + trt::ActivationOp act_op = llvm::dyn_cast(inner_op); + auto in_arg = act_op.getOperand(); + if (!map_info.count(in_arg)) { + CHECK(false) << "map_info not has in_arg."; + } + nvinfer1::ActivationType act_type = + static_cast(act_op.activation_type()); + auto* act_layer = network->addActivation(*map_info[in_arg], act_type); + act_layer->setAlpha(act_op.alpha().convertToFloat()); + act_layer->setBeta(act_op.beta().convertToFloat()); + for (size_t i = 0; i < act_op->getNumResults(); ++i) { + nvinfer1::ITensor* act_out_tensor = act_layer->getOutput(i); + mlir::Value act_out = act_op->getResult(i); + map_info[act_out] = act_out_tensor; + } + } + + // if (inner_op.getName().getStringRef() == "trt.Constant") { + // trt::ConstantOp op = llvm::dyn_cast(inner_op); + // mlir::Value op_out = op.getResult(); + // std::vector weight_data{1}; + // auto* layer = network->addConstant(nvinfer1::Dims2(1, 1), + // nvinfer1::Weights{nvinfer1::DataType::kFLOAT, weight_data.data(), 1}); + // auto* op_out_tenor = layer->getOutput(0); + // map_info[op_out] = op_out_tenor; + // } + } + for (auto& inner_op : block.without_terminator()) { + for (mlir::Value v : inner_op.getResults()) { + for (mlir::Operation* user : v.getUsers()) { + if (user->getName().getStringRef() == "infrt.return") { + if (!map_info.count(v)) { + CHECK(false) << "map_info not has value"; + } + network->markOutput(*map_info[v]); + } + } + } + } + // std::unordered_map trt_bind_outputs; + mlir::Operation* ret = block.getTerminator(); + for (unsigned int i = 0; i < ret->getNumOperands(); ++i) { + mlir::Value arg = ret->getOperand(i); + CHECK(map_info.count(arg)); + map_info[arg]->setName(("output_" + std::to_string(i)).c_str()); + } + for (int i = 0; i < network->getNbOutputs(); ++i) { + engine.PrepareOutputHandle(network->getOutput(i)->getName()); + } + + VLOG(3) << "trt engine build start."; + engine.Build(std::move(network), options); + VLOG(3) << "trt engine build done."; + + // TODO(wilber): get inference options from mlir. + backends::tensorrt::InferenceOptions inference_options; + inference_options.batch = 1; + // TODO(wilber): bind trt input/output tensors. + engine.SetUpInference(inference_options, trt_bind_inputs); + return engine; +} + +void PrintTrtLayer(backends::tensorrt::TrtEngine* engine) { + engine->GetEngineInfo(); +} + +std::vector TrtEngineCompute( + backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context) { + engine->Run(context); + std::vector res; + for (size_t i = 0; i < engine->GetOutputNum(); ++i) { + res.push_back(engine->GetOutput("output_" + std::to_string(i))); + } + return res; +} + +} // namespace tensorrt +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.h b/paddle/infrt/kernel/tensorrt/trt_kernels.h new file mode 100644 index 00000000000..546ee9dc788 --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.h @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "mlir/IR/Operation.h" + +#include "paddle/infrt/backends/tensorrt/trt_engine.h" +#include "paddle/phi/backends/gpu/gpu_context.h" + +namespace infrt { +namespace host_context { +class SymbolTable; +} // namespace host_context + +namespace kernel { +namespace tensorrt { + +struct MlirOperationWithInfrtSymbol { + mlir::Operation* operation; + ::infrt::host_context::SymbolTable* symbol_table; +}; + +::infrt::backends::tensorrt::TrtEngine CreateTrtEngine( + MlirOperationWithInfrtSymbol engine_op); + +void PrintTrtLayer(backends::tensorrt::TrtEngine* engine); + +std::vector TrtEngineCompute( + backends::tensorrt::TrtEngine* engine, const phi::GPUContext& context); + +} // namespace tensorrt +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/tests/dialect/disabled_trt.mlir b/paddle/infrt/tests/dialect/disabled_trt.mlir new file mode 100644 index 00000000000..ef86dcf1e72 --- /dev/null +++ b/paddle/infrt/tests/dialect/disabled_trt.mlir @@ -0,0 +1,37 @@ +// RUN: infrtexec -i %s | FileCheck %s + +// CHECK-LABEL: @run_trt +func @run_trt(%0 : !infrt.dense_tensor, %ctx : !phi.context) { + %a = "trt.create_engine"(%0) ({ + %1 = "trt.Activation"(%0) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor) -> !infrt.dense_tensor + "infrt.return"(%1) : (!infrt.dense_tensor) -> () + }) : (!infrt.dense_tensor) -> !trt.engine + "trt.inspect_engine"(%a) {} : (!trt.engine) -> () + + %res = "trt.compute"(%a, %ctx) {} : (!trt.engine, !phi.context) -> (!infrt.tensor_list) + %size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32) + "infrt.print.i32"(%size) {} : (i32) -> () + + %ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor) -> () + + infrt.return +} + +// CHECK-LABEL: @main +func @main() { + %ctx = "phi_dt.create_context.gpu" (): () -> !phi.context + %t = "phi_dt.create_dense_tensor.gpu" (%ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[1:i64, 3:i64, 1:i64, 1:i64], lod=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) + + "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor) -> () + "phi_dt.print_tensor" (%t) : (!infrt.dense_tensor) -> () + + //%res = + infrt.call @run_trt(%t, %ctx) : (!infrt.dense_tensor, !phi.context) -> () + //-> (!infrt.dense_tensor) + + infrt.return +} diff --git a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir index 3657777a5b0..b8cb1a5cec2 100644 --- a/paddle/infrt/tests/dialect/phi/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir @@ -3,7 +3,7 @@ // CHECK-LABEL: @sign_any_float32_execute func @sign_any_float32_execute() { %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context - %t = "phi_dt.create_dense_tensor" (%ctx) { + %t = "phi_dt.create_dense_tensor.cpu" (%ctx) { precision=#infrt.precision, layout=#infrt.layout, lod=[1:i64], dims=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) 
"phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () diff --git a/paddle/infrt/tests/dialect/phi/phi_test.mlir b/paddle/infrt/tests/dialect/phi/phi_test.mlir index 5b0fa735897..21ee8ebf0b7 100644 --- a/paddle/infrt/tests/dialect/phi/phi_test.mlir +++ b/paddle/infrt/tests/dialect/phi/phi_test.mlir @@ -6,7 +6,7 @@ module { } func @main() { %ctx = "phi_dt.create_context.cpu" (): () -> !phi.context - %t = "phi_dt.create_dense_tensor" (%ctx) {precision=#infrt.precision, layout=#infrt.layout, lod=[1:i64], dims=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) + %t = "phi_dt.create_dense_tensor.cpu" (%ctx) {precision=#infrt.precision, layout=#infrt.layout, lod=[1:i64], dims=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () %2 = infrt.call@predict(%t) : (!infrt.dense_tensor) -> !infrt.dense_tensor phi_dt.print_tensor(%2 : !infrt.dense_tensor) diff --git a/paddle/infrt/tests/dialect/trt_ops.mlir b/paddle/infrt/tests/dialect/trt_ops.mlir index e3cb9670bec..7bdf62a2778 100644 --- a/paddle/infrt/tests/dialect/trt_ops.mlir +++ b/paddle/infrt/tests/dialect/trt_ops.mlir @@ -1,16 +1,16 @@ // RUN: trt-exec %s // CHECK-LABEL: @main -func @main(%bias:tensor, %c:tensor, %b1:tensor, %b2:tensor, %bias1:tensor, %bias2:tensor) -> tensor { - %d = "pd.elementwise_add"(%c, %bias) {axis=-1:si32} : (tensor, tensor) -> tensor - %e = "pd.relu6"(%d) {} : (tensor) -> tensor +func @main(%bias:!infrt.dense_tensor, %c:!infrt.dense_tensor, %b1:!infrt.dense_tensor, %b2:!infrt.dense_tensor, %bias1:!infrt.dense_tensor, %bias2:!infrt.dense_tensor) -> !infrt.dense_tensor { + %d = "pd.elementwise_add"(%c, %bias) {axis=-1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %e = "pd.relu6"(%d) {} : (!infrt.dense_tensor) -> !infrt.dense_tensor - %c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (tensor, tensor) -> tensor - %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=-1:si32} : (tensor, tensor) -> tensor - %e1 = "pd.relu"(%d1) {} : (tensor) -> tensor + %c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=-1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %e1 = "pd.relu"(%d1) {} : (!infrt.dense_tensor) -> !infrt.dense_tensor - %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor - %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (tensor, tensor) -> tensor - %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor + %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + %e2 = "pd.relu"(%d2) {} : (!infrt.dense_tensor) -> !infrt.dense_tensor - infrt.return %e2 : tensor + infrt.return %e2 : !infrt.dense_tensor } diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index a3b25259858..0394835aa8b 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -741,6 +741,10 @@ struct GPUContext::Impl { GPUContext::GPUContext() : DeviceContext(), impl_(std::make_unique()) {} +GPUContext::GPUContext(GPUContext&&) = default; + +GPUContext& GPUContext::operator=(GPUContext&&) = default; + GPUContext::GPUContext(const 
GPUPlace& place) : DeviceContext(), impl_(std::make_unique(place)) {} diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 3eb4360ad35..cd08da1c0f2 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -77,6 +77,8 @@ class DnnWorkspaceHandle { class GPUContext : public DeviceContext { public: GPUContext(); + GPUContext(GPUContext&&); + GPUContext& operator=(GPUContext&&); explicit GPUContext(const GPUPlace& place); -- GitLab From 827b6a0e5a6d6439b00b743d243c4d4a9e318546 Mon Sep 17 00:00:00 2001 From: Leo Chen <39020268+leo0519@users.noreply.github.com> Date: Thu, 17 Mar 2022 10:16:24 +0800 Subject: [PATCH 123/176] Improve the performance of fake quantize OP (#40491) * Move the computation of moving average scale to device * Use register to save local maximum in a thread --- paddle/fluid/operators/fake_quantize_op.cu | 63 +++++++++++----------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 9f7e4fb8d57..70597be393c 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -28,13 +28,14 @@ __global__ void FindAbsMaxKernel(const T* in, const int n, T* out) { extern __shared__ char* shared_max_data_tmp[]; auto shared_max_data = reinterpret_cast(shared_max_data_tmp); if (gridDim.x > 1) { - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = bid; i < n; i += blockDim.x * gridDim.x) { T tmp = abs(in[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; } else { if (bid < n) { shared_max_data[tid] = abs(in[bid]); @@ -83,13 +84,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n, int channel_size = n / c; const T* in_c = in + blockIdx.x * channel_size; extern __shared__ T shared_max_data[]; - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = tid; i < channel_size; i += blockDim.x) { T tmp = fabs(in_c[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; __syncthreads(); for (int i = blockDim.x / 2; i > 0; i >>= 1) { if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) { @@ -113,13 +115,14 @@ __global__ void FindChannelAbsMaxKernelQuantAxis1(const T* in, const int n, int tid = threadIdx.x; int bid = blockIdx.x; const T* in_current = in + tid * cout_wh_size + bid * wh_size; - shared_max_data[tid] = T(0); + T local_max_data = T(0); for (int i = 0; i < wh_size; i++) { T tmp = fabs(in_current[i]); - if (tmp > shared_max_data[tid]) { - shared_max_data[tid] = tmp; + if (tmp > local_max_data) { + local_max_data = tmp; } } + shared_max_data[tid] = local_max_data; __syncthreads(); int len = blockDim.x; @@ -404,6 +407,19 @@ struct FindRangeAbsMaxFunctor { } }; +template +__global__ void FindMovingAverageAbsMaxKernel(const T* in_state, + const T* in_accum, + const T* cur_scale, const T rate, + T* out_state, T* out_accum, + T* out_scale) { + T state = rate * (*in_state) + T(1.0f); + T accum = rate * (*in_accum) + (*cur_scale); + *out_state = state; + *out_accum = accum; + *out_scale = accum / state; +} + template struct FindRangeAbsMaxFunctor; template @@ -415,29 +431,14 @@ struct FindMovingAverageAbsMaxFunctor { framework::Tensor* out_accum, framework::Tensor* 
out_scale) { const auto gpu_place = ctx.GetPlace(); - T accum; - T state; - T scale; - memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data(), - sizeof(T), ctx.stream()); - memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data(), - sizeof(T), ctx.stream()); - memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T), - ctx.stream()); - ctx.Wait(); - T rate_t = static_cast(rate); - state = rate_t * state + static_cast(1.0); - accum = rate_t * accum + scale; - scale = accum / state; - - memory::Copy(gpu_place, out_accum->mutable_data(gpu_place), - platform::CPUPlace(), &accum, sizeof(T), ctx.stream()); - memory::Copy(gpu_place, out_state->mutable_data(gpu_place), - platform::CPUPlace(), &state, sizeof(T), ctx.stream()); - memory::Copy(gpu_place, out_scale->mutable_data(gpu_place), - platform::CPUPlace(), &scale, sizeof(T), ctx.stream()); - ctx.Wait(); + T* out_state_data = out_state->mutable_data(gpu_place); + T* out_accum_data = out_accum->mutable_data(gpu_place); + T* out_scale_data = out_scale->mutable_data(gpu_place); + + FindMovingAverageAbsMaxKernel<<<1, 1, 0, ctx.stream()>>>( + in_state.data(), in_accum.data(), cur_scale, rate_t, + out_state_data, out_accum_data, out_scale_data); } }; -- GitLab From b1b244630ce7fa270a97cc3fb0bd50ee43dcbc13 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 17 Mar 2022 10:20:25 +0800 Subject: [PATCH 124/176] move grid sample op infershape (#40625) --- paddle/fluid/operators/grid_sampler_op.cc | 63 +++++------------------ paddle/phi/infermeta/binary.cc | 42 +++++++++++++++ paddle/phi/infermeta/binary.h | 5 ++ 3 files changed, 59 insertions(+), 51 deletions(-) diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 6ee9582dacd..f6d3fd89846 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -15,9 +15,13 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -27,43 +31,6 @@ using Tensor = framework::Tensor; class GridSampleOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GridSampler"); - OP_INOUT_CHECK(ctx->HasInput("Grid"), "Input", "Grid", "GridSampler"); - OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "GridSampler"); - - auto x_dims = ctx->GetInputDim("X"); - auto grid_dims = ctx->GetInputDim("Grid"); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::InvalidArgument( - "Input(X) of GridSampleOp should be 4-D Tensor, but " - "received X dimension size(%d)", - x_dims.size())); - PADDLE_ENFORCE_EQ(grid_dims.size(), 4, - platform::errors::InvalidArgument( - "Input(Grid) of GridSampleOp should be 4-D Tensor, " - "but received X dimension size(%d)", - grid_dims.size())); - if (ctx->IsRuntime() || grid_dims[3] > 0) { - PADDLE_ENFORCE_EQ( - grid_dims[3], 2, - platform::errors::InvalidArgument( - "Input(Grid) dimension[3] should be 2, but received %d", - grid_dims[3])); - } - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - grid_dims[0], x_dims[0], - platform::errors::InvalidArgument( - "Input(X) and Input(Grid) dimension[0] should be equal, but " - "received X dimension[0](%d) != Grid dimension[0](%d)", - x_dims[0], grid_dims[0])); - } - - ctx->SetOutputDim("Output", - {x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]}); - ctx->ShareLoD("X", "Output"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -173,18 +140,6 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker { class GridSampleOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", - framework::GradVarName("X"), "grid_sampler"); - auto input_dims = ctx->GetInputDim("X"); - auto grid_dims = ctx->GetInputDim("Grid"); - if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), input_dims); - } - if (ctx->HasOutput(framework::GradVarName("Grid"))) { - ctx->SetOutputDim(framework::GradVarName("Grid"), grid_dims); - } - } protected: framework::OpKernelType GetExpectedKernelType( @@ -224,10 +179,16 @@ class GridSampleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler, GridSamplerInferShapeFunctor, + PD_INFER_META(phi::GridSampleBaseInferMeta)); REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker, ops::GridSampleGradMaker, - ops::GridSampleGradMaker); -REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad); + ops::GridSampleGradMaker, + GridSamplerInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(grid_sampler_grad, GridSamplerGradInferShapeFunctor, + PD_INFER_META(phi::GeneralBinaryGradInferMeta)); +REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad, + GridSamplerGradInferShapeFunctor); 
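+// The registrations above follow the common phi-migration pattern: the C++
+// InferShape body is removed from the operator and replaced by an infershape
+// functor that adapts InferShapeContext to the phi infermeta signature. A
+// minimal sketch for a hypothetical operator "my_op" whose shape inference
+// lives in phi::MyOpInferMeta (these names are illustrative, not part of this
+// patch):
+//
+//   DECLARE_INFER_SHAPE_FUNCTOR(my_op, MyOpInferShapeFunctor,
+//                               PD_INFER_META(phi::MyOpInferMeta));
+//   REGISTER_OPERATOR(my_op, ops::MyOp, ops::MyOpMaker,
+//                     MyOpInferShapeFunctor);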
REGISTER_OP_VERSION(grid_sampler) .AddCheckpoint( diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 38dce0dc69d..521f2a9bf06 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -571,6 +571,48 @@ void GatherTreeMeta(const MetaTensor& ids, out->set_dims(ids_dims); } +void GridSampleBaseInferMeta(const MetaTensor& x, + const MetaTensor& grid, + MetaTensor* out, + MetaConfig config) { + auto x_dims = x.dims(); + auto grid_dims = grid.dims(); + PADDLE_ENFORCE_EQ(x_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input(X) of GridSampleOp should be 4-D Tensor, but " + "received X dimension size(%d)", + x_dims.size())); + PADDLE_ENFORCE_EQ(grid_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input(Grid) of GridSampleOp should be 4-D Tensor, " + "but received X dimension size(%d)", + grid_dims.size())); + if (config.is_runtime || grid_dims[3] > 0) { + PADDLE_ENFORCE_EQ( + grid_dims[3], + 2, + phi::errors::InvalidArgument( + "Input(Grid) dimension[3] should be 2, but received %d", + grid_dims[3])); + } + if (config.is_runtime) { + PADDLE_ENFORCE_EQ( + grid_dims[0], + x_dims[0], + phi::errors::InvalidArgument( + "Input(X) and Input(Grid) dimension[0] should be equal, but " + "received X dimension[0](%d) != Grid dimension[0](%d)", + x_dims[0], + grid_dims[0])); + } + + out->set_dims({x_dims[0], x_dims[1], grid_dims[1], grid_dims[2]}); + out->set_dtype(x.dtype()); + out->share_lod(x); +} + void HuberLossInferMeta(const MetaTensor& input, const MetaTensor& label, float delta, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 8cf7ce3930e..9e1a35640ad 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -103,6 +103,11 @@ void GatherTreeMeta(const MetaTensor& ids, const MetaTensor& parents, MetaTensor* out); +void GridSampleBaseInferMeta(const MetaTensor& x, + const MetaTensor& grid, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void HuberLossInferMeta(const MetaTensor& input_meta, const MetaTensor& label_meta, float delta, -- GitLab From c1931beb72ff8429ec44230c014777840fb231b0 Mon Sep 17 00:00:00 2001 From: Jiabin Yang <360788950@qq.com> Date: Thu, 17 Mar 2022 10:26:08 +0800 Subject: [PATCH 125/176] fix copy_ problem by doing it with phi copy (#40521) * fix copy_ problem by doing it with phi copy * improve test coverage * refactor copy with sr kernel --- .../data_structure_tests/eager_tensor_test.cc | 53 +++++++++ paddle/fluid/pybind/eager_method.cc | 15 ++- paddle/fluid/pybind/eager_properties.cc | 2 +- paddle/phi/api/include/tensor.h | 4 +- paddle/phi/api/lib/CMakeLists.txt | 2 +- paddle/phi/api/lib/tensor_method.cc | 102 ++++++++++++++++-- .../phi/kernels/selected_rows/copy_kernel.cc | 49 +++++++++ .../phi/kernels/selected_rows/copy_kernel.h | 31 ++++++ 8 files changed, 248 insertions(+), 10 deletions(-) create mode 100644 paddle/phi/kernels/selected_rows/copy_kernel.cc create mode 100644 paddle/phi/kernels/selected_rows/copy_kernel.h diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index 1683f4ed5fb..c8b2d22dcf9 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -17,6 +17,14 @@ #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(copy, CPU, 
ALL_LAYOUT); +PD_DECLARE_KERNEL(copy_sr, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy_sr, GPU, ALL_LAYOUT); +#endif namespace eager_test { using AbstractAutogradMeta = paddle::experimental::AbstractAutogradMeta; @@ -151,5 +159,50 @@ TEST(EagerVariable, Constructor) { CHECK_EQ(dt3_tmp_ptr[1], 10.0f); t4.reset(); CHECK(t4.defined() == false); + + VLOG(6) << "Check Tensor Copy_"; + std::vector rows = {1, 2}; + std::vector dims = {2}; + paddle::experimental::Tensor t7(std::make_shared(rows, 2)); + std::dynamic_pointer_cast(t7.impl()) + ->mutable_value() + ->Resize(phi::make_ddim(dims)); + auto* dt7_tmp_ptr = std::dynamic_pointer_cast(t7.impl()) + ->mutable_value() + ->mutable_data(paddle::platform::CPUPlace()); + dt7_tmp_ptr[0] = 6.0f; + dt7_tmp_ptr[1] = 11.0f; + + paddle::experimental::Tensor t8; + paddle::experimental::Tensor t5; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + paddle::experimental::Tensor t6; + paddle::experimental::Tensor t9; + VLOG(6) << "Check Tensor Copy_ Selected Rows"; + t8.copy_(t7, paddle::platform::CUDAPlace(0), true); + t9.copy_(t8, paddle::platform::CPUPlace(), true); + auto* dt9_tmp_ptr = std::dynamic_pointer_cast(t9.impl()) + ->value() + .data(); + CHECK_EQ(dt9_tmp_ptr[0], 6.0f); + CHECK_EQ(dt9_tmp_ptr[1], 11.0f); + CHECK_EQ(std::dynamic_pointer_cast(t9.impl())->height(), + 2); + + VLOG(6) << "Check Tensor Copy_ Dense Tensor"; + t5.copy_(t3, paddle::platform::CUDAPlace(0), true); + t6.copy_(t5, paddle::platform::CPUPlace(), true); + auto* dt6_tmp_ptr = + std::dynamic_pointer_cast(t6.impl())->data(); + CHECK_EQ(dt6_tmp_ptr[0], 5.0f); + CHECK_EQ(dt6_tmp_ptr[1], 10.0f); +#else + t5.copy_(t3, paddle::platform::CPUPlace(), true); + auto* dt5_tmp_ptr = + std::dynamic_pointer_cast(t5.impl())->data(); + CHECK_EQ(dt5_tmp_ptr[0], 5.0f); + CHECK_EQ(dt5_tmp_ptr[1], 10.0f); +#endif + VLOG(6) << "Finish"; } diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 082ec382c79..7f8fcd351fe 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -226,6 +226,19 @@ static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_method_cpu(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto cp_tensor = + self->tensor.copy_to(phi::TransToPhiBackend(phi::CPUPlace()), true); + egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); + egr::EagerUtils::autograd_meta(&cp_tensor) + ->SetPersistable( + egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable()); + return ToPyObject(cp_tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* tensor_method_reconstruct_from_(TensorObject* self, PyObject* args, PyObject* kwargs) { @@ -264,7 +277,7 @@ static PyObject* tensor_method_copy_(TensorObject* self, PyObject* args, egr::EagerUtils::autograd_meta(&(src_tensor))->Persistable()); } - self->tensor.copy_(src_tensor, blocking); + self->tensor.copy_(src_tensor, self->tensor.inner_place(), blocking); VLOG(6) << "Finish Copy Tensor " << src_tensor.name() << " to " << self->tensor.name(); diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 2572866b8f5..ff8980d727e 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -96,7 +96,7 @@ int tensor_properties_set_grad(TensorObject* self, PyObject* value, 
"Detected NULL grad" "Please check if you have manually cleared" "the grad inside autograd_meta")); - grad->copy_(src, true); + grad->copy_(src, self->tensor.inner_place(), true); return 0; EAGER_CATCH_AND_THROW_RETURN_ZERO } diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index c268742fa56..1312710a80f 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -397,7 +397,9 @@ class PADDLE_API Tensor final { * @param blocking, Should we copy this in sync way. * @return void */ - void copy_(const Tensor& src, const bool blocking); + void copy_(const Tensor& src, + const phi::Place& target_place, + const bool blocking); /** * @brief Cast datatype from one to another * diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 42bf7a8103f..4cbca072362 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -148,4 +148,4 @@ cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw ph cc_library(sparse_api SRCS ${sparse_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl) cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api sparse_api_custom_impl) -cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api) +cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta) diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index 885e29b27fa..cc797507e68 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -19,9 +19,12 @@ limitations under the License. */ #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/tensor_base.h" +#include "paddle/phi/api/lib/api_gen_utils.h" +#include "paddle/phi/api/lib/kernel_dispatch.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace experimental { - // declare cast api Tensor cast(const Tensor &x, DataType out_dtype); Tensor copy_to(const Tensor &x, Backend backend, bool blocking); @@ -67,12 +70,18 @@ template PADDLE_API Tensor Tensor::copy_to>( template PADDLE_API Tensor Tensor::copy_to(const PlaceType &target_place) const; -void Tensor::copy_(const Tensor &src, bool blocking) { +void Tensor::copy_(const Tensor &src, + const phi::Place &target_place, + bool blocking) { if (!src.is_initialized()) { + VLOG(8) << "Src is empty, skip copy"; return; } + // Prepare copy kernel key and outputs + auto kernel_key_set = ParseKernelKeyByInputArgs(src); + KernelType kernel_type = ParseKernelTypeByInputArgs(src); VLOG(3) << "Deep copy Tensor from " << src.name() << " to " << name(); - if (defined()) { + if (is_initialized()) { PADDLE_ENFORCE_EQ(dtype(), src.dtype(), platform::errors::PreconditionNotMet( @@ -87,10 +96,91 @@ void Tensor::copy_(const Tensor &src, bool blocking) { "Copy cannot be performed!", name(), src.name())); + PADDLE_ENFORCE_EQ(target_place, + inner_place(), + platform::errors::PreconditionNotMet( + "Place is different of dst tensor and args %s, which " + "current tensor holds %s " + "Copy cannot be performed!", + target_place.DebugString(), + inner_place().DebugString())); + kernel_key_set.backend_set = + kernel_key_set.backend_set | + BackendSet(phi::TransToPhiBackend(inner_place())); + } else { + // Deep Copy AutoGrad info from src to self. 
+ *autograd_meta_ = *(src.autograd_meta_); + } + + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto *dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + + if (kernel_type == KernelType::DENSE_TENSOR_KENREL) { + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "copy", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; + using kernel_signature = void (*)(const platform::DeviceContext &, + const phi::DenseTensor &, + phi::Place, + bool, + phi::DenseTensor *); + SetKernelOutput(kernel_backend, this); + phi::MetaTensor meta_out(impl_.get()); + phi::UnchangedInferMeta( + MakeMetaTensor( + *(std::static_pointer_cast(src.impl_))), + &meta_out); + auto *kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + (*(std::static_pointer_cast(src.impl_))), + target_place, + blocking, + static_cast(impl_.get())); + } else if (kernel_type == KernelType::SELECTED_ROWS_KENREL) { + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "copy_sr", {kernel_backend, kernel_layout, kernel_data_type}); + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; + using kernel_signature = void (*)(const platform::DeviceContext &, + const phi::SelectedRows &, + phi::Place, + bool, + phi::SelectedRows *); + SetSelectedRowsKernelOutput(kernel_backend, this); + phi::MetaTensor meta_out(impl_.get()); + phi::UnchangedInferMeta( + MakeMetaTensor( + *(std::static_pointer_cast(src.impl_))), + &meta_out); + auto *kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + (*(std::static_pointer_cast(src.impl_))), + target_place, + blocking, + static_cast(impl_.get())); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "We currently only support dense tensor copy for now and if u need to " + "copy selected rows please raise a issue.")); } - auto copy_tensor = - src.copy_to(phi::TransToPhiBackend(src.inner_place()), blocking); - set_impl(copy_tensor.impl()); } } // namespace experimental diff --git a/paddle/phi/kernels/selected_rows/copy_kernel.cc b/paddle/phi/kernels/selected_rows/copy_kernel.cc new file mode 100644 index 00000000000..cf71ab0583f --- /dev/null +++ b/paddle/phi/kernels/selected_rows/copy_kernel.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/selected_rows/copy_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +namespace phi { +namespace sr { + +template +void Copy(const Context& dev_ctx, + const SelectedRows& src, + Place dst_place, + bool blocking, + SelectedRows* dst) { + if (src.value().Holder() != dst->value().Holder() || + src.value().data() != dst->value().data()) { + dst->set_rows(src.rows()); + dst->set_height(src.height()); + } + phi::Copy( + dev_ctx, src.value(), dst_place, blocking, dst->mutable_value()); +} + +} // namespace sr +} // namespace phi + +PD_REGISTER_GENERAL_KERNEL( + copy_sr, CPU, ALL_LAYOUT, phi::sr::Copy, ALL_DTYPE) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL( + copy_sr, GPU, ALL_LAYOUT, phi::sr::Copy, ALL_DTYPE) {} +#endif diff --git a/paddle/phi/kernels/selected_rows/copy_kernel.h b/paddle/phi/kernels/selected_rows/copy_kernel.h new file mode 100644 index 00000000000..4aa848bea2a --- /dev/null +++ b/paddle/phi/kernels/selected_rows/copy_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/core/sparse_csr_tensor.h" + +namespace phi { +namespace sr { + +template +void Copy(const Context& dev_ctx, + const SelectedRows& src, + Place dst_place, + bool blocking, + SelectedRows* dst); + +} // namespace sr +} // namespace phi -- GitLab From add304ed4b419644138c45a36370a65c45612ba6 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 17 Mar 2022 10:26:16 +0800 Subject: [PATCH 126/176] Optimize the performance of C++ API (#40640) * Optimize performance * optimiaze c++ api performance * remove unsed code * fix paddle throw * updata format --- paddle/phi/api/include/tensor.h | 9 ++++++++- paddle/phi/api/lib/api_gen_utils.cc | 12 +++--------- paddle/phi/api/lib/data_transform.cc | 9 +++------ paddle/phi/api/lib/tensor.cc | 14 ++++++++++++-- paddle/phi/core/kernel_factory.h | 8 ++++++++ python/paddle/utils/code_gen/api_base.py | 2 +- 6 files changed, 35 insertions(+), 19 deletions(-) diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index 1312710a80f..ce40627bb0d 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -324,7 +324,7 @@ class PADDLE_API Tensor final { * * @return std::shared_ptr */ - std::shared_ptr impl() const; + const std::shared_ptr& impl() const; /** * @brief Set the implemention of current Tensor. @@ -333,6 +333,13 @@ class PADDLE_API Tensor final { */ void set_impl(const std::shared_ptr& impl); + /** + * @brief Set the implemention of current Tensor. 
+ * + * @param impl + */ + void set_impl(std::shared_ptr&& impl); + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * @brief Get the stream where the tensor is currently located diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index e1ebe8c6465..0c11e2df65d 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -95,12 +95,8 @@ paddle::optional MakeMetaTensor( /* ------------------ for output ----------------------- */ phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { - if (!out->initialized()) { - auto dense_tensor = std::make_shared( - phi::make_intrusive(phi::TransToPhiPlace(backend)), - phi::DenseTensorMeta()); - out->set_impl(dense_tensor); - return dense_tensor.get(); + if (out->impl() == nullptr) { + out->set_impl(std::make_shared()); } return static_cast(out->impl().get()); } @@ -111,9 +107,7 @@ std::vector SetKernelOutput(size_t out_size, out->reserve(out_size); std::vector results(out_size); for (size_t i = 0; i < out_size; ++i) { - auto tensor_ptr = std::make_shared( - phi::make_intrusive(phi::TransToPhiPlace(backend)), - phi::DenseTensorMeta()); + auto tensor_ptr = std::make_shared(); results[i] = tensor_ptr.get(); out->emplace_back(); out->back().set_impl(tensor_ptr); diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 79b8ac6d0b8..e280ab626da 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -167,10 +167,7 @@ phi::DenseTensor TransformData(const phi::DenseTensor& tensor, if (NeedTransformPlace( out.place(), target_args_def.backend, transform_flag)) { - phi::DenseTensor result( - phi::make_intrusive( - phi::TransToPhiPlace(target_args_def.backend)), - {out.dtype(), out.dims(), out.layout()}); + phi::DenseTensor result; framework::TransDataDevice( out, phi::TransToPhiPlace(target_args_def.backend), &result); out = result; @@ -190,14 +187,14 @@ std::shared_ptr PrepareData( tensor_in->dtype(), target_args_def.dtype, transform_flag) && !NeedTransformLayout( tensor_in->layout(), target_args_def.layout, transform_flag))) { - return std::dynamic_pointer_cast(tensor_in); + return std::static_pointer_cast(tensor_in); } phi::DenseTensor out = TransformData(*(static_cast(tensor_in.get())), target_args_def, transform_flag); - return std::make_shared(out); + return std::make_shared(std::move(out)); } std::shared_ptr PrepareData( diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 40174a505dc..6be85d72000 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -46,6 +46,7 @@ limitations under the License. */ * In the future, the necessary components will be moved to the this library, * or the corresponding components will be re-implemented. 
*/ + #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream/cuda_stream.h" @@ -142,7 +143,12 @@ PlaceType Tensor::place() const { } paddle::platform::Place Tensor::inner_place() const { - return ConvertExtPlaceToInnerPlace(place()); + PADDLE_ENFORCE_NOT_NULL( + impl_, + phi::errors::PermissionDenied( + "Null pointer error, the impl_ of Tensor should not be " + "Null when calling Tensor::inner_place().")); + return impl_->place(); } bool Tensor::is_cpu() const { @@ -286,12 +292,16 @@ Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const { } } -std::shared_ptr Tensor::impl() const { return impl_; } +const std::shared_ptr &Tensor::impl() const { return impl_; } void Tensor::set_impl(const std::shared_ptr &impl) { impl_ = impl; } +void Tensor::set_impl(std::shared_ptr &&impl) { + impl_ = std::move(impl); +} + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t Tensor::stream() const { return platform::stream::get_current_stream(-1)->raw_stream(); diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index be914097626..e502b9cb3e0 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -197,8 +197,16 @@ class Kernel { const KernelArgsDef& args_def() const { return args_def_; } + const TensorArgDef& InputAt(size_t idx) const { + return args_def_.input_defs().at(idx); + } + TensorArgDef& InputAt(size_t idx) { return args_def_.input_defs().at(idx); } + const TensorArgDef& OutputAt(size_t idx) const { + return args_def_.output_defs().at(idx); + } + TensorArgDef& OutputAt(size_t idx) { return args_def_.output_defs().at(idx); } bool IsValid() { return fn_ != nullptr; } diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index d91b76bb703..bf3d7b3d19e 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -698,7 +698,7 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self self.outputs['types'], 'SetKernelOutput', code_indent, inplace_flag) api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '') return f""" -{code_indent} auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( +{code_indent} const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( {code_indent} "{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}}); {code_indent} VLOG(6) << "{self.api} API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]"; {code_indent} VLOG(6) << "{self.api} API kernel: " << kernel; -- GitLab From 4c01763ca7a3d0863b174ce9f209434bfa63e7ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Thu, 17 Mar 2022 10:26:30 +0800 Subject: [PATCH 127/176] [infrt] move pd_ops.td to pd floder. 
test=develop (#40613) --- .gitignore | 4 +- paddle/infrt/CMakeLists.txt | 1 - paddle/infrt/dialect/CMakeLists.txt | 6 - .../infrt/dialect/infrt/pass/infrt_op_fuse.td | 2 +- paddle/infrt/dialect/pd/ir/CMakeLists.txt | 2 + .../infrt/dialect/{ => pd/ir}/pd_extra_ops.td | 2 +- .../infrt/dialect/{ => pd/ir}/pd_op_base.td | 6 +- paddle/infrt/dialect/pd/ir/pd_ops.cc | 117 ++---------------- paddle/infrt/dialect/pd/ir/pd_ops.h | 49 ++------ paddle/infrt/dialect/pd/pass/CMakeLists.txt | 3 + .../{rewrite.td => pd/pass/pd_op_fuse.td} | 4 +- .../infrt/dialect/pd/pass/pd_op_fuse_pass.cc | 5 +- .../infrt/dialect/pd/pass/pd_op_fuse_pass.h | 24 ++++ .../infrt/dialect/tensorrt/pd_lower_to_trt.td | 2 +- paddle/infrt/host_context/paddle_mlir.h | 12 +- .../infrt/tests/dialect/{ => pd}/rewrite.mlir | 2 +- tools/infrt/custom_pdop.td | 10 -- ...rate_pd_op_dialect_from_paddle_op_maker.py | 8 +- 18 files changed, 68 insertions(+), 191 deletions(-) rename paddle/infrt/dialect/{ => pd/ir}/pd_extra_ops.td (90%) rename paddle/infrt/dialect/{ => pd/ir}/pd_op_base.td (96%) rename paddle/infrt/dialect/{rewrite.td => pd/pass/pd_op_fuse.td} (97%) create mode 100644 paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h rename paddle/infrt/tests/dialect/{ => pd}/rewrite.mlir (97%) diff --git a/.gitignore b/.gitignore index 801790d0a47..664c45b7202 100644 --- a/.gitignore +++ b/.gitignore @@ -52,12 +52,12 @@ tools/__pycache__ # This file is automatically generated. # TODO(zhiqiang) Move this file to build directory. -paddle/infrt/dialect/pd_ops.td +paddle/infrt/dialect/pd/ir/pd_ops.td paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td tools/infrt/kernels.json tools/infrt/kernel_signature.json -paddle/infrt/dialect/pd_ops_info.h +paddle/infrt/dialect/pd/common/pd_ops_info.h .lit_test_times.txt paddle/infrt/tests/dialect/Output paddle/infrt/tests/lit.cfg.py diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index f394b754a8e..e777a8e3ab4 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -102,7 +102,6 @@ set(infrt_mlir_incs test_kernels_inc tensor_shape_inc dense_tensor_inc - pd_ops_inc pd_extra_ops_inc trt_ops_inc ) diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index 353a9c67952..cf3906c32e5 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -11,11 +11,6 @@ gather_srcs(infrt_src SRCS mlir_tablegen_on(tensor_shape DIALECT ts) mlir_tablegen_on(dense_tensor DIALECT dt) -mlir_tablegen_on(pd_op_base DIALECT pd) -mlir_tablegen_on(pd_ops) -mlir_tablegen_on(pd_extra_ops) - -mlir_add_rewriter(rewrite) # TODO(Superjomn) add a cmake function cc_executable to ecapsulate the following code add_executable(infrtopt opt.cc) @@ -23,7 +18,6 @@ target_link_libraries(infrtopt infrt) add_executable(print-ir print_ir.cc) target_link_libraries(print-ir infrt ${mlir_libs}) -add_dependencies(print-ir pd_ops_inc) cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_IR_LIBS}) add_subdirectory(infrt) diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td index 7ae0bbae627..3d825a9c762 100644 --- a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td @@ -3,7 +3,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "paddle/infrt/dialect/infrt/ir/infrt_ops.td" -include "paddle/infrt/dialect/pd_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_ops.td" def 
FuseTensorCastPattern : Pat< (Infrt_TensorCastOp (Infrt_TensorCastOp $arg)), diff --git a/paddle/infrt/dialect/pd/ir/CMakeLists.txt b/paddle/infrt/dialect/pd/ir/CMakeLists.txt index 0787a612d48..8aacfc97623 100644 --- a/paddle/infrt/dialect/pd/ir/CMakeLists.txt +++ b/paddle/infrt/dialect/pd/ir/CMakeLists.txt @@ -3,3 +3,5 @@ core_gather_headers() gather_srcs(infrt_src SRCS pd_ops.cc ) +add_mlir_dialect(pd_ops pd) +mlir_tablegen_on(pd_extra_ops) diff --git a/paddle/infrt/dialect/pd_extra_ops.td b/paddle/infrt/dialect/pd/ir/pd_extra_ops.td similarity index 90% rename from paddle/infrt/dialect/pd_extra_ops.td rename to paddle/infrt/dialect/pd/ir/pd_extra_ops.td index c6d3f530455..cf17db211cb 100644 --- a/paddle/infrt/dialect/pd_extra_ops.td +++ b/paddle/infrt/dialect/pd/ir/pd_extra_ops.td @@ -4,7 +4,7 @@ include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/LoopLikeInterface.td" include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/pd_op_base.td" +include "paddle/infrt/dialect/pd/ir/pd_op_base.td" def PD_FusedFC : PD_Op<"FC", [NoSideEffect]> { let summary = "Computes the Fully Connected result of two tensors"; diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd/ir/pd_op_base.td similarity index 96% rename from paddle/infrt/dialect/pd_op_base.td rename to paddle/infrt/dialect/pd/ir/pd_op_base.td index f6af4c83aed..7cab0eca45a 100644 --- a/paddle/infrt/dialect/pd_op_base.td +++ b/paddle/infrt/dialect/pd/ir/pd_op_base.td @@ -8,7 +8,7 @@ include "mlir/IR/OpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "paddle/infrt/dialect/infrt/ir/infrt_base.td" -def PD_Dialect : Dialect { +def Paddle_Dialect : Dialect { let name = "pd"; let description = [{ @@ -16,12 +16,12 @@ def PD_Dialect : Dialect { This dialect contains the PaddlePaddle operators. 
}]; - + let hasConstantMaterializer = 1; let cppNamespace = "mlir::pd"; } class PD_Op traits = []> : - Op; + Op; class PD_PaddleAttr : diff --git a/paddle/infrt/dialect/pd/ir/pd_ops.cc b/paddle/infrt/dialect/pd/ir/pd_ops.cc index 5abf7d1a1b9..d105aa07dd0 100644 --- a/paddle/infrt/dialect/pd/ir/pd_ops.cc +++ b/paddle/infrt/dialect/pd/ir/pd_ops.cc @@ -17,24 +17,22 @@ #include #include +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/pd/ir/pd_opsDialect.cpp.inc" #define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/pd/ir/pd_ops.cpp.inc" // NOLINT #define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_extra_ops.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.cpp.inc" // NOLINT namespace mlir { namespace pd { - -#include "paddle/infrt/dialect/rewrite.cpp.inc" // NOLINT - -PaddleDialect::PaddleDialect(MLIRContext *context) - : Dialect("pd", context, TypeID::get()) { +void PaddleDialect::initialize() { addOperations< #define GET_OP_LIST -#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/pd/ir/pd_ops.cpp.inc" // NOLINT , #define GET_OP_LIST -#include "paddle/infrt/dialect/pd_extra_ops.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.cpp.inc" // NOLINT >(); } @@ -73,106 +71,5 @@ mlir::OpFoldResult ConstantOp::fold( ::llvm::ArrayRef operands) { return value(); } -/* -LogicalResult ElementwiseAdd::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} -*/ - -void Elementwise_addOp::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -} - -/* -mlir::OpFoldResult ElementwiseAdd::fold( - llvm::ArrayRef operands) { - if (getElementTypeOrSelf(getType()).isa()) { - if (!operands[0] || !operands[1]) return {}; - DenseElementsAttr lhs = operands[0].dyn_cast(); - DenseElementsAttr rhs = operands[1].dyn_cast(); - if (!lhs || !rhs) return {}; - ShapedType type = getType().template cast(); - if (!type.hasStaticShape()) return {}; - Type etype = type.getElementType(); - if (!etype.isa()) return {}; - SmallVector values; - values.reserve(lhs.getNumElements()); - for (const auto zip : - llvm::zip(lhs.getValues(), rhs.getValues())) { - values.push_back( - std::plus()(std::get<0>(zip), std::get<1>(zip))); - } - return DenseElementsAttr::get(type, values); - } - return {}; -} - -LogicalResult ElementwiseDiv::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -LogicalResult ElementwiseMul::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -LogicalResult ElementwiseSub::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -LogicalResult 
MulOp::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { - inferredReturnTypes.push_back(operands[0].getType()); - return success(); -} - -void ReluOp::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -} - -void FusedRepeatedFCRelu::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -} - -void BatchNormOp::getCanonicalizationPatterns( - mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { - results.insert(context); -}*/ - } // namespace pd } // namespace mlir diff --git a/paddle/infrt/dialect/pd/ir/pd_ops.h b/paddle/infrt/dialect/pd/ir/pd_ops.h index e6b0f30c059..8383ff6ed82 100644 --- a/paddle/infrt/dialect/pd/ir/pd_ops.h +++ b/paddle/infrt/dialect/pd/ir/pd_ops.h @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,49 +14,20 @@ #pragma once -#include -#include -#include -#include +//===----------------------------------------------------------------------===// +// Dialect +//===----------------------------------------------------------------------===// +#include #include #include -#include +#include #include -#include -#include -#include #include -#include #include -#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" - -namespace mlir { -namespace pd { - -class PaddleDialect : public Dialect { - public: - explicit PaddleDialect(MLIRContext* context); - - static StringRef getDialectNamespace() { return "pd"; } - - /// A hook used to materialize constant values with the given type. 
- Operation* materializeConstant(OpBuilder& builder, - Attribute value, - Type type, - Location loc) override; - - Type parseType(DialectAsmParser& parser) const override { - return Dialect::parseType(parser); - } - void printType(Type type, DialectAsmPrinter& printer) const override { - Dialect::printType(type, printer); - } -}; - -} // namespace pd -} // namespace mlir +#include "paddle/infrt/dialect/infrt/ir/infrt_dialect.h" +#include "paddle/infrt/dialect/pd/ir/pd_opsDialect.h.inc" #define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.hpp.inc" +#include "paddle/infrt/dialect/pd/ir/pd_ops.h.inc" #define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_extra_ops.hpp.inc" +#include "paddle/infrt/dialect/pd/ir/pd_extra_ops.hpp.inc" diff --git a/paddle/infrt/dialect/pd/pass/CMakeLists.txt b/paddle/infrt/dialect/pd/pass/CMakeLists.txt index 59640e7e625..827df597b76 100644 --- a/paddle/infrt/dialect/pd/pass/CMakeLists.txt +++ b/paddle/infrt/dialect/pd/pass/CMakeLists.txt @@ -1,5 +1,8 @@ + core_gather_headers() gather_srcs(infrt_src SRCS pd_op_fuse_pass.cc ) + +mlir_add_rewriter(pd_op_fuse) diff --git a/paddle/infrt/dialect/rewrite.td b/paddle/infrt/dialect/pd/pass/pd_op_fuse.td similarity index 97% rename from paddle/infrt/dialect/rewrite.td rename to paddle/infrt/dialect/pd/pass/pd_op_fuse.td index 62e7471a390..f5a8ea78d7d 100644 --- a/paddle/infrt/dialect/rewrite.td +++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse.td @@ -3,8 +3,8 @@ include "paddle/infrt/dialect/infrt/ir/infrt_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" -include "paddle/infrt/dialect/pd_ops.td" -include "paddle/infrt/dialect/pd_extra_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_extra_ops.td" //===----------------------------------------------------------------------===// // This is to fuse the composition: 'Matmul o ElementwiseAdd' into 'PD_FusedFC'. diff --git a/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc index 620c8594234..8bdf957db27 100644 --- a/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc +++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.cc @@ -11,12 +11,13 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h" // NOLINT -#include #include #include "paddle/infrt/dialect/pd/ir/pd_ops.h" + namespace { -#include "paddle/infrt/dialect/rewrite.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/pd/pass/pd_op_fuse.cpp.inc" // NOLINT /* * PdOpFusePass. diff --git a/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h new file mode 100644 index 00000000000..854545ab1a2 --- /dev/null +++ b/paddle/infrt/dialect/pd/pass/pd_op_fuse_pass.h @@ -0,0 +1,24 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include + +namespace infrt { +/* + * PdOpFusePass. + */ +std::unique_ptr CreatePdOpFusePass(); + +} // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td index 46c250b0549..6467c1285f8 100644 --- a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td +++ b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td @@ -3,7 +3,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "paddle/infrt/dialect/infrt/ir/infrt_base.td" -include "paddle/infrt/dialect/pd_ops.td" +include "paddle/infrt/dialect/pd/ir/pd_ops.td" include "paddle/infrt/dialect/tensorrt/trt_ops.td" def PD2TRT_Matmul_Lower : Pat< diff --git a/paddle/infrt/host_context/paddle_mlir.h b/paddle/infrt/host_context/paddle_mlir.h index 1fb3f7b7349..d5f1209b992 100644 --- a/paddle/infrt/host_context/paddle_mlir.h +++ b/paddle/infrt/host_context/paddle_mlir.h @@ -14,20 +14,20 @@ #ifndef PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_ #define PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_ +#include +#include +#include +#include +#include +#include #include #include #include -#include "llvm/Support/CommandLine.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/AsmState.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/MLIRContext.h" #include "paddle/infrt/common/global.h" #include "paddle/infrt/common/string.h" #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/infrt/ir/basic_kernels.h" - #include "paddle/infrt/dialect/init_dialects.h" #include "paddle/infrt/dialect/pd/ir/pd_ops.h" #include "paddle/infrt/dialect/tensor_shape.h" diff --git a/paddle/infrt/tests/dialect/rewrite.mlir b/paddle/infrt/tests/dialect/pd/rewrite.mlir similarity index 97% rename from paddle/infrt/tests/dialect/rewrite.mlir rename to paddle/infrt/tests/dialect/pd/rewrite.mlir index 9fbb09e2244..ea0248b9d95 100644 --- a/paddle/infrt/tests/dialect/rewrite.mlir +++ b/paddle/infrt/tests/dialect/pd/rewrite.mlir @@ -1,4 +1,4 @@ -// RUN: infrtopt --canonicalize %s | FileCheck %s +// RUN: infrtopt --pd-op-fuse %s | FileCheck %s // CHECK-LABEL: @main func @main() -> tensor { %a = "pd.feed"() {name="input0"} : () -> tensor diff --git a/tools/infrt/custom_pdop.td b/tools/infrt/custom_pdop.td index f7547672595..861b3194120 100644 --- a/tools/infrt/custom_pdop.td +++ b/tools/infrt/custom_pdop.td @@ -23,16 +23,6 @@ def PD_FetchOp : PD_Op<"fetch", [Terminator]> { let arguments = (ins PD_Tensor :$inputs, StrAttr:$name); } -def PD_ReturnOp : PD_Op<"return", [Terminator]> { - let summary = "return Op"; - - let description = [{ - Fetch tensor from the graph. 
- }]; - - let arguments = (ins Variadic:$inputs); -} - def PD_GraphOp : PD_Op<"graph", [SingleBlockImplicitTerminator<"::infrt::ReturnOp">]> { let summary = "paddle graph Op"; let description = [{ diff --git a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py index 528d61daf3b..8855e1eee38 100644 --- a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py +++ b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py @@ -16,8 +16,6 @@ import paddle.fluid.framework as framework from paddle.fluid import core from paddle import compat as cpt -ops_having_canonicalization = {"elementwise_add", } - # collect original ops: op which has both inference and grid defination def get_original_ops(): @@ -195,7 +193,7 @@ def generate_all_ops_inputs_outputs_map(op_descs): # funtion to generate paddle op dialect file def convert_op_proto_into_mlir(op_descs): - dst_dialect_file = "../../paddle/infrt/dialect/pd_ops.td" + dst_dialect_file = "../../paddle/infrt/dialect/pd/ir/pd_ops.td" custom_dialect_file = "custom_pdop.td" # 1. Head files @@ -214,7 +212,7 @@ def convert_op_proto_into_mlir(op_descs): "include \"mlir/Interfaces/InferTypeOpInterface.td\"", "include \"mlir/Interfaces/LoopLikeInterface.td\"", "include \"mlir/IR/OpBase.td\"", - "include \"paddle/infrt/dialect/pd_op_base.td\"", + "include \"paddle/infrt/dialect/pd/ir/pd_op_base.td\"", "", ] @@ -245,7 +243,6 @@ def convert_op_proto_into_mlir(op_descs): op_type=op_type, left_brace="{") SUMMARY = ' let summary = "{} op";\n'.format(op_type) - CANONICALIZATION = "let hasCanonicalizer = 1;" if op_type in ops_having_canonicalization else "" # 2.2 Description contents = "" @@ -348,7 +345,6 @@ def convert_op_proto_into_mlir(op_descs): ops_mlir_file.write(DESCRIPTION) ops_mlir_file.write(ARGUMENTS) ops_mlir_file.write(RESULTS) - ops_mlir_file.write(CANONICALIZATION) ops_mlir_file.write("}\n") print("Skipped ops num: " + str(len(skipped_op_list))) -- GitLab From 1e045cae8f220cc6e853c681912b1c1e9bf3e6ed Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Thu, 17 Mar 2022 10:27:55 +0800 Subject: [PATCH 128/176] Refine io for test_mnist.py (#40496) * for test_mnist.py * remove comments * using type() replace isinstance() * valid vars for run program OP in io.py * open test_mnist in eager_gurad for coverage --- python/paddle/fluid/dygraph/io.py | 68 ++++++++----------- python/paddle/fluid/dygraph/jit.py | 2 +- python/paddle/fluid/dygraph/layers.py | 3 +- .../unittests/dygraph_to_static/test_mnist.py | 8 +++ python/paddle/static/input.py | 2 +- 5 files changed, 41 insertions(+), 42 deletions(-) diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index f58952d3036..a36164a277d 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -30,6 +30,7 @@ from paddle.fluid.layers import nn from paddle.fluid.layers.utils import _hash_with_id from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.framework import in_dygraph_mode +from paddle import _C_ops __all__ = ['TranslatedLayer'] @@ -761,6 +762,21 @@ def _construct_params_and_buffers(model_path, return var_dict +def _valid_vars(vars): + if vars: + return vars + if framework._in_eager_mode(): + return [ + core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var", + core.VarDesc.VarType.RAW, False) + ] + else: + return [ + core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var", + core.VarDesc.VarType.RAW, False) + ] + + def 
_run_dygraph(instance, input, program_holder): # 1. prepare inputs, outputs, attrs @@ -826,17 +842,12 @@ def _run_dygraph(instance, input, program_holder): # hold forward variables if framework._in_eager_mode(): - tmp_scope_vec = core.eager.Tensor( - dtype=core.VarDesc.VarType.FP32, - dims=[], - name="program_out_scope", - type=core.VarDesc.VarType.STEP_SCOPES, - persistable=True) + tmp_scope_vec = [program_holder.scope] else: tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [], "program_out_scope", core.VarDesc.VarType.STEP_SCOPES, True) - tmp_scope_vec.value().set_scope(program_holder.scope) + tmp_scope_vec.value().set_scope(program_holder.scope) double_grad_vars = [] for var_desc in program_holder.double_grad_descs: @@ -852,41 +863,18 @@ def _run_dygraph(instance, input, program_holder): var_desc.shape(), var_desc.name(), var_desc.type(), False) double_grad_vars.append(var) - if len(double_grad_vars) == 0: - if framework._in_eager_mode(): - double_grad_vars = [ - core.eager.Tensor( - value=[1], - name='Fake_var', - place=framework._current_expected_place()) - ] - else: - double_grad_vars = [ - core.VarBase( - value=[1], - name='Fake_var', - place=framework._current_expected_place()) - ] # 2. run program by op trace_program = program_holder.infer_program if instance._is_test else program_holder.train_program end_op_index = program_holder.infer_program.block(0).op_size() - framework._dygraph_tracer().trace_op( - type='run_program', - inputs={'X': input_vars, - 'Params': persistable_vars}, - outputs={ - 'Out': output_vars, - 'OutScope': tmp_scope_vec, - 'DOut': double_grad_vars - }, - attrs={ - 'global_block': trace_program.block(0), - 'start_op_index': 0, - 'end_op_index': end_op_index, - 'is_test': instance._is_test, - 'program_id': _hash_with_id(trace_program, instance) - }) + attrs = ('global_block', trace_program.block(0), 'start_op_index', 0, + 'end_op_index', end_op_index, 'is_test', instance._is_test, + 'program_id', _hash_with_id(trace_program, instance)) + _C_ops.run_program( + _valid_vars(input_vars), + _valid_vars(persistable_vars), + _valid_vars(output_vars), tmp_scope_vec, + _valid_vars(double_grad_vars), *attrs) # NOTE: [ why need set param's gradient type here ] # if user set sparse gradient mode, the param's gradient # will be SelectedRows, not LoDTensor. 
But tracer will just @@ -914,8 +902,10 @@ def _run_dygraph(instance, input, program_holder): def drop_scope_if_no_grad(instance, scope_vec): tracer = framework._dygraph_tracer() + scope = scope_vec.value().get_scope() if isinstance(scope_vec, ( + core.VarBase)) else scope_vec[0] if (not instance._is_test) and (not tracer._has_grad): - scope_vec.value().get_scope().drop_kids() + scope.drop_kids() def _run_static_graph(input, program_holder, trace_program): diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index b1865691b24..1e1ce3ba7e4 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -821,7 +821,7 @@ def save(layer, path, input_spec=None, **configs): for var in flatten(input_spec): if isinstance(var, paddle.static.InputSpec): inner_input_spec.append(var) - elif isinstance(var, (core.VarBase, Variable)): + elif isinstance(var, (core.VarBase, core.eager.Tensor, Variable)): inner_input_spec.append( paddle.static.InputSpec.from_tensor(var)) else: diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 53dbf1a66b2..6957850d205 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -760,7 +760,8 @@ class Layer(object): raise KeyError("The name of buffer can not be empty.") elif hasattr(self, name) and name not in self._buffers: raise KeyError("attribute '{}' already exists.".format(name)) - elif tensor is not None and not type(tensor) == core.VarBase: + elif tensor is not None and not (type(tensor) == core.VarBase or + type(tensor) == core.eager.Tensor): raise TypeError( "The registered buffer should be a core.VarBase, but received {}.". format(type(tensor).__name__)) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py index cac64c73913..2b8307461b8 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py @@ -27,6 +27,7 @@ from paddle.fluid.dygraph.nn import Conv2D, Linear, Pool2D from paddle.fluid.optimizer import AdamOptimizer from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator +from paddle.fluid.framework import _test_eager_guard from predictor_utils import PredictorTools @@ -155,6 +156,13 @@ class TestMNISTWithToStatic(TestMNIST): np.allclose(dygraph_loss, static_loss), msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss, static_loss)) + with _test_eager_guard(): + dygraph_loss = self.train_dygraph() + static_loss = self.train_static() + self.assertTrue( + np.allclose(dygraph_loss, static_loss), + msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss, + static_loss)) def test_mnist_declarative_cpu_vs_mkldnn(self): dygraph_loss_cpu = self.train_dygraph() diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py index f06c45cc369..7c0c71951aa 100644 --- a/python/paddle/static/input.py +++ b/python/paddle/static/input.py @@ -193,7 +193,7 @@ class InputSpec(object): print(x_spec) # InputSpec(shape=(2, 2), dtype=VarType.FP32, name=x) """ - if isinstance(tensor, (Variable, core.VarBase)): + if isinstance(tensor, (Variable, core.VarBase, core.eager.Tensor)): return cls(tensor.shape, tensor.dtype, name or tensor.name) else: raise ValueError( -- GitLab From 4db8cf240093a3258903d70a69af968aceab51be Mon Sep 17 00:00:00 2001 
From: Weilong Wu Date: Thu, 17 Mar 2022 10:41:25 +0800 Subject: [PATCH 129/176] [Eager Grad] Support eager grad interface (#40170) * [Eager] Support eager grad interface, draft version * Support eager grad interface with allow_unused and multi startup_op * Fix code format * Fix allow_unused case, return PyNone if tensor not initialize * Support output's stop_gradient related to create_graph * Support grad exception case in eager mode, fix coverage CI * Update ToPyObject, return PyNone if not initialize * AccumulationNode add FLAGS_retain_grad_for_all_tensor * Fix ci issue * Fix CI issue * fix, use core.eager.Tensor * Add func SetBufferSlotRankZeros for GradTensorHolder * Support retain_graph by using ClearTensorWrappers * Support retain_graph by using ClearTensorWrappers * Update retain_graph and no_grad_vars related test case * Update code gen logic for ClearTensorWrappers * Fix by override statement * fix override func args * Support retain_graph, update unit tests * Updated ClearTensorWrappers logic * fix grad python interface * Use deep copy and update unit tests * Polish code * Polish code * Fix CI issue, Deep copy only use when user set grad_tensors * Fix CI, use Backward instead RunBackward * Fix CI, Declare kernel explicitly in test file * Polish, remove vector of TensorWrapper * Refactor the logic of grad/backward, polish codes * Update code after merge upstream develop * Polish after merge upstream develop * Update to adapt new GradNodeBase superclass * Fix error introduced during conflict resolution * Update purify potential_startup_nodes logic * Fix errors * Polish code * Remove useless args for ToPyObject * Remove useless TensorWrappersSet * Fix code-format, re-install pre-commit * Fix pre-process logic for potential_startup_ops * Update unit tests, use eager mode --- .../eager/accumulation/accumulation_node.cc | 8 +- .../eager/accumulation/accumulation_node.h | 11 +- .../eager_generated/backwards/scale_node.cc | 4 +- .../eager_generated/backwards/scale_node.h | 11 +- .../auto_code_generator/eager_generator.cc | 33 +- .../final_state_generator/eager_gen.py | 41 +- paddle/fluid/eager/backward.cc | 390 +++++++++++++++++- paddle/fluid/eager/backward.h | 16 +- .../custom_operator/custom_operator_node.cc | 4 +- .../custom_operator/custom_operator_node.h | 10 +- paddle/fluid/eager/grad_node_info.h | 6 +- paddle/fluid/eager/grad_tensor_holder.cc | 5 + paddle/fluid/eager/grad_tensor_holder.h | 2 + paddle/fluid/eager/tensor_wrapper.h | 2 + .../data_structure_tests/grad_node_test.h | 9 +- .../performance_tests/benchmark_utils.cc | 8 +- .../eager/tests/task_tests/CMakeLists.txt | 1 + .../eager/tests/task_tests/backward_test.cc | 9 +- .../cross_batch_accumulation_test.cc | 4 +- .../tests/task_tests/fwd_bwd_joint_test.cc | 16 +- .../eager/tests/task_tests/generated_test.cc | 6 +- .../fluid/eager/tests/task_tests/grad_test.cc | 339 +++++++++++++++ .../fluid/eager/tests/task_tests/hook_test.cc | 4 +- .../task_tests/hook_test_intermidiate.cc | 6 +- .../eager/to_static/run_program_op_node.h | 10 +- paddle/fluid/pybind/eager_functions.cc | 27 +- paddle/fluid/pybind/eager_utils.cc | 24 +- paddle/fluid/pybind/eager_utils.h | 3 +- python/paddle/fluid/dygraph/base.py | 62 ++- .../tests/unittests/test_egr_python_api.py | 2 +- .../unittests/test_imperative_double_grad.py | 214 ++++++++-- .../test_paddle_imperative_double_grad.py | 93 +++-- 32 files changed, 1217 insertions(+), 163 deletions(-) create mode 100644 paddle/fluid/eager/tests/task_tests/grad_test.cc diff --git 
a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 3a2ec403c0a..9c4089af092 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -24,7 +24,7 @@ #include "paddle/fluid/platform/errors.h" #include "glog/logging.h" - +DECLARE_bool(retain_grad_for_all_tensor); namespace egr { static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, @@ -39,8 +39,8 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, } std::vector> GradNodeAccumulation:: -operator()( - const std::vector>& grads) { +operator()(const std::vector>& grads, + bool create_graph) { VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation"; PADDLE_ENFORCE(grads.size() == 1, paddle::platform::errors::Fatal( @@ -62,7 +62,7 @@ operator()( grad_out = grads[0][0]; } - if (!weak_grad_.expired()) { + if (!weak_grad_.expired() && FLAGS_retain_grad_for_all_tensor) { auto grad = weak_grad_.lock(); CopyOrAddTensor(grad.get(), grad_out); } diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 07fa4016516..a91a0b6e34c 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -35,8 +35,15 @@ class GradNodeAccumulation : public GradNodeBase { // Functor: perform backward computations virtual std::vector> operator()( - const std::vector>& grads) - override; + const std::vector>& grads, + bool create_graph = false) override; + + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } std::string name() { return "GradNodeAccumulation"; } diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 5a2595b9103..0bc998a03a8 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -145,8 +145,8 @@ void GradNodeScale::SetTensorWrappers_X( void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; } std::vector> GradNodeScale:: -operator()( - const std::vector>& grads) { +operator()(const std::vector>& grads, + bool create_graph) { // 1. 
Check Output Size PADDLE_ENFORCE( ((grads.size() == 1) && (grads[0].size() == 1)), diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index 247fde6ed1f..e263f73a6b8 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -39,8 +39,15 @@ class GradNodeScale : public GradNodeBase { // Functor: perform backward computations virtual std::vector> operator()( - const std::vector>& grads) - override; + const std::vector>& grads, + bool create_graph = false) override; + + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } void SetTensorWrappers_X( const std::vector& tensors); diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index bf838b27615..d9f201dc9f1 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -2074,7 +2074,8 @@ static std::string GenerateGradNodeCCContents( const char* GRAD_FUNCTION_TEMPLATE = "std::vector> " "GradNode%s::operator()(const " - "std::vector>& grads) {\n%s\n}"; + "std::vector>& grads, " + "bool create_graph) {\n%s\n}"; std::string grad_function_str = paddle::string::Sprintf( GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body); @@ -2109,18 +2110,28 @@ static std::string GenerateGradNodeHeaderContents( "\n" " virtual std::vector> " "operator()(const " - "std::vector>& grads) " + "std::vector>& grads, const " + "bool create_graph = false) " "override;\n" "\n" + " void ClearTensorWrappers() override { \n" + "%s\n" + " is_tensor_wrappers_cleared = true;\n" + " }\n" " std::string name() override { return \" GradNode%s \"; } \n " "\n" " // SetX, SetY, ...\n" "%s\n" " // SetAttrMap\n" "%s\n" + " bool IsTensorWrappersCleared() override { \n" + " return is_tensor_wrappers_cleared;\n" + " }\n" " private:\n" " // TensorWrappers\n" "%s\n" + " bool is_tensor_wrappers_cleared = false;\n" + "\n" " // Attribute Map\n" "%s\n" "};"; @@ -2154,6 +2165,7 @@ static std::string GenerateGradNodeHeaderContents( std::string set_tensor_wrappers_str = ""; std::string tensor_wrapper_members_str = ""; + std::string clear_tensor_wrappers_str = ""; for (const auto& iter : op_base_infos) { const std::map& grad_ins_fwd_slotname_map = iter.GetGradInsFwdSlotnameMap(); @@ -2185,6 +2197,13 @@ static std::string GenerateGradNodeHeaderContents( SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name, struct_tensor_wrapper_name); + const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = + "for (auto tw: %s) {\n" + " tw.clear();\n" + " }\n"; + clear_tensor_wrappers_str += paddle::string::Sprintf( + CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name); + } else { const char* ATTR_TENSOR_WRAPPER_ARG_TEMPLATE = "const paddle::experimental::Tensor& %s"; @@ -2197,10 +2216,14 @@ static std::string GenerateGradNodeHeaderContents( TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name); const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = - "%s = egr::TensorWrapper(%s, %s /*full_reserved*/);"; + "%s = egr::TensorWrapper(%s, %s /*full_reserved*/);\n"; tensor_wrapper_body_str = paddle::string::Sprintf( SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name, tensor_wrapper_name, full_reserved_str); + + const 
char* CLEAR_TENSOR_WRAPPER_TEMPLATE = " %s.clear();\n"; + clear_tensor_wrappers_str += paddle::string::Sprintf( + CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name); } std::string full_reserved_signature_str = "bool full_reserved"; const char* SET_TENSOR_WRAPPER_TEMPLATE = @@ -2215,8 +2238,8 @@ static std::string GenerateGradNodeHeaderContents( std::string grad_node_str = paddle::string::Sprintf( GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type, - op_type, op_type, set_tensor_wrappers_str, set_attr_map_str, - tensor_wrapper_members_str, attr_members_str); + op_type, clear_tensor_wrappers_str, op_type, set_tensor_wrappers_str, + set_attr_map_str, tensor_wrapper_members_str, attr_members_str); return grad_node_str; } diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index d2d699e154f..4c1e5b00cba 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -478,6 +478,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, # SetTensorWrapper Methods & TensorWrapper Members set_tensor_wrapper_methods_str = "" tensor_wrapper_members_str = "" + clear_tensor_wrapper_str = "" for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items(): if tname in no_need_buffer_set: no_need_buffer = "true" @@ -499,6 +500,13 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, """ tensor_wrapper_members_str += PLAIN_TENSOR_MEMBER_TEMPLATE.format( tensor_wrapper_name) + + CLEAR_TENSOR_WRAPPERS_TEMPLATE = """ + {}.clear(); +""" + clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format( + tensor_wrapper_name) + else: assert IsVectorTensorType(ttype) SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """ @@ -516,6 +524,15 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, """ tensor_wrapper_members_str += VECTOR_TENSOR_MEMBER_TEMPLATE.format( tensor_wrapper_name) + + CLEAR_TENSOR_WRAPPERS_TEMPLATE = """ + for (auto tw: {}) { + tw.clear(); + }; +""" + clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format( + tensor_wrapper_name) + # End: SetTensorWrapper Methods & TensorWrapper Members # SetAttributes & Attribute Members @@ -524,7 +541,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, for aname, atype, default_val, _ in backward_attrs_list: saved_attr_name = GetSavedName(aname) SET_ATTR_METHOD_TEMPLATE = """ - void SetAttribute{}({} {}) {{ + void SetAttribute{}({} {}) {{ {} = {}; }} """ @@ -555,25 +572,37 @@ class {} : public egr::GradNodeBase {{ ~{}() override = default; virtual std::vector> operator()( - const std::vector>& grads) override; + const std::vector>& grads, bool create_graph = false) override; std::string name() override {{ return \" {} \"; }} + + void ClearTensorWrappers() override {{ + {} + is_tensor_wrappers_cleared = true; + }} + // SetTensorWrapperX, SetTensorWrapperY, ... 
{} // SetAttributes {} + + bool IsTensorWrappersCleared() override {{ + return is_tensor_wrappers_cleared; + }} private: // TensorWrappers {} + bool is_tensor_wrappers_cleared = false; + // Attributes {} }}; """ node_declaration_str = NODE_DECLARATION_TEMPLATE.format( grad_node_name, grad_node_name, grad_node_name, grad_node_name, - grad_node_name, set_tensor_wrapper_methods_str, - set_attribute_methods_str, tensor_wrapper_members_str, - attribute_members_str) + grad_node_name, clear_tensor_wrapper_str, + set_tensor_wrapper_methods_str, set_attribute_methods_str, + tensor_wrapper_members_str, attribute_members_str) return node_declaration_str @@ -637,7 +666,7 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, grad_api_namespace = f"paddle::experimental" FUNCTION_TEMPLATE = """ -std::vector> {}::operator()(const std::vector>& grads) {{ +std::vector> {}::operator()(const std::vector>& grads, bool create_graph) {{ // Call grad_api function auto grad_api_returns = {}::{}({}); {} diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 1987d024d8f..f2d5f338bd4 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -39,12 +39,21 @@ std::unordered_map getInDegreeMap( // Copy nodes std::queue queue = init_queue; std::unordered_set visited; + size_t potential_startup_ops_cnt = queue.size(); + size_t cnt = 0; // Visit each node exactly once in any order while (!queue.empty()) { GradNodeBase* node = queue.front(); queue.pop(); + if (cnt < potential_startup_ops_cnt) { + if (!node_in_degree_map.count(node)) { + node_in_degree_map[node] = 0; + } + cnt += 1; + } + if (visited.count(node)) { continue; } @@ -76,23 +85,248 @@ std::unordered_map getInDegreeMap( return node_in_degree_map; } -void RunBackward(const std::vector& tensors, - const std::vector& grad_tensors, - bool retain_graph) { - paddle::platform::RecordEvent backward_record_event( - "backward", paddle::platform::TracerEventType::Operator, 1); +// Remove some nodes those doesn't need to be +// stored in potential_stop_nodes、potential_startup_nodes +void UpdateGraphInfo( + std::unordered_map* + target_nodes_inputmeta_map, + std::unordered_map>* + depending_nodes, + std::unordered_set* potential_stop_nodes, + std::unordered_set* potential_startup_nodes) { + // Updated potential_sotp_nodes by depending_nodes, + // make sure the path from root to target_node is ok + std::unordered_set _startup_ops; + VLOG(6) << "Running in UpdateGraphInfo"; + std::queue queue; + for (auto& target_nodes_inputmeta_pair : *target_nodes_inputmeta_map) { + queue.emplace(target_nodes_inputmeta_pair.first); + } + + while (!queue.empty()) { + auto* target_node = queue.front(); + queue.pop(); + if (!(*depending_nodes)[target_node].empty()) { + auto precedding_nodes = (*depending_nodes)[target_node]; + for (auto pre_nodes : precedding_nodes) { + queue.emplace(pre_nodes); + if (potential_stop_nodes->find(pre_nodes) != + potential_stop_nodes->end()) { + potential_stop_nodes->erase(pre_nodes); + } + } + } else { // startup_ops have no precedding nodes + VLOG(6) << "Emplace _startup_ops"; + _startup_ops.emplace(target_node); + } + } + // Purify potential_startup_nodes again, remove some + // potential startup_nodes that unreach to input target nodes + if (!_startup_ops.empty()) { + std::unordered_set potential_startup_nodes_to_be_erased; + for (auto node : *potential_startup_nodes) { + if (_startup_ops.count(node) == 0) { + VLOG(6) << "Set up potential_startup_nodes_to_be_erased"; + 
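// This startup candidate was never reached while walking back from the
// requested input target nodes, i.e. no backward path from it leads to any
// of the inputs, so it must not seed the partial-grad traversal and is
// queued for removal below.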
potential_startup_nodes_to_be_erased.emplace(node); + } + } + if (!potential_startup_nodes_to_be_erased.empty()) { + for (auto node : potential_startup_nodes_to_be_erased) { + VLOG(6) << "Erase nodes in potential_startup_nodes_to_be_erased"; + potential_startup_nodes->erase(node); + } + } + } +} + +// Get Graph Info Betweent input target gradnode and outputs, +// record depending_nodes、 potential_stop_nodes、potential_startup_nodes +void GetGraphInfoBetweenTargets( + const std::queue& init_queue, + std::unordered_map* + input_target_nodes_inputmeta_map, + std::unordered_map>* + depending_nodes, + std::unordered_set* potential_stop_nodes, + std::unordered_set* potential_startup_nodes) { + if (input_target_nodes_inputmeta_map->empty()) return; + + VLOG(6) << "Runing In GetGraphInfoBetweenTargets"; + + // Calculate in_degree for each node + std::unordered_map node_in_degree_map; + + // Copy nodes + std::queue queue = init_queue; + std::unordered_set visited; + + // Visit each node exactly once in any order + while (!queue.empty()) { + GradNodeBase* node = queue.front(); + queue.pop(); + + if (visited.count(node)) { + continue; + } + visited.insert(node); + + // Check node is target_nodes or not, if node is not target_node, + // all the next_node will be marked in potential_stop_nodes + bool is_potential_stop_nodes = + input_target_nodes_inputmeta_map->count(node); + + // Find and append next nodes + const std::vector>& edges = node->GetEdges(); + for (const auto& edge_list : edges) { + for (const Edge& edge : edge_list) { + GradNodeBase* next_node = edge.GetMutableGradNode().get(); + + // Next node could be nullptr if it is leaf tensor with no + // AccumulationNode attached + // Or it could also originated from dispensable inputs + if (!next_node) continue; + + // if node not in input_target_nodes, + // all the next_nodes of current node will be inserted to + // potential_stop_node + if (is_potential_stop_nodes) { + potential_stop_nodes->emplace(next_node); + } + + // Update in_degree + if (!node_in_degree_map.count(next_node)) + node_in_degree_map[next_node] = 0; + node_in_degree_map[next_node]++; + // Record depending relationship + (*depending_nodes)[next_node].emplace(node); + queue.push(next_node); + } + } + } + // Update Graph Info, remove some stop_node in potential_stop_nodes + UpdateGraphInfo(input_target_nodes_inputmeta_map, depending_nodes, + potential_stop_nodes, potential_startup_nodes); +} + +void GetTargetNodesInfo(const std::vector& inputs, + std::unordered_map* + target_nodes_inputmeta_map) { + VLOG(6) << "Running in GetTargetNodesInfo"; + if (!inputs.empty()) { + VLOG(6) << "Inputs are not empty"; + size_t num_inputs = inputs.size(); + for (size_t i = 0; i < num_inputs; i++) { + AutogradMeta* auto_grad_meta = + EagerUtils::unsafe_autograd_meta(inputs[i]); + auto target_node = auto_grad_meta->GetMutableGradNode().get(); + + PADDLE_ENFORCE_NOT_NULL(target_node, + paddle::platform::errors::Fatal( + "There is no grad op for input:%d or it's" + "stop_gradient=True", + i)); + (*target_nodes_inputmeta_map)[target_node] = auto_grad_meta; + } + } +} + +std::vector GetResults( + const std::vector& inputs, + std::unordered_map* + results_map, + bool allow_unused, bool create_graph) { + VLOG(6) << "Running in GetResults"; + if (inputs.empty()) return {}; + + std::vector results; + results.reserve(inputs.size()); + + for (size_t i = 0; i < inputs.size(); ++i) { + auto& input = inputs[i]; + AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(input); + auto target_node = 
auto_grad_meta->GetMutableGradNode().get(); + + auto iter = results_map->find(target_node); + if (iter != results_map->end()) { + // set StopGradient = !create_graph + AutogradMeta* tensor_auto_grad_meta = + EagerUtils::autograd_meta(&(iter->second)); + tensor_auto_grad_meta->SetStopGradient(!create_graph); + results.emplace_back(iter->second); + } else { + PADDLE_ENFORCE_EQ(allow_unused, true, + paddle::platform::errors::InvalidArgument( + "The %d-th input does not appear in the backward " + "graph. Please check the input variable or set " + "allow_unused=True to get None result.", + i)); + results.emplace_back(); + } + } + return results; +} + +// Enforce GradNode has TensorWrappers as Input +void EnforceGradNodeHasInput(GradNodeBase* node) { + VLOG(6) << "Running in EnforceGradNodeHasInput"; + PADDLE_ENFORCE_NE( + node->IsTensorWrappersCleared(), true, + paddle::platform::errors::Fatal( + "The TensorWrappers of %s do not exist. This may be because:\n" + "You calculate backward twice for the same subgraph without " + "setting retain_graph=True. Please set retain_graph=True in the " + "first backward/grad call.\n", + node->name())); +} + +// Purify potential_startup_nodes, remove nodes those are the same as +// input_target_nodes +void PurifyPotentialStartUpNodes( + std::unordered_set* potential_startup_nodes, + std::unordered_map* + input_target_nodes_inputmeta_map) { + VLOG(6) << "Running in PurifyPotentialStartUpNodes"; + if (input_target_nodes_inputmeta_map->empty()) return; + std::unordered_set potential_startup_nodes_to_be_erased; + for (auto startup_op : *potential_startup_nodes) { + auto iter = input_target_nodes_inputmeta_map->find(startup_op); + if (iter != input_target_nodes_inputmeta_map->end()) { + potential_startup_nodes_to_be_erased.emplace(iter->first); + } + } + if (!potential_startup_nodes_to_be_erased.empty()) { + for (auto nodes : potential_startup_nodes_to_be_erased) { + potential_startup_nodes->erase(nodes); + } + } +} + +std::vector RunBackward( + const std::vector& tensors, // output + const std::vector& grad_tensors, + bool retain_graph, bool create_graph = false, + const std::vector& inputs = {}, + bool allow_unused = false, + const std::vector& no_grad_vars = {}) { VLOG(6) << "Start Backward"; // *Gradient Hook should happen at node-level // *Inplace version check should perform at node-level // *Cross-batch accumulation happens at forward pass + std::unordered_map + no_grad_var_nodes_inputmeta_map; + // Get no_grad_vars's GradNodes and InputMeta Info + GetTargetNodesInfo(no_grad_vars, &no_grad_var_nodes_inputmeta_map); + /* --- Initialization --- */ // 1. Init queue with starting nodes // 2. 
Prepare initial input buffers std::queue queue; std::unordered_map> node_input_buffers_dict; + std::unordered_set potential_startup_nodes; for (size_t i = 0; i < tensors.size(); i++) { const paddle::experimental::Tensor& tensor = tensors[i]; @@ -132,8 +366,17 @@ void RunBackward(const std::vector& tensors, "size = 0 or same size as tensors")); // Feed given tensor if it's provided VLOG(6) << "Fill grad input tensor " << i << "with give grad tensor"; - node_input_buffers_dict[grad_node]->add( - input_info.first, input_info.second, grad_tensors[i]); + + if (grad_tensors[i].is_initialized()) { + // Deep copy + paddle::experimental::Tensor tmp_tensor; + tmp_tensor.copy_(grad_tensors[i], true); + node_input_buffers_dict[grad_node]->add(input_info.first, + input_info.second, tmp_tensor); + } else { + node_input_buffers_dict[grad_node]->add( + input_info.first, input_info.second, grad_tensors[i]); + } } else { VLOG(6) << "Fill grad input tensor " << i << " with 1.0"; @@ -146,8 +389,9 @@ void RunBackward(const std::vector& tensors, input_info.first, input_info.second, tensor, true /*fill_one=true*/); } - // Prepare queue + // Prepare queue, potential startup_nodes queue.push(grad_node); + potential_startup_nodes.emplace(grad_node); } VLOG(6) << "Update In degree Map for backward"; @@ -155,25 +399,74 @@ void RunBackward(const std::vector& tensors, std::unordered_map node_in_degree_map = getInDegreeMap(queue); + // Get input's GradNodes and InputMeta Info + std::unordered_map + input_target_nodes_inputmeta_map; + GetTargetNodesInfo(inputs, &input_target_nodes_inputmeta_map); + + // Purify potential_startup_ops, remove those nodes that are the same as + // input_target_nodes + PurifyPotentialStartUpNodes(&potential_startup_nodes, + &input_target_nodes_inputmeta_map); + + // Get Graph Info Betweent input target gradnode and outputs + // Record the depending_nodes and potential_stop_nodes + std::unordered_map /* father node */> + depending_nodes; + std::unordered_set potential_stop_nodes; + // std::unordered_set startup_ops; + + GetGraphInfoBetweenTargets(queue, &input_target_nodes_inputmeta_map, + &depending_nodes, &potential_stop_nodes, + &potential_startup_nodes); + + // ready_queue store all startup nodes + std::queue ready_queue; + // startup op's indegree should be 0 + for (auto node : potential_startup_nodes) { + if (node_in_degree_map[node] == 0) { + ready_queue.emplace(node); + } + } + + VLOG(1) << " startup_ops' size is :" << ready_queue.size(); + + std::unordered_map results_map; + + // read_queue is empty only when 1.input equals to output. 2.input can not + // reach to output. + if (ready_queue.size() == 0) { + for (auto input_target_node : input_target_nodes_inputmeta_map) { + // out rank_info of forward op + auto rank_info = input_target_node.second->OutRankInfo(); + if (node_input_buffers_dict[input_target_node.first]) { + auto& target_result = + node_input_buffers_dict[input_target_node.first] + ->Buffers()[rank_info.first][rank_info.second]; + // save the target result + results_map[input_target_node.first] = target_result; + } + } + } + /* --- Topological Visit --- */ // 1. Pop queue // 2. Run node + // |- Check and capture target result // |- node(grads) // |- Prepare for next node // 3. 
Update queue VLOG(6) << "Run Backward"; - while (!queue.empty()) { - GradNodeBase* node = queue.front(); + while (!ready_queue.empty()) { + GradNodeBase* node = ready_queue.front(); + VLOG(6) << "Running GradNode:" << node->name(); + ready_queue.pop(); paddle::platform::RecordEvent node_record_event( std::string(typeid(*node).name()) + " grad_node", paddle::platform::TracerEventType::Operator, 1); - if (queue.size() > 1 && node_in_degree_map[node] != 0) { - queue.pop(); - continue; - } - queue.pop(); // Run node: This is where Hook happens PADDLE_ENFORCE( node_input_buffers_dict.count(node), @@ -184,10 +477,45 @@ void RunBackward(const std::vector& tensors, std::unique_ptr node_input_buffer = std::move(node_input_buffers_dict[node]); + // get target grad_var from node_input_buffer by inputmeta + if (input_target_nodes_inputmeta_map.find(node) != + input_target_nodes_inputmeta_map.end()) { + VLOG(6) << "Get target result by by inputmeta"; + // out rank_info of forward op + auto rank_info = input_target_nodes_inputmeta_map[node]->OutRankInfo(); + // rank_info is a pair, first means slot_id, second means rank. + auto& target_result = + node_input_buffer->Buffers()[rank_info.first][rank_info.second]; + // save the target result + results_map[node] = target_result; + } + + // no_grad_vars + if (no_grad_var_nodes_inputmeta_map.find(node) != + no_grad_var_nodes_inputmeta_map.end()) { + VLOG(6) << "Change the input buffer[slot][rank] by Zeros"; + auto rank_info = no_grad_var_nodes_inputmeta_map[node]->OutRankInfo(); + node_input_buffer->SetBufferSlotRankZeros(rank_info.first, + rank_info.second); + } + + VLOG(6) << "Running GradNode:" << node->name(); + + // check input + EnforceGradNodeHasInput(node); + VLOG(6) << "Run Backward Kernel with GradTensorHolder"; // Run Pre Backward Node and get outputs std::vector> grad_output_tensors = - (*node)(node_input_buffer->Buffers()); + (*node)(node_input_buffer->Buffers(), create_graph); + + // retain_grad or not + if (!retain_graph) { + VLOG(6) + << "retain_graph is false, need to clear the TensorWrapper of nodes."; + node->ClearTensorWrappers(); + } + // TODO(jiabin): Should we erase it or find a more efficient way. node_input_buffers_dict.erase(node); @@ -252,18 +580,44 @@ void RunBackward(const std::vector& tensors, // Update queue node_in_degree_map[next_node]--; + PADDLE_ENFORCE( node_in_degree_map[next_node] >= 0, paddle::platform::errors::Fatal( "Detected in-degree value smaller than zero. 
For Node: %s" "Node's in-degree cannot be negative", next_node->name())); - if (node_in_degree_map[next_node] == 0) { - queue.emplace(std::move(next_node)); + + bool is_potential_stop_node = potential_stop_nodes.count(next_node); + + if (node_in_degree_map[next_node] == 0 && !is_potential_stop_node) { + ready_queue.emplace(std::move(next_node)); } } } } + + return GetResults(inputs, &results_map, allow_unused, create_graph); } +void Backward( + const std::vector& tensors, // output + const std::vector& grad_tensors, + bool retain_graph) { + VLOG(6) << "Run in Backward"; + paddle::platform::RecordEvent backward_record_event( + "backward", paddle::platform::TracerEventType::Operator, 1); + RunBackward(tensors, grad_tensors, retain_graph); +} + +std::vector Grad( + const std::vector& tensors, // output + const std::vector& inputs, + const std::vector& grad_tensors, + bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused, + const std::vector& no_grad_vars) { + VLOG(6) << "Run in Grad"; + return RunBackward(tensors, grad_tensors, retain_graph, create_graph, inputs, + allow_unused, no_grad_vars); +} } // namespace egr diff --git a/paddle/fluid/eager/backward.h b/paddle/fluid/eager/backward.h index 2856d9fb87f..bebe664838e 100644 --- a/paddle/fluid/eager/backward.h +++ b/paddle/fluid/eager/backward.h @@ -19,12 +19,20 @@ namespace egr { -// run_backward(): +// Backward(): // tensors corresponds to those lived in the backward graph // each grad_tensors[i] keeps the value for its corresponding tensors[i] -void RunBackward(const std::vector &tensors, - const std::vector &grad_tensors, - bool retain_graph = false); +void Backward(const std::vector& tensors, + const std::vector& grad_tensors, + bool retain_graph = false); + +std::vector Grad( + const std::vector& tensors, + const std::vector& inputs, + const std::vector& grad_tensors = {}, + bool retain_graph = false, bool create_graph = false, + bool only_inputs = false, bool allow_unused = false, + const std::vector& no_grad_vars = {}); // Reserved for gradient() diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index 48ac8c8358a..72af1cc4b06 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -20,8 +20,8 @@ namespace egr { std::vector> RunCustomOpNode:: -operator()( - const std::vector>& grads) { +operator()(const std::vector>& grads, + bool create_graph) { paddle::CustomOpKernelContext ctx; auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs( egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h index e5ddef9c062..6ece2658575 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.h +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h @@ -37,8 +37,8 @@ class RunCustomOpNode : public GradNodeBase { // Functor: perform backward computations virtual std::vector> operator()( - const std::vector>& grads) - override; + const std::vector>& grads, + bool create_graph) override; std::string name() { return paddle::string::Sprintf("RunCustomOpNode: %s_grad", op_type_); @@ -62,6 +62,12 @@ class RunCustomOpNode : public GradNodeBase { return res; } + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing 
here now"; + return false; + } + void SetAttrs(const std::vector& attr) { attrs_ = attr; } public: diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 16513f05e07..168e1bcca77 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -95,8 +95,12 @@ class GradNodeBase { * is better choice to fit this format. * **/ virtual std::vector> operator()( - const std::vector>& grads) = 0; + const std::vector>& grads, + bool create_graph = false) = 0; + virtual void ClearTensorWrappers() = 0; + + virtual bool IsTensorWrappersCleared() = 0; /** * AddEdges is designed to set input tensors' backward Node as current * node's Edges. diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 69fc7df2f14..163d25e85ce 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -21,6 +21,11 @@ namespace egr { +void GradTensorHolder::SetBufferSlotRankZeros(size_t slot_id, size_t rank) { + buffer_[slot_id][rank] = + paddle::experimental::zeros_like(buffer_[slot_id][rank]); +} + void GradTensorHolder::add(size_t slot_id, size_t rank, const paddle::experimental::Tensor& t, bool fill_one) { diff --git a/paddle/fluid/eager/grad_tensor_holder.h b/paddle/fluid/eager/grad_tensor_holder.h index d66a81fe828..9059b403607 100644 --- a/paddle/fluid/eager/grad_tensor_holder.h +++ b/paddle/fluid/eager/grad_tensor_holder.h @@ -56,6 +56,8 @@ class GradTensorHolder { return buffer_; } + void SetBufferSlotRankZeros(size_t slot_id, size_t rank); + private: std::vector> buffer_; }; diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 31aaa93c416..0e11444b815 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -98,6 +98,8 @@ class TensorWrapper { } } + void clear() { intermidiate_tensor_.reset(); } + private: bool full_reserved_ = false; std::pair out_rank_info_; diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index 535c93ac53b..0b167203735 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -32,8 +32,8 @@ class GradTestNode : public egr::GradNodeBase { GradTestNode() : GradNodeBase() { val_ = 1.0; } std::string name() override { return "GradTestNode"; } std::vector> operator()( - const std::vector>& grads) - override { + const std::vector>& grads, + bool create_graph = false) override { val_ = std::dynamic_pointer_cast(grads[0][0].impl()) ->data()[0]; phi::DenseTensorMeta meta = @@ -49,6 +49,11 @@ class GradTestNode : public egr::GradNodeBase { std::vector> res = {{et1}}; return res; } + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } float val_; }; } // namespace eager_test diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index 769bd7f687f..887ea3e3acf 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -58,7 +58,7 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, } std::vector target_tensors = {input_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, 
{}); if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 10) @@ -80,7 +80,7 @@ void benchmark_eager_matmul(const paddle::experimental::Tensor& X, } std::vector target_tensors = {input_tensor0}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 2) @@ -106,7 +106,7 @@ void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X, } std::vector target_tensors = {input_tensor0}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { // Examine Forward Grad (w.r.t max_num_runs = 2) @@ -137,7 +137,7 @@ void benchmark_eager_intermediate_mlp( reduce_sum_dygraph_function(input0, {{"reduce_all", true}}); std::vector target_tensors = {Out}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); if (accuracy_check) { std::unordered_map result = diff --git a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt index c65ad4641cf..52dba6b9218 100644 --- a/paddle/fluid/eager/tests/task_tests/CMakeLists.txt +++ b/paddle/fluid/eager/tests/task_tests/CMakeLists.txt @@ -5,6 +5,7 @@ cc_test(test_egr_task_backward SRCS backward_test.cc DEPS ${eager_deps} ${fluid_ cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) +cc_test(test_egr_task_grad SRCS grad_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) cc_test(test_egr_task_hook_intermidiate SRCS hook_test_intermidiate.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node) diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index 0c894ed267f..87f8f6eca1f 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -33,6 +33,7 @@ #include "paddle/phi/core/kernel_registry.h" PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); namespace egr { @@ -79,7 +80,7 @@ TEST(Backward, SingleNodeEmptyGrad) { } std::vector outs = {target_tensor}; // Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); @@ -138,7 +139,7 @@ TEST(Backward, SingleNodeCustomGrad) { } // Run Backward - RunBackward(target_tensors, grad_tensors); + Backward(target_tensors, grad_tensors); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); @@ -211,7 +212,7 @@ TEST(Backward, LinearNodes) { } // Use Empty Grad Tensor - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); // Check Output Value eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); @@ -315,7 +316,7 @@ TEST(Backward, WithAccumulation) { node2_ptr->AddEdges(&res2, 0); } - RunBackward(target_tensors, grad_tensors); + Backward(target_tensors, grad_tensors); eager_test::CompareGradTensorWithValue(leaf_tensor, 2500.0); } diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index 36594f1aac8..8b0759c17ed 100644 --- 
a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -71,12 +71,12 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { std::vector res = {meta}; scale_node_ptr->AddEdges(&res, 0); - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 10.0); diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index f7fa642ea8d..882695e98d1 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -86,7 +86,7 @@ TEST(FwdBwdJoint, SingleNode) { std::vector outs = {out}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); VLOG(7) << "Target Grad is: " << std::static_pointer_cast( @@ -137,7 +137,7 @@ TEST(FwdBwdJoint, LinearNodes) { std::vector outs = {out1}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 10.0); @@ -203,7 +203,7 @@ TEST(FwdBwdJoint, BranchedNodes) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); @@ -260,7 +260,7 @@ TEST(FwdBwdJoint, GradientHook) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad // leaf grad @@ -318,13 +318,13 @@ TEST(FwdBwdJoint, CrossBatchAccumulation) { // 4. Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); // Cross Batch Accumulation - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 60.0); @@ -356,7 +356,7 @@ TEST(FwdBwdJoint, SingleNodeCUDA) { std::vector outs = {out}; // 4. Run Backward - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 2.0); @@ -412,7 +412,7 @@ TEST(FwdBwdJoint, BranchedNodesCUDA) { // TODO(jiabin): fix this with add functor // 4. 
Run Backward std::vector outs = {out1, out2}; - RunBackward(outs, {}); + Backward(outs, {}); // Examine Backward Grad eager_test::CompareGradTensorWithValue(tensor, 30.0); diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index 2a5ad53204a..68820443a2d 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -57,7 +57,7 @@ TEST(Generated, Sigmoid) { std::vector target_tensors = {output_tensor}; VLOG(6) << "Runing Backward"; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); VLOG(6) << "Finish Backward"; eager_test::CompareGradTensorWithValue(tensor, 0.25); @@ -89,7 +89,7 @@ TEST(Generated, Matmul_v2) { eager_test::CompareTensorWithValue(output_tensor, 96); std::vector target_tensors = {output_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 2.0 * 20); eager_test::CompareGradTensorWithValue(Y, 3.0 * 4); @@ -120,7 +120,7 @@ TEST(Generated, ElementwiseAdd) { eager_test::CompareTensorWithValue(output_tensor, 5); std::vector target_tensors = {output_tensor}; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 1.0); eager_test::CompareGradTensorWithValue(Y, 1.0); diff --git a/paddle/fluid/eager/tests/task_tests/grad_test.cc b/paddle/fluid/eager/tests/task_tests/grad_test.cc new file mode 100644 index 00000000000..6b03799c486 --- /dev/null +++ b/paddle/fluid/eager/tests/task_tests/grad_test.cc @@ -0,0 +1,339 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
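// Unit tests for the eager-mode partial-gradient entry point egr::Grad().
// As a rough sketch of the call shape exercised below (see backward.h for
// the exact signature and default arguments):
//   std::vector<paddle::experimental::Tensor> grads =
//       Grad(/*tensors=*/{out}, /*inputs=*/{leaf}, /*grad_tensors=*/{});
// One gradient is returned per entry of `inputs`; with allow_unused=true an
// unreachable input yields an uninitialized tensor (surfaced as None on the
// Python side).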
+ +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/backward.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tests/test_utils.h" + +#include "paddle/fluid/eager/api/all.h" + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); +namespace egr { + +TEST(Grad, SingleNodeEmptyGrad) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor (output) + paddle::experimental::Tensor output_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + + // Create input tensor + const paddle::experimental::Tensor leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + + { + // Create Scale Node + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta + node0_ptr->SetDefaultGradInOutMeta(); + + // Output_tensor set GradNode、OutRank、StopGradient propertis + AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&output_tensor); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + + // Get autograd_meta from input tensor + AutogradMeta* auto_grad_meta1 = + EagerUtils::unsafe_autograd_meta(leaf_tensor); + + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + // input tensor set GradNode、OutRank、StopGradient propertis + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + + // grad_node Add Edges + std::vector res = {auto_grad_meta1}; + node0_ptr->AddEdges(&res, 0); + } + std::vector outs = {output_tensor}; + + // Run Grad + auto result = Grad(outs, {leaf_tensor}, {}); + // Check Output Value + eager_test::CompareTensorWithValue(result[0], 5.0); +} + +TEST(Grad, SingleNodeCustomGrad) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + std::vector target_tensors; + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + + std::vector grad_tensors; + // Create Grad Tensor + paddle::experimental::Tensor grad_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); + grad_tensors.emplace_back(std::move(grad_tensor)); + + paddle::experimental::Tensor 
leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + + { + // Create Scale Node + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta + node0_ptr->SetDefaultGradInOutMeta(); + + // Connect Tensor and Node via AutoGradMeta + AutogradMeta* auto_grad_meta = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + std::vector res = {auto_grad_meta1}; + node0_ptr->AddEdges(&res, 0); + } + + auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors); + + // Check Output Value + eager_test::CompareTensorWithValue(result[0], 50.0); +} + +/* +Node1 + | +Node0 + | + { } // empty grad tensor +*/ +TEST(Grad, LinearNodes) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Target Tensor + std::vector target_tensors; + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + + paddle::experimental::Tensor leaf_tensor = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/); + { + // Create Node0 + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta for node0 + node0_ptr->SetDefaultGradInOutMeta(); + + // Create Node1 + auto node1_ptr = std::make_shared(1, 1); + node1_ptr->SetAttributes_scale(10.0 /*scale*/); + + // Set grad in/out meta for node1 + node1_ptr->SetDefaultGradInOutMeta(); + + // Connect Input Tensor and Node0 via AutoGradMeta + AutogradMeta* auto_grad_meta = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + // Connect Node0 -> Node1 via Edge + auto meta0 = egr::AutogradMeta(); + meta0.SetStopGradient(false); + meta0.SetSingleOutRankWithSlot(0, 0); + meta0.SetGradNode(node1_ptr); + std::vector res0 = {&meta0}; + node0_ptr->AddEdges(&res0, 0); + + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + + auto_grad_meta1->SetStopGradient(false); + std::vector res1 = {auto_grad_meta1}; + node1_ptr->AddEdges(&res1, 0); + } + + // Use Empty Grad Tensor + auto result = Grad(target_tensors, {leaf_tensor}, {}); + + // Check Output Value + 
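// With empty grad_tensors the backward input defaults to 1.0, which is then
// scaled by node0 (x5.0) and node1 (x10.0): 1.0 * 5.0 * 10.0 = 50.0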
eager_test::CompareTensorWithValue(result[0], 50.0); +} + +/* + Node2 + | | +Node0 Node1 + | | + in0 in1 +*/ +TEST(Grad, WithAccumulation) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + std::vector target_tensors; + paddle::experimental::Tensor tensor0 = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + paddle::experimental::Tensor tensor1 = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor0)); + target_tensors.emplace_back(std::move(tensor1)); + + // Create Grad Tensor + std::vector grad_tensors; + paddle::experimental::Tensor grad_tensor0 = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); + paddle::experimental::Tensor grad_tensor1 = + egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); + grad_tensors.emplace_back(std::move(grad_tensor0)); + grad_tensors.emplace_back(std::move(grad_tensor1)); + + paddle::experimental::Tensor leaf_tensor; + { + // Create Node0 + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + node0_ptr->SetDefaultGradInOutMeta(); + + // Create Node1 + auto node1_ptr = std::make_shared(1, 1); + node1_ptr->SetAttributes_scale(10.0 /*scale*/); + node1_ptr->SetDefaultGradInOutMeta(); + // Create Node2 + auto node2_ptr = std::make_shared(1, 1); + node2_ptr->SetAttributes_scale(20.0 /*scale*/); + node2_ptr->SetDefaultGradInOutMeta(); + // Connect Inp0 and Node0 via AutoGradMeta + AutogradMeta* auto_grad_meta0 = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta0->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta0->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta0->SetStopGradient(false); + // Connect Inp1 and Node1 via AutoGradMeta + AutogradMeta* auto_grad_meta1 = + EagerUtils::autograd_meta(&(target_tensors[1])); + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(node1_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); + + // Connect Node0 -> Node2 via Edge + auto meta0 = egr::AutogradMeta(); + meta0.SetStopGradient(false); + meta0.SetSingleOutRankWithSlot(0, 0); + meta0.SetGradNode(node2_ptr); + std::vector res0 = {&meta0}; + node0_ptr->AddEdges(&res0, 0); + + // Connect Node1 -> Node2 via Edge + auto meta1 = egr::AutogradMeta(); + meta1.SetStopGradient(false); + meta1.SetSingleOutRankWithSlot(0, 0); + meta1.SetGradNode(node2_ptr); + std::vector res1 = {&meta1}; + node1_ptr->AddEdges(&res1, 0); + + AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta2); + + auto_grad_meta2->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta2->SetSingleOutRankWithSlot(0, 0); + + auto_grad_meta2->SetStopGradient(false); + std::vector res2 = {auto_grad_meta2}; + node2_ptr->AddEdges(&res2, 0); + } + + auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors); + + 
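// Expected value: grad_tensor0 (5.0) * node0 scale (5.0) = 25.0 and
// grad_tensor1 (10.0) * node1 scale (10.0) = 100.0 accumulate at node2,
// then (25.0 + 100.0) * node2 scale (20.0) = 2500.0 reaches the leaf.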
eager_test::CompareTensorWithValue(result[0], 2500.0); +} + +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index d546df4ed08..2c53fc89f65 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -132,7 +132,7 @@ TEST(RetainGrad, HookBeforeRetainGrad) { leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 4.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); @@ -199,7 +199,7 @@ TEST(RetainGrad, HookAfterRetainGrad) { leaf_tensor, std::make_shared(hook_function)); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(target_tensor, 1.0); eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); } diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index 56813c498d2..0ee171c73c6 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -108,7 +108,7 @@ void test_sigmoid(bool is_remove_gradient_hook) { } VLOG(6) << "Runing Backward"; - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); VLOG(6) << "Finish Backward"; eager_test::CompareGradTensorWithValue( @@ -166,7 +166,7 @@ void test_elementwiseAdd(bool is_remove_gradient_hook) { grad_node_tmp->RemoveGradientHook(hook_id); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 1.0); eager_test::CompareGradTensorWithValue( @@ -224,7 +224,7 @@ void test_matmul(bool is_remove_gradient_hook) { grad_node_tmp->RemoveGradientHook(hook_id); } - RunBackward(target_tensors, {}); + Backward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 2.0 * 20); eager_test::CompareGradTensorWithValue( diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index d99624e4932..4eaa64d3ac6 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -370,8 +370,8 @@ class GradNodeRunProgram : public egr::GradNodeBase { ~GradNodeRunProgram() override = default; // Functor: perform backward computations virtual std::vector> operator()( - const std::vector> &grads) - override { + const std::vector> &grads, + bool create_graph) override { VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; PADDLE_ENFORCE_EQ( grads.size(), 1, @@ -415,6 +415,12 @@ class GradNodeRunProgram : public egr::GradNodeBase { // return {x_grad, details::DereferenceTensors(params_grad_ptr)}; } + void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + bool IsTensorWrappersCleared() override { + VLOG(6) << "Do nothing here now"; + return false; + } + // SetAttrMap void SetAttrMap(const paddle::framework::AttributeMap &attrs) { attrs_ = attrs; diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index e110432c67d..c9e80c7b4b4 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -122,13 +122,33 @@ static PyObject* eager_api_run_backward(PyObject* self, PyObject* args, EAGER_TRY auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0); auto grad_tensors = 
CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1); - egr::RunBackward(tensors, grad_tensors, - CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2)); + egr::Backward(tensors, grad_tensors, + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2)); Py_INCREF(Py_None); return Py_None; EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* eager_api_run_partial_grad(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0); + auto inputs = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1); + auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 2), 2); + auto retain_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); + auto create_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4); + auto only_inputs = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 5), 5); + auto allow_unused = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 6), 6); + auto no_grad_vars = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 7), 7); + + std::vector result = + egr::Grad(tensors, inputs, grad_tensors, retain_graph, create_graph, + only_inputs, allow_unused, no_grad_vars); + VLOG(1) << " in eager_api_run_partial_grad, after runing egr::Grad"; + return ToPyObject(result, true /* return_py_none_if_not_initialize */); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY @@ -452,6 +472,9 @@ PyMethodDef variable_functions[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward, METH_VARARGS | METH_KEYWORDS, NULL}, + {"run_partial_grad", + (PyCFunction)(void (*)(void))eager_api_run_partial_grad, + METH_VARARGS | METH_KEYWORDS, NULL}, {"_run_custom_op", (PyCFunction)(void (*)(void))eager_api_run_costum_op, METH_VARARGS | METH_KEYWORDS, NULL}, {"tensor_copy", (PyCFunction)(void (*)(void))eager_api_tensor_copy, diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 217edad0c0a..97bb32630d7 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -492,20 +492,26 @@ PyObject* ToPyObject(const std::vector& value) { return result; } -PyObject* ToPyObject(const std::vector& value) { +PyObject* ToPyObject(const std::vector& value, + bool return_py_none_if_not_initialize) { PyObject* result = PyList_New((Py_ssize_t)value.size()); for (size_t i = 0; i < value.size(); i++) { - PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0); - if (obj) { - auto v = reinterpret_cast(obj); - new (&(v->tensor)) paddle::experimental::Tensor(); - v->tensor = value[i]; + if (!value[i].initialized() && return_py_none_if_not_initialize) { + Py_INCREF(Py_None); + PyList_SET_ITEM(result, static_cast(i), Py_None); } else { - PADDLE_THROW(platform::errors::Fatal( - "tp_alloc return null, can not new a PyObject.")); + PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0); + if (obj) { + auto v = reinterpret_cast(obj); + new (&(v->tensor)) paddle::experimental::Tensor(); + v->tensor = value[i]; + } else { + PADDLE_THROW(platform::errors::Fatal( + "tp_alloc return null, can not new a PyObject.")); + } + PyList_SET_ITEM(result, static_cast(i), obj); } - PyList_SET_ITEM(result, static_cast(i), obj); } return result; diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 2187555e1c3..1c4e2ab69a5 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -68,7 +68,8 @@ 
PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); -PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const std::vector& value, + bool return_py_none_if_not_initialize = false); PyObject* ToPyObject(const platform::Place& value); PyObject* ToPyObject(const framework::LoDTensor* value); PyObject* ToPyObject(const paddle::framework::proto::VarType::Type& dtype); diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 8149d69d36a..94399828585 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -565,16 +565,25 @@ def grad(outputs, if isinstance(in_out_list, (list, tuple)): assert len(in_out_list) > 0, "{} cannot be empty".format(name) for each_var in in_out_list: - assert isinstance( - each_var, - core.VarBase), "Elements of {} must be Variable".format( - name) + if core._in_eager_mode(): + assert isinstance( + each_var, core.eager. + Tensor), "Elements of {} must be Tensor".format(name) + else: + assert isinstance( + each_var, + core.VarBase), "Elements of {} must be Variable".format( + name) return in_out_list else: - assert isinstance( - in_out_list, - core.VarBase), "{} must be Variable or list of Variable".format( - name) + if core._in_eager_mode(): + assert isinstance( + in_out_list, core.eager. + Tensor), "{} must be Tensor or list of Tensor".format(name) + else: + assert isinstance( + in_out_list, core.VarBase + ), "{} must be Variable or list of Variable".format(name) return [in_out_list] outputs = check_in_out(outputs, 'outputs') @@ -586,9 +595,14 @@ def grad(outputs, for each_var in grad_outputs: if each_var is not None: - assert isinstance( - each_var, core.VarBase - ), "grad_outputs must be None, a Variable or a list containing None or Variables" + if core._in_eager_mode(): + assert isinstance( + each_var, core.eager.Tensor + ), "grad_outputs must be None, a Variable or a list containing None or Variables" + else: + assert isinstance( + each_var, core.VarBase + ), "grad_outputs must be None, a Variable or a list containing None or Variables" else: grad_outputs = [] @@ -600,14 +614,27 @@ def grad(outputs, no_grad_vars = [] elif isinstance(no_grad_vars, core.VarBase): no_grad_vars = [no_grad_vars] + elif isinstance(no_grad_vars, core.eager.Tensor): + no_grad_vars = [no_grad_vars] elif isinstance(no_grad_vars, (list, tuple, set)): no_grad_vars = list(no_grad_vars) for var in no_grad_vars: - assert isinstance( - var, core.VarBase), "no_grad_vars can only contains Variable" + if core._in_eager_mode(): + assert isinstance( + var, + core.eager.Tensor), "no_grad_vars can only contains Tensor" + else: + assert isinstance( + var, + core.VarBase), "no_grad_vars can only contains Variable" else: - raise AssertionError( - "no_grad_vars must be None, Variable or list/tuple/set of Variables") + if core._in_eager_mode(): + raise AssertionError( + "no_grad_vars must be None, Tensor or list/tuple/set of Tensors") + else: + raise AssertionError( + "no_grad_vars must be None, Variable or list/tuple/set of Variables" + ) assert isinstance(create_graph, bool), "create_graph must be True or False" @@ -622,6 +649,11 @@ def grad(outputs, assert isinstance(only_inputs, bool), "only_inputs must be True or False" assert only_inputs, "only_inputs=False is not supported yet" + if core._in_eager_mode(): + return core.eager.run_partial_grad( + outputs, inputs, grad_outputs, 
retain_graph, create_graph, + only_inputs, allow_unused, no_grad_vars) + place = core.Place() place.set_place(framework._current_expected_place()) return core.dygraph_partial_grad(inputs, outputs, grad_outputs, diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 27aec284de4..98ef339e045 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -52,7 +52,7 @@ class EagerScaleTestCase(unittest.TestCase): out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True) self.assertIsNone(data_eager.grad) out_eager.backward(grad_eager, False) - self.assertTrue(data_eager.grad._is_initialized()) + self.assertIsNotNone(data_eager.grad) self.assertTrue(np.array_equal(data_eager.grad.numpy(), input_data)) def test_retain_grad_and_run_backward_raises(self): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py index cd4ba5b0542..7436e9eb7b1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,6 +19,9 @@ from paddle.vision.models import resnet50, resnet101 import unittest from unittest import TestCase import numpy as np +import paddle.compat as cpt +from paddle.fluid.framework import _test_eager_guard +import paddle.fluid.core as core def _dygraph_guard_(func): @@ -40,6 +43,80 @@ def random_var(size, low=-1, high=1, dtype='float32'): return fluid.dygraph.to_variable(x_np) +class TestEagerGrad(TestCase): + def func_simple_example_eager_grad(self): + np.random.seed(2021) + paddle.set_device('cpu') + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + out = paddle.matmul(x, y) + dx = fluid.dygraph.grad(out, x) + + dout = np.ones_like(np_y) + expected_dx = np.matmul(dout, np.transpose(np_y)) + + # stop_gradient = !create_graph, create_graph default false + self.assertEqual(dx[0].stop_gradient, True) + self.assertTrue(np.allclose(dx[0].numpy(), expected_dx[0])) + + def test_simple_example_eager_grad(self): + with _test_eager_guard(): + self.func_simple_example_eager_grad() + self.func_simple_example_eager_grad() + + def func_simple_example_eager_grad_allow_unused(self): + np.random.seed(2021) + paddle.set_device('cpu') + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + dx = fluid.dygraph.grad(out, [x, z], allow_unused=True) + dout = np.ones_like(np_y) + expected_dx = np.matmul(dout, np.transpose(np_y)) + self.assertTrue(np.allclose(dx[0].numpy(), expected_dx[0])) + # stop_gradient = !create_graph, create_graph default false + self.assertEqual(dx[0].stop_gradient, True) + # x is unused input in the 
graph + self.assertEqual(dx[1], None) + + def test_simple_example_eager_grad_allow_unused(self): + with _test_eager_guard(): + self.func_simple_example_eager_grad_allow_unused() + self.func_simple_example_eager_grad_allow_unused() + + def func_simple_example_eager_grad_not_allow_unused(self): + np.random.seed(2021) + paddle.set_device('cpu') + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + try: + # allow_unused is false in default + dx = fluid.dygraph.grad(out, [x, z]) + except ValueError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("allow_unused") > 0 + + def test_simple_example_eager_grad_not_allow_unused(self): + with _test_eager_guard(): + self.func_simple_example_eager_grad_not_allow_unused() + self.func_simple_example_eager_grad_not_allow_unused() + + class TestDygraphDoubleGrad(TestCase): def setUp(self): self.sort_sum_gradient = False @@ -64,7 +141,7 @@ class TestDygraphDoubleGrad(TestCase): allow_unused=allow_unused) @dygraph_guard - def test_exception(self): + def func_exception(self): with self.assertRaises(AssertionError): self.grad(None, None) @@ -93,8 +170,13 @@ class TestDygraphDoubleGrad(TestCase): with self.assertRaises(AssertionError): self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1) + def test_exception(self): + with _test_eager_guard(): + self.func_exception() + self.func_exception() + @dygraph_guard - def test_simple_example(self): + def func_simple_example(self): x = random_var(self.shape) x.stop_gradient = False y = x + 1 @@ -123,8 +205,44 @@ class TestDygraphDoubleGrad(TestCase): self.assertNotEqual(grad_with_none_and_not_none.stop_gradient, create_graph) + def test_simple_example(self): + with _test_eager_guard(): + self.func_simple_example() + self.func_simple_example() + @dygraph_guard - def test_none_one_initial_gradient(self): + def func_example_no_grad_vars(self): + x = random_var(self.shape) + x_np = x.numpy() + numel = x_np.size + x.stop_gradient = False + + y1 = fluid.layers.relu(x) + y2 = fluid.layers.relu(x) + z = y1 + y2 + w = z * z + + w_mean = fluid.layers.reduce_mean(w) + del y1, z, w + + dx_actual, = self.grad( + [w_mean], [x], create_graph=True, no_grad_vars=[y2]) + + self.assertFalse(y2.stop_gradient) + self.assertFalse(dx_actual.stop_gradient) + + dx_expected = (1.0 / float(numel) * (np.maximum(x_np, 0) + y2.numpy()) * + (x_np > 0) * 2).astype('float32') + + self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) + + def test_example_no_grad_vars(self): + with _test_eager_guard(): + self.func_example_no_grad_vars() + self.func_example_no_grad_vars() + + @dygraph_guard + def func_none_one_initial_gradient(self): numel = 1 for s in self.shape: numel *= s @@ -190,8 +308,13 @@ class TestDygraphDoubleGrad(TestCase): np.array_equal(grad_z.numpy(), original_random_grad_z)) + def test_none_one_initial_gradient(self): + with _test_eager_guard(): + self.func_none_one_initial_gradient() + self.func_none_one_initial_gradient() + @dygraph_guard - def test_example_with_gradient_accumulation_and_create_graph(self): + def func_example_with_gradient_accumulation_and_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -214,25 +337,33 @@ class 
TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward(retain_graph=True) - - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 2 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) - - for i in range(5): + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) loss.backward(retain_graph=True) + x_grad_actual = x.gradient() - x_grad_expected = (i + 2) * (2.0 / float(numel) * ( + x_grad_expected = (2.0 / float(numel) * ( x_np + dx_expected * (x_np > 0) * 2 / float(numel))).astype('float32') self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + for i in range(5): + loss.backward(retain_graph=True) + x_grad_actual = x.gradient() + x_grad_expected = (i + 2) * (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 2 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_create_graph() + self.func_example_with_gradient_accumulation_and_create_graph() + @dygraph_guard - def test_example_with_gradient_accumulation_and_no_grad_vars(self): + def func_example_with_gradient_accumulation_and_no_grad_vars(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -256,17 +387,25 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 4 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 4 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_no_grad_vars(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_no_grad_vars() + self.func_example_with_gradient_accumulation_and_no_grad_vars() @dygraph_guard - def test_example_with_gradient_accumulation_and_not_create_graph(self): + def func_example_with_gradient_accumulation_and_not_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -289,12 +428,20 @@ class TestDygraphDoubleGrad(TestCase): self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def 
test_example_with_gradient_accumulation_and_not_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_not_create_graph() + self.func_example_with_gradient_accumulation_and_not_create_graph() class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): @@ -304,7 +451,7 @@ class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): class TestDygraphDoubleGradVisitedUniq(TestCase): - def test_compare(self): + def func_compare(self): value = np.random.uniform(-0.5, 0.5, 100).reshape(10, 2, 5).astype("float32") @@ -349,6 +496,11 @@ class TestDygraphDoubleGradVisitedUniq(TestCase): self.assertTrue(np.array_equal(grad_1, grad_2)) + def test_compare(self): + with _test_eager_guard(): + self.func_compare() + self.func_compare() + class TestRaiseNoDoubleGradOp(TestCase): def raise_no_grad_op(self): diff --git a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py index 2ffe523ef6d..531e9663a2b 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_imperative_double_grad.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ import unittest from unittest import TestCase import numpy as np import paddle +from paddle.fluid.framework import _test_eager_guard +import paddle.fluid.core as core def _dygraph_guard_(func): @@ -62,7 +64,7 @@ class TestDygraphDoubleGrad(TestCase): allow_unused=allow_unused) @dygraph_guard - def test_exception(self): + def func_exception(self): with self.assertRaises(AssertionError): self.grad(None, None) @@ -91,8 +93,13 @@ class TestDygraphDoubleGrad(TestCase): with self.assertRaises(AssertionError): self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1) + def test_exception(self): + with _test_eager_guard(): + self.func_exception() + self.func_exception() + @dygraph_guard - def test_simple_example(self): + def func_simple_example(self): x = random_var(self.shape) x.stop_gradient = False y = x + 1 @@ -121,8 +128,13 @@ class TestDygraphDoubleGrad(TestCase): self.assertNotEqual(grad_with_none_and_not_none.stop_gradient, create_graph) + def test_simple_example(self): + with _test_eager_guard(): + self.func_simple_example() + self.func_simple_example() + @dygraph_guard - def test_none_one_initial_gradient(self): + def func_none_one_initial_gradient(self): numel = 1 for s in self.shape: numel *= s @@ -188,8 +200,13 @@ class TestDygraphDoubleGrad(TestCase): np.array_equal(grad_z.numpy(), original_random_grad_z)) + def test_none_one_initial_gradient(self): + with _test_eager_guard(): + self.func_none_one_initial_gradient() + self.func_none_one_initial_gradient() + @dygraph_guard - def test_example_with_gradient_accumulation_and_create_graph(self): + def func_example_with_gradient_accumulation_and_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -212,17 +229,25 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * 
dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 2 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 2 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_create_graph() + self.func_example_with_gradient_accumulation_and_create_graph() @dygraph_guard - def test_example_with_gradient_accumulation_and_no_grad_vars(self): + def func_example_with_gradient_accumulation_and_no_grad_vars(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -246,17 +271,25 @@ class TestDygraphDoubleGrad(TestCase): (x_np > 0) * 2).astype('float32') self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() + + x_grad_actual = x.gradient() + x_grad_expected = (2.0 / float(numel) * ( + x_np + dx_expected * + (x_np > 0) * 4 / float(numel))).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) - x_grad_actual = x.gradient() - x_grad_expected = (2.0 / float(numel) * - (x_np + dx_expected * - (x_np > 0) * 4 / float(numel))).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + def test_example_with_gradient_accumulation_and_no_grad_vars(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_no_grad_vars() + self.func_example_with_gradient_accumulation_and_no_grad_vars() @dygraph_guard - def test_example_with_gradient_accumulation_and_not_create_graph(self): + def func_example_with_gradient_accumulation_and_not_create_graph(self): x = random_var(self.shape) x_np = x.numpy() numel = x_np.size @@ -279,12 +312,20 @@ class TestDygraphDoubleGrad(TestCase): self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) - loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) - loss.backward() + if core._in_eager_mode(): + pass + else: + loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) + loss.backward() - x_grad_actual = x.gradient() - x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') - self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + x_grad_actual = x.gradient() + x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') + self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) + + def test_example_with_gradient_accumulation_and_not_create_graph(self): + with _test_eager_guard(): + self.func_example_with_gradient_accumulation_and_not_create_graph() + self.func_example_with_gradient_accumulation_and_not_create_graph() class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): -- GitLab From ade721086a2761ecddd3b6127eafd65336c1a8d3 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Thu, 17 Mar 2022 10:55:19 +0800 Subject: [PATCH 130/176] fix xpu compile error: introduced by scalar.cc (#40630) --- paddle/phi/common/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt index 0947870dcd3..9bf69270386 100644 --- 
a/paddle/phi/common/CMakeLists.txt +++ b/paddle/phi/common/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(phi_place SRCS place.cc) -cc_library(scalar SRCS scalar.cc) +cc_library(scalar SRCS scalar.cc DEPS phi_enforce) -- GitLab From e3a67782a7bb27505c9dd67515f4dcdfc16c513a Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Thu, 17 Mar 2022 11:00:29 +0800 Subject: [PATCH 131/176] add time of unittests for dataparallel in dygraph mode (#40639) --- .../fluid/tests/unittests/CMakeLists.txt | 4 ++-- ...llel_dygraph_dataparallel_in_eager_mode.py | 20 ++++++++++++++++--- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index cbe360f556c..c82172780b7 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1118,9 +1118,9 @@ set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 120) set_tests_properties(test_split_program PROPERTIES TIMEOUT 120) if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120) - set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 150) set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 200) - set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 150) set_tests_properties(test_parallel_dygraph_no_sync_gradient_check PROPERTIES TIMEOUT 30) set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py index 8ff68a1ce0d..91c340c35d4 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_dataparallel_in_eager_mode.py @@ -19,6 +19,7 @@ import unittest import os import numpy as np import random +import socket import paddle import paddle.nn as nn @@ -31,13 +32,26 @@ from paddle.optimizer import SGD from paddle.fluid.initializer import NumpyArrayInitializer +def net_is_used(port, ip='127.0.0.1'): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + s.connect((ip, port)) + s.shutdown(2) + return True + except Exception as e: + return False + + def init_process_group(strategy=None): nranks = ParallelEnv().nranks rank = ParallelEnv().local_rank is_master = True if rank == 0 else False - store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master, nranks) - group = core.ProcessGroupNCCL(store, rank, nranks) - return group + for port in range(20000, 21000): + if not net_is_used(port): + store = paddle.fluid.core.TCPStore("127.0.0.1", port, is_master, + nranks) + group = core.ProcessGroupNCCL(store, rank, nranks) + return group class LinearModel(nn.Layer): -- GitLab From 06fee998e3d727d3e40fb3e70ed7083fe2053c66 Mon Sep 17 00:00:00 2001 From: baoachun <962571062@qq.com> Date: Thu, 17 Mar 2022 11:03:58 +0800 Subject: [PATCH 132/176] support gpu mixed precision inference (#40531) --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../ir/mixed_precision_configure_pass.cc | 149 
++++++++++++++++++ .../ir/mixed_precision_configure_pass.h | 39 +++++ paddle/fluid/inference/analysis/argument.h | 3 + .../inference/analysis/ir_pass_manager.cc | 4 + .../ir_params_sync_among_devices_pass.cc | 66 ++++++-- .../ir_params_sync_among_devices_pass.h | 7 +- paddle/fluid/inference/api/analysis_config.cc | 33 ++++ .../fluid/inference/api/analysis_predictor.cc | 5 + .../api/analysis_predictor_tester.cc | 26 +++ .../inference/api/paddle_analysis_config.h | 16 ++ .../inference/api/paddle_pass_builder.cc | 34 ++++ .../fluid/inference/api/paddle_pass_builder.h | 12 ++ paddle/fluid/pybind/inference_api.cc | 3 + 14 files changed, 385 insertions(+), 13 deletions(-) create mode 100644 paddle/fluid/framework/ir/mixed_precision_configure_pass.cc create mode 100644 paddle/fluid/framework/ir/mixed_precision_configure_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 623c8a048c2..7aaaef712a6 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -97,6 +97,7 @@ pass_library(layer_norm_fuse_pass inference) pass_library(add_support_int8_pass inference) pass_library(matmul_scale_fuse_pass inference) pass_library(gpu_cpu_map_matmul_to_mul_pass inference) +pass_library(mixed_precision_configure_pass inference) pass_library(generate_pass DEPS pass_desc_proto) target_link_libraries(generate_pass pass_desc_proto) diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc new file mode 100644 index 00000000000..4aa59d9196b --- /dev/null +++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.cc @@ -0,0 +1,149 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/mixed_precision_configure_pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +void MixedPrecisionConfigurePass::InsertCastOps( + Graph* graph, const StringSet& blacklist) const { + VLOG(3) << "Insert the cast op before and after the kernel that does not " + "supports fp16 precision"; + + auto update_cast_desc = [&]( + framework::OpDesc& desc, const std::string& x_name, + const std::string& out_name, const int in_dtype, const int out_dtype) { + desc.SetType("cast"); + desc.SetInput("X", {x_name}); + desc.SetOutput("Out", {out_name}); + desc.SetAttr("in_dtype", in_dtype); + desc.SetAttr("out_dtype", out_dtype); + desc.SetAttr("use_mkldnn", false); + desc.SetAttr("with_quant_attr", false); + desc.Flush(); + }; + + auto cast_input = [&](Graph* graph, Node* op_node, + const StringSet& cast_list) { + auto inlinks = op_node->inputs; + for (auto* pre_node : inlinks) { + if (pre_node->IsVar()) { + const auto is_persistable = pre_node->Var()->Persistable(); + const auto is_float = + pre_node->Var()->GetDataType() == proto::VarType::FP16 || + pre_node->Var()->GetDataType() == proto::VarType::FP32 || + pre_node->Var()->GetDataType() == proto::VarType::FP64; + if (!is_persistable && is_float) { + int suffix = 0; + for (auto* pre_node_input : pre_node->inputs) { + if (!pre_node_input->IsOp()) continue; + const auto& type = pre_node_input->Op()->Type(); + if (!cast_list.count(type) && type != "cast") { + std::string old_name = pre_node->Name(); + std::string new_name = + old_name + "_cast.tmp_" + std::to_string(suffix); + suffix++; + + framework::OpDesc new_op_desc(op_node->Op()->Block()); + // 4 for fp16, 5 for fp32 + update_cast_desc(new_op_desc, old_name, new_name, 4, 5); + auto* new_op = graph->CreateOpNode(&new_op_desc); + + VarDesc out_var(new_name); + out_var.SetPersistable(false); + auto* node_var = graph->CreateVarNode(&out_var); + + op_node->Op()->RenameInput(old_name, new_name); + IR_NODE_LINK_TO(pre_node, new_op); + IR_NODE_LINK_TO(new_op, node_var); + IR_NODE_LINK_TO(node_var, op_node); + } + } + } + } + } + }; + + auto cast_output = [&](Graph* graph, Node* op_node, + const StringSet& cast_list) { + auto outlinks = op_node->outputs; + for (auto* next_node : outlinks) { + if (next_node->IsVar()) { + const auto is_persistable = next_node->Var()->Persistable(); + const auto is_float = + next_node->Var()->GetDataType() == proto::VarType::FP16 || + next_node->Var()->GetDataType() == proto::VarType::FP32 || + next_node->Var()->GetDataType() == proto::VarType::FP64; + if (!is_persistable && is_float) { + int suffix = 0; + for (auto* next_node_output : next_node->outputs) { + if (!next_node_output->IsOp()) continue; + + const auto& type = next_node_output->Op()->Type(); + if (!cast_list.count(type) && type != "cast") { + std::string old_name = next_node->Name(); + std::string new_name = + old_name + "_cast.tmp_" + std::to_string(suffix); + suffix++; + + framework::OpDesc new_op_desc(op_node->Op()->Block()); + // 4 for fp16, 5 for fp32 + update_cast_desc(new_op_desc, old_name, new_name, 5, 4); + auto* new_op = graph->CreateOpNode(&new_op_desc); + + VarDesc out_var(new_name); + out_var.SetPersistable(false); + auto* node_var = graph->CreateVarNode(&out_var); + + next_node_output->Op()->RenameInput(old_name, new_name); + IR_NODE_LINK_TO(next_node, new_op); + IR_NODE_LINK_TO(new_op, node_var); + IR_NODE_LINK_TO(node_var, 
next_node_output); + } + } + } + } + } + }; + + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + const auto& type = op_node->Op()->Type(); + if (blacklist.count(type)) { + cast_input(graph, op_node, blacklist); + cast_output(graph, op_node, blacklist); + } + } +} + +void MixedPrecisionConfigurePass::ApplyImpl(Graph* graph) const { + const auto blacklist = + Get>("gpu_fp16_disabled_op_types"); + InsertCastOps(graph, blacklist); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(mixed_precision_configure_pass, + paddle::framework::ir::MixedPrecisionConfigurePass); diff --git a/paddle/fluid/framework/ir/mixed_precision_configure_pass.h b/paddle/fluid/framework/ir/mixed_precision_configure_pass.h new file mode 100644 index 00000000000..fc5a612ecb8 --- /dev/null +++ b/paddle/fluid/framework/ir/mixed_precision_configure_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +using StringSet = std::unordered_set; + +class MixedPrecisionConfigurePass : public FusePassBase { + public: + MixedPrecisionConfigurePass() = default; + virtual ~MixedPrecisionConfigurePass() {} + + protected: + void ApplyImpl(Graph* graph) const override; + + private: + void InsertCastOps(Graph* graph, const StringSet& blacklist) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index a5c32164bf1..74e8ca3f229 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -188,6 +188,9 @@ struct Argument { DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(use_fc_padding, UseFcPadding, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); + DECL_ARGUMENT_FIELD(use_gpu_fp16, UseGPUFp16, bool); + DECL_ARGUMENT_FIELD(gpu_fp16_disabled_op_types, GpuFp16DisabledOpTypes, + std::unordered_set); // Usually use for trt dynamic shape. 
// TRT will select the best kernel according to opt shape diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 796c86a3ad1..287c896e49b 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -189,6 +189,10 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->dlnne_min_subgraph_size())); pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); + } else if (pass_name == "mixed_precision_configure_pass") { + pass->Set("gpu_fp16_disabled_op_types", + new std::unordered_set( + argument->gpu_fp16_disabled_op_types())); } if (pass_name == "lite_subgraph_pass") { bool lite_enable_int8 = diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index daa18d8c78b..614eea24a0e 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/enforce.h" @@ -65,6 +66,26 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToNpu(Argument *argument) { #else +void IrParamsSyncAmongDevicesPass::GetVarNameToOpTypeMap( + const framework::ir::Graph &graph, + std::unordered_map *var_name_op_type_map) { + std::vector node_list = + framework::ir::TopologyVarientSort( + graph, static_cast(0)); + for (auto *op_node : node_list) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + for (auto *pre_node : op_node->inputs) { + if (pre_node->IsVar() && pre_node->Var()->Persistable()) { + var_name_op_type_map->insert(std::pair( + pre_node->Var()->Name(), op_node->Op()->Type())); + } + } + } +} + void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. if (!argument->use_gpu()) return; @@ -102,6 +123,16 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { if (with_dynamic_shape) { reserve_cpu_weights = true; } + + bool mixed_precision_mode = + argument->Has("use_gpu_fp16") && argument->use_gpu_fp16(); + std::unordered_map var_name_op_type_map{}; + std::unordered_set blacklist{}; + if (mixed_precision_mode) { + GetVarNameToOpTypeMap(graph, &var_name_op_type_map); + blacklist = argument->gpu_fp16_disabled_op_types(); + } + for (auto &var_name : all_vars) { if (std::count(repetitive_params.begin(), repetitive_params.end(), var_name)) { @@ -117,18 +148,29 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { var->IsType()) { auto *t = var->GetMutable(); - platform::CPUPlace cpu_place; - framework::LoDTensor temp_tensor; - temp_tensor.Resize(t->dims()); - temp_tensor.mutable_data(cpu_place); - - // Copy the parameter data to a tmp tensor. - paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); - // Reallocation the space on GPU - t->clear(); - - // Copy parameter data to newly allocated GPU space. 
- paddle::framework::TensorCopySync(temp_tensor, place, t); + bool is_float = t->dtype() == paddle::experimental::DataType::FLOAT32 || + t->dtype() == paddle::experimental::DataType::FLOAT64; + if (mixed_precision_mode && + !blacklist.count(var_name_op_type_map[var_name]) && is_float) { + framework::Tensor half_tensor; + half_tensor.set_type(paddle::experimental::DataType::FLOAT16); + half_tensor.Resize(t->dims()); + auto *half_data = + half_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < t->numel(); i++) { + auto *data = t->mutable_data(platform::CPUPlace()); + half_data[i] = static_cast(data[i]); + } + t->clear(); + paddle::framework::TensorCopySync(half_tensor, place, t); + } else { + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(t->dims()); + paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); + t->clear(); + paddle::framework::TensorCopySync(temp_tensor, place, t); + } } } } diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index d5e98ec886e..f8209f051d5 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -38,7 +38,12 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { #ifdef PADDLE_WITH_ASCEND_CL void CopyParamsToNpu(Argument *argument); #else - void CopyParamsToGpu(Argument *argument); + + void GetVarNameToOpTypeMap( + const framework::ir::Graph& graph, + std::unordered_map* var_name_op_type_map); + + void CopyParamsToGpu(Argument* argument); #endif }; diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 41c01d3b7e2..d08d28a3f62 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -83,6 +83,7 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path, Update(); } + void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -97,12 +98,26 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, Update(); } + void AnalysisConfig::DisableGpu() { use_gpu_ = false; Update(); } +void AnalysisConfig::Exp_EnableUseGpuFp16( + std::unordered_set op_list) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + use_gpu_fp16_ = true; + gpu_fp16_disabled_op_types_.insert(op_list.begin(), op_list.end()); +#else + LOG(ERROR) << "Please compile with gpu to Exp_EnableUseGpuFp16()"; + use_gpu_fp16_ = false; +#endif + + Update(); +} + void AnalysisConfig::DisableFCPadding() { use_fc_padding_ = false; @@ -213,6 +228,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_cudnn_); CP_MEMBER(gpu_device_id_); CP_MEMBER(memory_pool_init_size_mb_); + CP_MEMBER(use_gpu_fp16_); + CP_MEMBER(gpu_fp16_disabled_op_types_); CP_MEMBER(enable_memory_optim_); // TensorRT related. 
@@ -573,6 +590,20 @@ void AnalysisConfig::Update() { #endif } + if (use_gpu_fp16_) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (!enable_ir_optim_) { + LOG(ERROR) << "Exp_EnableUseGpuFp16() only works when IR optimization is " + "enabled."; + } else if (!use_gpu()) { + LOG(ERROR) + << "Exp_EnableUseGpuFp16() only works when use_gpu is enabled."; + } else { + pass_builder()->Exp_EnableUseGpuFp16(); + } +#endif + } + if (use_mkldnn_) { #ifdef PADDLE_WITH_MKLDNN if (!enable_ir_optim_) { @@ -669,6 +700,8 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << params_file_; ss << use_gpu_; + ss << use_gpu_fp16_; + for (auto &item : gpu_fp16_disabled_op_types_) ss << item; ss << use_fc_padding_; ss << gpu_device_id_; ss << xpu_device_id_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 871ed596a3e..6f765ef415e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -872,6 +872,11 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_); } + if (config_.gpu_fp16_enabled()) { + argument_.SetUseGPUFp16(true); + argument_.SetGpuFp16DisabledOpTypes(config_.gpu_fp16_disabled_op_types_); + } + if (config_.lite_engine_enabled()) { argument_.SetCpuMathLibraryNumThreads( config_.cpu_math_library_num_threads()); diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 2c6e8f4f1a4..ecb5eaf9825 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -375,6 +375,19 @@ TEST(AnalysisPredictor, enable_onnxruntime) { ASSERT_TRUE(!config.use_onnxruntime()); } +TEST(AnalysisPredictor, exp_enable_use_gpu_fp16) { + AnalysisConfig config; + config.SwitchIrOptim(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + config.EnableUseGpu(100, 0); + config.Exp_EnableUseGpuFp16(); + ASSERT_TRUE(config.gpu_fp16_enabled()); +#else + config.DisableGpu(); +#endif + LOG(INFO) << config.Summary(); +} + } // namespace paddle namespace paddle_infer { @@ -434,6 +447,19 @@ TEST(Predictor, EnableONNXRuntime) { auto predictor = CreatePredictor(config); } +TEST(Predictor, Exp_EnableUseGpuFp16) { + Config config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + config.EnableUseGpu(100, 0); + config.Exp_EnableUseGpuFp16(); +#else + config.DisableGpu(); +#endif + auto predictor = CreatePredictor(config); +} + TEST(Tensor, CpuShareExternalData) { Config config; config.SetModel(FLAGS_dirname); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 7b765e3fa8a..bdfe0e46e9c 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -253,6 +253,19 @@ struct PD_INFER_DECL AnalysisConfig { /// /// void DisableGpu(); + /// + /// \brief Enable GPU fp16 precision computation, in experimental state. + /// + /// \param op_list The operator type list. + /// + void Exp_EnableUseGpuFp16(std::unordered_set op_list = {}); + /// + /// \brief A boolean state telling whether the GPU fp16 precision is turned + /// on. + /// + /// \return bool Whether the GPU fp16 precision is turned on. + /// + bool gpu_fp16_enabled() const { return use_gpu_fp16_; } /// /// \brief Turn on XPU. 
@@ -859,6 +872,9 @@ struct PD_INFER_DECL AnalysisConfig { int gpu_device_id_{0}; uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. bool thread_local_stream_{false}; + bool use_gpu_fp16_{false}; + std::unordered_set gpu_fp16_disabled_op_types_{ + "conv2d_fusion", "conv2d", "roll", "strided_slice"}; bool use_cudnn_{false}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 22d9dedb32e..95975d8f2a8 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -172,6 +172,40 @@ void GpuPassStrategy::EnableCUDNN() { use_cudnn_ = true; } +void GpuPassStrategy::Exp_EnableUseGpuFp16() { + passes_.assign({ + "is_test_pass", // + "simplify_with_basic_ops_pass", // + "conv_bn_fuse_pass", // + "conv_eltwiseadd_bn_fuse_pass", // + "embedding_eltwise_layernorm_fuse_pass", // + "multihead_matmul_fuse_pass_v2", // + "gpu_cpu_squeeze2_matmul_fuse_pass", // + "gpu_cpu_reshape2_matmul_fuse_pass", // + "gpu_cpu_flatten2_matmul_fuse_pass", // + "gpu_cpu_map_matmul_v2_to_mul_pass", // + "gpu_cpu_map_matmul_v2_to_matmul_pass", // + "gpu_cpu_map_matmul_to_mul_pass", // + // "fc_fuse_pass", // + "fc_elementwise_layernorm_fuse_pass", // +#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be + // guaranteed at least v7 +// cudnn8.0 has memory leak problem in conv + eltwise + act, so we +// disable the pass. +#if !(CUDNN_VERSION >= 8000 && CUDNN_VERSION < 8100) + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // +#endif + "conv_elementwise_add_fuse_pass", // +#endif // + "transpose_flatten_concat_fuse_pass", // + "mixed_precision_configure_pass", // + "runtime_context_cache_pass" // + }); + + use_gpu_fp16_ = true; +} + void GpuPassStrategy::EnableMKLDNN() { LOG(ERROR) << "GPU not support MKLDNN yet"; } diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 351cf71e5ca..02290ed33ff 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -125,6 +125,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \brief Enable the use of cuDNN kernel. virtual void EnableCUDNN() {} + /// \brief Enable use gpu fp16 kernel. + virtual void Exp_EnableUseGpuFp16() {} + /// \brief Enable the use of MKLDNN. /// The MKLDNN control exists in both CPU and GPU mode, because there can /// still be some CPU kernels running in GPU mode. @@ -140,6 +143,10 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \return A bool variable implying whether we are in gpu mode. bool use_gpu() const { return use_gpu_; } + /// \brief Check if we are using gpu fp16 kernel. + /// \return A bool variable implying whether we are in gpu fp16 mode. + bool use_gpu_fp16() const { return use_gpu_fp16_; } + /// \brief Check if we are using xpu. /// \return A bool variable implying whether we are in xpu mode. bool use_xpu() const { return use_xpu_; } @@ -162,6 +169,7 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { bool use_npu_{false}; bool use_ipu_{false}; bool use_mkldnn_{false}; + bool use_gpu_fp16_{false}; /// \endcond }; @@ -223,6 +231,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { /// \brief Enable the use of cuDNN kernel. void EnableCUDNN() override; + /// \brief Enable the use of gpu fp16 kernel. 
+ void Exp_EnableUseGpuFp16() override; + /// \brief Not supported in GPU mode yet. void EnableMKLDNN() override; @@ -238,6 +249,7 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { protected: /// \cond Protected bool use_cudnn_{false}; + bool use_gpu_fp16_{false}; /// \endcond }; diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index b008308e27d..c8f0acd0b8a 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -551,6 +551,9 @@ void BindAnalysisConfig(py::module *m) { .def("params_file", &AnalysisConfig::params_file) .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu, py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0) + .def("exp_enable_use_gpu_fp16", &AnalysisConfig::Exp_EnableUseGpuFp16, + py::arg("gpu_fp16_disabled_op_types") = + std::unordered_set({})) .def("enable_xpu", &AnalysisConfig::EnableXpu, py::arg("l3_workspace_size") = 16 * 1024 * 1024, py::arg("locked") = false, py::arg("autotune") = true, -- GitLab From 81848fff05468c5985614ef3fc6c9d4d4647fbf6 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Thu, 17 Mar 2022 13:35:45 +0800 Subject: [PATCH 133/176] [fleet executor] fleet executor for npu (#40607) --- .../distributed/fleet_executor/CMakeLists.txt | 2 +- .../distributed/fleet_executor/message_bus.cc | 12 ++++-------- .../distributed/fleet_executor/message_bus.h | 9 +++------ .../fleet_executor/message_service.cc | 3 +-- .../fleet_executor/message_service.h | 3 +-- .../fluid/inference/api/analysis_predictor.cc | 18 ++++++------------ .../fluid/inference/api/analysis_predictor.h | 9 +++------ 7 files changed, 19 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index 3e734b1b9ed..8641b36a1be 100644 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -4,7 +4,7 @@ if(WITH_PYTHON) endif() proto_library(interceptor_message_proto SRCS interceptor_message.proto) -if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) +if(WITH_DISTRIBUTE AND WITH_PSCORE) set(BRPC_DEPS brpc ssl crypto protobuf zlib leveldb snappy gflags glog) else() set(BRPC_DEPS "") diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index 8d2ec5c41d8..80a6b4667aa 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -67,8 +67,7 @@ bool MessageBus::IsInit() const { return is_init_; } MessageBus::~MessageBus() { VLOG(3) << "Message bus releases resource."; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) server_.Stop(1000); server_.Join(); #endif @@ -87,8 +86,7 @@ bool MessageBus::Send(int64_t dst_rank, IsInit(), true, platform::errors::PreconditionNotMet( "Using message bus since it has not been initialized.")); -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) int retry_time = 0; // message bus will retry sending for 10 times while (retry_time < 10) { ++retry_time; @@ -173,8 +171,7 @@ void MessageBus::ListenPort() { LOG(INFO) << "No need listen to port since training on single card."; return; } -#if 
defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // function keep listen the port and handle the message PADDLE_ENFORCE_EQ( server_.AddService(&message_service_, brpc::SERVER_DOESNT_OWN_SERVICE), 0, @@ -203,8 +200,7 @@ void MessageBus::ListenPort() { #endif } -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) bool MessageBus::SendInterRank(int64_t dst_rank, const InterceptorMessage& interceptor_message) { const auto& dst_addr = GetAddr(dst_rank); diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.h b/paddle/fluid/distributed/fleet_executor/message_bus.h index d805ac81606..dfd65fdbc00 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.h +++ b/paddle/fluid/distributed/fleet_executor/message_bus.h @@ -20,8 +20,7 @@ #include #include -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "brpc/channel.h" #include "brpc/server.h" #include "paddle/fluid/distributed/fleet_executor/message_service.h" @@ -64,8 +63,7 @@ class MessageBus final { const std::string& GetAddr(int64_t rank) const; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // send the message inter rank (dst is different rank with src) bool SendInterRank(int64_t dst_rank, const InterceptorMessage& interceptor_message); @@ -81,8 +79,7 @@ class MessageBus final { // the ip needs to be listened std::string addr_; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) MessageServiceImpl message_service_; // brpc server brpc::Server server_; diff --git a/paddle/fluid/distributed/fleet_executor/message_service.cc b/paddle/fluid/distributed/fleet_executor/message_service.cc index c3fff98f684..1c66d83ea34 100644 --- a/paddle/fluid/distributed/fleet_executor/message_service.cc +++ b/paddle/fluid/distributed/fleet_executor/message_service.cc @@ -11,8 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/fleet_executor/message_service.h" #include "brpc/server.h" #include "paddle/fluid/distributed/fleet_executor/global.h" diff --git a/paddle/fluid/distributed/fleet_executor/message_service.h b/paddle/fluid/distributed/fleet_executor/message_service.h index 02f73471e3b..5ab687ff93d 100644 --- a/paddle/fluid/distributed/fleet_executor/message_service.h +++ b/paddle/fluid/distributed/fleet_executor/message_service.h @@ -11,8 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #pragma once #include "brpc/server.h" diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 6f765ef415e..a7caa3e369f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -50,8 +50,7 @@ #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/utils/string/split.h" -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" @@ -374,8 +373,7 @@ static void DisablePrepareDataOpt( } bool AnalysisPredictor::PrepareExecutor() { -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { VLOG(3) << "use_dist_model is enabled, will init FleetExecutor."; return PrepareFleetExecutor(); @@ -393,8 +391,7 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) bool AnalysisPredictor::PrepareFleetExecutor() { VLOG(3) << "AnalysisPredictor::PrepareFleetExecutor()"; if (config_.dist_config().nranks() > 1 && !CommInit()) { @@ -1194,8 +1191,7 @@ std::vector AnalysisPredictor::GetOutputNames() { std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { framework::Scope *scope; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { scope = scope_.get(); } else { @@ -1244,8 +1240,7 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( std::unique_ptr AnalysisPredictor::GetOutputTensor( const std::string &name) { framework::Scope *scope; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { scope = scope_.get(); } else { @@ -1292,8 +1287,7 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( } bool AnalysisPredictor::ZeroCopyRun() { -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) if (config_.dist_config().use_dist_model()) { VLOG(3) << "ZeroCopyRun will use the fleet executor."; inference::Timer timer; diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 21a7e9658bb..d9992f3fbef 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -18,8 +18,7 @@ #include #include #include -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) #include 
"paddle/fluid/distributed/fleet_executor/fleet_executor.h" #endif #include "paddle/fluid/framework/naive_executor.h" @@ -395,8 +394,7 @@ class AnalysisPredictor : public PaddlePredictor { void StatisticShapeRangeInfo(); void CollectShapeRangeInfo(); -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // fleet exe related /// @@ -488,8 +486,7 @@ class AnalysisPredictor : public PaddlePredictor { std::map>> shape_info_; static int clone_num_; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ - !defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // fleet executor related distributed::FleetExecutorDesc executor_desc_; std::shared_ptr fleet_exe_; -- GitLab From c142e37dae03f3544a47a47acd9ff9bd403eb330 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Thu, 17 Mar 2022 13:41:45 +0800 Subject: [PATCH 134/176] Replace PADDLE_WITH_XPU2 with PADDLE_WITH_KP (#40560) * Replace PADDLE_WITH_XPU2 with PADDLE_WITH_KP --- paddle/phi/kernels/funcs/reduce_function.h | 192 ++++++------------ .../kernels/primitive/datamover_primitives.h | 8 + .../primitive/datamover_primitives_xpu2.h | 10 + 3 files changed, 75 insertions(+), 135 deletions(-) diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index 5834f091d9a..85c371e9f9d 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -14,8 +14,8 @@ #pragma once -// CUDA and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +// CUDA, XPU and HIP use same api +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(__xpu__) #include #include @@ -220,7 +220,7 @@ struct IndexCalculator { phi::Array dims; phi::Array strides; phi::Array reduce_strides; -#ifndef PADDLE_WITH_XPU2 +#ifndef PADDLE_WITH_XPU_KP phi::Array divmoders; #endif }; @@ -231,81 +231,65 @@ struct ReduceIndexMapping { HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims) : dim(dims) {} +#ifdef PADDLE_WITH_XPU_KP __device__ __forceinline__ int BlockIdX() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return (cluster_id() / dim.split_num_x % dim.split_num_y); } else { return cluster_id() % dim.split_num_x; } -#else - return blockIdx.x; -#endif } __device__ __forceinline__ int BlockIdY() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return (cluster_id() % dim.split_num_x); } else { return (cluster_id() / dim.split_num_x % dim.split_num_y); } -#else - return blockIdx.y; -#endif } - __device__ __forceinline__ int BlockDimX() { -#ifdef PADDLE_WITH_XPU2 - return dim.deal_size_x; -#else - return blockDim.x; -#endif - } + __device__ __forceinline__ int BlockDimX() { return dim.deal_size_x; } - __device__ __forceinline__ int BlockDimY() { -#ifdef PADDLE_WITH_XPU2 - return 1; -#else - return blockDim.y; -#endif - } + __device__ __forceinline__ int BlockDimY() { return 1; } __device__ __forceinline__ int GridDimX() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return dim.split_num_y; } else { return dim.split_num_x; } -#else - return gridDim.x; -#endif } __device__ __forceinline__ int GridDimY() { -#ifdef PADDLE_WITH_XPU2 if (ReduceLastDim) { return dim.split_num_x; } else { return dim.split_num_y; } -#else - return gridDim.y; -#endif } __device__ __forceinline__ int GetLoopSize() { -#ifdef PADDLE_WITH_XPU2 if 
(ReduceLastDim) { return dim.deal_size_y; } else { return dim.deal_size_x; } + } #else - return 1; + __device__ __forceinline__ int BlockIdX() { return blockIdx.x; } + + __device__ __forceinline__ int BlockIdY() { return blockIdx.y; } + + __device__ __forceinline__ int BlockDimX() { return blockDim.x; } + + __device__ __forceinline__ int BlockDimY() { return blockDim.y; } + + __device__ __forceinline__ int GridDimX() { return gridDim.x; } + + __device__ __forceinline__ int GridDimY() { return gridDim.y; } + + __device__ int GetLoopSize() { return 1; } #endif - } }; // when reduce_type == kReduceLastDim this struct will be used @@ -341,7 +325,7 @@ struct ReduceConfig { // when should_reduce_again is true, we need malloc temp space for temp data void SetOutputData(Ty* y_data, - const phi::GPUContext& dev_ctx, + const KPDevice& dev_ctx, phi::DenseTensor* tmp) { if (should_reduce_again) { tmp->Resize(phi::make_ddim( @@ -640,9 +624,7 @@ struct ReduceConfig { int blocking_size; bool should_reduce_again; bool reduce_last_dim; - Ty* output_data; - dim3 block; dim3 grid; }; @@ -770,9 +752,10 @@ __global__ void ReduceAnyKernel(const Tx* x, kps::Reduce( &reduce_var, &reduce_var, reducer, reduce_last_dim); - if (need_store) { - y[store_offset + i] = static_cast(reduce_var); - } + + Ty result = static_cast(reduce_var); + kps::details::WriteData( + y + store_offset + i, &result, static_cast(need_store)); } } @@ -882,30 +865,18 @@ static void LaunchReduceKernel(const Tx* x_data, dim.SetRem(config.reduce_num % config.block.x, 0, 0); #ifdef PADDLE_WITH_XPU_KP - ReduceAnyKernel<<<8, 64, 0, stream>>>( - x_data, - config.output_data, - reducer, - transform, - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); + auto grid_num = 8; + auto block_num = 64; #else + auto grid_num = config.grid; + auto block_num = config.block; +#endif ReduceAnyKernel<<>>( + OneDimIndexCal><<>>( x_data, config.output_data, reducer, @@ -917,7 +888,6 @@ static void LaunchReduceKernel(const Tx* x_data, reduce_index_calculator, left_index_calculator, dim); -#endif } else { int reduce_rank = config.reduce_strides.size(); @@ -938,30 +908,18 @@ static void LaunchReduceKernel(const Tx* x_data, dim.SetRem(config.reduce_num % config.block.x, 0, 0); #ifdef PADDLE_WITH_XPU_KP - ReduceAnyKernel<<<8, 64, 0, stream>>>( - x_data, - config.output_data, - reducer, - transform, - init, - config.reduce_num, - config.left_num, - config.reduce_last_dim, - reduce_index_calculator, - left_index_calculator, - dim); + auto grid_num = 8; + auto block_num = 64; #else + auto grid_num = config.grid; + auto block_num = config.block; +#endif ReduceAnyKernel<<>>( + IndexCalculator><<>>( x_data, config.output_data, reducer, @@ -973,7 +931,6 @@ static void LaunchReduceKernel(const Tx* x_data, reduce_index_calculator, left_index_calculator, dim); -#endif } if (config.should_reduce_again) { @@ -993,22 +950,9 @@ static void LaunchReduceKernel(const Tx* x_data, kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0); dim.SetRem(config.left_num % block.x, 0, 0); #ifdef PADDLE_WITH_XPU_KP - ReduceHigherDimKernel< - Ty, - Ty, - MPType, - ReduceOp, - kps::IdentityFunctor><<<8, 64, 0, stream>>>( - config.output_data, - y_data, - reducer, - kps::IdentityFunctor(), - init, - config.grid.y, - config.left_num, - config.grid.y, - dim); -#else + grid = 8; + block = 64; +#endif ReduceHigherDimKernel< Ty, Ty, @@ -1024,7 +968,6 @@ static void LaunchReduceKernel(const Tx* x_data, config.left_num, 
config.grid.y, dim); -#endif } } @@ -1038,7 +981,7 @@ CubTensorReduceImpl(const Tx* x_data, Ty* y_data, const TransformOp& transform, int reduce_num, - const phi::GPUContext& dev_ctx, + const KPDevice& dev_ctx, KPStream stream) { auto reducer = ReduceOp(); cub::TransformInputIterator trans_x(x_data, @@ -1077,7 +1020,7 @@ CubTensorReduceImpl(const Tx* x_data, Ty* y_data, const TransformOp& transform, int reduce_num, - const phi::GPUContext& dev_ctx, + const KPDevice& dev_ctx, KPStream stream) { PADDLE_THROW(phi::errors::InvalidArgument( "Tx should not be float16 when using cub::DeviceReduce::Reduce().")); @@ -1087,12 +1030,16 @@ template class ReduceOp, typename TransformOp> -void ReduceKernel(const phi::GPUContext& dev_ctx, +void ReduceKernel(const KPDevice& dev_ctx, const phi::DenseTensor& x, phi::DenseTensor* y, const TransformOp& transform, const std::vector& origin_reduce_dims) { +#ifdef PADDLE_WITH_XPU_KP + auto stream = dev_ctx.x_context()->xpu_stream; +#else auto stream = dev_ctx.stream(); +#endif dev_ctx.Alloc(y); auto x_dim = phi::vectorize(x.dims()); @@ -1149,11 +1096,17 @@ void ReduceKernel(const phi::GPUContext& dev_ctx, 0); #ifdef PADDLE_WITH_XPU_KP + auto grid_num = 8; + auto block_num = 64; +#else + auto grid_num = config.grid; + auto block_num = config.block; +#endif ReduceHigherDimKernel, - TransformOp><<<8, 64, 0, stream>>>( + TransformOp><<>>( x_data, config.output_data, reducer, @@ -1163,23 +1116,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx, config.left_num, config.blocking_size, dim); -#else - ReduceHigherDimKernel< - Tx, - Ty, - MPType, - ReduceOp, - TransformOp><<>>( - x_data, - config.output_data, - reducer, - transform, - reducer.initial(), - config.reduce_num, - config.left_num, - config.blocking_size, - dim); -#endif if (config.should_reduce_again) { dim3 block = dim3(config.block.x, 1, 1); @@ -1189,22 +1125,9 @@ void ReduceKernel(const phi::GPUContext& dev_ctx, dim2.SetRem(config.left_num % config.block.x, 0, 0); #ifdef PADDLE_WITH_XPU_KP - ReduceHigherDimKernel< - Ty, - Ty, - MPType, - ReduceOp, - kps::IdentityFunctor><<<8, 64, 0, stream>>>( - config.output_data, - y_data, - reducer, - kps::IdentityFunctor(config.grid.y), - reducer.initial(), - config.grid.y, - config.left_num, - config.grid.y, - dim2); -#else + grid = 8; + block = 64; +#endif ReduceHigherDimKernel< Ty, Ty, @@ -1220,7 +1143,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx, config.left_num, config.grid.y, dim2); -#endif } return; } diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h index 2f1e2f589c5..1d4181f3b9a 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives.h +++ b/paddle/phi/kernels/primitive/datamover_primitives.h @@ -115,6 +115,14 @@ struct BroadcastConfig { } }; +template +__device__ __forceinline__ void WriteData(T* dst, + T* __restrict__ src, + int num) { + for (int i = 0; i < num; i++) { + dst[i] = src[i]; + } +} #undef INT_BITS } // namespace details diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h index 53a8b7d0c9e..d2cfdbdec30 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h @@ -76,6 +76,16 @@ struct BroadcastConfig { }; #pragma pack() +template +__device__ __forceinline__ void WriteData(T* _global_ptr_ dst, + T* src, + int num) { + if (num > 0) { + LM2GM(src, dst, num * sizeof(T)); + } +} +#undef INT_BITS + } // namespace 
details /** -- GitLab From 9ee03302b15b31c1805b80d744fb4e382b7de0d0 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Thu, 17 Mar 2022 14:05:38 +0800 Subject: [PATCH 135/176] [Phi]Move infershape of top_k/expand_as/kron/searchsorted to phi (#40632) * [Phi]Move infershape of top_k/expand_as/kron/searchsorted to phi * add set_dtype * fix order --- paddle/fluid/operators/expand_as_v2_op.cc | 28 ++----- paddle/fluid/operators/kron_op.cc | 28 ++----- paddle/fluid/operators/searchsorted_op.cc | 61 ++------------ paddle/fluid/operators/top_k_v2_op.cc | 57 ++----------- paddle/phi/infermeta/binary.cc | 99 +++++++++++++++++++++++ paddle/phi/infermeta/binary.h | 13 +++ paddle/phi/infermeta/unary.cc | 49 +++++++++++ paddle/phi/infermeta/unary.h | 9 +++ 8 files changed, 194 insertions(+), 150 deletions(-) diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc index 97a35a34f23..9361edd43bf 100755 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -12,7 +12,9 @@ limitations under the License. */ #include "paddle/fluid/operators/expand_as_v2_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,27 +24,6 @@ using framework::Tensor; class ExpandAsV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAsV2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExpandAsV2"); - auto x_dims = ctx->GetInputDim("X"); - auto target_shape = ctx->Attrs().Get>("target_shape"); - PADDLE_ENFORCE_GE( - target_shape.size(), static_cast(x_dims.size()), - platform::errors::InvalidArgument( - "The rank of target_shape must be greater than or equal " - "to the rank of Input(X). But received Input(X): input " - "rank %u; received target_shape: rank %u.", - x_dims.size(), target_shape.size())); - PADDLE_ENFORCE_LE(target_shape.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of target_shape must be less than or equal " - "to %d. But received: rank %u.", - MAX_RANK_SUPPORTED, target_shape.size())); - ctx->SetOutputDim("Out", phi::make_ddim(target_shape)); - } }; class ExpandAsV2OpMaker : public framework::OpProtoAndCheckerMaker { @@ -116,9 +97,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandAsV2GradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(expand_as_v2, ExpandAsInferShapeFunctor, + PD_INFER_META(phi::ExpandAsInferMeta)); REGISTER_OPERATOR(expand_as_v2, ops::ExpandAsV2Op, ops::ExpandAsV2OpMaker, ops::ExpandAsV2GradOpMaker, - ops::ExpandAsV2GradOpMaker); + ops::ExpandAsV2GradOpMaker, + ExpandAsInferShapeFunctor); REGISTER_OPERATOR(expand_as_v2_grad, ops::ExpandAsV2GradOp, ops::ExpandAsV2GradNoNeedBufVarsInferer); diff --git a/paddle/fluid/operators/kron_op.cc b/paddle/fluid/operators/kron_op.cc index 68d0c7978b4..60390016d66 100644 --- a/paddle/fluid/operators/kron_op.cc +++ b/paddle/fluid/operators/kron_op.cc @@ -17,7 +17,9 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -26,27 +28,6 @@ class KronOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kron"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "kron"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kron"); - - auto dim_x = ctx->GetInputDim("X"); - auto dim_y = ctx->GetInputDim("Y"); - auto rank_x = dim_x.size(); - auto rank_y = dim_y.size(); - auto rank = (rank_x > rank_y) ? rank_x : rank_y; - - std::vector dim_out; - dim_out.reserve(rank); - for (int i = 0; i < rank; i++) { - int64_t dim_xi = (i < rank - rank_x) ? 1 : dim_x.at(i - (rank - rank_x)); - int64_t dim_yi = (i < rank - rank_y) ? 1 : dim_y.at(i - (rank - rank_y)); - dim_out.push_back(dim_xi == -1 || dim_yi == -1 ? -1 : dim_xi * dim_yi); - } - ctx->SetOutputDim("Out", phi::make_ddim(dim_out)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -173,7 +154,10 @@ class KronGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(kron, KronInferShapeFunctor, + PD_INFER_META(phi::KronInferMeta)); REGISTER_OPERATOR(kron, ops::KronOp, ops::KronOpMaker, ops::KronGradOpMaker, - ops::KronGradOpMaker); + ops::KronGradOpMaker, + KronInferShapeFunctor); REGISTER_OPERATOR(kron_grad, ops::KronGradOp); diff --git a/paddle/fluid/operators/searchsorted_op.cc b/paddle/fluid/operators/searchsorted_op.cc index d0290795455..3a6fdbaa261 100644 --- a/paddle/fluid/operators/searchsorted_op.cc +++ b/paddle/fluid/operators/searchsorted_op.cc @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,60 +23,6 @@ namespace operators { class SearchSortedOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - static bool SearchsortedDimsMatchedBeforeLastDim( - const framework::DDim& sequences_dims, - const framework::DDim& values_dims) { - if (sequences_dims.size() != values_dims.size()) { - return false; - } - const auto& sequences_dims_size = sequences_dims.size(); - for (int64_t dim = 0; dim < sequences_dims_size - 1; ++dim) { - if (sequences_dims[dim] != values_dims[dim]) { - return false; - } - } - return true; - } - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("SortedSequence"), "Input", "SortedSequence", - "searchsorted"); - OP_INOUT_CHECK(ctx->HasInput("Values"), "Input", "Values", "searchsorted"); - - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "searchsorted"); - - auto sequences_dims = ctx->GetInputDim("SortedSequence"); - auto values_dims = ctx->GetInputDim("Values"); - auto out_int32 = ctx->Attrs().Get("out_int32"); - - if (sequences_dims.size() != 1) { - PADDLE_ENFORCE_EQ( - SearchsortedDimsMatchedBeforeLastDim(sequences_dims, values_dims), - true, - platform::errors::Unavailable( - "The dimensions of sorted_sequence tensor ( %s ) and values " - "tensor ( %s ) can not match. Because the input sorted_sequence " - "tensor must be 1 dimension or the first N-1 dimensions of " - "sorted_sequence tensor and input values tensor must match. " - "Please input appropriate sorted_sequence and values again! ", - sequences_dims, values_dims)); - } - - if (out_int32) { - PADDLE_ENFORCE_LT( - sequences_dims[sequences_dims.size() - 1], - std::numeric_limits::max(), - platform::errors::Unavailable( - "The size of sorted_sequence %d exceed the maximum limit d%. " - "Because the size of sorted_sequence should be less than the " - "output maximum value for int32 bit. Please set appropriate " - "sorted_sequence to meet this requirement! ", - sequences_dims[sequences_dims.size() - 1], - std::numeric_limits::max())); - } - - ctx->SetOutputDim("Out", values_dims); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -115,4 +63,7 @@ class SearchSortedOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; -REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker); +DECLARE_INFER_SHAPE_FUNCTOR(searchsorted, SearchsortedInferShapeFunctor, + PD_INFER_META(phi::SearchsortedInferMeta)); +REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker, + SearchsortedInferShapeFunctor); diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc index d1add111e1d..0a9ae789b01 100644 --- a/paddle/fluid/operators/top_k_v2_op.cc +++ b/paddle/fluid/operators/top_k_v2_op.cc @@ -14,7 +14,9 @@ limitations under the License. 
*/ #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -23,56 +25,6 @@ class TopkV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "topk_v2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "topk_v2"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "topk_v2"); - - auto input_dims = ctx->GetInputDim("X"); - const int& dim_size = input_dims.size(); - int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_EQ( - (axis < dim_size) && (axis >= (-1 * dim_size)), true, - paddle::platform::errors::InvalidArgument( - "the axis of topk must be [-%d, %d), but you set axis is %d", - dim_size, dim_size, axis)); - - if (axis < 0) axis += dim_size; - - int k; - auto k_is_tensor = ctx->HasInput("K"); - if (k_is_tensor) { - k = -1; - } else { - k = static_cast(ctx->Attrs().Get("k")); - PADDLE_ENFORCE_EQ(k >= 1, true, - paddle::platform::errors::InvalidArgument( - "the attribute of k in the topk must >= 1 or be a " - "Tensor, but received %d .", - k)); - } - - PADDLE_ENFORCE_GE(input_dims.size(), 1, - paddle::platform::errors::InvalidArgument( - "input of topk must have >= 1d shape")); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GE( - input_dims[axis], k, - paddle::platform::errors::InvalidArgument( - "input of topk op must have >= %d columns in axis of %d", k, - axis)); - } - - framework::DDim dims = input_dims; - - dims[axis] = k; - ctx->SetOutputDim("Out", dims); - ctx->SetOutputDim("Indices", dims); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -169,8 +121,11 @@ class TopkV2GradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(top_k_v2, TopKInferShapeFunctor, + PD_INFER_META(phi::TopKInferMeta)); REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker, ops::TopkV2GradOpMaker, - ops::TopkV2GradOpMaker); + ops::TopkV2GradOpMaker, + TopKInferShapeFunctor); REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad); diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 521f2a9bf06..b7a7a4ec231 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -476,6 +476,33 @@ void ElementwiseRawInferMeta(const MetaTensor& x, out->share_lod(x); } +void ExpandAsInferMeta(const MetaTensor& x, + paddle::optional y, + const std::vector& target_shape, + MetaTensor* out) { +#define MAX_RANK_SUPPORTED 6 + auto x_dims = x.dims(); + PADDLE_ENFORCE_GE( + target_shape.size(), + static_cast(x_dims.size()), + phi::errors::InvalidArgument( + "The rank of target_shape must be greater than or equal " + "to the rank of Input(X). But received Input(X): input " + "rank %u; received target_shape: rank %u.", + x_dims.size(), + target_shape.size())); + PADDLE_ENFORCE_LE(target_shape.size(), + MAX_RANK_SUPPORTED, + phi::errors::InvalidArgument( + "The rank of target_shape must be less than or equal " + "to %d. 
But received: rank %u.", + MAX_RANK_SUPPORTED, + target_shape.size())); + out->set_dims(phi::make_ddim(target_shape)); + out->set_dtype(x.dtype()); +#undef MAX_RANK_SUPPORTED +} + void GatherInferMeta(const MetaTensor& x, const MetaTensor& index, const Scalar& axis, @@ -728,6 +755,24 @@ void IndexSelectInferMeta(const MetaTensor& x, output->share_lod(x); } +void KronInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { + auto dim_x = x.dims(); + auto dim_y = y.dims(); + auto rank_x = dim_x.size(); + auto rank_y = dim_y.size(); + auto rank = (rank_x > rank_y) ? rank_x : rank_y; + + std::vector dim_out; + dim_out.reserve(rank); + for (int i = 0; i < rank; i++) { + int64_t dim_xi = (i < rank - rank_x) ? 1 : dim_x.at(i - (rank - rank_x)); + int64_t dim_yi = (i < rank - rank_y) ? 1 : dim_y.at(i - (rank - rank_y)); + dim_out.push_back(dim_xi == -1 || dim_yi == -1 ? -1 : dim_xi * dim_yi); + } + out->set_dims(phi::make_ddim(dim_out)); + out->set_dtype(x.dtype()); +} + void LogLossInferMeta(const MetaTensor& input, const MetaTensor& label, float epsilon, @@ -873,6 +918,60 @@ void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { out->share_lod(x); } +void SearchsortedInferMeta(const MetaTensor& sorted_sequence, + const MetaTensor& value, + bool out_int32, + bool right, + MetaTensor* out) { + auto sequences_dims = sorted_sequence.dims(); + auto values_dims = value.dims(); + + bool flag = true; + if (sequences_dims.size() != values_dims.size()) { + flag = false; + } + const auto& sequences_dims_size = sequences_dims.size(); + for (int64_t dim = 0; dim < sequences_dims_size - 1; ++dim) { + if (sequences_dims[dim] != values_dims[dim]) { + flag = false; + break; + } + } + if (sequences_dims.size() != 1) { + PADDLE_ENFORCE_EQ( + flag, + true, + phi::errors::Unavailable( + "The dimensions of sorted_sequence tensor ( %s ) and values " + "tensor ( %s ) can not match. Because the input sorted_sequence " + "tensor must be 1 dimension or the first N-1 dimensions of " + "sorted_sequence tensor and input values tensor must match. " + "Please input appropriate sorted_sequence and values again! ", + sequences_dims, + values_dims)); + } + + if (out_int32) { + PADDLE_ENFORCE_LT( + sequences_dims[sequences_dims.size() - 1], + std::numeric_limits::max(), + phi::errors::Unavailable( + "The size of sorted_sequence %d exceed the maximum limit d%. " + "Because the size of sorted_sequence should be less than the " + "output maximum value for int32 bit. Please set appropriate " + "sorted_sequence to meet this requirement! 
", + sequences_dims[sequences_dims.size() - 1], + std::numeric_limits::max())); + } + + out->set_dims(values_dims); + if (out_int32) { + out->set_dtype(DataType::INT32); + } else { + out->set_dtype(DataType::INT64); + } +} + void SegmentPoolInferMeta(const MetaTensor& x, const MetaTensor& segment_ids, const std::string& pooltype, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 9e1a35640ad..cb680415e7d 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -90,6 +90,11 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta, int axis, MetaTensor* out); +void ExpandAsInferMeta(const MetaTensor& x, + paddle::optional y, + const std::vector& target_shape, + MetaTensor* out); + void GatherInferMeta(const MetaTensor& x, const MetaTensor& index, const Scalar& axis, @@ -125,6 +130,8 @@ void IndexSelectInferMeta(const MetaTensor& x, int dim, MetaTensor* output); +void KronInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); + void LogLossInferMeta(const MetaTensor& input, const MetaTensor& label, float epsilon, @@ -139,6 +146,12 @@ void MatmulInferMeta(const MetaTensor& x, void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out); +void SearchsortedInferMeta(const MetaTensor& sorted_sequence, + const MetaTensor& value, + bool out_int32, + bool right, + MetaTensor* out); + void SegmentPoolInferMeta(const MetaTensor& x, const MetaTensor& segment_ids, const std::string& pooltype, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index f81f4a1b7c7..bc6ab524277 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1384,6 +1384,55 @@ void TileInferMeta(const MetaTensor& x, } } +void TopKInferMeta(const MetaTensor& x, + const Scalar& k_scalar, + int axis, + bool largest, + bool sorted, + MetaTensor* out, + MetaTensor* indices, + MetaConfig config) { + auto input_dims = x.dims(); + const int& dim_size = input_dims.size(); + PADDLE_ENFORCE_EQ( + (axis < dim_size) && (axis >= (-1 * dim_size)), + true, + phi::errors::InvalidArgument( + "the axis of topk must be [-%d, %d), but you set axis is %d", + dim_size, + dim_size, + axis)); + + if (axis < 0) axis += dim_size; + + int k = k_scalar.to(); + if (k_scalar.FromTensor()) { + k = -1; + } else { + PADDLE_ENFORCE_EQ(k >= 1, + true, + phi::errors::InvalidArgument( + "the attribute of k in the topk must >= 1 or be a " + "Tensor, but received %d .", + k)); + } + + PADDLE_ENFORCE_GE( + input_dims.size(), + 1, + phi::errors::InvalidArgument("input of topk must have >= 1d shape")); + + phi::DDim dims = input_dims; + + dims[axis] = k; + out->set_dims(dims); + out->share_lod(x); + out->set_dtype(x.dtype()); + indices->set_dims(dims); + indices->share_lod(x); + indices->set_dtype(DataType::INT64); +} + void TraceInferMeta( const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out) { int dim1 = axis1; diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index eb894003e53..6cb9653624f 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -215,6 +215,15 @@ void TileInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void TopKInferMeta(const MetaTensor& x, + const Scalar& k_scalar, + int axis, + bool largest, + bool sorted, + MetaTensor* out, + MetaTensor* indices, + MetaConfig config = MetaConfig()); + void TraceInferMeta( const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out); -- GitLab From 
ed8a9370d91ec107440d3b92116d47e0c5b029ac Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 17 Mar 2022 14:08:37 +0800 Subject: [PATCH 136/176] move activation sigmoid (#40626) --- .../eager/tests/task_tests/generated_test.cc | 2 +- .../task_tests/hook_test_intermidiate.cc | 2 +- .../new_executor/standalone_executor_test.cc | 4 +- .../tensorrt/convert/test_activation_op.cc | 2 +- paddle/fluid/operators/activation_op.cc | 28 +- paddle/fluid/operators/activation_op.h | 409 +----------------- paddle/fluid/operators/activation_op.kps | 142 +----- paddle/phi/kernels/activation_grad_kernel.h | 98 +++-- paddle/phi/kernels/activation_kernel.h | 4 +- .../phi/kernels/cpu/activation_grad_kernel.cc | 29 ++ paddle/phi/kernels/cpu/activation_kernel.cc | 9 + paddle/phi/kernels/funcs/activation_functor.h | 317 ++++++++++++++ .../phi/kernels/gpu/activation_grad_kernel.cu | 30 ++ paddle/phi/kernels/gpu/activation_kernel.cu | 9 + .../phi/kernels/impl/activation_grad_impl.h | 53 +++ paddle/phi/ops/compat/activation_sig.cc | 33 +- 16 files changed, 563 insertions(+), 608 deletions(-) diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index 68820443a2d..49e517dc9b3 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -128,6 +128,6 @@ TEST(Generated, ElementwiseAdd) { } // namespace egr -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(matmul_v2); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index 0ee171c73c6..b86865e2d12 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -255,6 +255,6 @@ TEST(Hook_intermidiate, Matmul_v2) { } } // namespace egr -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(matmul_v2); diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index eadb00b9e88..28e1145db42 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -31,7 +31,7 @@ USE_OP(slice); USE_OP(concat); USE_OP(matmul); USE_OP_ITSELF(elementwise_add); -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(tanh); USE_OP(elementwise_mul); USE_OP(softmax_with_cross_entropy); @@ -47,7 +47,7 @@ USE_OP(square); USE_OP(transpose2_grad); USE_OP(concat_grad); USE_OP_ITSELF(elementwise_mul_grad); -USE_OP(sigmoid_grad); +USE_OP_ITSELF(sigmoid_grad); USE_OP_ITSELF(tanh_grad); USE_OP(sum); USE_OP(slice_grad); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index 1946f9e2838..1ad82df4173 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -53,6 +53,6 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); } } // namespace paddle USE_OP_ITSELF(relu); -USE_OP(sigmoid); +USE_OP_ITSELF(sigmoid); USE_OP_ITSELF(tanh); USE_OP(relu6); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index c835cf8ea14..845d0ed073b 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1492,6 
+1492,10 @@ REGISTER_ACTIVATION_OP(softshrink, SoftShrink, SoftShrinkFunctor, REGISTER_ACTIVATION_OP(tanh_shrink, TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); REGISTER_ACTIVATION_OP(silu, Silu, SiluFunctor, SiluGradFunctor); +REGISTER_ACTIVATION_OP(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, + HardSigmoidGradFunctor); +REGISTER_ACTIVATION_OP(logsigmoid, LogSigmoid, LogSigmoidFunctor, + LogSigmoidGradFunctor); /* ========================== sigmoid register ============================= */ @@ -1526,30 +1530,6 @@ REGISTER_OPERATOR(sigmoid_triple_grad, ops::SigmoidTripleGradFunctor::FwdDeps()>, ops::ActivationTripleGradOpInplaceInferer); -// Register Sigmoid/GradSigmoid Kernels -REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor, - SigmoidGradFunctor); - -// Register DoubleGrad Kernel -REGISTER_OP_CPU_KERNEL( - sigmoid_grad_grad, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>); - -// Register TripleGrad Kernel -REGISTER_OP_CPU_KERNEL( - sigmoid_triple_grad, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>); - /* ========================================================================== */ /* ========================== tanh register ============================= */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 4f197b95b21..f1984af6e15 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -238,15 +238,6 @@ struct BaseActivationFunctor { AttrPair GetAttrs() { return AttrPair(); } }; -// sigmoid(x) = 1 / (1 + exp(-x)) -template -struct SigmoidFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); - } -}; - #define USE_PHI_FUNCTOR(name) \ template \ using name##Functor = phi::funcs::name##Functor; \ @@ -285,160 +276,15 @@ USE_PHI_FUNCTOR(TanhShrink) USE_PHI_FUNCTOR(Silu) USE_PHI_FUNCTOR(ELU) USE_PHI_DOUBLE_GRAD_FUNCTOR(ELU) +USE_PHI_FUNCTOR(Sigmoid) +USE_PHI_DOUBLE_GRAD_FUNCTOR(Sigmoid) +USE_PHI_TRIPLE_GRAD_FUNCTOR(Sigmoid) +USE_PHI_FUNCTOR(LogSigmoid) +USE_PHI_FUNCTOR(HardSigmoid) template using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor; -template -struct SigmoidGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * out * (static_cast(1) - out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -/* - Out - DOut -> SigmoidGradGrad -> DOutNew - DDX DDOut - - DDOut = (1-Out)*Out*DDX - DOutNew = (1-2*Out)*DOut*DDX -*/ -template -struct SigmoidGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - framework::Tensor* dOutNew, framework::Tensor* ddOut) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad")); - - if (dOutNew) { - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad")); - auto dout_new = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad")); - dout_new.device(*d) = - 
(static_cast(1) - static_cast(2) * out) * dout * ddx; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad")); - ddout.device(*d) = (static_cast(1) - out) * out * ddx; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -/* - Out - DOut D_Dout - DDx -> SigmoidTripleGrad -> D_DDx - D_DDout d_OutNew - D_Dout_new - - D_Dout = (1-2*Out)*DDx*D_Dout_new - D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new - D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new - - Out, DDX, DOut, D_DDOut, D_DOut_New // input - D_OutNew, D_DOut, D_DDx // output -*/ -template -struct SigmoidTripleGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - const framework::Tensor* d_DDOut, - const framework::Tensor* d_dOut_New, - framework::Tensor* d_d_Out, framework::Tensor* d_Out_New, - framework::Tensor* d_DDx) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad")); - auto d_ddOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad")); - auto d_dOutNew = framework::EigenVector::Flatten(GET_DATA_SAFELY( - d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad")); - - if (d_Out_New) { - auto d_OutNew = framework::EigenVector::Flatten(GET_DATA_SAFELY( - d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad")); - d_OutNew.device(*d) = (ddx - static_cast(2) * out * ddx) * d_ddOut - - static_cast(2) * dout * ddx * d_dOutNew; - } - if (d_d_Out) { - auto d_dOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad")); - d_dOut.device(*d) = - (static_cast(1) - static_cast(2) * out) * ddx * d_dOutNew; - } - if (d_DDx) { - auto d_ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad")); - d_ddx.device(*d) = - (static_cast(1) - out) * out * d_ddOut + - (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -// Originally: logsigmoid(x) = -log (1 + exp(-x)) -// For numerical stability, we can use the log-sum-exp trick: -// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ -// We can rewrite the above equation as: -// out = -log( exp(0) + exp(-x)) [since exp(0) = 1] -// = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0))) -// = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x - -// max(-x, 0))) -// = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) -// = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))) -// -// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0)) -// + exp(-x - max(-x, 0)))) -template -struct LogSigmoidFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) - out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log()); - } -}; - -// Originally: f' = exp(-x) / (1 + exp(-x)) -// For 
numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + -// exp(-x - max(-x, 0))) -template -struct LogSigmoidGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) - dx.device(d) = - dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - // exp(x) = e^x template struct ExpFunctor : public BaseActivationFunctor { @@ -1101,43 +947,6 @@ struct STanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -template -struct HardSigmoidFunctor : public BaseActivationFunctor { - float slope; - float offset; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - - template - void operator()(Device d, X x, Out out) const { - auto temp = x * static_cast(slope) + static_cast(offset); - out.device(d) = - temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); - } -}; - -template -struct HardSigmoidGradFunctor : public BaseActivationFunctor { - float slope; - float offset; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * - ((out > static_cast(0)) * (out < static_cast(1))) - .template cast() * - static_cast(slope); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct SwishFunctor : public BaseActivationFunctor { float beta; @@ -1365,211 +1174,6 @@ inline void ExtractDoubleGradTensorWithInputDOut( } } -template -class SigmoidDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut; - framework::Tensor *dOutNew, *ddOut; - Out = ddX = dOut = nullptr; - dOutNew = ddOut = nullptr; - // extract ddx(input) and out(input) - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - PADDLE_ENFORCE_NOT_NULL( - ddX, platform::errors::NotFound( - "Cannot get input Variable ddX, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - Out, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("Out"))); - // set output ddout - ddOut = ctx.Output("DDOut"); - // extract dOut(intput) - dOut = ctx.Input("DOut"); - PADDLE_ENFORCE_NOT_NULL( - dOut, platform::errors::NotFound( - "Cannot get input Variable dOut, variable name = %s", - ctx.InputName("DOut"))); - dOutNew = ctx.Output("DOutNew"); - if (dOutNew) dOutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, dOutNew, ddOut); - } -}; - -// Out, DDX, DOut, D_DDOut, D_DOut_New // input -// D_OutNew, D_DOut, D_DDx // output -template -class SigmoidTripleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew; - framework::Tensor *d_OutNew, *d_dOut, *d_ddx; - Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr; - d_OutNew = d_dOut = 
d_ddx = nullptr; - - // extract ddx(input), out(input), dOut(input), d_ddOut(input), - // d_dOutNew(input) - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - dOut = ctx.Input("DOut"); - d_ddOut = ctx.Input("D_DDOut"); - d_dOutNew = ctx.Input("D_DOut_New"); - - PADDLE_ENFORCE_NOT_NULL( - ddX, platform::errors::NotFound( - "Cannot get input Variable ddX, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - Out, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("Out"))); - PADDLE_ENFORCE_NOT_NULL( - dOut, platform::errors::NotFound( - "Cannot get input Variable dOut, variable name = %s", - ctx.InputName("DOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_ddOut, platform::errors::NotFound( - "Cannot get input Variable d_ddOut, variable name = %s", - ctx.InputName("D_DDOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_dOutNew, - platform::errors::NotFound( - "Cannot get input Variable d_dOutNew, variable name = %s", - ctx.InputName("D_DOutNew"))); - - // set output d_OutNew、d_dOut、d_ddx - d_dOut = ctx.Output("D_DOut"); - d_OutNew = ctx.Output("D_OutNew"); - d_ddx = ctx.Output("D_DDx"); - - if (d_dOut) d_dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_OutNew) d_OutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_ddx) d_ddx->mutable_data(ddX->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew, // input - d_dOut, d_OutNew, d_ddx); // output - } -}; - -template -class TanhDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut; - framework::Tensor *dOutNew, *ddOut; - Out = ddX = dOut = nullptr; - dOutNew = ddOut = nullptr; - - // extract ddx(input) and out(input) - auto ddx_var = ctx.InputVar("DDX"); - auto out_var = ctx.InputVar("Out"); - PADDLE_ENFORCE_NOT_NULL( - ddx_var, platform::errors::NotFound( - "Cannot get input Variable ddx, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - out_var, platform::errors::NotFound( - "Cannot get input Variable out, variable name = %s", - ctx.InputName("Out"))); - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - - // set output ddout - auto ddout_var = ctx.OutputVar("DDOut"); - if (ddout_var) { - ddOut = ctx.Output("DDOut"); - } - - // extract dOut(intput) - auto dout_var = ctx.InputVar("DOut"); - PADDLE_ENFORCE_NOT_NULL( - dout_var, platform::errors::NotFound( - "Cannot get input Variable dout_var, variable name = %s", - ctx.InputName("DOut"))); - dOut = ctx.Input("DOut"); - - // set output dout_new - auto dout_new_var = ctx.OutputVar("DOutNew"); - if (dout_new_var) { - dOutNew = ctx.Output("DOutNew"); - } - - if (dOutNew) dOutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, dOutNew, ddOut); - } -}; - -template -class TanhTripeGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew; - framework::Tensor *d_OutNew, *d_dOut, *d_ddx; - Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr; - d_OutNew = d_dOut = d_ddx = nullptr; - - // extract ddx(input), out(input), dOut(input), 
d_ddOut(input), - // d_dOutNew(input) - ddX = ctx.Input("DDX"); - Out = ctx.Input("Out"); - dOut = ctx.Input("DOut"); - d_ddOut = ctx.Input("D_DDOut"); - d_dOutNew = ctx.Input("D_DOut_New"); - - PADDLE_ENFORCE_NOT_NULL( - ddX, platform::errors::NotFound( - "Cannot get input Variable ddX, variable name = %s", - ctx.InputName("DDX"))); - PADDLE_ENFORCE_NOT_NULL( - Out, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("Out"))); - PADDLE_ENFORCE_NOT_NULL( - dOut, platform::errors::NotFound( - "Cannot get input Variable dOut, variable name = %s", - ctx.InputName("DOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_ddOut, platform::errors::NotFound( - "Cannot get input Variable d_ddOut, variable name = %s", - ctx.InputName("D_DDOut"))); - PADDLE_ENFORCE_NOT_NULL( - d_dOutNew, - platform::errors::NotFound( - "Cannot get input Variable d_dOutNew, variable name = %s", - ctx.InputName("D_DOutNew"))); - - // set output d_OutNew、d_dOut、d_ddx - d_dOut = ctx.Output("D_DOut"); - d_OutNew = ctx.Output("D_OutNew"); - d_ddx = ctx.Output("D_DDx"); - - if (d_dOut) d_dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_OutNew) d_OutNew->mutable_data(Out->dims(), ctx.GetPlace()); - if (d_ddx) d_ddx->mutable_data(ddX->dims(), ctx.GetPlace()); - auto& place = ctx.template device_context(); - Functor functor; - functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew, // input - d_dOut, d_OutNew, d_ddx); // output - } -}; - template class SquareDoubleGradKernel : public framework::OpKernel { @@ -1952,7 +1556,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor { } // namespace paddle #define FOR_EACH_ACTIVATION_OP(__macro) \ - __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ @@ -1965,8 +1568,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor { __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \ __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); \ - __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \ - HardSigmoidGradFunctor); \ __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ __macro(mish, Mish, MishFunctor, MishGradFunctor); \ __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor); diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 865943696c3..7c1b2880801 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -20,69 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template -struct CudaSigmoidFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // sigmoid(x) = 1 / (1 + exp(-x)) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(one / (one + exp(-x))); - } -}; - -template -struct CudaSigmoidGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout * out * (1 - out) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return dout * out * (one - out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - -template -struct CudaLogSigmoidFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - - // logsigmoid(x) = log(1 / (1 + exp(-x))) - // For numerical stability, - // logsigmoid(x) = - // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - MPType temp = x > zero ? zero : -x; - return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); - } -}; - -template -struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType zero = static_cast(0.0f); - - // dx = dout * exp(-x) / (1 + exp(-x)) - // For numerical stability: - // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, - // 0))) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - MPType temp1 = x > zero ? zero : -x; - MPType temp2 = exp(-x - temp1); - return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } -}; - template struct CudaCeilFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -551,49 +488,6 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { } }; -template -struct CudaHardSigmoidFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - T one = static_cast(1.0f); - float slope; - float offset; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - - // hard_sigmoid(x) = 0, when x <= -3 - // 1, when x >= 3 - // x * slope + offset, otherwise - __device__ __forceinline__ T operator()(const T x) const { - T temp = x * static_cast(slope) + static_cast(offset); - T temp_max = temp > zero ? temp : zero; - T temp_min = temp_max < one ? temp_max : one; - return temp_min; - } -}; - -template -struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - T one = static_cast(1.0f); - float slope; - float offset; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"slope", &slope}, {"offset", &offset}}; - } - - // dx = (out > 0 && out < 1) ? dout * slope : 0 - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return (out > zero && out < one) ? 
dout * static_cast(slope) : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { - return ActBwdOpFwdDeps::kDepOut; - } -}; - template struct CudaSwishFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -876,6 +770,9 @@ USE_PHI_FUNCTOR(CudaSoftShrink) USE_PHI_FUNCTOR(CudaTanhShrink) USE_PHI_FUNCTOR(CudaSilu) USE_PHI_FUNCTOR(CudaELU) +USE_PHI_FUNCTOR(CudaSigmoid) +USE_PHI_FUNCTOR(CudaLogSigmoid) +USE_PHI_FUNCTOR(CudaHardSigmoid) template using CudaELUGradNegativeAlphaFunctor = @@ -954,35 +851,6 @@ REGISTER_OP_CUDA_KERNEL( ops::CELUGradGradFunctor>); /* ========================================================================== */ -/* =========================== sigmoid register ============================ - */ -REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, - CudaSigmoidGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - sigmoid_grad_grad, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>, - ops::SigmoidDoubleGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - sigmoid_triple_grad, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel>, - ops::SigmoidTripleGradKernel< - plat::CUDADeviceContext, - ops::SigmoidTripleGradFunctor>); -/* ========================================================================== */ - /* =========================== sqrt register ============================= */ REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, CudaSqrtGradFunctor); @@ -1120,8 +988,6 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ #define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ - __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \ - CudaLogSigmoidGradFunctor); \ __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \ CudaSoftShrinkGradFunctor); \ __macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor); \ @@ -1141,8 +1007,6 @@ REGISTER_OP_CUDA_KERNEL( CudaTanhShrinkGradFunctor); \ __macro(hard_shrink, HardShrink, CudaHardShrinkFunctor, \ CudaHardShrinkGradFunctor); \ - __macro(hard_sigmoid, HardSigmoid, CudaHardSigmoidFunctor, \ - CudaHardSigmoidGradFunctor); \ __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \ __macro(mish, Mish, CudaMishFunctor, CudaMishGradFunctor); \ __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index e0dfca756e1..241a80d85ea 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -19,14 +19,14 @@ limitations under the License. 
*/ namespace phi { -#define DECLARE_ACTIVATION_GRAD_KERNEL_DepX(name) \ +#define DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(name) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ const DenseTensor& dout, \ DenseTensor* dx); -#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(name, attr) \ +#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(name, attr) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ @@ -34,7 +34,7 @@ namespace phi { float attr, \ DenseTensor* dx); -#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(name, attr1, attr2) \ +#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(name, attr1, attr2) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& x, \ @@ -43,19 +43,28 @@ namespace phi { float attr2, \ DenseTensor* dx); -#define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \ +#define DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(name) \ template \ void name##GradKernel(const Context& dev_ctx, \ const DenseTensor& out, \ const DenseTensor& dout, \ DenseTensor* dx); -#define DECLARE_ACTIVATION_GRAD_KERNEL_WITH_ONE_ATTRS_DepOut(name, attr) \ - template \ - void name##GradKernel(const Context& dev_ctx, \ - const DenseTensor& out, \ - const DenseTensor& dout, \ - float attr, \ +#define DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(name, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr, \ + DenseTensor* dx); + +#define DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(name, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ DenseTensor* dx); template @@ -107,28 +116,51 @@ void EluDoubleGradKernel(const Context& dev_ctx, DenseTensor* dx, DenseTensor* ddout); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sin); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asin); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atan); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sinh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(TanhShrink); -DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Silu); - -DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu); -DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Tanh); - -DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(LeakyRelu, alpha) - DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(ThresholdedRelu, threshold) - DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(SoftShrink, lambda) - DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DepX(HardShrink, threshold) - - DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DepX(BRelu, t_min, t_max) +template +void SigmoidDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + DenseTensor* dout_new, + DenseTensor* ddout); + +template +void SigmoidTripleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + const DenseTensor& d_ddout, + const DenseTensor& d_dout_new, + DenseTensor* d_out_new, + DenseTensor* d_dout, + DenseTensor* d_ddx); + +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cos); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Tan); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acos); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Sin); 
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Asin); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atan); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Sinh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cosh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Asinh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acosh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atanh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Silu); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid); + +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid); + +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, alpha); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, threshold); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold); + +DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max); + +DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, slope, offset); } // namespace phi diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h index 0762ce43ff8..dbc63a636ed 100644 --- a/paddle/phi/kernels/activation_kernel.h +++ b/paddle/phi/kernels/activation_kernel.h @@ -54,6 +54,8 @@ DECLARE_ACTIVATION_KERNEL(Relu) DECLARE_ACTIVATION_KERNEL(Tanh) DECLARE_ACTIVATION_KERNEL(TanhShrink) DECLARE_ACTIVATION_KERNEL(Silu) +DECLARE_ACTIVATION_KERNEL(Sigmoid) +DECLARE_ACTIVATION_KERNEL(LogSigmoid) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold) @@ -62,5 +64,5 @@ DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha) DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max) - +DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset) } // namespace phi diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index 11b396a84d0..c5822615962 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -90,6 +90,23 @@ namespace phi { dev_ctx, nullptr, &out, &dout, dx, functor); \ } +#define DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CosGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, TanGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, AcosGradFunctor); @@ -103,9 +120,11 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, AcoshGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, SigmoidGradFunctor); DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, 
LeakyReluGradFunctor, @@ -125,6 +144,11 @@ DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max); +DEFINE_CPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, + HardSigmoidGradFunctor, + slope, + offset); + template void EluGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -204,3 +228,8 @@ PD_REGISTER_KERNEL(tanh_triple_grad, float, double, phi::dtype::float16) {} +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel) diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index 59ce18a11cc..1d7b77ea444 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -72,6 +72,8 @@ DEFINE_CPU_ACTIVATION_KERNEL(Relu, ReluCPUFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Tanh, TanhFunctor) DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, @@ -82,6 +84,10 @@ DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, SoftShrinkFunctor, lambda) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, ELUFunctor, alpha) DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, BReluFunctor, t_min, t_max) +DEFINE_CPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, + HardSigmoidFunctor, + slope, + offset) } // namespace phi PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {} @@ -109,3 +115,6 @@ PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel) PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel) +PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 663258fa560..6c5ffbd06e3 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -1012,6 +1012,217 @@ struct SiluGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +// sigmoid(x) = 1 / (1 + exp(-x)) +template +struct SigmoidFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); + } +}; + +template +struct SigmoidGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out * (static_cast(1) - out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +/* + Out + DOut -> SigmoidGradGrad -> DOutNew + DDX DDOut + + DDOut = (1-Out)*Out*DDX + DOutNew = (1-2*Out)*DOut*DDX +*/ +template +struct SigmoidGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const 
DenseTensor* ddX, + const DenseTensor* dOut, + DenseTensor* dOutNew, + DenseTensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad")); + + if (dOutNew) { + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad")); + auto dout_new = EigenVector::Flatten( + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad")); + dout_new.device(*d) = + (static_cast(1) - static_cast(2) * out) * dout * ddx; + } + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad")); + ddout.device(*d) = (static_cast(1) - out) * out * ddx; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +/* + Out + DOut D_Dout + DDx -> SigmoidTripleGrad -> D_DDx + D_DDout d_OutNew + D_Dout_new + + D_Dout = (1-2*Out)*DDx*D_Dout_new + D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new + D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new + + Out, DDX, DOut, D_DDOut, D_DOut_New // input + D_OutNew, D_DOut, D_DDx // output +*/ +template +struct SigmoidTripleGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* Out, + const DenseTensor* ddX, + const DenseTensor* dOut, + const DenseTensor* d_DDOut, + const DenseTensor* d_dOut_New, + DenseTensor* d_d_Out, + DenseTensor* d_Out_New, + DenseTensor* d_DDx) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad")); + auto dout = EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad")); + auto d_ddOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad")); + auto d_dOutNew = EigenVector::Flatten(GET_DATA_SAFELY( + d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad")); + + if (d_Out_New) { + auto d_OutNew = EigenVector::Flatten(GET_DATA_SAFELY( + d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad")); + d_OutNew.device(*d) = (ddx - static_cast(2) * out * ddx) * d_ddOut - + static_cast(2) * dout * ddx * d_dOutNew; + } + if (d_d_Out) { + auto d_dOut = EigenVector::Flatten( + GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad")); + d_dOut.device(*d) = + (static_cast(1) - static_cast(2) * out) * ddx * d_dOutNew; + } + if (d_DDx) { + auto d_ddx = EigenVector::Flatten( + GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad")); + d_ddx.device(*d) = + (static_cast(1) - out) * out * d_ddOut + + (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +// Originally: logsigmoid(x) = -log (1 + exp(-x)) +// For numerical stability, we can use the log-sum-exp trick: +// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ +// We can rewrite the above equation as: +// out = -log( exp(0) + exp(-x)) [since exp(0) = 1] +// = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0))) +// = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x - +// max(-x, 0))) +// = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) +// = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))) 
+// +// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0)) +// + exp(-x - max(-x, 0)))) +template +struct LogSigmoidFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) + out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log()); + } +}; + +// Originally: f' = exp(-x) / (1 + exp(-x)) +// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + +// exp(-x - max(-x, 0))) +template +struct LogSigmoidGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) + dx.device(d) = + dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct HardSigmoidFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto temp = x * static_cast(slope) + static_cast(offset); + out.device(d) = + temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); + } +}; + +template +struct HardSigmoidGradFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((out > static_cast(0)) * (out < static_cast(1))) + .template cast() * + static_cast(slope); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) template struct CudaReluFunctor : public BaseActivationFunctor { @@ -1653,6 +1864,112 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +template +struct CudaSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // sigmoid(x) = 1 / (1 + exp(-x)) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(one / (one + exp(-x))); + } +}; + +template +struct CudaSigmoidGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout * out * (1 - out) + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return dout * out * (one - out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CudaLogSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // logsigmoid(x) = log(1 / (1 + exp(-x))) + // For numerical stability, + // logsigmoid(x) = + // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + MPType temp = x > zero ? 
zero : -x; + return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); + } +}; + +template +struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // dx = dout * exp(-x) / (1 + exp(-x)) + // For numerical stability: + // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, + // 0))) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + MPType temp1 = x > zero ? zero : -x; + MPType temp2 = exp(-x - temp1); + return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaHardSigmoidFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + float slope; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + // hard_sigmoid(x) = 0, when x <= -3 + // 1, when x >= 3 + // x * slope + offset, otherwise + __device__ __forceinline__ T operator()(const T x) const { + T temp = x * static_cast(slope) + static_cast(offset); + T temp_max = temp > zero ? temp : zero; + T temp_min = temp_max < one ? temp_max : one; + return temp_min; + } +}; + +template +struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + float slope; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + // dx = (out > 0 && out < 1) ? dout * slope : 0 + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return (out > zero && out < one) ? 
dout * static_cast(slope) : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + #endif } // namespace funcs diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index b12fc6975b3..c912d0c4686 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -142,8 +142,27 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, nullptr, &out, &dout, dx, functor); \ } +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + float attr1, \ + float attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradGPUImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, CudaReluGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, CudaTanhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, CudaSigmoidGradFunctor); + DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Cos, CudaCosGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Tan, CudaTanGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acos, CudaAcosGradFunctor); @@ -157,6 +176,7 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Acosh, CudaAcoshGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor); DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, CudaLeakyReluGradFunctor, @@ -176,6 +196,11 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, + CudaHardSigmoidGradFunctor, + slope, + offset); + template void EluGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -270,3 +295,8 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(tanh_shrink_grad, TanhShrinkGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(silu_grad, SiluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_double_grad, EluDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel) diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index cd9330ead84..6b598c764de 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -91,6 +91,8 @@ DEFINE_GPU_ACTIVATION_KERNEL(Relu, CudaReluFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Tanh, CudaTanhFunctor) DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, CudaSigmoidFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha) 
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, @@ -103,6 +105,10 @@ DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(SoftShrink, CudaSoftShrinkFunctor, lambda) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(Elu, CudaELUFunctor, alpha) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(BRelu, CudaBReluFunctor, t_min, t_max) +DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, + CudaHardSigmoidFunctor, + slope, + offset) } // namespace phi @@ -155,3 +161,6 @@ PD_REGISTER_ACTIVATION_KERNEL(soft_shrink, SoftShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(tanh_shrink, TanhShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel) PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel) +PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel) +PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel) diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h index a95f49c0e7c..7d6b6dc72ea 100644 --- a/paddle/phi/kernels/impl/activation_grad_impl.h +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -222,4 +222,57 @@ void EluDoubleGradKernel(const Context& dev_ctx, functor(dev_ctx, &x, &ddx, ddout, &dout, dx); } +template +void SigmoidDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + DenseTensor* dout_new, + DenseTensor* ddout) { + if (dout_new) { + dout_new->Resize(out.dims()); + dev_ctx.template Alloc(dout_new); + } + if (ddout) { + ddout->Resize(out.dims()); + dev_ctx.template Alloc(ddout); + } + funcs::SigmoidGradGradFunctor functor; + functor(dev_ctx, &out, &ddx, &dout, dout_new, ddout); +} + +template +void SigmoidTripleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + const DenseTensor& dout, + const DenseTensor& d_ddout, + const DenseTensor& d_dout_new, + DenseTensor* d_out_new, + DenseTensor* d_dout, + DenseTensor* d_ddx) { + if (d_dout) { + d_dout->Resize(out.dims()); + dev_ctx.template Alloc(d_dout); + } + if (d_out_new) { + d_dout->Resize(out.dims()); + dev_ctx.template Alloc(d_out_new); + } + if (d_ddx) { + d_dout->Resize(ddx.dims()); + dev_ctx.template Alloc(d_ddx); + } + funcs::SigmoidTripleGradFunctor functor; + functor(dev_ctx, + &out, + &ddx, + &dout, + &d_ddout, + &d_dout_new, + d_dout, + d_out_new, + d_ddx); +} + } // namespace phi diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc index 890dbadf17c..7ae0dc45c5e 100644 --- a/paddle/phi/ops/compat/activation_sig.cc +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -56,9 +56,14 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(SoftShrink, "soft_shrink", "lambda"); DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold"); DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", ); // NOLINT +DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", ); // NOLINT -DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT -DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Sigmoid, "sigmoid", ); // NOLINT +DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(HardSigmoid, + "hard_sigmoid", + "slope" comma "offset"); // NOLINT KernelSignature ReluDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { @@ -79,6 +84,20 @@ KernelSignature TanhTripleGradOpArgumentMapping( {"D_OutNew", "D_DOut", "D_DDx"}); 
} +KernelSignature SigmoidDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "sigmoid_double_grad", {"Out", "DDX", "DOut"}, {}, {"DOutNew", "DDOut"}); +} + +KernelSignature SigmoidTripleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("sigmoid_triple_grad", + {"Out", "DDX", "DOut", "D_DDOut", "D_DOut_New"}, + {}, + {"D_OutNew", "D_DOut", "D_DDx"}); +} + KernelSignature LeakyReluDoubleGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( @@ -114,6 +133,7 @@ PD_REGISTER_BASE_KERNEL_NAME(leaky_relu_grad_grad, leaky_relu_double_grad); PD_REGISTER_BASE_KERNEL_NAME(softshrink, soft_shrink); PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad); PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(sigmoid_grad_grad, sigmoid_double_grad); PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping); @@ -152,3 +172,12 @@ PD_REGISTER_ARG_MAPPING_FN(elu, phi::EluOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elu_grad, phi::EluGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elu_grad_grad, phi::EluDoubleGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(silu_grad, phi::SiluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad, phi::SigmoidGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_grad_grad, + phi::SigmoidDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sigmoid_triple_grad, + phi::SigmoidTripleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad, + phi::LogSigmoidGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad, + phi::HardSigmoidGradOpArgumentMapping); -- GitLab From c335288dad565243e6223f6dc545ebd19bdc96ee Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 17 Mar 2022 14:20:13 +0800 Subject: [PATCH 137/176] move infershape of set_value to phi (#40636) --- paddle/fluid/framework/infershape_utils.cc | 45 ++++++++++++++++++++++ paddle/fluid/operators/set_value_op.cc | 24 ++++++------ paddle/phi/infermeta/unary.cc | 10 +++++ paddle/phi/infermeta/unary.h | 2 + paddle/phi/ops/compat/set_value_sig.cc | 28 +++++++------- 5 files changed, 83 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index b1d7059f311..dec8d1d846c 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -442,6 +442,51 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, attr_name, infershape_input.size())); } } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = attr_reader.GetAttr(attr_name); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + 
const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + infer_meta_context.EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct InferMetaContext.", + attr_names[i])); + } } else if (ctx->HasAttr(attr_name)) { // Emplace Back Attr according to the type of attr. auto& attr = attr_reader.GetAttr(attr_name); diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 513ab46e9b5..73655bcb185 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -13,9 +13,15 @@ // limitations under the License. #include "paddle/fluid/operators/set_value_op.h" + #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class InferShapeContext; @@ -34,6 +40,8 @@ class CPUDeviceContext; namespace paddle { namespace operators { +using Tensor = framework::Tensor; + class SetValue : public framework::OperatorWithKernel { public: SetValue(const std::string &type, const framework::VariableNameMap &inputs, @@ -41,17 +49,6 @@ class SetValue : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "SetValue"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SetValue"); - auto in_dims = ctx->GetInputDim("Input"); - PADDLE_ENFORCE_LT( - in_dims.size(), 7, - platform::errors::InvalidArgument( - "The rank of input should be less than 7, but received %d.", - in_dims.size())); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -236,10 +233,13 @@ DECLARE_INPLACE_OP_INFERER(SetValueOpInplaceInferer, {"Input", "Out"}); namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(set_value, SetValueInferShapeFunctor, + PD_INFER_META(phi::SetValueInferMeta)); + REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker, ops::SetValueGradMaker, ops::SetValueGradMaker, - ops::SetValueOpInplaceInferer); + ops::SetValueOpInplaceInferer, SetValueInferShapeFunctor); REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad); diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index bc6ab524277..8a2d718f124 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1090,6 +1090,16 @@ void RollInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void SetValueInferMeta(const MetaTensor& x, MetaTensor* out) { + auto in_dims = x.dims(); + PADDLE_ENFORCE_LT( + in_dims.size(), + 7, + phi::errors::InvalidArgument( + "The rank of input should be less than 7, but received %d.", + in_dims.size())); +} + void ShapeInferMeta(const 
MetaTensor& input, MetaTensor* out) { auto in_dim = input.dims(); out->set_dims(phi::make_ddim({in_dim.size()})); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 6cb9653624f..7203a327b55 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -177,6 +177,8 @@ void RollInferMeta(const MetaTensor& x, const std::vector& axis, MetaTensor* out); +void SetValueInferMeta(const MetaTensor& x, MetaTensor* out); + void ShapeInferMeta(const MetaTensor& input, MetaTensor* out); void ShardIndexInferMeta(const MetaTensor& in, diff --git a/paddle/phi/ops/compat/set_value_sig.cc b/paddle/phi/ops/compat/set_value_sig.cc index 9653250bded..5feff54b028 100644 --- a/paddle/phi/ops/compat/set_value_sig.cc +++ b/paddle/phi/ops/compat/set_value_sig.cc @@ -19,9 +19,9 @@ namespace phi { KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.IsDenseTensorInput("Input")) { - if (ctx.HasInput("StartsTensorList")) { - if (ctx.HasInput("EndsTensorList")) { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StartsTensorList") > 0) { + if (ctx.InputSize("EndsTensorList") > 0) { + if (ctx.InputSize("StepsTensorList") > 0) { if (ctx.HasInput("ValueTensor")) { return KernelSignature("set_value_with_tensor", {"Input", "ValueTensor"}, @@ -197,7 +197,7 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { } } } else { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StepsTensorList") > 0) { if (ctx.HasInput("ValueTensor")) { return KernelSignature("set_value_with_tensor", {"Input", "ValueTensor"}, @@ -374,8 +374,8 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { } } } else { - if (ctx.HasInput("EndsTensorList")) { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("EndsTensorList") > 0) { + if (ctx.InputSize("StepsTensorList") > 0) { if (ctx.HasInput("ValueTensor")) { return KernelSignature("set_value_with_tensor", {"Input", "ValueTensor"}, @@ -551,7 +551,7 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { } } } else { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StepsTensorList") > 0) { if (ctx.HasInput("ValueTensor")) { return KernelSignature("set_value_with_tensor", {"Input", "ValueTensor"}, @@ -734,9 +734,9 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { KernelSignature SetValueGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - if (ctx.HasInput("StartsTensorList")) { - if (ctx.HasInput("EndsTensorList")) { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StartsTensorList") > 0) { + if (ctx.InputSize("EndsTensorList") > 0) { + if (ctx.InputSize("StepsTensorList") > 0) { return KernelSignature( "set_value_grad", {GradVarName("Out")}, @@ -760,7 +760,7 @@ KernelSignature SetValueGradOpArgumentMapping( {GradVarName("Input"), GradVarName("ValueTensor")}); } } else { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StepsTensorList") > 0) { return KernelSignature( "set_value_grad", {GradVarName("Out")}, @@ -785,8 +785,8 @@ KernelSignature SetValueGradOpArgumentMapping( } } } else { - if (ctx.HasInput("EndsTensorList")) { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("EndsTensorList") > 0) { + if (ctx.InputSize("StepsTensorList") > 0) { return KernelSignature( "set_value_grad", {GradVarName("Out")}, @@ -810,7 +810,7 @@ KernelSignature SetValueGradOpArgumentMapping( {GradVarName("Input"), GradVarName("ValueTensor")}); } } 
else { - if (ctx.HasInput("StepsTensorList")) { + if (ctx.InputSize("StepsTensorList") > 0) { return KernelSignature( "set_value_grad", {GradVarName("Out")}, -- GitLab From 681a6865467f114e11a07037c71afb4a92dd4cef Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Thu, 17 Mar 2022 14:43:33 +0800 Subject: [PATCH 138/176] Move layer norm to phi (#40193) * update * fix bugs; test=develop * update; test=develop * fix test compile error; test=develop * fix cpu compile error; test=develop * fix test error; test=develo * fix layer_norm_op plugin error; test=develop * fix error; test=develop * fix test bug; test=develop * update; test=develop * polish code; test=develop * fix bugs; test=develop * remove unused depency; test=develop * polish code; test=develop --- .../tensorrt/convert/layer_norm_op.cc | 2 +- .../tensorrt/plugin/layer_norm_op_plugin.cu | 6 +- .../operators/fused/fused_dropout_test.h | 24 +- ...ed_layernorm_residual_dropout_bias_test.cu | 1 - paddle/fluid/operators/layer_norm_kernel.cu.h | 17 +- paddle/fluid/operators/layer_norm_op.cc | 10 +- paddle/fluid/operators/layer_norm_op.cu | 289 -------------- paddle/fluid/operators/layer_norm_op.h | 374 ------------------ paddle/fluid/operators/layer_norm_op_npu.cc | 2 +- paddle/fluid/operators/layer_norm_op_xpu.cc | 2 +- .../operators/mkldnn/layer_norm_mkldnn_op.cc | 5 +- .../phi/kernels/cpu/layer_norm_grad_kernel.cc | 186 +++++++++ paddle/phi/kernels/cpu/layer_norm_kernel.cc | 145 +++++++ paddle/phi/kernels/funcs/layer_norm_util.h | 165 ++++++++ paddle/phi/kernels/funcs/math_function.cc | 8 + .../phi/kernels/gpu/layer_norm_grad_kernel.cu | 139 +++++++ paddle/phi/kernels/gpu/layer_norm_kernel.cu | 229 +++++++++++ paddle/phi/kernels/layer_norm_grad_kernel.h | 36 ++ paddle/phi/kernels/layer_norm_kernel.h | 51 +++ paddle/phi/ops/compat/layer_norm_sig.cc | 39 ++ .../tests/unittests/test_layer_norm_op.py | 4 + 21 files changed, 1036 insertions(+), 698 deletions(-) delete mode 100644 paddle/fluid/operators/layer_norm_op.cu delete mode 100644 paddle/fluid/operators/layer_norm_op.h create mode 100644 paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/layer_norm_kernel.cc create mode 100644 paddle/phi/kernels/funcs/layer_norm_util.h create mode 100644 paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/layer_norm_kernel.cu create mode 100644 paddle/phi/kernels/layer_norm_grad_kernel.h create mode 100644 paddle/phi/kernels/layer_norm_kernel.h create mode 100644 paddle/phi/ops/compat/layer_norm_sig.cc diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc index 67e7c78b62e..496e8932a69 100644 --- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc @@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/layer_norm_op.h" + #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h" diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu index 861e98e4437..67d44184a76 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu @@ -17,7 +17,7 @@ #include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h" -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/phi/kernels/layer_norm_kernel.h" namespace paddle { namespace inference { @@ -83,7 +83,7 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs, cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size, cudaMemcpyHostToDevice, stream); - paddle::operators::LayerNormDirectCUDAFunctor layer_norm; + phi::LayerNormDirectCUDAFunctor layer_norm; layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d, variance_d, begin_norm_axis, eps); return cudaGetLastError() != cudaSuccess; @@ -177,7 +177,7 @@ int LayerNormPluginDynamic::enqueue( cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size, cudaMemcpyHostToDevice, stream); - paddle::operators::LayerNormDirectCUDAFunctor layer_norm; + phi::LayerNormDirectCUDAFunctor layer_norm; layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d, variance_d, begin_norm_axis, eps); } else { diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index 18c7187fc8e..a9b72a9cdf3 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -25,14 +25,16 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" #include "paddle/fluid/string/printf.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/layer_norm_kernel.h" namespace framework = paddle::framework; namespace platform = paddle::platform; namespace memory = paddle::memory; USE_OP_ITSELF(dropout); -USE_OP(layer_norm); +USE_OP_ITSELF(layer_norm); template using CudnnDataType = platform::CudnnDataType; @@ -136,18 +138,23 @@ void LayerNorm(const std::vector> &scale, const platform::CUDADeviceContext &ctx) { framework::Scope scope; auto place = ctx.GetPlace(); + paddle::optional scale_opt = paddle::none; if (scale.size() > 0) { auto var_scale = scope.Var("Scale"); auto tensor_scale = var_scale->GetMutable(); framework::TensorFromVector(scale, ctx, tensor_scale); tensor_scale->Resize({cols}); + scale_opt = *tensor_scale; } + paddle::optional bias_opt = paddle::none; if (bias.size() > 0) { auto var_bias = scope.Var("Bias"); auto tensor_bias = var_bias->GetMutable(); framework::TensorFromVector(bias, ctx, tensor_bias); tensor_bias->Resize({cols}); + + bias_opt = *tensor_bias; } auto var_x = scope.Var("X"); @@ -157,20 +164,19 @@ void LayerNorm(const std::vector> &scale, auto var_y = scope.Var("Y"); auto tensor_y = var_y->GetMutable(); + tensor_y->Resize({rows, cols}); auto var_mean = scope.Var("Mean"); auto tensor_mean = var_mean->GetMutable(); + tensor_mean->Resize({rows}); auto var_variance = scope.Var("Variance"); auto tensor_variance = var_variance->GetMutable(); - - framework::AttributeMap attrs; - attrs.insert({"epsilon", epsilon}); - - auto op = framework::OpRegistry::CreateOp( - "layer_norm", {{"X", {"X"}}, {"Scale", {"Scale"}}, {"Bias", {"Bias"}}}, - {{"Y", {"Y"}}, {"Mean", {"Mean"}}, {"Variance", {"Variance"}}}, attrs); - op->Run(scope, place); + tensor_variance->Resize({rows}); + ctx.Wait(); + phi::LayerNormKernel(static_cast(ctx), *tensor_x, + scale_opt, bias_opt, 1e-5, 1, false, tensor_y, + tensor_mean, tensor_variance); framework::TensorToVector(*tensor_y, ctx, y); framework::TensorToVector(*tensor_mean, ctx, means); framework::TensorToVector(*tensor_variance, ctx, vars); diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu index 032440d7f04..c7e1f4a5463 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -198,7 +198,6 @@ struct TestFusedLayernormResidualDropoutBias { residual_vec[i * cols + j] + out2[i * cols + j]; } } - LayerNorm(scale_vec, layernorm_bias_vec, correct_out, &correct_means, &correct_vars, &correct_layernorm_out, epsilon, rows, cols, *ctx); diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 412ae3c49b5..c0a4b88fc76 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -758,12 +758,14 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( */ template -void ln_bwd_1024_kernel_driver( - const platform::CUDADeviceContext &dev_ctx, const int rows, const int cols, - float epsilon, const T *x_ptr, const ScaleT *scale_ptr, const U *mean_ptr, - const U *var_ptr, const T *dout_ptr, T *dx_ptr, ScaleT *dscale_ptr, - ScaleT *dbias_ptr, const MaskType *mask_ptr = nullptr, - T 
factor = static_cast(0), T *d_dropout_src_ptr = nullptr) { +void ln_bwd_1024_kernel_driver(const phi::GPUContext &dev_ctx, const int rows, + const int cols, float epsilon, const T *x_ptr, + const ScaleT *scale_ptr, const U *mean_ptr, + const U *var_ptr, const T *dout_ptr, T *dx_ptr, + ScaleT *dscale_ptr, ScaleT *dbias_ptr, + const MaskType *mask_ptr = nullptr, + T factor = static_cast(0), + T *d_dropout_src_ptr = nullptr) { auto stream = dev_ctx.stream(); if (cols == 1024) { // step-1: compute dx and reduced part results of dscale and dbias. @@ -1334,8 +1336,7 @@ static void LayerNormBackward( const U *mean, const U *var, T *d_x, LayerNormScaleBiasT *d_scale, LayerNormScaleBiasT *d_bias, float epsilon, - int64_t batch_size, int64_t feature_size, - const platform::CUDADeviceContext &dev_ctx) { + int64_t batch_size, int64_t feature_size, const phi::GPUContext &dev_ctx) { auto stream = dev_ctx.stream(); #ifdef __HIPCC__ const int kMaxBlockDim = 256; diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index e7d676479be..224ab748dab 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" - #include #include +#include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -278,10 +277,3 @@ REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, ops::LayerNormGradOpMaker); REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp, ops::LayerNormGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - layer_norm, ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CPU_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu deleted file mode 100644 index dfe73d37271..00000000000 --- a/paddle/fluid/operators/layer_norm_op.cu +++ /dev/null @@ -1,289 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/operators/layer_norm_kernel.cu.h" -#include "paddle/fluid/operators/layer_norm_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, - const T *input, - std::vector input_shape, - const T *bias, const T *scale, - T *output, T *mean, T *variance, - int begin_norm_axis, float eps) { - const auto x_dims = phi::make_ddim(input_shape); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - switch (GetDesiredBlockDim(feature_size)) { - FIXED_BLOCK_DIM_CASE( - LayerNormForward<<>>( - input, scale, bias, output, mean, variance, eps, feature_size)); - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Product from begin_norm_axis to end in layer_norm must be larger " - "than 1")); - break; - } -} - -template -class LayerNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using U = LayerNormParamType; - const float epsilon = ctx.Attr("epsilon"); - auto *scale = ctx.Input("Scale"); - auto *bias = ctx.Input("Bias"); - auto *x = ctx.Input("X"); - - auto *y = ctx.Output("Y"); - auto *mean = ctx.Output("Mean"); - auto *var = ctx.Output("Variance"); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - const auto x_dims = x->dims(); - auto *x_data = x->data(); - auto *y_data = y->mutable_data(ctx.GetPlace()); - auto *mean_data = mean->mutable_data(ctx.GetPlace()); - auto *var_data = var->mutable_data(ctx.GetPlace()); - - auto *void_scale_data = (scale == nullptr ? nullptr : scale->data()); - auto *void_bias_data = (bias == nullptr ? nullptr : bias->data()); - - framework::proto::VarType::Type x_dtype = - framework::TransToProtoVarType(x->dtype()); - framework::proto::VarType::Type scale_bias_dtype; - if (void_scale_data != nullptr) { - scale_bias_dtype = framework::TransToProtoVarType(scale->dtype()); - if (void_bias_data != nullptr) { - PADDLE_ENFORCE_EQ(scale_bias_dtype, - framework::TransToProtoVarType(bias->dtype()), - platform::errors::InvalidArgument( - "Thie Scale and Bias of layer_norm op " - "should have the same data type.")); - } - } else { - scale_bias_dtype = (void_bias_data != nullptr - ? 
framework::TransToProtoVarType(bias->dtype()) - : x_dtype); - } - - bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype; - if (!is_scale_bias_same_dtype_with_x) { - PADDLE_ENFORCE_EQ(scale_bias_dtype, - framework::DataTypeTrait::DataType(), - platform::errors::InvalidArgument( - "Unsupported data type of Scale and Bias: %s", - framework::DataTypeToString(scale_bias_dtype))); - } - - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - - auto stream = ctx.cuda_device_context().stream(); - -#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ - do { \ - switch (GetDesiredBlockDim(feature_size)) { \ - FIXED_BLOCK_DIM_CASE( \ - LayerNormForward<<< \ - batch_size, kBlockDim, 0, stream>>>( \ - x_data, static_cast(void_scale_data), \ - static_cast(void_bias_data), y_data, \ - mean_data, var_data, epsilon, feature_size)); \ - default: \ - PADDLE_THROW(platform::errors::InvalidArgument( \ - "Product from begin_norm_axis to end must be larger than 1")); \ - break; \ - } \ - } while (0) - -#ifdef PADDLE_WITH_CUDA - bool can_call_1024_kernel = false; - if (feature_size == 1024 && scale != nullptr && bias != nullptr) { - can_call_1024_kernel = true; - } - if (can_call_1024_kernel) { - const int WARPS_M = 4; - const int WARPS_N = 1; - const int THREADS_PER_WARP = 32; - const int BYTES_PER_LDG = 16; - const int VecSize = BYTES_PER_LDG / sizeof(T); - - const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; - const int ROWS_PER_CTA = WARPS_M; - - const int grid = static_cast( - std::ceil(batch_size / static_cast(ROWS_PER_CTA))); - if (is_scale_bias_same_dtype_with_x) { - ln_fwd_1024_kernel<<>>( - batch_size, feature_size, epsilon, x_data, - static_cast(void_scale_data), - static_cast(void_bias_data), mean_data, var_data, - y_data); - } else { - ln_fwd_1024_kernel<<>>( - batch_size, feature_size, epsilon, x_data, - static_cast(void_scale_data), - static_cast(void_bias_data), mean_data, var_data, - y_data); - } - } else { -#endif - if (is_scale_bias_same_dtype_with_x) { - PADDLE_LAUNCH_LAYERNORM_FWD(T, true); - } else { - PADDLE_LAUNCH_LAYERNORM_FWD(U, false); - } -#ifdef PADDLE_WITH_CUDA - } -#endif - -#undef PADDLE_LAUNCH_LAYERNORM_FWD - } -}; - -template -class LayerNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using U = LayerNormParamType; - const float epsilon = ctx.Attr("epsilon"); - // d_x, d_scale, d_bias may be nullptr - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - auto *x = ctx.Input("X"); - auto *mean = ctx.Input("Mean"); - auto *var = ctx.Input("Variance"); - auto *scale = ctx.Input("Scale"); - auto *bias = ctx.Input("Bias"); - auto *d_y = ctx.Input(framework::GradVarName("Y")); - - const auto &x_dims = x->dims(); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int64_t batch_size = static_cast(matrix_dim[0]); - int64_t feature_size = static_cast(matrix_dim[1]); - - auto *x_data = x->data(); - auto *d_y_data = d_y->data(); - - auto *mean_data = mean->data(); - auto *var_data = var->data(); - - auto *d_x_data = - (d_x == nullptr ? 
nullptr : d_x->mutable_data(ctx.GetPlace())); - - framework::proto::VarType::Type x_dtype = - framework::TransToProtoVarType(x->dtype()); - framework::proto::VarType::Type scale_bias_dtype; - if (scale != nullptr) { - scale_bias_dtype = framework::TransToProtoVarType(scale->dtype()); - } else { - // FIXME(zengjinle): do not find a better way to get the right - // data type of the d_scale and d_bias if scale == nullptr. - auto *bias = ctx.Input("Bias"); - if (bias != nullptr) { - scale_bias_dtype = framework::TransToProtoVarType(bias->dtype()); - } else { - scale_bias_dtype = x_dtype; - } - } - -#define PADDLE_LAUNCH_LAYERNORM_BWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ - do { \ - auto *scale_data = \ - (scale == nullptr ? nullptr : scale->data()); \ - auto *d_scale_data = \ - (d_scale == nullptr ? nullptr : d_scale->mutable_data( \ - ctx.GetPlace())); \ - auto *d_bias_data = \ - (d_bias == nullptr ? nullptr : d_bias->mutable_data( \ - ctx.GetPlace())); \ - auto *d_x_data = \ - (d_x == nullptr ? nullptr : d_x->mutable_data(ctx.GetPlace())); \ - LayerNormBackward( \ - x_data, d_y_data, scale_data, mean_data, var_data, d_x_data, \ - d_scale_data, d_bias_data, epsilon, batch_size, feature_size, \ - ctx.cuda_device_context()); \ - } while (0) - - if (scale_bias_dtype == x_dtype) { - PADDLE_LAUNCH_LAYERNORM_BWD(T, true); - } else { - PADDLE_LAUNCH_LAYERNORM_BWD(U, false); - } - -#undef PADDLE_LAUNCH_LAYERNORM_BWD - } -}; - -template class LayerNormDirectCUDAFunctor; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - layer_norm, - ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CUDA_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); -#elif CUDNN_VERSION_MIN(8, 1, 0) -REGISTER_OP_CUDA_KERNEL( - layer_norm, - ops::LayerNormKernel, - ops::LayerNormKernel, - ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CUDA_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - layer_norm, - ops::LayerNormKernel, - ops::LayerNormKernel, - ops::LayerNormKernel); -REGISTER_OP_CUDA_KERNEL( - layer_norm_grad, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel, - ops::LayerNormGradKernel); -#endif diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h deleted file mode 100644 index 9d70b7cf707..00000000000 --- a/paddle/fluid/operators/layer_norm_op.h +++ /dev/null @@ -1,374 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ - !defined(__OSX__) -#include "paddle/fluid/operators/jit/kernels.h" -#endif -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace platform { -class CPUDeviceContext; -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { - -// Wrap RowwiseMean and ColwiseMean. -// Reuse the cpu codes and replace the gpu codes with cublas_gemv, which is -// significantly faster. Unlike the RowwiseMean and ColwiseMean, the -// implementation only considers 2D. -template -struct RowwiseMean2D { - RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx); - - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* vec); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class RowwiseMean2D { - public: - RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) - : left_(left), right_(right) { - framework::DDim ones_dim({right_}); - divisor_.mutable_data(ones_dim, dev_ctx.GetPlace()); - phi::funcs::set_constant(dev_ctx, &divisor_, 1.0 / right); - } - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - phi::funcs::GetBlas(context).GEMV( - false, left_, right_, 1., input.data(), divisor_.data(), 0., - out->data()); - } - - private: - int left_; - int right_; - framework::Tensor divisor_; -}; -#endif - -template -class RowwiseMean2D { - public: - RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) {} - - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - row_mean_(context, input, out); - } - - private: - phi::funcs::RowwiseMean row_mean_; -}; - -template -struct ColwiseSum2D { - ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx); - - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* vec); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class ColwiseSum2D { - public: - ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) - : left_(left), right_(right) { - framework::DDim ones_dim({left_}); - divisor_.mutable_data(ones_dim, dev_ctx.GetPlace()); - phi::funcs::set_constant(dev_ctx, &divisor_, 1.0); - } - - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - phi::funcs::GetBlas(context).GEMV( - true, left_, right_, 1., input.data(), divisor_.data(), 0., - out->data()); - } - - private: - int left_; - int right_; - framework::Tensor divisor_; -}; -#endif - -template -class ColwiseSum2D { - public: - ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) {} - - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { - col_wise_(context, input, out); - } - - private: - phi::funcs::ColwiseSum col_wise_; -}; - -template -struct SubAndSquareFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } -}; - -template -struct 
DivAndSqrtFunctor { - explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; } - inline HOSTDEVICE T operator()(T a, T b) const { - return a / (sqrt(b + epsilon_)); - } - - private: - T epsilon_; -}; - -template -struct MulInvVarFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { - return a * std::sqrt(1.0 / b); - } -}; - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DataLayout = framework::DataLayout; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class LayerNormDirectCUDAFunctor { - public: - void operator()(gpuStream_t stream, const T* input, - std::vector input_shape, const T* bias, const T* scale, - T* output, T* mean, T* variance, int begin_norm_axis, - float eps); -}; -#endif - -template -class LayerNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto x = *ctx.Input("X"); - - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* var = ctx.Output("Variance"); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - const auto x_dims = x.dims(); - - y->mutable_data(ctx.GetPlace()); - mean->mutable_data(ctx.GetPlace()); - var->mutable_data(ctx.GetPlace()); - - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - framework::DDim matrix_shape({left, right}); - - x.Resize(matrix_shape); - Tensor out; - out.ShareDataWith(*y); - out.Resize(matrix_shape); - -#if defined(PADDLE_WITH_CUDA) || defined(_WIN32) || defined(__APPLE__) || \ - defined(__OSX__) - auto& dev_ctx = ctx.template device_context(); - RowwiseMean2D row_mean(left, right, ctx.device_context()); - - // get mean - row_mean(dev_ctx, x, mean); - - // get variance - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubAndSquareFunctor(), &out); - row_mean(dev_ctx, out, var); - - // get x_norm - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubFunctor(), &out); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &out, var, /*axis*/ 0, - DivAndSqrtFunctor(static_cast(epsilon)), &out); - - if (scale) { - ElementwiseComputeEx, DeviceContext, T>( - ctx, &out, scale, /*axis*/ 1, MulFunctor(), &out); - } - if (bias) { - ElementwiseComputeEx, DeviceContext, T>( - ctx, &out, bias, /*axis*/ 1, AddFunctor(), &out); - } -#else - PADDLE_ENFORCE_EQ(mean->numel(), left, - platform::errors::InvalidArgument( - "mean's length (%d) is not equal with expected (%d).", - mean->numel(), left)); - PADDLE_ENFORCE_EQ(var->numel(), left, - platform::errors::InvalidArgument( - "var's length (%d) is not equal with expected (%d).", - var->numel(), left)); - if (scale) { - PADDLE_ENFORCE_EQ( - scale->numel(), right, - platform::errors::InvalidArgument( - "scale's length (%d) is not equal with expected (%d).", - scale->numel(), right)); - } - if (bias) { - PADDLE_ENFORCE_EQ( - bias->numel(), right, - platform::errors::InvalidArgument( - "bias's length (%d) is not equal with expected (%d).", - bias->numel(), right)); - } - - auto ker = - jit::KernelFuncs, platform::CPUPlace>::Cache() - .At(right); - ker(x.data(), out.data(), mean->data(), var->data(), - scale ? scale->data() : nullptr, bias ? 
bias->data() : nullptr, - static_cast(left), static_cast(epsilon), right); -#endif - } -}; - -template -class LayerNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - auto x = *ctx.Input("X"); - auto* mean = ctx.Input("Mean"); - auto* var = ctx.Input("Variance"); - auto* scale = ctx.Input("Scale"); - auto d_y = *ctx.Input(framework::GradVarName("Y")); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - // init output - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_scale = ctx.Output(framework::GradVarName("Scale")); - auto* d_bias = ctx.Output(framework::GradVarName("Bias")); - - const auto& x_dims = x.dims(); - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - framework::DDim matrix_shape({left, right}); - - d_y.Resize(matrix_shape); - auto& dev_ctx = ctx.template device_context(); - ColwiseSum2D colwise_sum(left, right, - ctx.device_context()); - - Tensor temp; - Tensor temp_norm; - if (d_scale || d_x) { - x.Resize(matrix_shape); - temp.mutable_data(matrix_shape, ctx.GetPlace()); - - temp_norm.mutable_data(matrix_shape, ctx.GetPlace()); - // get x_norm - ElementwiseComputeEx, DeviceContext, T>( - ctx, &x, mean, /*axis*/ 0, SubFunctor(), &temp_norm); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, var, /*axis*/ 0, - DivAndSqrtFunctor(static_cast(epsilon)), &temp_norm); - } - - if (d_bias) { - d_bias->mutable_data(ctx.GetPlace()); - colwise_sum(dev_ctx, d_y, d_bias); - } - if (d_scale) { - d_scale->mutable_data(ctx.GetPlace()); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, &d_y, /*axis*/ 0, MulFunctor(), &temp); - colwise_sum(dev_ctx, temp, d_scale); - } - - if (d_x) { - framework::DDim vec_shape({left}); - d_x->mutable_data(ctx.GetPlace()); - auto dx_dim = d_x->dims(); - Tensor temp_vec; - temp_vec.mutable_data(vec_shape, ctx.GetPlace()); - - RowwiseMean2D row_mean(left, right, - ctx.device_context()); - - if (d_scale) { - // dy_dx - ElementwiseComputeEx, DeviceContext, T>( - ctx, &d_y, scale, /*axis*/ 1, MulFunctor(), &temp); - framework::TensorCopy(temp, ctx.GetPlace(), ctx.device_context(), d_x); - - // dy_dmean_dx - row_mean(dev_ctx, temp, &temp_vec); - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); - - // dy_var_dx - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); - } else { - // dy_dx - framework::TensorCopy(d_y, ctx.GetPlace(), ctx.device_context(), d_x); - - // dy_dmean_dx - row_mean(dev_ctx, d_y, &temp_vec); - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, &temp_vec, /*axis*/ 0, SubFunctor(), d_x); - - // dy_var_dx - ElementwiseComputeEx, DeviceContext, T>( - ctx, &d_y, &temp_norm, /*axis*/ 0, MulFunctor(), &temp); - } - // dy_var_dx - row_mean(dev_ctx, temp, &temp_vec); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &temp_norm, &temp_vec, /*axis*/ 0, MulFunctor(), &temp); - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, &temp, /*axis*/ 0, SubFunctor(), d_x); - - ElementwiseComputeEx, DeviceContext, T>( - ctx, d_x, var, /*axis*/ 0, - DivAndSqrtFunctor(static_cast(epsilon)), d_x); - d_x->Resize(dx_dim); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc index 
c88880b43ff..3c7e5bf9593 100644 --- a/paddle/fluid/operators/layer_norm_op_npu.cc +++ b/paddle/fluid/operators/layer_norm_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/layer_norm_op_xpu.cc b/paddle/fluid/operators/layer_norm_op_xpu.cc index 0480a354c8b..3b21a55f8df 100644 --- a/paddle/fluid/operators/layer_norm_op_xpu.cc +++ b/paddle/fluid/operators/layer_norm_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index 812c55cdd50..2e82b47e8da 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/common/data_type.h" namespace paddle { namespace operators { @@ -139,7 +140,7 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { layer_norm_p->execute(astream, args); astream.wait(); - y->set_layout(DataLayout::kMKLDNN); + y->set_layout(phi::DataLayout::kMKLDNN); y->set_format(platform::GetMKLDNNFormat(*dst_memory)); } }; diff --git a/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc new file mode 100644 index 00000000000..cee48ed96db --- /dev/null +++ b/paddle/phi/kernels/cpu/layer_norm_grad_kernel.cc @@ -0,0 +1,186 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/layer_norm_grad_kernel.h" +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/funcs/layer_norm_util.h" +#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) +#include "paddle/fluid/operators/jit/kernels.h" +#endif +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void LayerNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& mean, + const DenseTensor& variance, + paddle::optional scale_opt, + paddle::optional bias_opt, + const DenseTensor& out_grad, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { + auto* scale = scale_opt.get_ptr(); + auto d_y = out_grad; + + // init output + auto* d_x = x_grad; + auto* d_scale = scale_grad; + auto* d_bias = bias_grad; + + const auto& x_dims = x.dims(); + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + DDim matrix_shape({left, right}); + + d_y.Resize(matrix_shape); + + funcs::ColwiseSum2D colwise_sum(left, right, dev_ctx); + DenseTensor x_tmp = x; + + DenseTensor temp; + DenseTensor temp_norm; + if (d_scale || d_x) { + x_tmp.Resize(matrix_shape); + temp.Resize(matrix_shape); + dev_ctx.template Alloc(&temp); + + temp_norm.Resize(matrix_shape); + dev_ctx.template Alloc(&temp_norm); + // get x_norm + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + x_tmp, + mean, + /*axis*/ 0, + funcs::SubtractFunctor(), + &temp_norm); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + temp_norm, + variance, + /*axis*/ 0, + funcs::DivAndSqrtFunctor(static_cast(epsilon)), + &temp_norm); + } + + if (d_bias) { + dev_ctx.template Alloc(d_bias); + colwise_sum(dev_ctx, d_y, d_bias); + } + if (d_scale) { + dev_ctx.template Alloc(d_scale); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, temp_norm, d_y, 0, funcs::MultiplyFunctor(), &temp); + colwise_sum(dev_ctx, temp, d_scale); + } + + if (d_x) { + DDim vec_shape({left}); + dev_ctx.template Alloc(d_x); + auto dx_dim = d_x->dims(); + DenseTensor temp_vec; + temp_vec.Resize(vec_shape); + dev_ctx.template Alloc(&temp_vec); + + funcs::RowwiseMean2D row_mean(left, right, dev_ctx); + + if (d_scale) { + // dy_dx + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, d_y, *scale, /*axis*/ 1, funcs::MultiplyFunctor(), &temp); + phi::Copy(dev_ctx, temp, dev_ctx.GetPlace(), false, d_x); + + // dy_dmean_dx + row_mean(dev_ctx, temp, &temp_vec); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + *d_x, + temp_vec, + /*axis*/ 0, + funcs::SubtractFunctor(), + d_x); + + // dy_var_dx + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + temp, + temp_norm, + /*axis*/ 0, + funcs::MultiplyFunctor(), + &temp); + } else { + // dy_dx + phi::Copy(dev_ctx, d_y, dev_ctx.GetPlace(), false, d_x); + + // dy_dmean_dx + row_mean(dev_ctx, d_y, &temp_vec); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + *d_x, + temp_vec, + /*axis*/ 0, + funcs::SubtractFunctor(), + d_x); + + // dy_var_dx + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + d_y, + temp_norm, + /*axis*/ 0, + funcs::MultiplyFunctor(), + 
&temp); + } + // dy_var_dx + row_mean(dev_ctx, temp, &temp_vec); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + temp_norm, + temp_vec, + /*axis*/ 0, + funcs::MultiplyFunctor(), + &temp); + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, *d_x, temp, /*axis*/ 0, funcs::SubtractFunctor(), d_x); + + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + *d_x, + variance, + /*axis*/ 0, + funcs::DivAndSqrtFunctor(static_cast(epsilon)), + d_x); + d_x->Resize(dx_dim); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + layer_norm_grad, CPU, ALL_LAYOUT, phi::LayerNormGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/layer_norm_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_kernel.cc new file mode 100644 index 00000000000..5b09d68c7ca --- /dev/null +++ b/paddle/phi/kernels/cpu/layer_norm_kernel.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/layer_norm_kernel.h" +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/funcs/layer_norm_util.h" +#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) +#include "paddle/fluid/operators/jit/kernels.h" +#endif +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void LayerNormKernel(const Context& dev_ctx, + const DenseTensor& x, + paddle::optional scale_opt, + paddle::optional bias_opt, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor* y, + DenseTensor* mean, + DenseTensor* var) { + const auto x_dims = x.dims(); + auto* scale = scale_opt.get_ptr(); + auto* bias = bias_opt.get_ptr(); + + dev_ctx.template Alloc(y); + dev_ctx.template Alloc(mean); + dev_ctx.template Alloc(var); + + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + DDim matrix_shape({left, right}); + + auto x_tmp = x; + x_tmp.Resize(matrix_shape); + DenseTensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + +#if defined(PADDLE_WITH_CUDA) || defined(_WIN32) || defined(__APPLE__) || \ + defined(__OSX__) + + funcs::RowwiseMean2D row_mean(left, right, dev_ctx); + + // get mean + row_mean(dev_ctx, x_tmp, mean); + + // get variance + + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, x_tmp, *mean, 0, funcs::SubAndSquareFunctor(), &out); + + row_mean(dev_ctx, out, var); + + // get x_norm + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, x_tmp, *mean, 0, funcs::SubtractFunctor(), &out); + + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, + out, + *var, + 0, + funcs::DivAndSqrtFunctor(static_cast(epsilon)), + &out); + + if (scale) { + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, out, *scale, 1, 
funcs::MultiplyFunctor(), &out); + } + if (bias) { + phi::funcs::ElementwiseCompute, T, T>( + dev_ctx, out, *bias, 1, funcs::AddFunctor(), &out); + } +#else + PADDLE_ENFORCE_EQ(mean->numel(), + left, + phi::errors::InvalidArgument( + "mean's length (%d) is not equal with expected (%d).", + mean->numel(), + left)); + PADDLE_ENFORCE_EQ(var->numel(), + left, + phi::errors::InvalidArgument( + "var's length (%d) is not equal with expected (%d).", + var->numel(), + left)); + if (scale) { + PADDLE_ENFORCE_EQ( + scale->numel(), + right, + phi::errors::InvalidArgument( + "scale's length (%d) is not equal with expected (%d).", + scale->numel(), + right)); + } + if (bias) { + PADDLE_ENFORCE_EQ(bias->numel(), + right, + phi::errors::InvalidArgument( + "bias's length (%d) is not equal with expected (%d).", + bias->numel(), + right)); + } + + auto ker = paddle::operators::jit::KernelFuncs< + paddle::operators::jit::LayerNormTuple, + phi::CPUPlace>::Cache() + .At(right); + ker(x_tmp.data(), + out.data(), + mean->data(), + var->data(), + scale ? scale->data() : nullptr, + bias ? bias->data() : nullptr, + static_cast(left), + static_cast(epsilon), + right); +#endif +} + +} // namespace phi + +PD_REGISTER_KERNEL( + layer_norm, CPU, ALL_LAYOUT, phi::LayerNormKernel, float, double) {} diff --git a/paddle/phi/kernels/funcs/layer_norm_util.h b/paddle/phi/kernels/funcs/layer_norm_util.h new file mode 100644 index 00000000000..e78730cbf38 --- /dev/null +++ b/paddle/phi/kernels/funcs/layer_norm_util.h @@ -0,0 +1,165 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +namespace funcs { + +// Wrap RowwiseMean and ColwiseMean. +// Reuse the cpu codes and replace the gpu codes with cublas_gemv, which is +// significantly faster. Unlike the RowwiseMean and ColwiseMean, the +// implementation only considers 2D. 
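+//
+// A minimal usage sketch (CPU path shown, template arguments written out,
+// shapes illustrative): for a [left, right] matrix, RowwiseMean2D produces
+// a vector of length `left` and ColwiseSum2D a vector of length `right`.
+//
+//   phi::funcs::RowwiseMean2D<phi::CPUContext, float> row_mean(left, right, ctx);
+//   row_mean(ctx, matrix, &mean_vec);  // mean_vec.numel() == left
+//
+//   phi::funcs::ColwiseSum2D<phi::CPUContext, float> col_sum(left, right, ctx);
+//   col_sum(ctx, matrix, &sum_vec);    // sum_vec.numel() == right
+//
+// On the GPU the same calls dispatch to the cublas GEMV specializations
+// declared below.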
+template +struct RowwiseMean2D { + RowwiseMean2D(int left, int right, const DeviceContext& dev_ctx); + + void operator()(const DeviceContext& context, + const DenseTensor& input, + DenseTensor* vec); +}; + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class RowwiseMean2D { + public: + RowwiseMean2D(int left, int right, const DeviceContext& dev_ctx) + : left_(left), right_(right) { + DDim ones_dim({right_}); + divisor_.Resize(ones_dim); + dev_ctx.template Alloc(&divisor_); + phi::funcs::set_constant(dev_ctx, &divisor_, 1.0 / right); + } + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + DenseTensor* out) { + phi::funcs::GetBlas(context).GEMV(false, + left_, + right_, + 1., + input.data(), + divisor_.data(), + 0., + out->data()); + } + + private: + int left_; + int right_; + DenseTensor divisor_; +}; +#endif + +template +class RowwiseMean2D { + public: + RowwiseMean2D(int left, int right, const DeviceContext& dev_ctx) {} + + void operator()(const phi::CPUContext& context, + const DenseTensor& input, + DenseTensor* out) { + row_mean_(context, input, out); + } + + private: + phi::funcs::RowwiseMean row_mean_; +}; + +template +struct ColwiseSum2D { + ColwiseSum2D(int left, int right, const DeviceContext& dev_ctx); + + void operator()(const phi::DeviceContext& context, + const DenseTensor& input, + DenseTensor* vec); +}; + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class ColwiseSum2D { + public: + ColwiseSum2D(int left, int right, const phi::GPUContext& dev_ctx) + : left_(left), right_(right) { + DDim ones_dim({left_}); + divisor_.Resize(ones_dim); + dev_ctx.template Alloc(&divisor_); + phi::funcs::set_constant(dev_ctx, &divisor_, 1.0); + } + + void operator()(const phi::GPUContext& context, + const DenseTensor& input, + DenseTensor* out) { + phi::funcs::GetBlas(context).GEMV(true, + left_, + right_, + 1., + input.data(), + divisor_.data(), + 0., + out->data()); + } + + private: + int left_; + int right_; + DenseTensor divisor_; +}; +#endif + +template +class ColwiseSum2D { + public: + ColwiseSum2D(int left, int right, const phi::CPUContext& dev_ctx) {} + + void operator()(const phi::CPUContext& context, + const DenseTensor& input, + DenseTensor* out) { + col_wise_(context, input, out); + } + + private: + phi::funcs::ColwiseSum col_wise_; +}; + +template +struct SubAndSquareFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } +}; + +template +struct DivAndSqrtFunctor { + explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; } + inline HOSTDEVICE T operator()(T a, T b) const { + return a / (sqrt(b + epsilon_)); + } + + private: + T epsilon_; +}; + +template +struct MulInvVarFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { + return a * std::sqrt(1.0 / b); + } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc index 4201a75be8a..afa2214f5b9 100644 --- a/paddle/phi/kernels/funcs/math_function.cc +++ b/paddle/phi/kernels/funcs/math_function.cc @@ -331,12 +331,20 @@ template struct ColwiseSum; template struct ColwiseSum; template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; + template struct RowwiseSum; template struct RowwiseSum; template struct RowwiseMean; template struct RowwiseMean; +template struct RowwiseMean; +template struct RowwiseMean; + template struct ElementwiseAddTo { 
void operator()(paddle::platform::CPUDeviceContext* ctx, diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu new file mode 100644 index 00000000000..c3f7a526171 --- /dev/null +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu @@ -0,0 +1,139 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/layer_norm_grad_kernel.h" + +#include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/layer_norm_util.h" + +namespace phi { + +template +void LayerNormGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &mean, + const DenseTensor &variance, + paddle::optional scale_opt, + paddle::optional bias_opt, + const DenseTensor &out_grad, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor *x_grad, + DenseTensor *scale_grad, + DenseTensor *bias_grad) { + using U = paddle::operators::LayerNormParamType; + // d_x, d_scale, d_bias may be nullptr + auto *d_x = x_grad; + auto *d_scale = scale_grad; + auto *d_bias = bias_grad; + + auto *scale = scale_opt.get_ptr(); + auto *bias = bias_opt.get_ptr(); + auto *d_y = &out_grad; + + const auto &x_dims = x.dims(); + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int64_t batch_size = static_cast(matrix_dim[0]); + int64_t feature_size = static_cast(matrix_dim[1]); + + auto *x_data = x.data(); + auto *d_y_data = d_y->data(); + + auto *mean_data = mean.data(); + auto *var_data = variance.data(); + + auto *d_x_data = (d_x == nullptr ? nullptr : dev_ctx.template Alloc(d_x)); + + auto x_dtype = x.dtype(); + + phi::DataType scale_bias_dtype; + if (scale != nullptr) { + scale_bias_dtype = scale->dtype(); + } else { + // FIXME(zengjinle): do not find a better way to get the right + // data type of the d_scale and d_bias if scale == nullptr. + if (bias != nullptr) { + scale_bias_dtype = bias->dtype(); + } else { + scale_bias_dtype = x_dtype; + } + } + +#define PADDLE_LAUNCH_LAYERNORM_BWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ + do { \ + auto *scale_data = \ + (scale == nullptr ? nullptr : scale->data()); \ + auto *d_scale_data = \ + (d_scale == nullptr ? nullptr \ + : dev_ctx.template Alloc(d_scale)); \ + auto *d_bias_data = \ + (d_bias == nullptr ? nullptr \ + : dev_ctx.template Alloc(d_bias)); \ + auto *d_x_data = \ + (d_x == nullptr ? 
nullptr : dev_ctx.template Alloc(d_x)); \ + paddle::operators::LayerNormBackward( \ + x_data, \ + d_y_data, \ + scale_data, \ + mean_data, \ + var_data, \ + d_x_data, \ + d_scale_data, \ + d_bias_data, \ + epsilon, \ + batch_size, \ + feature_size, \ + dev_ctx); \ + } while (0) + + if (scale_bias_dtype == x_dtype) { + PADDLE_LAUNCH_LAYERNORM_BWD(T, true); + } else { + PADDLE_LAUNCH_LAYERNORM_BWD(U, false); + } + +#undef PADDLE_LAUNCH_LAYERNORM_BWD +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL(layer_norm_grad, + GPU, + ALL_LAYOUT, + phi::LayerNormGradKernel, + float, + phi::dtype::float16) {} +#elif CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(layer_norm_grad, + GPU, + ALL_LAYOUT, + phi::LayerNormGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(layer_norm_grad, + GPU, + ALL_LAYOUT, + phi::LayerNormGradKernel, + float, + double, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu new file mode 100644 index 00000000000..d87b7c21938 --- /dev/null +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -0,0 +1,229 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
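+//
+// Dispatch summary for the forward GPU kernel in this file (a sketch of the
+// logic below, not an extra code path): the input is flattened to
+// [batch_size, feature_size] at begin_norm_axis. On CUDA builds, when
+// feature_size == 1024 and both Scale and Bias are present, the vectorized
+// ln_fwd_1024_kernel is launched (4 warps of 32 threads per CTA, 16-byte
+// loads); otherwise the generic LayerNormForward kernel is used with a block
+// size chosen by GetDesiredBlockDim(feature_size). When the Scale/Bias dtype
+// differs from X's, they are handled in the parameter type U (e.g. float for
+// float16 inputs).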
+ +#include "paddle/phi/kernels/layer_norm_kernel.h" + +#include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/layer_norm_util.h" + +namespace phi { + +template +void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, + const T *input, + std::vector input_shape, + const T *bias, + const T *scale, + T *output, + T *mean, + T *variance, + int begin_norm_axis, + float eps) { + const auto x_dims = phi::make_ddim(input_shape); + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int64_t batch_size = static_cast(matrix_dim[0]); + int64_t feature_size = static_cast(matrix_dim[1]); + switch (paddle::operators::GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE(paddle::operators::LayerNormForward< + T, + T, + kBlockDim><<>>( + input, scale, bias, output, mean, variance, eps, feature_size)); + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "Product from begin_norm_axis to end in layer_norm must be larger " + "than 1")); + break; + } +} + +template class LayerNormDirectCUDAFunctor; + +template +void LayerNormKernel(const Context &dev_ctx, + const DenseTensor &x, + paddle::optional scale_opt, + paddle::optional bias_opt, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor *y, + DenseTensor *mean, + DenseTensor *var) { + using U = paddle::operators::LayerNormParamType; + auto *scale = scale_opt.get_ptr(); + auto *bias = bias_opt.get_ptr(); + + const auto x_dims = x.dims(); + auto *x_data = x.data(); + auto *y_data = dev_ctx.template Alloc(y); + auto *mean_data = dev_ctx.template Alloc(mean); + auto *var_data = dev_ctx.template Alloc(var); + + auto *void_scale_data = (scale == nullptr ? nullptr : scale->data()); + auto *void_bias_data = (bias == nullptr ? nullptr : bias->data()); + + auto x_dtype = x.dtype(); + phi::DataType scale_bias_dtype; + if (void_scale_data != nullptr) { + scale_bias_dtype = scale->dtype(); + if (void_bias_data != nullptr) { + PADDLE_ENFORCE_EQ( + scale->dtype(), + bias->dtype(), + phi::errors::InvalidArgument("Thie Scale and Bias of layer_norm op " + "should have the same data type.")); + } + } else { + scale_bias_dtype = (void_bias_data != nullptr ? 
bias->dtype() : x_dtype); + } + + bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype; + if (!is_scale_bias_same_dtype_with_x) { + PADDLE_ENFORCE_EQ(scale_bias_dtype, + paddle::experimental::CppTypeToDataType::Type(), + phi::errors::InvalidArgument( + "Unsupported data type of Scale and Bias")); + } + + auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); + int64_t batch_size = static_cast(matrix_dim[0]); + int64_t feature_size = static_cast(matrix_dim[1]); + + auto stream = dev_ctx.stream(); + +#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ + do { \ + switch (paddle::operators::GetDesiredBlockDim(feature_size)) { \ + FIXED_BLOCK_DIM_CASE(paddle::operators::LayerNormForward< \ + T, \ + U, \ + kBlockDim, \ + IsScaleBiasSameDTypeWithX><<>>( \ + x_data, \ + static_cast(void_scale_data), \ + static_cast(void_bias_data), \ + y_data, \ + mean_data, \ + var_data, \ + epsilon, \ + feature_size)); \ + default: \ + PADDLE_THROW(phi::errors::InvalidArgument( \ + "Product from begin_norm_axis to end must be larger than 1")); \ + break; \ + } \ + } while (0) + +#ifdef PADDLE_WITH_CUDA + bool can_call_1024_kernel = false; + if (feature_size == 1024 && scale != nullptr && bias != nullptr) { + can_call_1024_kernel = true; + } + if (can_call_1024_kernel) { + const int WARPS_M = 4; + const int WARPS_N = 1; + const int THREADS_PER_WARP = 32; + const int BYTES_PER_LDG = 16; + const int VecSize = BYTES_PER_LDG / sizeof(T); + + const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; + const int ROWS_PER_CTA = WARPS_M; + + const int grid = static_cast( + std::ceil(batch_size / static_cast(ROWS_PER_CTA))); + if (is_scale_bias_same_dtype_with_x) { + paddle::operators::ln_fwd_1024_kernel< + T, + U, + T, + VecSize, + WARPS_M, + WARPS_N, + BYTES_PER_LDG><<>>( + batch_size, + feature_size, + epsilon, + x_data, + static_cast(void_scale_data), + static_cast(void_bias_data), + mean_data, + var_data, + y_data); + } else { + paddle::operators::ln_fwd_1024_kernel< + T, + U, + U, + VecSize, + WARPS_M, + WARPS_N, + BYTES_PER_LDG><<>>( + batch_size, + feature_size, + epsilon, + x_data, + static_cast(void_scale_data), + static_cast(void_bias_data), + mean_data, + var_data, + y_data); + } + } else { +#endif + if (is_scale_bias_same_dtype_with_x) { + PADDLE_LAUNCH_LAYERNORM_FWD(T, true); + } else { + PADDLE_LAUNCH_LAYERNORM_FWD(U, false); + } +#ifdef PADDLE_WITH_CUDA + } +#endif + +#undef PADDLE_LAUNCH_LAYERNORM_FWD +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +// MIOPEN do not support double +PD_REGISTER_KERNEL(layer_norm, + GPU, + ALL_LAYOUT, + phi::LayerNormKernel, + float, + phi::dtype::float16) {} +#elif CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(layer_norm, + GPU, + ALL_LAYOUT, + phi::LayerNormKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(layer_norm, + GPU, + ALL_LAYOUT, + phi::LayerNormKernel, + float, + double, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/layer_norm_grad_kernel.h b/paddle/phi/kernels/layer_norm_grad_kernel.h new file mode 100644 index 00000000000..c32be63db41 --- /dev/null +++ b/paddle/phi/kernels/layer_norm_grad_kernel.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LayerNormGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& mean, + const DenseTensor& variance, + paddle::optional scale, + paddle::optional bias, + const DenseTensor& out_grad, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/layer_norm_kernel.h b/paddle/phi/kernels/layer_norm_kernel.h new file mode 100644 index 00000000000..c9679420bda --- /dev/null +++ b/paddle/phi/kernels/layer_norm_kernel.h @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/gpu/gpu_decls.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LayerNormKernel(const Context& ctx, + const DenseTensor& x, + paddle::optional scale, + paddle::optional bias, + float epsilon, + int begin_norm_axis, + bool is_test, + DenseTensor* out, + DenseTensor* mean, + DenseTensor* variance); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +template +class LayerNormDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T* input, + std::vector input_shape, + const T* bias, + const T* scale, + T* output, + T* mean, + T* variance, + int begin_norm_axis, + float eps); +}; +#endif + +} // namespace phi diff --git a/paddle/phi/ops/compat/layer_norm_sig.cc b/paddle/phi/ops/compat/layer_norm_sig.cc new file mode 100644 index 00000000000..17a81e9ec01 --- /dev/null +++ b/paddle/phi/ops/compat/layer_norm_sig.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
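+//
+// The mappings below line up the fluid "layer_norm" op description with the
+// phi kernels declared in paddle/phi/kernels/layer_norm_kernel.h and
+// layer_norm_grad_kernel.h: the inputs, attributes and outputs listed in
+// each KernelSignature are handed to the corresponding kernel in that order,
+// with GradVarName("Y") / GradVarName("X") naming the fluid gradient
+// variables.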
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature LayerNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("layer_norm", + {"X", "Scale", "Bias"}, + {"epsilon", "begin_norm_axis", "is_test"}, + {"Y", "Mean", "Variance"}); +} + +KernelSignature LayerNormGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "layer_norm_grad", + {"X", "Mean", "Variance", "Scale", "Bias", GradVarName("Y")}, + {"epsilon", "begin_norm_axis", "is_test"}, + {GradVarName("X"), GradVarName("Scale"), GradVarName("Bias")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(layer_norm, phi::LayerNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(layer_norm_grad, + phi::LayerNormGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py index ca9a489c749..b75dc2c964c 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py @@ -215,6 +215,8 @@ class TestLayerNormOp(unittest.TestCase): for name in ['x', 'scale', 'bias', 'y@GRAD'] }, fetch_list=fetch_list) + # print(y) + # print(out[0]) self.__assert_close(y, out[0], "y") self.__assert_close(mean, out[1], "mean") self.__assert_close(variance, out[2], "variance", 1e-3) @@ -238,6 +240,7 @@ class TestLayerNormOp(unittest.TestCase): def test_check_forward_backward_with_scale_and_bias(self): self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1) self.check_forward_backward( shape=[2, 3, 4, 5], @@ -432,4 +435,5 @@ class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase): if __name__ == '__main__': + paddle.enable_static() unittest.main() -- GitLab From 7d0db6299a3e0b5cf1caa88d20321cfc564637fe Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 17 Mar 2022 14:46:53 +0800 Subject: [PATCH 139/176] [PHI] move roi_pool kernel to phi (#40574) * move roi_pool forward kernel to phi * move roi_pool_grad to phi * fix compile bug * fix compile bug * fix register data_type --- paddle/fluid/operators/roi_pool_op.cc | 16 +- paddle/fluid/operators/roi_pool_op.cu | 306 ------------------ paddle/fluid/operators/roi_pool_op.h | 250 -------------- .../phi/kernels/cpu/roi_pool_grad_kernel.cc | 108 +++++++ paddle/phi/kernels/cpu/roi_pool_kernel.cc | 163 ++++++++++ paddle/phi/kernels/gpu/roi_align_kernel.cu | 1 - .../phi/kernels/gpu/roi_pool_grad_kernel.cu | 165 ++++++++++ paddle/phi/kernels/gpu/roi_pool_kernel.cu | 220 +++++++++++++ paddle/phi/kernels/roi_pool_grad_kernel.h | 34 ++ paddle/phi/kernels/roi_pool_kernel.h | 35 ++ paddle/phi/ops/compat/roi_pool_sig.cc | 37 +++ 11 files changed, 766 insertions(+), 569 deletions(-) delete mode 100644 paddle/fluid/operators/roi_pool_op.cu delete mode 100644 paddle/fluid/operators/roi_pool_op.h create mode 100644 paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/roi_pool_kernel.cc create mode 100644 paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/roi_pool_kernel.cu create mode 100644 paddle/phi/kernels/roi_pool_grad_kernel.h create mode 100644 paddle/phi/kernels/roi_pool_kernel.h create mode 100644 paddle/phi/ops/compat/roi_pool_sig.cc diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index a512e7dcd68..9fd66590cb7 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ 
b/paddle/fluid/operators/roi_pool_op.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/roi_pool_op.h" #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/kernels/roi_pool_kernel.h" namespace paddle { namespace operators { @@ -57,7 +58,7 @@ class ROIPoolOp : public framework::OperatorWithKernel { "%d-dimensional LoDTensor", rois_dims.size())); PADDLE_ENFORCE_EQ( - rois_dims[1], kROISize, + rois_dims[1], phi::kROISize, platform::errors::InvalidArgument( "ROIs should be a 2-D LoDTensor with shape (num_rois, 4)" "given as [[x1, y1, x2, y2], ...]. But the second dimension of " @@ -216,16 +217,7 @@ REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, ops::ROIPoolGradMaker, ops::ROIPoolGradMaker); REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp); -REGISTER_OP_CPU_KERNEL( - roi_pool, - ops::CPUROIPoolOpKernel, - ops::CPUROIPoolOpKernel, - ops::CPUROIPoolOpKernel); -REGISTER_OP_CPU_KERNEL( - roi_pool_grad, - ops::CPUROIPoolGradOpKernel, - ops::CPUROIPoolGradOpKernel, - ops::CPUROIPoolGradOpKernel); + REGISTER_OP_VERSION(roi_pool) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu deleted file mode 100644 index b907b1114bb..00000000000 --- a/paddle/fluid/operators/roi_pool_op.cu +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/roi_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaxinumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -__global__ void GPUROIPoolForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - int* roi_batch_id_data, T* output_data, int64_t* argmax_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (size_t i = index; i < nthreads; i += offset) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - - const T* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = roi_batch_id_data[n]; - int roi_start_w = round(offset_input_rois[0] * spatial_scale); - int roi_start_h = round(offset_input_rois[1] * spatial_scale); - int roi_end_w = round(offset_input_rois[2] * spatial_scale); - int roi_end_h = round(offset_input_rois[3] * spatial_scale); - - int roi_width = max(roi_end_w - roi_start_w + 1, 1); - int roi_height = max(roi_end_h - roi_start_h + 1, 1); - - int hstart = static_cast(floor(static_cast(ph) * - static_cast(roi_height) / - static_cast(pooled_height))); - int wstart = static_cast(floor(static_cast(pw) * - static_cast(roi_width) / - static_cast(pooled_width))); - int hend = static_cast(ceil(static_cast(ph + 1) * - static_cast(roi_height) / - static_cast(pooled_height))); - int wend = static_cast(ceil(static_cast(pw + 1) * - static_cast(roi_width) / - static_cast(pooled_width))); - hstart = min(max(hstart + roi_start_h, 0), height); - hend = min(max(hend + roi_start_h, 0), height); - wstart = min(max(wstart + roi_start_w, 0), width); - wend = min(max(wend + roi_start_w, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - T maxval = is_empty ? 
0 : -std::numeric_limits::max(); - int maxidx = -1; - const T* offset_input_data = - input_data + (roi_batch_ind * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - int input_data_index = h * width + w; - if (offset_input_data[input_data_index] > maxval) { - maxval = offset_input_data[input_data_index]; - maxidx = input_data_index; - } - } - } - output_data[i] = maxval; - if (argmax_data) { - argmax_data[i] = maxidx; - } - } -} - -template -__global__ void GPUROIPoolBackward( - const int nthreads, const T* input_rois, const T* output_grad, - const int64_t* argmax_data, const int num_rois, const float spatial_scale, - const int channels, const int height, const int width, - const int pooled_height, const int pooled_width, int* roi_batch_id_data, - T* input_grad) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - - int roi_batch_ind = roi_batch_id_data[n]; - int input_offset = (roi_batch_ind * channels + c) * height * width; - int output_offset = (n * channels + c) * pooled_height * pooled_width; - const T* offset_output_grad = output_grad + output_offset; - T* offset_input_grad = input_grad + input_offset; - const int64_t* offset_argmax_data = argmax_data + output_offset; - - int argmax = offset_argmax_data[ph * pooled_width + pw]; - if (argmax != -1) { - platform::CudaAtomicAdd( - offset_input_grad + argmax, - static_cast(offset_output_grad[ph * pooled_width + pw])); - } - } -} - -template -class GPUROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* argmax = ctx.Output("Argmax"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - auto in_stride = phi::stride(in_dims); - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - int rois_num = rois->dims()[0]; - - if (rois_num == 0) return; - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; - - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be the same but " - "received batch size of input(ROIs) and input(X) is %d and %d " - "respectively.", - rois_batch_size, batch_size)); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } 
else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be the same but " - "received batch size of input(ROIs) and input(X) is %d and %d " - "respectively.", - rois_batch_size, batch_size)); - - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The number of rois from input(ROIs) and its LOD " - "must be the same. Received rois %d of input(ROIs) " - "but the number of rois %d from its LOD is %d", - rois_num, rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - - GPUROIPoolForward<<>>( - output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, roi_id_data, - out->mutable_data(ctx.GetPlace()), - argmax->mutable_data(ctx.GetPlace())); - } -}; - -template -class GPUROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* rois_lod = ctx.Input("RoisNum"); - auto* argmax = ctx.Input("Argmax"); - - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - int rois_num = rois->dims()[0]; - int channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (x_grad) { - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - - x_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, x_grad, static_cast(0)); - - int output_grad_size = out_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - 
GPUROIPoolBackward<<>>( - output_grad_size, rois->data(), out_grad->data(), - argmax->data(), rois_num, spatial_scale, channels, height, - width, pooled_height, pooled_width, roi_id_data, - x_grad->mutable_data(ctx.GetPlace())); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roi_pool, - ops::GPUROIPoolOpKernel, - ops::GPUROIPoolOpKernel); -REGISTER_OP_CUDA_KERNEL( - roi_pool_grad, - ops::GPUROIPoolGradOpKernel, - ops::GPUROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h deleted file mode 100644 index a104fd49eb3..00000000000 --- a/paddle/fluid/operators/roi_pool_op.h +++ /dev/null @@ -1,250 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -static constexpr int kROISize = 4; - -template -class CPUROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto* argmax = ctx.Output("Argmax"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - auto in_stride = phi::stride(in_dims); - auto argmax_stride = phi::stride(argmax->dims()); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("The rois_batch_size and imgs " - "batch_size must be the same.")); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("The rois_batch_size and imgs " - "batch_size must be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, rois_num_with_lod, - platform::errors::InvalidArgument("The 
rois_num from input " - "and lod must be the same.")); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - - T* output_data = out->mutable_data(ctx.GetPlace()); - int64_t* argmax_data = argmax->mutable_data(ctx.GetPlace()); - - const T* rois_data = rois->data(); - for (int n = 0; n < rois_num; ++n) { - int roi_batch_id = roi_batch_id_data[n]; - int roi_start_w = round(rois_data[0] * spatial_scale); - int roi_start_h = round(rois_data[1] * spatial_scale); - int roi_end_w = round(rois_data[2] * spatial_scale); - int roi_end_h = round(rois_data[3] * spatial_scale); - - // Force malformed ROIs to be 1x1 - int roi_height = std::max(roi_end_h - roi_start_h + 1, 1); - int roi_width = std::max(roi_end_w - roi_start_w + 1, 1); - - const float bin_size_h = - static_cast(roi_height) / static_cast(pooled_height); - const float bin_size_w = - static_cast(roi_width) / static_cast(pooled_width); - - const T* batch_data = input_data + roi_batch_id * in_stride[0]; - - for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < pooled_height; ++ph) { - for (int pw = 0; pw < pooled_width; ++pw) { - // Compute pooling region for this output unit: - // start (included) = floor(ph * roi_height / pooled_height_) - // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_) - int hstart = - static_cast(floor(static_cast(ph) * bin_size_h)); - int wstart = - static_cast(floor(static_cast(pw) * bin_size_w)); - int hend = - static_cast(ceil(static_cast(ph + 1) * bin_size_h)); - int wend = - static_cast(ceil(static_cast(pw + 1) * bin_size_w)); - - hstart = std::min(std::max(hstart + roi_start_h, 0), height); - hend = std::min(std::max(hend + roi_start_h, 0), height); - wstart = std::min(std::max(wstart + roi_start_w, 0), width); - wend = std::min(std::max(wend + roi_start_w, 0), width); - - const int pool_index = ph * pooled_width + pw; - - // Define an empty pooling region to be zero - bool is_empty = (hend <= hstart) || (wend <= wstart); - output_data[pool_index] = - is_empty ? 
0 : -std::numeric_limits::max(); - argmax_data[pool_index] = -1; - - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - const int index = h * width + w; - if (batch_data[index] > output_data[pool_index]) { - output_data[pool_index] = batch_data[index]; - argmax_data[pool_index] = index; - } - } - } - } - } - - batch_data += in_stride[1]; - output_data += out_stride[1]; - argmax_data += argmax_stride[1]; - } - // Increment ROI data pointer - rois_data += roi_stride[0]; - } - return; - } -}; - -template -class CPUROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* argmax = ctx.Input("Argmax"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - - if (in_grad) { - int rois_num = rois->dims()[0]; - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - - const T* rois_data = rois->data(); - const T* out_grad_data = out_grad->data(); - const int64_t* argmax_data = argmax->data(); - T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(ctx.template device_context(), in_grad, - static_cast(0)); - - auto in_stride = phi::stride(in->dims()); - auto argmax_stride = phi::stride(argmax->dims()); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out_grad->dims()); - - int channels = in->dims()[1]; - - for (int n = 0; n < rois_num; ++n) { - int roi_batch_idx = roi_batch_id_data[n]; - T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0]; - for (int c = 0; c < channels; ++c) { - for (int ph = 0; ph < pooled_height; ++ph) { - for (int pw = 0; pw < pooled_width; ++pw) { - int pool_index = ph * pooled_width + pw; - if (argmax_data[pool_index] >= 0) { - auto index = argmax_data[pool_index]; - batch_grad_data[index] += out_grad_data[pool_index]; - } - } - } - batch_grad_data += in_stride[1]; - out_grad_data += out_stride[1]; - argmax_data += argmax_stride[1]; - } - rois_data += roi_stride[0]; - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc new file mode 100644 index 00000000000..0eaa873590e --- /dev/null +++ b/paddle/phi/kernels/cpu/roi_pool_grad_kernel.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_pool_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void RoiPoolGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& arg_max, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* dx) { + if (dx) { + int rois_num = boxes.dims()[0]; + DenseTensor box_batch_id_list = Empty(dev_ctx, {rois_num}); + int* box_batch_id_data = box_batch_id_list.data(); + + int boxes_batch_size; + if (boxes_num) { + boxes_batch_size = boxes_num->numel(); + auto* boxes_num_data = boxes_num->data(); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_data[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_data[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + boxes_batch_size = boxes_lod.size() - 1; + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + + const T* boxes_data = boxes.data(); + const T* out_grad_data = out_grad.data(); + const int64_t* arg_max_data = arg_max.data(); + T* dx_data = dev_ctx.template Alloc(dx); + + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, dx, static_cast(0)); + + auto in_stride = phi::stride(x.dims()); + auto arg_max_stride = phi::stride(arg_max.dims()); + auto roi_stride = phi::stride(boxes.dims()); + auto out_stride = phi::stride(out_grad.dims()); + + int channels = x.dims()[1]; + + for (int n = 0; n < rois_num; ++n) { + int roi_batch_idx = box_batch_id_data[n]; + T* batch_grad_data = dx_data + roi_batch_idx * in_stride[0]; + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + int pool_index = ph * pooled_width + pw; + if (arg_max_data[pool_index] >= 0) { + auto index = arg_max_data[pool_index]; + batch_grad_data[index] += out_grad_data[pool_index]; + } + } + } + batch_grad_data += in_stride[1]; + out_grad_data += out_stride[1]; + arg_max_data += arg_max_stride[1]; + } + boxes_data += roi_stride[0]; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(roi_pool_grad, + CPU, + ALL_LAYOUT, + phi::RoiPoolGradKernel, + float, + double, + int) { + kernel->InputAt(3).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/cpu/roi_pool_kernel.cc b/paddle/phi/kernels/cpu/roi_pool_kernel.cc new file mode 100644 index 00000000000..02020354cd3 --- /dev/null +++ b/paddle/phi/kernels/cpu/roi_pool_kernel.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_pool_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { + +template +void RoiPoolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* out, + DenseTensor* arg_max) { + auto x_dims = x.dims(); + int batch_size = x_dims[0]; + int channels = x_dims[1]; + int height = x_dims[2]; + int width = x_dims[3]; + int rois_num = boxes.dims()[0]; + + auto in_stride = phi::stride(x_dims); + auto arg_max_stride = phi::stride(arg_max->dims()); + auto box_stride = phi::stride(boxes.dims()); + auto out_stride = phi::stride(out->dims()); + + const T* input_data = x.data(); + + DenseTensor box_batch_id_list = Empty(dev_ctx, {rois_num}); + int* box_batch_id_data = box_batch_id_list.data(); + + int boxes_batch_size; + if (boxes_num) { + boxes_batch_size = boxes_num->numel(); + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + phi::errors::InvalidArgument("The boxes_batch_size and imgs " + "batch_size must be the same.")); + auto* boxes_num_data = boxes_num->data(); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_data[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_data[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + boxes_batch_size = boxes_lod.size() - 1; + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + phi::errors::InvalidArgument("The boxes_batch_size and imgs " + "batch_size must be the same.")); + int rois_num_with_lod = boxes_lod[boxes_batch_size]; + PADDLE_ENFORCE_EQ( + rois_num, + rois_num_with_lod, + phi::errors::InvalidArgument("The rois_num from input " + "and lod must be the same.")); + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + + T* output_data = dev_ctx.template Alloc(out); + int64_t* arg_max_data = dev_ctx.template Alloc(arg_max); + + const T* boxes_data = boxes.data(); + for (int n = 0; n < rois_num; ++n) { + int box_batch_id = box_batch_id_data[n]; + int box_start_w = round(boxes_data[0] * spatial_scale); + int box_start_h = round(boxes_data[1] * spatial_scale); + int box_end_w = round(boxes_data[2] * spatial_scale); + int box_end_h = round(boxes_data[3] * spatial_scale); + + // Force malformed ROIs to be 1x1 + int box_height = std::max(box_end_h - box_start_h + 1, 1); + int box_width = std::max(box_end_w - box_start_w + 1, 1); + + const float bin_size_h = + static_cast(box_height) / static_cast(pooled_height); + const float bin_size_w = + static_cast(box_width) / static_cast(pooled_width); + + const T* batch_data = input_data + box_batch_id * in_stride[0]; + + for (int c = 0; c < channels; ++c) { + for (int ph = 0; ph < pooled_height; ++ph) { + for (int pw = 0; pw < pooled_width; ++pw) { + // Compute pooling region for this output unit: + // start (included) = floor(ph * box_height / 
pooled_height_) + // end (excluded) = ceil((ph + 1) * box_height / pooled_height_) + int hstart = + static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = + static_cast(floor(static_cast(pw) * bin_size_w)); + int hend = + static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = + static_cast(ceil(static_cast(pw + 1) * bin_size_w)); + + hstart = std::min(std::max(hstart + box_start_h, 0), height); + hend = std::min(std::max(hend + box_start_h, 0), height); + wstart = std::min(std::max(wstart + box_start_w, 0), width); + wend = std::min(std::max(wend + box_start_w, 0), width); + + const int pool_index = ph * pooled_width + pw; + + // Define an empty pooling region to be zero + bool is_empty = (hend <= hstart) || (wend <= wstart); + output_data[pool_index] = + is_empty ? 0 : -std::numeric_limits::max(); + arg_max_data[pool_index] = -1; + + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = h * width + w; + if (batch_data[index] > output_data[pool_index]) { + output_data[pool_index] = batch_data[index]; + arg_max_data[pool_index] = index; + } + } + } + } + } + + batch_data += in_stride[1]; + output_data += out_stride[1]; + arg_max_data += arg_max_stride[1]; + } + // Increment ROI data pointer + boxes_data += box_stride[0]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_pool, CPU, ALL_LAYOUT, phi::RoiPoolKernel, float, double, int) { + kernel->OutputAt(1).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/gpu/roi_align_kernel.cu b/paddle/phi/kernels/gpu/roi_align_kernel.cu index cd4ed29cdd1..cb3375dee95 100644 --- a/paddle/phi/kernels/gpu/roi_align_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu @@ -18,7 +18,6 @@ #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/empty_kernel.h" #include "paddle/fluid/memory/memory.h" diff --git a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu new file mode 100644 index 00000000000..d093a71d23f --- /dev/null +++ b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu @@ -0,0 +1,165 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
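+//
+// Backward sketch (matching the kernel below): the forward pass records, for
+// every pooled output element, the flat input index that produced the max
+// (Argmax). The gradient is therefore a scatter-add: one CUDA thread per
+// output-gradient element reads its argmax index and atomically adds the
+// incoming gradient into dx at that position; outputs whose pooling region
+// was empty carry an argmax of -1 and contribute nothing.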
+ +#include "paddle/phi/kernels/roi_pool_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void GPURoiPoolBackward(const int nthreads, + const T* input_rois, + const T* output_grad, + const int64_t* arg_max_data, + const int num_rois, + const float spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + int* box_batch_id_data, + T* input_grad) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + + int roi_batch_ind = box_batch_id_data[n]; + int input_offset = (roi_batch_ind * channels + c) * height * width; + int output_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_output_grad = output_grad + output_offset; + T* offset_input_grad = input_grad + input_offset; + const int64_t* offset_arg_max_data = arg_max_data + output_offset; + + int arg_max = offset_arg_max_data[ph * pooled_width + pw]; + if (arg_max != -1) { + paddle::platform::CudaAtomicAdd( + offset_input_grad + arg_max, + static_cast(offset_output_grad[ph * pooled_width + pw])); + } + } +} + +template +void RoiPoolGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& arg_max, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* dx) { + auto x_dims = x.dims(); + int channels = x_dims[1]; + int height = x_dims[2]; + int width = x_dims[3]; + int rois_num = boxes.dims()[0]; + + if (dx) { + DenseTensor box_batch_id_list; + box_batch_id_list.Resize({rois_num}); + int* box_batch_id_data = + dev_ctx.template HostAlloc(&box_batch_id_list); + + auto gplace = dev_ctx.GetPlace(); + if (boxes_num) { + int boxes_batch_size = boxes_num->numel(); + std::vector boxes_num_list(boxes_batch_size); + paddle::memory::Copy(phi::CPUPlace(), + boxes_num_list.data(), + gplace, + boxes_num->data(), + sizeof(int) * boxes_batch_size, + 0); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_list[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_list[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + int boxes_batch_size = boxes_lod.size() - 1; + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + int bytes = box_batch_id_list.numel() * sizeof(int); + auto roi_ptr = paddle::memory::Alloc(dev_ctx, bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + paddle::memory::Copy(gplace, + roi_id_data, + phi::CPUPlace(), + box_batch_id_data, + bytes, + dev_ctx.stream()); + + 
dev_ctx.template Alloc(dx); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, dx, static_cast(0)); + + int output_grad_size = out_grad.numel(); + int blocks = NumBlocks(output_grad_size); + int threads = kNumCUDAThreads; + + if (output_grad_size > 0) { + GPURoiPoolBackward<<>>( + output_grad_size, + boxes.data(), + out_grad.data(), + arg_max.data(), + rois_num, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + roi_id_data, + dx->data()); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_pool_grad, GPU, ALL_LAYOUT, phi::RoiPoolGradKernel, float, double) { + kernel->InputAt(3).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/gpu/roi_pool_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_kernel.cu new file mode 100644 index 00000000000..ab33e2cf647 --- /dev/null +++ b/paddle/phi/kernels/gpu/roi_pool_kernel.cu @@ -0,0 +1,220 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/roi_pool_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/memory/memory.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void GPURoiPoolForward(const int nthreads, + const T* input_data, + const T* input_rois, + const float spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + int* box_batch_id_data, + T* output_data, + int64_t* arg_max_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % channels; + int n = i / pooled_width / pooled_height / channels; + + const T* offset_input_rois = input_rois + n * kROISize; + int box_batch_ind = box_batch_id_data[n]; + int box_start_w = round(offset_input_rois[0] * spatial_scale); + int box_start_h = round(offset_input_rois[1] * spatial_scale); + int box_end_w = round(offset_input_rois[2] * spatial_scale); + int box_end_h = round(offset_input_rois[3] * spatial_scale); + + int box_width = max(box_end_w - box_start_w + 1, 1); + int box_height = max(box_end_h - box_start_h + 1, 1); + + int hstart = static_cast(floor(static_cast(ph) * + static_cast(box_height) / + static_cast(pooled_height))); + int wstart = static_cast(floor(static_cast(pw) * + static_cast(box_width) / + static_cast(pooled_width))); + int hend = static_cast(ceil(static_cast(ph + 1) * + static_cast(box_height) / + static_cast(pooled_height))); + int wend = 
static_cast(ceil(static_cast(pw + 1) * + static_cast(box_width) / + static_cast(pooled_width))); + hstart = min(max(hstart + box_start_h, 0), height); + hend = min(max(hend + box_start_h, 0), height); + wstart = min(max(wstart + box_start_w, 0), width); + wend = min(max(wend + box_start_w, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + T maxval = is_empty ? 0 : -std::numeric_limits::max(); + int maxidx = -1; + const T* offset_input_data = + input_data + (box_batch_ind * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_data_index = h * width + w; + if (offset_input_data[input_data_index] > maxval) { + maxval = offset_input_data[input_data_index]; + maxidx = input_data_index; + } + } + } + output_data[i] = maxval; + if (arg_max_data) { + arg_max_data[i] = maxidx; + } + } +} + +template +void RoiPoolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* out, + DenseTensor* arg_max) { + auto x_dims = x.dims(); + int batch_size = x_dims[0]; + auto in_stride = phi::stride(x_dims); + int channels = x_dims[1]; + int height = x_dims[2]; + int width = x_dims[3]; + + int rois_num = boxes.dims()[0]; + + if (rois_num == 0) return; + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; + + DenseTensor box_batch_id_list; + box_batch_id_list.Resize({rois_num}); + int* box_batch_id_data = dev_ctx.template HostAlloc(&box_batch_id_list); + auto gplace = dev_ctx.GetPlace(); + + if (boxes_num) { + int boxes_batch_size = boxes_num->numel(); + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + phi::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be the same but " + "received batch size of input(ROIs) and input(X) is %d and %d " + "respectively.", + boxes_batch_size, + batch_size)); + std::vector boxes_num_list(boxes_batch_size); + paddle::memory::Copy(phi::CPUPlace(), + boxes_num_list.data(), + gplace, + boxes_num->data(), + sizeof(int) * boxes_batch_size, + 0); + int start = 0; + for (int n = 0; n < boxes_batch_size; ++n) { + for (int i = start; i < start + boxes_num_list[n]; ++i) { + box_batch_id_data[i] = n; + } + start += boxes_num_list[n]; + } + } else { + auto boxes_lod = boxes.lod().back(); + int boxes_batch_size = boxes_lod.size() - 1; + PADDLE_ENFORCE_EQ( + boxes_batch_size, + batch_size, + phi::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be the same but " + "received batch size of input(ROIs) and input(X) is %d and %d " + "respectively.", + boxes_batch_size, + batch_size)); + + int boxes_num_with_lod = boxes_lod[boxes_batch_size]; + PADDLE_ENFORCE_EQ(rois_num, + boxes_num_with_lod, + phi::errors::InvalidArgument( + "The number of rois from input(ROIs) and its LOD " + "must be the same. 
Received rois %d of input(ROIs) " + "but the number of rois %d from its LOD is %d", + rois_num, + boxes_num_with_lod)); + for (int n = 0; n < boxes_batch_size; ++n) { + for (size_t i = boxes_lod[n]; i < boxes_lod[n + 1]; ++i) { + box_batch_id_data[i] = n; + } + } + } + + int bytes = box_batch_id_list.numel() * sizeof(int); + auto box_ptr = paddle::memory::Alloc(dev_ctx, bytes); + int* box_id_data = reinterpret_cast(box_ptr->ptr()); + paddle::memory::Copy(gplace, + box_id_data, + phi::CPUPlace(), + box_batch_id_data, + bytes, + dev_ctx.stream()); + + T* output_data = dev_ctx.template Alloc(out); + int64_t* arg_max_data = dev_ctx.template Alloc(arg_max); + + GPURoiPoolForward<<>>( + output_size, + x.data(), + boxes.data(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + box_id_data, + output_data, + arg_max_data); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_pool, GPU, ALL_LAYOUT, phi::RoiPoolKernel, float, double) { + kernel->OutputAt(1).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/roi_pool_grad_kernel.h b/paddle/phi/kernels/roi_pool_grad_kernel.h new file mode 100644 index 00000000000..d7f1c378f75 --- /dev/null +++ b/paddle/phi/kernels/roi_pool_grad_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void RoiPooGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + const DenseTensor& arg_max, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/roi_pool_kernel.h b/paddle/phi/kernels/roi_pool_kernel.h new file mode 100644 index 00000000000..c6ff6f22361 --- /dev/null +++ b/paddle/phi/kernels/roi_pool_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
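[Illustrative sketch, not part of the patch above.] The grad kernels in this patch rely on the Argmax tensor written by the forward pass: each element of the output gradient is routed back to the single input element that produced the maximum in its bin (GPU uses an atomic add; -1 marks an empty bin). A simplified CPU-side version of that scatter, with hypothetical names, is:

#include <cstdint>
#include <vector>

// Sketch: scatter out_grad back to in_grad through the recorded argmax indices.
void RoiPoolBackwardScatter(const std::vector<float>& out_grad,
                            const std::vector<int64_t>& arg_max,
                            const std::vector<int>& roi_batch_ids,
                            int channels, int height, int width,
                            int pooled_height, int pooled_width,
                            std::vector<float>* in_grad) {
  const int rois = static_cast<int>(roi_batch_ids.size());
  const int bins = pooled_height * pooled_width;
  for (int n = 0; n < rois; ++n) {
    for (int c = 0; c < channels; ++c) {
      const size_t out_off =
          (static_cast<size_t>(n) * channels + c) * bins;
      const size_t in_off =
          (static_cast<size_t>(roi_batch_ids[n]) * channels + c) * height * width;
      for (int i = 0; i < bins; ++i) {
        const int64_t idx = arg_max[out_off + i];
        if (idx != -1) (*in_grad)[in_off + idx] += out_grad[out_off + i];
      }
    }
  }
}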
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +static constexpr int kROISize = 4; + +template +void RoiPoolKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + DenseTensor* out, + DenseTensor* arg_max); + +} // namespace phi diff --git a/paddle/phi/ops/compat/roi_pool_sig.cc b/paddle/phi/ops/compat/roi_pool_sig.cc new file mode 100644 index 00000000000..d04c645f183 --- /dev/null +++ b/paddle/phi/ops/compat/roi_pool_sig.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature RoiPoolOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("roi_pool", + {"X", "ROIs", "RoisNum"}, + {"pooled_height", "pooled_width", "spatial_scale"}, + {"Out", "Argmax"}); +} + +KernelSignature RoiPoolOpGradArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("roi_pool_grad", + {"X", "ROIs", "RoisNum", "Argmax", GradVarName("Out")}, + {"pooled_height", "pooled_width", "spatial_scale"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(roi_pool, phi::RoiPoolOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(roi_pool_grad, phi::RoiPoolOpGradArgumentMapping); -- GitLab From 883a8eeaca3e934b070c8dbac6c4e0e733fea8dd Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 17 Mar 2022 15:12:39 +0800 Subject: [PATCH 140/176] rename math (#40641) --- paddle/fluid/operators/determinant_op.h | 2 +- paddle/fluid/operators/eig_op.h | 2 +- .../elementwise/elementwise_add_op.h | 2 +- .../elementwise/elementwise_mul_op.h | 2 +- paddle/fluid/operators/lu_op.h | 2 +- paddle/phi/kernels/CMakeLists.txt | 3 +- paddle/phi/kernels/cpu/elementwise_kernel.cc | 117 +++++++++++++++ paddle/phi/kernels/cpu/math_kernel.cc | 140 ------------------ .../phi/kernels/cpu/matrix_rank_tol_kernel.cc | 2 +- .../{math_kernel.cc => elementwise_kernel.cc} | 2 +- paddle/phi/kernels/elementwise_kernel.h | 98 +++++++++++- paddle/phi/kernels/gpu/elementwise_kernel.cu | 93 ++++++++++++ paddle/phi/kernels/gpu/math_kernel.cu | 125 ---------------- .../phi/kernels/gpu/matrix_rank_tol_kernel.cu | 2 +- .../impl/cholesky_solve_grad_kernel_impl.h | 2 +- .../impl/determinant_grad_kernel_impl.h | 2 +- .../phi/kernels/impl/eigh_grad_kernel_impl.h | 2 +- paddle/phi/kernels/math_kernel.h | 117 --------------- .../tests/kernels/test_elementwise_dev_api.cc | 2 +- 19 files changed, 320 insertions(+), 397 deletions(-) delete mode 100644 paddle/phi/kernels/cpu/math_kernel.cc rename paddle/phi/kernels/{math_kernel.cc => elementwise_kernel.cc} (98%) delete mode 100644 paddle/phi/kernels/gpu/math_kernel.cu delete mode 100644 paddle/phi/kernels/math_kernel.h diff --git a/paddle/fluid/operators/determinant_op.h 
b/paddle/fluid/operators/determinant_op.h index e6de0ee3548..a1fe8a25665 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -22,6 +22,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/diag_functor.h" @@ -30,7 +31,6 @@ #include "paddle/phi/kernels/funcs/unsqueeze.h" #include "paddle/phi/kernels/impl/determinant_grad_kernel_impl.h" #include "paddle/phi/kernels/impl/determinant_kernel_impl.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index 5e4c83e1a45..6daf05a9d77 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -21,13 +21,13 @@ #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/slice.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index a995877778e..c28abb916b7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -27,7 +27,7 @@ limitations under the License. */ // only can include the headers in paddle/phi/include dirs #include "paddle/phi/kernels/elementwise_grad_kernel.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 58a3123c7e3..6f4aba93d56 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/cpu_info.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index 6e2ac4617da..2414ae68438 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -18,9 +18,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/set_value_op.h" #include "paddle/fluid/operators/svd_helper.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/tril_triu_compute.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/triangular_solve_kernel.h" namespace paddle { diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index d16f5f725df..02b5b2d74ad 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -27,7 +27,7 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. -set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel math_kernel +set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel @@ -35,7 +35,6 @@ set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_k kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) -kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) diff --git a/paddle/phi/kernels/cpu/elementwise_kernel.cc b/paddle/phi/kernels/cpu/elementwise_kernel.cc index 37ad18df56e..095d11720ce 100644 --- a/paddle/phi/kernels/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_kernel.cc @@ -12,10 +12,81 @@ // See the License for the specific language governing permissions and // limitations under the License. 
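[Illustrative sketch, not part of the patch above.] After this rename, callers that previously included paddle/phi/kernels/math_kernel.h include paddle/phi/kernels/elementwise_kernel.h instead; the functional helpers it declares (phi::Add, phi::Subtract, phi::Divide, phi::Multiply, which build a MetaTensor, run ElementwiseInferMeta, and dispatch to the corresponding kernel) are otherwise unchanged. A hedged usage sketch, assuming a CPU build with the phi kernels registered and a ready device context:

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/kernels/elementwise_kernel.h"

// Sketch: the functional form returns a freshly allocated DenseTensor whose
// shape/dtype come from ElementwiseInferMeta on x and y.
phi::DenseTensor AddExample(const phi::CPUContext& dev_ctx,
                            const phi::DenseTensor& x,
                            const phi::DenseTensor& y) {
  return phi::Add<float>(dev_ctx, x, y);
}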
+#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" +namespace phi { + +#define DEFINE_CPU_ELEMENTWISE_OP(name) \ + template \ + void name##RawKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + dev_ctx.template Alloc(out); \ + if (x.dims() == y.dims()) { \ + SameDimsElementwiseCompute>()( \ + dev_ctx, x, y, out); \ + } else { \ + auto x_dims = x.dims(); \ + auto y_dims = y.dims(); \ + if (x_dims.size() >= y_dims.size()) { \ + funcs::ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, funcs::name##Functor(), out); \ + } else { \ + funcs::ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ + } \ + } \ + } + +template +void DivideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + // allocate memory for out + dev_ctx.template Alloc(out); + if (x.dims() == y.dims() && std::is_floating_point::value) { + SameDimsElementwiseCompute>()( + dev_ctx, x, y, out); + } else { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + if (x_dims.size() >= y_dims.size()) { + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::DivideFunctor(), out); + } else { + funcs::ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); + } + } +} + +// Create the definition of Add +DEFINE_CPU_ELEMENTWISE_OP(Add) + +// Create the definition of Subtract +DEFINE_CPU_ELEMENTWISE_OP(Subtract) + +// Create the definition of Multiply +DEFINE_CPU_ELEMENTWISE_OP(Multiply) + +} // namespace phi + +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + +// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 +// using bfloat16 = ::phi::dtype::bfloat16; + PD_REGISTER_KERNEL(elementwise_fmax, CPU, ALL_LAYOUT, @@ -33,3 +104,49 @@ PD_REGISTER_KERNEL(elementwise_fmin, double, int, int64_t) {} + +PD_REGISTER_KERNEL(add_raw, + CPU, + ALL_LAYOUT, + phi::AddRawKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128) {} +PD_REGISTER_KERNEL(subtract_raw, + CPU, + ALL_LAYOUT, + phi::SubtractRawKernel, + float, + double, + int16_t, + int, + int64_t, + complex64, + complex128, + phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(divide_raw, + CPU, + ALL_LAYOUT, + phi::DivideRawKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PD_REGISTER_KERNEL(multiply_raw, + CPU, + ALL_LAYOUT, + phi::MultiplyRawKernel, + float, + double, + int, + int64_t, + bool, + complex64, + complex128, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/math_kernel.cc deleted file mode 100644 index 0047940fd17..00000000000 --- a/paddle/phi/kernels/cpu/math_kernel.cc +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/math_kernel.h" - -#include "paddle/phi/api/ext/dispatch.h" -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/elementwise.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" -#include "paddle/phi/kernels/funcs/elementwise_functor.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/framework/eigen.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/complex.h" - -namespace phi { - -#define DEFINE_CPU_ELEMENTWISE_OP(name) \ - template \ - void name##RawKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - dev_ctx.template Alloc(out); \ - if (x.dims() == y.dims()) { \ - SameDimsElementwiseCompute>()( \ - dev_ctx, x, y, out); \ - } else { \ - auto x_dims = x.dims(); \ - auto y_dims = y.dims(); \ - if (x_dims.size() >= y_dims.size()) { \ - funcs::ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, funcs::name##Functor(), out); \ - } else { \ - funcs::ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ - } \ - } \ - } - -template -void DivideRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - // allocate memory for out - dev_ctx.template Alloc(out); - if (x.dims() == y.dims() && std::is_floating_point::value) { - SameDimsElementwiseCompute>()( - dev_ctx, x, y, out); - } else { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - if (x_dims.size() >= y_dims.size()) { - funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::DivideFunctor(), out); - } else { - funcs::ElementwiseCompute, T>( - dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); - } - } -} - -// Create the definition of Add -DEFINE_CPU_ELEMENTWISE_OP(Add) - -// Create the definition of Subtract -DEFINE_CPU_ELEMENTWISE_OP(Subtract) - -// Create the definition of Multiply -DEFINE_CPU_ELEMENTWISE_OP(Multiply) - -} // namespace phi - -using complex64 = ::phi::dtype::complex; -using complex128 = ::phi::dtype::complex; - -// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 -// using bfloat16 = ::phi::dtype::bfloat16; -PD_REGISTER_KERNEL(add_raw, - CPU, - ALL_LAYOUT, - phi::AddRawKernel, - float, - double, - int16_t, - int, - int64_t, - complex64, - complex128) {} -PD_REGISTER_KERNEL(subtract_raw, - CPU, - ALL_LAYOUT, - phi::SubtractRawKernel, - float, - double, - int16_t, - int, - int64_t, - complex64, - complex128, - phi::dtype::bfloat16) {} -PD_REGISTER_KERNEL(divide_raw, - CPU, - ALL_LAYOUT, - phi::DivideRawKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PD_REGISTER_KERNEL(multiply_raw, - CPU, - ALL_LAYOUT, - phi::MultiplyRawKernel, - float, - double, - int, - int64_t, - bool, - complex64, - complex128, - phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc index 636018ffa68..ae1e406d16e 100644 --- 
a/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc +++ b/paddle/phi/kernels/cpu/matrix_rank_tol_kernel.cc @@ -17,12 +17,12 @@ #include #include #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/reduce_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/elementwise_kernel.cc similarity index 98% rename from paddle/phi/kernels/math_kernel.cc rename to paddle/phi/kernels/elementwise_kernel.cc index 5aad2375ebb..9d10a48c9e0 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/elementwise_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/elementwise_kernel.h b/paddle/phi/kernels/elementwise_kernel.h index c1e73ad91c6..b064ecc454c 100644 --- a/paddle/phi/kernels/elementwise_kernel.h +++ b/paddle/phi/kernels/elementwise_kernel.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" +#include "paddle/phi/infermeta/binary.h" namespace phi { @@ -33,4 +33,100 @@ void ElementwiseFMinKernel(const Context& dev_ctx, int axis, DenseTensor* out); +template +void AddRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void AddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +void SubtractRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void SubtractKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +void DivideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void DivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +void MultiplyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void MultiplyKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +template +DenseTensor Add(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + AddKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +template +DenseTensor Subtract(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + SubtractKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +template +DenseTensor Divide(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, 
&meta_out); + DivideKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + +template +DenseTensor Multiply(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y) { + DenseTensor dense_out; + MetaTensor meta_out(&dense_out); + ElementwiseInferMeta(x, y, &meta_out); + MultiplyKernel(dev_ctx, x, y, &dense_out); + return dense_out; +} + } // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_kernel.cu b/paddle/phi/kernels/gpu/elementwise_kernel.cu index 2cffc68fa06..a57d89013f9 100644 --- a/paddle/phi/kernels/gpu/elementwise_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_kernel.cu @@ -13,9 +13,50 @@ // limitations under the License. #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" +namespace phi { + +#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ + template \ + void name##RawKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + std::vector inputs; \ + std::vector outputs; \ + inputs.emplace_back(&x); \ + inputs.emplace_back(&y); \ + outputs.emplace_back(out); \ + dev_ctx.template Alloc(out); \ + funcs::BroadcastKernel( \ + dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ + } + +/** + * Kernels + */ + +// Create the definition of Add +DEFINE_CUDA_ELEMENTWISE_OP(Add) +// Create the definition of Subtract +DEFINE_CUDA_ELEMENTWISE_OP(Subtract) +// Create the definition of Multiply +DEFINE_CUDA_ELEMENTWISE_OP(Multiply) +// Create the definition of Divide +DEFINE_CUDA_ELEMENTWISE_OP(Divide) + +} // namespace phi + +using float16 = phi::dtype::float16; +using bfloat16 = phi::dtype::bfloat16; +using complex64 = ::phi::dtype::complex; +using complex128 = ::phi::dtype::complex; + PD_REGISTER_KERNEL(elementwise_fmax, GPU, ALL_LAYOUT, @@ -33,3 +74,55 @@ PD_REGISTER_KERNEL(elementwise_fmin, double, int, int64_t) {} + +PD_REGISTER_KERNEL(add_raw, + GPU, + ALL_LAYOUT, + phi::AddRawKernel, + float, + double, + int16_t, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} +PD_REGISTER_KERNEL(subtract_raw, + GPU, + ALL_LAYOUT, + phi::SubtractRawKernel, + float, + double, + int16_t, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} +PD_REGISTER_KERNEL(divide_raw, + GPU, + ALL_LAYOUT, + phi::DivideRawKernel, + float, + double, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} +PD_REGISTER_KERNEL(multiply_raw, + GPU, + ALL_LAYOUT, + phi::MultiplyRawKernel, + float, + double, + int, + int64_t, + bool, + float16, + complex64, + complex128, + bfloat16) {} diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu deleted file mode 100644 index d33f2164682..00000000000 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/kernels/math_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/funcs/broadcast_function.h" -#include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/gpu/reduce.h" - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif - -#include "paddle/phi/common/complex.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/core/compat/convert_utils.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { - -#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ - template \ - void name##RawKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - std::vector inputs; \ - std::vector outputs; \ - inputs.emplace_back(&x); \ - inputs.emplace_back(&y); \ - outputs.emplace_back(out); \ - dev_ctx.template Alloc(out); \ - funcs::BroadcastKernel( \ - dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ - } - -/** - * Kernels - */ - -// Create the definition of Add -DEFINE_CUDA_ELEMENTWISE_OP(Add) -// Create the definition of Subtract -DEFINE_CUDA_ELEMENTWISE_OP(Subtract) -// Create the definition of Multiply -DEFINE_CUDA_ELEMENTWISE_OP(Multiply) -// Create the definition of Divide -DEFINE_CUDA_ELEMENTWISE_OP(Divide) - -} // namespace phi - -using float16 = phi::dtype::float16; -using bfloat16 = phi::dtype::bfloat16; -using complex64 = ::phi::dtype::complex; -using complex128 = ::phi::dtype::complex; - -PD_REGISTER_KERNEL(add_raw, - GPU, - ALL_LAYOUT, - phi::AddRawKernel, - float, - double, - int16_t, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(subtract_raw, - GPU, - ALL_LAYOUT, - phi::SubtractRawKernel, - float, - double, - int16_t, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(divide_raw, - GPU, - ALL_LAYOUT, - phi::DivideRawKernel, - float, - double, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} -PD_REGISTER_KERNEL(multiply_raw, - GPU, - ALL_LAYOUT, - phi::MultiplyRawKernel, - float, - double, - int, - int64_t, - bool, - float16, - complex64, - complex128, - bfloat16) {} diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu index 7796132ec07..66ba30f7ce6 100644 --- a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu +++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu @@ -23,11 +23,11 @@ #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/abs_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/compare_functors.h" #include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/reduce_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h index 72741e6d3a0..e3ea10705d2 100644 --- a/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/cholesky_solve_grad_kernel_impl.h @@ -19,6 +19,7 @@ #include "paddle/phi/kernels/cholesky_solve_kernel.h" #include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/copy_kernel.h" +#include 
"paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/expand_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -27,7 +28,6 @@ #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_reduce.h" #include "paddle/phi/kernels/funcs/tril_triu_compute.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h index 038ef0c214b..e4356e9af39 100644 --- a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h @@ -17,13 +17,13 @@ #include "paddle/phi/kernels/determinant_grad_kernel.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h index 5b71fd7fa3a..5e06435b28e 100644 --- a/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/eigh_grad_kernel_impl.h @@ -16,11 +16,11 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" -#include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" diff --git a/paddle/phi/kernels/math_kernel.h b/paddle/phi/kernels/math_kernel.h deleted file mode 100644 index ddc3a46e989..00000000000 --- a/paddle/phi/kernels/math_kernel.h +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/infermeta/binary.h" -namespace phi { - -template -void AddRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void AddKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void SubtractRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void SubtractKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void DivideRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void DivideKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -void MultiplyRawKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void MultiplyKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - -template -DenseTensor Add(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - AddKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template -DenseTensor Subtract(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - SubtractKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template -DenseTensor Divide(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - DivideKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -template -DenseTensor Multiply(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y) { - DenseTensor dense_out; - MetaTensor meta_out(&dense_out); - ElementwiseInferMeta(x, y, &meta_out); - MultiplyKernel(dev_ctx, x, y, &dense_out); - return dense_out; -} - -} // namespace phi diff --git a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc index 3e5f9650741..9552c02976f 100644 --- a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/api/lib/utils/allocator.h" -- GitLab From 60899549196cf1785b09f54f2a5100c922080cc4 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Thu, 17 Mar 2022 15:12:44 +0800 Subject: [PATCH 141/176] [Bug fixes] Fix partial grad conflicts (#40655) * [Eager] Support eager grad interface, draft version * Support eager grad interface with allow_unused and multi startup_op * Fix code format * Fix allow_unused case, return PyNone if tensor not initialize * Support output's stop_gradient related to create_graph * Support grad exception case in eager mode, fix coverage CI * Update ToPyObject, return PyNone if not initialize * AccumulationNode add FLAGS_retain_grad_for_all_tensor * Fix ci issue * Fix CI issue * fix, use core.eager.Tensor * Add func SetBufferSlotRankZeros for GradTensorHolder * Support retain_graph by using ClearTensorWrappers * Support retain_graph by using ClearTensorWrappers * Update retain_graph and no_grad_vars related test case * Update code gen logic for ClearTensorWrappers * Fix by override statement * fix override func args * Support retain_graph, update unit tests * Updated ClearTensorWrappers logic * fix grad python interface * Use deep copy and update unit tests * Polish code * Polish code * Fix CI issue, Deep copy only use when user set grad_tensors * Fix CI, use Backward instead RunBackward * Fix CI, Declare kernel explicitly in test file * Polish, remove vector of TensorWrapper * Refactor the logic of grad/backward, polish codes * Update code after merge upstream develop * Polish after merge upstream develop * Update to adapt new GradNodeBase superclass * Fix error introduced during conflict resolution * Update purify potential_startup_nodes logic * Fix errors * Polish code * Remove useless args for ToPyObject * Remove useless TensorWrappersSet * Fix code-format, re-install pre-commit * Fix pre-process logic for potential_startup_ops * Update unit tests, use eager mode * Fix conflicts --- paddle/fluid/eager/backward.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index f2d5f338bd4..75ddfb92275 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -370,7 +370,7 @@ std::vector RunBackward( if (grad_tensors[i].is_initialized()) { // Deep copy paddle::experimental::Tensor tmp_tensor; - tmp_tensor.copy_(grad_tensors[i], true); + tmp_tensor.copy_(grad_tensors[i], grad_tensors[i].inner_place(), true); node_input_buffers_dict[grad_node]->add(input_info.first, input_info.second, tmp_tensor); } else { -- GitLab From da558f0e4e0bdcd2bc7e210c7eda8d617397b084 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Thu, 17 Mar 2022 15:14:11 +0800 Subject: [PATCH 142/176] [ROCm] fix bfloat16 support, test=develop (#40401) --- paddle/fluid/imperative/gradient_accumulator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 12aa13bbacc..499cf4d8ad6 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -423,7 +423,7 @@ void TensorAdd(const VarType& src, VarType* dst) { } if (data_type == framework::proto::VarType::BF16) { if (platform::is_gpu_place(place)) { -#if 
defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return TensorAddImpl( src_tensor, dst_tensor, place); #else -- GitLab From fcbb7440bfd76c0fb26c32feb70584b3f2fd75f5 Mon Sep 17 00:00:00 2001 From: heliqi <1101791222@qq.com> Date: Thu, 17 Mar 2022 15:18:23 +0800 Subject: [PATCH 143/176] CopyFromCpu and CopyToCpu of Onnxruntime back-end optimize (#40561) * add onnxruntime predictor * Add code comments * support link paddle2onnx onnxruntime * support onnxruntime with python * support onnxruntime with python * support onnxruntime with windows * paddle2onnx compile with windows * supoort windows compile * supoort windows compile with onnxruntime * supoort windows compile with paddle2onnx * supoort mac compile * compile with mac * compile with mac * add code comments * fix remind word * code optimization * add test case * add test case * add inference demo_ci test case * fix compile paddle2onnx with no python * add inference demo_ci test case * add inference demo_ci test case * add inference infer_ut test case * support c go api and test cases * add converage test case * add converage test case * add capi test case * add capi test case * fix onnxruntime copyfromcpu and copytocpu * fix goapi * modify code --- cmake/external/paddle2onnx.cmake | 1 + .../inference/api/details/CMakeLists.txt | 6 +- .../inference/api/details/zero_copy_tensor.cc | 151 +++++++++++++++- .../inference/api/onnxruntime_predictor.cc | 161 +++++++----------- .../inference/api/onnxruntime_predictor.h | 28 +-- paddle/fluid/inference/api/paddle_tensor.h | 22 +++ 6 files changed, 237 insertions(+), 132 deletions(-) diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake index 661c3675c84..ba6f0396008 100644 --- a/cmake/external/paddle2onnx.cmake +++ b/cmake/external/paddle2onnx.cmake @@ -61,6 +61,7 @@ set(PADDLE2ONNX_OPTIONAL_ARGS -DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH} -DWITH_STATIC=OFF -DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ${EXTERNAL_OPTIONAL_ARGS} diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt index 4341fb0a9cc..b2cfb060dd3 100644 --- a/paddle/fluid/inference/api/details/CMakeLists.txt +++ b/paddle/fluid/inference/api/details/CMakeLists.txt @@ -14,7 +14,11 @@ # cc_library(reset_tensor_array SRCS reset_tensor_array.cc DEPS lod_tensor scope) -cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce) +if (WITH_ONNXRUNTIME) + cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce onnxruntime) +else (WITH_ONNXRUNTIME) + cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce) +endif (WITH_ONNXRUNTIME) cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc) cc_test(zero_copy_tensor_test SRCS zero_copy_tensor_test.cc DEPS paddle_inference_api) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 18b1d09f0e8..66dec0157d9 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -22,12 +22,22 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/core/allocator.h" +#ifdef PADDLE_WITH_ONNXRUNTIME +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" 
+#endif namespace paddle_infer { using float16 = paddle::platform::float16; void Tensor::Reshape(const std::vector &shape) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + shape_.assign(shape.begin(), shape.end()); + return; + } +#endif + PADDLE_ENFORCE_EQ( name_.empty(), false, paddle::platform::errors::PreconditionNotMet( @@ -123,6 +133,11 @@ T *Tensor::data(PlaceType *place, int *size) const { } DataType Tensor::type() const { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + return dtype_; + } +#endif EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto type = paddle::framework::TransToProtoVarType(tensor->dtype()); if (type == paddle::framework::proto::VarType::FP32) { @@ -145,6 +160,13 @@ PlaceType Tensor::place() const { return place_; } template void Tensor::CopyFromCpu(const T *data) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + ORTCopyFromCpu(data); + return; + } +#endif + EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_GE(tensor->numel(), 0, paddle::platform::errors::PreconditionNotMet( @@ -382,6 +404,13 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, template void Tensor::CopyToCpu(T *data) const { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + ORTCopyToCpu(data); + return; + } +#endif + CopyToCpuImpl(data, nullptr, nullptr, nullptr); } @@ -489,12 +518,7 @@ template PD_INFER_DECL uint8_t *Tensor::mutable_data(PlaceType place); template PD_INFER_DECL int8_t *Tensor::mutable_data(PlaceType place); template PD_INFER_DECL float16 *Tensor::mutable_data(PlaceType place); -Tensor::Tensor(void *scope) : scope_{scope} { - PADDLE_ENFORCE_NOT_NULL(scope_, - paddle::platform::errors::PreconditionNotMet( - "The `scope` can not be nullptr. It should be " - "set to the pointer of scope.")); -} +Tensor::Tensor(void *scope) : scope_{scope} {} template void *Tensor::FindTensor() const { @@ -513,6 +537,26 @@ void *Tensor::FindTensor() const { } std::vector Tensor::shape() const { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (is_ort_tensor_) { + std::vector shape; + // input handle + if (idx_ < 0) { + shape.assign(shape_.begin(), shape_.end()); + } else { // output handle + auto binding = binding_.lock(); + PADDLE_ENFORCE_NOT_NULL(binding, + paddle::platform::errors::PreconditionNotMet( + "output tensor [%s] no binding ptr", name_)); + std::vector outputs = binding->GetOutputValues(); + Ort::Value &value = outputs[idx_]; + auto info = value.GetTensorTypeAndShapeInfo(); + auto ort_shape = info.GetShape(); + shape.assign(ort_shape.begin(), ort_shape.end()); + } + return shape; + } +#endif EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_NOT_NULL( tensor_, paddle::platform::errors::PreconditionNotMet( @@ -573,4 +617,99 @@ void Tensor::SetPlace(PlaceType place, int device) { device_ = device; } +#ifdef PADDLE_WITH_ONNXRUNTIME +void Tensor::SetOrtMark(bool is_ort_tensor) { is_ort_tensor_ = is_ort_tensor; } + +void Tensor::SetOrtBinding(const std::shared_ptr binding) { + binding_ = binding; +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, float *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int64_t *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int32_t *data, + size_t size, const 
int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, uint8_t *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, int8_t *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, data, size, shape, + shape_len); +} + +Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, float16 *data, + size_t size, const int64_t *shape, size_t shape_len) { + return Ort::Value::CreateTensor(memory_info, static_cast(data), + size * sizeof(float16), shape, shape_len, + ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16); +} + +template +void Tensor::ORTCopyFromCpu(const T *data) { + auto binding = binding_.lock(); + PADDLE_ENFORCE_NOT_NULL(binding, + paddle::platform::errors::PreconditionNotMet( + "input tensor [%s] no binding ptr", name_)); + const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda"; + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, device_, + OrtMemTypeDefault); + size_t size = std::accumulate(begin(shape_), end(shape_), 1UL, + std::multiplies()); + auto ort_value = GetOrtVaule(memory_info, const_cast(data), size, + shape_.data(), shape_.size()); + binding->BindInput(name_.c_str(), ort_value); +} + +template +void Tensor::ORTCopyToCpu(T *data) const { + auto binding = binding_.lock(); + PADDLE_ENFORCE_NOT_NULL(binding, + paddle::platform::errors::PreconditionNotMet( + "output tensor [%s] no binding ptr", name_)); + std::vector outputs = binding->GetOutputValues(); + Ort::Value &value = outputs[idx_]; + auto info = value.GetTensorTypeAndShapeInfo(); + size_t size = info.GetElementCount() * sizeof(T); + + if (place_ == PlaceType::kCPU) { + std::memcpy(static_cast(data), value.GetTensorData(), size); + } else { + paddle::memory::Copy(paddle::platform::CPUPlace(), + static_cast(data), + paddle::platform::CUDAPlace(device_), + value.GetTensorData(), size, nullptr); + } +} + +template void Tensor::ORTCopyFromCpu(const float *data); +template void Tensor::ORTCopyFromCpu(const int64_t *data); +template void Tensor::ORTCopyFromCpu(const int32_t *data); +template void Tensor::ORTCopyFromCpu(const uint8_t *data); +template void Tensor::ORTCopyFromCpu(const int8_t *data); +template void Tensor::ORTCopyFromCpu(const float16 *data); + +template void Tensor::ORTCopyToCpu(float *data) const; +template void Tensor::ORTCopyToCpu(int32_t *data) const; +template void Tensor::ORTCopyToCpu(uint8_t *data) const; +template void Tensor::ORTCopyToCpu(int8_t *data) const; +template void Tensor::ORTCopyToCpu(float16 *data) const; +#endif + } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc index ee82da139d8..bd9de252a09 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.cc +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -25,11 +25,7 @@ #include #include "paddle/fluid//platform/device/gpu/gpu_types.h" -#include "paddle/fluid/framework/feed_fetch_method.h" -#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/var_type_traits.h" -#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/inference/analysis/helper.h" 
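[Illustrative sketch, not part of the patch above.] The ORTCopyFromCpu/ORTCopyToCpu changes earlier in this patch work through ONNX Runtime's IoBinding: an input is wrapped as an Ort::Value over the caller's existing buffer and bound by name, the output is bound to an allocator, and after the run the bound outputs are read from GetOutputValues(). A minimal standalone version of that flow on CPU, with hypothetical names, is:

#include <onnxruntime_cxx_api.h>
#include <vector>

// Sketch: zero-copy style inference via IoBinding. The input tensor aliases
// input_data; ONNX Runtime allocates the bound output itself.
std::vector<Ort::Value> RunWithBinding(Ort::Session& session,
                                       const char* input_name,
                                       const char* output_name,
                                       float* input_data,
                                       const std::vector<int64_t>& shape,
                                       size_t numel) {
  Ort::MemoryInfo cpu_info =
      Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
  Ort::Value input = Ort::Value::CreateTensor<float>(
      cpu_info, input_data, numel, shape.data(), shape.size());
  Ort::IoBinding binding(session);
  binding.BindInput(input_name, input);
  binding.BindOutput(output_name, cpu_info);
  session.Run(Ort::RunOptions{nullptr}, binding);
  return binding.GetOutputValues();
}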
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" @@ -45,24 +41,23 @@ namespace paddle { -framework::proto::VarType::Type ConvertONNXType( - ONNXTensorElementDataType type) { +paddle_infer::DataType ConvertONNXType(ONNXTensorElementDataType type) { switch (type) { case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: - return framework::proto::VarType::FP32; - // case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: - // return DataType::FP16; + return paddle_infer::DataType::FLOAT32; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: + return paddle_infer::DataType::FLOAT16; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: - return framework::proto::VarType::INT8; + return paddle_infer::DataType::INT8; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: - return framework::proto::VarType::INT32; + return paddle_infer::DataType::INT32; case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: - return framework::proto::VarType::INT64; + return paddle_infer::DataType::INT64; case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: - return framework::proto::VarType::UINT8; + return paddle_infer::DataType::UINT8; default: LOG(ERROR) << "unsupported ONNX Tensor Type: " << static_cast(type); - return framework::proto::VarType::FP32; + return paddle_infer::DataType::FLOAT32; } } @@ -87,13 +82,12 @@ bool ONNXRuntimePredictor::Init() { VLOG(3) << "ONNXRuntime Predictor::init()"; // Now ONNXRuntime only suuport CPU + const char *device_name = config_.use_gpu() ? "Cuda" : "Cpu"; if (config_.use_gpu()) { place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); } else { place_ = paddle::platform::CPUPlace(); } - scope_.reset(new paddle::framework::Scope()); - sub_scope_ = &scope_->NewScope(); std::string onnx_proto; paddle2onnx::Export(config_.prog_file(), config_.params_file(), &onnx_proto, @@ -125,13 +119,12 @@ bool ONNXRuntimePredictor::Init() { "generated."; } session_ = {env_, onnx_proto.data(), onnx_proto.size(), session_options}; + binding_ = std::make_shared(session_); - auto memory_info = - Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); Ort::Allocator allocator(session_, memory_info); - framework::proto::VarType::Type proto_type = - framework::proto::VarType::LOD_TENSOR; size_t n_inputs = session_.GetInputCount(); for (size_t i = 0; i < n_inputs; ++i) { auto input_name = session_.GetInputName(i, allocator); @@ -141,8 +134,6 @@ bool ONNXRuntimePredictor::Init() { ONNXTensorElementDataType data_type = type_info.GetTensorTypeAndShapeInfo().GetElementType(); input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type}); - auto *ptr = scope_->Var(input_name); - framework::InitializeVariable(ptr, proto_type); allocator.Free(input_name); } @@ -155,11 +146,13 @@ bool ONNXRuntimePredictor::Init() { ONNXTensorElementDataType data_type = type_info.GetTensorTypeAndShapeInfo().GetElementType(); output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type}); - auto *ptr = scope_->Var(output_name); - framework::InitializeVariable(ptr, proto_type); + + Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); + binding_->BindOutput(output_name, out_memory_info); + allocator.Free(output_name); } - return true; } @@ -216,15 +209,26 @@ std::vector ONNXRuntimePredictor::GetOutputNames() { return output_names; } +bool ONNXRuntimePredictor::FindONNXDesc(const std::string &name, + bool is_input) { + if (is_input) { + for (auto i : input_desc_) + if (i.name == name) return true; + 
} else { + for (auto i : output_desc_) + if (i.name == name) return true; + } + return false; +} + std::unique_ptr ONNXRuntimePredictor::GetInputTensor( const std::string &name) { - PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), - platform::errors::PreconditionNotMet( - "The in variable named %s is not found in the " - "scope of the ONNXPredictor.", - name)); - std::unique_ptr res( - new ZeroCopyTensor(static_cast(scope_.get()))); + PADDLE_ENFORCE_EQ(FindONNXDesc(name, true), true, + platform::errors::PreconditionNotMet( + "The in variable named %s is not found in the " + "ONNXPredictor.", + name)); + std::unique_ptr res(new ZeroCopyTensor(nullptr)); res->input_or_output_ = true; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -233,18 +237,19 @@ std::unique_ptr ONNXRuntimePredictor::GetInputTensor( auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } + res->SetOrtMark(true); + res->SetOrtBinding(binding_); return res; } std::unique_ptr ONNXRuntimePredictor::GetOutputTensor( const std::string &name) { - PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), - platform::errors::PreconditionNotMet( - "The out variable named %s is not found in the " - "scope of the ONNXPredictor.", - name)); - std::unique_ptr res( - new ZeroCopyTensor(static_cast(scope_.get()))); + PADDLE_ENFORCE_EQ(FindONNXDesc(name, false), true, + platform::errors::PreconditionNotMet( + "The out variable named %s is not found in the " + "ONNXPredictor.", + name)); + std::unique_ptr res(new ZeroCopyTensor(nullptr)); res->input_or_output_ = false; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -253,46 +258,18 @@ std::unique_ptr ONNXRuntimePredictor::GetOutputTensor( auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } + res->SetOrtMark(true); + res->SetOrtBinding(binding_); + int size = output_desc_.size(); + for (int i = 0; i < size; ++i) + if (output_desc_[i].name == name) { + res->idx_ = i; + res->dtype_ = ConvertONNXType(output_desc_[i].dtype); + break; + } return res; } -Ort::Value ONNXRuntimePredictor::GetOrtValue(const ONNXDesc &desc, - const char *device_name) { - Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, - place_.GetDeviceId(), OrtMemTypeDefault); - auto *var = scope_->FindVar(desc.name); - auto *tensor = var->GetMutable(); - size_t size = - tensor->numel() * - framework::SizeOfType(framework::TransToProtoVarType(tensor->dtype())); - std::vector shape = phi::vectorize(tensor->dims()); - return Ort::Value::CreateTensor(memory_info, - static_cast(tensor->data()), size, - shape.data(), shape.size(), desc.dtype); -} - -void ONNXRuntimePredictor::AsTensor(const Ort::Value &value, - const ONNXDesc &desc) { - auto info = value.GetTensorTypeAndShapeInfo(); - - auto *var = scope_->FindVar(desc.name); - auto *tensor = var->GetMutable(); - tensor->Resize(phi::make_ddim(info.GetShape())); - auto dtype = ConvertONNXType(info.GetElementType()); - auto *ptr = tensor->mutable_data(place_, dtype); - - if (platform::is_cpu_place(place_)) { - std::memcpy(ptr, const_cast(value.GetTensorData()), - tensor->numel() * framework::SizeOfType(dtype)); - } else { - auto src_place = place_; - auto dst_place = place_; - memory::Copy(dst_place, ptr, src_place, - const_cast(value.GetTensorData()), - tensor->numel() * framework::SizeOfType(dtype)); - } -} - bool ONNXRuntimePredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { @@ -302,31 +279,7 @@ bool ONNXRuntimePredictor::Run(const std::vector &inputs, bool 
ONNXRuntimePredictor::ZeroCopyRun() { try { - Ort::IoBinding binding(session_); - std::vector inputs; - std::vector outputs; - Ort::RunOptions options; - - inputs.reserve(input_desc_.size()); - const char *device_name = config_.use_gpu() ? "Cuda" : "Cpu"; - for (auto desc : input_desc_) { - inputs.push_back(GetOrtValue(desc, device_name)); - binding.BindInput(desc.name.c_str(), inputs.back()); - } - - // TODO(heliqi): Optimization —— move to Init() - for (auto desc : output_desc_) { - Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, - place_.GetDeviceId(), OrtMemTypeDefault); - binding.BindOutput(desc.name.c_str(), memory_info); - } - - session_.Run({}, binding); - - outputs = binding.GetOutputValues(); - for (size_t i = 0; i < output_desc_.size(); ++i) { - AsTensor(outputs[i], output_desc_[i]); - } + session_.Run({}, *(binding_.get())); } catch (const std::exception &e) { LOG(ERROR) << e.what(); return false; @@ -345,9 +298,9 @@ uint64_t ONNXRuntimePredictor::TryShrinkMemory() { } ONNXRuntimePredictor::~ONNXRuntimePredictor() { - if (sub_scope_) { - scope_->DeleteScope(sub_scope_); - } + binding_->ClearBoundInputs(); + binding_->ClearBoundOutputs(); + memory::Release(place_); } diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h index 7fb07aa97bd..d01756e4b96 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.h +++ b/paddle/fluid/inference/api/onnxruntime_predictor.h @@ -94,9 +94,8 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// \param[in] AnalysisConfig config /// explicit ONNXRuntimePredictor(const AnalysisConfig &config) - : config_(config) { + : config_(config), env_(ORT_LOGGING_LEVEL_WARNING, "onnx") { predictor_id_ = inference::GetUniqueId(); - env_ = Ort::Env(ORT_LOGGING_LEVEL_INFO, "onnx"); } /// /// \brief Destroy the ONNXRuntime Predictor object @@ -177,30 +176,17 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// std::unique_ptr Clone() override; - std::shared_ptr scope_; - private: /// - /// \brief get the Ort Value(input Tensor). - /// - /// \param[in] desc ONNXDesce(name、shape、dtype) - /// - /// \param[in] device_name "cpu" or "gpu" of device - /// - /// \return get a Ort::Value - /// - Ort::Value GetOrtValue(const ONNXDesc &desc, const char *device_name); - - /// - /// \brief Ort::Value to Paddle::ZeroCopyTensor. + /// \brief Whether to find in/out by name. 
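  /// (It scans input_desc_ when is_input is true and output_desc_ otherwise,
  ///  returning true only when an ONNXDesc with a matching name exists.)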
/// - /// \param[in] value Ort::Value(output Tensor) + /// \param[in] name input or output name /// - /// \param[in] desc a ONNXDesce(name、shape、dtype) + /// \param[in] is_input input(true) or output(false) /// - /// \return get a Ort::Value + /// \return Whether to find by name /// - void AsTensor(const Ort::Value &value, const ONNXDesc &desc); + bool FindONNXDesc(const std::string &name, bool is_input); private: AnalysisConfig config_; @@ -208,9 +194,9 @@ class ONNXRuntimePredictor : public PaddlePredictor { // ONNXRuntime Ort::Env env_; Ort::Session session_{nullptr}; + std::shared_ptr binding_; platform::Place place_; - framework::Scope *sub_scope_{nullptr}; std::vector input_desc_; std::vector output_desc_; int predictor_id_; diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 5a98d109aed..2afe2d32e2f 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -18,6 +18,11 @@ #include "paddle_infer_declare.h" // NOLINT +#ifdef PADDLE_WITH_ONNXRUNTIME +#include "onnxruntime_c_api.h" // NOLINT +#include "onnxruntime_cxx_api.h" // NOLINT +#endif + namespace paddle_infer { /// \brief Experimental. @@ -175,6 +180,23 @@ class PD_INFER_DECL Tensor { PlaceType place_; int device_; +#ifdef PADDLE_WITH_ONNXRUNTIME + bool is_ort_tensor_{false}; + std::vector shape_; + std::weak_ptr binding_; + int idx_{-1}; + + void SetOrtMark(bool is_ort_tensor); + + void SetOrtBinding(const std::shared_ptr binding); + + template + void ORTCopyFromCpu(const T* data); + + template + void ORTCopyToCpu(T* data) const; +#endif + friend class paddle_infer::contrib::TensorUtils; #if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST) friend class paddle_infer::InferApiTesterUtils; -- GitLab From 7dad9f7078bd08b07f95530617900ece166e3a6a Mon Sep 17 00:00:00 2001 From: TeFeng Chen Date: Thu, 17 Mar 2022 16:30:18 +0800 Subject: [PATCH 144/176] fix double-free bug in variables of cinn subgraph (#40609) --- .../memory_optimize_pass/share_varinfo_into_cinn_pass.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc index 1b2a62695fb..9fc6de3c8c1 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/share_varinfo_into_cinn_pass.cc @@ -73,8 +73,10 @@ static void ShareVarInfoToCinnLaunch( varinfo_maps.at(cinn_launch_op->GetScopeIdx()); // collect all MemOptVarInfos of external variables - // that would be eager deleted after the cinn_launch subgraph executed, - // and store them as attribute of the subgraph + // that were eager deleted after the cinn_launch subgraph executed, + // and we will delete them in advance among eager_deletion_ops + // inside cinn_launch subgraph, so store them as attribute of the subgraph + // to pass to the inner eager_deletion_ops. 
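  // Note: copying the shared MemOptVarInfo pointers below keeps one
  // reference-counted record per variable, and the trailing
  // SetSkipRunning(true) disables the outer eager_deletion_op, so each
  // external variable is released exactly once inside the subgraph rather
  // than twice (the double free this patch fixes).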
for (const auto& var_name : vars_to_delete) { auto it = src_varinfo_map.find(var_name); PADDLE_ENFORCE_NE(it, src_varinfo_map.end(), @@ -82,6 +84,8 @@ static void ShareVarInfoToCinnLaunch( "MemOptVarInfo of var[%s] not found", var_name)); dst_varinfo_map.emplace(var_name, it->second); } + // skip running of the followed eager_deletion_op + followed_eager_deletion_op->SetSkipRunning(true); } static void TakeVarInfoFromMainGraph( -- GitLab From 96d2f337d1e0bfb5dd18bb6c1828d0d50174a0f5 Mon Sep 17 00:00:00 2001 From: tanzhipeng <51696454+tiancaitzp@users.noreply.github.com> Date: Thu, 17 Mar 2022 17:22:35 +0800 Subject: [PATCH 145/176] modify sequence_conv_xpu op test. test=kunlun (#40347) --- .../xpu/test_sequence_conv_op_xpu.py | 374 +++++++++--------- 1 file changed, 193 insertions(+), 181 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py index 2ad79dd0cca..99992170418 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py @@ -21,6 +21,8 @@ import random import sys sys.path.append("../") from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types +from xpu.get_test_cover_info import XPUOpTestWrapper paddle.enable_static() np.set_printoptions(threshold=np.inf) @@ -73,188 +75,198 @@ def seqconv(x, return np.dot(col, filter) -class TestSeqProject(XPUOpTest): - def setUp(self): - self.init_test_case() - self.op_type = 'sequence_conv' - self.use_xpu = True - - if self.context_length == 1 \ - and self.context_start == 0 \ - and self.padding_trainable: - print("If context_start is 0 " \ - "and context_length is 1," \ - " padding_trainable should be false.") - return - - # one level, batch size - x = np.random.uniform(-6.10907e-05, 0.000104218, - [self.input_size[0], - self.input_size[1]]).astype('float32') - w = np.random.uniform(-3.17068e-05, 0.000159822, [ - self.context_length * self.input_size[1], self.output_represention - ]).astype('float32') - - begin_pad = np.max([0, -self.context_start]) - end_pad = np.max([0, self.context_start + self.context_length - 1]) - total_pad = begin_pad + end_pad - padding_data = np.random.uniform( - 0, 0, [total_pad, self.input_size[1]]).astype('float32') - self.pad_data = padding_data - self.inputs = { - 'X': (x, self.lod), - 'Filter': w, - } - self.inputs_val = ['X', 'Filter'] - self.inputs_val_no_x = ['Filter'] - self.inputs_val_no_f = ['X'] - - if total_pad != 0: - self.inputs['PaddingData'] = padding_data - self.inputs_val = ['X', 'PaddingData', 'Filter'] - self.inputs_val_no_x = ['PaddingData', 'Filter'] - self.inputs_val_no_f = ['PaddingData', 'X'] - - self.attrs = { - 'contextStart': self.context_start, - 'contextLength': self.context_length, - 'paddingTrainable': self.padding_trainable, - 'contextStride': self.context_stride - } - out = seqconv(x, self.lod, w, self.context_length, self.context_start, - self.padding_trainable, self.pad_data) - self.outputs = {'Out': out} - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad_input(self): - self.check_grad(['X'], 'Out', no_grad_set=set(self.inputs_val_no_x)) - - def test_check_grad_padding_data(self): - if self.padding_trainable: +class XPUTestSequenceConv(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'sequence_conv' + + class TestSeqProject(XPUOpTest): + 
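        # TestSeqProject builds a random LoD input, filter and (optional)
        # padding data, computes the reference result with seqconv(), and runs
        # the forward check on XPUPlace(0) plus the gradient checks; the dtype
        # now comes from self.in_type provided by the XPU test wrapper instead
        # of a hard-coded float32.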
def setUp(self): + self.init_test_case() + self.op_type = 'sequence_conv' + self.dtype = self.in_type + self.use_xpu = True + + if self.context_length == 1 \ + and self.context_start == 0 \ + and self.padding_trainable: + print("If context_start is 0 " \ + "and context_length is 1," \ + " padding_trainable should be false.") + return + + # one level, batch size + x = np.random.uniform(-6.10907e-05, 0.000104218, + [self.input_size[0], + self.input_size[1]]).astype(self.dtype) + w = np.random.uniform(-3.17068e-05, 0.000159822, [ + self.context_length * self.input_size[1], + self.output_represention + ]).astype(self.dtype) + + begin_pad = np.max([0, -self.context_start]) + end_pad = np.max([0, self.context_start + self.context_length - 1]) + total_pad = begin_pad + end_pad + padding_data = np.random.uniform( + 0, 0, [total_pad, self.input_size[1]]).astype(self.dtype) + self.pad_data = padding_data + self.inputs = { + 'X': (x, self.lod), + 'Filter': w, + } + self.inputs_val = ['X', 'Filter'] + self.inputs_val_no_x = ['Filter'] + self.inputs_val_no_f = ['X'] + + if total_pad != 0: + self.inputs['PaddingData'] = padding_data + self.inputs_val = ['X', 'PaddingData', 'Filter'] + self.inputs_val_no_x = ['PaddingData', 'Filter'] + self.inputs_val_no_f = ['PaddingData', 'X'] + + self.attrs = { + 'contextStart': self.context_start, + 'contextLength': self.context_length, + 'paddingTrainable': self.padding_trainable, + 'contextStride': self.context_stride + } + out = seqconv(x, self.lod, w, self.context_length, + self.context_start, self.padding_trainable, + self.pad_data) + self.outputs = {'Out': out} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad_input(self): + self.check_grad(['X'], 'Out', no_grad_set=set(self.inputs_val_no_x)) + + def test_check_grad_padding_data(self): + if self.padding_trainable: + self.check_grad( + ['PaddingData'], 'Out', no_grad_set=set(['X', 'Filter'])) + + def test_check_grad_Filter(self): self.check_grad( - ['PaddingData'], 'Out', no_grad_set=set(['X', 'Filter'])) - - def test_check_grad_Filter(self): - self.check_grad( - ['Filter'], 'Out', no_grad_set=set(self.inputs_val_no_f)) - - def test_check_grad_input_filter(self): - if self.padding_trainable: - self.check_grad( - ['X', 'Filter'], 'Out', no_grad_set=set(['PaddingData'])) - - def test_check_grad_padding_input(self): - if self.padding_trainable: - self.check_grad( - self.inputs_val_no_f, 'Out', no_grad_set=set(['Filter'])) - - def test_check_grad_padding_filter(self): - if self.padding_trainable: - self.check_grad(self.inputs_val_no_x, 'Out', no_grad_set=set(['X'])) - - def init_test_case(self): - self.input_row = 7 - self.input_col = 25 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, self.input_col] - offset_lod = [[0, 1, self.input_row]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size - - -class TestSeqProjectCase1(TestSeqProject): - def init_test_case(self): - self.input_row = 11 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, 50] - offset_lod = [[0, 4, 5, 8, self.input_row]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in 
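                # e.g. with input_row = 7: offset_lod [[0, 1, 7]] becomes the
                # length-based lod [[1, 6]]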
range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size - - -class TestSeqProjectCase2Len0(TestSeqProject): - def init_test_case(self): - self.input_row = 11 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, 50] - offset_lod = [[0, 0, 4, 5, 5, 8, self.input_row, self.input_row]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size - - -class TestSeqProjectCase3(TestSeqProject): - def init_test_case(self): - self.input_row = 25 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, 25] - idx = list(range(self.input_size[0])) - del idx[0] - offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() + - [self.input_size[0]]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size - - -class TestSeqProjectCase4(TestSeqProject): - def init_test_case(self): - self.input_row = 7835 - self.input_col = 128 - self.context_start = -2 - self.context_length = 5 - self.padding_trainable = False - self.context_stride = 1 - - self.input_size = [self.input_row, self.input_col] - offset_lod = [[ - 0, 1, 2, 3, 131, 241, 242, 263, 264, 265, 266, 267, 268, 387, 515, - 516, 644, 645, 772, 794, 922, 923, 924, 944, 945, 1073, 1074, 1202, - 1330, 1458, 1556, 1557, 1558, 1686, 1748, 1876, 1912, 1913, 1914, - 2032, 2066, 2194, 2308, 2309, 2347, 2475, 2476, 2477, 2478, 2606, - 2607, 2735, 2736, 2737, 2738, 2838, 2966, 2967, 2968, 2969, 3097, - 3225, 3353, 3481, 3482, 3520, 3642, 3643, 3754, 3882, 3883, 4010, - 4011, 4012, 4140, 4219, 4228, 4356, 4357, 4415, 4475, 4476, 4604, - 4605, 4606, 4694, 4695, 4808, 4936, 4961, 4962, 5004, 5132, 5260, - 5312, 5440, 5441, 5569, 5570, 5675, 5676, 5750, 5810, 5811, 5939, - 6021, 6149, 6277, 6278, 6364, 6425, 6519, 6647, 6648, 6739, 6867, - 6995, 6996, 7120, 7223, 7244, 7367, 7407, 7408, 7467, 7595, 7699, - 7827, 7835 - ]] - self.lod = [[]] - # convert from offset-based lod to length-based lod - for i in range(len(offset_lod[0]) - 1): - self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) - self.output_represention = 8 # output feature size + ['Filter'], 'Out', no_grad_set=set(self.inputs_val_no_f)) + + def test_check_grad_input_filter(self): + if self.padding_trainable: + self.check_grad( + ['X', 'Filter'], 'Out', no_grad_set=set(['PaddingData'])) + + def test_check_grad_padding_input(self): + if self.padding_trainable: + self.check_grad( + self.inputs_val_no_f, 'Out', no_grad_set=set(['Filter'])) + + def test_check_grad_padding_filter(self): + if self.padding_trainable: + self.check_grad( + self.inputs_val_no_x, 'Out', no_grad_set=set(['X'])) + + def init_test_case(self): + self.input_row = 7 + self.input_col = 25 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, self.input_col] + offset_lod = [[0, 1, self.input_row]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + 
self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + class TestSeqProjectCase1(TestSeqProject): + def init_test_case(self): + self.input_row = 11 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, 50] + offset_lod = [[0, 4, 5, 8, self.input_row]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + class TestSeqProjectCase2Len0(TestSeqProject): + def init_test_case(self): + self.input_row = 11 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, 50] + offset_lod = [[0, 0, 4, 5, 5, 8, self.input_row, self.input_row]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + class TestSeqProjectCase3(TestSeqProject): + def init_test_case(self): + self.input_row = 25 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, 25] + idx = list(range(self.input_size[0])) + del idx[0] + offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() + + [self.input_size[0]]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + class TestSeqProjectCase4(TestSeqProject): + def init_test_case(self): + self.input_row = 7835 + self.input_col = 128 + self.context_start = -2 + self.context_length = 5 + self.padding_trainable = False + self.context_stride = 1 + + self.input_size = [self.input_row, self.input_col] + offset_lod = [[ + 0, 1, 2, 3, 131, 241, 242, 263, 264, 265, 266, 267, 268, 387, + 515, 516, 644, 645, 772, 794, 922, 923, 924, 944, 945, 1073, + 1074, 1202, 1330, 1458, 1556, 1557, 1558, 1686, 1748, 1876, + 1912, 1913, 1914, 2032, 2066, 2194, 2308, 2309, 2347, 2475, + 2476, 2477, 2478, 2606, 2607, 2735, 2736, 2737, 2738, 2838, + 2966, 2967, 2968, 2969, 3097, 3225, 3353, 3481, 3482, 3520, + 3642, 3643, 3754, 3882, 3883, 4010, 4011, 4012, 4140, 4219, + 4228, 4356, 4357, 4415, 4475, 4476, 4604, 4605, 4606, 4694, + 4695, 4808, 4936, 4961, 4962, 5004, 5132, 5260, 5312, 5440, + 5441, 5569, 5570, 5675, 5676, 5750, 5810, 5811, 5939, 6021, + 6149, 6277, 6278, 6364, 6425, 6519, 6647, 6648, 6739, 6867, + 6995, 6996, 7120, 7223, 7244, 7367, 7407, 7408, 7467, 7595, + 7699, 7827, 7835 + ]] + self.lod = [[]] + # convert from offset-based lod to length-based lod + for i in range(len(offset_lod[0]) - 1): + self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i]) + self.output_represention = 8 # output feature size + + +support_types = get_xpu_op_support_types('sequence_conv') +for stype in support_types: + create_test_class(globals(), XPUTestSequenceConv, stype) class TestSeqConvApi(unittest.TestCase): -- GitLab From 942a85b9f4476e12716552078bda6a90039b42a8 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Thu, 17 Mar 2022 17:34:26 +0800 Subject: [PATCH 146/176] Modified masked_select_kernel 
and where_index with kernel primitive api(#40517) --- paddle/phi/kernels/funcs/select_impl.cu.h | 447 ++++++++++++++++++ .../phi/kernels/gpu/masked_select_kernel.cu | 74 +-- paddle/phi/kernels/gpu/where_index_kernel.cu | 161 ++----- .../kernels/primitive/compute_primitives.h | 3 +- 4 files changed, 500 insertions(+), 185 deletions(-) create mode 100644 paddle/phi/kernels/funcs/select_impl.cu.h diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h new file mode 100644 index 00000000000..3a1d9b8ea7a --- /dev/null +++ b/paddle/phi/kernels/funcs/select_impl.cu.h @@ -0,0 +1,447 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +// CUDA and HIP use same api +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +namespace kps = phi::kps; + +namespace phi { +namespace funcs { +using Mode = kps::details::ReduceMode; + +/* +* Count how many of the data being processed by the current block are true +* 1. Load data from global memory and cast from bool to int64_t +* 2. Get result of this thread according to thread reduce +* 3. Get result of this block according to block reduce +* 4. 
first block store 0 and current result +*/ +template +struct NonZeroFunctor { + HOSTDEVICE NonZeroFunctor() {} + HOSTDEVICE inline T operator()(const T in) { + if (in) { + return static_cast(1); + } else { + return static_cast(0); + } + } +}; + +template +__device__ void GetBlockCountImpl(const InT *in, + OutT *out, + int num, + int repeat) { + InT in_data[VecSize]; + OutT temp[VecSize]; + OutT result = static_cast(0.0f); + using Add = kps::AddFunctor; + using Cast = NonZeroFunctor; + int store_fix = BLOCK_ID_X + repeat * GRID_NUM_X; + + kps::Init(&in_data[0], static_cast(0.0f)); + kps::ReadData(&in_data[0], in, num); + kps::ElementwiseUnary( + &temp[0], &in_data[0], Cast()); + kps::Reduce( + &result, &temp[0], Add(), true); + kps::Reduce( + &result, &result, Add(), true); + if (store_fix == 0) { + // first block's fix_size = 0; + OutT tmp = static_cast(0.0f); + kps::WriteData(out + store_fix, &tmp, 1); + } + + // store num of this block + kps::WriteData(out + store_fix + 1, &result, 1); +} + +// Count how many data is not zero in current block +template +__global__ void GetBlockCountKernel(const InT *in, + OutT *out, + int64_t numel, + int64_t main_offset) { + int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + int repeat = 0; + for (; data_offset < main_offset; data_offset += stride) { + GetBlockCountImpl( + in + data_offset, out, BLOCK_NUM_X * VecSize, repeat); + repeat++; // to get the real blockIdx + } + + int num = numel - data_offset; + if (num > 0) { + GetBlockCountImpl( + in + data_offset, out, num, repeat); + } +} + +/* +* Get block num prefix us one block, VecSize must be 2 +* 1. Each thread load 2 data : threadIdx.x and threadIdx.x + blockDimx.x +* 2. Cumsum limitation is blockDim.x must be less than 512 +*/ + +template +__device__ void CumsumImpl( + const InT *in, OutT *out, OutT *pre_cumsum, int num, Functor func) { + __shared__ OutT max_thread_data; + OutT temp[VecSize]; + InT arg[VecSize]; + OutT result[VecSize]; + // init data_pr + kps::Init(&arg[0], static_cast(0.0f)); + // set pre_cumsum + kps::Init(&temp[0], *pre_cumsum); + // load data to arg + kps::ReadData( + &arg[0], in, num, 1, BLOCK_NUM_X, 1); + // block cumsum + kps::Cumsum(&result[0], &arg[0], func); + // result = cumsum_result + pre_cumsum + kps::ElementwiseBinary( + &result[0], &result[0], &temp[0], func); + // get the last prefix sum + if ((THREAD_ID_X == BLOCK_NUM_X - 1) && !IsBoundary) { + max_thread_data = result[VecSize - 1]; + } + __syncthreads(); + // update pre_cumsum + *pre_cumsum = max_thread_data; + kps::WriteData( + out, &result[0], num, 1, BLOCK_NUM_X, 1); +} + +// Compute this store_offset of this block +template +__global__ void CumsumOneBlock( + const InT *in, OutT *out, int numel, int main_offset, Functor func) { + int stride = BLOCK_NUM_X * VecSize; + int offset = 0; + OutT pre_cumsum = static_cast(0); + for (; offset < main_offset; offset += stride) { + CumsumImpl( + in + offset, out + offset, &pre_cumsum, BLOCK_NUM_X * VecSize, func); + } + + int num = numel - offset; + if (num > 0) { + CumsumImpl( + in + offset, out + offset, &pre_cumsum, num, func); + } +} + +template +struct SelectCaller { + __device__ void inline operator()(OutT *store_data, + const MT *mask_data, + const InT *in, + Functor func, + int num, + int data_offset) { + // where_index op + IdT index_reg[VecSize]; + // Set data index of global + kps::InitWithDataIndex(&index_reg[0], data_offset); + // Get store data according to mask_idt + kps::OperatorTernary( + 
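        // (where_index path: `func` expands the flat index of every lane
        //  whose mask is non-zero into per-dimension coordinates in
        //  store_data)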
store_data, mask_data, &index_reg[0], func, VecSize); + } +}; + +template +struct SelectCaller { // masked_select + __device__ void inline operator()(OutT *store_data, + const MT *mask_data, + const InT *in, + Functor func, + int num, + int data_offset) { + InT in_data[VecSize]; + kps::ReadData(&in_data[0], in, num); + // Get store data according to mask_idt + kps::OperatorTernary( + store_data, mask_data, &in_data[0], func, VecSize); + } +}; + +/** +* Get mask's index if mask == true +*/ +template // SelectType = 1 Mask_select else where_index +__device__ void +SelectKernelImpl(OutT *out, + const MT *mask, + const InT *in, + Functor func, + int num, + int data_offset, + int store_rank) { + const int kCVecSize = 2; + // each thread cumsum 2 data + using IdT = int64_t; + // Set index data type + using Add = kps::AddFunctor; // for cumsum + using Cast = NonZeroFunctor; // for mask + + IdT init_idx = static_cast(0.0f); + MT init_mask = static_cast(0.0f); + + IdT num_thread[kCVecSize]; + IdT cumsum_thread[kCVecSize]; + + OutT store_data[VecSize * phi::DDim::kMaxRank]; + MT mask_data[VecSize]; + IdT mask_idt[VecSize]; + // init data_pr + kps::Init(&cumsum_thread[0], init_idx); + kps::Init(&num_thread[0], init_idx); + kps::Init(&mask_data[0], init_mask); + // Load mask + kps::ReadData(&mask_data[0], mask, num); + // Cast from MT to int + kps::ElementwiseUnary( + &mask_idt[0], &mask_data[0], Cast()); + // Get the num of thread only num_thread[1] has data + kps::Reduce( + &num_thread[0], &mask_idt[0], Add(), true); + // Get cumsum_thread cumsum from 0 to num_thread cumsum_thread[0] is the + // thread_fix + kps::Cumsum(&cumsum_thread[0], &num_thread[0], Add()); + // Get store data(index) according to mask_idt + SelectCaller + compute; + compute(&store_data[0], &mask_data[0], in, func, num, data_offset); + // get thread_fix + int thread_fix = + (static_cast(cumsum_thread[0] - num_thread[0]) * store_rank); + // get how many data need to store + int store_num = static_cast(num_thread[0]) * store_rank; + // thread store num data, each thread may has different num + kps::details::WriteData(out + thread_fix, &store_data[0], store_num); +} + +template +__global__ void SelectKernel(OutT *out, + const MT *mask, + const InT *in, + CT *cumsum, + Functor func, + const int64_t numel, + int64_t main_offset, + int store_rank) { + int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + int repeat = 0; + int size = VecSize * BLOCK_ID_X; + for (; data_offset < main_offset; data_offset += stride) { + // Cumsum index + int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X; + // niuliling todo: us ReadData API + int block_store_offset = cumsum[idx_cumsum]; + SelectKernelImpl( + out + block_store_offset * store_rank, + mask + data_offset, + in + data_offset, + func, + size, + data_offset, + store_rank); + repeat++; + } + + int num = numel - data_offset; + if (num > 0) { + // Cumsum index + int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X; + // niuliling todo: us ReadData API + int block_store_offset = static_cast(cumsum[idx_cumsum]); + SelectKernelImpl( + out + block_store_offset * store_rank, + mask + data_offset, + in + data_offset, + func, + num, + data_offset, + store_rank); + } +} + +inline int64_t Floor(int64_t in, int64_t div) { return in / div * div; } + +// SelectData = 1 then masked_select; SelectData = 0 then where_index +template +void SelectKernel(const KPDevice &dev_ctx, + const DenseTensor &condition, + const DenseTensor &in_data, + DenseTensor *out, + 
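                  // Host-side pipeline (the numbered 1.x/2.x/3.x steps in the
                  // body): GetBlockCountKernel counts the true lanes per block,
                  // CumsumOneBlock turns those counts into per-block store
                  // offsets, and the device-side SelectKernel writes the
                  // selected values (SelectData = 1, masked_select) or
                  // coordinates (SelectData = 0, where_index) at those offsets.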
Functor func) { + const MT *cond_data = condition.data(); + const int64_t numel = condition.numel(); + auto dims = condition.dims(); + int rank = SelectData ? 1 : dims.size(); + const InT *in_data_ptr = SelectData ? in_data.data() : nullptr; + // calculate the inclusive prefix sum of "true_num_array" + // to get the index of "out" tensor, + // and the total number of cond_data[i]==true. + // Example: + // condition: F T T F F F T T + // before: 0 1 1 0 0 0 1 1 + // after: 0 1 2 2 2 2 3 4 + // out: 1 2 6 7 + // alloc for cpu + using CT = int64_t; // set Count_data Type + const int t_size = sizeof(CT); + + const paddle::platform::CUDAPlace &cuda_place = dev_ctx.GetPlace(); + paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace(); + + // 1.1 get stored data num of per block + int total_true_num = 0; // init + const int kVecSize = 4; +#ifdef PADDLE_WITH_XPU_KP + int block = 64; + auto stream = dev_ctx.x_context()->xpu_stream; + const int num_per_block = kVecSize * block; + const int need_grids = (numel + num_per_block - 1) / num_per_block; + const int grid = std::min(need_grids, 8); +#else + const int block = 256; + const int num_per_block = kVecSize * block; + const int need_grids = (numel + num_per_block - 1) / num_per_block; + const int grid = std::min(need_grids, 256); + auto stream = dev_ctx.stream(); +#endif + const int64_t main_offset = Floor(numel, num_per_block); + // 1.2 alloc tmp data for CoutBlock + const int size_count_block = need_grids + 1; + std::vector dims_vec = {size_count_block * 2}; + ScalarArray dims_array(dims_vec); + DenseTensor count_mem = phi::Empty(dev_ctx, dims_array); + CT *count_data = count_mem.data(); + // 1.3 launch CountKernl + GetBlockCountKernel<<>>( + cond_data, count_data, numel, main_offset); + // 2.1 alloc cumsum data for CoutBlock prefix + DenseTensor cumsum_mem = phi::Empty(dev_ctx, dims_array); + CT *cumsum_data = cumsum_mem.data(); + // 2.2 get prefix of count_data for real out_index + const int kCumVesize = 2; + const int block_c = 256; + const int main_offset_c = Floor(size_count_block, (kCumVesize * block_c)); + using Add = kps::AddFunctor; + CumsumOneBlock<<<1, block_c, 0, stream>>>( + count_data, cumsum_data, size_count_block, main_offset_c, Add()); + // 3.1 set temp ptr for in; + // 3.1 alloc for out + // 3.1.1 get true_num for gpu place the last cumsum is the true_num + paddle::memory::Copy(cpu_place, + &total_true_num, + cuda_place, + cumsum_data + need_grids, + t_size, + dev_ctx.stream()); + + dev_ctx.Wait(); + // 3.1.2 allock for out with total_true_num + std::vector out_dim = {static_cast(total_true_num)}; + if (SelectData == 0) { // where_index + out_dim.push_back(rank); + } + out->Resize(phi::make_ddim(out_dim)); + auto out_data = out->mutable_data(cuda_place); + // 3.2 get true data's index according to cond_data and cumsum_data + if (total_true_num <= 0) return; + SelectKernel<<>>(out_data, + cond_data, + in_data_ptr, + cumsum_data, + func, + numel, + main_offset, + rank); +} + +} // namespace funcs +} // namespace phi + +#endif diff --git a/paddle/phi/kernels/gpu/masked_select_kernel.cu b/paddle/phi/kernels/gpu/masked_select_kernel.cu index fc4adca2f42..b443ae6b8fb 100644 --- a/paddle/phi/kernels/gpu/masked_select_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_select_kernel.cu @@ -19,34 +19,27 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/select_impl.cu.h" #include "paddle/phi/kernels/masked_select_kernel.h" namespace phi { -__global__ 
void SetMaskArray(const bool* mask, int32_t* mask_array, int size) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < size; idx += blockDim.x * gridDim.x) { - if (mask[idx]) - mask_array[idx] = 1; - else - mask_array[idx] = 0; - } -} +template +struct MaskedSelectFunctor { + HOSTDEVICE MaskedSelectFunctor() {} -template -__global__ void SelectWithPrefixMask(const int32_t* mask_prefix_sum, - const bool* mask, - const T* input, - T* out, - int size) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < size; idx += blockDim.x * gridDim.x) { - if (mask[idx]) { - int index = mask_prefix_sum[idx]; - out[index] = input[idx]; + HOSTDEVICE inline void operator()(OutT* out, + const MT* mask, + const InT* value, + int num) { + int store_fix = 0; + for (int idx = 0; idx < num; idx++) { + if (mask[idx]) { + out[store_fix++] = value[idx]; + } } } -} +}; template void MaskedSelectKernel(const Context& dev_ctx, @@ -68,42 +61,9 @@ void MaskedSelectKernel(const Context& dev_ctx, "value.", input_dim, mask_dim)); - - thrust::device_ptr mask_dev_ptr = - thrust::device_pointer_cast(mask_data); - thrust::device_vector mask_vec(mask_dev_ptr, mask_dev_ptr + mask_size); - auto out_size = thrust::count(mask_vec.begin(), mask_vec.end(), true); - - DDim out_dim{out_size}; - out->Resize(out_dim); - auto out_data = out->mutable_data(dev_ctx.GetPlace()); - - DenseTensor mask_array; - DenseTensor mask_prefix_sum; - mask_array.Resize(mask_dim); - mask_prefix_sum.Resize(mask_dim); - - int32_t* mask_array_data = - mask_array.mutable_data(dev_ctx.GetPlace()); - int32_t* mask_prefix_sum_data = - mask_prefix_sum.mutable_data(dev_ctx.GetPlace()); - int threads = 512; - int grid = (mask_size + threads - 1) / threads; - auto stream = dev_ctx.stream(); - SetMaskArray<<>>( - mask_data, mask_array_data, mask_size); - - thrust::device_ptr mask_array_dev_ptr = - thrust::device_pointer_cast(mask_array_data); - thrust::device_vector mask_array_vec(mask_array_dev_ptr, - mask_array_dev_ptr + mask_size); - thrust::exclusive_scan(thrust::device, - mask_array_vec.begin(), - mask_array_vec.end(), - mask_prefix_sum_data); - - SelectWithPrefixMask<<>>( - mask_prefix_sum_data, mask_data, input_data, out_data, mask_size); + using Functor = MaskedSelectFunctor; + phi::funcs::SelectKernel( + dev_ctx, mask, x, out, Functor()); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/where_index_kernel.cu b/paddle/phi/kernels/gpu/where_index_kernel.cu index 535cb812a20..9538533f70d 100644 --- a/paddle/phi/kernels/gpu/where_index_kernel.cu +++ b/paddle/phi/kernels/gpu/where_index_kernel.cu @@ -20,150 +20,59 @@ namespace cub = hipcub; #endif -#include "paddle/phi/kernels/where_index_kernel.h" - -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/select_impl.cu.h" +#include "paddle/phi/kernels/where_index_kernel.h" namespace phi { - -template -__global__ void GetTrueNum(const T *cond_data, - const int64_t numel, - int64_t *true_num_array) { - const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - - for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) { - true_num_array[idx] = - static_cast(static_cast(cond_data[idx])); +template +struct IndexFunctor { + T2 stride[phi::DDim::kMaxRank]; + int dims; + explicit IndexFunctor(const phi::DDim &in_dims) { + dims = in_dims.size(); + std::vector 
strides_in_tmp; + strides_in_tmp.resize(dims, 1); + // get strides according to in_dims + for (T2 i = 1; i < dims; i++) { + strides_in_tmp[i] = strides_in_tmp[i - 1] * in_dims[dims - i]; + } + memcpy(stride, strides_in_tmp.data(), dims * sizeof(T2)); } -} - -template -__global__ void SetTrueIndex(int64_t *out_ptr, - const T *cond_data, - const int64_t numel, - const int64_t *stride_array, - const int64_t rank, - const int64_t *true_num_array) { - const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; - for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) { - // true_num_array is calculated by cub::InclusiveSum, - // cause the first element of true_num_array is 1, - // so we need substract 1 to get true index. - const int64_t true_index = true_num_array[idx] - 1; - if (static_cast(cond_data[idx])) { - int64_t rank_index = idx; - for (int j = 0; j < rank; j++) { - const int64_t out_index = rank_index / stride_array[j]; - out_ptr[true_index * rank + j] = out_index; - rank_index -= out_index * stride_array[j]; + HOSTDEVICE inline void operator()(OutT *out, + const T1 *mask, + const T2 *index, + const int num) { + int store_fix = 0; + for (int idx = 0; idx < num; idx++) { + if (mask[idx]) { + T2 data_index = index[idx]; + // get index + for (int rank_id = dims - 1; rank_id >= 0; --rank_id) { + out[store_fix] = static_cast(data_index / stride[rank_id]); + data_index = data_index % stride[rank_id]; + store_fix++; + } } } } -} +}; template void WhereIndexKernel(const Context &dev_ctx, const DenseTensor &condition, DenseTensor *out) { - const T *cond_data = condition.data(); - const int64_t numel = condition.numel(); + DenseTensor in_data; auto dims = condition.dims(); - const int rank = dims.size(); - - auto d_array_mem = - paddle::memory::Alloc(dev_ctx, (numel + rank) * sizeof(int64_t)); - auto h_array_mem = - paddle::memory::Alloc(phi::CPUPlace(), (rank + 1) * sizeof(int64_t)); - - // "stride_array" is an array and len(stride_array)==rank, - // each element is the stride of each dimension -- the length from i to i+1. - int64_t *h_stride_array = reinterpret_cast(h_array_mem->ptr()); - int64_t *d_stride_array = reinterpret_cast(d_array_mem->ptr()); - - // "true_num_array" is an array and len(stride_array)==numel, - // at the beginning, - // "true_num_array" will set 1 if condition[i] == true else 0, - // then it will be calculated by cub::InclusiveSum, - // so that we can get the true number before i as the out index - int64_t *d_true_num_array = d_stride_array + rank; - - // the total_true_num is the total number of condition[i] == true - int64_t *h_total_true_num = h_stride_array + rank; - - // alloce cub memory - size_t cub_size = 0; - cub::DeviceScan::InclusiveSum(nullptr, - cub_size, - d_true_num_array, - d_true_num_array, - numel, - dev_ctx.stream()); - auto cub_mem = paddle::memory::Alloc(dev_ctx, cub_size * sizeof(int64_t)); - void *cub_data = cub_mem->ptr(); - - // set d_true_num_array[i]=1 if cond_data[i]==true else 0 - const int threads = std::min(numel, static_cast(128)); - const int64_t need_grids = (numel + threads - 1) / threads; - const int grids = std::min(need_grids, static_cast(256)); - GetTrueNum<<>>( - cond_data, numel, d_true_num_array); - - // calculate the inclusive prefix sum of "true_num_array" - // to get the index of "out" tensor, - // and the total number of cond_data[i]==true. 
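  // (editor's note: this prefix-sum bookkeeping, together with the example
  //  below, now lives in phi::funcs::SelectKernel in select_impl.cu.h)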
- // Example: - // condition: F T T F F F T T - // before: 0 1 1 0 0 0 1 1 - // after: 0 1 2 2 2 2 3 4 - // out: 1 2 6 7 - cub::DeviceScan::InclusiveSum(cub_data, - cub_size, - d_true_num_array, - d_true_num_array, - numel, - dev_ctx.stream()); - - // calculate each dimension's stride - h_stride_array[rank - 1] = 1; - for (int i = rank - 2; i >= 0; i--) { - h_stride_array[i] = h_stride_array[i + 1] * dims[i + 1]; - } - paddle::memory::Copy(dev_ctx.GetPlace(), - d_stride_array, - phi::CPUPlace(), - h_stride_array, - rank * sizeof(int64_t), - dev_ctx.stream()); - - // get total ture number and set output size - // the last element of cub::InclusiveSum is the total number - paddle::memory::Copy(phi::CPUPlace(), - h_total_true_num, - dev_ctx.GetPlace(), - d_true_num_array + numel - 1, - sizeof(int64_t), - dev_ctx.stream()); - dev_ctx.Wait(); - - int64_t true_num = *h_total_true_num; - out->Resize(phi::make_ddim({static_cast(true_num), rank})); - auto *out_data = dev_ctx.template Alloc(out); - - if (true_num == 0) { - return; - } - - // using true_num_array and stride_array to calculate the output index - SetTrueIndex<<>>( - out_data, cond_data, numel, d_stride_array, rank, d_true_num_array); + using Functor = IndexFunctor; + Functor index_functor = Functor(dims); + phi::funcs::SelectKernel( + dev_ctx, condition, in_data, out, index_functor); } - } // namespace phi PD_REGISTER_KERNEL(where_index, diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index 632ad00f6d0..e02f4450a8b 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h +++ b/paddle/phi/kernels/primitive/compute_primitives.h @@ -22,7 +22,6 @@ #endif #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -// #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" namespace phi { @@ -591,7 +590,7 @@ __device__ __forceinline__ void Cumsum(OutT* out, int index = (tidx + 1) * 2 * stride - 1; if (index < (blockDim.x * 2)) { temp[index + index / 32] = - compute(temp[index + index / 2], + compute(temp[index + index / 32], temp[index - stride + (index - stride) / 32]); } } -- GitLab From 313bff6ba83843dde4a5a90f5af22e05ee2a523c Mon Sep 17 00:00:00 2001 From: Chang Xu Date: Thu, 17 Mar 2022 19:04:17 +0800 Subject: [PATCH 147/176] Revert "Fix truncated norm operator (#40287)" (#40614) This reverts commit 0c3335433525d4f156ee7afc475274df75a34736. 
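[Reviewer note, not part of the commit message] Functionally, the revert
restores the original inverse-CDF sampler: draw u ~ U(eps, 1), map it to
p = Phi(-2) + (Phi(2) - Phi(-2)) * u (Phi being the standard normal CDF), and
return mean + std * sqrt(2) * erfinv(2 * p - 1), i.e. a normal truncated to
[mean - 2 * std, mean + 2 * std]. The change being reverted (#40287) had
standardized the cut points as a = (-2 - mean) / std and b = (2 - mean) / std
and sampled the uniform value directly on (2 * Phi(a) - 1, 2 * Phi(b) - 1),
which truncates in data space to the fixed interval [-2, 2] instead.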
--- .../ps/table/depends/initializers.h | 11 +++----- .../operators/truncated_gaussian_random_op.h | 17 ++++++++++-- .../truncated_gaussian_random_op_npu.cc | 9 ++----- .../truncated_gaussian_random_op_xpu.cc | 9 ++----- .../cpu/truncated_gaussian_random_kernel.cc | 9 ++----- .../gpu/truncated_gaussian_random_kernel.cu | 27 +++++++------------ .../truncated_gaussian_random_kernel.h | 14 ++++++++-- 7 files changed, 47 insertions(+), 49 deletions(-) diff --git a/paddle/fluid/distributed/ps/table/depends/initializers.h b/paddle/fluid/distributed/ps/table/depends/initializers.h index 5ac0c08f97d..f46e659a88b 100644 --- a/paddle/fluid/distributed/ps/table/depends/initializers.h +++ b/paddle/fluid/distributed/ps/table/depends/initializers.h @@ -23,6 +23,7 @@ #include "gflags/gflags.h" #include "paddle/fluid/framework/generator.h" + #include "paddle/fluid/operators/truncated_gaussian_random_op.h" namespace paddle { @@ -117,13 +118,9 @@ class TruncatedGaussianInitializer : public Initializer { seed_ = static_cast(std::stoi(attrs[1])); mean_ = std::stof(attrs[2]); std_ = std::stof(attrs[3]); - auto normal_cdf = [](float x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - float a_normal_cdf = normal_cdf((-2.0 - mean_) / std_); - float b_normal_cdf = normal_cdf((2.0 - mean_) / std_); - std::uniform_real_distribution dist_(2.0 * a_normal_cdf - 1.0, - 2.0 * b_normal_cdf - 1.0); + + std::uniform_real_distribution dist_( + std::numeric_limits::min(), 1.0); random_engine_ = framework::GetCPURandomEngine(seed_); } diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.h b/paddle/fluid/operators/truncated_gaussian_random_op.h index 8af6e281424..a6ff2f686cb 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.h +++ b/paddle/fluid/operators/truncated_gaussian_random_op.h @@ -1,8 +1,11 @@ /* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -137,9 +140,19 @@ T Erfinv(T x) { template struct TruncatedNormal { T mean, std; - TruncatedNormal(T mean, T std) : mean(mean), std(std) {} + T a_normal_cdf; + T b_normal_cdf; + TruncatedNormal(T mean, T std) : mean(mean), std(std) { + auto normal_cdf = [](T x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + a_normal_cdf = normal_cdf(-2.0); + b_normal_cdf = normal_cdf(2.0); + } + T operator()(T value) const { - return std::sqrt(2.0) * Erfinv(value) * std + mean; + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean; } }; diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc index 4ed0dd22ec0..261d9cee2d5 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -84,13 +84,8 @@ class NPUTruncatedGaussianRandomKernel : public framework::OpKernel { Tensor cpu_tensor(tensor->dtype()); cpu_tensor.Resize(tensor->dims()); T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); - auto normal_cdf = [](float x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - float a_normal_cdf = normal_cdf((-2.0 - mean) / std); - float b_normal_cdf = normal_cdf((2.0 - mean) / std); - std::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, - 2.0 * b_normal_cdf - 1.0); + std::uniform_real_distribution dist(std::numeric_limits::min(), + 1.0); TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc index 984d9f397cc..803b61fbe81 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc @@ -32,13 +32,8 @@ class XPUTruncatedGaussianRandomKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); - auto normal_cdf = [](float x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - float a_normal_cdf = normal_cdf((-2.0 - mean) / std); - float b_normal_cdf = normal_cdf((2.0 - mean) / std); - std::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, - 2.0 * b_normal_cdf - 1.0); + std::uniform_real_distribution dist(std::numeric_limits::min(), + 1.0); TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); diff --git a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc index ab3d3c2376b..4247e597ace 100644 --- a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc +++ b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc @@ -37,13 +37,8 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx, T* data = dev_ctx.template Alloc(tensor); - auto normal_cdf = [](float x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - float a_normal_cdf = normal_cdf((-2.0 - mean) / std); - float b_normal_cdf = normal_cdf((2.0 - mean) / std); - std::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, - 2.0 * b_normal_cdf - 1.0); + std::uniform_real_distribution dist(std::numeric_limits::min(), + 1.0); TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); diff --git a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu index bb04e7ee851..f27b32ca7b8 100644 --- 
a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu @@ -33,27 +33,23 @@ struct GPUTruncatedNormal { T mean, std; T a_normal_cdf; T b_normal_cdf; - unsigned int seed; T numeric_min; __host__ __device__ GPUTruncatedNormal(T mean, T std, T numeric_min, int seed) : mean(mean), std(std), seed(seed), numeric_min(numeric_min) { - auto normal_cdf = [](float x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - a_normal_cdf = normal_cdf((-2.0 - mean) / std); - b_normal_cdf = normal_cdf((2.0 - mean) / std); + a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; + b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; } __host__ __device__ T operator()(const unsigned int n) const { thrust::minstd_rand rng; rng.seed(seed); - thrust::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, - 2.0 * b_normal_cdf - 1.0); + thrust::uniform_real_distribution dist(numeric_min, 1); rng.discard(n); T value = dist(rng); - return std::sqrt(2.0) * erfinvf(value) * std + mean; + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean; } }; @@ -73,21 +69,18 @@ struct TruncatedNormalOffset { seed(seed), numeric_min(numeric_min), offset_(offset) { - auto normal_cdf = [](float x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - a_normal_cdf = normal_cdf((-2.0 - mean) / std); - b_normal_cdf = normal_cdf((2.0 - mean) / std); + a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; + b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; } __host__ __device__ T operator()(const unsigned int n) const { thrust::minstd_rand rng; rng.seed(seed); - thrust::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, - 2.0 * b_normal_cdf - 1.0); + thrust::uniform_real_distribution dist(numeric_min, 1); rng.discard(n + offset_); T value = dist(rng); - return std::sqrt(2.0) * erfinvf(value) * std + mean; + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean; } }; diff --git a/paddle/phi/kernels/truncated_gaussian_random_kernel.h b/paddle/phi/kernels/truncated_gaussian_random_kernel.h index c4c13578a98..f8547ced419 100644 --- a/paddle/phi/kernels/truncated_gaussian_random_kernel.h +++ b/paddle/phi/kernels/truncated_gaussian_random_kernel.h @@ -141,9 +141,19 @@ T Erfinv(T x) { template struct TruncatedNormal { T mean, std; - TruncatedNormal(T mean, T std) : mean(mean), std(std) {} + T a_normal_cdf; + T b_normal_cdf; + TruncatedNormal(T mean, T std) : mean(mean), std(std) { + auto normal_cdf = [](T x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + a_normal_cdf = normal_cdf(-2.0); + b_normal_cdf = normal_cdf(2.0); + } + T operator()(T value) const { - return std::sqrt(2.0) * Erfinv(value) * std + mean; + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean; } }; -- GitLab From 317761996a940bf2cd5abf2f4e9456903ea52bb2 Mon Sep 17 00:00:00 2001 From: seemingwang Date: Thu, 17 Mar 2022 19:13:55 +0800 Subject: [PATCH 148/176] merge cpu and gpu graph engines (#40597) * extract sub-graph * graph-engine merging * fix * fix * fix heter-ps config --- paddle/fluid/distributed/ps.proto | 23 + .../ps/service/graph_brpc_client.cc | 52 +-- .../ps/service/graph_brpc_client.h | 16 +- .../ps/service/graph_brpc_server.cc | 48 +- .../ps/service/ps_service/graph_py_service.cc | 29 +- .../ps/service/ps_service/graph_py_service.h | 48 +- 
.../fluid/distributed/ps/table/CMakeLists.txt | 1 - .../ps/table/common_graph_table.cc | 438 +++++++++++++++--- .../distributed/ps/table/common_graph_table.h | 203 ++++++-- .../distributed/ps/table/graph/class_macro.h | 39 ++ .../distributed/ps/table/graph/graph_edge.cc | 4 +- .../distributed/ps/table/graph/graph_edge.h | 9 +- .../distributed/ps/table/graph/graph_node.h | 2 + paddle/fluid/distributed/ps/table/table.cc | 2 + paddle/fluid/distributed/test/CMakeLists.txt | 3 + .../distributed/test/graph_node_split_test.cc | 8 +- .../fluid/distributed/test/graph_node_test.cc | 60 +-- .../test/graph_table_sample_test.cc | 148 ++++++ .../framework/fleet/heter_ps/CMakeLists.txt | 3 +- .../framework/fleet/heter_ps/gpu_graph_node.h | 120 +++++ .../fleet/heter_ps/graph_gpu_ps_table.h | 117 +---- .../fleet/heter_ps/graph_gpu_ps_table_inl.h | 31 +- .../framework/fleet/heter_ps/heter_comm_inl.h | 1 + .../fleet/heter_ps/test_cpu_graph_sample.cu | 108 +++++ paddle/fluid/pybind/fleet_py.cc | 4 +- 25 files changed, 1199 insertions(+), 318 deletions(-) create mode 100644 paddle/fluid/distributed/ps/table/graph/class_macro.h create mode 100644 paddle/fluid/distributed/test/graph_table_sample_test.cc create mode 100644 paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h create mode 100644 paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu diff --git a/paddle/fluid/distributed/ps.proto b/paddle/fluid/distributed/ps.proto index 0ae87812bce..fac30e26c38 100644 --- a/paddle/fluid/distributed/ps.proto +++ b/paddle/fluid/distributed/ps.proto @@ -115,6 +115,7 @@ message TableParameter { optional CommonAccessorParameter common = 6; optional TableType type = 7; optional bool compress_in_save = 8 [ default = false ]; + optional GraphParameter graph_parameter = 9; } message TableAccessorParameter { @@ -211,3 +212,25 @@ message SparseAdamSGDParameter { // SparseAdamSGDRule optional double ada_epsilon = 5 [ default = 1e-08 ]; repeated float weight_bounds = 6; } + +message GraphParameter { + optional int32 task_pool_size = 1 [ default = 24 ]; + optional bool gpups_mode = 2 [ default = false ]; + optional string gpups_graph_sample_class = 3 + [ default = "CompleteGraphSampler" ]; + optional string gpups_graph_sample_args = 4 [ default = "" ]; + optional bool use_cache = 5 [ default = true ]; + optional float cache_ratio = 6 [ default = 0.3 ]; + optional int32 cache_ttl = 7 [ default = 5 ]; + optional GraphFeature graph_feature = 8; + optional string table_name = 9 [ default = "" ]; + optional string table_type = 10 [ default = "" ]; + optional int32 gpups_mode_shard_num = 11 [ default = 127 ]; + optional int32 gpu_num = 12 [ default = 1 ]; +} + +message GraphFeature { + repeated string name = 1; + repeated string dtype = 2; + repeated int32 shape = 3; +} \ No newline at end of file diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc index 301708f6b7b..a3db88e3b67 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.cc @@ -44,7 +44,7 @@ void GraphPsService_Stub::service( } } -int GraphBrpcClient::get_server_index_by_id(uint64_t id) { +int GraphBrpcClient::get_server_index_by_id(int64_t id) { int shard_num = get_shard_num(); int shard_per_server = shard_num % server_size == 0 ? 
shard_num / server_size @@ -53,7 +53,7 @@ int GraphBrpcClient::get_server_index_by_id(uint64_t id) { } std::future GraphBrpcClient::get_node_feat( - const uint32_t &table_id, const std::vector &node_ids, + const uint32_t &table_id, const std::vector &node_ids, const std::vector &feature_names, std::vector> &res) { std::vector request2server; @@ -66,7 +66,7 @@ std::future GraphBrpcClient::get_node_feat( } } size_t request_call_num = request2server.size(); - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { int server_index = get_server_index_by_id(node_ids[query_idx]); @@ -129,7 +129,7 @@ std::future GraphBrpcClient::get_node_feat( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); std::string joint_feature_name = paddle::string::join_strings(feature_names, '\t'); closure->request(request_idx) @@ -179,9 +179,9 @@ std::future GraphBrpcClient::clear_nodes(uint32_t table_id) { return fut; } std::future GraphBrpcClient::add_graph_node( - uint32_t table_id, std::vector &node_id_list, + uint32_t table_id, std::vector &node_id_list, std::vector &is_weighted_list) { - std::vector> request_bucket; + std::vector> request_bucket; std::vector> is_weighted_bucket; bool add_weight = is_weighted_list.size() > 0; std::vector server_index_arr; @@ -191,7 +191,7 @@ std::future GraphBrpcClient::add_graph_node( if (index_mapping[server_index] == -1) { index_mapping[server_index] = request_bucket.size(); server_index_arr.push_back(server_index); - request_bucket.push_back(std::vector()); + request_bucket.push_back(std::vector()); if (add_weight) is_weighted_bucket.push_back(std::vector()); } request_bucket[index_mapping[server_index]].push_back( @@ -229,7 +229,7 @@ std::future GraphBrpcClient::add_graph_node( size_t node_num = request_bucket[request_idx].size(); closure->request(request_idx) ->add_params((char *)request_bucket[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); if (add_weight) { bool weighted[is_weighted_bucket[request_idx].size() + 1]; for (size_t j = 0; j < is_weighted_bucket[request_idx].size(); j++) @@ -248,8 +248,8 @@ std::future GraphBrpcClient::add_graph_node( return fut; } std::future GraphBrpcClient::remove_graph_node( - uint32_t table_id, std::vector &node_id_list) { - std::vector> request_bucket; + uint32_t table_id, std::vector &node_id_list) { + std::vector> request_bucket; std::vector server_index_arr; std::vector index_mapping(server_size, -1); for (size_t query_idx = 0; query_idx < node_id_list.size(); ++query_idx) { @@ -257,7 +257,7 @@ std::future GraphBrpcClient::remove_graph_node( if (index_mapping[server_index] == -1) { index_mapping[server_index] = request_bucket.size(); server_index_arr.push_back(server_index); - request_bucket.push_back(std::vector()); + request_bucket.push_back(std::vector()); } request_bucket[index_mapping[server_index]].push_back( node_id_list[query_idx]); @@ -291,7 +291,7 @@ std::future GraphBrpcClient::remove_graph_node( closure->request(request_idx) ->add_params((char *)request_bucket[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); // PsService_Stub rpc_stub(get_cmd_channel(server_index)); GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index)); @@ -303,9 +303,9 @@ std::future 
GraphBrpcClient::remove_graph_node( } // char* &buffer,int &actual_size std::future GraphBrpcClient::batch_sample_neighbors( - uint32_t table_id, std::vector node_ids, int sample_size, - // std::vector>> &res, - std::vector> &res, + uint32_t table_id, std::vector node_ids, int sample_size, + // std::vector>> &res, + std::vector> &res, std::vector> &res_weight, bool need_weight, int server_index) { if (server_index != -1) { @@ -337,7 +337,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( int start = 0; while (start < actual_size) { res[node_idx].emplace_back( - *(uint64_t *)(node_buffer + offset + start)); + *(int64_t *)(node_buffer + offset + start)); start += GraphNode::id_size; if (need_weight) { res_weight[node_idx].emplace_back( @@ -358,7 +358,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( closure->request(0)->set_table_id(table_id); closure->request(0)->set_client_id(_client_id); closure->request(0)->add_params((char *)node_ids.data(), - sizeof(uint64_t) * node_ids.size()); + sizeof(int64_t) * node_ids.size()); closure->request(0)->add_params((char *)&sample_size, sizeof(int)); closure->request(0)->add_params((char *)&need_weight, sizeof(bool)); ; @@ -380,14 +380,14 @@ std::future GraphBrpcClient::batch_sample_neighbors( server2request[server_index] = request2server.size(); request2server.push_back(server_index); } - // res.push_back(std::vector>()); + // res.push_back(std::vector>()); res.push_back({}); if (need_weight) { res_weight.push_back({}); } } size_t request_call_num = request2server.size(); - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { int server_index = get_server_index_by_id(node_ids[query_idx]); @@ -428,7 +428,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( int start = 0; while (start < actual_size) { res[query_idx].emplace_back( - *(uint64_t *)(node_buffer + offset + start)); + *(int64_t *)(node_buffer + offset + start)); start += GraphNode::id_size; if (need_weight) { res_weight[query_idx].emplace_back( @@ -459,7 +459,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); closure->request(request_idx) ->add_params((char *)&sample_size, sizeof(int)); closure->request(request_idx) @@ -476,7 +476,7 @@ std::future GraphBrpcClient::batch_sample_neighbors( } std::future GraphBrpcClient::random_sample_nodes( uint32_t table_id, int server_index, int sample_size, - std::vector &ids) { + std::vector &ids) { DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { int ret = 0; auto *closure = (DownpourBrpcClosure *)done; @@ -490,7 +490,7 @@ std::future GraphBrpcClient::random_sample_nodes( auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); int index = 0; while (index < bytes_size) { - ids.push_back(*(uint64_t *)(buffer + index)); + ids.push_back(*(int64_t *)(buffer + index)); index += GraphNode::id_size; } delete[] buffer; @@ -633,7 +633,7 @@ std::future GraphBrpcClient::pull_graph_list( } std::future GraphBrpcClient::set_node_feat( - const uint32_t &table_id, const std::vector &node_ids, + const uint32_t &table_id, const std::vector &node_ids, const std::vector &feature_names, const std::vector> &features) { std::vector request2server; @@ -646,7 +646,7 @@ std::future 
GraphBrpcClient::set_node_feat( } } size_t request_call_num = request2server.size(); - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); std::vector>> features_idx_buckets( request_call_num); @@ -696,7 +696,7 @@ std::future GraphBrpcClient::set_node_feat( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); std::string joint_feature_name = paddle::string::join_strings(feature_names, '\t'); closure->request(request_idx) diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_client.h b/paddle/fluid/distributed/ps/service/graph_brpc_client.h index 06e753d028b..e2b8a518615 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/ps/service/graph_brpc_client.h @@ -63,8 +63,8 @@ class GraphBrpcClient : public BrpcPsClient { virtual ~GraphBrpcClient() {} // given a batch of nodes, sample graph_neighbors for each of them virtual std::future batch_sample_neighbors( - uint32_t table_id, std::vector node_ids, int sample_size, - std::vector>& res, + uint32_t table_id, std::vector node_ids, int sample_size, + std::vector>& res, std::vector>& res_weight, bool need_weight, int server_index = -1); @@ -75,20 +75,20 @@ class GraphBrpcClient : public BrpcPsClient { virtual std::future random_sample_nodes(uint32_t table_id, int server_index, int sample_size, - std::vector& ids); + std::vector& ids); virtual std::future get_node_feat( - const uint32_t& table_id, const std::vector& node_ids, + const uint32_t& table_id, const std::vector& node_ids, const std::vector& feature_names, std::vector>& res); virtual std::future set_node_feat( - const uint32_t& table_id, const std::vector& node_ids, + const uint32_t& table_id, const std::vector& node_ids, const std::vector& feature_names, const std::vector>& features); virtual std::future clear_nodes(uint32_t table_id); virtual std::future add_graph_node( - uint32_t table_id, std::vector& node_id_list, + uint32_t table_id, std::vector& node_id_list, std::vector& is_weighted_list); virtual std::future use_neighbors_sample_cache(uint32_t table_id, size_t size_limit, @@ -96,11 +96,11 @@ class GraphBrpcClient : public BrpcPsClient { virtual std::future load_graph_split_config(uint32_t table_id, std::string path); virtual std::future remove_graph_node( - uint32_t table_id, std::vector& node_id_list); + uint32_t table_id, std::vector& node_id_list); virtual int32_t initialize(); int get_shard_num() { return shard_num; } void set_shard_num(int shard_num) { this->shard_num = shard_num; } - int get_server_index_by_id(uint64_t id); + int get_server_index_by_id(int64_t id); void set_local_channel(int index) { this->local_channel = get_cmd_channel(index); } diff --git a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc index 441f489fb30..20a55e4d119 100644 --- a/paddle/fluid/distributed/ps/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/ps/service/graph_brpc_server.cc @@ -140,9 +140,9 @@ int32_t GraphBrpcService::add_graph_node(Table *table, return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + 
std::vector node_ids(node_data, node_data + node_num); std::vector is_weighted_list; if (request.params_size() == 2) { size_t weight_list_size = request.params(1).size() / sizeof(bool); @@ -165,9 +165,9 @@ int32_t GraphBrpcService::remove_graph_node(Table *table, "graph_get_node_feat request requires at least 1 argument"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); ((GraphTable *)table)->remove_graph_node(node_ids); return 0; @@ -386,9 +386,9 @@ int32_t GraphBrpcService::graph_random_sample_neighbors( "graph_random_sample_neighbors request requires at least 3 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - int sample_size = *(uint64_t *)(request.params(1).c_str()); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + int sample_size = *(int64_t *)(request.params(1).c_str()); bool need_weight = *(bool *)(request.params(2).c_str()); std::vector> buffers(node_num); std::vector actual_sizes(node_num, 0); @@ -407,7 +407,7 @@ int32_t GraphBrpcService::graph_random_sample_neighbors( int32_t GraphBrpcService::graph_random_sample_nodes( Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl) { - size_t size = *(uint64_t *)(request.params(0).c_str()); + size_t size = *(int64_t *)(request.params(0).c_str()); std::unique_ptr buffer; int actual_size; if (((GraphTable *)table)->random_sample_nodes(size, buffer, actual_size) == @@ -430,9 +430,9 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table, "graph_get_node_feat request requires at least 2 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); std::vector feature_names = paddle::string::split_string(request.params(1), "\t"); @@ -464,16 +464,16 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( "at least 3 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t), + size_t node_num = request.params(0).size() / sizeof(int64_t), size_of_size_t = sizeof(size_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - int sample_size = *(uint64_t *)(request.params(1).c_str()); - bool need_weight = *(uint64_t *)(request.params(2).c_str()); - // std::vector res = ((GraphTable + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + int sample_size = *(int64_t *)(request.params(1).c_str()); + bool need_weight = *(int64_t *)(request.params(2).c_str()); + // std::vector res = ((GraphTable // *)table).filter_out_non_exist_nodes(node_data, sample_size); std::vector request2server; std::vector server2request(server_size, -1); - std::vector local_id; + std::vector local_id; std::vector local_query_idx; size_t rank = get_rank(); for (int query_idx = 0; query_idx < node_num; ++query_idx) { @@ 
-496,7 +496,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( std::vector> local_buffers; std::vector local_actual_sizes; std::vector seq; - std::vector> node_id_buckets(request_call_num); + std::vector> node_id_buckets(request_call_num); std::vector> query_idx_buckets(request_call_num); for (int query_idx = 0; query_idx < node_num; ++query_idx) { int server_index = @@ -583,7 +583,7 @@ int32_t GraphBrpcService::sample_neighbors_across_multi_servers( closure->request(request_idx) ->add_params((char *)node_id_buckets[request_idx].data(), - sizeof(uint64_t) * node_num); + sizeof(int64_t) * node_num); closure->request(request_idx) ->add_params((char *)&sample_size, sizeof(int)); closure->request(request_idx) @@ -618,9 +618,9 @@ int32_t GraphBrpcService::graph_set_node_feat(Table *table, "graph_set_node_feat request requires at least 3 arguments"); return 0; } - size_t node_num = request.params(0).size() / sizeof(uint64_t); - uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); - std::vector node_ids(node_data, node_data + node_num); + size_t node_num = request.params(0).size() / sizeof(int64_t); + int64_t *node_data = (int64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); std::vector feature_names = paddle::string::split_string(request.params(1), "\t"); diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc index 088edcb75bb..c8be0f79710 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.cc @@ -44,9 +44,9 @@ void GraphPyService::add_table_feat_conf(std::string table_name, } } -void add_graph_node(std::vector node_ids, +void add_graph_node(std::vector node_ids, std::vector weight_list) {} -void remove_graph_node(std::vector node_ids) {} +void remove_graph_node(std::vector node_ids) {} void GraphPyService::set_up(std::string ips_str, int shard_num, std::vector node_types, std::vector edge_types) { @@ -260,7 +260,7 @@ void GraphPyClient::clear_nodes(std::string name) { } void GraphPyClient::add_graph_node(std::string name, - std::vector& node_ids, + std::vector& node_ids, std::vector& weight_list) { if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; @@ -271,7 +271,7 @@ void GraphPyClient::add_graph_node(std::string name, } void GraphPyClient::remove_graph_node(std::string name, - std::vector& node_ids) { + std::vector& node_ids) { if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; auto status = get_ps_client()->remove_graph_node(table_id, node_ids); @@ -290,13 +290,12 @@ void GraphPyClient::load_node_file(std::string name, std::string filepath) { } } -std::pair>, std::vector> +std::pair>, std::vector> GraphPyClient::batch_sample_neighbors(std::string name, - std::vector node_ids, + std::vector node_ids, int sample_size, bool return_weight, bool return_edges) { - // std::vector>> v; - std::vector> v; + std::vector> v; std::vector> v1; if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; @@ -309,7 +308,7 @@ GraphPyClient::batch_sample_neighbors(std::string name, // res.first[1]: slice index // res.first[2]: src nodes // res.second: edges weight - std::pair>, std::vector> res; + std::pair>, std::vector> res; res.first.push_back({}); res.first.push_back({}); if (return_edges) res.first.push_back({}); @@ -342,10 +341,10 @@ void 
GraphPyClient::use_neighbors_sample_cache(std::string name, status.wait(); } } -std::vector GraphPyClient::random_sample_nodes(std::string name, - int server_index, - int sample_size) { - std::vector v; +std::vector GraphPyClient::random_sample_nodes(std::string name, + int server_index, + int sample_size) { + std::vector v; if (this->table_id_map.count(name)) { uint32_t table_id = this->table_id_map[name]; auto status = @@ -357,7 +356,7 @@ std::vector GraphPyClient::random_sample_nodes(std::string name, // (name, dtype, ndarray) std::vector> GraphPyClient::get_node_feat( - std::string node_type, std::vector node_ids, + std::string node_type, std::vector node_ids, std::vector feature_names) { std::vector> v( feature_names.size(), std::vector(node_ids.size())); @@ -371,7 +370,7 @@ std::vector> GraphPyClient::get_node_feat( } void GraphPyClient::set_node_feat( - std::string node_type, std::vector node_ids, + std::string node_type, std::vector node_ids, std::vector feature_names, const std::vector> features) { if (this->table_id_map.count(node_type)) { diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h index c25ef503545..85707137c18 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h @@ -70,18 +70,34 @@ class GraphPyService { ::paddle::distributed::TableAccessorParameter* accessor_proto = sparse_table_proto->mutable_accessor(); - ::paddle::distributed::CommonAccessorParameter* common_proto = - sparse_table_proto->mutable_common(); + // ::paddle::distributed::CommonAccessorParameter* common_proto = + // sparse_table_proto->mutable_common(); + ::paddle::distributed::GraphParameter* graph_proto = + sparse_table_proto->mutable_graph_parameter(); + + ::paddle::distributed::GraphFeature* graph_feature = + graph_proto->mutable_graph_feature(); + + graph_proto->set_task_pool_size(24); + + graph_proto->set_table_name(table_name); + graph_proto->set_table_type(table_type); + graph_proto->set_use_cache(false); // Set GraphTable Parameter - common_proto->set_table_name(table_name); - common_proto->set_name(table_type); + // common_proto->set_table_name(table_name); + // common_proto->set_name(table_type); + // for (size_t i = 0; i < feat_name.size(); i++) { + // common_proto->add_params(feat_dtype[i]); + // common_proto->add_dims(feat_shape[i]); + // common_proto->add_attributes(feat_name[i]); + // } + for (size_t i = 0; i < feat_name.size(); i++) { - common_proto->add_params(feat_dtype[i]); - common_proto->add_dims(feat_shape[i]); - common_proto->add_attributes(feat_name[i]); + graph_feature->add_dtype(feat_dtype[i]); + graph_feature->add_shape(feat_shape[i]); + graph_feature->add_name(feat_name[i]); } - accessor_proto->set_accessor_class("CommMergeAccessor"); } @@ -143,24 +159,24 @@ class GraphPyClient : public GraphPyService { void load_edge_file(std::string name, std::string filepath, bool reverse); void load_node_file(std::string name, std::string filepath); void clear_nodes(std::string name); - void add_graph_node(std::string name, std::vector& node_ids, + void add_graph_node(std::string name, std::vector& node_ids, std::vector& weight_list); - void remove_graph_node(std::string name, std::vector& node_ids); + void remove_graph_node(std::string name, std::vector& node_ids); int get_client_id() { return client_id; } void set_client_id(int client_id) { this->client_id = client_id; } void start_client(); - 
std::pair>, std::vector> - batch_sample_neighbors(std::string name, std::vector node_ids, + std::pair>, std::vector> + batch_sample_neighbors(std::string name, std::vector node_ids, int sample_size, bool return_weight, bool return_edges); - std::vector random_sample_nodes(std::string name, int server_index, - int sample_size); + std::vector random_sample_nodes(std::string name, int server_index, + int sample_size); std::vector> get_node_feat( - std::string node_type, std::vector node_ids, + std::string node_type, std::vector node_ids, std::vector feature_names); void use_neighbors_sample_cache(std::string name, size_t total_size_limit, size_t ttl); - void set_node_feat(std::string node_type, std::vector node_ids, + void set_node_feat(std::string node_type, std::vector node_ids, std::vector feature_names, const std::vector> features); std::vector pull_graph_list(std::string name, int server_index, diff --git a/paddle/fluid/distributed/ps/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt index be916bf2e80..2fa5ecb4051 100644 --- a/paddle/fluid/distributed/ps/table/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/table/CMakeLists.txt @@ -53,7 +53,6 @@ cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_pro set_source_files_properties(memory_sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(memory_sparse_geo_table SRCS memory_sparse_geo_table.cc DEPS ps_framework_proto ${TABLE_DEPS} common_table) - cc_library(table SRCS table.cc DEPS memory_sparse_table memory_sparse_geo_table common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost) target_link_libraries(table -fopenmp) diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.cc b/paddle/fluid/distributed/ps/table/common_graph_table.cc index 54b98cb96ce..2c07bd65d63 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.cc +++ b/paddle/fluid/distributed/ps/table/common_graph_table.cc @@ -27,6 +27,288 @@ namespace paddle { namespace distributed { +#ifdef PADDLE_WITH_HETERPS + +int CompleteGraphSampler::run_graph_sampling() { + pthread_rwlock_t *rw_lock = graph_table->rw_lock.get(); + pthread_rwlock_rdlock(rw_lock); + std::cout << "in graph sampling" << std::endl; + sample_nodes.clear(); + sample_neighbors.clear(); + sample_res.clear(); + sample_nodes.resize(gpu_num); + sample_neighbors.resize(gpu_num); + sample_res.resize(gpu_num); + std::vector>> + sample_nodes_ex(graph_table->task_pool_size_); + std::vector>> sample_neighbors_ex( + graph_table->task_pool_size_); + for (int i = 0; i < graph_table->task_pool_size_; i++) { + sample_nodes_ex[i].resize(gpu_num); + sample_neighbors_ex[i].resize(gpu_num); + } + std::vector> tasks; + for (size_t i = 0; i < graph_table->shards.size(); ++i) { + tasks.push_back( + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) return 0; + paddle::framework::GpuPsGraphNode node; + std::vector &v = + this->graph_table->shards[i]->get_bucket(); + size_t ind = i % this->graph_table->task_pool_size_; + for (size_t j = 0; j < v.size(); j++) { + size_t location = v[j]->get_id() % this->gpu_num; + node.node_id = v[j]->get_id(); + node.neighbor_size = v[j]->get_neighbor_size(); + node.neighbor_offset = + (int)sample_neighbors_ex[ind][location].size(); + sample_nodes_ex[ind][location].emplace_back(node); + for (int k = 0; k < node.neighbor_size; k++) + 
sample_neighbors_ex[ind][location].push_back( + v[j]->get_neighbor_id(k)); + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + tasks.clear(); + for (size_t i = 0; i < gpu_num; i++) { + tasks.push_back( + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) return 0; + int total_offset = 0; + size_t ind = i % this->graph_table->task_pool_size_; + for (int j = 0; j < this->graph_table->task_pool_size_; j++) { + for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) { + sample_nodes[ind].push_back(sample_nodes_ex[j][ind][k]); + sample_nodes[ind].back().neighbor_offset += total_offset; + } + size_t neighbor_size = sample_neighbors_ex[j][ind].size(); + total_offset += neighbor_size; + for (size_t k = 0; k < neighbor_size; k++) { + sample_neighbors[ind].push_back( + sample_neighbors_ex[j][ind][k]); + } + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + for (size_t i = 0; i < gpu_num; i++) { + sample_res[i].node_list = sample_nodes[i].data(); + sample_res[i].neighbor_list = sample_neighbors[i].data(); + sample_res[i].node_size = sample_nodes[i].size(); + sample_res[i].neighbor_size = sample_neighbors[i].size(); + } + pthread_rwlock_unlock(rw_lock); + if (this->status == GraphSamplerStatus::terminating) { + return 0; + } + callback(sample_res); + return 0; +} +void CompleteGraphSampler::init(size_t gpu_num, GraphTable *graph_table, + std::vector args) { + this->gpu_num = gpu_num; + this->graph_table = graph_table; +} + +int BasicBfsGraphSampler::run_graph_sampling() { + pthread_rwlock_t *rw_lock = graph_table->rw_lock.get(); + pthread_rwlock_rdlock(rw_lock); + while (rounds > 0 && status == GraphSamplerStatus::running) { + for (size_t i = 0; i < sample_neighbors_map.size(); i++) { + sample_neighbors_map[i].clear(); + } + sample_neighbors_map.clear(); + std::vector nodes_left(graph_table->shards.size(), + node_num_for_each_shard); + std::promise prom; + std::future fut = prom.get_future(); + sample_neighbors_map.resize(graph_table->task_pool_size_); + int task_size = 0; + std::vector> tasks; + int init_size = 0; + //__sync_fetch_and_add + std::function bfs = [&, this](int i, int id) -> int { + VLOG(0) << "in bfs " << i << " " << id; + if (this->status == GraphSamplerStatus::terminating) { + int task_left = __sync_sub_and_fetch(&task_size, 1); + if (task_left == 0) { + prom.set_value(0); + } + return 0; + } + size_t ind = i % this->graph_table->task_pool_size_; + if (nodes_left[i] > 0) { + nodes_left[i]--; + auto iter = sample_neighbors_map[ind].find(id); + if (iter == sample_neighbors_map[ind].end()) { + sample_neighbors_map[ind][id] = std::vector(); + iter = sample_neighbors_map[ind].find(id); + Node *node = graph_table->shards[i]->find_node(id); + if (node != NULL) { + size_t edge_fetch_size = + std::min((size_t) this->edge_num_for_each_node, + node->get_neighbor_size()); + for (size_t k = 0; k < edge_fetch_size; k++) { + int64_t neighbor_id = node->get_neighbor_id(k); + int node_location = neighbor_id % this->graph_table->shard_num % + this->graph_table->task_pool_size_; + __sync_add_and_fetch(&task_size, 1); + graph_table->_shards_task_pool[node_location]->enqueue( + bfs, neighbor_id % this->graph_table->shard_num, neighbor_id); + iter->second.push_back(neighbor_id); + } + } + } + } + int task_left = 
__sync_sub_and_fetch(&task_size, 1); + if (task_left == 0) { + prom.set_value(0); + } + return 0; + }; + for (size_t i = 0; i < graph_table->shards.size(); ++i) { + std::vector &v = graph_table->shards[i]->get_bucket(); + if (v.size() > 0) { + init_size++; + __sync_add_and_fetch(&task_size, 1); + int64_t id = v[0]->get_id(); + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue(bfs, i, id); + } // if + } + if (init_size == 0) { + prom.set_value(0); + } + fut.get(); + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + std::cout << "bfs over" << std::endl; + sample_nodes.clear(); + sample_neighbors.clear(); + sample_res.clear(); + sample_nodes.resize(gpu_num); + sample_neighbors.resize(gpu_num); + sample_res.resize(gpu_num); + std::vector>> + sample_nodes_ex(graph_table->task_pool_size_); + std::vector>> sample_neighbors_ex( + graph_table->task_pool_size_); + for (int i = 0; i < graph_table->task_pool_size_; i++) { + sample_nodes_ex[i].resize(gpu_num); + sample_neighbors_ex[i].resize(gpu_num); + } + tasks.clear(); + for (size_t i = 0; i < (size_t)graph_table->task_pool_size_; ++i) { + tasks.push_back( + graph_table->_shards_task_pool[i]->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) { + return 0; + } + paddle::framework::GpuPsGraphNode node; + auto iter = sample_neighbors_map[i].begin(); + size_t ind = i; + for (; iter != sample_neighbors_map[i].end(); iter++) { + size_t location = iter->first % this->gpu_num; + node.node_id = iter->first; + node.neighbor_size = iter->second.size(); + node.neighbor_offset = + (int)sample_neighbors_ex[ind][location].size(); + sample_nodes_ex[ind][location].emplace_back(node); + for (auto k : iter->second) + sample_neighbors_ex[ind][location].push_back(k); + } + return 0; + })); + } + + for (size_t i = 0; i < tasks.size(); i++) { + tasks[i].get(); + sample_neighbors_map[i].clear(); + } + tasks.clear(); + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + for (size_t i = 0; i < gpu_num; i++) { + tasks.push_back( + graph_table->_shards_task_pool[i % graph_table->task_pool_size_] + ->enqueue([&, i, this]() -> int { + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + int total_offset = 0; + size_t ind = i % graph_table->task_pool_size_; + for (int j = 0; j < this->graph_table->task_pool_size_; j++) { + for (size_t k = 0; k < sample_nodes_ex[j][ind].size(); k++) { + sample_nodes[i].push_back(sample_nodes_ex[j][ind][k]); + sample_nodes[i].back().neighbor_offset += total_offset; + // neighbor_offset[i].push_back(total_offset + + // neighbor_offset_ex[j][i][k]); + } + size_t neighbor_size = sample_neighbors_ex[j][ind].size(); + total_offset += neighbor_size; + for (size_t k = 0; k < neighbor_size; k++) { + sample_neighbors[ind].push_back( + sample_neighbors_ex[j][ind][k]); + } + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + if (this->status == GraphSamplerStatus::terminating) { + pthread_rwlock_unlock(rw_lock); + return 0; + } + // int64_t total_neighbors = + // std::accumulate(shard_neighbor_size.begin(),shard_neighbor_size.end(),0); + for (size_t i = 0; i < gpu_num; i++) { + sample_res[i].node_list = sample_nodes[i].data(); + sample_res[i].neighbor_list = sample_neighbors[i].data(); + sample_res[i].node_size = sample_nodes[i].size(); + sample_res[i].neighbor_size = sample_neighbors[i].size(); + 
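                  // Layout of the per-GPU result built above: it is packed
                  // CSR-style. sample_nodes[i] holds GpuPsGraphNode entries
                  // whose neighbor_offset / neighbor_size fields index into
                  // the flat sample_neighbors[i] vector; for example, a node
                  // with neighbor_offset == 3 and neighbor_size == 2 owns
                  // sample_neighbors[i][3] and sample_neighbors[i][4].
                  // sample_res[i] only borrows raw pointers into these two
                  // member vectors rather than copying them, so they must stay
                  // alive until the callback has consumed the sampled sub-graph.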
} + pthread_rwlock_unlock(rw_lock); + if (this->status == GraphSamplerStatus::terminating) { + return 0; + } + callback(sample_res); + rounds--; + if (rounds > 0) { + for (int i = 0; + i < interval && this->status == GraphSamplerStatus::running; i++) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + } + } + return 0; +} +void BasicBfsGraphSampler::init(size_t gpu_num, GraphTable *graph_table, + std::vector args) { + this->gpu_num = gpu_num; + this->graph_table = graph_table; + node_num_for_each_shard = args.size() > 0 ? std::stoi(args[0]) : 10; + edge_num_for_each_node = args.size() > 1 ? std::stoi(args[1]) : 10; + rounds = args.size() > 2 ? std::stoi(args[2]) : 1; + interval = args.size() > 3 ? std::stoi(args[3]) : 60; +} + +#endif + std::vector GraphShard::get_batch(int start, int end, int step) { if (start < 0) start = 0; std::vector res; @@ -38,10 +320,10 @@ std::vector GraphShard::get_batch(int start, int end, int step) { size_t GraphShard::get_size() { return bucket.size(); } -int32_t GraphTable::add_graph_node(std::vector &id_list, +int32_t GraphTable::add_graph_node(std::vector &id_list, std::vector &is_weight_list) { size_t node_size = id_list.size(); - std::vector>> batch(task_pool_size_); + std::vector>> batch(task_pool_size_); for (size_t i = 0; i < node_size; i++) { size_t shard_id = id_list[i] % shard_num; if (shard_id >= shard_end || shard_id < shard_start) { @@ -65,9 +347,9 @@ int32_t GraphTable::add_graph_node(std::vector &id_list, return 0; } -int32_t GraphTable::remove_graph_node(std::vector &id_list) { +int32_t GraphTable::remove_graph_node(std::vector &id_list) { size_t node_size = id_list.size(); - std::vector> batch(task_pool_size_); + std::vector> batch(task_pool_size_); for (size_t i = 0; i < node_size; i++) { size_t shard_id = id_list[i] % shard_num; if (shard_id >= shard_end || shard_id < shard_start) continue; @@ -98,7 +380,7 @@ void GraphShard::clear() { GraphShard::~GraphShard() { clear(); } -void GraphShard::delete_node(uint64_t id) { +void GraphShard::delete_node(int64_t id) { auto iter = node_location.find(id); if (iter == node_location.end()) return; int pos = iter->second; @@ -110,7 +392,7 @@ void GraphShard::delete_node(uint64_t id) { node_location.erase(id); bucket.pop_back(); } -GraphNode *GraphShard::add_graph_node(uint64_t id) { +GraphNode *GraphShard::add_graph_node(int64_t id) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); bucket.push_back(new GraphNode(id)); @@ -126,7 +408,7 @@ GraphNode *GraphShard::add_graph_node(Node *node) { } return (GraphNode *)bucket[node_location[id]]; } -FeatureNode *GraphShard::add_feature_node(uint64_t id) { +FeatureNode *GraphShard::add_feature_node(int64_t id) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); bucket.push_back(new FeatureNode(id)); @@ -134,11 +416,11 @@ FeatureNode *GraphShard::add_feature_node(uint64_t id) { return (FeatureNode *)bucket[node_location[id]]; } -void GraphShard::add_neighbor(uint64_t id, uint64_t dst_id, float weight) { +void GraphShard::add_neighbor(int64_t id, int64_t dst_id, float weight) { find_node(id)->add_edge(dst_id, weight); } -Node *GraphShard::find_node(uint64_t id) { +Node *GraphShard::find_node(int64_t id) { auto iter = node_location.find(id); return iter == node_location.end() ? 
nullptr : bucket[iter->second]; } @@ -185,14 +467,14 @@ int32_t GraphTable::load(const std::string &path, const std::string ¶m) { } int32_t GraphTable::get_nodes_ids_by_ranges( - std::vector> ranges, std::vector &res) { + std::vector> ranges, std::vector &res) { int start = 0, end, index = 0, total_size = 0; res.clear(); - std::vector>> tasks; + std::vector>> tasks; for (size_t i = 0; i < shards.size() && index < (int)ranges.size(); i++) { end = total_size + shards[i]->get_size(); start = total_size; - while (start < end && index < ranges.size()) { + while (start < end && index < (int)ranges.size()) { if (ranges[index].second <= start) index++; else if (ranges[index].first >= end) { @@ -204,7 +486,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges( first -= total_size; second -= total_size; tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( - [this, first, second, i]() -> std::vector { + [this, first, second, i]() -> std::vector { return shards[i]->get_ids_by_range(first, second); })); } @@ -276,6 +558,9 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { } int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { +#ifdef PADDLE_WITH_HETERPS + if (gpups_mode) pthread_rwlock_rdlock(rw_lock.get()); +#endif auto paths = paddle::string::split_string(path, ";"); int64_t count = 0; std::string sample_type = "random"; @@ -351,6 +636,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { /*----------------------- relocate the duplicate nodes to make them distributed evenly among threads. */ + if (!use_duplicate_nodes) { +#ifdef PADDLE_WITH_HETERPS + if (gpups_mode) pthread_rwlock_unlock(rw_lock.get()); +#endif + + return 0; + } for (auto &shard : extra_shards) { auto bucket = shard->get_bucket(); for (size_t i = 0; i < bucket.size(); i++) { @@ -360,13 +652,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { int size = extra_nodes_to_thread_index.size(); if (size == 0) return 0; std::vector index; - for (int i = 0; i < used.size(); i++) index.push_back(i); + for (int i = 0; i < (int)used.size(); i++) index.push_back(i); sort(index.begin(), index.end(), [&](int &a, int &b) { return used[a] < used[b]; }); std::vector alloc(index.size(), 0), has_alloc(index.size(), 0); int t = 1, aim = 0, mod = 0; - for (; t < used.size(); t++) { + for (; t < (int)used.size(); t++) { if ((used[index[t]] - used[index[t - 1]]) * t >= size) { break; } else { @@ -380,7 +672,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { if (t - x <= mod) alloc[index[x]]++; alloc[index[x]] -= used[index[x]]; } - std::vector vec[index.size()]; + std::vector vec[index.size()]; for (auto p : extra_nodes_to_thread_index) { has_alloc[p.second]++; vec[p.second].push_back(p.first); @@ -395,7 +687,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { has_alloc[index[right]] - alloc[index[right]]); has_alloc[index[left]] += x; has_alloc[index[right]] -= x; - uint64_t id; + int64_t id; while (x--) { id = vec[index[right]].back(); vec[index[right]].pop_back(); @@ -424,10 +716,13 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { delete extra_shards[i]; extra_shards[i] = extra_shards_copy[i]; } +#ifdef PADDLE_WITH_HETERPS + if (gpups_mode) pthread_rwlock_unlock(rw_lock.get()); +#endif return 0; } -Node *GraphTable::find_node(uint64_t id) { +Node *GraphTable::find_node(int64_t id) { size_t shard_id = id % shard_num; if (shard_id >= shard_end 
|| shard_id < shard_start) { if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0) @@ -443,7 +738,7 @@ Node *GraphTable::find_node(uint64_t id) { Node *node = shards[index]->find_node(id); return node; } -uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { +uint32_t GraphTable::get_thread_pool_index(int64_t node_id) { if (use_duplicate_nodes == false || extra_nodes_to_thread_index.size() == 0) return node_id % shard_num % shard_num_per_server % task_pool_size_; size_t src_shard_id = node_id % shard_num; @@ -456,8 +751,7 @@ uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { return src_shard_id % shard_num_per_server % task_pool_size_; } -uint32_t GraphTable::get_thread_pool_index_by_shard_index( - uint64_t shard_index) { +uint32_t GraphTable::get_thread_pool_index_by_shard_index(int64_t shard_index) { return shard_index % shard_num_per_server % task_pool_size_; } @@ -484,7 +778,7 @@ int32_t GraphTable::random_sample_nodes(int sample_size, std::unique_ptr &buffer, int &actual_size) { int total_size = 0; - for (int i = 0; i < shards.size(); i++) { + for (int i = 0; i < (int)shards.size(); i++) { total_size += shards[i]->get_size(); } if (sample_size > total_size) sample_size = total_size; @@ -537,16 +831,16 @@ int32_t GraphTable::random_sample_nodes(int sample_size, } } for (auto &pair : first_half) second_half.push_back(pair); - std::vector res; + std::vector res; get_nodes_ids_by_ranges(second_half, res); - actual_size = res.size() * sizeof(uint64_t); + actual_size = res.size() * sizeof(int64_t); buffer.reset(new char[actual_size]); char *pointer = buffer.get(); memcpy(pointer, res.data(), actual_size); return 0; } int32_t GraphTable::random_sample_neighbors( - uint64_t *node_ids, int sample_size, + int64_t *node_ids, int sample_size, std::vector> &buffers, std::vector &actual_sizes, bool need_weight) { size_t node_num = buffers.size(); @@ -560,10 +854,10 @@ int32_t GraphTable::random_sample_neighbors( seq_id[index].emplace_back(idx); id_list[index].emplace_back(node_ids[idx], sample_size, need_weight); } - for (int i = 0; i < seq_id.size(); i++) { + for (int i = 0; i < (int)seq_id.size(); i++) { if (seq_id[i].size() == 0) continue; tasks.push_back(_shards_task_pool[i]->enqueue([&, i, this]() -> int { - uint64_t node_id; + int64_t node_id; std::vector> r; LRUResponse response = LRUResponse::blocked; if (use_cache) { @@ -576,7 +870,7 @@ int32_t GraphTable::random_sample_neighbors( std::vector sample_keys; auto &rng = _shards_task_rng_pool[i]; for (size_t k = 0; k < id_list[i].size(); k++) { - if (index < r.size() && + if (index < (int)r.size() && r[index].first.node_key == id_list[i][k].node_key) { idx = seq_id[i][k]; actual_sizes[idx] = r[index].second.actual_size; @@ -597,7 +891,7 @@ int32_t GraphTable::random_sample_neighbors( res.size() * (need_weight ? 
(Node::id_size + Node::weight_size) : Node::id_size); int offset = 0; - uint64_t id; + int64_t id; float weight; char *buffer_addr = new char[actual_size]; if (response == LRUResponse::ok) { @@ -632,13 +926,13 @@ int32_t GraphTable::random_sample_neighbors( return 0; } -int32_t GraphTable::get_node_feat(const std::vector &node_ids, +int32_t GraphTable::get_node_feat(const std::vector &node_ids, const std::vector &feature_names, std::vector> &res) { size_t node_num = node_ids.size(); std::vector> tasks; for (size_t idx = 0; idx < node_num; ++idx) { - uint64_t node_id = node_ids[idx]; + int64_t node_id = node_ids[idx]; tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( [&, idx, node_id]() -> int { Node *node = find_node(node_id); @@ -646,7 +940,8 @@ int32_t GraphTable::get_node_feat(const std::vector &node_ids, if (node == nullptr) { return 0; } - for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + for (int feat_idx = 0; feat_idx < (int)feature_names.size(); + ++feat_idx) { const std::string &feature_name = feature_names[feat_idx]; if (feat_id_map.find(feature_name) != feat_id_map.end()) { // res[feat_idx][idx] = @@ -665,19 +960,20 @@ int32_t GraphTable::get_node_feat(const std::vector &node_ids, } int32_t GraphTable::set_node_feat( - const std::vector &node_ids, + const std::vector &node_ids, const std::vector &feature_names, const std::vector> &res) { size_t node_num = node_ids.size(); std::vector> tasks; for (size_t idx = 0; idx < node_num; ++idx) { - uint64_t node_id = node_ids[idx]; + int64_t node_id = node_ids[idx]; tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( [&, idx, node_id]() -> int { size_t index = node_id % this->shard_num - this->shard_start; auto node = shards[index]->add_feature_node(node_id); node->set_feature_size(this->feat_name.size()); - for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + for (int feat_idx = 0; feat_idx < (int)feature_names.size(); + ++feat_idx) { const std::string &feature_name = feature_names[feat_idx]; if (feat_id_map.find(feature_name) != feat_id_map.end()) { node->set_feature(feat_id_map[feature_name], res[feat_idx][idx]); @@ -771,35 +1067,68 @@ int32_t GraphTable::pull_graph_list(int start, int total_size, return 0; } -int32_t GraphTable::get_server_index_by_id(uint64_t id) { +int32_t GraphTable::get_server_index_by_id(int64_t id) { return id % shard_num / shard_num_per_server; } +int32_t GraphTable::initialize(const TableParameter &config, + const FsClientParameter &fs_config) { + LOG(INFO) << "in graphTable initialize"; + _config = config; + if (initialize_accessor() != 0) { + LOG(WARNING) << "Table accessor initialize failed"; + return -1; + } -int32_t GraphTable::initialize() { + if (_afs_client.initialize(fs_config) != 0) { + LOG(WARNING) << "Table fs_client initialize failed"; + // return -1; + } + auto graph = config.graph_parameter(); + shard_num = _config.shard_num(); + LOG(INFO) << "in graphTable initialize over"; + return initialize(graph); +} +int32_t GraphTable::initialize(const GraphParameter &graph) { +#ifdef PADDLE_WITH_HETERPS + if (graph.gpups_mode()) { + gpups_mode = true; + if (shard_num == 0) { + shard_num = graph.gpups_mode_shard_num(); + server_num = 1; + _shard_idx = 0; + } + auto *sampler = + CREATE_PSCORE_CLASS(GraphSampler, graph.gpups_graph_sample_class()); + auto slices = + string::split_string(graph.gpups_graph_sample_args(), ","); + std::cout << "slices" << std::endl; + for (auto x : slices) std::cout << x << std::endl; + 
sampler->init(graph.gpu_num(), this, slices); + graph_sampler.reset(sampler); + } +#endif + task_pool_size_ = graph.task_pool_size(); _shards_task_pool.resize(task_pool_size_); for (size_t i = 0; i < _shards_task_pool.size(); ++i) { _shards_task_pool[i].reset(new ::ThreadPool(1)); _shards_task_rng_pool.push_back(paddle::framework::GetCPURandomEngine(0)); } - server_num = _shard_num; - // VLOG(0) << "in init graph table server num = " << server_num; - /* - _shard_num is actually server number here - when a server initialize its tables, it sets tables' _shard_num to server_num, - and _shard_idx to server - rank - */ - auto common = _config.common(); - - this->table_name = common.table_name(); - this->table_type = common.name(); + auto graph_feature = graph.graph_feature(); + // this->table_name = common.table_name(); + // this->table_type = common.name(); + this->table_name = graph.table_name(); + this->table_type = graph.table_type(); VLOG(0) << " init graph table type " << this->table_type << " table name " << this->table_name; - int feat_conf_size = static_cast(common.attributes().size()); + // int feat_conf_size = static_cast(common.attributes().size()); + int feat_conf_size = static_cast(graph_feature.name().size()); for (int i = 0; i < feat_conf_size; i++) { - auto &f_name = common.attributes()[i]; - auto &f_shape = common.dims()[i]; - auto &f_dtype = common.params()[i]; + // auto &f_name = common.attributes()[i]; + // auto &f_shape = common.dims()[i]; + // auto &f_dtype = common.params()[i]; + auto &f_name = graph_feature.name()[i]; + auto &f_shape = graph_feature.shape()[i]; + auto &f_dtype = graph_feature.dtype()[i]; this->feat_name.push_back(f_name); this->feat_shape.push_back(f_shape); this->feat_dtype.push_back(f_dtype); @@ -807,8 +1136,6 @@ int32_t GraphTable::initialize() { VLOG(0) << "init graph table feat conf name:" << f_name << " shape:" << f_shape << " dtype:" << f_dtype; } - - shard_num = _config.shard_num(); VLOG(0) << "in init graph table shard num = " << shard_num << " shard_idx" << _shard_idx; shard_num_per_server = sparse_local_shard_num(shard_num, server_num); @@ -826,5 +1153,6 @@ int32_t GraphTable::initialize() { return 0; } + } // namespace distributed }; // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/common_graph_table.h b/paddle/fluid/distributed/ps/table/common_graph_table.h index c76a62248c8..7946569525c 100644 --- a/paddle/fluid/distributed/ps/table/common_graph_table.h +++ b/paddle/fluid/distributed/ps/table/common_graph_table.h @@ -38,10 +38,14 @@ #include #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/common_table.h" +#include "paddle/fluid/distributed/ps/table/graph/class_macro.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/core/utils/rw_lock.h" +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" +#endif namespace paddle { namespace distributed { class GraphShard { @@ -51,37 +55,37 @@ class GraphShard { ~GraphShard(); std::vector &get_bucket() { return bucket; } std::vector get_batch(int start, int end, int step); - std::vector get_ids_by_range(int start, int end) { - std::vector res; + std::vector get_ids_by_range(int start, int end) { + std::vector res; for (int i = start; i < end && i < (int)bucket.size(); i++) { res.push_back(bucket[i]->get_id()); } return res; } - GraphNode *add_graph_node(uint64_t id); + GraphNode *add_graph_node(int64_t id); 
GraphNode *add_graph_node(Node *node); - FeatureNode *add_feature_node(uint64_t id); - Node *find_node(uint64_t id); - void delete_node(uint64_t id); + FeatureNode *add_feature_node(int64_t id); + Node *find_node(int64_t id); + void delete_node(int64_t id); void clear(); - void add_neighbor(uint64_t id, uint64_t dst_id, float weight); - std::unordered_map &get_node_location() { + void add_neighbor(int64_t id, int64_t dst_id, float weight); + std::unordered_map &get_node_location() { return node_location; } private: - std::unordered_map node_location; + std::unordered_map node_location; std::vector bucket; }; enum LRUResponse { ok = 0, blocked = 1, err = 2 }; struct SampleKey { - uint64_t node_key; + int64_t node_key; size_t sample_size; bool is_weighted; - SampleKey(uint64_t _node_key, size_t _sample_size, bool _is_weighted) + SampleKey(int64_t _node_key, size_t _sample_size, bool _is_weighted) : node_key(_node_key), sample_size(_sample_size), is_weighted(_is_weighted) {} @@ -300,7 +304,7 @@ class ScaledLRU { node_size += lru_pool[i].node_size - lru_pool[i].remove_count; } - if (node_size <= size_t(1.1 * size_limit) + 1) return 0; + if ((size_t)node_size <= size_t(1.1 * size_limit) + 1) return 0; if (pthread_rwlock_wrlock(&rwlock) == 0) { // VLOG(0)<"in shrink\n"; global_count = 0; @@ -308,9 +312,9 @@ class ScaledLRU { global_count += lru_pool[i].node_size - lru_pool[i].remove_count; } // VLOG(0)<<"global_count "< size_limit) { + if ((size_t)global_count > size_limit) { size_t remove = global_count - size_limit; - for (int i = 0; i < lru_pool.size(); i++) { + for (size_t i = 0; i < lru_pool.size(); i++) { lru_pool[i].total_diff = 0; lru_pool[i].remove_count += 1.0 * (lru_pool[i].node_size - lru_pool[i].remove_count) / @@ -352,9 +356,69 @@ class ScaledLRU { friend class RandomSampleLRU; }; +#ifdef PADDLE_WITH_HETERPS +enum GraphSamplerStatus { waiting = 0, running = 1, terminating = 2 }; +class GraphTable; +class GraphSampler { + public: + GraphSampler() { + status = GraphSamplerStatus::waiting; + thread_pool.reset(new ::ThreadPool(1)); + callback = [](std::vector &res) { + return; + }; + } + virtual int run_graph_sampling() = 0; + virtual int start_graph_sampling() { + if (status != GraphSamplerStatus::waiting) { + return -1; + } + std::promise prom; + std::future fut = prom.get_future(); + graph_sample_task_over = thread_pool->enqueue([&prom, this]() { + prom.set_value(0); + status = GraphSamplerStatus::running; + return run_graph_sampling(); + }); + return fut.get(); + } + virtual void init(size_t gpu_num, GraphTable *graph_table, + std::vector args) = 0; + virtual void set_graph_sample_callback( + std::function &)> + callback) { + this->callback = callback; + } + + virtual int end_graph_sampling() { + if (status == GraphSamplerStatus::running) { + status = GraphSamplerStatus::terminating; + return graph_sample_task_over.get(); + } + return -1; + } + virtual GraphSamplerStatus get_graph_sampler_status() { return status; } + + protected: + std::function &)> + callback; + std::shared_ptr<::ThreadPool> thread_pool; + GraphSamplerStatus status; + std::future graph_sample_task_over; + std::vector sample_res; +}; +#endif + class GraphTable : public SparseTable { public: - GraphTable() { use_cache = false; } + GraphTable() { + use_cache = false; + shard_num = 0; +#ifdef PADDLE_WITH_HETERPS + gpups_mode = false; +#endif + rw_lock.reset(new pthread_rwlock_t()); + } virtual ~GraphTable(); virtual int32_t pull_graph_list(int start, int size, std::unique_ptr &buffer, @@ -362,7 +426,7 @@ class 
GraphTable : public SparseTable { int step); virtual int32_t random_sample_neighbors( - uint64_t *node_ids, int sample_size, + int64_t *node_ids, int sample_size, std::vector> &buffers, std::vector &actual_sizes, bool need_weight); @@ -370,9 +434,11 @@ class GraphTable : public SparseTable { int &actual_sizes); virtual int32_t get_nodes_ids_by_ranges( - std::vector> ranges, std::vector &res); - virtual int32_t initialize(); - + std::vector> ranges, std::vector &res); + virtual int32_t initialize() { return 0; } + virtual int32_t initialize(const TableParameter &config, + const FsClientParameter &fs_config); + virtual int32_t initialize(const GraphParameter &config); int32_t load(const std::string &path, const std::string ¶m); int32_t load_graph_split_config(const std::string &path); @@ -380,13 +446,13 @@ class GraphTable : public SparseTable { int32_t load_nodes(const std::string &path, std::string node_type); - int32_t add_graph_node(std::vector &id_list, + int32_t add_graph_node(std::vector &id_list, std::vector &is_weight_list); - int32_t remove_graph_node(std::vector &id_list); + int32_t remove_graph_node(std::vector &id_list); - int32_t get_server_index_by_id(uint64_t id); - Node *find_node(uint64_t id); + int32_t get_server_index_by_id(int64_t id); + Node *find_node(int64_t id); virtual int32_t pull_sparse(float *values, const PullSparseValue &pull_value) { @@ -407,16 +473,27 @@ class GraphTable : public SparseTable { return 0; } virtual int32_t initialize_shard() { return 0; } - virtual uint32_t get_thread_pool_index_by_shard_index(uint64_t shard_index); - virtual uint32_t get_thread_pool_index(uint64_t node_id); + virtual int32_t set_shard(size_t shard_idx, size_t server_num) { + _shard_idx = shard_idx; + /* + _shard_num is not used in graph_table, this following operation is for the + purpose of + being compatible with base class table. 
+ */ + _shard_num = server_num; + this->server_num = server_num; + return 0; + } + virtual uint32_t get_thread_pool_index_by_shard_index(int64_t shard_index); + virtual uint32_t get_thread_pool_index(int64_t node_id); virtual std::pair parse_feature(std::string feat_str); - virtual int32_t get_node_feat(const std::vector &node_ids, + virtual int32_t get_node_feat(const std::vector &node_ids, const std::vector &feature_names, std::vector> &res); virtual int32_t set_node_feat( - const std::vector &node_ids, + const std::vector &node_ids, const std::vector &feature_names, const std::vector> &res); @@ -433,11 +510,25 @@ class GraphTable : public SparseTable { } return 0; } - +#ifdef PADDLE_WITH_HETERPS + virtual int32_t start_graph_sampling() { + return this->graph_sampler->start_graph_sampling(); + } + virtual int32_t end_graph_sampling() { + return this->graph_sampler->end_graph_sampling(); + } + virtual int32_t set_graph_sample_callback( + std::function &)> + callback) { + graph_sampler->set_graph_sample_callback(callback); + return 0; + } +// virtual GraphSampler *get_graph_sampler() { return graph_sampler.get(); } +#endif protected: std::vector shards, extra_shards; size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; - const int task_pool_size_ = 24; + int task_pool_size_ = 24; const int random_sample_nodes_ranges = 3; std::vector feat_name; @@ -450,11 +541,61 @@ class GraphTable : public SparseTable { std::vector> _shards_task_pool; std::vector> _shards_task_rng_pool; std::shared_ptr> scaled_lru; - std::unordered_set extra_nodes; - std::unordered_map extra_nodes_to_thread_index; + std::unordered_set extra_nodes; + std::unordered_map extra_nodes_to_thread_index; bool use_cache, use_duplicate_nodes; mutable std::mutex mutex_; + std::shared_ptr rw_lock; +#ifdef PADDLE_WITH_HETERPS + // paddle::framework::GpuPsGraphTable gpu_graph_table; + bool gpups_mode; + // std::shared_ptr<::ThreadPool> graph_sample_pool; + std::shared_ptr graph_sampler; + REGISTER_GRAPH_FRIEND_CLASS(2, CompleteGraphSampler, BasicBfsGraphSampler) +#endif +}; + +#ifdef PADDLE_WITH_HETERPS +REGISTER_PSCORE_REGISTERER(GraphSampler); +class CompleteGraphSampler : public GraphSampler { + public: + CompleteGraphSampler() {} + ~CompleteGraphSampler() {} + // virtual pthread_rwlock_t *export_rw_lock(); + virtual int run_graph_sampling(); + virtual void init(size_t gpu_num, GraphTable *graph_table, + std::vector args_); + + protected: + GraphTable *graph_table; + std::vector> sample_nodes; + std::vector> sample_neighbors; + // std::vector sample_res; + // std::shared_ptr random; + int gpu_num; +}; + +class BasicBfsGraphSampler : public GraphSampler { + public: + BasicBfsGraphSampler() {} + ~BasicBfsGraphSampler() {} + // virtual pthread_rwlock_t *export_rw_lock(); + virtual int run_graph_sampling(); + virtual void init(size_t gpu_num, GraphTable *graph_table, + std::vector args_); + + protected: + GraphTable *graph_table; + // std::vector> sample_nodes; + std::vector> sample_nodes; + std::vector> sample_neighbors; + size_t gpu_num; + int node_num_for_each_shard, edge_num_for_each_node; + int rounds, interval; + std::vector>> + sample_neighbors_map; }; +#endif } // namespace distributed }; // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/graph/class_macro.h b/paddle/fluid/distributed/ps/table/graph/class_macro.h new file mode 100644 index 00000000000..bf59dbacb25 --- /dev/null +++ b/paddle/fluid/distributed/ps/table/graph/class_macro.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle 
Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#define DECLARE_GRAPH_FRIEND_CLASS(a) friend class a; +#define DECLARE_1_FRIEND_CLASS(a, ...) DECLARE_GRAPH_FRIEND_CLASS(a) +#define DECLARE_2_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_1_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_3_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_2_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_4_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_3_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_5_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_4_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_6_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_5_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_7_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_6_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_8_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_7_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_9_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_8_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_10_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_9_FRIEND_CLASS(__VA_ARGS__) +#define DECLARE_11_FRIEND_CLASS(a, ...) \ + DECLARE_GRAPH_FRIEND_CLASS(a) DECLARE_10_FRIEND_CLASS(__VA_ARGS__) +#define REGISTER_GRAPH_FRIEND_CLASS(n, ...) 
\ + DECLARE_##n##_FRIEND_CLASS(__VA_ARGS__) diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc index d1961b655d8..004a536e8e5 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_edge.cc +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.cc @@ -17,11 +17,11 @@ namespace paddle { namespace distributed { -void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { +void GraphEdgeBlob::add_edge(int64_t id, float weight = 1) { id_arr.push_back(id); } -void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { +void WeightedGraphEdgeBlob::add_edge(int64_t id, float weight = 1) { id_arr.push_back(id); weight_arr.push_back(weight); } diff --git a/paddle/fluid/distributed/ps/table/graph/graph_edge.h b/paddle/fluid/distributed/ps/table/graph/graph_edge.h index 3dfe5a6f357..5fc785fe256 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_edge.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_edge.h @@ -24,19 +24,20 @@ class GraphEdgeBlob { GraphEdgeBlob() {} virtual ~GraphEdgeBlob() {} size_t size() { return id_arr.size(); } - virtual void add_edge(uint64_t id, float weight); - uint64_t get_id(int idx) { return id_arr[idx]; } + virtual void add_edge(int64_t id, float weight); + int64_t get_id(int idx) { return id_arr[idx]; } virtual float get_weight(int idx) { return 1; } + std::vector& export_id_array() { return id_arr; } protected: - std::vector id_arr; + std::vector id_arr; }; class WeightedGraphEdgeBlob : public GraphEdgeBlob { public: WeightedGraphEdgeBlob() {} virtual ~WeightedGraphEdgeBlob() {} - virtual void add_edge(uint64_t id, float weight); + virtual void add_edge(int64_t id, float weight); virtual float get_weight(int idx) { return weight_arr[idx]; } protected: diff --git a/paddle/fluid/distributed/ps/table/graph/graph_node.h b/paddle/fluid/distributed/ps/table/graph/graph_node.h index b838c2c1258..c6c594036d4 100644 --- a/paddle/fluid/distributed/ps/table/graph/graph_node.h +++ b/paddle/fluid/distributed/ps/table/graph/graph_node.h @@ -48,6 +48,7 @@ class Node { virtual void set_feature(int idx, std::string str) {} virtual void set_feature_size(int size) {} virtual int get_feature_size() { return 0; } + virtual size_t get_neighbor_size() { return 0; } protected: uint64_t id; @@ -70,6 +71,7 @@ class GraphNode : public Node { } virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } + virtual size_t get_neighbor_size() { return edges->size(); } protected: Sampler *sampler; diff --git a/paddle/fluid/distributed/ps/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc index fa8169da07a..fc2ea56e95d 100644 --- a/paddle/fluid/distributed/ps/table/table.cc +++ b/paddle/fluid/distributed/ps/table/table.cc @@ -37,6 +37,8 @@ REGISTER_PSCORE_CLASS(Table, CommonDenseTable); REGISTER_PSCORE_CLASS(Table, CommonSparseTable); #ifdef PADDLE_WITH_HETERPS REGISTER_PSCORE_CLASS(Table, SSDSparseTable); +REGISTER_PSCORE_CLASS(GraphSampler, CompleteGraphSampler); +REGISTER_PSCORE_CLASS(GraphSampler, BasicBfsGraphSampler); #endif REGISTER_PSCORE_CLASS(Table, SparseGeoTable); REGISTER_PSCORE_CLASS(Table, BarrierTable); diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index 2223334ccc4..cb46c38d4de 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -24,6 +24,9 @@ cc_test(graph_node_test SRCS 
graph_node_test.cc DEPS graph_py_service scope serv set_source_files_properties(graph_node_split_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(graph_node_split_test SRCS graph_node_split_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) +set_source_files_properties(graph_table_sample_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(graph_table_sample_test SRCS graph_table_sample_test.cc DEPS scope server communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) + set_source_files_properties(feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost table) diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc index 9949dce4e93..a2f495de3c9 100644 --- a/paddle/fluid/distributed/test/graph_node_split_test.cc +++ b/paddle/fluid/distributed/test/graph_node_split_test.cc @@ -236,7 +236,7 @@ void RunGraphSplit() { sleep(2); std::map> dense_regions; dense_regions.insert( - std::pair>(0, {})); + std::pair>(0, {})); auto regions = dense_regions[0]; RunClient(dense_regions, 0, pserver_ptr_->get_service()); @@ -250,16 +250,16 @@ void RunGraphSplit() { worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); srand(time(0)); pull_status.wait(); - std::vector> _vs; + std::vector> _vs; std::vector> vs; pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 10240001024), 4, _vs, vs, true); + 0, std::vector(1, 10240001024), 4, _vs, vs, true); pull_status.wait(); ASSERT_EQ(0, _vs[0].size()); _vs.clear(); vs.clear(); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 97), 4, _vs, vs, true); + 0, std::vector(1, 97), 4, _vs, vs, true); pull_status.wait(); ASSERT_EQ(3, _vs[0].size()); std::remove(edge_file_name); diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 22c2d1e6099..565d51379d5 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -48,10 +48,10 @@ namespace distributed = paddle::distributed; void testSampleNodes( std::shared_ptr& worker_ptr_) { - std::vector ids; + std::vector ids; auto pull_status = worker_ptr_->random_sample_nodes(0, 0, 6, ids); - std::unordered_set s; - std::unordered_set s1 = {37, 59}; + std::unordered_set s; + std::unordered_set s1 = {37, 59}; pull_status.wait(); for (auto id : ids) s.insert(id); ASSERT_EQ(true, s.size() == s1.size()); @@ -106,14 +106,14 @@ void testFeatureNodeSerializeFloat64() { void testSingleSampleNeighboor( std::shared_ptr& worker_ptr_) { - std::vector> vs; + std::vector> vs; std::vector> vs1; auto pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 37), 4, vs, vs1, true); + 0, std::vector(1, 37), 4, vs, vs1, true); pull_status.wait(); - std::unordered_set s; - std::unordered_set s1 = {112, 45, 145}; + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; for (auto g : vs[0]) { s.insert(g); } @@ -126,7 +126,7 @@ void testSingleSampleNeighboor( vs.clear(); vs1.clear(); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 96), 4, vs, vs1, true); + 0, std::vector(1, 96), 4, vs, vs1, true); pull_status.wait(); s1 = {111, 48, 247}; for (auto g : vs[0]) { @@ -147,30 +147,30 @@ void testAddNode( std::shared_ptr& worker_ptr_) { 
worker_ptr_->clear_nodes(0); int total_num = 270000; - uint64_t id; - std::unordered_set id_set; + int64_t id; + std::unordered_set id_set; for (int i = 0; i < total_num; i++) { while (id_set.find(id = rand()) != id_set.end()) ; id_set.insert(id); } - std::vector id_list(id_set.begin(), id_set.end()); + std::vector id_list(id_set.begin(), id_set.end()); std::vector weight_list; auto status = worker_ptr_->add_graph_node(0, id_list, weight_list); status.wait(); - std::vector ids[2]; + std::vector ids[2]; for (int i = 0; i < 2; i++) { auto sample_status = worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]); sample_status.wait(); } - std::unordered_set id_set_check(ids[0].begin(), ids[0].end()); + std::unordered_set id_set_check(ids[0].begin(), ids[0].end()); for (auto x : ids[1]) id_set_check.insert(x); ASSERT_EQ(id_set.size(), id_set_check.size()); for (auto x : id_set) { ASSERT_EQ(id_set_check.find(x) != id_set_check.end(), true); } - std::vector remove_ids; + std::vector remove_ids; for (auto p : id_set_check) { if (remove_ids.size() == 0) remove_ids.push_back(p); @@ -187,7 +187,7 @@ void testAddNode( worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]); sample_status.wait(); } - std::unordered_set id_set_check1(ids[0].begin(), ids[0].end()); + std::unordered_set id_set_check1(ids[0].begin(), ids[0].end()); for (auto x : ids[1]) id_set_check1.insert(x); ASSERT_EQ(id_set_check1.size(), id_set_check.size()); for (auto x : id_set_check1) { @@ -196,14 +196,14 @@ void testAddNode( } void testBatchSampleNeighboor( std::shared_ptr& worker_ptr_) { - std::vector> vs; + std::vector> vs; std::vector> vs1; - std::vector v = {37, 96}; + std::vector v = {37, 96}; auto pull_status = worker_ptr_->batch_sample_neighbors(0, v, 4, vs, vs1, false); pull_status.wait(); - std::unordered_set s; - std::unordered_set s1 = {112, 45, 145}; + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; for (auto g : vs[0]) { s.insert(g); } @@ -417,7 +417,7 @@ void RunBrpcPushSparse() { std::map> dense_regions; dense_regions.insert( - std::pair>(0, {})); + std::pair>(0, {})); auto regions = dense_regions[0]; RunClient(dense_regions, 0, pserver_ptr_->get_service()); @@ -427,14 +427,14 @@ void RunBrpcPushSparse() { worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); srand(time(0)); pull_status.wait(); - std::vector> _vs; + std::vector> _vs; std::vector> vs; testSampleNodes(worker_ptr_); sleep(5); testSingleSampleNeighboor(worker_ptr_); testBatchSampleNeighboor(worker_ptr_); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 10240001024), 4, _vs, vs, true); + 0, std::vector(1, 10240001024), 4, _vs, vs, true); pull_status.wait(); ASSERT_EQ(0, _vs[0].size()); paddle::distributed::GraphTable* g = @@ -445,14 +445,14 @@ void RunBrpcPushSparse() { while (round--) { vs.clear(); pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 37), 1, _vs, vs, false); + 0, std::vector(1, 37), 1, _vs, vs, false); pull_status.wait(); for (int i = 0; i < ttl; i++) { - std::vector> vs1; + std::vector> vs1; std::vector> vs2; pull_status = worker_ptr_->batch_sample_neighbors( - 0, std::vector(1, 37), 1, vs1, vs2, false); + 0, std::vector(1, 37), 1, vs1, vs2, false); pull_status.wait(); ASSERT_EQ(_vs[0].size(), vs1[0].size()); @@ -540,7 +540,7 @@ void RunBrpcPushSparse() { // Test Pull by step - std::unordered_set count_item_nodes; + std::unordered_set count_item_nodes; // pull by step 2 for (int test_step = 1; test_step < 4; test_step++) { count_item_nodes.clear(); @@ -558,18 
+558,18 @@ void RunBrpcPushSparse() { ASSERT_EQ(count_item_nodes.size(), 12); } - std::pair>, std::vector> res; + std::pair>, std::vector> res; res = client1.batch_sample_neighbors( - std::string("user2item"), std::vector(1, 96), 4, true, false); + std::string("user2item"), std::vector(1, 96), 4, true, false); ASSERT_EQ(res.first[0].size(), 3); - std::vector node_ids; + std::vector node_ids; node_ids.push_back(96); node_ids.push_back(37); res = client1.batch_sample_neighbors(std::string("user2item"), node_ids, 4, true, false); ASSERT_EQ(res.first[1].size(), 1); - std::vector nodes_ids = client2.random_sample_nodes("user", 0, 6); + std::vector nodes_ids = client2.random_sample_nodes("user", 0, 6); ASSERT_EQ(nodes_ids.size(), 2); ASSERT_EQ(true, (nodes_ids[0] == 59 && nodes_ids[1] == 37) || (nodes_ids[0] == 37 && nodes_ids[1] == 59)); diff --git a/paddle/fluid/distributed/test/graph_table_sample_test.cc b/paddle/fluid/distributed/test/graph_table_sample_test.cc new file mode 100644 index 00000000000..65455028247 --- /dev/null +++ b/paddle/fluid/distributed/test/graph_table_sample_test.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/ps/table/common_graph_table.h" +#include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/phi/kernels/funcs/math_function.h" +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace memory = paddle::memory; +namespace distributed = paddle::distributed; + +std::vector edges = { + std::string("37\t45\t0.34"), std::string("37\t145\t0.31"), + std::string("37\t112\t0.21"), std::string("96\t48\t1.4"), + std::string("96\t247\t0.31"), std::string("96\t111\t1.21"), + std::string("59\t45\t0.34"), std::string("59\t145\t0.31"), + std::string("59\t122\t0.21"), std::string("97\t48\t0.34"), + std::string("97\t247\t0.31"), std::string("97\t111\t0.21")}; +// odd id:96 48 122 112 +char edge_file_name[] = "edges.txt"; + +std::vector nodes = { + std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"), + std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"), + std::string("user\t59\ta 0.11\tb 11 14"), + std::string("user\t97\ta 0.11\tb 12 11"), + std::string("item\t45\ta 0.21"), + 
std::string("item\t145\ta 0.21"), + std::string("item\t112\ta 0.21"), + std::string("item\t48\ta 0.21"), + std::string("item\t247\ta 0.21"), + std::string("item\t111\ta 0.21"), + std::string("item\t46\ta 0.21"), + std::string("item\t146\ta 0.21"), + std::string("item\t122\ta 0.21"), + std::string("item\t49\ta 0.21"), + std::string("item\t248\ta 0.21"), + std::string("item\t113\ta 0.21")}; +char node_file_name[] = "nodes.txt"; + +void prepare_file(char file_name[], std::vector data) { + std::ofstream ofile; + ofile.open(file_name); + for (auto x : data) { + ofile << x << std::endl; + } + + ofile.close(); +} + +void testGraphSample() { +#ifdef PADDLE_WITH_HETERPS + ::paddle::distributed::GraphParameter table_proto; + table_proto.set_gpups_mode(true); + table_proto.set_gpups_mode_shard_num(127); + table_proto.set_gpu_num(2); + + distributed::GraphTable graph_table, graph_table1; + graph_table.initialize(table_proto); + prepare_file(edge_file_name, edges); + graph_table.load(std::string(edge_file_name), std::string("e>")); + std::vector res; + std::promise prom; + std::future fut = prom.get_future(); + graph_table.set_graph_sample_callback( + [&res, &prom](std::vector &res0) { + res = res0; + prom.set_value(0); + }); + graph_table.start_graph_sampling(); + fut.get(); + graph_table.end_graph_sampling(); + ASSERT_EQ(2, res.size()); + // 37 59 97 + for (int i = 0; i < (int)res[1].node_size; i++) { + std::cout << res[1].node_list[i].node_id << std::endl; + } + ASSERT_EQ(3, res[1].node_size); + + ::paddle::distributed::GraphParameter table_proto1; + table_proto1.set_gpups_mode(true); + table_proto1.set_gpups_mode_shard_num(127); + table_proto1.set_gpu_num(2); + table_proto1.set_gpups_graph_sample_class("BasicBfsGraphSampler"); + table_proto1.set_gpups_graph_sample_args("5,5,1,1"); + graph_table1.initialize(table_proto1); + graph_table1.load(std::string(edge_file_name), std::string("e>")); + std::vector res1; + std::promise prom1; + std::future fut1 = prom1.get_future(); + graph_table1.set_graph_sample_callback( + [&res1, &prom1](std::vector &res0) { + res1 = res0; + prom1.set_value(0); + }); + graph_table1.start_graph_sampling(); + fut1.get(); + graph_table1.end_graph_sampling(); + // distributed::BasicBfsGraphSampler *sampler1 = + // (distributed::BasicBfsGraphSampler *)graph_table1.get_graph_sampler(); + // sampler1->start_graph_sampling(); + // std::this_thread::sleep_for (std::chrono::seconds(1)); + // std::vector res1;// = + // sampler1->fetch_sample_res(); + ASSERT_EQ(2, res1.size()); + // odd id:96 48 122 112 + for (int i = 0; i < (int)res1[0].node_size; i++) { + std::cout << res1[0].node_list[i].node_id << std::endl; + } + ASSERT_EQ(4, res1[0].node_size); +#endif +} + +TEST(testGraphSample, Run) { testGraphSample(); } diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 17346f5fd93..2b8b4b3ff95 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -10,8 +10,9 @@ IF(WITH_GPU) nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) - nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm) + nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm table) nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) + 
nv_test(test_cpu_graph_sample SRCS test_cpu_graph_sample.cu DEPS graph_gpu_ps) ENDIF() IF(WITH_ROCM) hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) diff --git a/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h new file mode 100644 index 00000000000..235f7a226ad --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h @@ -0,0 +1,120 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifdef PADDLE_WITH_HETERPS +namespace paddle { +namespace framework { +struct GpuPsGraphNode { + int64_t node_id; + int neighbor_size, neighbor_offset; + // this node's neighbor is stored on [neighbor_offset,neighbor_offset + + // neighbor_size) of int64_t *neighbor_list; +}; + +struct GpuPsCommGraph { + int64_t *neighbor_list; + GpuPsGraphNode *node_list; + int neighbor_size, node_size; + // the size of neighbor array and graph_node_list array + GpuPsCommGraph() + : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} + GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, + int neighbor_size_, int node_size_) + : neighbor_list(neighbor_list_), + node_list(node_list_), + neighbor_size(neighbor_size_), + node_size(node_size_) {} +}; + +/* +suppose we have a graph like this + +0----3-----5----7 + \ |\ |\ + 17 8 9 1 2 + +we save the nodes in arbitrary order, +in this example,the order is +[0,5,1,2,7,3,8,9,17] +let us name this array u_id; +we record each node's neighbors: +0:3,17 +5:3,7 +1:7 +2:7 +7:1,2,5 +3:0,5,8,9 +8:3 +9:3 +17:0 + +by concatenating each node's neighbor_list in the order we save the node id. +we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0] +this is the neighbor_list of GpuPsCommGraph +given this neighbor_list and the order to save node id, +we know, +node 0's neighbors are in the range [0,1] of neighbor_list +node 5's neighbors are in the range [2,3] of neighbor_list +node 1's neighbors are in the range [4,4] of neighbor_list +node 2:[5,5] +node 7:[6,6] +node 3:[9,12] +node 8:[13,13] +node 9:[14,14] +node 17:[15,15] +... 
+by the above information, +we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph +of size 9, +where node_list[i].id = u_id[i] +then we have: +node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0 +node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2 +node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4 +node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5 +node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6 +node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9 +node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13 +node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14 +node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 +*/ +struct NeighborSampleResult { + int64_t *val; + int *actual_sample_size, sample_size, key_size; + NeighborSampleResult(int _sample_size, int _key_size) + : sample_size(_sample_size), key_size(_key_size) { + actual_sample_size = NULL; + val = NULL; + }; + ~NeighborSampleResult() { + if (val != NULL) cudaFree(val); + if (actual_sample_size != NULL) cudaFree(actual_sample_size); + } +}; + +struct NodeQueryResult { + int64_t *val; + int actual_sample_size; + NodeQueryResult() { + val = NULL; + actual_sample_size = 0; + }; + ~NodeQueryResult() { + if (val != NULL) cudaFree(val); + } +}; +} +}; +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index a6508bf96c0..b8f9f0bfec9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -14,114 +14,25 @@ #pragma once #include "heter_comm.h" +#include "paddle/fluid/distributed/ps/table/common_graph_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" #include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { -struct GpuPsGraphNode { - int64_t node_id; - int neighbor_size, neighbor_offset; - // this node's neighbor is stored on [neighbor_offset,neighbor_offset + - // neighbor_size) of int64_t *neighbor_list; -}; - -struct GpuPsCommGraph { - int64_t *neighbor_list; - GpuPsGraphNode *node_list; - int neighbor_size, node_size; - // the size of neighbor array and graph_node_list array - GpuPsCommGraph() - : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} - GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, - int neighbor_size_, int node_size_) - : neighbor_list(neighbor_list_), - node_list(node_list_), - neighbor_size(neighbor_size_), - node_size(node_size_) {} -}; - -/* -suppose we have a graph like this -0----3-----5----7 - \ |\ |\ - 17 8 9 1 2 - -we save the nodes in arbitrary order, -in this example,the order is -[0,5,1,2,7,3,8,9,17] -let us name this array u_id; -we record each node's neighbors: -0:3,17 -5:3,7 -1:7 -2:7 -7:1,2,5 -3:0,5,8,9 -8:3 -9:3 -17:0 - -by concatenating each node's neighbor_list in the order we save the node id. -we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0] -this is the neighbor_list of GpuPsCommGraph -given this neighbor_list and the order to save node id, -we know, -node 0's neighbors are in the range [0,1] of neighbor_list -node 5's neighbors are in the range [2,3] of neighbor_list -node 1's neighbors are in the range [4,4] of neighbor_list -node 2:[5,5] -node 7:[6,6] -node 3:[9,12] -node 8:[13,13] -node 9:[14,14] -node 17:[15,15] -... 
-by the above information, -we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph -of size 9, -where node_list[i].id = u_id[i] -then we have: -node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0 -node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2 -node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4 -node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5 -node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6 -node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9 -node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13 -node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14 -node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 -*/ -struct NeighborSampleResult { - int64_t *val; - int *actual_sample_size, sample_size, key_size; - NeighborSampleResult(int _sample_size, int _key_size) - : sample_size(_sample_size), key_size(_key_size) { - actual_sample_size = NULL; - val = NULL; - }; - ~NeighborSampleResult() { - if (val != NULL) cudaFree(val); - if (actual_sample_size != NULL) cudaFree(actual_sample_size); - } -}; - -struct NodeQueryResult { - int64_t *val; - int actual_sample_size; - NodeQueryResult() { - val = NULL; - actual_sample_size = 0; - }; - ~NodeQueryResult() { - if (val != NULL) cudaFree(val); - } -}; class GpuPsGraphTable : public HeterComm { public: GpuPsGraphTable(std::shared_ptr resource) : HeterComm(1, resource) { load_factor_ = 0.25; + rw_lock.reset(new pthread_rwlock_t()); + cpu_table_status = -1; + } + ~GpuPsGraphTable() { + if (cpu_table_status != -1) { + end_graph_sampling(); + } } void build_graph_from_cpu(std::vector &cpu_node_list); NodeQueryResult *graph_node_sample(int gpu_id, int sample_size); @@ -134,9 +45,19 @@ class GpuPsGraphTable : public HeterComm { int *h_right, int64_t *src_sample_res, int *actual_sample_size); + int init_cpu_table(const paddle::distributed::GraphParameter &graph); + int load(const std::string &path, const std::string ¶m); + virtual int32_t end_graph_sampling() { + return cpu_graph_table->end_graph_sampling(); + } private: std::vector gpu_graph_list; + std::shared_ptr cpu_graph_table; + std::shared_ptr rw_lock; + mutable std::mutex mutex_; + std::condition_variable cv_; + int cpu_table_status; }; } }; diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h index 839c7e5468c..16a6857ae96 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h @@ -14,6 +14,7 @@ #pragma once #ifdef PADDLE_WITH_HETERPS +//#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" namespace paddle { namespace framework { /* @@ -45,6 +46,33 @@ __global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index, } } +int GpuPsGraphTable::init_cpu_table( + const paddle::distributed::GraphParameter& graph) { + cpu_graph_table.reset(new paddle::distributed::GraphTable); + cpu_table_status = cpu_graph_table->initialize(graph); + if (cpu_table_status != 0) return cpu_table_status; + std::function&)> callback = + [this](std::vector& res) { + pthread_rwlock_wrlock(this->rw_lock.get()); + this->clear_graph_info(); + this->build_graph_from_cpu(res); + pthread_rwlock_unlock(this->rw_lock.get()); + cv_.notify_one(); + }; + cpu_graph_table->set_graph_sample_callback(callback); + return cpu_table_status; +} + +int GpuPsGraphTable::load(const std::string& path, const std::string& param) { + int status = 
cpu_graph_table->load(path, param); + if (status != 0) { + return status; + } + std::unique_lock lock(mutex_); + cpu_graph_table->start_graph_sampling(); + cv_.wait(lock); + return 0; +} /* comment 1 @@ -68,6 +96,7 @@ __global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index, that's what fill_dvals does. */ + void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( int gpu_id, int gpu_num, int sample_size, int* h_left, int* h_right, int64_t* src_sample_res, int* actual_sample_size) { @@ -258,7 +287,7 @@ NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t)); int64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); - auto d_shard_vals = memory::Alloc(place, len * sizeof(int64_t)); + auto d_shard_vals = memory::Alloc(place, sample_size * len * sizeof(int64_t)); int64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); int* d_shard_actual_sample_size_ptr = diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 2cf702969f9..f85ed330dc8 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once #ifdef PADDLE_WITH_HETERPS +//#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" #include namespace paddle { diff --git a/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu new file mode 100644 index 00000000000..8c7ea10b265 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/test_cpu_graph_sample.cu @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
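The init_cpu_table/load pair above relies on a small producer/consumer handshake: the sampling callback publishes its result under the write lock and signals a condition variable, while load() kicks off sampling and blocks until the GPU-side graph has been rebuilt. A minimal standalone sketch of that handshake follows; names such as FakeSampler, FakeSampleRes and rebuild logic are illustrative stand-ins, not part of the patch (the sketch also guards the wait with a predicate, which protects against the callback firing before the waiter is ready).

#include <condition_variable>
#include <functional>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

// Stand-in for the sampled graph partitions handed to the callback.
struct FakeSampleRes { int node_size = 0; };

class FakeSampler {
 public:
  void set_callback(std::function<void(std::vector<FakeSampleRes>&)> cb) {
    callback_ = std::move(cb);
  }
  // Runs sampling on a worker thread and hands the result to the callback,
  // mirroring start_graph_sampling + run_graph_sampling.
  void start() {
    worker_ = std::thread([this] {
      std::vector<FakeSampleRes> res(2);
      res[0].node_size = 3;
      res[1].node_size = 4;
      callback_(res);
    });
  }
  void join() { worker_.join(); }

 private:
  std::function<void(std::vector<FakeSampleRes>&)> callback_;
  std::thread worker_;
};

int main() {
  std::mutex mu;
  std::condition_variable cv;
  bool graph_built = false;

  FakeSampler sampler;
  // Counterpart of the callback installed in init_cpu_table: consume the
  // sample result (here just count nodes), then notify the waiting loader.
  sampler.set_callback([&](std::vector<FakeSampleRes>& res) {
    int total = 0;
    for (auto& r : res) total += r.node_size;
    {
      std::lock_guard<std::mutex> guard(mu);
      graph_built = true;
      std::cout << "built gpu graph from " << total << " nodes\n";
    }
    cv.notify_one();
  });

  // Counterpart of GpuPsGraphTable::load: start sampling, then block until
  // the callback has rebuilt the GPU-side graph.
  sampler.start();
  std::unique_lock<std::mutex> lock(mu);
  cv.wait(lock, [&] { return graph_built; });
  sampler.join();
  return 0;
}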
+ +#include +#include +#include +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" +#include "paddle/fluid/platform/cuda_device_guard.h" + +using namespace paddle::framework; +void prepare_file(char file_name[], std::vector data) { + std::ofstream ofile; + ofile.open(file_name); + for (auto x : data) { + ofile << x << std::endl; + } + + ofile.close(); +} +char edge_file_name[] = "edges.txt"; +TEST(TEST_FLEET, graph_sample) { + std::vector edges; + int gpu_count = 3; + std::vector dev_ids; + dev_ids.push_back(0); + dev_ids.push_back(1); + dev_ids.push_back(2); + + std::shared_ptr resource = + std::make_shared(dev_ids); + resource->enable_p2p(); + GpuPsGraphTable g(resource); + int node_count = 10; + std::vector> neighbors(node_count); + int ind = 0; + int64_t node_id = 0; + // std::vector graph_list(gpu_count); + while (ind < node_count) { + int neighbor_size = ind + 1; + while (neighbor_size--) { + edges.push_back(std::to_string(ind) + "\t" + std::to_string(node_id) + + "\t1.0"); + node_id++; + } + ind++; + } + /* + gpu 0: + 0,3,6,9 + gpu 1: + 1,4,7 + gpu 2: + 2,5,8 + + query(2,6) returns nodes [6,9,1,4,7,2] + */ + ::paddle::distributed::GraphParameter table_proto; + table_proto.set_gpups_mode(true); + table_proto.set_gpups_mode_shard_num(127); + table_proto.set_gpu_num(3); + table_proto.set_gpups_graph_sample_class("BasicBfsGraphSampler"); + table_proto.set_gpups_graph_sample_args("5,5,1,1"); + prepare_file(edge_file_name, edges); + g.init_cpu_table(table_proto); + g.load(std::string(edge_file_name), std::string("e>")); + /* + node x's neighbor list = [(1+x)*x/2,(1+x)*x/2 + 1,.....,(1+x)*x/2 + x] + so node 6's neighbors are [21,22...,27] + node 7's neighbors are [28,29,..35] + node 0's neighbors are [0] + query([7,0,6],sample_size=3) should return [28,29,30,0,x,x,21,22,23] + 6 --index-->2 + 0 --index--->0 + 7 --index-->2 + */ + int64_t cpu_key[3] = {7, 0, 6}; + void *key; + cudaMalloc((void **)&key, 3 * sizeof(int64_t)); + cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); + auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3); + int64_t *res = new int64_t[9]; + cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost); + std::sort(res, res + 3); + std::sort(res + 6, res + 9); + int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23}; + for (int i = 0; i < 9; i++) { + if (expected_sample_val[i] != -1) { + ASSERT_EQ(res[i], expected_sample_val[i]); + } + } + delete[] res; + delete neighbor_sample_res; +} diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 3145a9cf765..01dae420cc6 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -225,7 +225,7 @@ void BindGraphPyClient(py::module* m) { .def("stop_server", &GraphPyClient::stop_server) .def("get_node_feat", [](GraphPyClient& self, std::string node_type, - std::vector node_ids, + std::vector node_ids, std::vector feature_names) { auto feats = self.get_node_feat(node_type, node_ids, feature_names); @@ -239,7 +239,7 @@ void BindGraphPyClient(py::module* m) { }) .def("set_node_feat", [](GraphPyClient& self, std::string node_type, - std::vector node_ids, + std::vector node_ids, std::vector feature_names, std::vector> bytes_feats) { 
std::vector> feats(bytes_feats.size()); -- GitLab From 1904572ac8edb57dfb528e711588758002a168dd Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 17 Mar 2022 21:28:39 +0800 Subject: [PATCH 149/176] [Phi] Move assign kernel into phi (#40022) * move assign kernel init commit * change vec to vec * support tensor array * support api declare * fix test_list failed * fix npu and xpu failed * fix infrt failed * remove assign array size in operator * move assign sr header into sr dir * add infermeta for assign * test op success * fix test_list failed * fix kunlun failed * add set host allocator in tests * support tensor array in arg ctx * open set layout in share_meta * fix meta tensor layout error * fix test failed --- paddle/fluid/framework/infershape_utils.cc | 65 ++++++++++++++++--- paddle/fluid/framework/operator.cc | 32 +++++++-- paddle/fluid/framework/operator.h | 4 ++ paddle/fluid/imperative/prepared_operator.h | 26 +++++++- paddle/fluid/operators/assign_op.cc | 61 ++--------------- paddle/fluid/operators/assign_op_npu_test.cc | 2 +- .../dialect/phi/pass/proto_arg_map_context.cc | 4 ++ .../dialect/phi/pass/proto_arg_map_context.h | 1 + paddle/phi/core/compat/arg_map_context.h | 2 + paddle/phi/core/kernel_context.cc | 14 ++++ paddle/phi/core/kernel_context.h | 6 ++ paddle/phi/kernels/assign_kernel.cc | 63 ++++++++++++++++++ paddle/phi/kernels/assign_kernel.h | 34 ++++++++++ paddle/phi/kernels/cpu/copy_kernel.cc | 2 +- .../kernels/selected_rows/assign_kernel.cc | 49 ++++++++++++++ .../phi/kernels/selected_rows/assign_kernel.h | 28 ++++++++ paddle/phi/ops/compat/assign_sig.cc | 35 ++++++++++ paddle/phi/tests/kernels/test_copy_dev_api.cc | 4 ++ .../phi/tests/kernels/test_flatten_dev_api.cc | 4 ++ .../phi/tests/kernels/test_reshape_dev_api.cc | 4 ++ paddle/phi/tests/ops/test_op_signature.h | 5 ++ 21 files changed, 371 insertions(+), 74 deletions(-) create mode 100644 paddle/phi/kernels/assign_kernel.cc create mode 100644 paddle/phi/kernels/assign_kernel.h create mode 100644 paddle/phi/kernels/selected_rows/assign_kernel.cc create mode 100644 paddle/phi/kernels/selected_rows/assign_kernel.h create mode 100644 paddle/phi/ops/compat/assign_sig.cc diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index dec8d1d846c..2babecc6ddf 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -78,6 +78,11 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { return var_types[0] == proto::VarType::SELECTED_ROWS; } + bool IsDenseTensorVectorInput(const std::string& name) const override { + auto var_types = ctx_.GetInputsVarType(name); + return var_types[0] == proto::VarType::LOD_TENSOR_ARRAY; + } + bool IsDenseTensorOutput(const std::string& name) const override { auto var_types = ctx_.GetOutputsVarType(name); return var_types[0] == proto::VarType::LOD_TENSOR; @@ -125,9 +130,14 @@ class CompatMetaTensor : public phi::MetaTensor { return var->Get().dims(); } else if (var->IsType()) { return var->Get().dims(); + } else if (var->IsType()) { + // use tensor array size as dims + auto& tensor_array = var->Get(); + return phi::make_ddim({static_cast(tensor_array.size())}); } else { PADDLE_THROW(platform::errors::Unimplemented( - "Currently, only can get dims from DenseTensor or SelectedRows.")); + "Currently, only can get dims from DenseTensor or SelectedRows or " + "DenseTensorArray.")); } } else { auto* var = BOOST_GET_CONST(VarDesc*, var_); @@ -144,6 +154,10 @@ class 
CompatMetaTensor : public phi::MetaTensor { return var->Get().dtype(); } else if (var->IsType()) { return var->Get().dtype(); + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported get dtype from LoDTensorArray now + return phi::DataType::UNDEFINED; } else { PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can get dtype from DenseTensor or SelectedRows.")); @@ -157,7 +171,19 @@ class CompatMetaTensor : public phi::MetaTensor { DataLayout layout() const override { if (is_runtime_) { auto* var = BOOST_GET_CONST(Variable*, var_); - return var->Get().layout(); + if (var->IsType()) { + return var->Get().layout(); + } else if (var->IsType()) { + return var->Get().layout(); + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported get layout from LoDTensorArray now + return phi::DataLayout::UNDEFINED; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Currently, only can get layout from DenseTensor or " + "SelectedRows.")); + } } else { // NOTE(chenweihang): do nothing // Unsupported get layout for VarDesc now @@ -174,6 +200,16 @@ class CompatMetaTensor : public phi::MetaTensor { } else if (var->IsType()) { auto* tensor = var->GetMutable()->mutable_value(); phi::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims; + } else if (var->IsType()) { + auto* tensor_array = var->GetMutable(); + // Note: Here I want enforce `tensor_array->size() == 0UL`, because + // inplace using on LoDTensorArray is dangerous, but the unittest + // `test_list` contains this behavior + PADDLE_ENFORCE_EQ(dims.size(), 1UL, + platform::errors::InvalidArgument( + "LoDTensorArray can only have one dimension.")); + // only set the array size for LoDTensorArray input + tensor_array->resize(dims[0]); } else { PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can set dims from DenseTensor or SelectedRows.")); @@ -193,6 +229,9 @@ class CompatMetaTensor : public phi::MetaTensor { } else if (var->IsType()) { auto* tensor = var->GetMutable()->mutable_value(); phi::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype; + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported set dtype for LoDTensorArray now } else { PADDLE_THROW(platform::errors::Unimplemented( "Currently, only can set dtype from DenseTensor or SelectedRows.")); @@ -206,10 +245,20 @@ class CompatMetaTensor : public phi::MetaTensor { void set_layout(DataLayout layout) override { if (is_runtime_) { auto* var = BOOST_GET(Variable*, var_); - LoDTensor* tensor = var->GetMutable(); - phi::DenseTensorUtils::GetMutableMeta( - static_cast(tensor)) - ->layout = layout; + if (var->IsType()) { + auto* tensor = var->GetMutable(); + phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout; + } else if (var->IsType()) { + auto* tensor = var->GetMutable()->mutable_value(); + phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout; + } else if (var->IsType()) { + // NOTE(chenweihang): do nothing + // Unsupported set dtype for LoDTensorArray now + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Currently, only can set layout from DenseTensor or " + "SelectedRows.")); + } } else { // NOTE(chenweihang): do nothing // Unsupported set layout for VarDesc now @@ -251,9 +300,7 @@ class CompatMetaTensor : public phi::MetaTensor { void share_meta(const MetaTensor& meta_tensor) override { share_dims(meta_tensor); set_dtype(meta_tensor.dtype()); - // VarDesc doesn't contains layout, so we cannot share layout - // set_layout(meta_tensor.layout()); - + 
set_layout(meta_tensor.layout()); // special case: share lod of LoDTensor share_lod(meta_tensor); } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ad01adf1a25..ec28c98d598 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2103,16 +2103,25 @@ void OperatorWithKernel::BuildPhiKernelContext( auto* var = ins_vector[offset]; if (var->IsType()) { tensor_in = &(var->Get()); + pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); } else if (var->IsType()) { tensor_in = &(var->Get()); + pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); + } else if (var->IsType()) { + paddle::SmallVector tensor_vector; + auto& tensor_array = var->Get(); + for (auto& t : tensor_array) { + tensor_vector.emplace_back(&t); + } + pt_kernel_context->EmplaceBackInputsWithoutSetRange(tensor_vector); + end_idx += tensor_array.size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported input `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } - - pt_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); } + // Note: here cannot deal with vector input pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i); } VLOG(4) << "Done inputs"; @@ -2140,22 +2149,33 @@ void OperatorWithKernel::BuildPhiKernelContext( for (size_t offset = 0; offset < outs_vector.size(); ++offset) { phi::TensorBase* tensor_out = nullptr; auto* var = outs_vector[offset]; - if (var) { if (var->template IsType()) { tensor_out = var->template GetMutable(); + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } else if (var->template IsType()) { tensor_out = var->template GetMutable(); + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); + } else if (var->template IsType()) { + paddle::SmallVector tensor_vector; + auto* tensor_array = + var->template GetMutable(); + // Note: If the input LoDTensorArray size is 0, the output + // LoDTensorArray is also 0 + for (auto& t : *tensor_array) { + tensor_vector.emplace_back(&t); + } + pt_kernel_context->EmplaceBackOutputsWithoutSetRange(tensor_vector); + end_idx += tensor_array->size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported output `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } + } else { + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } - - pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } - pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } VLOG(4) << "Done outputs"; diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 1a1171f1dba..6f68c261d2b 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -483,6 +483,10 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { return ctx_.InputVar(name)->IsType(); } + bool IsDenseTensorVectorInput(const std::string& name) const override { + return ctx_.InputVar(name)->IsType(); + } + bool IsDenseTensorOutput(const std::string& name) const override { return ctx_.OutputVar(name)->IsType(); } diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 16f2df79246..f70f44878e3 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -289,14 +289,23 @@ void BuildDygraphPhiKernelContext( auto& var = ins_vector[offset]->Var(); if (var.template IsType()) { tensor_in = 
&(var.template Get()); + kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); } else if (var.template IsType()) { tensor_in = &(var.template Get()); + kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); + } else if (var.template IsType()) { + paddle::SmallVector tensor_vector; + auto& tensor_array = var.template Get(); + for (auto& t : tensor_array) { + tensor_vector.emplace_back(&t); + } + kernel_ctx->EmplaceBackInputsWithoutSetRange(tensor_vector); + end_idx += tensor_array.size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported input `%s` type when call pt kernel.", framework::ToTypeName(var.Type()))); } - kernel_ctx->EmplaceBackInputWithoutSetRange(tensor_in); } kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); } @@ -326,16 +335,27 @@ void BuildDygraphPhiKernelContext( if (var) { if (var->template IsType()) { tensor_out = var->template GetMutable(); + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } else if (var->template IsType()) { tensor_out = var->template GetMutable(); + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); + } else if (var->template IsType()) { + paddle::SmallVector tensor_vector; + auto* tensor_array = + var->template GetMutable(); + for (auto& t : *tensor_array) { + tensor_vector.emplace_back(&t); + } + kernel_ctx->EmplaceBackOutputsWithoutSetRange(tensor_vector); + end_idx += tensor_array->size() - 1; } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported output `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } + } else { + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } - - kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index 684ac5bafd0..ea6614cbfbd 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -16,6 +16,9 @@ limitations under the License. */ #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { class OpDesc; @@ -36,26 +39,6 @@ class AssignOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->HasInput("X")) { - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::SELECTED_ROWS || - type == framework::proto::VarType::LOD_TENSOR) { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } else if (type == framework::proto::VarType::LOD_TENSOR_ARRAY) { - if (ctx->IsRuntime()) { - // The runtime output shape is determined in kernel. 
- return; - } else { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } - } - } - } - protected: framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const framework::Tensor &tensor, @@ -91,24 +74,6 @@ class AssignInferVarType : public framework::VarTypeInference { } }; -class AssignKernel { - public: - void operator()(const framework::ExecutionContext &ctx) const { - auto *x = ctx.InputVar("X"); - if (x == nullptr) { - return; - } - PADDLE_ENFORCE_EQ( - ctx.HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of assign_op is not found.")); - auto *out = ctx.OutputVar("Out"); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(ctx.GetPlace()); - - framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); - } -}; - class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -147,23 +112,11 @@ DECLARE_INPLACE_OP_INFERER(AssignOpInplaceInferer, {"X", "Out"}); namespace ops = paddle::operators; namespace plat = paddle::platform; + +DECLARE_INFER_SHAPE_FUNCTOR(assign, AssignInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker, ops::AssignGradMaker, ops::AssignOpProtoMaker, ops::AssignOpInplaceInferer, - ops::AssignInferVarType); - -REGISTER_OP_CPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, - ops::AssignKernel, int, ops::AssignKernel, - int64_t, ops::AssignKernel, uint8_t, - ops::AssignKernel, bool, ops::AssignKernel, - plat::float16, ops::AssignKernel, plat::bfloat16, - ops::AssignKernel); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, - ops::AssignKernel, int, ops::AssignKernel, - int64_t, ops::AssignKernel, uint8_t, - ops::AssignKernel, bool, ops::AssignKernel, - plat::float16, ops::AssignKernel); -#endif + ops::AssignInferVarType, AssignInferShapeFunctor); diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc index b452dea8536..b91eb50646f 100644 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -29,7 +29,7 @@ limitations under the License. 
*/ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(assign); +USE_OP_ITSELF(assign); USE_OP_DEVICE_KERNEL(assign, NPU); template diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc index 64b18435970..1cd5b5a8551 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc @@ -60,6 +60,10 @@ bool ProtoArgumentMappingContext::IsSelectedRowsInput( const std::string& name) const { return false; } +bool ProtoArgumentMappingContext::IsDenseTensorVectorInput( + const std::string& name) const { + return false; +} bool ProtoArgumentMappingContext::IsDenseTensorOutput( const std::string& name) const { diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h index 7d08c32161b..5cf2ef97907 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h @@ -42,6 +42,7 @@ class ProtoArgumentMappingContext : public ::phi::ArgumentMappingContext { bool IsDenseTensorInput(const std::string& name) const override; bool IsSelectedRowsInput(const std::string& name) const override; + bool IsDenseTensorVectorInput(const std::string& name) const override; bool IsDenseTensorOutput(const std::string& name) const override; bool IsSelectedRowsOutput(const std::string& name) const override; diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h index 25b80279ecf..71cec011411 100644 --- a/paddle/phi/core/compat/arg_map_context.h +++ b/paddle/phi/core/compat/arg_map_context.h @@ -89,6 +89,8 @@ class ArgumentMappingContext { virtual bool IsDenseTensorInput(const std::string& name) const = 0; virtual bool IsSelectedRowsInput(const std::string& name) const = 0; + // For compatibility with LoDTensorArray + virtual bool IsDenseTensorVectorInput(const std::string& name) const = 0; virtual bool IsDenseTensorOutput(const std::string& name) const = 0; virtual bool IsSelectedRowsOutput(const std::string& name) const = 0; diff --git a/paddle/phi/core/kernel_context.cc b/paddle/phi/core/kernel_context.cc index a32e0e44f46..234e3528c36 100644 --- a/paddle/phi/core/kernel_context.cc +++ b/paddle/phi/core/kernel_context.cc @@ -37,6 +37,13 @@ void KernelContext::EmplaceBackInputs( std::make_move_iterator(inputs.end())); } +void KernelContext::EmplaceBackInputsWithoutSetRange( + paddle::SmallVector inputs) { + inputs_.insert(inputs_.end(), + std::make_move_iterator(inputs.begin()), + std::make_move_iterator(inputs.end())); +} + void KernelContext::EmplaceBackOutput(TensorBase* output) { int index = outputs_.size(); outputs_.emplace_back(output); @@ -59,6 +66,13 @@ void KernelContext::EmplaceBackOutputs( std::make_move_iterator(outputs.end())); } +void KernelContext::EmplaceBackOutputsWithoutSetRange( + paddle::SmallVector outputs) { + outputs_.insert(outputs_.end(), + std::make_move_iterator(outputs.begin()), + std::make_move_iterator(outputs.end())); +} + void KernelContext::EmplaceBackAttr(paddle::any attr) { attrs_.emplace_back(std::move(attr)); } diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index 213ac47d30b..d3ca1ffc61c 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -52,12 +52,18 @@ class KernelContext { void EmplaceBackInputs(paddle::SmallVector inputs); + void EmplaceBackInputsWithoutSetRange( + paddle::SmallVector inputs); 
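The reason the EmplaceBack*WithoutSetRange variants exist alongside AssignInputRange/AssignOutputRange is that a single op argument (for example a LoDTensorArray) can expand into several tensor pointers that must all share one [start, end) slot, as the operator.cc and prepared_operator.h hunks above do. A reduced sketch of that bookkeeping, independent of the real KernelContext (the class RangeDemo and integer "tensor ids" are illustrative only):

#include <cassert>
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

// Reduced stand-in for the kernel context's input bookkeeping: tensors are
// stored flat, and each op argument owns one contiguous [start, end) range.
class RangeDemo {
 public:
  // Push one tensor and record its range immediately (single-tensor input).
  void EmplaceBackInput(int tensor_id) {
    int index = static_cast<int>(inputs_.size());
    inputs_.push_back(tensor_id);
    ranges_.emplace_back(index, index + 1);
  }
  // Push tensors without touching ranges; used when one argument expands
  // into many tensors (e.g. a LoDTensorArray) and the caller sets the range
  // once for the whole group afterwards.
  void EmplaceBackInputsWithoutSetRange(const std::vector<int>& tensor_ids) {
    inputs_.insert(inputs_.end(), tensor_ids.begin(), tensor_ids.end());
  }
  void AssignInputRange(std::pair<int, int> range, size_t arg_idx) {
    if (arg_idx >= ranges_.size()) ranges_.resize(arg_idx + 1);
    ranges_[arg_idx] = range;
  }
  const std::pair<int, int>& InputRangeAt(size_t arg_idx) const {
    return ranges_.at(arg_idx);
  }
  size_t NumTensors() const { return inputs_.size(); }

 private:
  std::vector<int> inputs_;                  // flat tensor list
  std::vector<std::pair<int, int>> ranges_;  // one range per op argument
};

int main() {
  RangeDemo ctx;
  // Argument 0: a plain DenseTensor.
  ctx.EmplaceBackInput(/*tensor_id=*/100);
  // Argument 1: a LoDTensorArray holding three tensors.
  int start = static_cast<int>(ctx.NumTensors());
  ctx.EmplaceBackInputsWithoutSetRange({200, 201, 202});
  ctx.AssignInputRange({start, static_cast<int>(ctx.NumTensors())}, 1);

  assert(ctx.InputRangeAt(0) == std::make_pair(0, 1));
  assert(ctx.InputRangeAt(1) == std::make_pair(1, 4));
  std::cout << "argument 1 spans tensors [" << ctx.InputRangeAt(1).first
            << ", " << ctx.InputRangeAt(1).second << ")\n";
  return 0;
}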
+ void EmplaceBackOutput(TensorBase* output); void EmplaceBackOutputWithoutSetRange(TensorBase* output); void EmplaceBackOutputs(paddle::SmallVector outputs); + void EmplaceBackOutputsWithoutSetRange( + paddle::SmallVector outputs); + void EmplaceBackAttr(paddle::any attr); const std::pair& InputRangeAt(size_t idx) const; diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc new file mode 100644 index 00000000000..9faaace6917 --- /dev/null +++ b/paddle/phi/kernels/assign_kernel.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/assign_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void AssignKernel(const Context& dev_ctx, + paddle::optional x, + DenseTensor* out) { + if (!x.is_initialized()) { + return; + } + auto& x_tensor = *x.get_ptr(); + Copy(dev_ctx, x_tensor, x_tensor.place(), false, out); +} + +// Note: use `const paddle::optional&> x` +// as input if needed +template +void AssignArrayKernel(const Context& dev_ctx, + const std::vector& x, + std::vector out) { + for (size_t i = 0; i < x.size(); ++i) { + AssignKernel(dev_ctx, *x[i], out.at(i)); + } +} + +} // namespace phi + +PD_REGISTER_GENERAL_KERNEL( + assign, CPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL(assign_array, + CPU, + ALL_LAYOUT, + phi::AssignArrayKernel, + ALL_DTYPE) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL( + assign, GPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) {} +PD_REGISTER_GENERAL_KERNEL(assign_array, + GPU, + ALL_LAYOUT, + phi::AssignArrayKernel, + ALL_DTYPE) {} +#endif diff --git a/paddle/phi/kernels/assign_kernel.h b/paddle/phi/kernels/assign_kernel.h new file mode 100644 index 00000000000..7cc06818dc0 --- /dev/null +++ b/paddle/phi/kernels/assign_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +// In order to be compatible with the `AsDispensable` input in the original +// assign op maker, the input parameter here needs to be dispensable, but +// this looks weird +template +void AssignKernel(const Context& dev_ctx, + paddle::optional x, + DenseTensor* out); + +template +void AssignArrayKernel(const Context& dev_ctx, + const std::vector& x, + std::vector out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/copy_kernel.cc b/paddle/phi/kernels/cpu/copy_kernel.cc index 1af071f23dd..fa11fd05bf1 100644 --- a/paddle/phi/kernels/cpu/copy_kernel.cc +++ b/paddle/phi/kernels/cpu/copy_kernel.cc @@ -38,7 +38,7 @@ void Copy(const Context& dev_ctx, << src_place; dst->Resize(src.dims()); - auto* dst_ptr = dev_ctx.Alloc(dst, src.dtype()); + auto* dst_ptr = dev_ctx.HostAlloc(dst, src.dtype()); if (src_ptr == dst_ptr) { VLOG(3) << "Skip copy the same data async from " << src_place << " to " diff --git a/paddle/phi/kernels/selected_rows/assign_kernel.cc b/paddle/phi/kernels/selected_rows/assign_kernel.cc new file mode 100644 index 00000000000..fae876facfc --- /dev/null +++ b/paddle/phi/kernels/selected_rows/assign_kernel.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selected_rows/assign_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/assign_kernel.h" + +namespace phi { +namespace sr { + +// Note: use `const paddle::optional x` +// as input if needed +template +void AssignKernel(const Context& dev_ctx, + const SelectedRows& x, + SelectedRows* out) { + out->set_rows(x.rows()); + out->set_height(x.height()); + phi::AssignKernel(dev_ctx, x.value(), out->mutable_value()); +} + +} // namespace sr +} // namespace phi + +PD_REGISTER_GENERAL_KERNEL(assign_sr, + CPU, + ALL_LAYOUT, + phi::sr::AssignKernel, + ALL_DTYPE) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_GENERAL_KERNEL(assign_sr, + GPU, + ALL_LAYOUT, + phi::sr::AssignKernel, + ALL_DTYPE) {} +#endif diff --git a/paddle/phi/kernels/selected_rows/assign_kernel.h b/paddle/phi/kernels/selected_rows/assign_kernel.h new file mode 100644 index 00000000000..2ba465615a7 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/assign_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void AssignKernel(const Context& dev_ctx, + const SelectedRows& x, + SelectedRows* out); + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/ops/compat/assign_sig.cc b/paddle/phi/ops/compat/assign_sig.cc new file mode 100644 index 00000000000..d149e8e6a9a --- /dev/null +++ b/paddle/phi/ops/compat/assign_sig.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature AssignOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.HasInput("X")) { + if (ctx.IsDenseTensorVectorInput("X")) { + return KernelSignature("assign_array", {"X"}, {}, {"Out"}); + } else if (ctx.IsSelectedRowsInput("X")) { + return KernelSignature("assign_sr", {"X"}, {}, {"Out"}); + } else { + return KernelSignature("assign", {"X"}, {}, {"Out"}); + } + } else { + return KernelSignature("assign", {"X"}, {}, {"Out"}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(assign, phi::AssignOpArgumentMapping); diff --git a/paddle/phi/tests/kernels/test_copy_dev_api.cc b/paddle/phi/tests/kernels/test_copy_dev_api.cc index d69c7b2174f..460d85f8313 100644 --- a/paddle/phi/tests/kernels/test_copy_dev_api.cc +++ b/paddle/phi/tests/kernels/test_copy_dev_api.cc @@ -61,6 +61,10 @@ TEST(DEV_API, copy) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); + dev_ctx.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); dev_ctx.Init(); phi::Copy( dev_ctx, *(dense_src.get()), phi::CPUPlace(), false, dense_dst.get()); diff --git a/paddle/phi/tests/kernels/test_flatten_dev_api.cc b/paddle/phi/tests/kernels/test_flatten_dev_api.cc index dc283728ee5..e3f2e8b57e3 100644 --- a/paddle/phi/tests/kernels/test_flatten_dev_api.cc +++ b/paddle/phi/tests/kernels/test_flatten_dev_api.cc @@ -58,6 +58,10 @@ TEST(DEV_API, flatten) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); + dev_ctx.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); dev_ctx.Init(); // 2. 
test API diff --git a/paddle/phi/tests/kernels/test_reshape_dev_api.cc b/paddle/phi/tests/kernels/test_reshape_dev_api.cc index 16ad4fc341b..7de039372fa 100644 --- a/paddle/phi/tests/kernels/test_reshape_dev_api.cc +++ b/paddle/phi/tests/kernels/test_reshape_dev_api.cc @@ -50,6 +50,10 @@ TEST(DEV_API, reshape) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); + dev_ctx.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); dev_ctx.Init(); auto out = phi::Reshape(dev_ctx, dense_x, shape); // 3. check result diff --git a/paddle/phi/tests/ops/test_op_signature.h b/paddle/phi/tests/ops/test_op_signature.h index 06048f33d94..8468dad10eb 100644 --- a/paddle/phi/tests/ops/test_op_signature.h +++ b/paddle/phi/tests/ops/test_op_signature.h @@ -72,6 +72,11 @@ class TestArgumentMappingContext : public phi::ArgumentMappingContext { return selected_rows_inputs.count(name) > 0; } + // add member if needed + bool IsDenseTensorVectorInput(const std::string& name) const override { + return false; + } + bool IsDenseTensorOutput(const std::string& name) const override { return dense_tensor_outputs.count(name) > 0; } -- GitLab From 081e4307b47d2c9e119b470d274886188a14391a Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Fri, 18 Mar 2022 09:40:54 +0800 Subject: [PATCH 150/176] Optimize perf of softmax_with_cross_entropy_bwd (#40643) * Optimize perf of softmax_with_cross_entropy_bwd * fix * fix --- .../softmax_with_cross_entropy_op.cu | 31 ++++++++++++------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 19a395e7231..41545a1ca20 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -760,8 +760,9 @@ static void SoftmaxWithCrossEntropyHardLabel( */ template __global__ void SoftmaxWithCrossEntropyGradHardLabel( - T* logits_grad, const T* loss_grad, const LabelT* labels, const int64_t n, - const int64_t dim, const int64_t d, const int ignore_index) { + T* logits_grad, const T* loss_grad, const T* softmax, const LabelT* labels, + const int64_t n, const int64_t dim, const int64_t d, + const int ignore_index) { int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; int64_t idx_n = idx / (d * dim); int64_t idx_dim = (idx / d) % dim; @@ -773,10 +774,9 @@ __global__ void SoftmaxWithCrossEntropyGradHardLabel( if (lbl == ignore_index) { logits_grad[idx] = static_cast(0.0); } else if (lbl == idx_dim) { - logits_grad[idx] = - (logits_grad[idx] - static_cast(1.0)) * loss_grad[ids]; + logits_grad[idx] = (softmax[idx] - static_cast(1.0)) * loss_grad[ids]; } else { - logits_grad[idx] *= loss_grad[ids]; + logits_grad[idx] = softmax[idx] * loss_grad[ids]; } } } @@ -1395,11 +1395,20 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { Tensor* logit_grad = context.Output(framework::GradVarName("Logits")); const Tensor* softmax = context.Input("Softmax"); - if (logit_grad != softmax) { + auto stream = context.cuda_device_context().stream(); + auto ignore_index = context.Attr("ignore_index"); + auto use_softmax = context.Attr("use_softmax"); + + T* logit_grad_data = nullptr; + bool copy_flag = (logit_grad != softmax && (!use_softmax || soft_label)); + if (copy_flag) { framework::TensorCopy(*softmax, 
context.GetPlace(), context.device_context(), logit_grad); + logit_grad_data = logit_grad->template data(); + } else { + logit_grad_data = + logit_grad->template mutable_data(context.GetPlace()); } - T* logit_grad_data = logit_grad->template data(); const int rank = logit_grad->dims().size(); const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); @@ -1414,9 +1423,6 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { #else int block = 512; #endif - auto stream = context.cuda_device_context().stream(); - auto ignore_index = context.Attr("ignore_index"); - auto use_softmax = context.Attr("use_softmax"); // do not with softmax op, and input is softmax if (!use_softmax) { @@ -1451,11 +1457,12 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { SoftCrossEntropyGradientKernel<<>>( logit_grad_data, loss_grad_data, label_data, n, d, remain); } else { + const T* softmax_data = softmax->template data(); const auto* label_data = labels.template data(); int grid = (n * d + block - 1) / block; SoftmaxWithCrossEntropyGradHardLabel<<>>( - logit_grad_data, loss_grad_data, label_data, n, d / remain, remain, - ignore_index); + logit_grad_data, loss_grad_data, softmax_data, label_data, n, + d / remain, remain, ignore_index); } } }; -- GitLab From ef4ef15427255d8128b1cc4754ab9d9c02f87651 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Fri, 18 Mar 2022 10:02:12 +0800 Subject: [PATCH 151/176] [infrt] rename pd dialect from mlir to infrt. (#40651) * [infrt] rename pd dialect from mlir to infrt. test=develop * [infrt] fix the kernel signature generator bug. --- .../pybind/kernel_signature_generator.cc | 13 +++++-- paddle/infrt/dialect/init_dialects.cc | 7 ++-- paddle/infrt/dialect/pd/ir/pd_op_base.td | 10 +++--- paddle/infrt/dialect/pd/ir/pd_ops.cc | 35 ++++++++++--------- .../dialect/tensorrt/trt_graph_fuse_pass.cc | 14 ++++---- .../dialect/tensorrt/trt_graph_split_pass.cc | 8 ++--- .../dialect/tensorrt/trt_op_converter_pass.cc | 2 +- .../dialect/tensorrt/trt_op_teller_pass.cc | 8 ++--- paddle/infrt/host_context/paddle_mlir.cc | 2 +- paddle/infrt/tests/model/test_abs.cc | 2 +- tools/infrt/custom_pdop.td | 2 +- 11 files changed, 57 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/pybind/kernel_signature_generator.cc b/paddle/fluid/pybind/kernel_signature_generator.cc index 8d78adaf5a4..1520174fba2 100644 --- a/paddle/fluid/pybind/kernel_signature_generator.cc +++ b/paddle/fluid/pybind/kernel_signature_generator.cc @@ -46,10 +46,19 @@ int main(int argc, char **argv) { auto &kernel_factory = phi::KernelFactory::Instance(); std::string kernel_signature_map_str{"{"}; for (const auto &op_kernel_pair : kernel_factory.kernels()) { - if (kernel_signature_map.Has(op_kernel_pair.first)) { + std::string op_name = op_kernel_pair.first; + const paddle::flat_hash_map &kernel_name_map = + phi::OpUtilsMap::Instance().base_kernel_name_map(); + for (auto &it : kernel_name_map) { + if (it.second == op_name) { + op_name = it.first; + break; + } + } + if (kernel_signature_map.Has(op_name)) { kernel_signature_map_str = kernel_signature_map_str + "\"" + op_kernel_pair.first + "\":{"; - auto &args = kernel_signature_map.Get(op_kernel_pair.first).args; + auto &args = kernel_signature_map.Get(op_name).args; kernel_signature_map_str += "\"inputs\":["; auto inputs_ = std::get<0>(args); diff --git a/paddle/infrt/dialect/init_dialects.cc b/paddle/infrt/dialect/init_dialects.cc index 
6183295cafb..56c375c72d2 100644 --- a/paddle/infrt/dialect/init_dialects.cc +++ b/paddle/infrt/dialect/init_dialects.cc @@ -33,13 +33,14 @@ void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT registry.insert(); } diff --git a/paddle/infrt/dialect/pd/ir/pd_op_base.td b/paddle/infrt/dialect/pd/ir/pd_op_base.td index 7cab0eca45a..e28854a8480 100644 --- a/paddle/infrt/dialect/pd/ir/pd_op_base.td +++ b/paddle/infrt/dialect/pd/ir/pd_op_base.td @@ -17,7 +17,7 @@ def Paddle_Dialect : Dialect { This dialect contains the PaddlePaddle operators. }]; let hasConstantMaterializer = 1; - let cppNamespace = "mlir::pd"; + let cppNamespace = "infrt::pd"; } class PD_Op traits = []> : @@ -25,7 +25,7 @@ class PD_Op traits = []> : class PD_PaddleAttr : - Attr()">, + Attr()">, "PaddlePaddle " # description # " attribute">; @@ -33,12 +33,12 @@ class PD_PaddleAttr : // PaddlePaddle type definitions //===----------------------------------------------------------------------===// -def PD_PDDialectType : Type()">, "PaddlePaddle type">; +def PD_PDDialectType : Type()">, "PaddlePaddle type">; class PD_PaddleType : - Type()">, + Type()">, "Paddle " # description # " type">, - BuildableType<"getType()">; + BuildableType<"getType()">; //===----------------------------------------------------------------------===// // Integer types diff --git a/paddle/infrt/dialect/pd/ir/pd_ops.cc b/paddle/infrt/dialect/pd/ir/pd_ops.cc index d105aa07dd0..b5ba48581ee 100644 --- a/paddle/infrt/dialect/pd/ir/pd_ops.cc +++ b/paddle/infrt/dialect/pd/ir/pd_ops.cc @@ -24,7 +24,7 @@ #define GET_OP_CLASSES #include "paddle/infrt/dialect/pd/ir/pd_extra_ops.cpp.inc" // NOLINT -namespace mlir { +namespace infrt { namespace pd { void PaddleDialect::initialize() { addOperations< @@ -43,33 +43,34 @@ mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder, return builder.create(loc, value); } -void ConstantOp::build(OpBuilder &builder, - OperationState &state, - Attribute value) { - if (auto elem_attr = value.dyn_cast()) { +void ConstantOp::build(mlir::OpBuilder &builder, + mlir::OperationState &state, + mlir::Attribute value) { + if (auto elem_attr = value.dyn_cast()) { return ConstantOp::build(builder, state, elem_attr); - } else if (value.isa()) { - ShapedType type = RankedTensorType::get(/*shape=*/{}, value.getType()); - state.addAttribute("value", DenseElementsAttr::get(type, value)); + } else if (value.isa()) { + mlir::ShapedType type = + mlir::RankedTensorType::get(/*shape=*/{}, value.getType()); + state.addAttribute("value", mlir::DenseElementsAttr::get(type, value)); state.addTypes(type); return; } llvm_unreachable("unsupported attribute type for building pd.constant"); } -LogicalResult ConstantOp::inferReturnTypes( - MLIRContext *context, - Optional location, - ValueRange operands, - DictionaryAttr attributes, - RegionRange regions, - SmallVectorImpl &inferredReturnTypes) { +mlir::LogicalResult ConstantOp::inferReturnTypes( + mlir::MLIRContext *context, + mlir::Optional location, + mlir::ValueRange operands, + mlir::DictionaryAttr attributes, + mlir::RegionRange regions, + llvm::SmallVectorImpl &inferredReturnTypes) { inferredReturnTypes.push_back(attributes.get("value").getType()); - return success(); + return mlir::success(); } mlir::OpFoldResult ConstantOp::fold( ::llvm::ArrayRef operands) { return value(); } } // namespace pd -} // namespace mlir +} // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index 
e22a2309cbe..0878163a955 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -55,8 +55,8 @@ bool reverseDfs(std::vector source, // merge the first&second graph op to a new graph op. void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder, // NOLINT - mlir::pd::GraphOp first, - mlir::pd::GraphOp second) { + infrt::pd::GraphOp first, + infrt::pd::GraphOp second) { // comput inputs and outputs ::llvm::SmallVector inputs(first.getOperands()), outputs; for (mlir::Value input : second.getOperands()) { @@ -85,7 +85,7 @@ void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder, // NOLINT // create the new graph op builder.setInsertionPoint(first); auto loc = first.getLoc(); - auto graph_op = builder.create(loc, return_types, inputs); + auto graph_op = builder.create(loc, return_types, inputs); mlir::Block *block = new mlir::Block; auto copy_range = second.getBody()->without_terminator(); block->getOperations().splice(block->begin(), @@ -150,13 +150,13 @@ void TRTGraphFusePass::runOnFunction() { do { changed = false; for (auto &op : body) { - mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null(&op); + infrt::pd::GraphOp graph_op = + ::llvm::dyn_cast_or_null(&op); if (nullptr == graph_op) continue; for (auto user_op : op.getUsers()) { - mlir::pd::GraphOp user_graph_op = - ::llvm::dyn_cast_or_null(user_op); + infrt::pd::GraphOp user_graph_op = + ::llvm::dyn_cast_or_null(user_op); if (nullptr == user_graph_op) continue; // get all dst input nodes except src. std::vector source_nodes; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index f81179e548f..ade61bfc370 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -21,18 +21,18 @@ namespace infrt { namespace trt { // Implementation of the trtGraphSplitPass。 void TRTGraphSplitPass::runOnFunction() { - std::vector worklist; + std::vector worklist; mlir::Block& block = getFunction().front(); for (auto& op : block) { - mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null(&op); + infrt::pd::GraphOp graph_op = + ::llvm::dyn_cast_or_null(&op); if (nullptr != graph_op && graph_op.getBody()->getOperations().size() <= min_subgraph_size_) { worklist.push_back(graph_op); } } while (!worklist.empty()) { - mlir::pd::GraphOp graph_op = worklist.back(); + infrt::pd::GraphOp graph_op = worklist.back(); worklist.pop_back(); mlir::Block* body = graph_op.getBody(); auto return_op = body->getTerminator(); diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc index 1e6a3e13805..19c6b13e971 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc @@ -27,7 +27,7 @@ struct PD2TRT_GraphLower : public ::mlir::RewritePattern { : ::mlir::RewritePattern("pd.graph", 1, context, {"trt.create_engine"}) {} ::mlir::LogicalResult matchAndRewrite( ::mlir::Operation *op, ::mlir::PatternRewriter &rewriter) const override { - auto casted_op = ::llvm::dyn_cast(op); + auto casted_op = ::llvm::dyn_cast(op); ::mlir::Operation::operand_range inputs = casted_op.inputs(); auto ods_loc = rewriter.getFusedLoc(op->getLoc()); CreateEngineOp create_engine_op; diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 2c6f08277c8..ef9ccc82678 100644 --- 
a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -35,13 +35,13 @@ void TRTOpTellerPass::runOnFunction() { auto *op = worklist.back(); worklist.pop_back(); if (op == nullptr) continue; - if (::llvm::dyn_cast_or_null(op)) continue; - if (::llvm::dyn_cast_or_null(op)) continue; - if (::llvm::dyn_cast_or_null(op)) continue; + if (::llvm::dyn_cast_or_null(op)) continue; + if (::llvm::dyn_cast_or_null(op)) continue; + if (::llvm::dyn_cast_or_null(op)) continue; if (::llvm::dyn_cast_or_null<::infrt::ReturnOp>(op)) continue; builder.setInsertionPoint(op); auto loc = getFunction().getLoc(); - auto graph_op = builder.create( + auto graph_op = builder.create( loc, op->getResultTypes(), op->getOperands()); ::llvm::SmallVector tblgen_repl_values; diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc index 48999a23ef3..4e7de9e2df1 100644 --- a/paddle/infrt/host_context/paddle_mlir.cc +++ b/paddle/infrt/host_context/paddle_mlir.cc @@ -22,7 +22,7 @@ MLIRModelGenImpl::MLIRModelGenImpl() context_->getOrLoadDialect(); context_->getOrLoadDialect(); context_->getOrLoadDialect(); - context_->getOrLoadDialect(); + context_->getOrLoadDialect(); context_->getOrLoadDialect<::infrt::InfrtDialect>(); module_ = mlir::ModuleOp::create(mlir::UnknownLoc::get(context_)); } diff --git a/paddle/infrt/tests/model/test_abs.cc b/paddle/infrt/tests/model/test_abs.cc index 5de159b86fc..49266910dbd 100644 --- a/paddle/infrt/tests/model/test_abs.cc +++ b/paddle/infrt/tests/model/test_abs.cc @@ -72,7 +72,7 @@ TEST(ABS_MODEL, convert_and_execute) { context->getOrLoadDialect(); context->getOrLoadDialect(); context->getOrLoadDialect(); - context->getOrLoadDialect(); + context->getOrLoadDialect(); context->getOrLoadDialect(); context->getOrLoadDialect(); diff --git a/tools/infrt/custom_pdop.td b/tools/infrt/custom_pdop.td index 861b3194120..ae0316036f1 100644 --- a/tools/infrt/custom_pdop.td +++ b/tools/infrt/custom_pdop.td @@ -42,6 +42,6 @@ def PD_ConstantOp : PD_Op<"constant", [NoSideEffect, ConstantLike, DeclareOpInte let hasFolder = 1; let builders = [ - OpBuilder<(ins "Attribute":$value)>, + OpBuilder<(ins "mlir::Attribute":$value)>, ]; } -- GitLab From 1a13fa0fcbc7a0d7c1b9b56c770cc438cc9579ce Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Fri, 18 Mar 2022 10:43:47 +0800 Subject: [PATCH 152/176] [NPU] fix fp16 (PART II) (#40537) [NPU] fix fp16 (PART II) --- paddle/fluid/operators/conv_op_npu.cc | 2 +- paddle/fluid/operators/pad_op_npu.cc | 2 +- .../fluid/tests/unittests/npu/CMakeLists.txt | 1 + .../npu/test_conv2d_op_depthwise_conv_npu.py | 110 +++++++++++------- .../tests/unittests/npu/test_conv2d_op_npu.py | 63 +++++----- .../npu/test_elementwise_add_op_npu.py | 63 ++++++---- .../npu/test_elementwise_max_op_npu.py | 12 -- .../npu/test_elementwise_min_op_npu.py | 49 ++++---- .../npu/test_elementwise_pow_op_npu.py | 8 -- .../tests/unittests/npu/test_expand_op_npu.py | 14 +-- .../unittests/npu/test_huber_loss_op_npu.py | 6 - .../unittests/npu/test_label_smooth_op_npu.py | 6 +- .../unittests/npu/test_leaky_relu_op_npu.py | 6 +- .../unittests/npu/test_log_softmax_op_npu.py | 10 +- .../npu/test_lookup_table_v2_op_npu.py | 6 +- .../npu/test_nearest_interp_v2_op_npu.py | 25 +++- .../tests/unittests/npu/test_pad_op_npu.py | 7 +- .../tests/unittests/npu/test_relu_op_npu.py | 37 ++---- .../tests/unittests/npu/test_slice_op_npu.py | 17 ++- 19 files changed, 255 insertions(+), 189 
deletions(-) diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc index 8897f7b229c..fcda16a3e72 100644 --- a/paddle/fluid/operators/conv_op_npu.cc +++ b/paddle/fluid/operators/conv_op_npu.cc @@ -356,7 +356,7 @@ class NPUConvGradOpKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); + filter_grad->mutable_data(ctx.GetPlace()); std::vector filter_shape_vec = phi::vectorize(filter->dims()); const auto& runner = NpuOpRunner( diff --git a/paddle/fluid/operators/pad_op_npu.cc b/paddle/fluid/operators/pad_op_npu.cc index d0cb674b404..adc4a2ffaf8 100644 --- a/paddle/fluid/operators/pad_op_npu.cc +++ b/paddle/fluid/operators/pad_op_npu.cc @@ -90,5 +90,5 @@ namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL(pad, ops::PadNPUKernel, ops::PadNPUKernel, ops::PadNPUKernel); -REGISTER_OP_NPU_KERNEL(pad_grad, ops::PadNPUKernel, +REGISTER_OP_NPU_KERNEL(pad_grad, ops::PadGradNPUKernel, ops::PadGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt index 8e31d58195b..e9d9af5c113 100644 --- a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt @@ -22,4 +22,5 @@ if (WITH_ASCEND_CL) set_tests_properties(test_conv2d_transpose_op_npu PROPERTIES TIMEOUT 200) set_tests_properties(test_conv2d_op_npu PROPERTIES TIMEOUT 300) set_tests_properties(test_matmulv2_op_npu PROPERTIES TIMEOUT 300) + set_tests_properties(test_elementwise_add_op_npu PROPERTIES TIMEOUT 200) endif() diff --git a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py index 012a6e59e77..2e15a1eac2b 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_depthwise_conv_npu.py @@ -132,36 +132,50 @@ class TestDepthwiseConvNPU(OpTest): self.check_output_with_place(self.place, atol=1e-2) def test_check_grad(self): - if self.dtype == np.float16: - return if self.dilations[0] == 1 and self.dilations[1] == 1: - self.check_grad_with_place( - self.place, {'Input', 'Filter'}, - 'Output', - max_relative_error=0.03, - numeric_place=paddle.CPUPlace()) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, {'Input', 'Filter'}, + 'Output', + max_relative_error=0.9) + else: + self.check_grad_with_place( + self.place, {'Input', 'Filter'}, + 'Output', + max_relative_error=0.03, + numeric_place=paddle.CPUPlace()) def test_check_grad_no_filter(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, ['Input'], - 'Output', - no_grad_set=set(['Filter']), - max_relative_error=0.03, - numeric_place=paddle.CPUPlace()) - - def test_check_grad_no_input(self): - if self.dtype == np.float16: - return - if self.dilations[0] == 1 and self.dilations[1] == 1: self.check_grad_with_place( - self.place, ['Filter'], + self.place, ['Input'], 'Output', - no_grad_set=set(['Input']), + no_grad_set=set(['Filter']), + max_relative_error=0.9) + else: + self.check_grad_with_place( + self.place, ['Input'], + 'Output', + no_grad_set=set(['Filter']), max_relative_error=0.03, numeric_place=paddle.CPUPlace()) + def test_check_grad_no_input(self): + if self.dilations[0] == 1 and self.dilations[1] == 1: + if self.dtype == np.float16: + self.check_grad_with_place( + 
self.place, ['Filter'], + 'Output', + no_grad_set=set(['Input']), + max_relative_error=0.9) + else: + self.check_grad_with_place( + self.place, ['Filter'], + 'Output', + no_grad_set=set(['Input']), + max_relative_error=0.03, + numeric_place=paddle.CPUPlace()) + def init_data_format(self): self.data_format = "NCHW" @@ -267,32 +281,46 @@ class TestDepthwiseConvNPU_Padding(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, {'Input', 'Filter'}, - 'Output', - max_relative_error=0.03, - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + self.place, {'Input', 'Filter'}, + 'Output', + max_relative_error=1.2) + else: + self.check_grad_with_place( + self.place, {'Input', 'Filter'}, + 'Output', + max_relative_error=0.03, + numeric_place=paddle.CPUPlace()) def test_check_grad_no_filter(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, ['Input'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Filter']), - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + self.place, ['Input'], + 'Output', + max_relative_error=0.7, + no_grad_set=set(['Filter'])) + else: + self.check_grad_with_place( + self.place, ['Input'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Filter']), + numeric_place=paddle.CPUPlace()) def test_check_grad_no_input(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, ['Filter'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Input']), - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + self.place, ['Filter'], + 'Output', + max_relative_error=0.8, + no_grad_set=set(['Input'])) + else: + self.check_grad_with_place( + self.place, ['Filter'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Input']), + numeric_place=paddle.CPUPlace()) def init_data_format(self): self.data_format = "NCHW" diff --git a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py index d0dc86055a1..4070d0267d9 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_conv2d_op_npu.py @@ -127,8 +127,6 @@ class TestConv2DOp(OpTest): self.check_output_with_place(fluid.NPUPlace(0), atol=1e-2) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place( fluid.NPUPlace(0), {'Input', 'Filter'}, 'Output', @@ -136,8 +134,6 @@ class TestConv2DOp(OpTest): numeric_place=paddle.CPUPlace()) def test_check_grad_no_filter(self): - if self.dtype == np.float16: - return self.check_grad_with_place( fluid.NPUPlace(0), ['Input'], 'Output', @@ -146,8 +142,6 @@ class TestConv2DOp(OpTest): numeric_place=paddle.CPUPlace()) def test_check_grad_no_input(self): - if self.dtype == np.float16: - return self.check_grad_with_place( fluid.NPUPlace(0), ['Filter'], 'Output', @@ -276,10 +270,13 @@ class TestConv2DOp_v2(OpTest): def set_npu(self): self.__class__.use_npu = True + def init_dtype(self): + self.dtype = np.float32 + def setUp(self): self.set_npu() self.op_type = "conv2d" - self.dtype = np.float32 + self.init_dtype() self.init_kernel_type() self.init_group() self.init_dilation() @@ -320,31 +317,45 @@ class TestConv2DOp_v2(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - paddle.NPUPlace(0), {'Input', 'Filter'}, - 'Output', - max_relative_error=0.02, - numeric_place=paddle.CPUPlace()) + 
self.check_grad_with_place( + paddle.NPUPlace(0), {'Input', 'Filter'}, + 'Output', + max_relative_error=1.1) + else: + self.check_grad_with_place( + paddle.NPUPlace(0), {'Input', 'Filter'}, + 'Output', + max_relative_error=0.02, + numeric_place=paddle.CPUPlace()) def test_check_grad_no_filter(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - paddle.NPUPlace(0), ['Input'], - 'Output', - max_relative_error=0.02, - no_grad_set=set(['Filter']), - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + paddle.NPUPlace(0), ['Input'], + 'Output', + max_relative_error=0.99, + no_grad_set=set(['Filter'])) + else: + self.check_grad_with_place( + paddle.NPUPlace(0), ['Input'], + 'Output', + max_relative_error=0.02, + no_grad_set=set(['Filter']), + numeric_place=paddle.CPUPlace()) def test_check_grad_no_input(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - paddle.NPUPlace(0), ['Filter'], - 'Output', - no_grad_set=set(['Input']), - numeric_place=paddle.CPUPlace()) + self.check_grad_with_place( + paddle.NPUPlace(0), ['Filter'], + 'Output', + max_relative_error=0.99, + no_grad_set=set(['Input'])) + else: + self.check_grad_with_place( + paddle.NPUPlace(0), ['Filter'], + 'Output', + no_grad_set=set(['Input']), + numeric_place=paddle.CPUPlace()) def init_test_case(self): self.pad = [0, 0] diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py index 75c70e0a131..f24c6c455a0 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py @@ -65,36 +65,59 @@ class TestElementwiseAddOp(OpTest): self.check_output_with_place(self.place) def test_check_grad_normal(self): - if self.dtype == np.float16 or self.dtype == np.int64: + if self.dtype == np.int64: return - self.check_grad_with_place( - self.place, - ['X', 'Y'], - 'Out', - max_relative_error=0.006, ) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, + ['X', 'Y'], + 'Out', + max_relative_error=0.15, ) + else: + self.check_grad_with_place( + self.place, + ['X', 'Y'], + 'Out', + max_relative_error=0.006, ) def test_check_grad_ingore_x(self): - if self.dtype == np.float16 or self.dtype == np.int64: + if self.dtype == np.int64: return - self.check_grad_with_place( - self.place, - ['Y'], - 'Out', - no_grad_set=set("X"), - max_relative_error=0.006, ) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, + ['Y'], + 'Out', + no_grad_set=set("X"), + max_relative_error=0.92, ) + else: + self.check_grad_with_place( + self.place, + ['Y'], + 'Out', + no_grad_set=set("X"), + max_relative_error=0.006, ) def test_check_grad_ingore_y(self): - if self.dtype == np.float16 or self.dtype == np.int64: + if self.dtype == np.int64: return - self.check_grad_with_place( - self.place, - ['X'], - 'Out', - no_grad_set=set("Y"), - max_relative_error=0.006, ) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + no_grad_set=set("Y"), + max_relative_error=0.8, ) + else: + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + no_grad_set=set("Y"), + max_relative_error=0.006, ) class TestFP16ElementwiseAddOp(TestElementwiseAddOp): diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py index 461e15352e3..cbfc07f3544 100644 --- 
a/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_max_op_npu.py @@ -116,19 +116,13 @@ class TestElementwiseMaxOp(OpTest): self.check_output_with_place(self.place) def test_check_grad_normal(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') def test_check_grad_ingore_x(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['Y'], 'Out', no_grad_set=set("X")) def test_check_grad_ingore_y(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['X'], 'Out', no_grad_set=set("Y")) @@ -213,15 +207,11 @@ class TestElementwiseMaxOp_broadcast_2(TestElementwiseMaxOp): self.out = np.maximum(self.x, self.y.reshape(1, 1, 100)) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) def test_check_grad_ingore_x(self): - if self.dtype == np.float16: - return _, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['Y'], @@ -230,8 +220,6 @@ class TestElementwiseMaxOp_broadcast_2(TestElementwiseMaxOp): user_defined_grads=[dy]) def test_check_grad_ingore_y(self): - if self.dtype == np.float16: - return dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X'], diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py index 51cf5cdaf6d..e191224df81 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py @@ -64,32 +64,41 @@ class TestElementwiseMinOp(OpTest): def test_check_grad_normal(self): if self.dtype == np.float16: - return - - self.check_grad_with_place( - self.place, - ['X', 'Y'], - 'Out', ) + self.check_grad_with_place( + self.place, ['X', 'Y'], 'Out', max_relative_error=0.5) + else: + self.check_grad_with_place( + self.place, + ['X', 'Y'], + 'Out', ) def test_check_grad_ingore_x(self): if self.dtype == np.float16: - return - - self.check_grad_with_place( - self.place, - ['Y'], - 'Out', - no_grad_set=set("X"), ) + self.check_grad_with_place( + self.place, ['Y'], + 'Out', + no_grad_set=set("X"), + max_relative_error=0.9) + else: + self.check_grad_with_place( + self.place, + ['Y'], + 'Out', + no_grad_set=set("X"), ) def test_check_grad_ingore_y(self): if self.dtype == np.float16: - return - - self.check_grad_with_place( - self.place, - ['X'], - 'Out', - no_grad_set=set("Y"), ) + self.check_grad_with_place( + self.place, ['X'], + 'Out', + no_grad_set=set("Y"), + max_relative_error=0.1) + else: + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + no_grad_set=set("Y"), ) class TestElementwiseMinOpFp16(TestElementwiseMinOp): diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py index ce645f317d0..907e149c8b2 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py @@ -114,8 +114,6 @@ class TestElementwisePow(OpTest): self.out = np.power(self.x, self.y) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = 
ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) @@ -184,8 +182,6 @@ class TestElementwisePowOp_broadcast_0(TestElementwisePow): self.out = np.power(self.x, self.y) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) @@ -218,8 +214,6 @@ class TestElementwisePowOp_broadcast_1(TestElementwisePow): self.out = np.power(self.x, self.y.reshape(1, 100, 1)) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) @@ -252,8 +246,6 @@ class TestElementwisePowOp_broadcast_2(TestElementwisePow): self.out = np.power(self.x, self.y.reshape(100, 1, 1)) def test_check_grad_normal(self): - if self.dtype == np.float16: - return dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) self.check_grad_with_place( self.place, ['X', 'Y'], 'Out', user_defined_grads=[dx, dy]) diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py index 89ac9e09aa3..83b65630d80 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py @@ -34,7 +34,7 @@ class TestExpand(OpTest): self.init_dtype() np.random.seed(SEED) - x = np.random.randn(3, 1, 7).astype(self.dtype) + x = np.random.randn(30, 1, 7).astype(self.dtype) out = np.tile(x, [1, 10, 1]) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} @@ -50,12 +50,8 @@ class TestExpand(OpTest): def test_check_output(self): self.check_output_with_place(self.place) - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + def test_check_grad(self): + self.check_grad(['X'], 'Out') class TestExpandV2(TestExpand): @@ -66,7 +62,7 @@ class TestExpandV2(TestExpand): self.init_dtype() np.random.seed(SEED) - x = np.random.randn(3, 1, 7).astype(self.dtype) + x = np.random.randn(30, 1, 7).astype(self.dtype) out = np.tile(x, [1, 10, 1]) expand_times = np.array([1, 10, 1]).astype(np.int32) @@ -145,7 +141,7 @@ class TestExpand_expand_times_all_one(TestExpand): self.init_dtype() np.random.seed(SEED) - x = np.random.randn(3, 1, 7).astype(self.dtype) + x = np.random.randn(30, 1, 7).astype(self.dtype) out = np.tile(x, [1, 1, 1]) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} diff --git a/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py index 1c9f499d22d..a9c195bb8cd 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_huber_loss_op_npu.py @@ -81,13 +81,9 @@ class TestHuberLossOp(OpTest): self.check_output_with_place(self.place) def test_check_grad_normal(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') def test_check_grad_ingore_x(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['Y'], 'Out', @@ -95,8 +91,6 @@ class TestHuberLossOp(OpTest): no_grad_set=set("residual")) def test_check_grad_ingore_y(self): - if self.dtype == np.float16: - return self.check_grad_with_place( 
self.place, ['X'], 'Out', diff --git a/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py index 6e5b4c01205..d02ddae461b 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_label_smooth_op_npu.py @@ -78,8 +78,10 @@ class TestLabelSmoothOp(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['X'], 'Out') + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.5) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') class TestLabelSmoothOpWithPriorDist(TestLabelSmoothOp): diff --git a/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py index 590a9612699..a0472f9611e 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_leaky_relu_op_npu.py @@ -63,8 +63,10 @@ class TestLeadyRelu(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['X'], 'Out') + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.006) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') class TestLeadyReluFP16(TestLeadyRelu): diff --git a/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py index f6baefec7f2..10ec8621ffa 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_log_softmax_op_npu.py @@ -63,9 +63,13 @@ class TestLogSoftmaxNPUOp(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place( - self.place, ['X'], ['Out'], user_defined_grads=[self.x_grad]) + self.check_grad_with_place( + self.place, ['X'], ['Out'], + user_defined_grads=[self.x_grad], + max_relative_error=0.02) + else: + self.check_grad_with_place( + self.place, ['X'], ['Out'], user_defined_grads=[self.x_grad]) def test_class(op_type, typename): diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py index fefff0974ae..8ec9eb1cf35 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py @@ -77,8 +77,10 @@ class TestLookupTableV2(OpTest): def test_check_grad(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['W'], 'Out') + self.check_grad_with_place( + self.place, ['W'], 'Out', max_relative_error=0.01) + else: + self.check_grad_with_place(self.place, ['W'], 'Out') class TestLookupTableV2FP16(TestLookupTableV2): diff --git a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py index f3df1fca307..ec51dcf3f8e 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_nearest_interp_v2_op_npu.py @@ -39,10 +39,11 @@ class TestNearestInterpOp(OpTest): self.set_npu() self.out_size = None self.actual_shape = None + self.init_dtype() self.data_layout = 'NCHW' self.init_test_case() self.op_type = "nearest_interp_v2" - input_np = 
np.random.random(self.input_shape).astype("float32") + input_np = np.random.random(self.input_shape).astype(self.dtype) if self.data_layout == "NCHW": in_h = self.input_shape[2] @@ -95,8 +96,21 @@ class TestNearestInterpOp(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - self.check_grad_with_place( - self.place, ['X'], 'Out', in_place=True, max_relative_error=0.006) + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, ['X'], + 'Out', + in_place=True, + max_relative_error=0.02) + else: + self.check_grad_with_place( + self.place, ['X'], + 'Out', + in_place=True, + max_relative_error=0.006) + + def init_dtype(self): + self.dtype = np.float32 def init_test_case(self): self.interp_method = 'nearest' @@ -108,6 +122,11 @@ class TestNearestInterpOp(OpTest): self.align_corners = False +class TestNearestNeighborInterpFP16(TestNearestInterpOp): + def init_dtype(self): + self.dtype = np.float16 + + class TestNearestNeighborInterpCase1(TestNearestInterpOp): def init_test_case(self): self.interp_method = 'nearest' diff --git a/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py index 7d6c3b9bdb4..d1d2e8b3467 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_pad_op_npu.py @@ -50,9 +50,10 @@ class TestPadOp(OpTest): def test_check_grad_normal(self): if self.dtype == np.float16: - return - - self.check_grad_with_place(self.place, ['X'], 'Out') + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.6) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') def set_npu(self): self.__class__.use_npu = True diff --git a/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py index a2547808e6f..c909b14b514 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py @@ -34,11 +34,12 @@ class TestRelu(OpTest): self.init_dtype() np.random.seed(SEED) - x = np.random.rand(3, 2).astype(self.dtype) - out = x - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} - self.attrs = {} + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + # The same reason with TestAbs + x[np.abs(x) < 0.005] = 0.02 + out = np.maximum(x, 0) + self.inputs = {'X': x} self.outputs = {'Out': out} def set_npu(self): @@ -50,32 +51,18 @@ class TestRelu(OpTest): def test_check_output(self): self.check_output_with_place(self.place) + def test_check_grad(self): + if self.dtype == np.float16: + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.006) + else: + self.check_grad_with_place(self.place, ['X'], 'Out') -class TestReluFp16(OpTest): - def setUp(self): - self.set_npu() - self.op_type = "relu" - self.place = paddle.NPUPlace(0) - - self.init_dtype() - np.random.seed(SEED) - x = np.random.rand(3, 2).astype(self.dtype) - out = x - - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} - self.attrs = {} - self.outputs = {'Out': out} - - def set_npu(self): - self.__class__.use_npu = True - self.__class__.no_need_check_grad = True +class TestReluFp16(TestRelu): def init_dtype(self): self.dtype = np.float16 - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-5) - class TestReluNeg(OpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py 
b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py index 611691109e1..a5b203b6eea 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py @@ -58,12 +58,17 @@ class TestSliceOp(OpTest): self.place = paddle.NPUPlace(0) def test_check_output(self): - self.check_output_with_place(self.place) + if self.dtype == np.float16: + self.check_output_with_place(self.place) + else: + self.check_output_with_place(self.place) def test_check_grad_normal(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['Input'], 'Out') + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.02) + else: + self.check_grad_with_place(self.place, ['Input'], 'Out') class TestSliceOp2(TestSliceOp): @@ -347,8 +352,10 @@ class TestSliceOpDecsDim(OpTest): def test_check_grad_normal(self): if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['Input'], 'Out') + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.5) + else: + self.check_grad_with_place(self.place, ['Input'], 'Out') class TestSliceOpDecsDimFp16(TestSliceOpDecsDim): -- GitLab From 70726696bf0f7c84e208a5d588d0c3e4342d18f0 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Fri, 18 Mar 2022 10:49:36 +0800 Subject: [PATCH 153/176] [Phi] move reduce_grad kernel into phi (#40522) * move reduce_mean_grad kernel into phi * move reduce_max/min_grad into phi * remove raw max/min grad kernel * fix bug * fix max/min grad error * move all reduce_grad kernel into one file * add prod grad kernel * add infermeta for prod kernel --- .../new_executor/standalone_executor_test.cc | 2 +- .../operators/reduce_ops/reduce_max_op.cc | 10 -- .../reduce_ops/reduce_max_op.part.cu | 25 ---- .../operators/reduce_ops/reduce_mean_op.cc | 9 -- .../reduce_ops/reduce_mean_op.part.cu | 25 ---- .../operators/reduce_ops/reduce_min_op.cc | 10 -- .../reduce_ops/reduce_min_op.part.cu | 25 ---- .../operators/reduce_ops/reduce_prod_op.cc | 30 +++-- .../reduce_ops/reduce_prod_op.part.cu | 25 ---- paddle/phi/core/compat/op_utils.h | 5 + paddle/phi/kernels/CMakeLists.txt | 3 +- ...m_grad_kernel.cc => reduce_grad_kernel.cc} | 103 ++++++++++----- paddle/phi/kernels/funcs/reduce_functor.h | 77 ++++++++++++ .../phi/kernels/funcs/reduce_grad_functions.h | 6 +- paddle/phi/kernels/gpu/reduce_grad.h | 54 ++++++++ paddle/phi/kernels/gpu/reduce_grad_kernel.cu | 119 ++++++++++++++++++ .../phi/kernels/gpu/reduce_sum_grad_kernel.cu | 90 ------------- .../phi/kernels/{cpu => impl}/reduce_grad.h | 0 .../reduce_max_grad_kernel_impl.h} | 23 +++- .../impl/reduce_min_grad_kernel_impl.h | 47 +++++++ .../impl/reduce_prod_grad_kernel_impl.h | 47 +++++++ paddle/phi/kernels/reduce_grad_kernel.h | 79 ++++++++++++ paddle/phi/kernels/reduce_kernel.h | 1 - paddle/phi/ops/compat/reduce_sig.cc | 48 +++++++ 24 files changed, 594 insertions(+), 269 deletions(-) delete mode 100644 paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu delete mode 100644 paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu delete mode 100644 paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu delete mode 100644 paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu rename paddle/phi/kernels/cpu/{reduce_sum_grad_kernel.cc => reduce_grad_kernel.cc} (53%) create mode 100644 paddle/phi/kernels/gpu/reduce_grad_kernel.cu delete mode 100644 paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu rename paddle/phi/kernels/{cpu => impl}/reduce_grad.h (100%) 
rename paddle/phi/kernels/{reduce_sum_grad_kernel.h => impl/reduce_max_grad_kernel_impl.h} (51%) create mode 100644 paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/reduce_grad_kernel.h diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 28e1145db42..7fe1852f739 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -38,7 +38,7 @@ USE_OP(softmax_with_cross_entropy); USE_OP_ITSELF(reduce_mean); USE_OP_ITSELF(reduce_sum); USE_OP_ITSELF(reduce_sum_grad); -USE_OP(reduce_mean_grad); +USE_OP_ITSELF(reduce_mean_grad); USE_OP_ITSELF(reshape2_grad); USE_OP(softmax_with_cross_entropy_grad); USE_OP_ITSELF(elementwise_add_grad); diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc index 41df8e4a15f..15812778e00 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc @@ -35,13 +35,3 @@ REGISTER_OPERATOR( paddle::framework::DefaultGradOpMaker, ReduceMaxInferShapeFunctor); REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp) - -REGISTER_OP_CPU_KERNEL( - reduce_max_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu deleted file mode 100644 index 5ee38b8fa46..00000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_max_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index 4a183309138..dc41979defb 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -107,12 +107,3 @@ REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp, ops::ReduceMeanDoubleGradDescMaker, ops::ReduceMeanDoubleGradOpBaseMaker, ops::ReduceMeanGradNoNeedBufferVarInferer); - -template -using CPUReduceMeanGradKernel = - ops::ReduceGradKernel; - -REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel, - CPUReduceMeanGradKernel, - CPUReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu deleted file mode 100644 index a578c9f7d81..00000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// .part used to speed up nvcc compile -#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" - -template -using CUDAReduceMeanGradKernel = - ops::ReduceCudaGradKernel; - -REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel, - CUDAReduceMeanGradKernel, - CUDAReduceMeanGradKernel, - CUDAReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc index b9915f2b484..5e5b04d57b0 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc @@ -35,13 +35,3 @@ REGISTER_OPERATOR( paddle::framework::DefaultGradOpMaker, ReduceMinInferShapeFunctor); REGISTER_OPERATOR(reduce_min_grad, ops::ReduceGradOp) - -REGISTER_OP_CPU_KERNEL( - reduce_min_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu deleted file mode 100644 index bf886063786..00000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_min_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc index eb745ab9c56..b1abdf9e8a7 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class OpDesc; @@ -26,14 +30,20 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle -REGISTER_REDUCE_OP(reduce_prod); +namespace ops = paddle::operators; + +class ReduceProdOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_prod"; } + virtual std::string GetOpType() const { return "Reduce reduce_prod"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_prod, ReduceProdInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); -REGISTER_OP_CPU_KERNEL(reduce_prod_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +REGISTER_OPERATOR( + reduce_prod, ops::ReduceOp, ReduceProdOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceProdInferShapeFunctor); +REGISTER_OPERATOR(reduce_prod_grad, ops::ReduceGradOp); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu deleted file mode 100644 index 0610cdd94f8..00000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_prod_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index b1da573c49f..946230cb169 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -47,8 +47,13 @@ const std::unordered_set deprecated_op_names({"diag", "matmul_grad", "matmul_grad_grad", "mean", + "mean_grad", "max", + "max_grad", "min", + "min_grad", + "prod", + "prod_grad", "any", "all", "reshape", diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 02b5b2d74ad..aa76561c5ce 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -31,10 +31,11 @@ set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_k matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel - triangular_solve_grad_kernel determinant_grad_kernel) + triangular_solve_grad_kernel determinant_grad_kernel reduce_kernel) kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(reduce_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel) kernel_library(matrix_power_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) kernel_library(matrix_power_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_inverse) kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) diff --git a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_grad_kernel.cc similarity index 53% rename from paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc rename to paddle/phi/kernels/cpu/reduce_grad_kernel.cc index efea054555e..78a7ae8d415 100644 --- a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_grad_kernel.cc @@ -12,33 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/phi/kernels/reduce_sum_grad_kernel.h" +#include "paddle/phi/kernels/reduce_grad_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cast_kernel.h" -#include "paddle/phi/kernels/cpu/reduce_grad.h" #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" +#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h" namespace phi { -struct SumGradFunctor { - template - void operator()(const DeviceContext& place, - X* x, - Y* y, - DX* dx, - DY* dy, - const Dim& dim, - int size) { - dx->device(place) = dy->broadcast(dim); - } -}; - template void ComputeFromInput(const Context& dev_ctx, const DenseTensor& x, @@ -111,16 +97,38 @@ void ReduceSumGradKernel(const Context& dev_ctx, } } - ReduceGradKernel(dev_ctx, - x, - out_grad, - paddle::none, - dims, - keep_dim, - reduce_all, - in_dtype, - out_dtype, - x_grad); + ReduceGradKernel(dev_ctx, + x, + out_grad, + paddle::none, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +template +void ReduceMeanGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + paddle::none, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); } } // namespace phi @@ -137,3 +145,38 @@ PD_REGISTER_KERNEL(sum_grad, int64_t, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(mean_grad, + CPU, + ALL_LAYOUT, + phi::ReduceMeanGradKernel, + bool, + float, + double) {} + +PD_REGISTER_KERNEL(prod_grad, + CPU, + ALL_LAYOUT, + phi::ReduceProdGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(max_grad, + CPU, + ALL_LAYOUT, + phi::ReduceMaxGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(min_grad, + CPU, + ALL_LAYOUT, + phi::ReduceMinGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h index c74880e0432..b793afb63b1 100644 --- a/paddle/phi/kernels/funcs/reduce_functor.h +++ b/paddle/phi/kernels/funcs/reduce_functor.h @@ -73,5 +73,82 @@ struct AnyFunctor { } }; +struct MeanGradFunctor { + template + void operator()(const DeviceContext& place, + X* x, + Y* y, + DX* dx, + DY* dy, + const Dim& dim, + int size) { + dx->device(place) = dy->broadcast(dim) / dx->constant(size); + } +}; + +struct SumGradFunctor { + template + void operator()(const DeviceContext& place, + X* x, + Y* y, + DX* dx, + DY* dy, + const Dim& dim, + int size) { + dx->device(place) = dy->broadcast(dim); + } +}; + +struct ProdGradFunctor { + template + void operator()(const DeviceContext& place, + X* x, + Y* y, + DX* dx, + DY* dy, + const Dim& dim, + int size) { + dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse(); + } +}; + +struct MaxOrMinGradFunctor { + template + void operator()(const DeviceContext& place, + X* x, + Y* y, + DX* dx, + DY* dy, + const Dim& dim, + int size) { + auto equals = (*x) == y->broadcast(dim); + auto ones = dx->constant(1); + auto zeros = dx->constant(0); + // If there are multiple minimum or maximum elements, the subgradient of + // each is the 
set [0, 1], and we pass gradient to all of them here. + dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros); + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/reduce_grad_functions.h b/paddle/phi/kernels/funcs/reduce_grad_functions.h index 3488b6f2f86..11197a52261 100644 --- a/paddle/phi/kernels/funcs/reduce_grad_functions.h +++ b/paddle/phi/kernels/funcs/reduce_grad_functions.h @@ -41,14 +41,14 @@ void ReduceGradFunctor(const Context& dev_ctx, Eigen::array broadcast_dim; for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; - int broad_cats_times = 1; + int broad_cast_times = 1; for (size_t i = 0; i < dims_ref.size(); ++i) { if (dims_ref[i] < 0) { dims_ref[i] = x_rank + dims_ref[i]; } reduced_dims_v[dims_ref[i]] = 1; broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]]; - broad_cats_times *= x_dims[dims_ref[i]]; + broad_cast_times *= x_dims[dims_ref[i]]; } auto reduced_dims = phi::make_ddim(reduced_dims_v); auto x_reduce = EigenTensor::From(input1, reduced_dims); @@ -62,7 +62,7 @@ void ReduceGradFunctor(const Context& dev_ctx, &x_grad, &x_reduce_grad, broadcast_dim, - broad_cats_times); + broad_cast_times); } inline void GetOriginDimFromShuffled(const DDim& src_dim, diff --git a/paddle/phi/kernels/gpu/reduce_grad.h b/paddle/phi/kernels/gpu/reduce_grad.h index d21c8a3fa46..e32101b7372 100644 --- a/paddle/phi/kernels/gpu/reduce_grad.h +++ b/paddle/phi/kernels/gpu/reduce_grad.h @@ -43,5 +43,59 @@ void ReduceGrad(const GPUContext& dev_ctx, })); } +template class TransformOp> +void ReduceGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + auto* in_x = &x; + auto* d_out = &out_grad; + auto* d_x = x_grad; + + auto pt_out_dtype = in_dtype; + + // get reduce_dim and reduce_num for reduce_mean_grad + int dim_size = in_x->dims().size(); + std::vector reduce_dims = + funcs::details::GetReduceDim(dims, dim_size, reduce_all); + + auto update_dims = vectorize(d_x->dims()); + int reduce_num = 1; + for (auto i : reduce_dims) { + reduce_num *= (in_x->dims())[i]; + update_dims[i] = 1; + } + // make new tensor + DenseTensor new_d_out(d_out->dtype()); + new_d_out.ShareDataWith(*d_out); + new_d_out.Resize(phi::make_ddim(update_dims)); + if (in_dtype != DataType::UNDEFINED) { + dev_ctx.Alloc(d_x, in_dtype); + } else { + dev_ctx.Alloc(d_x, d_out->dtype()); + } + + auto pt_d_out = new_d_out; + auto pt_d_x = *d_x; + if (in_dtype == DataType::UNDEFINED) { + pt_out_dtype = d_out->dtype(); + } + using MPType = typename kps::details::MPTypeTrait::Type; + + phi::ReduceGrad>( + dev_ctx, + &pt_d_out, + &pt_d_x, + pt_out_dtype, + TransformOp(reduce_num)); +} + } // namespace phi #endif diff --git a/paddle/phi/kernels/gpu/reduce_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_grad_kernel.cu new file mode 100644 index 00000000000..5256048267e --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_grad_kernel.cu @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/gpu/reduce_grad.h" +#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h" +#include "paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h" + +namespace phi { + +template +void ReduceSumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +template +void ReduceMeanGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(sum_grad, + GPU, + ALL_LAYOUT, + phi::ReduceSumGradKernel, + bool, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(mean_grad, + GPU, + ALL_LAYOUT, + phi::ReduceMeanGradKernel, + bool, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(prod_grad, + GPU, + ALL_LAYOUT, + phi::ReduceProdGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(max_grad, + GPU, + ALL_LAYOUT, + phi::ReduceMaxGradKernel, + float, + double, + int, + int64_t) {} + +PD_REGISTER_KERNEL(min_grad, + GPU, + ALL_LAYOUT, + phi::ReduceMinGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu deleted file mode 100644 index 9f4ddc3cf37..00000000000 --- a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/kernels/reduce_sum_grad_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/reduce_function.h" -#include "paddle/phi/kernels/gpu/reduce_grad.h" - -namespace phi { - -template -void ReduceSumGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out_grad, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType in_dtype, - DataType out_dtype, - DenseTensor* x_grad) { - auto* in_x = &x; - auto* d_out = &out_grad; - auto* d_x = x_grad; - - auto pt_out_dtype = in_dtype; - - // get reduce_dim and reduce_num for reduce_mean_grad - int dim_size = in_x->dims().size(); - std::vector reduce_dims = - funcs::details::GetReduceDim(dims, dim_size, reduce_all); - - auto update_dims = vectorize(d_x->dims()); - int reduce_num = 1; - for (auto i : reduce_dims) { - reduce_num *= (in_x->dims())[i]; - update_dims[i] = 1; - } - // make new tensor - DenseTensor new_d_out(d_out->dtype()); - new_d_out.ShareDataWith(*d_out); - new_d_out.Resize(phi::make_ddim(update_dims)); - if (in_dtype != DataType::UNDEFINED) { - dev_ctx.Alloc(d_x, in_dtype); - } else { - dev_ctx.Alloc(d_x, d_out->dtype()); - } - - auto pt_d_out = new_d_out; - auto pt_d_x = *d_x; - if (in_dtype == DataType::UNDEFINED) { - pt_out_dtype = d_out->dtype(); - } - using MPType = typename kps::details::MPTypeTrait::Type; - - phi::ReduceGrad>( - dev_ctx, - &pt_d_out, - &pt_d_x, - pt_out_dtype, - kps::IdentityFunctor(reduce_num)); -} - -} // namespace phi - -PD_REGISTER_KERNEL(sum_grad, - GPU, - ALL_LAYOUT, - phi::ReduceSumGradKernel, - bool, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/reduce_grad.h b/paddle/phi/kernels/impl/reduce_grad.h similarity index 100% rename from paddle/phi/kernels/cpu/reduce_grad.h rename to paddle/phi/kernels/impl/reduce_grad.h diff --git a/paddle/phi/kernels/reduce_sum_grad_kernel.h b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h similarity index 51% rename from paddle/phi/kernels/reduce_sum_grad_kernel.h rename to paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h index ab4d63297ef..4a74416e391 100644 --- a/paddle/phi/kernels/reduce_sum_grad_kernel.h +++ b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h @@ -14,19 +14,34 @@ #pragma once -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/reduce_grad_kernel.h" + +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" + namespace phi { template -void ReduceSumGradKernel(const Context& dev_ctx, +void ReduceMaxGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& out_grad, + const DenseTensor& out, const std::vector& dims, bool keep_dim, bool reduce_all, DataType in_dtype, DataType out_dtype, - DenseTensor* x_grad); + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + out, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} } // namespace phi diff --git a/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h new file mode 100644 index 00000000000..baaa544f137 --- /dev/null +++ b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/reduce_grad_kernel.h" + +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" + +namespace phi { + +template +void ReduceMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + out, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h new file mode 100644 index 00000000000..6b93e98cec0 --- /dev/null +++ b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/reduce_grad_kernel.h" + +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/reduce_grad.h" + +namespace phi { + +template +void ReduceProdGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad) { + ReduceGradKernel(dev_ctx, + x, + out_grad, + out, + dims, + keep_dim, + reduce_all, + in_dtype, + out_dtype, + x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_grad_kernel.h b/paddle/phi/kernels/reduce_grad_kernel.h new file mode 100644 index 00000000000..ee6f3d19a09 --- /dev/null +++ b/paddle/phi/kernels/reduce_grad_kernel.h @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void ReduceSumGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +template +void ReduceMeanGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +template +void ReduceProdGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +template +void ReduceMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +template +void ReduceMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const DenseTensor& out, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType in_dtype, + DataType out_dtype, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/reduce_kernel.h b/paddle/phi/kernels/reduce_kernel.h index 75f52c36beb..69bcb47bc98 100644 --- a/paddle/phi/kernels/reduce_kernel.h +++ b/paddle/phi/kernels/reduce_kernel.h @@ -16,7 +16,6 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/infermeta/unary.h" -#include "paddle/phi/kernels/empty_kernel.h" namespace phi { template diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index 789496ccbd0..4bca0523801 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -136,6 +136,42 @@ KernelSignature ReduceSumGradOpArgumentMapping( {GradVarName("X")}); } +KernelSignature ReduceMeanGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "mean_grad", + {"X", GradVarName("Out")}, + {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, + {GradVarName("X")}); +} + +KernelSignature ReduceMaxGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "max_grad", + {"X", GradVarName("Out"), "Out"}, + {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, + {GradVarName("X")}); +} + +KernelSignature ReduceMinGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "min_grad", + {"X", GradVarName("Out"), "Out"}, + {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, + {GradVarName("X")}); +} + +KernelSignature ReduceProdGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "prod_grad", + {"X", GradVarName("Out"), "Out"}, + {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, + {GradVarName("X")}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); @@ -147,6 +183,10 @@ PD_REGISTER_BASE_KERNEL_NAME(reduce_all, all); PD_REGISTER_BASE_KERNEL_NAME(reduce_any, any); PD_REGISTER_BASE_KERNEL_NAME(reduce_sum_grad, sum_grad); +PD_REGISTER_BASE_KERNEL_NAME(reduce_mean_grad, mean_grad); 
+PD_REGISTER_BASE_KERNEL_NAME(reduce_prod_grad, prod_grad); +PD_REGISTER_BASE_KERNEL_NAME(reduce_max_grad, max_grad); +PD_REGISTER_BASE_KERNEL_NAME(reduce_min_grad, min_grad); PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping); @@ -158,3 +198,11 @@ PD_REGISTER_ARG_MAPPING_FN(reduce_any, phi::ReduceAnyOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_sum_grad, phi::ReduceSumGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_mean_grad, + phi::ReduceMeanGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_prod_grad, + phi::ReduceProdGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_max_grad, + phi::ReduceMaxGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_min_grad, + phi::ReduceMinGradOpArgumentMapping); -- GitLab From bb2cb7622bab19f878e32778f794055ee54cc846 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Fri, 18 Mar 2022 10:50:03 +0800 Subject: [PATCH 154/176] Use store for gloo process group (#40629) --- .../collective/ProcessGroupGloo.cc | 8 ++++---- .../distributed/collective/ProcessGroupGloo.h | 13 ++++++------ paddle/fluid/pybind/distributed_py.cc | 20 ++++--------------- .../tests/unittests/process_group_gloo.py | 4 +--- 4 files changed, 15 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index 5dc43af1178..cb82677a281 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -171,10 +171,10 @@ ProcessGroupGloo::GlooTask::GlooTask(int rank, "Only CPU place is supported for ProcessGroupGloo.")); } -ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr& store, - int rank, int world_size, - const std::shared_ptr options) - : ProcessGroup(rank, world_size), _tag(0), _store(store) { +ProcessGroupGloo::ProcessGroupGloo( + const std::shared_ptr& store, int rank, + int world_size, const std::shared_ptr options) + : ProcessGroup(rank, world_size), _tag(0), _store(new GlooStore(store)) { _context = std::make_shared(rank, world_size); auto prefix_store = ::gloo::rendezvous::PrefixStore(std::to_string(0), *_store); diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index 24f156571a4..71e0a40f8a7 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -52,8 +52,7 @@ class ProcessGroupGloo : public ProcessGroup { class GlooStore : public ::gloo::rendezvous::Store { public: - explicit GlooStore( - const std::shared_ptr& store) + explicit GlooStore(const std::shared_ptr& store) : _store(store) {} ~GlooStore() = default; @@ -87,7 +86,7 @@ class ProcessGroupGloo : public ProcessGroup { } protected: - std::shared_ptr _store; + std::shared_ptr _store; }; class GlooOptions { @@ -100,9 +99,9 @@ class ProcessGroupGloo : public ProcessGroup { std::shared_ptr<::gloo::transport::Device> device; }; - explicit ProcessGroupGloo(const std::shared_ptr& store, int rank, - int world_size, - std::shared_ptr options); + explicit ProcessGroupGloo( + const std::shared_ptr& store, int rank, + int world_size, std::shared_ptr options); ~ProcessGroupGloo() = default; @@ -145,7 +144,7 @@ class ProcessGroupGloo : public ProcessGroup { protected: uint32_t _tag; std::shared_ptr _context; - std::shared_ptr _store; + std::shared_ptr<::gloo::rendezvous::Store> _store; }; 
} // namespace distributed diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 1df917b8c35..e89d8d96342 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -235,25 +235,13 @@ void BindDistributed(py::module *m) { py::call_guard()); #if defined(PADDLE_WITH_GLOO) - py::class_(*m, "GlooOptions") - .def(py::init<>()) - .def_readwrite("_device", &GlooOptions::device) - .def_static("create", &GlooOptions::create); - - py::class_>(*m, "GlooStore") - .def(py::init( - [](const std::shared_ptr &store) { - return std::make_shared(store); - }), - py::call_guard()); - py::class_>( *m, "ProcessGroupGloo", ProcessGroup) - .def(py::init &, int, int, - std::shared_ptr &>(), + .def(py::init &, int, + int, std::shared_ptr &>(), py::call_guard()) - .def(py::init([](const std::shared_ptr &store, int rank, - int world_size) { + .def(py::init([](const std::shared_ptr &store, + int rank, int world_size) { auto opts = GlooOptions::create(); char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str()); if (ifname && strlen(ifname) > 1) { diff --git a/python/paddle/fluid/tests/unittests/process_group_gloo.py b/python/paddle/fluid/tests/unittests/process_group_gloo.py index c62c4615f74..b1f3a71ab3e 100644 --- a/python/paddle/fluid/tests/unittests/process_group_gloo.py +++ b/python/paddle/fluid/tests/unittests/process_group_gloo.py @@ -47,9 +47,7 @@ class TestProcessGroupFp32(unittest.TestCase): is_master = True if rank == 0 else False store = paddle.fluid.core.TCPStore("127.0.0.1", 6172, is_master, nranks, datetime.timedelta(0)) - gloo_store = paddle.fluid.core.GlooStore(store) - opt = paddle.fluid.core.GlooOptions() - pg = paddle.fluid.core.ProcessGroupGloo(gloo_store, rank, nranks) + pg = paddle.fluid.core.ProcessGroupGloo(store, rank, nranks) # test allreduce sum # rank 0 -- GitLab From 35a5e8ee9c7bb06728c64b0ac6971e56b11b59fc Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Fri, 18 Mar 2022 11:03:05 +0800 Subject: [PATCH 155/176] Refactored Final State Python-C Code Generation Scripts (#40650) * Refactored Final State Python-C Code Generation Scripts. 
* Bug fix --- .../final_state_generator/CMakeLists.txt | 1 + .../final_state_generator/python_c_gen.py | 495 ++++++++++++------ 2 files changed, 329 insertions(+), 167 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt index 53af6c1048d..771351dd4af 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt @@ -27,6 +27,7 @@ add_custom_target(eager_final_state_codegen set(tmp_python_c_output_path "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h") set(python_c_output_path "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function_impl.h") + add_custom_target(eager_final_state_python_c_codegen COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py" "--api_yaml_path=${api_yaml_path}" diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index c0ed77ecdc4..753c8ca3aaf 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -14,9 +14,18 @@ import os import argparse +import logging from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap -skipped_fwd_api_names = set(["scale"]) +########################### +## Global Configurations ## +########################### +skipped_forward_api_names = set(["scale"]) + + +def SkipAPIGeneration(forward_api_name): + return (forward_api_name in skipped_forward_api_names) + atype_to_parsing_function = { "bool": "CastPyArg2Boolean", @@ -39,64 +48,31 @@ atype_to_parsing_function = { } -def ParseArguments(): - parser = argparse.ArgumentParser( - description='Eager Code Generator Args Parser') - parser.add_argument('--api_yaml_path', type=str) - parser.add_argument('--output_path', type=str) - - args = parser.parse_args() - return args - - def FindParsingFunctionFromAttributeType(atype): if atype not in atype_to_parsing_function.keys(): - print(f"Unable to find {atype} in atype_to_parsing_function.") - assert False + assert False, f"Unable to find {atype} in atype_to_parsing_function." return atype_to_parsing_function[atype] -def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, - forward_attrs_list, forward_outputs_position_map, - optional_inputs, is_forward_only): - # forward_inputs_position_map = { "name" : [type, fwd_position] } - # forward_outputs_position_map = { "name" : [type, fwd_position] } - # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] - # optional_inputs = [name0, ...] 
- - # Get EagerTensor from args - # Get dygraph function call args - num_args = len(forward_inputs_position_map.keys()) + len(forward_attrs_list) - num_input_tensors = len(forward_inputs_position_map.keys()) - dygraph_function_call_list = ["" for i in range(num_args)] - get_eager_tensor_str = "" - for name, (ttype, pos) in forward_inputs_position_map.items(): - is_optional = (name in optional_inputs) - if IsVectorTensorType(ttype): - get_eager_tensor_str += f" auto {name} = GetTensorListFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" - else: - if is_optional: - get_eager_tensor_str += f" auto {name} = GetOptionalTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" - else: - get_eager_tensor_str += f" auto {name} = GetTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" - dygraph_function_call_list[pos] = f"{name}" +########################## +## Refactored Functions ## +########################## +PARSE_PYTHON_C_TENSORS_TEMPLATE = \ +" auto {} = {}(\"{}\", \"{}\", args, {}, false);\n" + - parse_attributes_str = "" - # Get Attributes - for name, atype, _, pos in forward_attrs_list: - parsing_function = FindParsingFunctionFromAttributeType(atype) - key = f"{name}" +PARSE_PYTHON_C_ARGS_TEMPLATE = \ +""" PyObject* {}_obj = PyTuple_GET_ITEM(args, {});\n + {} {} = {}({}_obj, \"{}\", {});\n""" - parse_attributes_str += f" PyObject* {name}_obj = PyTuple_GET_ITEM(args, {pos});\n" - parse_attributes_str += f" {atype} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n" - dygraph_function_call_list[pos] = f"{name}" - dygraph_function_call_str = ",".join(dygraph_function_call_list) +RECORD_EVENT_TEMPLATE = \ +" paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::Operator, 1);" - pythonc_event_str = f"paddle::platform::RecordEvent pythonc_record_event(\"{fwd_api_name} pybind_imperative_func\", paddle::platform::TracerEventType::Operator, 1);" - PYTHON_C_FUNCTION_TEMPLATE = """ +PYTHON_C_FUNCTION_TEMPLATE = \ +""" static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObject *kwargs) {{ {} @@ -130,26 +106,50 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj }} """ - namespace_str = "" - if len(namespace) > 0: - namespace_str = f"{namespace}::" - if is_forward_only: - fwd_function_name = "paddle::experimental::" + namespace_str + fwd_api_name - else: - fwd_function_name = namespace_str + GetForwardFunctionName(fwd_api_name) - python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( - fwd_api_name, pythonc_event_str, fwd_api_name, get_eager_tensor_str, - parse_attributes_str, fwd_function_name, dygraph_function_call_str) +FUNCTION_NAME_TEMPLATE = \ +"{}{}{}" - python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void)) {namespace_str}eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" - return python_c_function_str, python_c_function_reg_str +PYTHON_C_FUNCTION_REG_TEMPLATE = \ +"{{\"final_state_{}\", (PyCFunction)(void(*)(void)) {}eager_final_state_api_{}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {} in dygraph.\"}}" -def GenerateCoreOpsInfoMap(): - result = """ +PYTHON_C_WRAPPER_TEMPLATE = \ +""" +#pragma once + +#include "pybind11/detail/common.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/api/lib/dygraph_api.h" +#include "paddle/phi/common/backend.h" +#include "paddle/phi/common/data_type.h" +#include 
"paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/api/include/sparse_api.h" +#include "paddle/fluid/pybind/op_function_common.h" +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include + +namespace paddle {{ +namespace pybind {{ + +{} + +static PyMethodDef EagerFinalStateMethods[] = {{ + {} +}}; + +}} // namespace pybind +}} // namespace paddle +""" + + +CORE_OPS_INFO = \ +""" static PyObject * eager_get_final_state_core_ops_args_info(PyObject *self) { PyThreadState *tstate = nullptr; try @@ -194,9 +194,11 @@ static PyObject * eager_get_final_state_core_ops_returns_info(PyObject *self) { return nullptr; } } - """ +""" + - core_ops_infos_registry = """ +CORE_OPS_INFO_REGISTRY = \ +""" {\"get_final_state_core_ops_args_info\", (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_args_info.\"}, @@ -209,7 +211,259 @@ static PyObject * eager_get_final_state_core_ops_returns_info(PyObject *self) { METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_returns_info.\"}, """ - return result, core_ops_infos_registry +NAMESPACE_WRAPPER_TEMPLATE = \ +"""namespace {} {{ + {} +}} +""" + + +####################### +## Generator Classes ## +####################### +class PythonCSingleFunctionGenerator: + def __init__(self, fwd_api_contents, namespace): + self.fwd_api_contents = fwd_api_contents + self.namespace = namespace + + # Raw Contents + self.forward_api_name = "" + self.forward_args_str = "" + self.forward_returns_str = "" + + # Raw Data + self.forward_attrs_list = None #[ [attr_name, attr_type, default_value, orig_position], ...] + self.forward_inputs_list = None #[ [arg_name, arg_type, orig_position], ...] + self.forward_returns_list = None #[ [ret_name, ret_type, orig_position], ...] + + # Processed Data + self.forward_inputs_position_map = None #{ "name" : [type, fwd_position] } + self.forward_outputs_position_map = None #{ "name" : [type, fwd_position] } + + # Special Op Attributes + self.optional_inputs = [] #[name, ...] 
+ self.is_forward_only = True + + # Generated Results + self.python_c_function_str = "" + self.python_c_function_reg_str = "" + + def CollectRawContents(self): + fwd_api_contents = self.fwd_api_contents + + assert 'api' in fwd_api_contents.keys( + ), "Unable to find \"api\" in fwd_api_contents keys" + assert 'args' in fwd_api_contents.keys( + ), "Unable to find \"args\" in fwd_api_contents keys" + assert 'output' in fwd_api_contents.keys( + ), "Unable to find \"output\" in fwd_api_contents keys" + + self.forward_api_name = fwd_api_contents['api'] + self.forward_args_str = fwd_api_contents['args'] + self.forward_returns_str = fwd_api_contents['output'] + + def CollectIsForwardOnly(self): + fwd_api_contents = self.fwd_api_contents + self.is_forward_only = False if 'backward' in fwd_api_contents.keys( + ) else True + + def CollectOptionalInputs(self): + fwd_api_contents = self.fwd_api_contents + if 'optional' in fwd_api_contents.keys(): + self.optional_inputs = ParseDispensable(fwd_api_contents[ + 'optional']) + + def CollectForwardInOutAttr(self): + forward_args_str = self.forward_args_str + forward_returns_str = self.forward_returns_str + + self.forward_inputs_list, self.forward_attrs_list, self.forward_returns_list = ParseYamlForward( + forward_args_str, forward_returns_str) + + def CollectForwardPositionMap(self): + forward_inputs_list = self.forward_inputs_list + forward_returns_list = self.forward_returns_list + + self.forward_inputs_position_map, self.forward_outputs_position_map = DetermineForwardPositionMap( + forward_inputs_list, forward_returns_list) + + def GeneratePythonCFunction(self): + namespace = self.namespace + forward_api_name = self.forward_api_name + forward_attrs_list = self.forward_attrs_list + forward_inputs_position_map = self.forward_inputs_position_map + forward_outputs_position_map = self.forward_outputs_position_map + optional_inputs = self.optional_inputs + is_forward_only = self.is_forward_only + + # Generate Python-C Tensors Parsing Logic + get_eager_tensor_str = "" + for name, (ttype, pos) in forward_inputs_position_map.items(): + is_optional = (name in optional_inputs) + if IsVectorTensorType(ttype): + get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format( + name, "GetTensorListFromArgs", forward_api_name, name, pos) + else: + if is_optional: + get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format( + name, "GetOptionalTensorFromArgs", forward_api_name, + name, pos) + else: + get_eager_tensor_str += PARSE_PYTHON_C_TENSORS_TEMPLATE.format( + name, "GetTensorFromArgs", forward_api_name, name, pos) + + parse_attributes_str = "" + + # Generate Python-C Attributes Parsing Logic + for name, atype, _, pos in forward_attrs_list: + parsing_function_name = FindParsingFunctionFromAttributeType(atype) + parse_attributes_str += PARSE_PYTHON_C_ARGS_TEMPLATE.format( + name, pos, atype, name, parsing_function_name, name, + forward_api_name, pos) + + # Generate Dygraph Function Call Logic + num_args = len(forward_inputs_position_map.keys()) + len( + forward_attrs_list) + dygraph_function_call_list = ["" for i in range(num_args)] + for name, (_, pos) in forward_inputs_position_map.items(): + dygraph_function_call_list[pos] = f"{name}" + for name, _, _, pos in forward_attrs_list: + dygraph_function_call_list[pos] = f"{name}" + dygraph_function_call_str = ",".join(dygraph_function_call_list) + + # Generate Python-C Function Definitions + if is_forward_only: + fwd_function_name = FUNCTION_NAME_TEMPLATE.format( + "paddle::experimental::", namespace, 
forward_api_name) + else: + fwd_function_name = FUNCTION_NAME_TEMPLATE.format( + "", namespace, GetForwardFunctionName(forward_api_name)) + + # Generate Record Event for performance profiling + pythonc_record_event_str = RECORD_EVENT_TEMPLATE.format( + "pythonc_record_event", forward_api_name, "pybind_imperative_func") + self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( + forward_api_name, pythonc_record_event_str, forward_api_name, + get_eager_tensor_str, parse_attributes_str, fwd_function_name, + dygraph_function_call_str) + + # Generate Python-C Function Registration + self.python_c_function_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format( + forward_api_name, namespace, forward_api_name, forward_api_name) + + def run(self): + # Initialized is_forward_only + self.CollectIsForwardOnly() + + # Initialized forward_api_name, forward_args_str, forward_returns_str + self.CollectRawContents() + if SkipAPIGeneration(self.forward_api_name): return False + + # Initialized optional_inputs + self.CollectOptionalInputs() + + # Initialized forward_inputs_list, forward_returns_list, forward_attrs_list + self.CollectForwardInOutAttr() + logging.info( + f"Parsed Original Forward Inputs List: \n{self.forward_inputs_list}") + logging.info( + f"Prased Original Forward Attrs List: \n{self.forward_attrs_list}") + logging.info( + f"Parsed Original Forward Returns List: \n{self.forward_returns_list}" + ) + + # Initialized forward_inputs_position_map, forward_outputs_position_map + self.CollectForwardPositionMap() + logging.info( + f"Generated Forward Input Position Map: {self.forward_inputs_position_map}" + ) + logging.info( + f"Generated Forward Output Position Map: {self.forward_outputs_position_map}" + ) + + # Code Generation + self.GeneratePythonCFunction() + logging.info( + f"Generated Python-C Function: {self.python_c_function_str}") + logging.info( + f"Generated Python-C Function Declaration: {self.python_c_function_reg_str}" + ) + + return True + + +class PythonCYamlGenerator: + def __init__(self, path): + self.yaml_path = path + + self.namespace = "" + self.forward_api_list = [] + + # Generated Result + self.python_c_functions_reg_str = "" + self.python_c_functions_str = "" + + def ParseYamlContents(self): + yaml_path = self.yaml_path + self.forward_api_list = ReadFwdFile(yaml_path) + + def GeneratePythonCFunctions(self): + namespace = self.namespace + forward_api_list = self.forward_api_list + + for forward_api_content in forward_api_list: + f_generator = PythonCSingleFunctionGenerator(forward_api_content, + namespace) + status = f_generator.run() + + if status == True: + self.python_c_functions_reg_str += f_generator.python_c_function_reg_str + ",\n" + self.python_c_functions_str += f_generator.python_c_function_str + "\n" + + def InferNameSpace(self): + yaml_path = self.yaml_path + if "sparse" in yaml_path: + self.namespace = "sparse::" + + def AttachNamespace(self): + namespace = self.namespace + python_c_functions_str = self.python_c_functions_str + + if namespace != "": + if namespace.endswith("::"): + namespace = namespace[:-2] + self.python_c_functions_str = NAMESPACE_WRAPPER_TEMPLATE.format( + namespace, python_c_functions_str) + + def run(self): + # Infer namespace from yaml_path + self.InferNameSpace() + + # Read Yaml file + self.ParseYamlContents() + + # Code Generation + self.GeneratePythonCFunctions() + + # Wrap with namespace + self.AttachNamespace() + + +############################ +## Code Generation Helper ## +############################ +def ParseArguments(): + parser 
= argparse.ArgumentParser( + description='Eager Code Generator Args Parser') + parser.add_argument('--api_yaml_path', type=str) + parser.add_argument('--output_path', type=str) + + args = parser.parse_args() + return args + + +def GenerateCoreOpsInfoMap(): + return CORE_OPS_INFO, CORE_OPS_INFO_REGISTRY def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): @@ -221,36 +475,6 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): python_c_function_reg_str += core_ops_infos_registry python_c_function_reg_str += "\n {nullptr,nullptr,0,nullptr}" - PYTHON_C_WRAPPER_TEMPLATE = """ -#pragma once - -#include "pybind11/detail/common.h" -#include "paddle/phi/api/all.h" -#include "paddle/phi/api/lib/dygraph_api.h" -#include "paddle/phi/common/backend.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" -#include "paddle/phi/api/include/sparse_api.h" -#include "paddle/fluid/pybind/op_function_common.h" -#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" -#include "paddle/fluid/pybind/exception.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" -#include - -namespace paddle {{ -namespace pybind {{ - -{} - -static PyMethodDef EagerFinalStateMethods[] = {{ - {} -}}; - -}} // namespace pybind -}} // namespace paddle - -""" python_c_str = PYTHON_C_WRAPPER_TEMPLATE.format(python_c_function_str, python_c_function_reg_str) @@ -264,86 +488,23 @@ def GeneratePythonCFile(filepath, python_c_str): if __name__ == "__main__": args = ParseArguments() - api_yaml_paths = args.api_yaml_path.split(",") - python_c_functions_reg_str = "" - python_c_functions_str = "" - + generated_python_c_functions = "" + generated_python_c_registration = "" for i in range(len(api_yaml_paths)): api_yaml_path = api_yaml_paths[i] - if "sparse" in api_yaml_path: - namespace = "sparse" - else: - namespace = "" - - fwd_api_list = ReadFwdFile(api_yaml_path) - - python_c_function_list = [] - python_c_function_reg_list = [] - for fwd_api in fwd_api_list: - - # We only generate Ops with grad - is_forward_only = False - if 'backward' not in fwd_api.keys(): - is_forward_only = True - - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - if fwd_api_name in skipped_fwd_api_names: - continue - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( - fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", forward_inputs_list) - print("Prased Original Forward Attrs List: ", forward_attrs_list) - print("Parsed Original Forward Returns List: ", - forward_returns_list) - - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) - - python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( - fwd_api_name, forward_inputs_position_map, forward_attrs_list, - forward_outputs_position_map, optional_inputs, 
is_forward_only) - python_c_function_list.append(python_c_function_str) - python_c_function_reg_list.append(python_c_function_reg_str) - print("Generated Python-C Function: ", python_c_function_str) - - # Append Namespace - python_c_functions_reg_str += ",\n".join( - python_c_function_reg_list) + "," - python_c_functions = "\n".join(python_c_function_list) - if len(namespace) > 0: - python_c_functions_str += f"""namespace {namespace} {{ - {python_c_functions} -}} -""" + y_generator = PythonCYamlGenerator(api_yaml_path) + y_generator.run() - else: - python_c_functions_str += python_c_functions + generated_python_c_functions += y_generator.python_c_functions_str + "\n" + generated_python_c_registration += y_generator.python_c_functions_reg_str + "\n" - python_c_str = GeneratePythonCWrappers(python_c_functions_str, - python_c_functions_reg_str) + python_c_str = GeneratePythonCWrappers(generated_python_c_functions, + generated_python_c_registration) - print("Generated Python-C Codes: ", python_c_str) + logging.info(f"Generated Python-C Codes: \n{python_c_str}") output_path = args.output_path for path in [output_path]: -- GitLab From 755a6c533d92f777535dc7d2624ce84bd7c6d777 Mon Sep 17 00:00:00 2001 From: Wilber Date: Fri, 18 Mar 2022 11:06:50 +0800 Subject: [PATCH 156/176] support register with attr (#40564) * support register with attr * add infrt_with_gpu macor --- tools/infrt/get_phi_kernel_function.sh | 33 +++++- tools/infrt/get_phi_kernel_info.py | 133 +++++++++++++++++++------ 2 files changed, 136 insertions(+), 30 deletions(-) diff --git a/tools/infrt/get_phi_kernel_function.sh b/tools/infrt/get_phi_kernel_function.sh index 3b9f4b72735..6b2586d4081 100644 --- a/tools/infrt/get_phi_kernel_function.sh +++ b/tools/infrt/get_phi_kernel_function.sh @@ -41,7 +41,37 @@ python3 ${PADDLE_ROOT}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py \ grep PD_REGISTER_INFER_META_FN ${temp_path}/generate.cc \ | awk -F "\(|,|::|\)" '{print $2, $4}' > ${temp_path}/wrap_info.txt -#step 3: merge all infos + +#step 3:get ir's attr_name. 
+ir_attr_name_info_file=`mktemp` +# phi_cpu attr +all_ir_name=`grep -Eo "PDTCPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'` +for ir in $all_ir_name +do + attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td | grep -Eo "Attr:.*)" \ + | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BOOLAttr/,""); \ + gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \ + gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \ + gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \ + gsub(/Attr/,"");gsub(/\)/,""); \ + gsub(/[,:]/,"");print $a}'` + echo phi_cpu.$ir $attr_name >> $ir_attr_name_info_file +done +# phi_gpu attr +all_ir_name=`grep -Eo "PDTGPU_Kernel<.*\"" paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | awk -v FS="<" '{gsub(/\"/,"");print $2}'` +for ir in $all_ir_name +do + attr_name=`grep "<\"$ir" -A 3 paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td | grep -Eo "Attr:.*)" \ + | awk '{gsub(/F32Attr/,"");gsub(/F64Attr/,"");gsub(/StrAttr/,"");gsub(/BOOLAttr/,""); \ + gsub(/SI1Attr/,"");gsub(/SI8Attr/,"");gsub(/SI16Attr/,"");gsub(/SI32Attr/,"");gsub(/SI64Attr/,""); \ + gsub(/UI1Attr/,"");gsub(/UI8Attr/,"");gsub(/I16Attr/,"");gsub(/I32Attr/,"");gsub(/I64Attr/,""); \ + gsub(/I1Attr/,"");gsub(/I8Attr/,"");gsub(/UI16Attr/,"");gsub(/UI32Attr/,"");gsub(/UI64Attr/,""); \ + gsub(/Attr/,"");gsub(/\)/,""); \ + gsub(/[,:]/,"");print $a}'` + echo phi_gpu.$ir $attr_name >> $ir_attr_name_info_file +done + +#step 4: merge all infos # @input1 => phi kernel infomation : kernel_name kernel_key(GPU/CPU, precision, layout) # @input2 => information from api.yaml : kernel_name kernel_function_name inferMeta_function_name # @input3 => information from wrapped_infermeta_gen : ensure the inferMeta function has @@ -50,4 +80,5 @@ python3 ${PADDLE_ROOT}/tools/infrt/get_phi_kernel_info.py \ --paddle_root_path ${PADDLE_ROOT} \ --kernel_info_file $kernel_register_info_file \ --infermeta_wrap_file ${temp_path}/wrap_info.txt \ + --attr_info_file $ir_attr_name_info_file \ --generate_file ${PADDLE_ROOT}/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py index 774f6cd6bf3..85ad585cdef 100644 --- a/tools/infrt/get_phi_kernel_info.py +++ b/tools/infrt/get_phi_kernel_info.py @@ -37,6 +37,8 @@ def parse_args(): type=str, required=True, help="inferMeta wrap info file.") + parser.add_argument( + "--attr_info_file", type=str, required=True, help="attr info file.") parser.add_argument( "--generate_file", type=str, @@ -59,6 +61,23 @@ def get_kernel_info(file_path): return [l.strip() for l in cont] +def get_attr_info(file_path): + """ + phi_gpu.argsort.float64.any $axisBool$descending + """ + ret = {} + with open(file_path, 'r') as f: + cont = f.readlines() + for l in cont: + datas = l.strip().split(' ') + if len(datas) == 2: + attrs = datas[1].split('$') + ret[datas[0]] = attrs[1:] + else: + ret[datas[0]] = None + return ret + + def merge(infer_meta_data, kernel_data, wrap_data): meta_map = {} for api in infer_meta_data: @@ -114,14 +133,14 @@ namespace kernel { def gen_context(val): if val == "CPU": - return "phi::CPUContext" - # elif val == "GPU": - # return "phi::GPUContext" + return "phi::CPUContext", "phi_cpu" + elif val == "GPU": + return "phi::GPUContext", "phi_gpu" # elif val == "XPU": - # return 
"phi::XPUContext" + # return "phi::XPUContext", "phi_xpu" else: # raise Exception(f"Unknown context type {val}") - return "" + return "", "" def gen_layout(val): @@ -195,34 +214,53 @@ def gen_dtype(vals: List[str]): return ir_dtypes, origin_dtypes -# TODO(wilber): Now only process CPUContext. -def gen_register_info(resources: List[List[str]]): +# Note: Now only process CPUContext and GPUContext. + + +def gen_register_code_info(item: List[str], attr_data: Dict[str, List[str]]): """ - resources: [['add', 'CPU', 'ALL_LAYOUT', 'AddKernel', 'float', 'double', '...'(varaidic types), 'ElementwiseInferMeta'], ...] + item: ['add', 'CPU', 'ALL_LAYOUT', 'AddKernel', 'float', 'double', '...'(varaidic types), 'ElementwiseInferMeta'] + attr_data: {'phi_cpu.arg_min.float32.any': ['axisBool', 'keepdimsBool', 'flatten', 'dtype']} """ - res = "void RegisterInferShapeLaunchers(host_context::KernelRegistry* registry) {" - for item in resources: - # The output string is polluted by C++ macros, here the \ is removed - update_item = [v.strip('\\') for v in item] + ctx_name, ir_ctx_name = gen_context(item[1]) + if (ctx_name == ""): + return "" + item[2] = gen_layout(item[2]) + ir_dtypes, origin_dtypes = gen_dtype(item[4:-1]) + infer_shape_func = "&phi::" + item[-1] - ctx_name = gen_context(update_item[1]) - if (ctx_name == ""): - continue - update_item[2] = gen_layout(update_item[2]) - ir_dtypes, origin_dtypes = gen_dtype(update_item[4:-1]) - infer_shape_func = "&phi::" + update_item[-1] + res = "" - if update_item[-1] == "unknown": - # TODO(wilber): handle the unknown inferShape func. - continue + if item[-1] == "unknown": + # TODO(wilber): handle the unknown inferShape func. + return "" + + for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes): + kernel_func = gen_kernel_func(item[3], ctx_name, origin_dtype) + ir_name = ir_ctx_name + '.' + item[0].lower( + ) + '.' + ir_dtype + '.' + item[2].lower() + if ir_name in attr_data.keys() and attr_data[ir_name] is not None: + attr_names = ', '.join( + ["\"" + a + "\"" for a in attr_data[ir_name]]) + res += f""" +registry->AddKernelWithAttrs("{ir_name}",""" + + res += f""" + std::bind(&KernelLauncherFunc, + KernelLauncher(), + std::placeholders::_1), + {{{attr_names}}}); +""" - for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes): - kernel_func = gen_kernel_func(update_item[3], ctx_name, - origin_dtype) - ir_name = 'phi_cpu.' + update_item[0].lower( - ) + '.' + ir_dtype + '.' 
+ update_item[2].lower() + else: res += f""" - registry->AddKernel("{ir_name}",""" +registry->AddKernel("{ir_name}",""" res += f""" std::bind(&KernelLauncherFunc Date: Fri, 18 Mar 2022 11:19:24 +0800 Subject: [PATCH 157/176] [DataParallel]Support control flow in new DP (#40593) * fix bug * fix bug --- .../distributed/collective/CMakeLists.txt | 2 +- .../fluid/distributed/collective/reducer.cc | 325 +++++++++++++++--- paddle/fluid/distributed/collective/reducer.h | 23 +- paddle/fluid/pybind/eager_method.cc | 36 +- ...el_dygraph_gradient_check_in_eager_mode.py | 163 +++++++++ .../test_parallel_dygraph_dataparallel.py | 5 + 6 files changed, 494 insertions(+), 60 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 3fca45cc068..49ba9479d49 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -1,5 +1,5 @@ cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api) -cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup phi phi_api) +cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup phi phi_api string_helper) if (WITH_DISTRIBUTE) cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper) diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 5533f3f4cbf..be4c5423943 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -17,6 +17,20 @@ namespace paddle { namespace distributed { +static Backend TransToBackend(platform::Place place) { + static const std::map type_backend = { + {phi::AllocationType::GPU, Backend::GPU}, + {phi::AllocationType::CPU, Backend::CPU}, + }; + + phi::AllocationType type = place.GetType(); + auto it = type_backend.find(type); + PADDLE_ENFORCE_EQ(it != type_backend.end(), true, + platform::errors::InvalidArgument( + "Place type (%s) is not supported. ", place)); + return it->second; +} + std::vector> Eager_AssignGroupBySize( const std::vector tensors, const std::vector &is_sparse_gradient, @@ -297,10 +311,18 @@ EagerReducer::EagerReducer( std::dynamic_pointer_cast(grad_node); accumulation_grad_node->RegisterReduceHook( std::make_shared(reduce_hook)); + + gradnode_index_map_[grad_node.get()] = global_var_index; } vars_marked_ready_.resize(tensors_.size(), false); local_used_vars_.resize(tensors_.size(), 0); + + if (find_unused_vars_each_step_) { + global_used_vars_ = paddle::experimental::empty( + ScalarArray({static_cast(tensors_.size())}), DataType::INT32, + TransToBackend(inner_place_)); + } } std::shared_ptr EagerReducer::GetGradNodeFromTensor( @@ -341,21 +363,10 @@ void EagerReducer::InitializeGroups( } else { // process the dense gradient. InitializeDenseGroups(tensor_indices_, &group); - experimental::Backend backend; - switch (inner_place_.GetType()) { - case phi::AllocationType::GPU: - backend = experimental::Backend::GPU; - break; - case phi::AllocationType::CPU: - backend = experimental::Backend::CPU; - break; - default: - PADDLE_THROW(platform::errors::Unimplemented( - "Place type (%s) is not supported. 
", inner_place_)); - break; - } + // experimental::Backend backend = TransToBackend(inner_place_); group.dense_contents_ = paddle::experimental::empty( - ScalarArray({group.all_length_}), group.dtype_, backend); + ScalarArray({group.all_length_}), group.dtype_, + TransToBackend(inner_place_)); } // map tensors to this group by VariableLocator @@ -418,6 +429,53 @@ void EagerReducer::InitializeDenseGroups( p_group->all_length_ = all_length; } +void EagerReducer::TraverseBackwardGraph(const std::vector &outputs) { + std::queue queue; + std::set visited; + + for (const auto &output : outputs) { + auto *auto_grad_meta = + static_cast(output.get_autograd_meta()); + if (!auto_grad_meta) continue; + auto shared_grad_node = auto_grad_meta->GetMutableGradNode(); + if (shared_grad_node == nullptr || shared_grad_node.get() == nullptr || + auto_grad_meta->StopGradient()) { + continue; + } + egr::GradNodeBase *grad_node = shared_grad_node.get(); + queue.emplace(grad_node); + } + + while (!queue.empty()) { + egr::GradNodeBase *node = queue.front(); + queue.pop(); + const std::vector> &edges = node->GetEdges(); + for (size_t i = 0; i < edges.size(); i++) { + for (size_t j = 0; j < edges[i].size(); j++) { + const egr::Edge &edge = edges[i][j]; + auto next_node_shared = edge.GetMutableGradNode(); + if (!next_node_shared || !next_node_shared.get()) { + continue; + } + auto *next_node = next_node_shared.get(); + const bool was_inserted = visited.insert(next_node).second; + if (was_inserted) { + queue.emplace(next_node); + } + } + } + } + + for (const auto &it : gradnode_index_map_) { + if (visited.count(it.first) == 0) { + unused_vars_.push_back(it.second); + VLOG(3) << "[Rank " << process_group_->GetRank() << "]: " + << "Tensor " << tensors_[it.second].name() << " at index " + << it.second << " is marked as unused."; + } + } +} + void EagerReducer::PrepareForBackward(const std::vector &outputs) { VLOG(3) << "after forward, then reset count for backward."; grad_need_hooks_ = true; @@ -429,6 +487,51 @@ void EagerReducer::PrepareForBackward(const std::vector &outputs) { // reinitialize vars_marked_ready_ for next iteration vars_marked_ready_.clear(); vars_marked_ready_.resize(tensors_.size(), false); + + PADDLE_ENFORCE_EQ( + groups_need_finalize_, false, + platform::errors::PreconditionNotMet( + "A serious error has occurred here. Please " + "set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. If you have " + "set, There may be several reasons for this error: " + "1) Please note that all forward outputs derived from the module " + "parameters must participate in the calculation of losses and " + "subsequent gradient calculations. If not, the wrapper will hang, " + "waiting for autograd to generate gradients for these parameters. " + "you can use detach or stop_gradient to make the unused parameters " + "detached from the autograd graph. " + "2) Used multiple forwards and one backward. You may be able to wrap " + "multiple forwards in a model.")); + + // The first var to trigger the unused parameter + has_marked_unused_vars_ = false; + + if (find_unused_vars_once_ || find_unused_vars_each_step_) { + unused_vars_.clear(); + TraverseBackwardGraph(outputs); + // only check once in first step + find_unused_vars_once_ = false; + } + + if (find_unused_vars_each_step_ && unused_vars_.empty()) { + LOG_FIRST_N(WARNING, 1) + << "All parameters are involved in the backward pass. " + "It is recommended to set find_unused_parameters to False " + "to improve performance. 
However, if unused parameters " + "appear in subsequent iterative training, then an error " + "will occur. Please make it clear that in the subsequent " + "training, there will be no parameters that are not used " + "in the backward pass, and then set find_unused_parameters"; + } + + if (unused_vars_.size() == tensors_.size()) { + LOG_FIRST_N(WARNING, 1) + << "There is no parameter in the device involved " + "in the backward calculation. If there are " + "parameters on other devices involved in the " + "backward, then a serious error will occur here."; + } } void EagerReducer::AddDistHook(size_t var_index) { @@ -446,36 +549,104 @@ void EagerReducer::AddDistHook(size_t var_index) { auto &tensor = tensors_[var_index]; const auto &grad_node = GetGradNodeFromTensor(&tensor); - VLOG(3) << "Var[" << var_index << "] [" << (*grad_node).name() - << "] arrived and triggered disthook"; + VLOG(3) << "Tensor[" << var_index << "] [" << tensors_[var_index].name() + << "@Grad] arrived and triggered disthook"; local_used_vars_[var_index] = 1; + if (!has_marked_unused_vars_) { + has_marked_unused_vars_ = true; + for (const auto unused_index : unused_vars_) { + MarkVarReady(unused_index, false); + } + } MarkVarReady(var_index, true); } void EagerReducer::MarkVarReady(const size_t var_index, const bool is_used_var) { + VLOG(3) << "Tensor[" << var_index << "][" << tensors_[var_index].name() + << "] is marked ready."; + // error happened, if the var is ready before. + if (vars_marked_ready_[var_index]) { + auto error_info = string::Sprintf( + "Error happened, when parameter[%d][%s] has been ready before. " + "Please set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. If you have set, " + "there may be several reasons for this error: " + "1) In multiple reentrant backward phase, some parameters are reused." + "2) Using model parameters outside of forward function. Please " + "make sure that model parameters are not shared in concurrent " + "forward-backward passes.", + var_index, tensors_[var_index].name()); + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, false, + platform::errors::PreconditionNotMet(error_info)); + + error_info += + "3) Unused parameters retrieval is incorrect. " + "The return value of forward will be used to retrieve" + " the unused parameters of the entire model. These " + "gradients of unused parameters will not be synchronized " + "between multiple cards. However, if the unused " + "parameters participate in the backward calculation " + "again at a later time (e.g. 
after the forward function, " + "the loss calculation uses the unused " + "paramters of the forward and trigger backward), " + "its gradient will be wrong."; + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, true, + platform::errors::PreconditionNotMet(error_info)); + } else { + vars_marked_ready_[var_index] = true; + } + groups_need_finalize_ = true; + const auto &var_locator = variable_locators_[var_index]; const auto group_index = var_locator.group_index; const auto inside_group_index = var_locator.inside_group_index; auto &group = groups_[group_index]; auto &group_tensor = group.dense_tensors_[inside_group_index]; - auto *autograd_meta = tensors_[var_index].get_autograd_meta(); - auto &grad_tensor = static_cast(autograd_meta)->Grad(); - - group_tensor - .ShareDataWith( - *(std::dynamic_pointer_cast(grad_tensor.impl()))) - .Resize({grad_tensor.numel()}); - - vars_marked_ready_[var_index] = true; + const auto length = group.length_[inside_group_index]; + + if (is_used_var) { + auto *autograd_meta = tensors_[var_index].get_autograd_meta(); + auto &grad_tensor = static_cast(autograd_meta)->Grad(); + group_tensor + .ShareDataWith( + *(std::dynamic_pointer_cast(grad_tensor.impl()))) + .Resize({grad_tensor.numel()}); + } else { + // TODO(shenliang03): maybe save the memory by avoiding tensor construction + if (!group_tensor.initialized()) { + group_tensor.Resize({static_cast(length)}); + group_tensor.mutable_data(inner_place_, group.dtype_); + } + if (HasGrad(var_index)) { + VLOG(3) << "Tensor[" << tensors_[var_index].name() << "] has grad"; + auto grad_tensor = egr::EagerUtils::mutable_grad(tensors_[var_index]); + group_tensor + .ShareDataWith(*( + std::dynamic_pointer_cast(grad_tensor->impl()))) + .Resize({length}); + } else { + VLOG(3) << "Tensor[" << tensors_[var_index].name() + << "] doesn't have grad"; + auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_); + group_tensor.Resize({static_cast(length)}); + phi::funcs::set_constant(*dev_ctx, &group_tensor, 0.0); + } + } if (--group.pending_ == 0) { // can start allreduce MarkGroupReady(group_index); } + + if (next_group_ == groups_.size()) { + FinalizeBackward(); + } } void EagerReducer::MarkGroupReady(size_t group_index) { @@ -501,6 +672,92 @@ void EagerReducer::MarkGroupReady(size_t group_index) { } } +bool EagerReducer::HasGrad(size_t var_index) { + auto grad = egr::EagerUtils::mutable_grad(tensors_[var_index]); + if (grad && grad->is_initialized()) { + return true; + } else { + return false; + } +} + +void EagerReducer::ProcessUnusedDenseVars() { + // The calculation stream must be used here to + // avoid conflicts with communication. 
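+  // Conceptually: each rank records a 0/1 usage flag per variable in
+  // local_used_vars_, the flags are all-reduced with SUM, and a variable
+  // whose global count is still zero was unused on every rank. For a
+  // variable that is unused locally but used on another rank, the
+  // already-allreduced slice of the fused group tensor is copied back
+  // into its grad below, so all ranks end up with the same synchronized
+  // gradient.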
+ VLOG(3) << "Local used vars : " + << string::join_strings(local_used_vars_, ','); + + const auto *dev_ctx = + platform::DeviceContextPool::Instance().Get(inner_place_); + auto *global_used_tensor = + std::dynamic_pointer_cast(global_used_vars_.impl()) + .get(); + framework::TensorFromVector(local_used_vars_, *dev_ctx, + global_used_tensor); + + distributed::AllreduceOptions opts; + opts.reduce_op = ReduceOp::SUM; + std::vector reduce_tensors = {global_used_vars_}; + process_group_->AllReduce(reduce_tensors, opts)->Synchronize(); + + framework::TensorToVector(*global_used_tensor, *dev_ctx, + &local_used_vars_); + dev_ctx->Wait(); + + // sync compute stream to get global used var message, + // but maybe affect speed performance + VLOG(3) << "Global used vars : " + << string::join_strings(local_used_vars_, ','); + + for (const auto var_index : unused_vars_) { + const bool global_unused = (local_used_vars_[var_index] == 0); + + // global used but local unused, set grad + VLOG(3) << "[Rank " << process_group_->GetRank() << "]: " + << "Var [" << var_index << "] [" << tensors_[var_index].name() + << "] global_unused: " << global_unused + << " has grad: " << HasGrad(var_index); + + if (!global_unused) { + VLOG(3) << "Set Tensor[" << var_index << "]'s Grad for [Rank " + << process_group_->GetRank() << "]"; + const auto &var_locator = variable_locators_[var_index]; + const auto group_index = var_locator.group_index; + const auto &group = groups_[group_index]; + const auto inside_group_index = var_locator.inside_group_index; + auto &src_tensor = group.dense_tensors_[inside_group_index]; + + Tensor grad_value(std::make_shared(src_tensor)); + + auto dest_var_base = tensors_[var_index]; + auto grad_tensor = egr::EagerUtils::mutable_grad(dest_var_base); + grad_tensor->copy_(grad_value, inner_place_, true); + grad_tensor->reshape(dest_var_base.shape()); + } + } +} + +void EagerReducer::FinalizeBackward() { + groups_need_finalize_ = false; + grad_need_hooks_ = false; + for (auto &group : groups_) { + group.task->Synchronize(); + } + + for (auto &group : groups_) { + group.SplitTensors(inner_place_); + } + + if (find_unused_vars_each_step_) { + ProcessUnusedDenseVars(); + local_used_vars_.clear(); + local_used_vars_.resize(tensors_.size(), 0); + VLOG(3) << "ProcessUnusedDenseVars is finished."; + } + + VLOG(3) << "In the batch, Reducer is finished."; +} + void EagerReducer::FusedAllReduceSchedule(EagerGroup *group, const int curr_group_index) { // The overall timeline: concat > div_nranks > allreduce > split @@ -513,24 +770,14 @@ void EagerReducer::FusedAllReduceSchedule(EagerGroup *group, group->ConcatTensors(inner_place_); // div nranks - double scaling = 1.0 / nranks_; - paddle::experimental::scale_(group->dense_contents_, scaling, 0.0, false); + paddle::experimental::scale_(group->dense_contents_, 1.0 / nranks_, 0.0, + false); // all_reduce std::vector reduce_tensors = {group->dense_contents_}; - tasks_.push_back(process_group_->AllReduce(reduce_tensors, opts)); + group->task = process_group_->AllReduce(reduce_tensors, opts); - if (tasks_.size() == groups_.size()) { - for (size_t index = 0; index < tasks_.size(); index++) { - auto &task = tasks_.back(); - task->Synchronize(); - tasks_.pop_back(); - } - for (size_t index = 0; index < groups_.size(); index++) { - auto &group = groups_[index]; - group.SplitTensors(inner_place_); - } - } + // split in FinalizeBackward() } std::ostream &operator<<(std::ostream &out, const EagerGroup &group) { diff --git a/paddle/fluid/distributed/collective/reducer.h 
b/paddle/fluid/distributed/collective/reducer.h index ac6f3fbe595..d3ffa8498a1 100644 --- a/paddle/fluid/distributed/collective/reducer.h +++ b/paddle/fluid/distributed/collective/reducer.h @@ -28,6 +28,8 @@ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/lib/ext_compat_utils.h" #include "paddle/phi/common/data_type.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/utils/string/string_helper.h" namespace paddle { namespace distributed { @@ -35,6 +37,7 @@ using Tensor = paddle::experimental::Tensor; using Scalar = paddle::experimental::ScalarBase; using ScalarArray = paddle::experimental::ScalarArrayBase; +using Backend = paddle::experimental::Backend; std::vector> Eager_AssignGroupBySize( const std::vector, const std::vector &is_sparse_gradient, @@ -61,6 +64,9 @@ class EagerGroup { // external message of group phi::DataType dtype_; + // help to sync + std::shared_ptr task; + // context is used to select the stream for concat void ConcatTensors(const platform::Place &); @@ -98,6 +104,10 @@ class EagerReducer { void MarkVarReady(const size_t var_index, const bool is_used_var); void MarkGroupReady(const size_t group_index); void FusedAllReduceSchedule(EagerGroup *group, const int curr_group_index); + void FinalizeBackward(); + void TraverseBackwardGraph(const std::vector &outputs); + void ProcessUnusedDenseVars(); + bool HasGrad(size_t var_index); private: std::vector tensors_; @@ -105,7 +115,6 @@ class EagerReducer { std::vector is_sparse_gradient_; std::shared_ptr process_group_; std::vector group_size_limits_; - bool find_unused_vars_each_step_; std::vector groups_; std::vector variable_locators_; @@ -113,12 +122,20 @@ class EagerReducer { platform::Place inner_place_; size_t next_group_ = 0; int64_t nranks_ = -1; - std::vector> tasks_; bool grad_need_hooks_{false}; std::vector vars_marked_ready_; - std::vector local_used_vars_; + std::vector local_used_vars_; + + // Following variables are to help unused vars + std::vector unused_vars_; + std::map gradnode_index_map_; + bool has_marked_unused_vars_{false}; + bool find_unused_vars_each_step_{false}; + bool find_unused_vars_once_{true}; + bool groups_need_finalize_{false}; + Tensor global_used_vars_; }; } // namespace distributed diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 7f8fcd351fe..d4bbfa0e66e 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -327,23 +327,25 @@ static PyObject* tensor_clear_gradient(TensorObject* self, PyObject* args, grad = meta->MutableGrad(); } - if (grad->is_selected_rows()) { - auto selected_rows = - std::dynamic_pointer_cast(grad->impl()); - if (selected_rows->mutable_value()->IsInitialized()) { - selected_rows->mutable_rows()->clear(); - selected_rows->mutable_value()->clear(); - } - } else if (grad->is_dense_tensor()) { - if (grad->initialized()) { - if (set_to_zero) { - grad->set_impl(paddle::experimental::zeros_like(*grad).impl()); - } else { - VLOG(4) << "Gradient of " << self->tensor.name() - << " is initialized, will be released."; - auto dense_tensor = - std::dynamic_pointer_cast(grad->impl()); - dense_tensor->MoveMemoryHolder(); + if (grad->impl()) { + if (grad->is_selected_rows()) { + auto selected_rows = + std::dynamic_pointer_cast(grad->impl()); + if (selected_rows->mutable_value()->IsInitialized()) { + selected_rows->mutable_rows()->clear(); + selected_rows->mutable_value()->clear(); + } + } else if (grad->is_dense_tensor()) { + if (grad->initialized()) { + if 
(set_to_zero) { + grad->set_impl(paddle::experimental::zeros_like(*grad).impl()); + } else { + VLOG(4) << "Gradient of " << self->tensor.name() + << " is initialized, will be released."; + auto dense_tensor = + std::dynamic_pointer_cast(grad->impl()); + dense_tensor->MoveMemoryHolder(); + } } } } diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py new file mode 100644 index 00000000000..214f41c78a3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check_in_eager_mode.py @@ -0,0 +1,163 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import unittest +import os + +import paddle +import numpy as np +import paddle.distributed as dist +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.fluid.framework import _test_eager_guard +from paddle.fluid.dygraph.parallel import ParallelEnv +import paddle.fluid.core as core + +paddle.seed(1024) +np.random.seed(2021) + +batch = 5 +in_dim = 10 +out_dim = 20 + + +def init_process_group(strategy=None): + nranks = ParallelEnv().nranks + rank = ParallelEnv().local_rank + is_master = True if rank == 0 else False + store = paddle.fluid.core.TCPStore("127.0.0.1", 6174, is_master, nranks) + group = core.ProcessGroupNCCL(store, rank, nranks) + return group + + +class SimpleNet(fluid.Layer): + def __init__(self, train_id): + super(SimpleNet, self).__init__() + self.w1 = self.create_parameter( + shape=[in_dim, out_dim], dtype="float32") + self.w2 = self.create_parameter( + shape=[in_dim, out_dim], dtype="float32") + self.share_net = Linear(out_dim, 10) + + self.unused_param = self.create_parameter( + shape=[out_dim, in_dim], dtype="float64") + + # just for test sync_params_buffers + # self.register_buffer("queue", paddle.randn([10, 5])) + # self.queue = paddle.nn.functional.normalize(self.queue, axis=0) + # self.register_buffer("queue_ptr", paddle.zeros([1], 'int64')) + + self.trainer_id = train_id + + def forward(self, x): + is_use = (paddle.equal_all( + x, paddle.ones(shape=(batch, in_dim))).numpy()[0] and + self.trainer_id == 1) + + if is_use: + tmp = paddle.matmul(x, self.w1) + else: + tmp = paddle.matmul(x, self.w2) + + return self.share_net(tmp) + + +class TestDistTraning(unittest.TestCase): + def test_multiple_gpus(self): + dist.init_parallel_env() + self.trainer_id = dist.get_rank() + + process_group = init_process_group() + self.pg = process_group + with _test_eager_guard(): + + model_a = SimpleNet(self.trainer_id) + model_b = SimpleNet(self.trainer_id) + + state_dict = model_a.state_dict() + model_b.set_state_dict(state_dict) + + model_a = paddle.DataParallel( + model_a, + find_unused_parameters=True, + process_group=process_group) + model_b = paddle.DataParallel( + model_b, + find_unused_parameters=True, + 
process_group=process_group) + + ones_input = paddle.ones(shape=(batch, in_dim)) + ones_input.stop_gradient = True + + w1_grad_sum = np.zeros((in_dim, out_dim), dtype='float32') + w2_grad_sum = np.zeros((in_dim, out_dim), dtype='float32') + + for step_id in range(5): + print("==============", step_id) + random_input = paddle.rand(shape=(batch, in_dim)) + random_input.stop_gradient = True + + if step_id % 2 == 0: + out_a = model_a(random_input) + out_b = model_b(random_input) + else: + out_a = model_a(ones_input) + out_b = model_b(ones_input) + + out_a.sum().backward() + out_b.sum().backward() + + self.check_gradient(model_a.parameters()) + self.check_gradient(model_b.parameters()) + + # test acc gradient + w1_grad_sum = self.check_acc(model_a._layers.w1.grad, + w1_grad_sum, + model_b._layers.w1.grad) + w2_grad_sum = self.check_acc(model_a._layers.w2.grad, + w2_grad_sum, + model_b._layers.w2.grad) + + model_a.clear_gradients() + + def check_acc(self, grad, grad_sum, acc_grad): + if grad is not None: + grad_sum = grad_sum + grad.numpy() + acc_grad = acc_grad.numpy() if acc_grad is not None else None + np.testing.assert_allclose(grad_sum, acc_grad, rtol=1e-6) + return grad_sum + + def print_trainer_0(self, *args): + if self.trainer_id == 0: + print(*args) + + def broadcast_param(self, param, root): + self.pg.broadcast(param, root) + return param + + def check_gradient(self, params): + other_param = [] + for param in params: + if param.trainable and (param.grad is not None): + grad = param.grad + other_grad = self.broadcast_param(grad, root=1) + if self.trainer_id == 0: + np.testing.assert_allclose(other_grad.numpy(), grad.numpy()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py index 802fcc96288..2530fc07753 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py @@ -205,5 +205,10 @@ class TestDataParallelInEagerMode(TestMultipleGpus): self.run_mnist_2gpu('parallel_dygraph_dataparallel_in_eager_mode.py') +class TestGradientCheckInEagerMode(TestMultipleGpus): + def test_multiple_gpus_dynamic(self): + self.run_mnist_2gpu('parallel_dygraph_gradient_check_in_eager_mode.py') + + if __name__ == "__main__": unittest.main() -- GitLab From 8c71322378aa72e84818b58e2bec05c3353ea116 Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Fri, 18 Mar 2022 13:35:07 +0800 Subject: [PATCH 158/176] [NPU] fix no allocator error (#40687) --- paddle/fluid/memory/allocation/allocator_facade.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 61e292a922f..4a44448dc84 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -219,6 +219,12 @@ class AllocatorFacadePrivate { } InitNaiveBestFitCUDAPinnedAllocator(); #endif +#ifdef PADDLE_WITH_ASCEND_CL + for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { + InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); + } + InitNaiveBestFitNPUPinnedAllocator(); +#endif #ifdef PADDLE_WITH_XPU for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); -- GitLab From d7ccd6bf5962866a7eae60839f6cc37892529d1f Mon Sep 17 00:00:00 2001 From: 
xiongkun Date: Fri, 18 Mar 2022 13:53:44 +0800 Subject: [PATCH 159/176] [phi] tranfer kthvalue from fluid to phi (#40676) * tranfer kthvalue from fluid to phi * transfer infershape --- paddle/fluid/operators/kthvalue_op.cc | 68 +---- paddle/fluid/operators/kthvalue_op.cu | 278 ----------------- paddle/fluid/operators/kthvalue_op.h | 281 ------------------ paddle/phi/infermeta/unary.cc | 61 ++++ paddle/phi/infermeta/unary.h | 8 + .../phi/kernels/cpu/kthvalue_grad_kernel.cc | 168 +++++++++++ paddle/phi/kernels/cpu/kthvalue_kernel.cc | 167 +++++++++++ .../phi/kernels/gpu/kthvalue_grad_kernel.cu | 70 +++++ paddle/phi/kernels/gpu/kthvalue_kernel.cu | 252 ++++++++++++++++ paddle/phi/kernels/kthvalue_grad_kernel.h | 30 ++ paddle/phi/kernels/kthvalue_kernel.h | 30 ++ paddle/phi/ops/compat/kthvalue_sig.cc | 29 ++ 12 files changed, 822 insertions(+), 620 deletions(-) delete mode 100644 paddle/fluid/operators/kthvalue_op.cu delete mode 100644 paddle/fluid/operators/kthvalue_op.h create mode 100644 paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/kthvalue_kernel.cc create mode 100644 paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/kthvalue_kernel.cu create mode 100644 paddle/phi/kernels/kthvalue_grad_kernel.h create mode 100644 paddle/phi/kernels/kthvalue_kernel.h create mode 100644 paddle/phi/ops/compat/kthvalue_sig.cc diff --git a/paddle/fluid/operators/kthvalue_op.cc b/paddle/fluid/operators/kthvalue_op.cc index 2a79cee2781..4c679d30263 100644 --- a/paddle/fluid/operators/kthvalue_op.cc +++ b/paddle/fluid/operators/kthvalue_op.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/kthvalue_op.h" #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -25,54 +26,6 @@ class KthvalueOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "kthvalue"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "kthvalue"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "kthvalue"); - auto input_dims = ctx->GetInputDim("X"); - const int& dim_size = input_dims.size(); - int axis = static_cast(ctx->Attrs().Get("axis")); - PADDLE_ENFORCE_LT(axis, dim_size, - paddle::platform::errors::InvalidArgument( - "the axis must be [-%d, %d), but received %d .", - dim_size, dim_size, axis)); - PADDLE_ENFORCE_GE(axis, -dim_size, - paddle::platform::errors::InvalidArgument( - "the axis must be [-%d, %d), but received %d .", - dim_size, dim_size, axis)); - if (axis < 0) axis += dim_size; - int k = static_cast(ctx->Attrs().Get("k")); - PADDLE_ENFORCE_GE( - k, 1, paddle::platform::errors::InvalidArgument( - "the k in the kthvalue must >= 1, but received %d .", k)); - PADDLE_ENFORCE_GE(input_dims.size(), 1, - paddle::platform::errors::InvalidArgument( - "input of kthvalue must have >= 1d shape")); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GE( - input_dims[axis], k, - paddle::platform::errors::InvalidArgument( - "input of kthvalue must have >= %d columns in axis of %d", k, - axis)); - } - bool keepdim = ctx->Attrs().Get("keepdim"); - std::vector dimvec; - for (int64_t i = 0; i < axis; i++) { - dimvec.emplace_back(input_dims[i]); - } - if (keepdim) { - dimvec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < dim_size; i++) { - dimvec.emplace_back(input_dims[i]); - } - framework::DDim dims = phi::make_ddim(dimvec); - ctx->SetOutputDim("Out", dims); - ctx->SetOutputDim("Indices", dims); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -155,20 +108,13 @@ class KthvalueGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(kthvalue, KthvalueInferShapeFunctor, + PD_INFER_META(phi::KthvalueInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(kthvalue, ops::KthvalueOp, ops::KthvalueOpMaker, ops::KthvalueGradOpMaker, - ops::KthvalueGradOpMaker); -REGISTER_OP_CPU_KERNEL( - kthvalue, ops::KthvalueCPUKernel, - ops::KthvalueCPUKernel, - ops::KthvalueCPUKernel, - ops::KthvalueCPUKernel); + ops::KthvalueGradOpMaker, + KthvalueInferShapeFunctor); REGISTER_OPERATOR(kthvalue_grad, ops::KthvalueOpGrad); -REGISTER_OP_CPU_KERNEL( - kthvalue_grad, - ops::KthvalueGradCPUKernel, - ops::KthvalueGradCPUKernel, - ops::KthvalueGradCPUKernel, - ops::KthvalueGradCPUKernel); diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu deleted file mode 100644 index f6f56f70f1a..00000000000 --- a/paddle/fluid/operators/kthvalue_op.cu +++ /dev/null @@ -1,278 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/kthvalue_op.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -#endif - -namespace paddle { -namespace operators { - -int getBlockSize(int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; -} - -template -bool SortKthvalue(const platform::CUDADeviceContext& ctx, - const framework::Tensor* input_tensor, const int64_t num_cols, - const int64_t num_rows, const int k, - framework::Tensor* out_tensor, - framework::Tensor* indices_tensor) { - auto cu_stream = ctx.stream(); - framework::Tensor input_indices; - const std::vector dims = {num_rows, num_cols}; - auto dim = phi::make_ddim(dims); - input_indices.Resize(dim); - input_indices.mutable_data(ctx.GetPlace()); - size_t temp_storage_bytes = -1; - int block_size = getBlockSize(num_cols); - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - unsigned int grid_size = num_rows < maxGridDimX - ? 
static_cast(num_rows) - : maxGridDimX; - InitIndex<<>>( - input_indices.data(), num_rows, num_cols); - cub::CountingInputIterator counting_iter(0); - cub::TransformInputIterator> - segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); - T* sorted_values_ptr; - int64_t* sorted_indices_ptr; - framework::Tensor temp_values, temp_indices; - const T* input = input_tensor->data(); - T* values = out_tensor->data(); - int64_t* indices = indices_tensor->mutable_data(ctx.GetPlace()); - temp_values.Resize(dim); - temp_indices.Resize(dim); - sorted_values_ptr = temp_values.mutable_data(ctx.GetPlace()); - sorted_indices_ptr = temp_indices.mutable_data(ctx.GetPlace()); - auto err = cub::DeviceSegmentedRadixSort::SortPairs( - nullptr, temp_storage_bytes, input, sorted_values_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); -#ifdef __HIPCC__ - if (err != hipSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "hipcub::DeviceSegmentedRadixSort::SortPairs, status: " - << hipGetErrorString(err); - return false; - } -#else - if (err != cudaSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairs, status: " - << cudaGetErrorString(err); - return false; - } -#endif - framework::Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); - - err = cub::DeviceSegmentedRadixSort::SortPairs( - temp_storage.data(), temp_storage_bytes, input, - sorted_values_ptr, input_indices.data(), sorted_indices_ptr, - num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, - 0, sizeof(T) * 8, cu_stream); -#ifdef __HIPCC__ - if (err != hipSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "hipcub::DeviceSegmentedRadixSort::SortPairs, " - << temp_storage_bytes << ", status: " << hipGetErrorString(err); - return false; - } -#else - if (err != cudaSuccess) { - LOG(ERROR) << "KthvalueOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairs, " - << temp_storage_bytes << ", status: " << cudaGetErrorString(err); - return false; - } -#endif - auto& dev = *ctx.eigen_device(); - const Eigen::DSizes slice_indices{0, k - 1}; - const Eigen::DSizes slice_sizes{num_rows, 1}; - auto e_indices = framework::EigenMatrix::From(*indices_tensor, dim); - auto e_tmp_indices = framework::EigenMatrix::From( - static_cast(temp_indices)); - std::vector odims = {static_cast(num_rows), static_cast(1)}; - dim = phi::make_ddim(odims); - auto e_values = framework::EigenMatrix::From(*out_tensor, dim); - auto e_tmp_values = framework::EigenMatrix::From( - static_cast(temp_values)); - - EigenSlice, int64_t, 2>::Eval( - dev, e_indices, e_tmp_indices, slice_indices, slice_sizes); - EigenSlice, T, 2>::Eval( - dev, e_values, e_tmp_values, slice_indices, slice_sizes); - return true; -} - -template -class KthvalueOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int k = static_cast(ctx.Attr("k")); - int axis = static_cast(ctx.Attr("axis")); - bool keepdim = static_cast(ctx.Attr("keepdim")); - const auto& in_dims = input->dims(); - if (axis < 0) axis += 
in_dims.size(); - auto out_dims = output->dims(); - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - PADDLE_ENFORCE_EQ(SortKthvalue(dev_ctx, input, input_width, - input_height, k, output, indices), - true, platform::errors::External( - "KthvalueOP: Error when use cub sorting")); - return; - } else { - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dims); - indices->Resize(tmp_out_dims); - } - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = in_dims[trans[i]]; - } - trans_out_dims[in_dims.size() - 1] = 1; - framework::Tensor trans_input; - trans_input.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans); - framework::Tensor trans_ind, trans_out; - trans_ind.mutable_data(trans_out_dims, ctx.GetPlace()); - trans_out.mutable_data(trans_out_dims, ctx.GetPlace()); - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - PADDLE_ENFORCE_EQ( - SortKthvalue(dev_ctx, &trans_input, input_width, input_height, k, - &trans_out, &trans_ind), - true, - platform::errors::External("KthvalueOP: Error when use cub sorting")); - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute(ndims, dev_ctx, trans_out, - output, trans); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class KthvalueOpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - int k = static_cast(context.Attr("k")); - const auto& in_dims = x->dims(); - auto out_dims = indices->dims(); - if (axis < 0) axis += in_dims.size(); - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - auto& dev_ctx = context.cuda_device_context(); - int block_size = getBlockSize(post * k); - int 
max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, 1); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - kthvalue, - ops::KthvalueOpCUDAKernel, - ops::KthvalueOpCUDAKernel, - ops::KthvalueOpCUDAKernel, - ops::KthvalueOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - kthvalue_grad, - ops::KthvalueOpGradCUDAKernel, - ops::KthvalueOpGradCUDAKernel, - ops::KthvalueOpGradCUDAKernel, - ops::KthvalueOpGradCUDAKernel); diff --git a/paddle/fluid/operators/kthvalue_op.h b/paddle/fluid/operators/kthvalue_op.h deleted file mode 100644 index 15df0a10c69..00000000000 --- a/paddle/fluid/operators/kthvalue_op.h +++ /dev/null @@ -1,281 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { -template -static void getKthvalue(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, - Type* t_indices, const int& k) { - bool partial_sort_flag = (k * 64) < input_width; -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - if (partial_sort_flag) { - std::partial_sort( - col_vec.begin(), col_vec.begin() + k, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - } else { - std::nth_element( - col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - } - t_out[i] = col_vec[k - 1].first; - t_indices[i] = col_vec[k - 1].second; - } -} - -template -static void kthvalueAssign(const Type& input_height, const Type& input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto e_indices = framework::EigenVector::Flatten(*indices); - 
output_data[i * input_width + e_indices(0)] = e_input(0); - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); - } - } -} - -template -class KthvalueCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& in_dims = input->dims(); - int k = static_cast(context.Attr("k")); - bool keepdim = static_cast(context.Attr("keepdim")); - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - auto out_dims = output->dims(); - if (axis == in_dims.size() - 1) { - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - getKthvalue(input_height, input_width, in_dims.size(), input, - output_data, indices_data, k); - } else { - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(in_dims[i]); - } - framework::DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); - output->Resize(tmp_out_dims); - indices->Resize(tmp_out_dims); - } - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(in_dims); - - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = in_dims[trans[i]]; - } - trans_out_dims[in_dims.size() - 1] = 1; - framework::Tensor trans_inp; - trans_inp.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - TransCompute(ndims, dev_context, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - framework::Tensor tmp_out, tmp_indices; - T* t_out = tmp_out.mutable_data(trans_out_dims, context.GetPlace()); - auto* t_ind = - tmp_indices.mutable_data(trans_out_dims, context.GetPlace()); - - getKthvalue(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, k); - TransCompute( - ndims, dev_context, tmp_indices, indices, trans); - TransCompute(ndims, dev_context, tmp_out, - output, trans); - if (!keepdim) { - output->Resize(out_dims); - indices->Resize(out_dims); - } - } - } -}; - -template -class KthvalueGradCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out_grad = - context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = - context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - bool keepdim = static_cast(context.Attr("keepdim")); - auto in_dims = 
x->dims(); - auto out_dims = indices->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - if (!keepdim) { - std::vector tmp_out_shape; - for (int i = 0; i < axis; i++) { - tmp_out_shape.emplace_back(out_dims[i]); - } - tmp_out_shape.emplace_back(1); - for (int i = axis + 1; i < in_dims.size(); i++) { - tmp_out_shape.emplace_back(out_dims[i - 1]); - } - out_dims = phi::make_ddim(tmp_out_shape); - } - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis == in_dims.size() - 1) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - if (keepdim) { - kthvalueAssign(input_height, input_width, in_dims.size(), out_grad, - indices, x_grad_data); - } else { - auto& dev_context = - context.template device_context(); - framework::Tensor out_grad_tmp, indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - kthvalueAssign(input_height, input_width, in_dims.size(), &out_grad_tmp, - &indices_tmp, x_grad_data); - } - } else { - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - framework::DDim trans_dims(out_dims); - framework::DDim trans_in_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = out_dims[trans[i]]; - trans_in_dims[i] = in_dims[trans[i]]; - } - framework::Tensor trans_dO, trans_ind; - trans_dO.mutable_data(trans_dims, context.GetPlace()); - trans_ind.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - if (keepdim) { - TransCompute( - ndims, dev_context, *out_grad, &trans_dO, trans); - TransCompute( - ndims, dev_context, *indices, &trans_ind, trans); - } else { - framework::Tensor out_grad_tmp, indices_tmp; - out_grad_tmp.mutable_data(out_grad->dims(), dev_context.GetPlace()); - indices_tmp.mutable_data(indices->dims(), - dev_context.GetPlace()); - framework::TensorCopy(*out_grad, dev_context.GetPlace(), dev_context, - &out_grad_tmp); - framework::TensorCopy(*indices, dev_context.GetPlace(), dev_context, - &indices_tmp); - out_grad_tmp.Resize(out_dims); - indices_tmp.Resize(out_dims); - TransCompute( - ndims, dev_context, out_grad_tmp, &trans_dO, trans); - TransCompute( - ndims, dev_context, indices_tmp, &trans_ind, trans); - } - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); - const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; - framework::Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_dims, context.GetPlace()); - memset(t_out, 0, x_grad->numel() * sizeof(T)); - kthvalueAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 8a2d718f124..baa5b39670f 100644 --- 
a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -554,6 +554,67 @@ void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) { out->set_dtype(DataType::BOOL); } +void KthvalueInferMeta(const MetaTensor& x, + int k, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices, + MetaConfig config) { + auto input_dims = x.dims(); + const int& dim_size = input_dims.size(); + PADDLE_ENFORCE_LT(axis, + dim_size, + phi::errors::InvalidArgument( + "the axis must be [-%d, %d), but received %d .", + dim_size, + dim_size, + axis)); + PADDLE_ENFORCE_GE(axis, + -dim_size, + phi::errors::InvalidArgument( + "the axis must be [-%d, %d), but received %d .", + dim_size, + dim_size, + axis)); + if (axis < 0) axis += dim_size; + PADDLE_ENFORCE_GE( + k, + 1, + phi::errors::InvalidArgument( + "the k in the kthvalue must >= 1, but received %d .", k)); + PADDLE_ENFORCE_GE( + input_dims.size(), + 1, + phi::errors::InvalidArgument("input of kthvalue must have >= 1d shape")); + if (config.is_runtime) { + PADDLE_ENFORCE_GE( + input_dims[axis], + k, + phi::errors::InvalidArgument( + "input of kthvalue must have >= %d columns in axis of %d", + k, + axis)); + } + std::vector dimvec; + for (int64_t i = 0; i < axis; i++) { + dimvec.emplace_back(input_dims[i]); + } + if (keepdim) { + dimvec.emplace_back(static_cast(1)); + } + for (int64_t i = axis + 1; i < dim_size; i++) { + dimvec.emplace_back(input_dims[i]); + } + DDim dims = phi::make_ddim(dimvec); + out->set_dims(dims); + out->share_lod(x); + out->set_dtype(x.dtype()); + indices->set_dims(dims); + indices->share_lod(x); + indices->set_dtype(x.dtype()); +} + void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out) { auto dims = x.dims(); auto n_dim = dims.size(); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 7203a327b55..00026f8598b 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -100,6 +100,14 @@ void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out); void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out); +void KthvalueInferMeta(const MetaTensor& x, + int k, + int axis, + bool keepdim, + MetaTensor* out, + MetaTensor* indices, + MetaConfig = MetaConfig()); + void MatrixPowerInferMeta(const MetaTensor& x, int n, MetaTensor* out); void MaxPoolWithIndexInferMeta(const MetaTensor& x, diff --git a/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc b/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc new file mode 100644 index 00000000000..185d6cbedc8 --- /dev/null +++ b/paddle/phi/kernels/cpu/kthvalue_grad_kernel.cc @@ -0,0 +1,168 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
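+
+// Worked example of the backward rule implemented in this file (values are
+// illustrative): for x = {{3, 1, 2}, {6, 5, 4}}, k = 2, axis = -1 and
+// d_out = {1, 1}, the forward indices are {2, 1}, so d_x becomes
+// {{0, 0, 1}, {0, 1, 0}}: each output gradient is scattered to the position
+// that held the k-th smallest value and every other position receives zero.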
+ +#include "paddle/phi/kernels/kthvalue_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +template +static void kthvalueAssign(const Type& input_height, + const Type& input_width, + const int& input_dim, + const DenseTensor* input, + const DenseTensor* indices, + T* output_data) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + auto e_indices = EigenVector::Flatten(*indices); + output_data[i * input_width + e_indices(0)] = e_input(0); + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); + output_data[i * input_width + e_indices(i, 0)] = e_input(i, 0); + } + } +} + +template +void KthvalueGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + const DenseTensor& indices, + int k, + int axis, + bool keepdim, + DenseTensor* d_x) { + auto in_dims = x.dims(); + auto out_dims = indices.dims(); + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(out_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(out_dims[i - 1]); + } + out_dims = phi::make_ddim(tmp_out_shape); + } + T* x_grad_data = dev_ctx.template Alloc(d_x); + if (axis == in_dims.size() - 1) { + const int64_t input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + memset(x_grad_data, 0, d_x->numel() * sizeof(T)); + if (keepdim) { + kthvalueAssign(input_height, + input_width, + in_dims.size(), + &d_out, + &indices, + x_grad_data); + } else { + DenseTensor out_grad_tmp, indices_tmp; + out_grad_tmp.Resize(d_out.dims()); + indices_tmp.Resize(indices.dims()); + dev_ctx.template Alloc(&out_grad_tmp); + dev_ctx.template Alloc(&indices_tmp); + Copy(dev_ctx, d_out, dev_ctx.GetPlace(), false, &out_grad_tmp); + Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + kthvalueAssign(input_height, + input_width, + in_dims.size(), + &out_grad_tmp, + &indices_tmp, + x_grad_data); + } + } else { + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(out_dims.size() - 1); + for (int i = axis + 1; i < out_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + DDim trans_dims(out_dims); + DDim trans_in_dims(in_dims); + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = out_dims[trans[i]]; + trans_in_dims[i] = in_dims[trans[i]]; + } + DenseTensor trans_dO, trans_ind; + trans_dO.Resize(trans_dims); + trans_ind.Resize(trans_dims); + dev_ctx.template Alloc(&trans_dO); + dev_ctx.template Alloc(&trans_ind); + int ndims = trans.size(); + if (keepdim) { + funcs::TransCompute( + ndims, dev_ctx, d_out, &trans_dO, trans); + funcs::TransCompute( + ndims, dev_ctx, indices, &trans_ind, trans); + } else { + DenseTensor out_grad_tmp, indices_tmp; + out_grad_tmp.Resize(d_out.dims()); + indices_tmp.Resize(indices.dims()); + dev_ctx.template Alloc(&out_grad_tmp); + 
dev_ctx.template Alloc(&indices_tmp); + Copy(dev_ctx, d_out, dev_ctx.GetPlace(), false, &out_grad_tmp); + Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &indices_tmp); + out_grad_tmp.Resize(out_dims); + indices_tmp.Resize(out_dims); + funcs::TransCompute( + ndims, dev_ctx, out_grad_tmp, &trans_dO, trans); + funcs::TransCompute( + ndims, dev_ctx, indices_tmp, &trans_ind, trans); + } + const int64_t input_height = phi::product( + phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); + const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; + DenseTensor tmp_out; + tmp_out.Resize(trans_in_dims); + T* t_out = dev_ctx.template Alloc(&tmp_out); + memset(t_out, 0, d_x->numel() * sizeof(T)); + kthvalueAssign(input_height, + input_width, + in_dims.size(), + &trans_dO, + &trans_ind, + t_out); + funcs::TransCompute( + ndims, dev_ctx, tmp_out, d_x, trans); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue_grad, + CPU, + ALL_LAYOUT, + phi::KthvalueGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/kthvalue_kernel.cc b/paddle/phi/kernels/cpu/kthvalue_kernel.cc new file mode 100644 index 00000000000..5e436623cae --- /dev/null +++ b/paddle/phi/kernels/cpu/kthvalue_kernel.cc @@ -0,0 +1,167 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
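// For reference: a minimal sketch of the per-row selection that getKthvalue
// below performs. Each element is paired with its column index, the k-th
// smallest pair by value is selected with std::nth_element, and both the
// value and its index are returned. Illustrative only (single row, no NaN
// handling); not code from the patch.

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

template <typename T>
std::pair<T, int64_t> KthSmallestInRow(const T* row, int64_t width, int k) {
  std::vector<std::pair<T, int64_t>> cols;
  cols.reserve(width);
  for (int64_t j = 0; j < width; ++j) {
    cols.emplace_back(row[j], j);
  }
  // nth_element places the k-th smallest element (1-based k) at position k - 1.
  std::nth_element(cols.begin(), cols.begin() + k - 1, cols.end(),
                   [](const std::pair<T, int64_t>& l,
                      const std::pair<T, int64_t>& r) { return l.first < r.first; });
  return cols[k - 1];
}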
+ +#include "paddle/phi/kernels/kthvalue_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +template +static void getKthvalue(Type input_height, + Type input_width, + int input_dim, + const DenseTensor* input, + T* t_out, + Type* t_indices, + const int& k) { + bool partial_sort_flag = (k * 64) < input_width; +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + std::vector> col_vec; + col_vec.reserve(input_width); + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(j), j)); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(i, j), j)); + } + } + if (partial_sort_flag) { + std::partial_sort( + col_vec.begin(), + col_vec.begin() + k, + col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + } else { + std::nth_element( + col_vec.begin(), + col_vec.begin() + k - 1, + col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + } + t_out[i] = col_vec[k - 1].first; + t_indices[i] = col_vec[k - 1].second; + } +} + +template +void KthvalueKernel(const Context& dev_ctx, + const DenseTensor& x, + int k, + int axis, + bool keepdim, + DenseTensor* output, + DenseTensor* indices) { + const auto& in_dims = x.dims(); + if (axis < 0) axis += in_dims.size(); + T* output_data = dev_ctx.template Alloc(output); + int64_t* indices_data = dev_ctx.template Alloc(indices); + auto out_dims = output->dims(); + if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + getKthvalue(input_height, + input_width, + in_dims.size(), + &x, + output_data, + indices_data, + k); + } else { + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); + output->Resize(tmp_out_dims); + indices->Resize(tmp_out_dims); + } + DDim trans_dims(in_dims); + DDim trans_out_dims(in_dims); + + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + trans_out_dims[i] = in_dims[trans[i]]; + } + trans_out_dims[in_dims.size() - 1] = 1; + DenseTensor trans_inp; + trans_inp.Resize(trans_dims); + dev_ctx.template Alloc(&trans_inp); + int ndims = trans.size(); + funcs::TransCompute( + ndims, dev_ctx, x, &trans_inp, trans); + + const int64_t input_height = + phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_width = trans_dims[trans_dims.size() - 1]; + DenseTensor tmp_out, tmp_indices; + 
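// Non-last-axis case: the k-th value is computed on a transposed copy whose
// target axis has been moved to the last dimension; the results below are
// transposed back into output and indices afterwards.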
tmp_out.Resize(trans_out_dims); + T* t_out = dev_ctx.template Alloc(&tmp_out); + tmp_indices.Resize(trans_out_dims); + int64_t* t_ind = dev_ctx.template Alloc(&tmp_indices); + getKthvalue( + input_height, input_width, in_dims.size(), &trans_inp, t_out, t_ind, k); + funcs::TransCompute( + ndims, dev_ctx, tmp_indices, indices, trans); + funcs::TransCompute( + ndims, dev_ctx, tmp_out, output, trans); + if (!keepdim) { + output->Resize(out_dims); + indices->Resize(out_dims); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue, + CPU, + ALL_LAYOUT, + phi::KthvalueKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu b/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu new file mode 100644 index 00000000000..f6e96046a2b --- /dev/null +++ b/paddle/phi/kernels/gpu/kthvalue_grad_kernel.cu @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kthvalue_grad_kernel.h" + +#include "paddle/fluid/operators/top_k_function_cuda.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +static int getBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +template +void KthvalueGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + const DenseTensor& indices, + int k, + int axis, + bool keepdim, + DenseTensor* d_x) { + const auto& in_dims = x.dims(); + auto out_dims = indices.dims(); + if (axis < 0) axis += in_dims.size(); + T* x_grad_data = dev_ctx.template Alloc(d_x); + const T* out_grad_data = d_out.data(); + const int64_t* indices_data = indices.data(); + int pre, n, post; + paddle::operators::GetDims(in_dims, axis, &pre, &n, &post); + int block_size = getBlockSize(post * k); + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); + int grid_size = std::min(max_blocks, pre); + paddle::operators::AssignGradWithAxis< + T><<>>( + out_grad_data, indices_data, x_grad_data, pre, post, n, 1); +} + +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue_grad, + GPU, + ALL_LAYOUT, + phi::KthvalueGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/kthvalue_kernel.cu b/paddle/phi/kernels/gpu/kthvalue_kernel.cu new file mode 100644 index 00000000000..4218e153ec2 --- /dev/null +++ b/paddle/phi/kernels/gpu/kthvalue_kernel.cu @@ -0,0 +1,252 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/kthvalue_kernel.h" + +#include "paddle/fluid/operators/top_k_function_cuda.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +inline int getBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +template +bool SortKthvalue(const phi::GPUContext& dev_ctx, + const DenseTensor* input_tensor, + const int64_t num_cols, + const int64_t num_rows, + const int k, + DenseTensor* out_tensor, + DenseTensor* indices_tensor) { + auto cu_stream = dev_ctx.stream(); + DenseTensor input_indices; + const std::vector dims = {num_rows, num_cols}; + auto dim = phi::make_ddim(dims); + input_indices.Resize(dim); + dev_ctx.template Alloc(&input_indices); + size_t temp_storage_bytes = -1; + int block_size = getBlockSize(num_cols); + unsigned int maxGridDimX = dev_ctx.GetCUDAMaxGridDimSize()[0]; + unsigned int grid_size = num_rows < maxGridDimX + ? static_cast(num_rows) + : maxGridDimX; + paddle::operators::InitIndex< + int64_t><<>>( + input_indices.data(), num_rows, num_cols); + cub::CountingInputIterator counting_iter(0); + cub::TransformInputIterator> + segment_offsets_t(counting_iter, + paddle::operators::SegmentOffsetIter(num_cols)); + T* sorted_values_ptr; + int64_t* sorted_indices_ptr; + DenseTensor temp_values, temp_indices; + const T* input = input_tensor->data(); + T* values = out_tensor->data(); + int64_t* indices = indices_tensor->mutable_data(dev_ctx.GetPlace()); + temp_values.Resize(dim); + temp_indices.Resize(dim); + sorted_values_ptr = dev_ctx.template Alloc(&temp_values); + sorted_indices_ptr = dev_ctx.template Alloc(&temp_indices); + auto err = + cub::DeviceSegmentedRadixSort::SortPairs(nullptr, + temp_storage_bytes, + input, + sorted_values_ptr, + input_indices.data(), + sorted_indices_ptr, + num_cols * num_rows, + num_rows, + segment_offsets_t, + segment_offsets_t + 1, + 0, + sizeof(T) * 8, + cu_stream); +#ifdef __HIPCC__ + if (err != hipSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "hipcub::DeviceSegmentedRadixSort::SortPairs, status: " + << hipGetErrorString(err); + return false; + } +#else + if (err != cudaSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs, status: " + << cudaGetErrorString(err); + return false; + } +#endif + DenseTensor temp_storage; + temp_storage.Resize({static_cast(temp_storage_bytes / sizeof(uint8_t))}); + uint8_t* temp_storage_data = dev_ctx.template Alloc(&temp_storage); + + err = cub::DeviceSegmentedRadixSort::SortPairs(temp_storage_data, + temp_storage_bytes, + input, + sorted_values_ptr, + input_indices.data(), + sorted_indices_ptr, + num_cols * num_rows, + num_rows, + segment_offsets_t, + segment_offsets_t + 1, + 0, + sizeof(T) * 8, + 
cu_stream); +#ifdef __HIPCC__ + if (err != hipSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "hipcub::DeviceSegmentedRadixSort::SortPairs, " + << temp_storage_bytes << ", status: " << hipGetErrorString(err); + return false; + } +#else + if (err != cudaSuccess) { + LOG(ERROR) << "KthvalueOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs, " + << temp_storage_bytes << ", status: " << cudaGetErrorString(err); + return false; + } +#endif + auto& dev = *dev_ctx.eigen_device(); + const Eigen::DSizes slice_indices{0, k - 1}; + const Eigen::DSizes slice_sizes{num_rows, 1}; + auto e_indices = EigenMatrix::From(*indices_tensor, dim); + auto e_tmp_indices = + EigenMatrix::From(static_cast(temp_indices)); + std::vector odims = {static_cast(num_rows), static_cast(1)}; + dim = phi::make_ddim(odims); + auto e_values = EigenMatrix::From(*out_tensor, dim); + auto e_tmp_values = + EigenMatrix::From(static_cast(temp_values)); + + funcs::EigenSlice, int64_t, 2>::Eval( + dev, e_indices, e_tmp_indices, slice_indices, slice_sizes); + funcs::EigenSlice, T, 2>::Eval( + dev, e_values, e_tmp_values, slice_indices, slice_sizes); + return true; +} + +template +void KthvalueKernel(const Context& dev_ctx, + const DenseTensor& x, + int k, + int axis, + bool keepdim, + DenseTensor* output, + DenseTensor* indices) { + const auto& in_dims = x.dims(); + if (axis < 0) axis += in_dims.size(); + auto out_dims = output->dims(); + const T* input_data = x.data(); + T* output_data = dev_ctx.template Alloc(output); + int64_t* indices_data = dev_ctx.template Alloc(indices); + + if (axis == in_dims.size() - 1) { + const int64_t& input_height = + phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + PADDLE_ENFORCE_EQ( + SortKthvalue( + dev_ctx, &x, input_width, input_height, k, output, indices), + true, + phi::errors::External("KthvalueOP: Error when use cub sorting")); + return; + } else { + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + if (!keepdim) { + std::vector tmp_out_shape; + for (int i = 0; i < axis; i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + tmp_out_shape.emplace_back(1); + for (int i = axis + 1; i < in_dims.size(); i++) { + tmp_out_shape.emplace_back(in_dims[i]); + } + DDim tmp_out_dims = phi::make_ddim(tmp_out_shape); + output->Resize(tmp_out_dims); + indices->Resize(tmp_out_dims); + } + DDim trans_dims(in_dims); + DDim trans_out_dims(in_dims); + for (int i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + trans_out_dims[i] = in_dims[trans[i]]; + } + trans_out_dims[in_dims.size() - 1] = 1; + DenseTensor trans_input; + trans_input.mutable_data(trans_dims, dev_ctx.GetPlace()); + int ndims = trans.size(); + funcs::TransCompute( + ndims, dev_ctx, x, &trans_input, trans); + DenseTensor trans_ind, trans_out; + trans_ind.mutable_data(trans_out_dims, dev_ctx.GetPlace()); + trans_out.mutable_data(trans_out_dims, dev_ctx.GetPlace()); + const int64_t input_height = + phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_width = trans_dims[trans_dims.size() - 1]; + PADDLE_ENFORCE_EQ( + SortKthvalue(dev_ctx, + &trans_input, + input_width, + input_height, + k, + &trans_out, + &trans_ind), + true, + phi::errors::External("KthvalueOP: Error when use cub sorting")); 
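// The cub-based sort above ran on the transposed layout; transpose the values
// and indices back and, when keepdim is false, restore the squeezed out_dims
// below.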
+ funcs::TransCompute( + ndims, dev_ctx, trans_ind, indices, trans); + funcs::TransCompute( + ndims, dev_ctx, trans_out, output, trans); + if (!keepdim) { + output->Resize(out_dims); + indices->Resize(out_dims); + } + } +} +} // namespace phi + +PD_REGISTER_KERNEL(kthvalue, + GPU, + ALL_LAYOUT, + phi::KthvalueKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/kthvalue_grad_kernel.h b/paddle/phi/kernels/kthvalue_grad_kernel.h new file mode 100644 index 00000000000..488dde8237b --- /dev/null +++ b/paddle/phi/kernels/kthvalue_grad_kernel.h @@ -0,0 +1,30 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { +template +void KthvalueGradKernel(const Context& dev_ctx, + const DenseTensor& d_out, + const DenseTensor& x, + const DenseTensor& indices, + int k, + int axis, + bool keepdim, + DenseTensor* d_x); +} // namespace phi diff --git a/paddle/phi/kernels/kthvalue_kernel.h b/paddle/phi/kernels/kthvalue_kernel.h new file mode 100644 index 00000000000..4809b9af483 --- /dev/null +++ b/paddle/phi/kernels/kthvalue_kernel.h @@ -0,0 +1,30 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void KthvalueKernel(const Context& dev_ctx, + const DenseTensor& x, + int k, + int axis, + bool keepdim, + DenseTensor* out, + DenseTensor* indices); +} // namespace phi diff --git a/paddle/phi/ops/compat/kthvalue_sig.cc b/paddle/phi/ops/compat/kthvalue_sig.cc new file mode 100644 index 00000000000..e59e9de1e43 --- /dev/null +++ b/paddle/phi/ops/compat/kthvalue_sig.cc @@ -0,0 +1,29 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
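// This argument-mapping file routes the Fluid kthvalue_grad op onto the phi
// kernel registered above: the gradient of Out, X and Indices become kernel
// inputs, k/axis/keepdim are forwarded as attributes, and the gradient of X
// is the kernel output.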
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature KthvalueGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("kthvalue_grad", + {GradVarName("Out"), "X", "Indices"}, + {"k", "axis", "keepdim"}, + {GradVarName("X")}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(kthvalue_grad, phi::KthvalueGradOpArgumentMapping); -- GitLab From e3b2a0351c3daa9e8fc9ba97ea5a2f30fcd5e147 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Fri, 18 Mar 2022 14:18:39 +0800 Subject: [PATCH 160/176] Supported Complex2Real Conversion for Eager Dygraph (#39878) * Supported Complex2Real Conversion for Eager Dygraph * Supported Complex2Real Conversion for Eager Dygraph * Enabled complex type promotion test for matmul_v2 * Fix CI issues * Merged adj_edges_ with GradSlotMeta * Fixed monir issue * Adjusted num runs * Recovered Eager performance tests configurations * Recovered Eager performance tests configurations * Adjusted performance tests configurations * Fixed Minor Issues with performance tests * Moved out Edge from GradSlotMeta * Fixed issues from merge * Fixed typo * Addressed review comments * Fixed minor issues --- .../eager_generated/forwards/scale.cc | 4 +- .../auto_code_generator/eager_generator.cc | 18 +- .../final_state_generator/eager_gen.py | 16 +- paddle/fluid/eager/backward.cc | 4 +- paddle/fluid/eager/grad_node_info.cc | 301 ++++++++++++------ paddle/fluid/eager/grad_node_info.h | 70 ++-- paddle/fluid/eager/grad_tensor_holder.h | 11 +- .../grad_node_info_test.cc | 71 +++-- .../grad_tensor_holder_test.cc | 9 +- .../performance_tests/benchmark_utils.cc | 2 +- .../eager/to_static/run_program_op_func.h | 6 +- paddle/fluid/pybind/eager_functions.cc | 18 +- .../tests/unittests/test_matmul_v2_op.py | 4 +- 13 files changed, 352 insertions(+), 182 deletions(-) diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc index ba6a936d686..1be3b31de00 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc @@ -86,9 +86,9 @@ paddle::experimental::Tensor scale(const paddle::experimental::Tensor& x, scale_node->SetTensorWrappers_X({x}); // Set Grad out rank as same as fwd input and set stop gradient to bwd - scale_node->SetGradOutMeta(p_autograd_in, /*slot id*/ 0); + scale_node->SetGradOutMeta(x, /*slot id*/ 0); // Set Grad out rank as same as fwd input and set stop gradient to bwd - scale_node->SetGradInMeta(p_autograd_out, /*slot id*/ 0); + scale_node->SetGradInMeta(out, /*slot id*/ 0); // Set History for output set current Grad Node for EagerUtils::SetHistory(p_autograd_out, scale_node); diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index d9f201dc9f1..d15c413339a 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1117,7 +1117,7 @@ static std::string GenerateGradNodeCreationContent( const char* SET_GRAD_OUT_META_TEMPLATE = " grad_node->SetGradOutMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_OUT_META_TEMPLATE, input_autograd_name, input_position); + SET_GRAD_OUT_META_TEMPLATE, input_name, input_position); const char* ADD_EDGES_TEMPLATE = " if(%s) grad_node->AddEdges(%s, %d);\n"; @@ -1129,9 +1129,9 @@ static std::string GenerateGradNodeCreationContent( 
size_t input_position = fwd_inputs_name_pos_map.at(input_name); const char* SET_GRAD_OUT_META_TEMPLATE = - " grad_node->SetGradOutMeta(&%s, %d);\n"; + " grad_node->SetGradOutMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_OUT_META_TEMPLATE, input_autograd_name, input_position); + SET_GRAD_OUT_META_TEMPLATE, input_name, input_position); const char* ADD_EDGES_TEMPLATE = " grad_node->AddEdges(&%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( @@ -1165,9 +1165,9 @@ static std::string GenerateGradNodeCreationContent( paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); } const char* SET_GRAD_IN_META_TEMPLATE = - " grad_node->SetGradInMeta(&%s, %d);\n"; + " grad_node->SetGradInMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position); + SET_GRAD_IN_META_TEMPLATE, output_name, output_position); } else { pass_stop_gradient_args += ", " + output_autograd_name; @@ -1186,7 +1186,7 @@ static std::string GenerateGradNodeCreationContent( const char* SET_GRAD_IN_META_TEMPLATE = " grad_node->SetGradInMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position); + SET_GRAD_IN_META_TEMPLATE, output_name, output_position); } // Intermediate Tensor does not require CheckAndRetainGrad @@ -1834,7 +1834,7 @@ static std::string GenerateSingleOpBase( !is_op_base_per_duplicable_input) { const char* GRAD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", egr::EagerUtils::CreateVars( " - "this->OutputMeta()[%d].Size() ) },"; + "this->OutputMeta()[%d].size() ) },"; outs_contents_str += paddle::string::Sprintf( GRAD_OUTS_CONTENT_TEMPLATE, grad_output_name, fwd_input_position); } else { @@ -2053,7 +2053,7 @@ static std::string GenerateGradNodeCCContents( if (is_op_base_per_duplicable_input) { const char* OP_BASE_PER_DUP_INPUT_TEMPLATE = - " for(int i = 0; i < this->OutputMeta()[0].Size(); i++) {\n" + " for(size_t i = 0; i < this->OutputMeta()[0].size(); i++) {\n" " %s\n" " }\n"; generated_grad_function_body = paddle::string::Sprintf( @@ -2065,6 +2065,8 @@ static std::string GenerateGradNodeCCContents( "GradNode%s::ApplyGradientHooks(grads);\n" " std::vector> outputs(%d);\n" " %s\n" + " if(NeedComplexToRealConversion()) " + "HandleComplexGradToRealGrad(&outputs);\n" " return outputs;\n"; generated_grad_function_body = paddle::string::Sprintf(BWD_RETURN_TEMPLATE, fwd_op_type, in_vars.size(), diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 4c1e5b00cba..588fe312a3c 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -656,6 +656,7 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, else: # Rearrange output order accordingly returns_str += f"returns[{fwd_position}] = grad_api_returns[{grad_api_position}];\n" + returns_str += f"if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" returns_str += f"return returns;\n" grad_node_name = GetGradNodeName(fwd_api_name) @@ -783,7 +784,7 @@ def GenerateNodeCreationCodes( set_edges_list = [] for name, (_, pos) in forward_inputs_position_map.items(): input_autograd_meta_name = GetAutoGradMetaName(name) - set_grad_out_meta = f" grad_node->SetGradOutMeta({input_autograd_meta_name}, {pos});" + 
set_grad_out_meta = f" grad_node->SetGradOutMeta({name}, {pos});" set_edges = f" grad_node->AddEdges({input_autograd_meta_name}, {pos});" set_grad_out_meta_list.append(set_grad_out_meta) set_edges_list.append(set_edges) @@ -800,17 +801,18 @@ def GenerateNodeCreationCodes( output_autograd_meta_name = GetAutoGradMetaName(name) set_out_rank = f" egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});" set_history = f" egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);" - set_grad_in_meta = f" grad_node->SetGradInMeta({output_autograd_meta_name}, {pos});" - - set_out_rank_list.append(set_out_rank) - set_history_list.append(set_history) - set_grad_in_meta_list.append(set_grad_in_meta) - if num_outputs == 1: set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);" + set_grad_in_meta = f" grad_node->SetGradInMeta(api_result, {pos});" else: set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result[{pos}]);" + set_grad_in_meta = f" grad_node->SetGradInMeta(api_result[{pos}], {pos});" + + set_out_rank_list.append(set_out_rank) + set_history_list.append(set_history) + set_grad_in_meta_list.append(set_grad_in_meta) set_retain_grad_list.append(set_retain_grad) + set_out_rank_str = "\n".join(set_out_rank_list) set_history_str = "\n".join(set_history_list) set_grad_in_meta_str = "\n".join(set_grad_in_meta_list) diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 75ddfb92275..17bc2441488 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -517,11 +517,11 @@ std::vector RunBackward( } // TODO(jiabin): Should we erase it or find a more efficient way. + node_input_buffers_dict.erase(node); // Prepare GradTensorHolder for next node const std::vector>& edges = node->GetEdges(); - PADDLE_ENFORCE(edges.size() == grad_output_tensors.size() || edges.empty(), paddle::platform::errors::Fatal( "Number of edges should be either empty ( for leaf node " @@ -532,6 +532,7 @@ std::vector RunBackward( for (size_t i = 0; i < edges.size(); i++) { for (size_t j = 0; j < edges[i].size(); j++) { const Edge& edge = edges[i][j]; + auto edge_rank = edge.GetEdgeRankInfo(); // Since we make edge has as same rank as bwd outputs, we indexing them // with @@ -545,6 +546,7 @@ std::vector RunBackward( grad_output_tensors[i].empty()) { continue; } + PADDLE_ENFORCE_LT( j, grad_output_tensors[i].size(), paddle::platform::errors::Fatal( diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 7eb2902d935..891ad4d8983 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -15,10 +15,16 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" + #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/var_type.h" + #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" @@ -33,7 +39,6 @@ GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) { VLOG(6) << "Construct GradNodeBase"; bwd_in_meta_.resize(bwd_in_slot_num); bwd_out_meta_.resize(bwd_out_slot_num); - // adj_edges has the same num as backward outputs adj_edges_.resize(bwd_out_slot_num); } @@ 
-44,24 +49,20 @@ void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { "Given slot id is out of range of adj_edges outter size, " "adj_edges is designed to has the same size of grad " "inputs's slot num.")); - for (const auto& meta : *metas) { + + for (size_t i = 0; i < metas->size(); i++) { + const auto& meta = (*metas)[i]; // adj_edges has as same rank as fwd inputs, and record it's output rank // from // its pre-ops if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node && node.get()) { - VLOG(6) << "Add Edges for slot: " << slot_id - << " which is: " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } else { + if (!node || !node.get()) { meta->SetGradNode(std::make_shared(meta)); - VLOG(6) << "Add Edges for slot: " << slot_id - << " which is: " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); } + + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); } } } @@ -73,130 +74,205 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { "Given slot id is out of range of adj_edges outter size, " "adj_edges is designed to has the same size of grad " "inputs's slot num.")); + if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node && node.get()) { - VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " - << this->name() << " to " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); - } else { + if (!node || !node.get()) { meta->SetGradNode(std::make_shared(meta)); - VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " - << this->name() << " to " << meta->GetMutableGradNode()->name(); - adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), - meta->OutRankInfo()); } + VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " + << this->name() << " to " << meta->GetMutableGradNode()->name(); + + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); } } -const std::vector& GradNodeBase::InputMeta() const { +const std::vector>& GradNodeBase::InputMeta() const { return bwd_in_meta_; } -const std::vector& GradNodeBase::OutputMeta() const { +const std::vector>& GradNodeBase::OutputMeta() const { return bwd_out_meta_; } -void GradNodeBase::SetGradInMeta(std::vector* fwd_out, +void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, size_t slot_rank) { - size_t slot_size = fwd_out->size(); + auto* fwd_out_meta = egr::EagerUtils::nullable_autograd_meta(fwd_out); PADDLE_ENFORCE_LE( slot_rank, (bwd_in_meta_.size() - 1), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_in_meta_ size, since " "bwd_in_meta_ is designed to hold as same num as backward " "inputs.")); - auto& meta = bwd_in_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_in_meta should only be init once, addition " - "initialization for it is forbidden. 
If you got this " - "error, it indicates bugs in framework.")); - // Init stop gradient vector before use to avoid push back - meta.Init(slot_size); - for (size_t i = 0; i < slot_size; i++) { - PADDLE_ENFORCE_NOT_NULL((*fwd_out)[i], - paddle::platform::errors::PreconditionNotMet( - "Bwd_in_meta should only be called while " - "autograd_meta is not null. If you got this " - "error, it indicates bugs in framework.")); - if ((*fwd_out)[i]->StopGradient()) { - // Set Stop Gradient only when its true or non-initialized autograd_meta, - // since all default value is false. - meta.SetStopGradient(i, (*fwd_out)[i]->StopGradient()); + auto& metas = bwd_in_meta_.at(slot_rank); + if (metas.size() == 0) { + metas.resize(1); + } + + auto& meta = metas[0]; + meta.SetStopGradient(fwd_out_meta->StopGradient()); + + // Record TensorMeta + if (phi::DenseTensor::classof(fwd_out.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_out.impl().get()); + + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal( + "Attempting to copy DenseTensorMeta with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + + if (paddle::framework::IsComplexType( + paddle::framework::TransToProtoVarType(dense_tensor->type()))) { + need_complex_to_real_ = true; } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " + "non-DenseTensor argument."; } } -void GradNodeBase::SetGradInMeta(AutogradMeta* fwd_out, size_t slot_rank) { +void GradNodeBase::SetGradInMeta( + const std::vector& fwd_out, + size_t slot_rank) { + size_t slot_size = fwd_out.size(); PADDLE_ENFORCE_LE( slot_rank, (bwd_in_meta_.size() - 1), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_in_meta_ size, since " "bwd_in_meta_ is designed to hold as same num as backward " "inputs.")); - auto& meta = bwd_in_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_in_meta should only be init once, Additional " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); + auto& metas = bwd_in_meta_.at(slot_rank); // Init stop gradient vector before use to avoid push back - VLOG(7) << "Init bwd_in_meta_ with slot rank: " << slot_rank; - meta.Init(1); - meta.SetStopGradient(0, fwd_out->StopGradient()); + if (metas.size() < slot_size) { + VLOG(7) << "Init bwd_in_meta_ with slot rank: " << slot_rank; + metas.resize(slot_size); + } + for (size_t i = 0; i < slot_size; i++) { + auto& meta = metas[i]; + const auto& fwd_out_tensor = fwd_out[i]; + auto* fwd_out_meta = + egr::EagerUtils::nullable_autograd_meta(fwd_out_tensor); + PADDLE_ENFORCE_NOT_NULL(fwd_out_meta, + paddle::platform::errors::PreconditionNotMet( + "Bwd_in_meta should only be called while " + "autograd_meta is not null. If you got this " + "error, it indicates bugs in framework.")); + if (fwd_out_meta->StopGradient()) { + // Set Stop Gradient only when its true or non-initialized autograd_meta, + // since all default value is false. 
+ meta.SetStopGradient(fwd_out_meta->StopGradient()); + } + + // Record TensorMeta + if (phi::DenseTensor::classof(fwd_out_tensor.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_out_tensor.impl().get()); + + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal("Attempting to copy DenseTensorMeta " + "with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + if (paddle::framework::IsComplexType( + paddle::framework::TransToProtoVarType(dense_tensor->type()))) { + need_complex_to_real_ = true; + } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " + "with non-DenseTensor argument."; + } + } } -void GradNodeBase::SetGradOutMeta(std::vector* fwd_in, +void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, size_t slot_rank) { - size_t slot_size = fwd_in->size(); + auto* fwd_in_meta = egr::EagerUtils::nullable_autograd_meta(fwd_in); PADDLE_ENFORCE_LE( - slot_rank, (bwd_out_meta_.size() - 1), + (slot_rank + 1), bwd_out_meta_.size(), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_out_meta_ size, " "since bwd_out_meta_ is designed to hold as same num as " "backward outputs.")); - auto& meta = bwd_out_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_out_meta should only be init once. Additional " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); + auto& metas = bwd_out_meta_.at(slot_rank); // Init stop gradient vector before use to avoid push back - meta.Init(slot_size); - for (size_t i = 0; i < slot_size; i++) { - if (!(*fwd_in)[i]) { - meta.SetStopGradient(i, true); - continue; - } - if ((*fwd_in)[i]->StopGradient()) { - // Set Stop Gradient only when its true or non-initialized autograd_meta, - // since all default value is false. - meta.SetStopGradient(i, (*fwd_in)[i]->StopGradient()); + if (metas.size() == 0) { + metas.resize(1); + } + auto& meta = metas[0]; + if (fwd_in_meta) { + meta.SetStopGradient(fwd_in_meta->StopGradient()); + } else { + meta.SetStopGradient(true); + } + + // Record TensorMeta + if (fwd_in.impl() && fwd_in.impl().get()) { + if (phi::DenseTensor::classof(fwd_in.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_in.impl().get()); + PADDLE_ENFORCE_NE( + dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal("Attempting to copy DenseTensorMeta " + "with phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " + "non-DenseTensor argument."; } } -void GradNodeBase::SetGradOutMeta(AutogradMeta* fwd_in, size_t slot_rank) { +void GradNodeBase::SetGradOutMeta( + const std::vector& fwd_in, size_t slot_rank) { + size_t slot_size = fwd_in.size(); PADDLE_ENFORCE_LE( - (slot_rank + 1), bwd_out_meta_.size(), + slot_rank, (bwd_out_meta_.size() - 1), paddle::platform::errors::InvalidArgument( "Slot Rank should less equal than bwd_out_meta_ size, " "since bwd_out_meta_ is designed to hold as same num as " "backward outputs.")); - auto& meta = bwd_out_meta_.at(slot_rank); - PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, - paddle::platform::errors::PreconditionNotMet( - "Bwd_out_meta should only be init once. 
Additional " - "initialization for it is forbidden. If you got this " - "error, it indicates bugs in framework.")); + auto& metas = bwd_out_meta_.at(slot_rank); // Init stop gradient vector before use to avoid push back - meta.Init(1); - if (fwd_in) { - meta.SetStopGradient(0, fwd_in->StopGradient()); - } else { - meta.SetStopGradient(0, true); + if (metas.size() < slot_size) { + metas.resize(slot_size); + } + for (size_t i = 0; i < slot_size; i++) { + const auto& fwd_in_tensor = fwd_in[i]; + auto& meta = metas[i]; + auto* fwd_in_meta = egr::EagerUtils::nullable_autograd_meta(fwd_in_tensor); + if (fwd_in_meta) { + // Set Stop Gradient only when its true or non-initialized autograd_meta, + // since all default value is false. + meta.SetStopGradient(fwd_in_meta->StopGradient()); + } + + // Record TensorMeta + if (fwd_in_tensor.impl() && fwd_in_tensor.impl().get()) { + if (phi::DenseTensor::classof(fwd_in_tensor.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(fwd_in_tensor.impl().get()); + + PADDLE_ENFORCE_NE(dense_tensor->meta().dtype, phi::DataType::UNDEFINED, + paddle::platform::errors::Fatal( + "Attempting to copy DenseTensorMeta with " + "phi::DataType::UNDEFINED," + "which is illegal.")); + meta.SetTensorMeta(dense_tensor->meta()); + } + } else { + VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " + "with non-DenseTensor argument."; + } } } @@ -207,12 +283,8 @@ void GradNodeBase::SetDefaultGradInOutMeta() { "meta setter, other size of inputs and outputs should " "create with Setter and Getters")); // Default stop_gradient is false and slot id is 0, slot size is 1; - bwd_out_meta_[0].Init(1); - bwd_in_meta_[0].Init(1); -} - -const std::vector>& GradNodeBase::GetEdges() const { - return adj_edges_; + bwd_out_meta_[0].resize(1); + bwd_in_meta_[0].resize(1); } int64_t GradNodeBase::RegisterGradientHook( @@ -222,6 +294,10 @@ int64_t GradNodeBase::RegisterGradientHook( return next_hook_id_++; } +const std::vector>& GradNodeBase::GetEdges() const { + return adj_edges_; +} + std::vector> GradNodeBase::ApplyGradientHooks( const std::vector>& tensors) { @@ -270,4 +346,45 @@ GradNodeBase::ApplyGradientHooks( return outs; } +void GradNodeBase::HandleComplexGradToRealGrad( + std::vector>* out_grads) { + for (size_t slot_id = 0; slot_id < out_grads->size(); slot_id++) { + const std::vector& slot_out_grads = + (*out_grads)[slot_id]; + for (size_t rank_id = 0; rank_id < slot_out_grads.size(); rank_id++) { + const GradSlotMeta& slot_meta = bwd_out_meta_[slot_id][rank_id]; + + PADDLE_ENFORCE( + slot_meta.HasTensorMeta() > 0, + paddle::platform::errors::Fatal( + "We require TensorMeta in GradInputMeta() to obtain forward data " + "types." 
+ "However, no TensorMeta is detected in bwd_out_meta_.")); + + auto fwd_data_type = paddle::framework::TransToProtoVarType( + slot_meta.GetTensorMeta().dtype); + const paddle::experimental::Tensor& grad = slot_out_grads[rank_id]; + + if (paddle::framework::IsComplexType(fwd_data_type)) continue; + + // Only Handle Complex To Real for DenseTensor for now + if (phi::DenseTensor::classof(grad.impl().get())) { + phi::DenseTensor* grad_dense_tensor = + static_cast(grad.impl().get()); + + auto curr_data_type = + paddle::framework::TransToProtoVarType(grad_dense_tensor->type()); + if (!paddle::framework::IsComplexType(curr_data_type)) continue; + + // Convert Complex GradOut to Real + auto out = std::make_shared(); + paddle::framework::TransComplexToReal(fwd_data_type, curr_data_type, + *grad_dense_tensor, out.get()); + + (*out_grads)[slot_id][rank_id].set_impl(out); + } + } + } +} + } // namespace egr diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 168e1bcca77..4b21a193ee0 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -57,21 +57,28 @@ class AutogradMeta; class GradSlotMeta { public: GradSlotMeta() = default; - void Init(size_t size) { - size_ = static_cast(size); - stop_gradient_.resize(size, false); + bool IsStopGradient() const { return stop_gradient_; } + void SetStopGradient(bool stop_gradient = true) { + stop_gradient_ = stop_gradient; } - bool IsInitialized() const { return size_ != -1; } - bool IsStopGradient(size_t rank) const { return stop_gradient_[rank]; } - int Size() const { return size_; } - void SetStopGradient(size_t rank, bool stop_gradient = true) { - stop_gradient_.at(rank) = stop_gradient; + void SetTensorMeta(const phi::DenseTensorMeta& meta) { + meta_ = std::make_shared(meta); + } + bool HasTensorMeta() const { return meta_ && meta_.get(); } + const phi::DenseTensorMeta& GetTensorMeta() const { + if (!HasTensorMeta()) { + PADDLE_THROW(paddle::platform::errors::Fatal( + "meta_ of GradSlotMeta has not been initialized yet." + "You're expected to check Edge availability with HasTensorMeta()" + "before calling GetTensorMeta() interface.")); + } + return *meta_.get(); } private: - int size_{-1}; - std::vector stop_gradient_{false}; + bool stop_gradient_{false}; + std::shared_ptr meta_ = nullptr; }; class GradNodeBase { @@ -112,25 +119,30 @@ class GradNodeBase { void AddEdges(std::vector* metas, size_t slot_id); void AddEdges(AutogradMeta* meta, size_t slot_id); - /** - * GetEdges is designed to get all edges of current node**/ - const std::vector>& GetEdges() const; + // adj_edges were moved inside OutputMeta(), so no available direct access + // from GradNodeBase. 
+ // To access Edges, get GradSlotMeta by calling OutputMeta(), then use + // slot_meta.GetEdge() /** * Get Input Meta of current Grad node**/ - const std::vector& InputMeta() const; + const std::vector>& InputMeta() const; /** * Get Output Meta of current Grad node**/ - const std::vector& OutputMeta() const; + const std::vector>& OutputMeta() const; /** * Set bwd ins and outs info with forward vars * **/ - void SetGradInMeta(std::vector* fwd_out, size_t slot_rank); - void SetGradInMeta(AutogradMeta* fwd_out, size_t slot_rank); + void SetGradInMeta(const std::vector& fwd_out, + size_t slot_rank); + void SetGradInMeta(const paddle::experimental::Tensor& fwd_out, + size_t slot_rank); - void SetGradOutMeta(std::vector* fwd_in, size_t slot_rank); - void SetGradOutMeta(AutogradMeta* fwd_in, size_t slot_rank); + void SetGradOutMeta(const std::vector& fwd_in, + size_t slot_rank); + void SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, + size_t slot_rank); /** * Default setters for Grad in/out meta this should be used for same special @@ -162,11 +174,21 @@ class GradNodeBase { std::vector> ApplyGradientHooks( const std::vector>& tensors); + /** + * Handle Complex - Real Type Promotion + * **/ + void HandleComplexGradToRealGrad( + std::vector>* out_grads); + bool NeedComplexToRealConversion() { return need_complex_to_real_; } + virtual std::string name() { return "GradNodeBase"; } - private: - // TODO(jiabin): Use SmallVector instead after merge PR from develop + /** + * GetEdges is designed to get all edges of current node**/ + const std::vector>& GetEdges() const; + private: + // TODO(zhanlve): Merge adj_edges_ into GradOutMeta // Edges recorded the backward related node info, which indicate all edges // linked // by this Grad Node. @@ -174,10 +196,10 @@ class GradNodeBase { std::vector> adj_edges_; // bwd_out_meta_ is used to record Grad output info for backward - std::vector bwd_out_meta_; + std::vector> bwd_out_meta_; // bwd_in_meta_ used to record Grad input info for backward - std::vector bwd_in_meta_; + std::vector> bwd_in_meta_; // Gradient Hooks // Customer may register a list of hooks which will be called in order during // backward @@ -188,6 +210,8 @@ class GradNodeBase { /* hook */ std::shared_ptr>> gradient_hooks_; + // We handle complex to real conversion only if any complex GradIn is involved + bool need_complex_to_real_ = false; int64_t next_hook_id_{0}; }; diff --git a/paddle/fluid/eager/grad_tensor_holder.h b/paddle/fluid/eager/grad_tensor_holder.h index 9059b403607..8c00f9161b6 100644 --- a/paddle/fluid/eager/grad_tensor_holder.h +++ b/paddle/fluid/eager/grad_tensor_holder.h @@ -26,12 +26,13 @@ namespace egr { * GradTensorHolder should have as same format as forward output **/ class GradTensorHolder { public: - explicit GradTensorHolder(const std::vector& meta) { - VLOG(7) << "Init GradTensorHolder with meta size: " << meta.size(); - buffer_.resize(meta.size()); + explicit GradTensorHolder( + const std::vector>& metas) { + VLOG(7) << "Init GradTensorHolder with meta size: " << metas.size(); + buffer_.resize(metas.size()); for (size_t i = 0; i < buffer_.size(); i++) { - VLOG(7) << "Init GradTensorHolder with meta rank: " << meta[i].Size(); - buffer_[i].resize(meta[i].Size()); + VLOG(7) << "Init GradTensorHolder with meta rank: " << metas[i].size(); + buffer_[i].resize(metas[i].size()); } } diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc index e3db309c401..d592b5ccf66 
100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include "glog/logging.h" #include "gtest/gtest.h" @@ -23,14 +24,9 @@ TEST(GradNodeInfo, GradSlotMeta) { auto grad_slot = egr::GradSlotMeta(); - CHECK(grad_slot.IsInitialized() == false); - VLOG(6) << "Init GradSlotMeta"; - grad_slot.Init(2); - CHECK(grad_slot.IsInitialized() == true); VLOG(6) << "Set SetStopGradient"; - grad_slot.SetStopGradient(0); - CHECK(grad_slot.IsStopGradient(0) == true); - CHECK_EQ(grad_slot.Size(), 2); + grad_slot.SetStopGradient(); + CHECK(grad_slot.IsStopGradient() == true); } void TestGradNodeBase(bool is_remove_gradient_hook) { @@ -56,18 +52,22 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { ->data()[0], 6.0f); VLOG(6) << "Test Add Edges"; - egr::Edge edge0(grad_test_node1, 1, 2); - auto auto_grad0 = std::make_shared(edge0); + egr::Edge tmp_edge0(grad_test_node1, 1, 2); + auto auto_grad0 = std::make_shared(tmp_edge0); auto_grad0->SetStopGradient(false); - egr::Edge edge1(grad_test_node1, 3, 4); - auto auto_grad1 = std::make_shared(edge1); + + egr::Edge tmp_edge1(grad_test_node1, 3, 4); + auto auto_grad1 = std::make_shared(tmp_edge1); + et1.set_autograd_meta(auto_grad1); auto_grad1->SetStopGradient(false); grad_test_node0->AddEdges(auto_grad0.get(), 0); + CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().first, size_t(1)); CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().second, size_t(2)); std::vector metas = {auto_grad1.get()}; + grad_test_node0->AddEdges(&metas, 1); CHECK_EQ(grad_test_node0->GetEdges()[1][0].GetEdgeRankInfo().first, size_t(3)); @@ -76,22 +76,30 @@ void TestGradNodeBase(bool is_remove_gradient_hook) { VLOG(6) << "Test Set Meta and Get Meta"; auto_grad1->SetStopGradient(true); - grad_test_node0->SetGradInMeta(&metas, 0); - grad_test_node0->SetGradInMeta(auto_grad1.get(), 1); - grad_test_node0->SetGradOutMeta(&metas, 0); - grad_test_node0->SetGradOutMeta(auto_grad1.get(), 1); - CHECK_EQ(grad_test_node0->InputMeta()[0].Size(), 1); - CHECK_EQ(grad_test_node0->InputMeta()[1].Size(), 1); - CHECK(grad_test_node0->OutputMeta()[0].IsStopGradient(0)); - CHECK(grad_test_node0->OutputMeta()[1].IsStopGradient(0)); + grad_test_node0->SetGradInMeta(et1, 0); + grad_test_node0->SetGradInMeta({et1}, 1); + grad_test_node0->SetGradOutMeta(et1, 0); + grad_test_node0->SetGradOutMeta({et1}, 1); + CHECK_EQ(grad_test_node0->InputMeta()[0].size(), size_t(1)); + CHECK_EQ(grad_test_node0->InputMeta()[1].size(), size_t(1)); + CHECK_EQ(grad_test_node0->InputMeta()[0][0].GetTensorMeta().dtype, + meta.dtype); + CHECK_EQ(grad_test_node0->InputMeta()[1][0].GetTensorMeta().dtype, + meta.dtype); + CHECK(grad_test_node0->OutputMeta()[0][0].IsStopGradient()); + CHECK(grad_test_node0->OutputMeta()[1][0].IsStopGradient()); + CHECK_EQ(grad_test_node0->OutputMeta()[0][0].GetTensorMeta().dtype, + meta.dtype); + CHECK_EQ(grad_test_node0->OutputMeta()[1][0].GetTensorMeta().dtype, + meta.dtype); VLOG(6) << "Test Default Set Meta and Get Meta"; auto grad_test_node2 = std::make_shared( /* val */ 5.0, /* in_num */ 1, /* out_num */ 1); grad_test_node2->SetDefaultGradInOutMeta(); - CHECK(grad_test_node2->OutputMeta()[0].IsInitialized()); - CHECK(grad_test_node2->OutputMeta()[0].IsStopGradient(0) == false); - 
CHECK_EQ(grad_test_node2->OutputMeta()[0].Size(), 1); + CHECK_GT(grad_test_node2->OutputMeta()[0].size(), size_t(0)); + CHECK(grad_test_node2->OutputMeta()[0][0].IsStopGradient() == false); + CHECK_EQ(grad_test_node2->OutputMeta()[0].size(), size_t(1)); VLOG(6) << "Test Gradient Hook"; auto gradient_hook = []( @@ -135,7 +143,17 @@ TEST(GradNodeInfo, GradNodeBase) { } TEST(GradNodeInfo, Edge) { + phi::DenseTensorMeta meta = + phi::DenseTensorMeta(phi::DataType::FLOAT32, phi::make_ddim({1, 1})); + std::shared_ptr dt = std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + paddle::experimental::Tensor et1(dt); + auto grad_test_node0 = std::make_shared(5, 2, 2); + auto auto_grad1 = std::make_shared(); VLOG(6) << "Test Construct Edge"; egr::Edge edge0 = egr::Edge(); CHECK(edge0.IsInitialized() == false); @@ -145,13 +163,12 @@ TEST(GradNodeInfo, Edge) { egr::Edge(grad_test_node0, std::make_pair(size_t(1), size_t(0))); VLOG(6) << "Test Set Edge's Grad Node"; auto* grad_node = edge1.GetGradNode(); + et1.set_autograd_meta(auto_grad1); + grad_node->SetGradInMeta(et1, 0); + CHECK_EQ(grad_node->InputMeta().size(), size_t(2)); - auto mt_grad_node = edge1.GetMutableGradNode(); - auto auto_grad1 = std::make_shared(); std::vector metas = {auto_grad1.get()}; - // Uninitialized AutogradMeta indicates - mt_grad_node->SetGradInMeta(&metas, 0); - CHECK(grad_node->InputMeta()[0].IsStopGradient(0) == true); + CHECK(grad_node->InputMeta()[0][0].IsStopGradient() == true); VLOG(6) << "Test Get/Set Edge Rank Info"; CHECK_EQ(edge2.GetEdgeRankInfo().first, size_t(1)); CHECK_EQ(edge2.GetEdgeRankInfo().second, size_t(0)); diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc index 384fdcd6f97..645eac06ddd 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc @@ -30,8 +30,7 @@ PD_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT); using namespace egr; // NOLINT TEST(GradTensorHolder, Constructor) { - GradSlotMeta slot_meta; - slot_meta.Init(1); + std::vector slot_meta(1); GradTensorHolder grad_tensor_holder = GradTensorHolder({slot_meta}); GradTensorHolder grad_tensor_holder2 = GradTensorHolder(grad_tensor_holder); @@ -72,8 +71,7 @@ TEST(GradTensorHolder, Interfaces) { paddle::experimental::Tensor et1 = paddle::experimental::Tensor(dt1); // Constructor empty GradTensorHolder - GradSlotMeta slot_meta; - slot_meta.Init(1); + std::vector slot_meta(1); GradTensorHolder grad_tensor_holder = GradTensorHolder({slot_meta, slot_meta}); @@ -138,8 +136,7 @@ TEST(GradTensorHolder, SelectedRowsMergeAdd) { paddle::experimental::Tensor t2(sr2); // Constructor empty GradTensorHolder - GradSlotMeta slot_meta; - slot_meta.Init(1); + std::vector slot_meta(1); GradTensorHolder grad_tensor_holder = GradTensorHolder({slot_meta, slot_meta}); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index 887ea3e3acf..c8fb6050e9d 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -37,7 +37,7 @@ #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" -static size_t max_num_benchmark_runs = 5000; +static size_t max_num_benchmark_runs = 4000; namespace egr { diff --git 
a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index 9967d8c3690..277319bc700 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -66,10 +66,10 @@ inline void run_program_dygraph_function( grad_node->SetStepScope(step_scope); // Set Grad out rank as same as fwd input and set stop gradient to bwd - grad_node->SetGradOutMeta(&p_autograd_x, /*slot id*/ 0); - grad_node->SetGradOutMeta(&p_autograd_params, /*slot id*/ 1); + grad_node->SetGradOutMeta(x, /*slot id*/ 0); + grad_node->SetGradOutMeta(params, /*slot id*/ 1); - grad_node->SetGradInMeta(&p_autograd_outs, 0); + grad_node->SetGradInMeta(deref_out, 0); // Set Next Edges grad_node->AddEdges(&p_autograd_x, /*slot id*/ 0); grad_node->AddEdges(&p_autograd_params, /*slot id*/ 1); diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index c9e80c7b4b4..528bd75eb00 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -375,6 +375,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, ins_auto_grad_metas.resize(ctx.InputRange().size()); VLOG(7) << "We got slot num of outs is: " << ctx.OutputRange().size(); outs_auto_grad_metas.resize(ctx.OutputRange().size()); + for (size_t i = 0; i < ctx.InputRange().size(); i++) { ins_auto_grad_metas[i] = egr::EagerUtils::nullable_autograd_meta(ctx.InputsBetween( @@ -404,11 +405,15 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, // Prepare Grad outputs size_t no_grad_cnt = 0; for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { + const std::vector& in_tensors = + ctx.InputsBetween(ctx.InputRangeAt(i).first, + ctx.InputRangeAt(i).second); + if (slot_map[0].find(i) != slot_map[0].end()) { - grad_node->SetGradOutMeta(&ins_auto_grad_metas[i], slot_map[0][i]); + grad_node->SetGradOutMeta(in_tensors, slot_map[0][i]); grad_node->AddEdges(&ins_auto_grad_metas[i], slot_map[0][i]); } else { - grad_node->SetGradOutMeta(&ins_auto_grad_metas[i], + grad_node->SetGradOutMeta(in_tensors, ins_auto_grad_metas.size() - 1 - no_grad_cnt); grad_node->AddEdges(&ins_auto_grad_metas[i], ins_auto_grad_metas.size() - 1 - no_grad_cnt); @@ -417,11 +422,14 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, } // Prepare Grad inputs with grad of fwd outputs for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { + const std::vector& out_tensors = + ctx.OutputsBetweeen(ctx.OutputRangeAt(i).first, + ctx.OutputRangeAt(i).second); + egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i); egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node); - grad_node->SetGradInMeta(&(outs_auto_grad_metas[i]), i); - egr::EagerUtils::CheckAndRetainGrad(ctx.OutputsBetweeen( - ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second)); + grad_node->SetGradInMeta(out_tensors, i); + egr::EagerUtils::CheckAndRetainGrad(out_tensors); } // Prepare Grad inputs with fwd outputs diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index d0a40f38ba2..65d0e289f81 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -542,7 +542,7 @@ class TestComplexMatMulOp(OpTest): 'Out', user_defined_grads=[self.grad_x, self.grad_y], user_defined_grad_outputs=[self.grad_out], - check_eager=False) + check_eager=True) 
def test_check_grad_ingore_x(self): self.check_grad( @@ -560,7 +560,7 @@ class TestComplexMatMulOp(OpTest): no_grad_set=set('Y'), user_defined_grads=[self.grad_x], user_defined_grad_outputs=[self.grad_out], - check_eager=False) + check_eager=True) class TestComplexMatMulOpBroadcast(OpTest): -- GitLab From 869199102e6b43aeb578f4d2dcbd21a0d53104cd Mon Sep 17 00:00:00 2001 From: Wilber Date: Fri, 18 Mar 2022 14:34:12 +0800 Subject: [PATCH 161/176] Trt engine (#40649) --- .../backends/tensorrt/test_trt_engine.cc | 167 ++++++++++++++++++ .../infrt/dialect/infrt/ir/infrt_dialect.cc | 4 +- paddle/infrt/dialect/tensorrt/trt_ops.td | 33 ++++ .../host_context/mlir_to_runtime_translate.cc | 15 +- paddle/infrt/kernel/tensor_kernels.cc | 4 +- paddle/infrt/kernel/tensorrt/trt_helper.h | 66 +++++++ paddle/infrt/kernel/tensorrt/trt_kernels.cc | 122 +++++++------ paddle/infrt/kernel/tensorrt/trt_layers.h | 104 +++++++++++ .../dialect/{ => tensorrt}/disabled_trt.mlir | 0 .../dialect/tensorrt/disabled_trt_conv.mlir | 54 ++++++ .../dialect/tensorrt/disabled_trt_fc.mlir | 46 +++++ 11 files changed, 550 insertions(+), 65 deletions(-) create mode 100644 paddle/infrt/kernel/tensorrt/trt_helper.h create mode 100644 paddle/infrt/kernel/tensorrt/trt_layers.h rename paddle/infrt/tests/dialect/{ => tensorrt}/disabled_trt.mlir (100%) create mode 100644 paddle/infrt/tests/dialect/tensorrt/disabled_trt_conv.mlir create mode 100644 paddle/infrt/tests/dialect/tensorrt/disabled_trt_fc.mlir diff --git a/paddle/infrt/backends/tensorrt/test_trt_engine.cc b/paddle/infrt/backends/tensorrt/test_trt_engine.cc index 0ab64dd51c8..89dd3b0dc7a 100644 --- a/paddle/infrt/backends/tensorrt/test_trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/test_trt_engine.cc @@ -82,9 +82,176 @@ TrtUniquePtr ConstructNetwork( return network; } +TrtUniquePtr ConstructFCNetwork( + nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) { + TrtUniquePtr network; + if (is_static_shape) { + network.reset(builder->createNetworkV2(0U)); + } else { + auto networkFlags = + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + network.reset(builder->createNetworkV2(networkFlags)); + } + + ITensor* data = + network->addInput(model_input, nvinfer1::DataType::kFLOAT, dims); + CHECK_NOTNULL(data); + nvinfer1::Weights kernel_weights; + kernel_weights.type = nvinfer1::DataType::kFLOAT; + kernel_weights.count = 7840; + std::vector weight_data(kernel_weights.count); + for (size_t i = 0; i < weight_data.size(); ++i) { + weight_data[i] = i % 255 * 0.02f; + } + kernel_weights.values = weight_data.data(); + auto* layer = network->addFullyConnected( + *data, 10, kernel_weights, nvinfer1::Weights{}); + CHECK_NOTNULL(layer); + auto* out = layer->getOutput(0); + out->setName(model_output); + network->markOutput(*out); + return network; +} + +TrtUniquePtr ConstructConvNetwork( + nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) { + TrtUniquePtr network; + if (is_static_shape) { + network.reset(builder->createNetworkV2(0U)); + } else { + auto networkFlags = + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + network.reset(builder->createNetworkV2(networkFlags)); + } + + ITensor* data = + network->addInput(model_input, nvinfer1::DataType::kFLOAT, dims); + CHECK_NOTNULL(data); + nvinfer1::Weights kernel_weights, bias_weights; + kernel_weights.type = nvinfer1::DataType::kFLOAT; + bias_weights.type = nvinfer1::DataType::kFLOAT; + kernel_weights.count = 81; + bias_weights.count = 3; + 
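+  // 3 output channels over 3 input channels with a 3x3 kernel give
+  // 3*3*3*3 = 81 kernel weights plus 3 bias values; both buffers are filled
+  // with small deterministic ramps below.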
std::vector weight_data(kernel_weights.count); + for (size_t i = 0; i < weight_data.size(); ++i) { + weight_data[i] = i * 0.02f; + } + std::vector bias_data(bias_weights.count); + for (size_t i = 0; i < bias_data.size(); ++i) { + bias_data[i] = i * 0.5f; + } + kernel_weights.values = weight_data.data(); + bias_weights.values = bias_data.data(); + nvinfer1::Dims ksize; + ksize.nbDims = 2; + ksize.d[0] = 3; + ksize.d[1] = 3; + auto* layer = + network->addConvolutionNd(*data, 3, ksize, kernel_weights, bias_weights); + CHECK_NOTNULL(layer); + auto* out = layer->getOutput(0); + out->setName(model_output); + network->markOutput(*out); + return network; +} + // sigmoid(x) = 1 / (1 + exp(-x)) inline float sigmoid(float x) { return 1.f / (1.f + exp(-1 * x)); } +TEST(trt, run_fc_static) { + TrtEngine engine(0); + auto net = ConstructFCNetwork( + engine.GetTrtBuilder(), nvinfer1::Dims3{1, 28, 28}, true); + BuildOptions build_options; + build_options.max_batch = 4; + build_options.workspace = 1024; + engine.Build(std::move(net), build_options); + + InferenceOptions inference_options; + inference_options.batch = 1; + + phi::GPUPlace place; + phi::GPUContext context; + context.PartialInitWithoutAllocator(); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, context.stream()) + .get()); + context.PartialInitWithAllocator(); + + phi::DenseTensorMeta meta( + phi::DataType::FLOAT32, + phi::make_ddim({inference_options.batch, 1, 28, 28})); + phi::DenseTensor input; + input.set_meta(meta); + context.Alloc(&input, input.numel() * sizeof(float)); + std::vector host_data(inference_options.batch * 1 * 28 * 28, 0); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = i % 100 * 0.016f; + } + paddle::memory::Copy(place, + input.data(), + phi::CPUPlace(), + host_data.data(), + sizeof(float) * host_data.size(), + context.stream()); + + std::unordered_map inputs; + inputs.emplace(std::make_pair(model_input, &input)); + engine.PrepareOutputHandle("output_0"); + engine.SetUpInference(inference_options, inputs); + engine.GetEngineInfo(); + engine.Run(context); + cudaStreamSynchronize(context.stream()); +} + +TEST(trt, run_conv_static) { + TrtEngine engine(0); + auto net = ConstructConvNetwork( + engine.GetTrtBuilder(), nvinfer1::Dims3{3, 28, 28}, true); + BuildOptions build_options; + build_options.max_batch = 4; + build_options.workspace = 1024; + engine.Build(std::move(net), build_options); + + InferenceOptions inference_options; + inference_options.batch = 1; + + phi::GPUPlace place; + phi::GPUContext context; + context.PartialInitWithoutAllocator(); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, context.stream()) + .get()); + context.PartialInitWithAllocator(); + + phi::DenseTensorMeta meta( + phi::DataType::FLOAT32, + phi::make_ddim({inference_options.batch, 3, 28, 28})); + phi::DenseTensor input; + input.set_meta(meta); + context.Alloc(&input, input.numel() * sizeof(float)); + std::vector host_data(inference_options.batch * 3 * 28 * 28, 0); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = i % 100 * 0.016f; + } + paddle::memory::Copy(place, + input.data(), + phi::CPUPlace(), + host_data.data(), + sizeof(float) * host_data.size(), + context.stream()); + + std::unordered_map inputs; + inputs.emplace(std::make_pair(model_input, &input)); + engine.PrepareOutputHandle("output_0"); + engine.SetUpInference(inference_options, inputs); + engine.GetEngineInfo(); + engine.Run(context); + 
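+  // Wait for the work enqueued on the stream to finish before the test exits.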
cudaStreamSynchronize(context.stream()); +} + TEST(trt, run_static) { TrtEngine static_trt_engine(0); auto net = ConstructNetwork( diff --git a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc index 8966ca13c2b..f8d8f514749 100644 --- a/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/ir/infrt_dialect.cc @@ -142,9 +142,6 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { return infrt::DenseTensorListType::get(parser.getContext()); } - if (keyword == "dense_tensor_map") { - return DenseTensorMapType::get(parser.getContext()); - } // Todo: parse other type return mlir::Type(); } @@ -181,6 +178,7 @@ void InfrtDialect::printType(::mlir::Type type, if (type.isa()) { os << "tensor_list"; + return; } // print DenseTensorType, for example: !infrt.dense_tensor if (type.isa()) { diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.td b/paddle/infrt/dialect/tensorrt/trt_ops.td index 31b28a38e7c..803a11ed5b7 100755 --- a/paddle/infrt/dialect/tensorrt/trt_ops.td +++ b/paddle/infrt/dialect/tensorrt/trt_ops.td @@ -60,6 +60,39 @@ def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> { let results = (outs DenseTensor:$output); } +def TRT_FullyConnectedOp : TRT_Op<"FullyConnected", [NoSideEffect]> { + let summary = "TensorRT IFullyConnectedLayer"; + let description = [{ + TensorRT IFullyConnectedLayer + }]; + let arguments = (ins + DenseTensor:$input_tensor, + DenseTensor:$kernel_weights, + DenseTensor:$bias_weights, + SI32Attr:$out_channel_num + ); + let results = (outs + DenseTensor:$output_tensor + ); +} + +def TRT_ConvolutionOp : TRT_Op<"Convolution", [NoSideEffect]> { + let summary = "TensorRT IConvolutionLayer"; + let description = [{ + TensorRT IConvolutionLayer + }]; + let arguments = (ins + DenseTensor:$input_tensor, + DenseTensor:$kernel_weights, + DenseTensor:$bias_weights, + SI32Attr:$out_channel_num, + I32ArrayAttr:$kernel_size + ); + let results = (outs + DenseTensor:$output_tensor + ); +} + def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> { let summary = "TensorRT IElementWiseLayer"; let description = [{ diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index 3d5cccb5c32..bcd44540b33 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -298,14 +298,21 @@ bool MlirToRuntimeTranslator::EmitGeneralOp( // add a naive implement. 
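+  // Resolve each operand to a runtime Value: block arguments are looked up
+  // directly, other operands fall back to the result of their defining op
+  // when no value is registered yet; operands holding the expected tensor
+  // type are then fed into the runtime via FeedInArgs, keyed by operand index.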
for (int i = 0, e = op->getNumOperands(); i < e; ++i) { auto operand = op->getOperand(i); + Value* arg_value{nullptr}; if (operand.isa()) { mlir::BlockArgument arg = operand.dyn_cast(); - Value* arg_value = GetValue(arg); - if (arg_value->is_type()) { - impl_->runtime->FeedInArgs( - std::make_pair(std::to_string(i), ValueRef(arg_value))); + arg_value = GetValue(arg); + } else { + arg_value = GetValue(operand); + if (!arg_value) { + auto upstream_op = operand.getDefiningOp(); + arg_value = GetOpResult(upstream_op); } } + if (arg_value->is_type()) { + impl_->runtime->FeedInArgs( + std::make_pair(std::to_string(i), ValueRef(arg_value))); + } } #else CHECK(false) << "should not reach here"; diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index 79502f9fdfd..a9077220cfc 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -146,8 +146,8 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) { // TensorList related methods. #ifdef INFRT_WITH_PHI - registry->AddKernel("dt.tensor_list_get_tensor", - INFRT_KERNEL(TensorListGetTensor)); + registry->AddKernelWithAttrs( + "dt.tensor_list_get_tensor", INFRT_KERNEL(TensorListGetTensor), {"id"}); registry->AddKernel("dt.tensor_list_get_size", INFRT_KERNEL(TensorListGetSize)); #endif diff --git a/paddle/infrt/kernel/tensorrt/trt_helper.h b/paddle/infrt/kernel/tensorrt/trt_helper.h new file mode 100644 index 00000000000..96122bffacd --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/trt_helper.h @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include + +#include "glog/logging.h" +#include "llvm/Support/ErrorHandling.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace kernel { +namespace tensorrt { + +static nvinfer1::DataType TensorTypeToWeightType(phi::DataType tensor_type) { + switch (tensor_type) { + case phi::DataType::FLOAT32: + return nvinfer1::DataType::kFLOAT; + case phi::DataType::INT32: + return nvinfer1::DataType::kINT32; + case phi::DataType::FLOAT16: + return nvinfer1::DataType::kHALF; + default: + llvm_unreachable("should not reach here"); + } +} + +static nvinfer1::Dims ArrayAttrToNvDims(const mlir::ArrayAttr& int_array_attr) { + nvinfer1::Dims dims; + dims.nbDims = int_array_attr.size(); + CHECK(!int_array_attr.empty()); + CHECK(int_array_attr[0].getType().isIntOrIndex()); + for (int i = 0; i < dims.nbDims; ++i) { + dims.d[i] = int_array_attr[i].cast().getInt(); + } + return dims; +} + +static nvinfer1::Weights TensorToWeights(phi::DenseTensor* tensor) { + CHECK_NOTNULL(tensor); + nvinfer1::Weights ret; + ret.type = TensorTypeToWeightType(tensor->dtype()); + ret.count = tensor->numel(); + ret.values = tensor->data(); + return ret; +} + +} // namespace tensorrt +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensorrt/trt_kernels.cc b/paddle/infrt/kernel/tensorrt/trt_kernels.cc index 04847ac8982..aa7609092b8 100644 --- a/paddle/infrt/kernel/tensorrt/trt_kernels.cc +++ b/paddle/infrt/kernel/tensorrt/trt_kernels.cc @@ -21,13 +21,19 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/Operation.h" #include "mlir/IR/Value.h" + +#include "paddle/infrt/kernel/tensorrt/trt_helper.h" +#include "paddle/infrt/kernel/tensorrt/trt_layers.h" + #include "paddle/infrt/backends/tensorrt/trt_engine.h" #include "paddle/infrt/backends/tensorrt/trt_options.h" #include "paddle/infrt/dialect/tensorrt/trt_ops.h" #include "paddle/infrt/host_context/symbol_table.h" +#include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" namespace infrt { @@ -35,8 +41,7 @@ namespace kernel { namespace tensorrt { ::infrt::backends::tensorrt::TrtEngine CreateTrtEngine( - MlirOperationWithInfrtSymbol - create_engine_op /*, input_tensors, output_tensors, weights*/) { + MlirOperationWithInfrtSymbol create_engine_op) { // TODO(wilber): The device_id needs to get from mlir. int device_id = 0; backends::tensorrt::TrtEngine engine(device_id); @@ -51,6 +56,7 @@ namespace tensorrt { // TODO(wilber): The build option shoule be fiiled from mlir info. backends::tensorrt::BuildOptions options; options.max_batch = 4; + options.workspace = 1024; // Parse mlir Region which only has one block. 
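+  // The block's non-terminator ops are translated one-by-one into TensorRT
+  // layers further below; the terminator's operands become the network outputs.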
mlir::Operation& operation = *create_engine_op.operation; @@ -62,8 +68,9 @@ namespace tensorrt { auto& region = operation.getRegion(0); auto& block = region.getBlocks().front(); - llvm::DenseMap map_info; std::unordered_map trt_bind_inputs; + ValueToITensorMap value_to_trt_tensor_map; + ValueToTensorMap value_to_tensor_map; for (auto index_operand : llvm::enumerate(operation.getOperands())) { mlir::Value operand = index_operand.value(); @@ -73,69 +80,72 @@ namespace tensorrt { auto* v = symbol_table->GetValue(std::to_string(idx)); CHECK_NOTNULL(v); auto* t = &v->get(); - trt_bind_inputs[input_name] = t; + value_to_tensor_map[operand] = t; + // TODO(wilber): get input info from mlir. + // TODO(wilber): input dims, now only support static_shape, and just remove - // the first dimension. + // the first dimension. If the first dim is not -1, maybe we can pass the + // origin dims. + // TODO(wilber): now only suppot float input. - nvinfer1::Dims dims; - dims.nbDims = t->dims().size() - 1; - for (int i = 0; i < dims.nbDims; ++i) { - dims.d[i] = t->dims()[i + 1]; - } - auto* in = - network->addInput(input_name.c_str(), nvinfer1::DataType::kFLOAT, dims); - map_info[operand] = in; - } - // TODO(wilber): Find a way to add layer. - for (auto& inner_op : block.without_terminator()) { - if (inner_op.getName().getStringRef() == "trt.Activation") { - trt::ActivationOp act_op = llvm::dyn_cast(inner_op); - auto in_arg = act_op.getOperand(); - if (!map_info.count(in_arg)) { - CHECK(false) << "map_info not has in_arg."; + if (operand.isa()) { + // TODO(wilber): A trick: the weights are CPU tensor and inputs are GPU + // tensor, so we treat all GPU tensors as inputs to trt. + if (t->place().GetType() == phi::AllocationType::GPU) { + trt_bind_inputs[input_name] = t; + nvinfer1::Dims dims; + dims.nbDims = t->dims().size() - 1; + for (int i = 0; i < dims.nbDims; ++i) { + dims.d[i] = t->dims()[i + 1]; + } + auto* in = network->addInput( + input_name.c_str(), nvinfer1::DataType::kFLOAT, dims); + value_to_trt_tensor_map[operand] = in; } - nvinfer1::ActivationType act_type = - static_cast(act_op.activation_type()); - auto* act_layer = network->addActivation(*map_info[in_arg], act_type); - act_layer->setAlpha(act_op.alpha().convertToFloat()); - act_layer->setBeta(act_op.beta().convertToFloat()); - for (size_t i = 0; i < act_op->getNumResults(); ++i) { - nvinfer1::ITensor* act_out_tensor = act_layer->getOutput(i); - mlir::Value act_out = act_op->getResult(i); - map_info[act_out] = act_out_tensor; + } else { + // TODO(wilber): Replace with the op name that generates the weights. 
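+      // Heuristic: results of "phi_dt.create_dense_tensor.cpu" are treated as
+      // weights and are not registered here; any other producer becomes a
+      // network input, with the leading (batch) dimension stripped.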
+ if (operand.getDefiningOp()->getName().getStringRef() != + "phi_dt.create_dense_tensor.cpu") { + trt_bind_inputs[input_name] = t; + nvinfer1::Dims dims; + dims.nbDims = t->dims().size() - 1; + for (int i = 0; i < dims.nbDims; ++i) { + dims.d[i] = t->dims()[i + 1]; + } + auto* in = network->addInput( + input_name.c_str(), nvinfer1::DataType::kFLOAT, dims); + value_to_trt_tensor_map[operand] = in; } } - - // if (inner_op.getName().getStringRef() == "trt.Constant") { - // trt::ConstantOp op = llvm::dyn_cast(inner_op); - // mlir::Value op_out = op.getResult(); - // std::vector weight_data{1}; - // auto* layer = network->addConstant(nvinfer1::Dims2(1, 1), - // nvinfer1::Weights{nvinfer1::DataType::kFLOAT, weight_data.data(), 1}); - // auto* op_out_tenor = layer->getOutput(0); - // map_info[op_out] = op_out_tenor; - // } } - for (auto& inner_op : block.without_terminator()) { - for (mlir::Value v : inner_op.getResults()) { - for (mlir::Operation* user : v.getUsers()) { - if (user->getName().getStringRef() == "infrt.return") { - if (!map_info.count(v)) { - CHECK(false) << "map_info not has value"; - } - network->markOutput(*map_info[v]); - } - } + + // TODO(wilber): Find a way to add layer. + for (auto& operation : block.without_terminator()) { + if (trt::ActivationOp op = llvm::dyn_cast(operation)) { + ActivationFunc( + op, network.get(), value_to_trt_tensor_map, value_to_tensor_map); + } else if (trt::FullyConnectedOp op = + llvm::dyn_cast(operation)) { + FcFunc(op, network.get(), value_to_trt_tensor_map, value_to_tensor_map); + } else if (trt::ConvolutionOp op = + llvm::dyn_cast(operation)) { + ConvFunc(op, network.get(), value_to_trt_tensor_map, value_to_tensor_map); + } else { + CHECK(false) << "not supported operation."; } } - // std::unordered_map trt_bind_outputs; - mlir::Operation* ret = block.getTerminator(); - for (unsigned int i = 0; i < ret->getNumOperands(); ++i) { - mlir::Value arg = ret->getOperand(i); - CHECK(map_info.count(arg)); - map_info[arg]->setName(("output_" + std::to_string(i)).c_str()); + + for (auto index_operand : + llvm::enumerate(block.getTerminator()->getOperands())) { + mlir::Value arg = index_operand.value(); + CHECK(value_to_trt_tensor_map.count(arg)); + // TODO(wilber): A trick that we name trt output tensor's name as output_0, + // output_1, ... + value_to_trt_tensor_map[arg]->setName( + ("output_" + std::to_string(index_operand.index())).c_str()); + network->markOutput(*value_to_trt_tensor_map[arg]); } for (int i = 0; i < network->getNbOutputs(); ++i) { engine.PrepareOutputHandle(network->getOutput(i)->getName()); diff --git a/paddle/infrt/kernel/tensorrt/trt_layers.h b/paddle/infrt/kernel/tensorrt/trt_layers.h new file mode 100644 index 00000000000..19e20c170ec --- /dev/null +++ b/paddle/infrt/kernel/tensorrt/trt_layers.h @@ -0,0 +1,104 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include + +#include "paddle/infrt/dialect/tensorrt/trt_ops.h" +#include "paddle/infrt/kernel/tensorrt/trt_helper.h" + +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace kernel { +namespace tensorrt { + +using ValueToTensorMap = llvm::DenseMap; +using ValueToITensorMap = llvm::DenseMap; + +inline void ActivationFunc( + trt::ActivationOp& act_op, // NOLINT + nvinfer1::INetworkDefinition* network, + ValueToITensorMap& value_to_trt_tensor_map, // NOLINT + ValueToTensorMap& value_to_tensor_map) { // NOLINT + auto in_arg = act_op.getOperand(); + CHECK(value_to_trt_tensor_map.count(in_arg)) + << "value_to_trt_tensor_map not has in_arg."; + + nvinfer1::ActivationType act_type = + static_cast(act_op.activation_type()); + auto* act_layer = + network->addActivation(*value_to_trt_tensor_map[in_arg], act_type); + act_layer->setAlpha(act_op.alpha().convertToFloat()); + act_layer->setBeta(act_op.beta().convertToFloat()); + for (size_t i = 0; i < act_op->getNumResults(); ++i) { + nvinfer1::ITensor* act_out_tensor = act_layer->getOutput(i); + mlir::Value act_out = act_op->getResult(i); + value_to_trt_tensor_map[act_out] = act_out_tensor; + } +} + +inline void ConvFunc(trt::ConvolutionOp& op, // NOLINT + nvinfer1::INetworkDefinition* network, + ValueToITensorMap& value_to_trt_tensor_map, // NOLINT + ValueToTensorMap& value_to_tensor_map) { // NOLINT + mlir::Value input_tensor_repr = op.input_tensor(); + int out_channel_num = op.out_channel_num(); + auto size_attrs = op.kernel_size(); + nvinfer1::Dims dims = ArrayAttrToNvDims(size_attrs); + auto kernel_weights = + TensorToWeights(value_to_tensor_map[op.kernel_weights()]); + auto bias_weights = TensorToWeights(value_to_tensor_map[op.bias_weights()]); + + auto* layer = + network->addConvolutionNd(*value_to_trt_tensor_map[input_tensor_repr], + out_channel_num, + dims, + kernel_weights, + bias_weights); + CHECK_NOTNULL(layer); + mlir::Value out_repr = op.output_tensor(); + nvinfer1::ITensor* out_tensor = layer->getOutput(0); + value_to_trt_tensor_map[out_repr] = out_tensor; +} + +inline void FcFunc(trt::FullyConnectedOp& op, // NOLINT + nvinfer1::INetworkDefinition* network, + ValueToITensorMap& value_to_trt_tensor_map, // NOLINT + ValueToTensorMap& value_to_tensor_map) { // NOLINT + mlir::Value input_tensor_repr = op.input_tensor(); + CHECK(value_to_trt_tensor_map.count(input_tensor_repr)); + + auto kernel_weights = + TensorToWeights(value_to_tensor_map[op.kernel_weights()]); + auto bias_weights = TensorToWeights(value_to_tensor_map[op.bias_weights()]); + + int out_channel_num = op.out_channel_num(); + auto* layer = + network->addFullyConnected(*value_to_trt_tensor_map[input_tensor_repr], + out_channel_num, + kernel_weights, + bias_weights); + + mlir::Value out_repr = op.output_tensor(); + nvinfer1::ITensor* out_tensor = layer->getOutput(0); + value_to_trt_tensor_map[out_repr] = out_tensor; +} +} // namespace tensorrt +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/tests/dialect/disabled_trt.mlir b/paddle/infrt/tests/dialect/tensorrt/disabled_trt.mlir similarity index 100% rename from paddle/infrt/tests/dialect/disabled_trt.mlir rename to paddle/infrt/tests/dialect/tensorrt/disabled_trt.mlir diff --git a/paddle/infrt/tests/dialect/tensorrt/disabled_trt_conv.mlir b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_conv.mlir new file mode 100644 index 00000000000..c67d47415bf --- /dev/null +++ b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_conv.mlir @@ -0,0 +1,54 @@ +// 
RUN: infrtexec -i %s | FileCheck %s + +// CHECK-LABEL: @run_trt +func @run_trt(%input_tensor : !infrt.dense_tensor, %kernel_weight : !infrt.dense_tensor, %kernel_bias : !infrt.dense_tensor, %gpu_ctx : !phi.context) { + %a = "trt.create_engine"(%input_tensor, %kernel_weight, %kernel_bias) ({ + %1 = "trt.Activation"(%input_tensor) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor) -> !infrt.dense_tensor + %2 = "trt.Convolution"(%input_tensor, %kernel_weight, %kernel_bias) {out_channel_num = 3 : si32, kernel_size = [3:i32, 3:i32]} : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + "infrt.return"(%1, %2) : (!infrt.dense_tensor, !infrt.dense_tensor) -> () + }) : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> !trt.engine + "trt.inspect_engine"(%a) {} : (!trt.engine) -> () + + %res = "trt.compute"(%a, %gpu_ctx) {} : (!trt.engine, !phi.context) -> (!infrt.tensor_list) + %size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32) + "infrt.print.i32"(%size) {} : (i32) -> () + + %ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor) -> () + + %ts1 = "dt.tensor_list_get_tensor"(%res) {id = 1 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts1) : (!infrt.dense_tensor) -> () + + infrt.return +} + +// CHECK-LABEL: @main +func @main() { + %gpu_ctx = "phi_dt.create_context.gpu" (): () -> !phi.context + %cpu_ctx = "phi_dt.create_context.cpu" (): () -> !phi.context + + %input_tensor = "phi_dt.create_dense_tensor.gpu" (%gpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[1:i64, 3:i64, 28:i64, 28:i64], lod=[0:i64]}: (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%input_tensor) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor) -> () + // "phi_dt.print_tensor" (%input_tensor) : (!infrt.dense_tensor) -> () + + %kernel_weight = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[3:i64, 3:i64, 3:i64, 3:i64], lod=[0:i64]} : (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%kernel_weight) {value=[1.:f32, 2.:f32, 3.:f32, 4.:f32, 5.:f32, 6.:f32]} : (!infrt.dense_tensor) -> () + // "phi_dt.print_tensor" (%kernel_weight) : (!infrt.dense_tensor) -> () + + %kernel_bias = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[3:i64], lod=[0:i64]} : (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%kernel_bias) {value=[1.:f32]} : (!infrt.dense_tensor) -> () + // "phi_dt.print_tensor" (%kernel_bias) : (!infrt.dense_tensor) -> () + + infrt.call @run_trt(%input_tensor, %kernel_weight, %kernel_bias, %gpu_ctx) : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor, !phi.context) -> () + + infrt.return +} diff --git a/paddle/infrt/tests/dialect/tensorrt/disabled_trt_fc.mlir b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_fc.mlir new file mode 100644 index 00000000000..78dc4ac1c10 --- /dev/null +++ b/paddle/infrt/tests/dialect/tensorrt/disabled_trt_fc.mlir @@ -0,0 +1,46 @@ +// RUN: infrtexec -i %s | FileCheck %s + +// CHECK-LABEL: @main +func @main() { + %ctx = "phi_dt.create_context.gpu" (): () -> !phi.context + %cpu_ctx = "phi_dt.create_context.cpu" (): () -> !phi.context + + %input_tensor = "phi_dt.create_dense_tensor.gpu" (%ctx) { + precision=#infrt.precision, 
+ layout=#infrt.layout, + dims=[1:i64, 3:i64, 1:i64, 1:i64], lod=[1:i64]}: (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%input_tensor) {value=[3.8:f32, 2.4:f32, 1.3:f32]} : (!infrt.dense_tensor) -> () + //"phi_dt.print_tensor" (%input_tensor) : (!infrt.dense_tensor) -> () + + %kernel_weight = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[2:i64, 3:i64], lod=[1:i64]} : (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%kernel_weight) {value=[1.:f32, 2.:f32, 3.:f32, 4.:f32, 5.:f32, 6.:f32]} : (!infrt.dense_tensor) -> () + //"phi_dt.print_tensor" (%kernel_weight) : (!infrt.dense_tensor) -> () + + %kernel_bias = "phi_dt.create_dense_tensor.cpu"(%cpu_ctx) { + precision=#infrt.precision, + layout=#infrt.layout, + dims=[2:i64], lod=[1:i64]} : (!phi.context) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%kernel_bias) {value=[1.:f32, 2.:f32]} : (!infrt.dense_tensor) -> () + //"phi_dt.print_tensor" (%kernel_bias) : (!infrt.dense_tensor) -> () + + %engine = "trt.create_engine"(%input_tensor, %kernel_weight, %kernel_bias) ({ + %1 = "trt.Activation"(%input_tensor) {activation_type = 1 : si32, alpha = 1.0 : f32, beta = 6.0 : f32} : (!infrt.dense_tensor) -> !infrt.dense_tensor + %2 = "trt.FullyConnected"(%input_tensor, %kernel_weight, %kernel_bias) {out_channel_num = 2 : si32} : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor + "infrt.return"(%1, %2) : (!infrt.dense_tensor, !infrt.dense_tensor) -> () + }) : (!infrt.dense_tensor, !infrt.dense_tensor, !infrt.dense_tensor) -> !trt.engine + + %res = "trt.compute"(%engine, %ctx) {} : (!trt.engine, !phi.context) -> (!infrt.tensor_list) + %size = "dt.tensor_list_get_size"(%res) {} : (!infrt.tensor_list) -> (i32) + "infrt.print.i32"(%size) {} : (i32) -> () + + %ts0 = "dt.tensor_list_get_tensor"(%res) {id = 0 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts0) : (!infrt.dense_tensor) -> () + + %ts1 = "dt.tensor_list_get_tensor"(%res) {id = 1 : i32} : (!infrt.tensor_list) -> (!infrt.dense_tensor) + "phi_dt.print_tensor" (%ts1) : (!infrt.dense_tensor) -> () + + infrt.return +} -- GitLab From 7f93e2b07185bcd2c0ea43030c872ca9766a0c19 Mon Sep 17 00:00:00 2001 From: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com> Date: Fri, 18 Mar 2022 14:56:07 +0800 Subject: [PATCH 162/176] update unittests for tile op and silce op on XPU, test=kunlun (#40227) --- .../tests/unittests/xpu/test_slice_op_xpu.py | 313 ++++++++-------- .../tests/unittests/xpu/test_tile_op_xpu.py | 343 +++++++----------- 2 files changed, 299 insertions(+), 357 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py index 8f3578b526e..3d7c9959db9 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py @@ -18,169 +18,174 @@ import sys import unittest sys.path.append("..") from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() # Situation 1: starts(list, no tensor), ends(list, no tensor) # 1.1 without attr(decrease) -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp(OpTest): - def setUp(self): - self.op_type = "slice" - self.config() - self.inputs 
= {'Input': self.input} - self.outputs = {'Out': self.out} - self.attrs = { - 'axes': self.axes, - 'starts': self.starts, - 'ends': self.ends, - 'infer_flags': self.infer_flags, - "use_xpu": True - } - - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [1, 0, 2] - self.ends = [3, 3, 4] - self.axes = [0, 1, 2] - self.infer_flags = [1, 1, 1] - self.out = self.input[1:3, 0:3, 2:4, :] - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad_normal(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['Input'], 'Out') - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestCase1(TestSliceOp): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [-3, 0, 2] - self.ends = [3, 100, -1] - self.axes = [0, 1, 2] - self.infer_flags = [1, 1, 1] - self.out = self.input[-3:3, 0:100, 2:-1, :] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestCase2(TestSliceOp): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [-3, 0, 2] - self.ends = [3, 100, -1] - self.axes = [0, 1, 3] - self.infer_flags = [1, 1, 1] - self.out = self.input[-3:3, 0:100, :, 2:-1] +class XPUTestSliceOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'slice' + self.use_dynamic_create_class = False + + class TestSliceOp(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "slice" + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + "use_xpu": True + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[1:3, 0:3, 2:4, :] + + def test_check_grad_normal(self): + if self.dtype == np.float16: + self.check_grad_with_place(self.place, ['Input'], 'Out') + else: + user_defined_grad_outputs = np.random.random( + self.out.shape).astype(self.dtype) + self.check_grad_with_place( + self.place, ['Input'], + 'Out', + user_defined_grad_outputs=user_defined_grad_outputs) + + class TestCase1(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[-3:3, 0:100, 2:-1, :] + + class TestCase2(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[-3:3, 0:100, :, 2:-1] # 1.2 with attr(decrease) -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim(OpTest): - def setUp(self): - self.op_type = "slice" - self.config() - self.inputs = {'Input': self.input} - self.outputs = {'Out': self.out} - self.attrs = { - 'axes': self.axes, - 'starts': self.starts, - 'ends': self.ends, - 'infer_flags': self.infer_flags, - 'decrease_axis': self.decrease_axis, - "use_xpu": True - } - - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - 
self.starts = [1, 0, 2] - self.ends = [2, 3, 4] - self.axes = [0, 1, 2] - self.decrease_axis = [0] - self.infer_flags = [1, 1, 1] - self.out = self.input[1, 0:3, 2:4, :] - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad_normal(self): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['Input'], 'Out') - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [1, 0, 2] - self.ends = [2, 1, 4] - self.axes = [0, 1, 2] - self.decrease_axis = [0, 1] - self.infer_flags = [1, 1, 1] - self.out = self.input[1, 0, 2:4, :] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [-1, 0, 2] - self.ends = [1000000, 1, 4] - self.axes = [0, 1, 2] - self.decrease_axis = [0, 1] - self.infer_flags = [1, 1, 1] - self.out = self.input[-1, 0, 2:4, :] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 7]).astype("float32") - self.starts = [0, 1, 2, 3] - self.ends = [1, 2, 3, 4] - self.axes = [0, 1, 2, 3] - self.decrease_axis = [0, 1, 2, 3] - self.infer_flags = [1, 1, 1] - self.out = self.input[0, 1, 2, 3:4] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [-1] - self.ends = [1000000] - self.axes = [3] - self.decrease_axis = [3] - self.infer_flags = [1, 1, 1] - self.out = self.input[:, :, :, -1] - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim): - def config(self): - self.input = np.random.random([3, 4, 5, 6]).astype("float32") - self.starts = [0, 1, 2, 3] - self.ends = [1, 2, 3, 4] - self.axes = [0, 1, 2, 3] - self.decrease_axis = [0, 1, 2, 3] - self.infer_flags = [1, 1, 1] - self.out = self.input[0, 1, 2, 3:4] - +class XPUTestSliceOp_decs_dim(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'slice' + self.use_dynamic_create_class = False + + class TestSliceOp_decs_dim(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "slice" + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + "use_xpu": True + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0:3, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + if self.dtype == np.float16: + self.check_grad_with_place(self.place, ['Input'], 'Out') + else: + user_defined_grad_outputs = np.random.random( + self.out.shape).astype(self.dtype) + 
self.check_grad_with_place( + self.place, ['Input'], + 'Out', + user_defined_grad_outputs=user_defined_grad_outputs) + + class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0, 2:4, :] + + class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-1, 0, 2] + self.ends = [1000000, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[-1, 0, 2:4, :] + + class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 7]).astype(self.dtype) + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-1] + self.ends = [1000000] + self.axes = [3] + self.decrease_axis = [3] + self.infer_flags = [1, 1, 1] + self.out = self.input[:, :, :, -1] + + class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + +support_types = get_xpu_op_support_types('slice') +for stype in support_types: + create_test_class(globals(), XPUTestSliceOp, stype) + create_test_class(globals(), XPUTestSliceOp_decs_dim, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py index d010e163357..cd18bd63a88 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py @@ -24,221 +24,158 @@ import paddle import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard from paddle.fluid import core +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() np.random.seed(10) #Situation 1: repeat_times is a list (without tensor) -class TestTileOpRank1(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.init_data() - - self.inputs = {'X': np.random.random(self.ori_shape).astype("float32")} - self.attrs = {'repeat_times': self.repeat_times} - output = np.tile(self.inputs['X'], self.repeat_times) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def init_data(self): - self.ori_shape = [100] - self.repeat_times = [2] - - def test_check_output(self): - self.check_output_with_place(self.place) - - def test_check_grad(self): - pass - - -#with dimension expanding -class TestTileOpRank2Expanding(TestTileOpRank1): - def init_data(self): - self.ori_shape = [120] - self.repeat_times = [2, 2] - - -class TestTileOpRank2(TestTileOpRank1): - def init_data(self): - self.ori_shape = [12, 14] - self.repeat_times = [2, 3] - - -class TestTileOpRank3_Corner(TestTileOpRank1): - 
def init_data(self): - self.ori_shape = (2, 10, 5) - self.repeat_times = (1, 1, 1) - - -class TestTileOpRank3_Corner2(TestTileOpRank1): - def init_data(self): - self.ori_shape = (2, 10, 5) - self.repeat_times = (2, 2) - - -class TestTileOpRank3(TestTileOpRank1): - def init_data(self): - self.ori_shape = (2, 4, 15) - self.repeat_times = (2, 1, 4) - - -class TestTileOpRank4(TestTileOpRank1): - def init_data(self): - self.ori_shape = (2, 4, 5, 7) - self.repeat_times = (3, 2, 1, 2) +class XPUTestTileOpRank1(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'tile' + self.use_dynamic_create_class = False + + class TestTileOpRank1(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.__class__.no_need_check_grad = True + self.place = paddle.XPUPlace(0) + self.op_type = "tile" + self.init_data() + self.inputs = { + 'X': np.random.random(self.ori_shape).astype(self.dtype) + } + self.attrs = {'repeat_times': self.repeat_times} + output = np.tile(self.inputs['X'], self.repeat_times) + self.outputs = {'Out': output} + + def init_data(self): + self.ori_shape = [100] + self.repeat_times = [2] + + def test_check_output(self): + self.check_output_with_place(self.place) + + #with dimension expanding + class TestTileOpRank2Expanding(TestTileOpRank1): + def init_data(self): + self.ori_shape = [120] + self.repeat_times = [2, 2] + + class TestTileOpRank2(TestTileOpRank1): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [2, 3] + + class TestTileOpRank3_Corner(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 10, 5) + self.repeat_times = (1, 1, 1) + + class TestTileOpRank3_Corner2(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 10, 5) + self.repeat_times = (2, 2) + + class TestTileOpRank3(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 4, 15) + self.repeat_times = (2, 1, 4) + + class TestTileOpRank4(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 4, 5, 7) + self.repeat_times = (3, 2, 1, 2) # Situation 2: repeat_times is a list (with tensor) -class TestTileOpRank1_tensor_attr(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.init_data() - repeat_times_tensor = [] - for index, ele in enumerate(self.repeat_times): - repeat_times_tensor.append(("x" + str(index), np.ones( - (1)).astype('int32') * ele)) - - self.inputs = { - 'X': np.random.random(self.ori_shape).astype("float32"), - 'repeat_times_tensor': repeat_times_tensor, - } - self.attrs = {"repeat_times": self.infer_repeat_times} - output = np.tile(self.inputs['X'], self.repeat_times) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def init_data(self): - self.ori_shape = [100] - self.repeat_times = [2] - self.infer_repeat_times = [-1] - - def test_check_output(self): - self.check_output_with_place(self.place) - - def test_check_grad(self): - pass - - -class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr): - def init_data(self): - self.ori_shape = [12, 14] - self.repeat_times = [1, 1] - self.infer_repeat_times = [1, -1] - - -class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr): - def init_data(self): - self.ori_shape = [12, 14] - self.repeat_times = [2, 3] - self.infer_repeat_times = [-1, 3] +class XPUTestTileOpRank1_tensor_attr(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'tile' + self.use_dynamic_create_class = False + + class TestTileOpRank1_tensor_attr(XPUOpTest): + def setUp(self): + self.dtype = self.in_type 
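+            # in_type is injected per dtype by the create_test_class(...)
+            # calls at the bottom of this file, so one test class covers every
+            # dtype reported by get_xpu_op_support_types('tile').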
+ self.__class__.no_need_check_grad = True + self.place = paddle.XPUPlace(0) + self.op_type = "tile" + self.init_data() + repeat_times_tensor = [] + for index, ele in enumerate(self.repeat_times): + repeat_times_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = { + 'X': np.random.random(self.ori_shape).astype(self.dtype), + 'repeat_times_tensor': repeat_times_tensor, + } + self.attrs = {"repeat_times": self.infer_repeat_times} + output = np.tile(self.inputs['X'], self.repeat_times) + self.outputs = {'Out': output} + + def init_data(self): + self.ori_shape = [100] + self.repeat_times = [2] + self.infer_repeat_times = [-1] + + def test_check_output(self): + self.check_output_with_place(self.place) + + class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [1, 1] + self.infer_repeat_times = [1, -1] + + class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [2, 3] + self.infer_repeat_times = [-1, 3] # Situation 3: repeat_times is a tensor -class TestTileOpRank1_tensor(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.init_data() - - self.inputs = { - 'X': np.random.random(self.ori_shape).astype("float32"), - 'RepeatTimes': np.array(self.repeat_times).astype("int32"), - } - self.attrs = {} - output = np.tile(self.inputs['X'], self.repeat_times) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def init_data(self): - self.ori_shape = [100] - self.repeat_times = [2] - - def test_check_output(self): - self.check_output_with_place(self.place) - - def test_check_grad(self): - pass - - -class TestTileOpRank2_tensor(TestTileOpRank1_tensor): - def init_data(self): - self.ori_shape = [12, 14] - self.repeat_times = [2, 3] - - -# Situation 4: input x is Integer -class TestTileOpInteger(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.inputs = { - 'X': np.random.randint( - 10, size=(4, 4, 5)).astype("int32") - } - self.attrs = {'repeat_times': [2, 1, 4]} - output = np.tile(self.inputs['X'], (2, 1, 4)) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def test_check_output(self): - self.check_output_with_place(self.place) - - -# Situation 5: input x is Integer -class TestTileOpInt64_t(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.inputs = { - 'X': np.random.randint( - 10, size=(2, 4, 5)).astype("int64") - } - self.attrs = {'repeat_times': [2, 1, 4]} - output = np.tile(self.inputs['X'], (2, 1, 4)) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def test_check_output(self): - self.check_output_with_place(self.place) - - -# Situation 6: input x is Bool -class TestTileOpBool(XPUOpTest): - def setUp(self): - self.set_xpu() - self.place = paddle.XPUPlace(0) - self.op_type = "tile" - self.inputs = { - 'X': np.random.randint( - 10, size=(2, 4, 5)).astype("bool") - } - self.attrs = {'repeat_times': [2, 1, 4]} - output = np.tile(self.inputs['X'], (2, 1, 4)) - self.outputs = {'Out': output} - - def set_xpu(self): - self.__class__.use_xpu = True - - def test_check_output(self): - self.check_output_with_place(self.place) +class XPUTestTileOpRank1_tensor(XPUOpTestWrapper): + def __init__(self): + 
self.op_name = 'tile' + self.use_dynamic_create_class = False + + class TestTileOpRank1_tensor(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.__class__.no_need_check_grad = True + self.place = paddle.XPUPlace(0) + self.op_type = "tile" + self.init_data() + + self.inputs = { + 'X': np.random.random(self.ori_shape).astype(self.dtype), + 'RepeatTimes': np.array(self.repeat_times).astype("int32"), + } + self.attrs = {} + output = np.tile(self.inputs['X'], self.repeat_times) + self.outputs = {'Out': output} + + def init_data(self): + self.ori_shape = [100] + self.repeat_times = [2] + + def test_check_output(self): + self.check_output_with_place(self.place) + + class TestTileOpRank2_tensor(TestTileOpRank1_tensor): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [2, 3] + + +support_types = get_xpu_op_support_types('tile') +for stype in support_types: + create_test_class(globals(), XPUTestTileOpRank1, stype) + create_test_class(globals(), XPUTestTileOpRank1_tensor_attr, stype) + create_test_class(globals(), XPUTestTileOpRank1_tensor, stype) # Test python API -- GitLab From 579173d8e6a8bb6039671cb6b215730b7b9614c5 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 18 Mar 2022 15:07:18 +0800 Subject: [PATCH 163/176] [Phi] Move infershape of roi_pool to phi (#40682) * move infershape of roi_pool to phi * polish code --- paddle/fluid/operators/roi_pool_op.cc | 78 +++------------------- paddle/phi/infermeta/ternary.cc | 95 ++++++++++++++++++++++++--- paddle/phi/infermeta/ternary.h | 9 +++ 3 files changed, 102 insertions(+), 80 deletions(-) diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 9fd66590cb7..12e33d56c00 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/kernels/roi_pool_kernel.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -27,74 +29,6 @@ class ROIPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "roi_pool"); - OP_INOUT_CHECK(ctx->HasInput("ROIs"), "Input", "ROIs", "roi_pool"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "roi_pool"); - OP_INOUT_CHECK(ctx->HasOutput("Argmax"), "Output", "Argmax", "roi_pool"); - - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, - platform::errors::InvalidArgument( - "The second dimension of RoisNum should " - "be 1, but received dimension is %d", - rois_num_dims.size())); - } - PADDLE_ENFORCE_EQ(input_dims.size(), 4, - platform::errors::InvalidArgument( - "The input data should be a four-dimensional " - "tensor with [N,C,H,W], but received input data with " - " %d dimension", - input_dims.size())); - PADDLE_ENFORCE_EQ( - rois_dims.size(), 2, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor with shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], ...], but received ROIs is " - "%d-dimensional LoDTensor", - rois_dims.size())); - PADDLE_ENFORCE_EQ( - rois_dims[1], phi::kROISize, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor with shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], ...]. 
But the second dimension of " - "the received data is %d", - rois_dims[1])); - - int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::OutOfRange( - "The pooled output height must be greater than 0" - "but received height is %d", - pooled_height)); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::OutOfRange( - "The pooled output width must be greater than 0" - "but received width is %d", - pooled_width)); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::OutOfRange( - "The spatial scale must be greater than 0, " - "but received spatial scale is %f", - spatial_scale)); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = input_dims[1]; - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - - ctx->SetOutputDim("Out", out_dims); - ctx->SetOutputDim("Argmax", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -213,9 +147,13 @@ class ROIPoolGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(roi_pool, RoiPoolInferShapeFunctor, + PD_INFER_META(phi::RoiPoolInferMeta)); + REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, ops::ROIPoolGradMaker, - ops::ROIPoolGradMaker); + ops::ROIPoolGradMaker, + RoiPoolInferShapeFunctor); REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp); REGISTER_OP_VERSION(roi_pool) diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 837750710c9..556fb874470 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -340,29 +340,29 @@ void RoiAlignInferMeta(const MetaTensor& x, PADDLE_ENFORCE_EQ( boxes_num_dims.size(), 1, - phi::errors::InvalidArgument("The size of RoisNum should be 1" + phi::errors::InvalidArgument("The size of boxes_num should be 1" ", but received size = %d", boxes_num_dims.size())); } PADDLE_ENFORCE_EQ(input_dims.size(), 4, phi::errors::InvalidArgument( - "The format of Input(X) in" - "RoIAlignOp is NCHW. And the rank of input must be 4. " + "The format of Input(x) in" + "RoiAlignOp is NCHW. And the rank of input must be 4. " "But received rank = %d", input_dims.size())); PADDLE_ENFORCE_EQ(boxes_dims.size(), 2, - phi::errors::InvalidArgument("The rank of Input(ROIs) " - "in RoIAlignOp should be 2. " - "But the rank of RoIs is %d", + phi::errors::InvalidArgument("The rank of Input(boxes) " + "in RoiAlignOp should be 2. " + "But the rank of boxes is %d", boxes_dims.size())); if (config.is_runtime) { PADDLE_ENFORCE_EQ(boxes_dims[1], 4, phi::errors::InvalidArgument( "The second dimension " - "of Input(ROIs) should be 4. But received the " + "of Input(boxes) should be 4. But received the " "dimension = %d", boxes_dims[1])); } @@ -370,21 +370,21 @@ void RoiAlignInferMeta(const MetaTensor& x, PADDLE_ENFORCE_GT(pooled_height, 0, phi::errors::InvalidArgument( - "The 'pooled_height' attribute in RoIAlignOp is " + "The 'pooled_height' attribute in RoiAlignOp is " "invalid. The height must be greater than 0. But " "received 'pooled_height' = %d", pooled_height)); PADDLE_ENFORCE_GT(pooled_width, 0, phi::errors::InvalidArgument( - "The 'pooled_width' attribute in RoIAlignOp is " + "The 'pooled_width' attribute in RoiAlignOp is " "invalid. The width must be greater than 0. 
But " "received 'pooled_width' = %d", pooled_width)); PADDLE_ENFORCE_GT(spatial_scale, 0.0f, phi::errors::InvalidArgument( - "The 'spatial_scale' attribute in RoIAlignOp is " + "The 'spatial_scale' attribute in RoiAlignOp is " "invalid. The scale must be greater than 0. But " "received 'spatial_scale' = %f", spatial_scale)); @@ -399,6 +399,81 @@ void RoiAlignInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void RoiPoolInferMeta(const MetaTensor& x, + const MetaTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + MetaTensor* out, + MetaTensor* arg_max) { + auto input_dims = x.dims(); + auto boxes_dims = boxes.dims(); + + if (boxes_num) { + auto boxes_num_dims = boxes_num->dims(); + PADDLE_ENFORCE_EQ( + boxes_num_dims.size(), + 1, + phi::errors::InvalidArgument("The second dimension of boxes_num should " + "be 1, but received dimension is %d", + boxes_num_dims.size())); + } + PADDLE_ENFORCE_EQ(input_dims.size(), + 4, + phi::errors::InvalidArgument( + "The input data should be a four-dimensional " + "tensor with [N,C,H,W], but received input data with " + " %d dimension", + input_dims.size())); + PADDLE_ENFORCE_EQ( + boxes_dims.size(), + 2, + phi::errors::InvalidArgument( + "boxes should be a 2-D LoDTensor with shape (num_boxes, 4)" + "given as [[x1, y1, x2, y2], ...], but received boxes is " + "%d-dimensional LoDTensor", + boxes_dims.size())); + PADDLE_ENFORCE_EQ( + boxes_dims[1], + 4, + phi::errors::InvalidArgument( + "boxes should be a 2-D LoDTensor with shape (num_boxes, 4)" + "given as [[x1, y1, x2, y2], ...]. But the second dimension of " + "the received data is %d", + boxes_dims[1])); + + PADDLE_ENFORCE_GT( + pooled_height, + 0, + phi::errors::OutOfRange("The pooled output height must be greater than 0" + "but received height is %d", + pooled_height)); + PADDLE_ENFORCE_GT( + pooled_width, + 0, + phi::errors::OutOfRange("The pooled output width must be greater than 0" + "but received width is %d", + pooled_width)); + PADDLE_ENFORCE_GT( + spatial_scale, + 0.0f, + phi::errors::OutOfRange("The spatial scale must be greater than 0, " + "but received spatial scale is %f", + spatial_scale)); + + auto out_dims = input_dims; + out_dims[0] = boxes_dims[0]; + out_dims[1] = input_dims[1]; + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + + out->set_dims(out_dims); + out->set_dtype(x.dtype()); + arg_max->set_dims(out_dims); + arg_max->set_dtype(DataType::INT64); +} + void ScatterInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& updates, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 0e7b9cb12a4..42a0f35dc1d 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -84,6 +84,15 @@ void RoiAlignInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void RoiPoolInferMeta(const MetaTensor& x, + const MetaTensor& boxes, + paddle::optional boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + MetaTensor* out, + MetaTensor* arg_max); + void ScatterInferMeta(const MetaTensor& x, const MetaTensor& index, const MetaTensor& updates, -- GitLab From 6e1fe4f1a8eb32aa6d6f3080d1ea22e08c3ee11b Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Fri, 18 Mar 2022 15:49:15 +0800 Subject: [PATCH 164/176] Support assign x.shape to dict['key'] in dy2st (#40611) * support assign x.shape to dict['key'] in dy2st * remove replace_dot * refine unit test --- 
.../dygraph_to_static/tensor_shape_transformer.py | 9 +-------- .../unittests/dygraph_to_static/test_tensor_shape.py | 8 ++++++++ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py index e1df2324889..7733226cc09 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py @@ -297,10 +297,6 @@ class TensorShapeTransformer(gast.NodeTransformer): return False def _update_name_to_var_shape(self, node): - def replace_dot(name): - # replace all '.' into '_' - return name.replace('.', '_') - assert isinstance(node, gast.Assign) target_node = node.targets[0] value_node = node.value @@ -315,7 +311,6 @@ class TensorShapeTransformer(gast.NodeTransformer): if value_node.id in self.name_to_var_shape: # TODO(zhhsplendid): is context a problem for the result node of gast.parse? static_shape_var_name = unique_name.generate( - replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse( static_shape_var_name).body[0].value @@ -337,7 +332,6 @@ class TensorShapeTransformer(gast.NodeTransformer): if isinstance(value_node, gast.Attribute): if self._is_var_shape(value_node): # eg: x.shape static_shape_var_name = unique_name.generate( - replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse( static_shape_var_name).body[0].value @@ -370,7 +364,6 @@ class TensorShapeTransformer(gast.NodeTransformer): if isinstance(value_node, gast.Name): if value_node.id in self.name_to_var_shape: static_shape_var_name = unique_name.generate( - replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse( static_shape_var_name).body[0].value @@ -387,7 +380,7 @@ class TensorShapeTransformer(gast.NodeTransformer): self.name_to_var_shape[target_id] = static_shape_var_name elif self._is_var_shape(value_node): # eg: x.shape or x.shape[0] static_shape_var_name = unique_name.generate( - replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse(static_shape_var_name).body[ 0].value static_shape_value_node = copy.deepcopy(value_node) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py index 06d69daa75d..d05be03bbfb 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py @@ -223,6 +223,12 @@ def dyfunc_len_paddle_shape(): print(x) +def dyfunc_dict_assign_shape(): + x = paddle.to_tensor([1, 2]) + a = {} + a['shape'] = x.shape[0] + + # 1. 
Basic tests without control flow class TestTensorShapeBasic(unittest.TestCase): def setUp(self): @@ -592,6 +598,8 @@ class TestPaddleShape(unittest.TestCase): def test_paddle_shape(self): func = paddle.jit.to_static(dyfunc_len_paddle_shape) self.assertEqual('paddle.shape(x)' in func.code, True) + func = paddle.jit.to_static(dyfunc_dict_assign_shape) + self.assertEqual("__static_convert_var_shape_suffix" in func.code, True) if __name__ == '__main__': -- GitLab From aaa71ea43cdef2ba1297cbe8f6b10b1ef651dc5e Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Fri, 18 Mar 2022 16:34:46 +0800 Subject: [PATCH 165/176] [NPU] fix fp16 (PART I) (#40259) [NPU] fix fp16 (PART I) --- paddle/fluid/operators/batch_norm_op_npu.cc | 12 +++++------ .../unittests/npu/test_batch_norm_op_npu.py | 20 ++++++++++++++++--- .../tests/unittests/npu/test_cos_op_npu.py | 2 -- .../unittests/npu/test_dropout_op_npu.py | 2 -- .../tests/unittests/npu/test_exp_op_npu.py | 2 -- .../unittests/npu/test_hard_sigmoid_op_npu.py | 3 --- .../unittests/npu/test_hard_swish_op_npu.py | 2 -- .../tests/unittests/npu/test_log_op_npu.py | 8 ++------ .../tests/unittests/npu/test_norm_op_npu.py | 3 --- .../tests/unittests/npu/test_p_norm_op_npu.py | 2 -- .../tests/unittests/npu/test_pool2d_op_npu.py | 3 --- .../unittests/npu/test_reciprocal_op_npu.py | 2 -- .../tests/unittests/npu/test_relu6_op_npu.py | 2 -- .../unittests/npu/test_sigmoid_op_npu.py | 2 -- .../test_softmax_with_cross_entropy_op_npu.py | 2 -- .../tests/unittests/npu/test_sqrt_op_npu.py | 11 +++++----- .../tests/unittests/npu/test_square_op_npu.py | 2 -- .../tests/unittests/npu/test_tanh_op_npu.py | 11 +++++----- 18 files changed, 35 insertions(+), 56 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc index a70b6e99116..ae03ecbcb16 100644 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -76,10 +76,10 @@ class NPUBatchNormOpKernel : public framework::OpKernel { auto *variance_out = ctx.Output("VarianceOut"); auto *saved_mean = ctx.Output("SavedMean"); auto *saved_variance = ctx.Output("SavedVariance"); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); // if MomentumTensor is set, use MomentumTensor value, momentum // is only used in this training branch @@ -170,8 +170,8 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { auto stream = ctx.template device_context().stream(); if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); if (use_global_stats) { const auto *running_mean = ctx.Input("Mean"); const auto *running_variance = ctx.Input("Variance"); diff --git a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py index 877f9904f34..e01b2b691a2 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py @@ -144,6 +144,7 @@ class TestBatchNormOpTraining(unittest.TestCase): def setUp(self): self.set_npu() + 
self.init_dtype() self.use_mkldnn = False self.fuse_with_relu = False self.data_formats = ["NCHW", "NHWC"] @@ -153,6 +154,9 @@ class TestBatchNormOpTraining(unittest.TestCase): self.init_kernel_type() self.init_test_case() + def init_dtype(self): + self.dtype = np.float32 + def init_test_case(self): self.use_global_stats = False self.no_grad_set = set() @@ -210,11 +214,16 @@ class TestBatchNormOpTraining(unittest.TestCase): scale_shape = [c] np.random.seed(123) - x = np.random.random_sample(shape).astype(np.float32) + x = np.random.random_sample(shape).astype(self.dtype) scale = np.random.random_sample(scale_shape).astype(np.float32) bias = np.random.random_sample(scale_shape).astype(np.float32) mean, variance = self.set_mean_variance(scale_shape, x, data_layout) - y_grad = np.random.random_sample(shape).astype(np.float32) + + if self.dtype == np.float16: + mean = mean.astype(np.float32) + variance = variance.astype(np.float32) + + y_grad = np.random.random_sample(shape).astype(self.dtype) momentum_var = np.array([momentum]).astype(np.float32) y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward( @@ -275,7 +284,7 @@ class TestBatchNormOpTraining(unittest.TestCase): inputs=inputs, outputs=outputs, attrs=attrs) - block.create_var(name='y@GRAD', dtype='float32', shape=y.shape) + block.create_var(name='y@GRAD', dtype=self.dtype, shape=y.shape) # generate backward op_desc grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( @@ -320,6 +329,11 @@ class TestBatchNormOpTraining(unittest.TestCase): pass +class TestFP16BatchNormOpTraining(TestBatchNormOpTraining): + def init_dtype(self): + self.dtype = np.float16 + + class TestBatchNormOpTrainingCase1(TestBatchNormOpTraining): def init_test_case(self): self.use_global_stats = False diff --git a/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py index 9b29fc812fa..a4769442b08 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py @@ -51,8 +51,6 @@ class TestCos(OpTest): self.check_output_with_place(self.place, atol=1e-7) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') diff --git a/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py index bd9022f56a3..fea8502f2d7 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_dropout_op_npu.py @@ -56,8 +56,6 @@ class TestDropoutOp(OpTest): self.check_output_with_place(self.place) def test_check_grad_normal(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') diff --git a/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py index ccd5f0649d8..6be2fe0086b 100755 --- a/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_exp_op_npu.py @@ -50,8 +50,6 @@ class TestExpNPUOP(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') def init_dtype(self): diff --git a/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py index 
d7aafccc88c..f1d89cb8d56 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_hard_sigmoid_op_npu.py @@ -59,9 +59,6 @@ class TestNPUHardSigmoid(OpTest): self.check_output_with_place(self.place, atol=1e-5) def test_check_grad(self): - if self.dtype == np.float16: - return - self.check_grad_with_place(self.place, ['X'], 'Out') def set_npu(self): diff --git a/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py index 32042ba83a9..9495cdb8a55 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_hard_swish_op_npu.py @@ -66,8 +66,6 @@ class TestHardSwishNPU(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return # There is a problem that precision of grad result using float32 # can't satisfy the default precision requirement # when compared with numeric_grads, but the results on diff --git a/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py index 9534431e99a..5da3cb0ce56 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py @@ -50,12 +50,8 @@ class TestLog(OpTest): def test_check_output(self): self.check_output_with_place(self.place) - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + def test_check_grad(self): + self.check_grad(['X'], 'Out') class TestLogFp16(OpTest): diff --git a/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py index 2c41f09ff51..8e28b3fe413 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py @@ -54,9 +54,6 @@ class TestNPUNormOp(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return - self.check_grad_with_place( self.place, ['X'], 'Out', max_relative_error=0.006) diff --git a/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py index 3b75cba60b1..a7ca4edc524 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_p_norm_op_npu.py @@ -51,8 +51,6 @@ class TestPnormOp(OpTest): self.check_output_with_place(paddle.NPUPlace(0)) def test_check_grad(self): - if self.dtype == "float16": - return self.check_grad_with_place( paddle.NPUPlace(0), ['X'], 'Out', user_defined_grads=self.gradient) diff --git a/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py index 2b8550a88de..4822abc3b25 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_pool2d_op_npu.py @@ -67,9 +67,6 @@ def create_test_fp16_class(parent): self.use_cudnn = False self.dtype = np.float16 - def test_check_grad(self): - return - cls_name = "{0}_{1}".format(parent.__name__, "Fp16Op") TestFp16Case.__name__ = cls_name globals()[cls_name] = TestFp16Case diff --git a/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py 
b/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py index e8f5de005d4..899d4ef43bd 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py @@ -40,8 +40,6 @@ class TestNPUReciprocal(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['X'], 'Out', max_relative_error=0.01) diff --git a/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py index 601a351c015..b1cb5e02a73 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py @@ -56,8 +56,6 @@ class TestRelu6(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place(self.place, ['X'], 'Out') def init_dtype(self): diff --git a/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py index 4516b25b59d..489f8bfb116 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_sigmoid_op_npu.py @@ -44,8 +44,6 @@ class TestNPUSigmoid(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return self.check_grad_with_place( self.place, ['X'], 'Out', max_relative_error=0.01) diff --git a/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py index 8d78ee6a97e..f0ca7788345 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_softmax_with_cross_entropy_op_npu.py @@ -87,8 +87,6 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return # fp32 has low precision, cpu and npu both need to relax the max_relative_error if using fp32 self.check_grad_with_place( self.place, ['Logits'], diff --git a/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py index acb99746d23..24b34fa625c 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py @@ -50,12 +50,11 @@ class TestSqrt(OpTest): def test_check_output(self): self.check_output_with_place(self.place) - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + def test_check_grad(self): + if self.dtype == np.float16: + self.check_grad(['X'], 'Out', max_relative_error=0.009) + else: + self.check_grad(['X'], 'Out', max_relative_error=0.009) class TestSqrtFp16(OpTest): diff --git a/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py index caf55b4850f..170f6b6ca4f 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_square_op_npu.py @@ -51,8 +51,6 @@ class TestSquare(OpTest): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return 
self.check_grad_with_place(self.place, ['X'], 'Out') diff --git a/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py index 55be94da2b7..375eef12291 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_tanh_op_npu.py @@ -50,12 +50,11 @@ class TestTanh(OpTest): def test_check_output(self): self.check_output_with_place(self.place) - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + def test_check_grad(self): + if self.dtype == np.float16: + self.check_grad(['X'], 'Out', max_relative_error=0.009) + else: + self.check_grad(['X'], 'Out', max_relative_error=0.009) class TestTanhFp16(OpTest): -- GitLab From 161d27dc1efcc05eb5cc69090c161faf167b5db9 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Fri, 18 Mar 2022 17:10:53 +0800 Subject: [PATCH 166/176] set +x to close showing command, update check_change code with linux (#40456) --- paddle/fluid/inference/api/demo_ci/run.sh | 6 +- paddle/scripts/paddle_build.bat | 133 +++++----------------- tools/check_added_ut.sh | 7 +- tools/windows/check_change_of_unittest.sh | 41 +++++++ 4 files changed, 77 insertions(+), 110 deletions(-) create mode 100644 tools/windows/check_change_of_unittest.sh diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 79a31555c7f..2c0945cd5b3 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -53,7 +53,11 @@ if [ $7 == ON ]; then if [[ -e "MobileNetV2.inference.model.tar.gz" ]]; then echo "MobileNetV2.inference.model.tar.gz has been downloaded." else - wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz + if [ $WIN_DETECT != "" ]; then + wget -q -Y off http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz + else + wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz + fi tar xzf *.tar.gz fi cd .. 
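The later hunks of this commit move the deleted-unit-test check into a standalone script: check_added_ut.sh now records the tests that exist on develop (br-ut) but are missing from the PR build (pr-ut) into deleted_ut, and the new tools/windows/check_change_of_unittest.sh fails the job when that list is non-empty and unapproved. As a minimal sketch of that whole-line set difference (hypothetical Python for illustration only, not the repository's tools/diff_unittest.py):

    def find_deleted_tests(dev_spec_path, pr_spec_path):
        # Equivalent in spirit to `grep -F -x -v -f pr-ut br-ut`: report the
        # exact test names present in the develop listing but absent from the
        # PR listing (both produced by `ctest -N`).
        with open(dev_spec_path) as f:
            dev_tests = {line.strip() for line in f if line.strip()}
        with open(pr_spec_path) as f:
            pr_tests = {line.strip() for line in f if line.strip()}
        return sorted(dev_tests - pr_tests)

    if __name__ == "__main__":
        deleted = find_deleted_tests("br-ut", "pr-ut")
        if deleted:
            print("Deleted unit tests (approval required):")
            for name in deleted:
                print(" -", name)
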
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 75afa4ef43f..78a863040ad 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -55,7 +55,6 @@ wmic process where name="python.exe" call terminate 2>NUL rem ------initialize common variable------ if not defined GENERATOR set GENERATOR="Visual Studio 15 2017 Win64" -if not defined BRANCH set BRANCH=develop if not defined WITH_TENSORRT set WITH_TENSORRT=ON if not defined TENSORRT_ROOT set TENSORRT_ROOT=D:/TensorRT if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=Auto @@ -70,7 +69,6 @@ if not defined WITH_ONNXRUNTIME set WITH_ONNXRUNTIME=OFF if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON if not defined WITH_STATIC_LIB set WITH_STATIC_LIB=ON if not defined WITH_TPCACHE set WITH_TPCACHE=OFF -if not defined WITH_CLCACHE set WITH_CLCACHE=OFF if not defined WITH_CACHE set WITH_CACHE=OFF if not defined WITH_SCCACHE set WITH_SCCACHE=OFF if not defined WITH_UNITY_BUILD set WITH_UNITY_BUILD=OFF @@ -145,17 +143,6 @@ if %day_now% NEQ %day_before% ( echo %day_now% > %cache_dir%\day.txt type %cache_dir%\day.txt rmdir %BUILD_DIR% /s/q - - : clear third party cache every once in a while - if %day_now% EQU 21 ( - rmdir %cache_dir%\third_party /s/q - ) - if %day_now% EQU 11 ( - rmdir %cache_dir%\third_party /s/q - ) - if %day_now% EQU 01 ( - rmdir %cache_dir%\third_party /s/q - ) goto :mkbuild ) @@ -212,6 +199,7 @@ echo There is not sccache in this PC, will install sccache. echo Download package from https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe %PYTHON_ROOT%\python.exe -c "import wget;wget.download('https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe')" xcopy sccache.exe %PYTHON_ROOT%\ /Y +del sccache.exe goto:eof rem -------Caching strategy 2: End -------------------------------- @@ -232,13 +220,12 @@ set WITH_AVX=ON set MSVC_STATIC_CRT=OFF set ON_INFER=OFF set WITH_TENSORRT=ON +set WITH_INFERENCE_API_TEST=OFF call :cmake || goto cmake_error call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error call :test_unit || goto test_unit_error -:: call :test_inference || goto test_inference_error -:: call :check_change_of_unittest || goto check_change_of_unittest_error goto:success rem ------PR CI windows check for OPENBLAS/CPU------ @@ -254,8 +241,6 @@ call :cmake || goto cmake_error call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error call :test_unit || goto test_unit_error -:: call :test_inference || goto test_inference_error -:: call :check_change_of_unittest || goto check_change_of_unittest_error goto:success rem ------PR CI windows check for unittests and inference in CUDA11-MKL-AVX---------- @@ -265,7 +250,6 @@ set WITH_GPU=ON set WITH_AVX=ON set MSVC_STATIC_CRT=ON set ON_INFER=ON -set WITH_TESTING=ON set WITH_TENSORRT=ON set WITH_INFERENCE_API_TEST=ON @@ -274,7 +258,8 @@ call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error call :test_unit || goto test_unit_error ::call :test_inference || goto test_inference_error -:: call :check_change_of_unittest || goto check_change_of_unittest_error +::call :test_inference_ut || goto test_inference_ut_error +call :check_change_of_unittest || goto check_change_of_unittest_error goto:success rem ------Build windows avx whl package------ @@ -365,18 +350,6 @@ if "%WITH_GPU%"=="ON" ( nvidia-smi 2>NUL ) -rem ------pre install clcache and init config---------- -rem pip install clcache --user -pip uninstall -y 
clcache -:: set USE_CLCACHE to enable clcache -rem set USE_CLCACHE=1 -:: In some scenarios, CLCACHE_HARDLINK can save one file copy. -rem set CLCACHE_HARDLINK=1 -:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported -rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 -:: set maximum cache size to 20G -rem clcache.exe -M 21474836480 - rem ------set third_party cache dir------ if "%WITH_TPCACHE%"=="OFF" ( @@ -384,6 +357,25 @@ if "%WITH_TPCACHE%"=="OFF" ( goto :cmake_impl ) +rem clear third party cache every ten days +for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%# +set day_now=%datetime:~6,2% +set day_before=-1 +set /p day_before=< %cache_dir%\day_third_party.txt +if %day_now% NEQ %day_before% ( + echo %day_now% > %cache_dir%\day_third_party.txt + type %cache_dir%\day_third_party.txt + if %day_now% EQU 21 ( + rmdir %cache_dir%\third_party /s/q + ) + if %day_now% EQU 11 ( + rmdir %cache_dir%\third_party /s/q + ) + if %day_now% EQU 01 ( + rmdir %cache_dir%\third_party /s/q + ) +) + echo set -ex > cache.sh echo md5_content=$(cat %work_dir:\=/%/cmake/external/*.cmake ^|md5sum ^| awk '{print $1}') >> cache.sh echo echo ${md5_content}^>md5.txt >> cache.sh @@ -535,11 +527,7 @@ echo Build Paddle the %build_times% time: if %GENERATOR% == "Ninja" ( ninja all ) else ( - if "%WITH_CLCACHE%"=="OFF" ( - MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj - ) else ( - MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj - ) + MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj ) if %ERRORLEVEL% NEQ 0 ( @@ -774,77 +762,8 @@ echo ======================================== echo Step 6. Check whether deleting a unit test ... echo ======================================== -cd /d %work_dir%\%BUILD_DIR% -echo set -e> check_change_of_unittest.sh -echo set +x>> check_change_of_unittest.sh -echo GITHUB_API_TOKEN=%GITHUB_API_TOKEN% >> check_change_of_unittest.sh -echo GIT_PR_ID=%AGILE_PULL_ID% >> check_change_of_unittest.sh -echo BRANCH=%BRANCH%>> check_change_of_unittest.sh -echo if [ "${GITHUB_API_TOKEN}" == "" ] ^|^| [ "${GIT_PR_ID}" == "" ];then>> check_change_of_unittest.sh -echo exit 0 >> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo set -x>> check_change_of_unittest.sh -echo cat ^<^> check_change_of_unittest.sh -echo ============================================ >> check_change_of_unittest.sh -echo Generate unit tests.spec of this PR. 
>> check_change_of_unittest.sh -echo ============================================ >> check_change_of_unittest.sh -echo EOF>> check_change_of_unittest.sh -echo spec_path=$(pwd)/UNITTEST_PR.spec>> check_change_of_unittest.sh -echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>> check_change_of_unittest.sh -echo num=$(awk 'END{print NR}' ${spec_path})>> check_change_of_unittest.sh -echo echo "Windows 1 card TestCases count is $num">> check_change_of_unittest.sh -echo echo ipipe_log_param_Windows_1_Card_TestCases_Count: $num>> check_change_of_unittest.sh -echo UPSTREAM_URL='https://github.com/PaddlePaddle/Paddle'>> check_change_of_unittest.sh -echo origin_upstream_url=`git remote -v ^| awk '{print $1, $2}' ^| uniq ^| grep upstream ^| awk '{print $2}'`>> check_change_of_unittest.sh -echo if [ "$origin_upstream_url" == "" ]; then>> check_change_of_unittest.sh -echo git remote add upstream $UPSTREAM_URL.git>> check_change_of_unittest.sh -echo elif [ "$origin_upstream_url" ^!= "$UPSTREAM_URL" ] ^\>> check_change_of_unittest.sh -echo ^&^& [ "$origin_upstream_url" ^!= "$UPSTREAM_URL.git" ]; then>> check_change_of_unittest.sh -echo git remote remove upstream>> check_change_of_unittest.sh -echo git remote add upstream $UPSTREAM_URL.git>> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo if [ ! -e "$(pwd)/../.git/refs/remotes/upstream/$BRANCH" ]; then>> check_change_of_unittest.sh -echo git fetch upstream $BRANCH # develop is not fetched>> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo git checkout -b origin_pr >> check_change_of_unittest.sh -echo git checkout -f $BRANCH >> check_change_of_unittest.sh -echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ --DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DON_INFER=%ON_INFER% ^ --DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ --DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ --DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^ --DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% >> check_change_of_unittest.sh -echo cat ^<^> check_change_of_unittest.sh -echo ============================================ >> check_change_of_unittest.sh -echo Generate unit tests.spec of develop. 
>> check_change_of_unittest.sh -echo ============================================ >> check_change_of_unittest.sh -echo EOF>> check_change_of_unittest.sh -echo spec_path=$(pwd)/UNITTEST_DEV.spec>> check_change_of_unittest.sh -echo ctest -N ^| awk -F ':' '{print $2}' ^| sed '/^^$/d' ^| sed '$d' ^> ${spec_path}>> check_change_of_unittest.sh -echo unittest_spec_diff=`python $(pwd)/../tools/diff_unittest.py $(pwd)/UNITTEST_DEV.spec $(pwd)/UNITTEST_PR.spec`>> check_change_of_unittest.sh -echo if [ "$unittest_spec_diff" ^!= "" ]; then>> check_change_of_unittest.sh -echo set +x>> check_change_of_unittest.sh -echo approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`>> check_change_of_unittest.sh -echo set -x>> check_change_of_unittest.sh -echo if [ "$approval_line" ^!= "" ]; then>> check_change_of_unittest.sh -echo APPROVALS=`echo ${approval_line} ^|python $(pwd)/../tools/check_pr_approval.py 1 22165420 52485244 6836917`>> check_change_of_unittest.sh -echo echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}">> check_change_of_unittest.sh -echo if [ "${APPROVALS}" == "FALSE" ]; then>> check_change_of_unittest.sh -echo echo "************************************" >> check_change_of_unittest.sh -echo echo -e "It is forbidden to disable or delete the unit-test.\n" >> check_change_of_unittest.sh -echo echo -e "If you must delete it temporarily, please add it to[https://github.com/PaddlePaddle/Paddle/wiki/Temporarily-disabled-Unit-Test]." >> check_change_of_unittest.sh -echo echo -e "Then you must have one RD (kolinwei(recommended) or zhouwei25) approval for the deletion of unit-test. \n" >> check_change_of_unittest.sh -echo echo -e "If you have any problems about deleting unit-test, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/Deleting-unit-test-is-forbidden]. 
\n" >> check_change_of_unittest.sh -echo echo -e "Following unit-tests are deleted in this PR: \n ${unittest_spec_diff} \n" >> check_change_of_unittest.sh -echo echo "************************************" >> check_change_of_unittest.sh -echo exit 1 >> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo else>> check_change_of_unittest.sh -echo exit 1 >> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo fi>> check_change_of_unittest.sh -echo git checkout -f origin_pr >> check_change_of_unittest.sh -%cache_dir%\tools\busybox64.exe bash check_change_of_unittest.sh +%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\windows\check_change_of_unittest.sh + goto:eof :check_change_of_unittest_error diff --git a/tools/check_added_ut.sh b/tools/check_added_ut.sh index 2a9fb842862..5466a1cdd59 100644 --- a/tools/check_added_ut.sh +++ b/tools/check_added_ut.sh @@ -52,9 +52,10 @@ if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then elif [[ "$SYSTEM" == "Windows_NT" ]];then bash $PADDLE_ROOT/win_cmake.sh >prec_build.log 2>&1 fi -ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | grep 'test' > $PADDLE_ROOT/br-ut +# remove line ended with .exe to get correct deleted_ut list +ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | sed '/\.exe$/d' | grep 'test' > $PADDLE_ROOT/br-ut cd $PADDLE_ROOT/build -ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | grep 'test' > $PADDLE_ROOT/pr-ut +ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | sed '/\.exe$/d' | grep 'test' > $PADDLE_ROOT/pr-ut cd $PADDLE_ROOT grep -F -x -v -f br-ut pr-ut > $PADDLE_ROOT/added_ut if [[ "$SYSTEM" == 'Linux' ]];then @@ -66,6 +67,8 @@ rm -rf prec_build if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then rm $PADDLE_ROOT/br-ut $PADDLE_ROOT/pr-ut $PADDLE_ROOT/paddle/scripts/paddle_build_pre.sh elif [[ "$SYSTEM" == "Windows_NT" ]];then + # get the deleted ut list in windows, will be used in check_change_of_unittest.sh + grep -F -x -v -f pr-ut br-ut > $PADDLE_ROOT/deleted_ut rm $PADDLE_ROOT/br-ut $PADDLE_ROOT/pr-ut $PADDLE_ROOT/win_cmake.sh fi git checkout -f $CURBRANCH diff --git a/tools/windows/check_change_of_unittest.sh b/tools/windows/check_change_of_unittest.sh new file mode 100644 index 00000000000..576f0e5d238 --- /dev/null +++ b/tools/windows/check_change_of_unittest.sh @@ -0,0 +1,41 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -e +set +x +export PADDLE_ROOT="$(cd "$PWD/../" && pwd )" +GITHUB_API_TOKEN=$GITHUB_API_TOKEN +GIT_PR_ID=$AGILE_PULL_ID +BRANCH=$BRANCH +if [ "${GITHUB_API_TOKEN}" == "" ] || [ "${GIT_PR_ID}" == "" ];then + exit 0 +fi + +unittest_spec_diff=$(cat $PADDLE_ROOT/deleted_ut | sed 's/^/ - /g') +if [ "$unittest_spec_diff" != "" ]; then + approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` + APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 22165420 52485244 32428676 45041955` + echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" + if [ "${APPROVALS}" == "FALSE" ]; then + echo "************************************" + echo -e "It is forbidden to disable or delete the unit-test.\n" + echo -e "If you must delete it temporarily, please add it to[https://github.com/PaddlePaddle/Paddle/wiki/Temporarily-disabled-Unit-Test]." + echo -e "Then you must have one RD (kolinwei(recommended), chalsliu, XieYunshen or zhouwei25) approval for the deletion of unit-test. \n" + echo -e "If you have any problems about deleting unit-test, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/Deleting-unit-test-is-forbidden]. \n" + echo -e "Following unit-tests are deleted in this PR: \n${unittest_spec_diff} \n" + echo "************************************" + exit 6 + fi +fi +set -x -- GitLab From 64a7cbd3136330cc9d55de7927e9865d8554a85a Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Fri, 18 Mar 2022 17:45:27 +0800 Subject: [PATCH 167/176] [Phi]Move hierarchical_sigmoid kernel to phi (#40553) * first commit * fix compile error * support std::vector * fix * fix op support on GPU by chenweihang * pass test * infershape * add set_dtype * fix order * fix * unify the impl of dt and sr * fix --- paddle/fluid/framework/operator.cc | 25 +- paddle/fluid/imperative/prepared_operator.h | 4 + .../operators/hierarchical_sigmoid_op.cc | 55 +---- .../fluid/operators/hierarchical_sigmoid_op.h | 222 ------------------ paddle/phi/infermeta/multiary.cc | 34 +++ paddle/phi/infermeta/multiary.h | 17 ++ paddle/phi/kernels/CMakeLists.txt | 5 +- .../kernels/cpu/hierarchical_sigmoid_grad.h | 110 +++++++++ .../cpu/hierarchical_sigmoid_grad_kernel.cc | 71 ++++++ .../cpu/hierarchical_sigmoid_kernel.cc | 115 +++++++++ .../hierarchical_sigmoid_grad_kernel.h | 42 ++++ .../phi/kernels/hierarchical_sigmoid_kernel.h | 40 ++++ .../hierarchical_sigmoid_grad_kernel.cc | 99 ++++++++ .../hierarchical_sigmoid_grad_kernel.h | 45 ++++ .../ops/compat/hierarchical_sigmoid_sig.cc | 83 +++++++ 15 files changed, 696 insertions(+), 271 deletions(-) delete mode 100644 paddle/fluid/operators/hierarchical_sigmoid_op.h create mode 100644 paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h create mode 100644 paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc create mode 100644 paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h create mode 100644 paddle/phi/kernels/hierarchical_sigmoid_kernel.h create mode 100644 paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc create mode 100644 paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h create mode 100644 paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ec28c98d598..42fbeb5d29c 100644 --- 
a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -628,10 +628,12 @@ std::vector ExecutionContext::MultiOutput( bool OpSupportGPU(const std::string& op_type) { // check in new Function kernel first + bool has_phi_kernel = false; auto& kernel_factory = phi::KernelFactory::Instance(); auto kernel_key_map = kernel_factory.SelectKernelMap(phi::TransToPhiKernelName(op_type)); for (auto& kernel : kernel_key_map) { + has_phi_kernel = true; if (platform::is_gpu_place(phi::TransToPhiPlace(kernel.first.backend()))) { return true; } @@ -639,12 +641,19 @@ bool OpSupportGPU(const std::string& op_type) { auto& all_kernels = OperatorWithKernel::AllOpKernels(); auto it = all_kernels.find(op_type); - if (it == all_kernels.end()) { - // All control operator must support GPU - return true; - } - for (auto& kern_pair : it->second) { - if (platform::is_gpu_place(kern_pair.first.place_)) { + if (it != all_kernels.end()) { + for (auto& kern_pair : it->second) { + if (platform::is_gpu_place(kern_pair.first.place_)) { + return true; + } + } + } else { + if (has_phi_kernel) { + // if has phi kernel, but not find phi gpu kernel and fluid gpu kernel, + // this op doesn't support GPU + return false; + } else { + // All control operator must support GPU return true; } } @@ -2347,6 +2356,10 @@ void OperatorWithKernel::BuildPhiKernelContext( const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr_it->second); pt_kernel_context->EmplaceBackAttr(vector_int_attr); + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr_it->second)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` when construct " diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index f70f44878e3..9daac181d57 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -541,6 +541,10 @@ void BuildDygraphPhiKernelContext( } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::vector, attr)); + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` when construct " diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 9575ab54b32..93f0d3d334f 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/hierarchical_sigmoid_op.h" #include #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/multiary.h" + namespace paddle { namespace operators { @@ -60,31 +64,6 @@ namespace operators { class HierarchicalSigmoidOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "hsigmoid"); - OP_INOUT_CHECK(ctx->HasOutput("PreOut"), "Output", "PreOut", "hsigmoid"); - - auto with_prefetch = ctx->Attrs().Get("remote_prefetch"); - if (with_prefetch) { - OP_INOUT_CHECK(ctx->HasOutput("W_Out"), "Output", "W_Out", "hsigmoid"); - } - const int64_t input_dims = ctx->GetInputDim("X")[0]; - const int64_t label_dims = ctx->GetInputDim("Label")[0]; - PADDLE_ENFORCE_EQ(input_dims, label_dims, - platform::errors::InvalidArgument( - "The first dimension of " - "input and label is expected to be the same. " - "But received input's first dimension is %d; " - "label's first dimension is %d.", - input_dims, label_dims)); - - std::vector output_shape({input_dims, 1}); - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->ShareLoD("X", /*->*/ "Out"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -272,22 +251,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER( } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR( - hierarchical_sigmoid, ops::HierarchicalSigmoidOp, - ops::HierarchicalSigmoidOpMaker, - ops::HierarchicalSigmoidGradMaker, - ops::HierarchicalSigmoidGradMaker); +DECLARE_INFER_SHAPE_FUNCTOR(hierarchical_sigmoid, + HierarchicalSigmoidInferShapeFunctor, + PD_INFER_META(phi::HierarchicalSigmoidInferMeta)); +REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp, + ops::HierarchicalSigmoidOpMaker, + ops::HierarchicalSigmoidGradMaker, + ops::HierarchicalSigmoidGradMaker, + HierarchicalSigmoidInferShapeFunctor); REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp, ops::HierarchicalSigmoidGradOpGradVarTypeInference, ops::HierarchicalSigmoidGradOpNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - hierarchical_sigmoid, - ops::HierarchicalSigmoidOpKernel, - ops::HierarchicalSigmoidOpKernel); -REGISTER_OP_CPU_KERNEL( - hierarchical_sigmoid_grad, - ops::HierarchicalSigmoidGradOpKernel, - ops::HierarchicalSigmoidGradOpKernel); diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h deleted file mode 100644 index f11b28cfefb..00000000000 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ /dev/null @@ -1,222 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/clip_op.h" -#include "paddle/fluid/operators/math/matrix_bit_code.h" -#include "paddle/fluid/platform/transform.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; -using platform::Transform; -using framework::LoDTensor; - -static std::vector PathToRows(const LoDTensor& path) { - std::set rows; - const int64_t* paths = path.data(); - for (int64_t i = 0; i < path.numel(); ++i) { - int64_t row = paths[i]; - if (row < 0) { - continue; - } - rows.emplace(row); - } - return std::vector(rows.begin(), rows.end()); -} -template -class HierarchicalSigmoidOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& in = GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", - "HierarchicalSigmoid"); - auto& w = GET_DATA_SAFELY(ctx.Input("W"), "Input", "W", - "HierarchicalSigmoid"); - auto* path = ctx.Input("PathTable"); - auto* code = ctx.Input("PathCode"); - auto& label = GET_DATA_SAFELY(ctx.Input("Label"), "Input", - "Label", "HierarchicalSigmoid"); - auto* bias = ctx.Input("Bias"); - auto* out = ctx.Output("Out"); - auto* pre_out = ctx.Output("PreOut"); - size_t num_classes = static_cast(ctx.Attr("num_classes")); - // for remote prefetch - - bool is_custom = false; - if (path) { - is_custom = true; - } - int64_t code_length = - path ? path->dims()[1] : math::FindLastSet(num_classes - 1); - int64_t batch_size = in.dims()[0]; - LoDTensor sum; - auto& dev_ctx = ctx.template device_context(); - auto* pre_out_data = pre_out->mutable_data( - phi::make_ddim({batch_size, code_length}), ctx.GetPlace()); - auto pre_out_mat = EigenMatrix::From(*pre_out); - // Not all class(leaf) nodes' path lengths equal code_length, thus init as - // 0s can avoid out of path's loss. - phi::funcs::SetConstant zero; - zero(dev_ctx, pre_out, static_cast(0.0)); - auto& place = *ctx.template device_context().eigen_device(); - phi::funcs::RowwiseSum row_sum; - - std::unique_ptr> bit_code; - if (!is_custom) { - bit_code.reset(new math::MatrixBitCodeFunctor( - num_classes, label.template data())); - } else { - bit_code.reset(new math::MatrixBitCodeFunctor( - *path, *code, label.template data())); - } - - std::vector sum_dims({batch_size, 1UL}); - sum.mutable_data(phi::make_ddim(sum_dims), ctx.GetPlace()); - auto sum_mat = EigenMatrix::From(sum); - out->mutable_data(ctx.GetPlace()); - auto out_mat = framework::EigenMatrix::From(*out); - if (bias) { - bit_code->Add(*bias, pre_out); - } - bit_code->Mul(pre_out, w, in); - // clip to [-40, 40] - Transform trans; - trans(ctx.template device_context(), pre_out_data, - pre_out_data + pre_out->numel(), pre_out_data, - ClipFunctor(static_cast(-40.0), static_cast(40.0))); - bit_code->Sum(*pre_out, out, static_cast(-1)); - // use softrelu to calculate cross entropy - pre_out_mat.device(place) = (static_cast(1.0) + pre_out_mat.exp()).log(); - row_sum(dev_ctx, *pre_out, &sum); - // TODO(guosheng): Subtract the out of path's loss, since not all - // class(leaf) nodes' path lengths equal code_length. But it won't break the - // gradient check since both have the out of path's loss and will cancel out - // each other. 
- out_mat.device(place) = sum_mat + out_mat; - } -}; - -template -class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& in = GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", - "HierarchicalSigmoidGrad"); - auto& w = GET_DATA_SAFELY(ctx.Input("W"), "Input", "W", - "HierarchicalSigmoidGrad"); - auto* path = ctx.Input("PathTable"); - auto* code = ctx.Input("PathCode"); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - bool is_sparse = ctx.Attr("is_sparse"); - auto& dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; - auto& label = GET_DATA_SAFELY(ctx.Input("Label"), "Input", - "Label", "HierarchicalSigmoidGrad"); - auto& pre_out = GET_DATA_SAFELY(ctx.Input("PreOut"), "Input", - "PreOut", "HierarchicalSigmoidGrad"); - auto& out_grad = GET_DATA_SAFELY( - ctx.Input(framework::GradVarName("Out")), "Input", - framework::GradVarName("Out"), "HierarchicalSigmoidGrad"); - LoDTensor pre_out_grad; - - pre_out_grad.mutable_data(pre_out.dims(), ctx.GetPlace()); - in_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, in_grad, static_cast(0.0)); - - size_t num_classes = static_cast(ctx.Attr("num_classes")); - - bool is_custom = false; - if (path) { - is_custom = true; - } - - std::unique_ptr> bit_code; - if (!is_custom) { - bit_code.reset(new math::MatrixBitCodeFunctor( - num_classes, label.template data())); - } else { - bit_code.reset(new math::MatrixBitCodeFunctor( - *path, *code, label.template data())); - } - - // softrelu derivative - - auto blas = phi::funcs::GetBlas(ctx); - - auto* pre_out_grad_data = pre_out_grad.data(); - auto* pre_out_data = pre_out.template data(); - auto n = pre_out.numel(); - blas.VEXP(n, pre_out_data, pre_out_grad_data); - blas.VINV(n, pre_out_grad_data, pre_out_grad_data); - for (int64_t i = 0; i < n; ++i) { - pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i]; - } - bit_code->Sub(&pre_out_grad); // the gradient of clip(w * x + b) - auto* out_grad_data = out_grad.template data(); - - int64_t dim0 = pre_out_grad.dims()[0]; - int64_t dim1 = pre_out_grad.dims()[1]; - for (int64_t i = 0; i < dim0; ++i) { - T tmp = out_grad_data[i]; - blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1); - } - // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to - // be consistent with the clipping in forward. 
- auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); - if (bias_grad) { - bias_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, bias_grad, static_cast(0.0)); - bit_code->AddGrad(pre_out_grad, bias_grad); - } - if (!is_sparse) { - auto* w_grad = ctx.Output(framework::GradVarName("W")); - w_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, w_grad, static_cast(0.0)); - bit_code->MulGradWeight(pre_out_grad, w_grad, in); - } else { - PADDLE_ENFORCE_NOT_NULL(path, - platform::errors::NotFound( - "Custom tree must be set for sparse mode!")); - framework::Vector real_rows = PathToRows(*path); - auto* w_grad = ctx.Output(framework::GradVarName("W")); - w_grad->set_rows(real_rows); - // Build a map of id -> row_index to speed up finding the index of one id - w_grad->set_height(w.dims()[0]); - auto* w_grad_value = w_grad->mutable_value(); - framework::DDim temp_dim(w.dims()); - temp_dim[0] = real_rows.size(); - w_grad_value->mutable_data(temp_dim, ctx.GetPlace()); - zero(dev_ctx, w_grad_value, static_cast(0.0)); - bit_code->MulGradWeight(pre_out_grad, w_grad, in); - } - bit_code->MulGradError(pre_out_grad, w, in_grad); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index ef75ab573c6..3f77a20af22 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -369,6 +369,40 @@ void ConcatInferMeta(const std::vector& x, out->share_lod(*x.at(0)); } +void HierarchicalSigmoidInferMeta(const MetaTensor& x, + const MetaTensor& w, + const MetaTensor& label, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + MetaTensor* out, + MetaTensor* pre_out, + MetaTensor* w_out) { + const int64_t input_dims = x.dims()[0]; + const int64_t label_dims = label.dims()[0]; + PADDLE_ENFORCE_EQ(input_dims, + label_dims, + phi::errors::InvalidArgument( + "The first dimension of " + "input and label is expected to be the same. 
" + "But received input's first dimension is %d; " + "label's first dimension is %d.", + input_dims, + label_dims)); + + std::vector output_shape({input_dims, 1}); + out->set_dims(phi::make_ddim(output_shape)); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + void MultiDotInferMeta(const std::vector& x, MetaTensor* out) { auto inputs_dims = GetMetaTensorsDim(x); diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 6de95386dd9..a712ca31de7 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -87,6 +87,23 @@ void ConcatInferMeta(const std::vector& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void HierarchicalSigmoidInferMeta(const MetaTensor& x, + const MetaTensor& w, + const MetaTensor& label, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + MetaTensor* out, + MetaTensor* pre_out, + MetaTensor* w_out); + void MultiDotInferMeta(const std::vector& x, MetaTensor* out); void PsroiPoolInferMeta(const MetaTensor& x, diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index aa76561c5ce..d140912aa78 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -27,12 +27,15 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. -set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel +set(MANUAL_BUILD_KERNELS eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel + hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel triangular_solve_grad_kernel determinant_grad_kernel reduce_kernel) kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function) +kernel_library(hierarchical_sigmoid_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code) +kernel_library(hierarchical_sigmoid_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_bit_code) kernel_library(gumbel_softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(gumbel_softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(reduce_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel) diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h new file mode 100644 index 00000000000..b79aab96c0f --- /dev/null +++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h @@ -0,0 +1,110 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/math/matrix_bit_code.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +namespace math = paddle::operators::math; + +template +void HierarchicalSigmoidGradKernelImpl( + const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + DenseTensor* w_grad, + DenseTensor* bias_grad, + SelectedRows* w_grad_sr = nullptr) { + funcs::SetConstant zero; + DenseTensor pre_out_grad; + + pre_out_grad.Resize(pre_out.dims()); + ctx.template Alloc(&pre_out_grad); + ctx.template Alloc(x_grad); + zero(ctx, x_grad, static_cast(0.0)); + + bool is_custom = false; + if (path.get_ptr()) { + is_custom = true; + } + + std::unique_ptr> bit_code; + if (!is_custom) { + bit_code.reset(new math::MatrixBitCodeFunctor( + num_classes, label.template data())); + } else { + bit_code.reset(new math::MatrixBitCodeFunctor( + *(path.get_ptr()), *(code.get_ptr()), label.template data())); + } + + // softrelu derivative + + auto blas = funcs::GetBlas(ctx); + + auto* pre_out_grad_data = pre_out_grad.data(); + auto* pre_out_data = pre_out.template data(); + auto n = pre_out.numel(); + blas.VEXP(n, pre_out_data, pre_out_grad_data); + blas.VINV(n, pre_out_grad_data, pre_out_grad_data); + for (int64_t i = 0; i < n; ++i) { + pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i]; + } + bit_code->Sub(&pre_out_grad); // the gradient of clip(w * x + b) + auto* out_grad_data = out_grad.template data(); + + int64_t dim0 = pre_out_grad.dims()[0]; + int64_t dim1 = pre_out_grad.dims()[1]; + for (int64_t i = 0; i < dim0; ++i) { + T tmp = out_grad_data[i]; + blas.SCAL(dim1, tmp, pre_out_grad_data + i * dim1); + } + // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to + // be consistent with the clipping in forward. + if (bias_grad) { + ctx.template Alloc(bias_grad); + zero(ctx, bias_grad, static_cast(0.0)); + bit_code->AddGrad(pre_out_grad, bias_grad); + } + ctx.template Alloc(w_grad); + zero(ctx, w_grad, static_cast(0.0)); + if (!is_sparse) { + bit_code->MulGradWeight(pre_out_grad, w_grad, x); + } else { + bit_code->MulGradWeight(pre_out_grad, w_grad_sr, x); + } + bit_code->MulGradError(pre_out_grad, w, x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc new file mode 100644 index 00000000000..f64a1a8162a --- /dev/null +++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_grad_kernel.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h" + +namespace phi { + +template +void HierarchicalSigmoidGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + DenseTensor* w_grad, + DenseTensor* bias_grad) { + HierarchicalSigmoidGradKernelImpl(ctx, + x, + w, + label, + pre_out, + out_grad, + path, + code, + bias, + num_classes, + remote_prefetch, + trainer_id, + height_sections, + epmap, + table_names, + is_sparse, + x_grad, + w_grad, + bias_grad); +} + +} // namespace phi + +PD_REGISTER_KERNEL(hierarchical_sigmoid_grad, + CPU, + ALL_LAYOUT, + phi::HierarchicalSigmoidGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc b/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc new file mode 100644 index 00000000000..096a54f9fb2 --- /dev/null +++ b/paddle/phi/kernels/cpu/hierarchical_sigmoid_kernel.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
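A note on the backward math in HierarchicalSigmoidGradKernelImpl above: the VEXP / VINV / (1 - x) sequence is a vectorized softplus derivative. The forward kernel stores pre_out = log(1 + exp(t)) for t = clip(code · (w·x + b)), and the needed derivative d(pre_out)/dt = sigmoid(t) can be recovered from pre_out alone as 1 - exp(-pre_out). A minimal scalar sketch of the same identity (illustrative helper only, not part of this patch):

#include <cmath>

// Given y = softplus(t) = log(1 + exp(t)), return dy/dt = sigmoid(t)
// without needing t itself, since sigmoid(t) = 1 - exp(-y).
// This mirrors the blas.VEXP -> blas.VINV -> (1 - x) steps in the kernel.
template <typename T>
T SoftplusGradFromOutput(T y) {
  return static_cast<T>(1) - static_cast<T>(1) / std::exp(y);
}

The kernel then subtracts the bit-code targets (bit_code->Sub) and scales row i by out_grad[i], i.e. the chain rule through the per-sample sum of softplus terms; as the in-code TODO notes, the subgradient of the clipping is not yet folded in here.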
+ +#include "paddle/phi/kernels/hierarchical_sigmoid_kernel.h" + +#include "paddle/fluid/operators/clip_op.h" +#include "paddle/fluid/operators/math/matrix_bit_code.h" +#include "paddle/fluid/platform/transform.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function_impl.h" + +namespace phi { + +namespace math = paddle::operators::math; + +template +void HierarchicalSigmoidKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* out, + DenseTensor* pre_out, + DenseTensor* w_out) { + size_t num_classes_st = static_cast(num_classes); + // for remote prefetch + + bool is_custom = false; + if (path.get_ptr()) { + is_custom = true; + } + int64_t code_length = path.get_ptr() ? path.get_ptr()->dims()[1] + : math::FindLastSet(num_classes_st - 1); + int64_t batch_size = x.dims()[0]; + DenseTensor sum; + pre_out->Resize(phi::make_ddim({batch_size, code_length})); + ctx.template Alloc(pre_out); + auto* pre_out_data = pre_out->data(); + auto pre_out_mat = EigenMatrix::From(*pre_out); + // Not all class(leaf) nodes' path lengths equal code_length, thus init as + // 0s can avoid out of path's loss. + funcs::SetConstant zero; + zero(ctx, pre_out, static_cast(0.0)); + auto& place = *ctx.eigen_device(); + funcs::RowwiseSum row_sum; + + std::unique_ptr> bit_code; + if (!is_custom) { + bit_code.reset(new math::MatrixBitCodeFunctor( + num_classes_st, label.template data())); + } else { + bit_code.reset(new math::MatrixBitCodeFunctor( + *(path.get_ptr()), *(code.get_ptr()), label.template data())); + } + + std::vector sum_dims({batch_size, 1UL}); + sum.Resize(phi::make_ddim(sum_dims)); + ctx.template Alloc(&sum); + auto sum_mat = EigenMatrix::From(sum); + ctx.template Alloc(out); + auto out_mat = EigenMatrix::From(*out); + if (bias.get_ptr()) { + bit_code->Add(*(bias.get_ptr()), pre_out); + } + bit_code->Mul(pre_out, w, x); + // clip to [-40, 40] + paddle::platform::Transform trans; + trans(ctx, + pre_out_data, + pre_out_data + pre_out->numel(), + pre_out_data, + paddle::operators::ClipFunctor(static_cast(-40.0), + static_cast(40.0))); + bit_code->Sum(*pre_out, out, static_cast(-1)); + // use softrelu to calculate cross entropy + pre_out_mat.device(place) = (static_cast(1.0) + pre_out_mat.exp()).log(); + row_sum(ctx, *pre_out, &sum); + // TODO(guosheng): Subtract the out of path's loss, since not all + // class(leaf) nodes' path lengths equal code_length. But it won't break the + // gradient check since both have the out of path's loss and will cancel out + // each other. + out_mat.device(place) = sum_mat + out_mat; +} + +} // namespace phi + +PD_REGISTER_KERNEL(hierarchical_sigmoid, + CPU, + ALL_LAYOUT, + phi::HierarchicalSigmoidKernel, + float, + double) {} diff --git a/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h b/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h new file mode 100644 index 00000000000..f7a327cd3f5 --- /dev/null +++ b/paddle/phi/kernels/hierarchical_sigmoid_grad_kernel.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void HierarchicalSigmoidGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + DenseTensor* w_grad, + DenseTensor* bias_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/hierarchical_sigmoid_kernel.h b/paddle/phi/kernels/hierarchical_sigmoid_kernel.h new file mode 100644 index 00000000000..619b022904b --- /dev/null +++ b/paddle/phi/kernels/hierarchical_sigmoid_kernel.h @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void HierarchicalSigmoidKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* out, + DenseTensor* pre_out, + DenseTensor* w_out); + +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc new file mode 100644 index 00000000000..80b2a1f6678 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h" + +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/hierarchical_sigmoid_grad.h" + +namespace phi { +namespace sr { + +static std::vector PathToRows(const DenseTensor& path) { + std::set rows; + const int64_t* paths = path.data(); + for (int64_t i = 0; i < path.numel(); ++i) { + int64_t row = paths[i]; + if (row < 0) { + continue; + } + rows.emplace(row); + } + return std::vector(rows.begin(), rows.end()); +} + +template +void HierarchicalSigmoidGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + SelectedRows* w_grad, + DenseTensor* bias_grad) { + PADDLE_ENFORCE_NOT_NULL( + path.get_ptr(), + errors::NotFound("Custom tree must be set for sparse mode!")); + paddle::framework::Vector real_rows = PathToRows(*path); + w_grad->set_rows(real_rows); + // Build a map of id -> row_index to speed up finding the index of one id + w_grad->set_height(w.dims()[0]); + auto* w_grad_value = w_grad->mutable_value(); + phi::DDim temp_dim(w.dims()); + temp_dim[0] = real_rows.size(); + w_grad_value->Resize(temp_dim); + phi::HierarchicalSigmoidGradKernelImpl(ctx, + x, + w, + label, + pre_out, + out_grad, + path, + code, + bias, + num_classes, + remote_prefetch, + trainer_id, + height_sections, + epmap, + table_names, + is_sparse, + x_grad, + w_grad_value, + bias_grad, + w_grad); +} + +} // namespace sr +} // namespace phi + +PD_REGISTER_KERNEL(hierarchical_sigmoid_grad_sr, + CPU, + ALL_LAYOUT, + phi::sr::HierarchicalSigmoidGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h new file mode 100644 index 00000000000..557c8b1bc5e --- /dev/null +++ b/paddle/phi/kernels/selected_rows/hierarchical_sigmoid_grad_kernel.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
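A note on the selected-rows gradient registered above: with a custom tree and is_sparse = true, only the weight rows referenced by PathTable in the current batch receive gradient, so PathToRows gathers the unique non-negative node ids and W@GRAD is emitted as a SelectedRows holding exactly those rows. A standalone sketch of the same row-collection logic (illustrative only, mirroring PathToRows rather than replacing it):

#include <cstdint>
#include <set>
#include <vector>

// Collect the unique, non-negative node ids referenced by a flattened path
// table; negative entries act as padding and are skipped.
std::vector<int64_t> CollectPathRows(const int64_t* paths, int64_t numel) {
  std::set<int64_t> rows;
  for (int64_t i = 0; i < numel; ++i) {
    if (paths[i] >= 0) rows.insert(paths[i]);
  }
  return std::vector<int64_t>(rows.begin(), rows.end());
}

Because the value tensor of W@GRAD is resized to real_rows.size() rows before the dense impl runs, MulGradWeight can write through the w_grad_sr pointer without materializing rows for tree nodes the batch never visits.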
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void HierarchicalSigmoidGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& w, + const DenseTensor& label, + const DenseTensor& pre_out, + const DenseTensor& out_grad, + paddle::optional path, + paddle::optional code, + paddle::optional bias, + int num_classes, + bool remote_prefetch, + int trainer_id, + const std::vector& height_sections, + const std::vector& epmap, + const std::vector& table_names, + bool is_sparse, + DenseTensor* x_grad, + SelectedRows* w_grad, + DenseTensor* bias_grad); + +} // namespace sr +} // namespace phi diff --git a/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc new file mode 100644 index 00000000000..20183d1a9b0 --- /dev/null +++ b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature HierarchicalSigmoidOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("hierarchical_sigmoid", + {"X", "W", "Label", "PathTable", "PathCode", "Bias"}, + {"num_classes", + "remote_prefetch", + "trainer_id", + "height_sections", + "epmap", + "table_names", + "is_sparse"}, + {"Out", "PreOut", "W_Out"}); +} + +KernelSignature HierarchicalSigmoidGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorOutput(GradVarName("W"))) { + return KernelSignature( + "hierarchical_sigmoid_grad", + {"X", + "W", + "Label", + "PreOut", + GradVarName("Out"), + "PathTable", + "PathCode", + "Bias"}, + {"num_classes", + "remote_prefetch", + "trainer_id", + "height_sections", + "epmap", + "table_names", + "is_sparse"}, + {GradVarName("X"), GradVarName("W"), GradVarName("Bias")}); + } else if (ctx.IsSelectedRowsOutput(GradVarName("W"))) { + return KernelSignature( + "hierarchical_sigmoid_grad_sr", + {"X", + "W", + "Label", + "PreOut", + GradVarName("Out"), + "PathTable", + "PathCode", + "Bias"}, + {"num_classes", + "remote_prefetch", + "trainer_id", + "height_sections", + "epmap", + "table_names", + "is_sparse"}, + {GradVarName("X"), GradVarName("W"), GradVarName("Bias")}); + } else { + return KernelSignature("unregistered", {}, {}, {}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(hierarchical_sigmoid, + phi::HierarchicalSigmoidOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(hierarchical_sigmoid_grad, + phi::HierarchicalSigmoidGradOpArgumentMapping); -- GitLab From aed6faf206b7ac5204a02546700704e5812e78c2 Mon Sep 17 00:00:00 2001 From: shentanyue <34421038+shentanyue@users.noreply.github.com> Date: Fri, 18 Mar 2022 18:51:15 +0800 Subject: [PATCH 168/176] [Phi] Migrate gelu/log_softmax/prelu op kernel and infershape (#40393) * add gelu * fix gelu * add log_softmax * add prelu kernel and 
prelu/gelu/logsoftmax infershape * fix * fix * fix * fix * fix ci * log_softmax rewrite * fix * fix * fix conflict * fix compile error * fix comment * fix * ci_fix Co-authored-by: Yan Li --- .../mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc | 3 +- .../ir/mkldnn/mkldnn_inplace_pass_tester.cc | 4 +- .../mkldnn/elementwise_mkldnn_op.h | 10 +- paddle/fluid/operators/gelu_op.cc | 32 +- paddle/fluid/operators/gelu_op.cu | 320 ------------------ paddle/fluid/operators/gelu_op.h | 233 ------------- paddle/fluid/operators/gelu_op_npu.cc | 4 +- paddle/fluid/operators/gelu_op_npu_test.cc | 2 +- paddle/fluid/operators/gelu_op_xpu.cc | 6 +- paddle/fluid/operators/log_softmax_op.cc | 24 +- paddle/fluid/operators/log_softmax_op.cu | 81 ----- paddle/fluid/operators/log_softmax_op.h | 197 ----------- paddle/fluid/operators/log_softmax_op_npu.cc | 7 +- paddle/fluid/operators/prelu_op.cc | 109 +----- paddle/fluid/operators/prelu_op.cu | 208 ------------ paddle/fluid/operators/prelu_op.h | 172 ---------- paddle/phi/infermeta/binary.cc | 97 ++++++ paddle/phi/infermeta/binary.h | 7 + paddle/phi/infermeta/unary.cc | 2 +- paddle/phi/kernels/cpu/gelu_grad_kernel.cc | 146 ++++++++ paddle/phi/kernels/cpu/gelu_kernel.cc | 102 ++++++ .../kernels/cpu/log_softmax_grad_kernel.cc | 88 +++++ paddle/phi/kernels/cpu/log_softmax_kernel.cc | 123 +++++++ paddle/phi/kernels/cpu/prelu_grad_kernel.cc | 119 +++++++ paddle/phi/kernels/cpu/prelu_kernel.cc | 71 ++++ paddle/phi/kernels/gelu_grad_kernel.h | 31 ++ paddle/phi/kernels/gelu_kernel.h | 32 ++ paddle/phi/kernels/gpu/gelu_funcs.h | 176 ++++++++++ paddle/phi/kernels/gpu/gelu_grad_kernel.cu | 100 ++++++ paddle/phi/kernels/gpu/gelu_kernel.cu | 90 +++++ .../kernels/gpu/log_softmax_grad_kernel.cu | 53 +++ paddle/phi/kernels/gpu/log_softmax_kernel.cu | 51 +++ paddle/phi/kernels/gpu/prelu_funcs.h | 183 ++++++++++ paddle/phi/kernels/gpu/prelu_grad_kernel.cu | 183 ++++++++++ paddle/phi/kernels/gpu/prelu_kernel.cu | 71 ++++ paddle/phi/kernels/log_softmax_grad_kernel.h | 27 ++ paddle/phi/kernels/log_softmax_kernel.h | 26 ++ paddle/phi/kernels/prelu_grad_kernel.h | 31 ++ paddle/phi/kernels/prelu_kernel.h | 28 ++ paddle/phi/ops/compat/gelu_sig.cc | 33 ++ paddle/phi/ops/compat/log_softmax_sig.cc | 30 ++ paddle/phi/ops/compat/prelu_sig.cc | 28 ++ 42 files changed, 1973 insertions(+), 1367 deletions(-) delete mode 100644 paddle/fluid/operators/gelu_op.cu delete mode 100644 paddle/fluid/operators/gelu_op.h delete mode 100644 paddle/fluid/operators/log_softmax_op.cu delete mode 100644 paddle/fluid/operators/log_softmax_op.h delete mode 100644 paddle/fluid/operators/prelu_op.cu delete mode 100644 paddle/fluid/operators/prelu_op.h create mode 100644 paddle/phi/kernels/cpu/gelu_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/gelu_kernel.cc create mode 100644 paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/log_softmax_kernel.cc create mode 100644 paddle/phi/kernels/cpu/prelu_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/prelu_kernel.cc create mode 100644 paddle/phi/kernels/gelu_grad_kernel.h create mode 100644 paddle/phi/kernels/gelu_kernel.h create mode 100644 paddle/phi/kernels/gpu/gelu_funcs.h create mode 100644 paddle/phi/kernels/gpu/gelu_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/gelu_kernel.cu create mode 100644 paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/log_softmax_kernel.cu create mode 100644 paddle/phi/kernels/gpu/prelu_funcs.h create mode 100644 
paddle/phi/kernels/gpu/prelu_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/prelu_kernel.cu create mode 100644 paddle/phi/kernels/log_softmax_grad_kernel.h create mode 100644 paddle/phi/kernels/log_softmax_kernel.h create mode 100644 paddle/phi/kernels/prelu_grad_kernel.h create mode 100644 paddle/phi/kernels/prelu_kernel.h create mode 100644 paddle/phi/ops/compat/gelu_sig.cc create mode 100644 paddle/phi/ops/compat/log_softmax_sig.cc create mode 100644 paddle/phi/ops/compat/prelu_sig.cc diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index 11190309814..bf2cf58f970 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -32,8 +32,9 @@ USE_OP(conv2d_transpose); USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(gelu); +USE_OP_ITSELF(gelu); USE_OP_DEVICE_KERNEL(gelu, MKLDNN); +PD_DECLARE_ARG_MAPPING_FN(gelu); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index ef2e83ced26..7df957b2c0e 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -18,6 +18,7 @@ #include #include + #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" @@ -27,10 +28,11 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP_ITSELF(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); -USE_OP(gelu); +USE_OP_ITSELF(gelu); USE_OP_ITSELF(relu); USE_OP_ITSELF(tanh); USE_OP_DEVICE_KERNEL(tanh, MKLDNN); +PD_DECLARE_ARG_MAPPING_FN(gelu); namespace paddle { namespace framework { diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index 761b401ca9a..d1a1aa3008c 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -198,10 +198,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { platform::EventRole::kUniqueOp); reorder_p->execute(astream, *reorder_src_memory_p, *dst_memory); - } - - // elementwise_mul & elementwise_div - else { + } else { // elementwise_mul & elementwise_div platform::BinaryMKLDNNHandler binary_handler( BINARY_OP, axis, onednn_engine, ctx.GetPlace(), dout, y, dx, 1.0f, 1.0f, 1.0f); @@ -253,10 +250,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { } else { broadcast_src_memory = reorder_src_memory_p; } - } - - // elementwise_mul & elementwise_div - else { + } else { // elementwise_mul & elementwise_div std::unordered_map args; std::shared_ptr binary_prim; std::shared_ptr post_op_memory; diff --git a/paddle/fluid/operators/gelu_op.cc b/paddle/fluid/operators/gelu_op.cc index 3d338f00d4f..3be2606bfc9 100644 --- a/paddle/fluid/operators/gelu_op.cc +++ b/paddle/fluid/operators/gelu_op.cc @@ -14,10 +14,11 @@ limitations under the License. 
*/ #include #include -#include - -#include "paddle/fluid/operators/gelu_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,18 +30,6 @@ class GeluOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(%s) of GeluOp should not be null.", "X")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(%s) of GeluOp should not be null.", "Out")); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -156,13 +145,10 @@ class GeluGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(gelu, GeluInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(gelu, ops::GeluOp, ops::GeluOpMaker, ops::GeluGradOpMaker, - ops::GeluGradOpMaker); + ops::GeluGradOpMaker, + GeluInferShapeFunctor); REGISTER_OPERATOR(gelu_grad, ops::GeluGradOp); -REGISTER_OP_CPU_KERNEL( - gelu, ops::GeluKernel, - ops::GeluKernel); -REGISTER_OP_CPU_KERNEL( - gelu_grad, ops::GeluGradKernel, - ops::GeluGradKernel); diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu deleted file mode 100644 index ef836ab72f0..00000000000 --- a/paddle/fluid/operators/gelu_op.cu +++ /dev/null @@ -1,320 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/gelu_op.h" - -DECLARE_bool(use_fast_math); - -namespace paddle { -namespace operators { - -#ifdef __NVCC__ -template -static __device__ __forceinline__ float FP32FastTanh(float x) { -#if __CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000 - if (FastMode) { - float y; - asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(y) : "f"(x)); - return y; - } -#endif - return tanhf(x); -} - -template -static __device__ __forceinline__ float FP32GeluFwd(float x) { - auto tanh_out = - FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); - return x * 0.5f * (1.0f + tanh_out); -} - -template -static __device__ __forceinline__ float FP32GeluBwd(float x, float y_g) { - auto tanh_out = - FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); - auto tmp = 0.5f * x * ((1.0f - tanh_out * tanh_out) * - (0.79788456f + 0.1070322243f * x * x)) + - 0.5f * (1.0f + tanh_out); - return tmp * y_g; -} - -template -static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, __half* y, - size_t n) { - size_t offset = - static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; - size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; - for (; offset < n; offset += stride) { - using ArrT = phi::AlignedVector<__half, VecSize>; - ArrT in_arr = *reinterpret_cast(x + offset); -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - float tmp = __half2float(in_arr[i]); - in_arr[i] = __float2half(FP32GeluFwd(tmp)); - } - *reinterpret_cast(y + offset) = in_arr; - } -} - -template -static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x, - const __half* y_g, __half* x_g, - size_t n) { - size_t offset = - static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; - size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; - for (; offset < n; offset += stride) { - using ArrT = phi::AlignedVector<__half, VecSize>; - ArrT x_in_arr = *reinterpret_cast(x + offset); - ArrT y_g_in_arr = *reinterpret_cast(y_g + offset); -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - __half2 tmp_fp16_2; - tmp_fp16_2.x = x_in_arr[i]; - tmp_fp16_2.y = y_g_in_arr[i]; - float2 tmp_fp32_2 = __half22float2(tmp_fp16_2); - x_in_arr[i] = - __float2half(FP32GeluBwd(tmp_fp32_2.x, tmp_fp32_2.y)); - } - *reinterpret_cast(x_g + offset) = x_in_arr; - } -} - -static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( - const platform::CUDADeviceContext& dev_ctx, const __half* x, __half* y, - size_t n) { - auto is_aligned = [](const void* p, size_t alignment) { - return reinterpret_cast(p) % alignment == 0; - }; - -#define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \ - do { \ - constexpr auto kAlignment = \ - alignof(phi::AlignedVector<__half, __vec_size>); \ - if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ - is_aligned(y, kAlignment)) { \ - size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ - size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ - VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \ - << " , thread = " << thread; \ - FP16FastGeluFwdCUDAKernel< \ - __vec_size, \ - __use_fast_math><<>>(x, y, n); \ - return true; \ - } \ - } while (0) - - if (FLAGS_use_fast_math) { - PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, true); - } else { - PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, false); - } - -#undef PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL - 
return false; -} - -static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( - const platform::CUDADeviceContext& dev_ctx, const __half* x, - const __half* y_g, __half* x_g, size_t n) { - auto is_aligned = [](const void* p, size_t alignment) { - return reinterpret_cast(p) % alignment == 0; - }; - -#define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \ - do { \ - constexpr auto kAlignment = \ - alignof(phi::AlignedVector<__half, __vec_size>); \ - if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ - is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \ - is_aligned(x_g, kAlignment)) { \ - size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ - size_t block = (n / __vec_size + thread - 1) / thread; \ - block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ - VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \ - << " , thread = " << thread; \ - FP16FastGeluBwdCUDAKernel< \ - __vec_size, \ - __use_fast_math><<>>(x, y_g, \ - x_g, n); \ - return true; \ - } \ - } while (0) - - if (FLAGS_use_fast_math) { - PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, true); - } else { - PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, false); - } - -#undef PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL - return false; -} -#endif - -template -struct GeluWithApproximateFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x) { - // this function is tanh approximation of gelu - MPType x = static_cast(arg_x); - MPType one = static_cast(1); - MPType half = static_cast(0.5); - MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - auto tanh_out = - tanh(kAlpha * x * (one + static_cast(GELU_CONSTANT) * x * x)); - MPType out = x * half * (one + tanh_out); - return static_cast(out); - } -}; - -template -struct GeluWithoutApproximateFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x) { - // actual gelu with approximation = false - MPType x = static_cast(arg_x); - return static_cast(x * normcdf(x)); - } -}; - -template -class GeluKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* in = context.Input("X"); - auto approximate = context.Attr("approximate"); - out->mutable_data(in->place()); - - std::vector ins = {in}; - std::vector outs = {out}; - const auto& dev_ctx = - context.template device_context(); - - if (approximate) { -#ifdef __NVCC__ - if (std::is_same::value) { - size_t n = in->numel(); - const auto* in_ptr = reinterpret_cast(in->data()); - auto* out_ptr = reinterpret_cast<__half*>(out->data()); - if (TryLaunchFP16FastGeluFwdVectorizeCUDAKernel(dev_ctx, in_ptr, - out_ptr, n)) { - return; - } - } -#endif - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor()); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor()); - } - } -}; - -template -struct GeluWithApproximateGradFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { - MPType x = static_cast(arg_x); - MPType dout = static_cast(arg_dout); - MPType one = static_cast(1); - MPType half = static_cast(0.5); - MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - MPType kBeta = - kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); - auto cube_x = x * x * x; - auto tanh_out = - tanh(kAlpha * ((static_cast(GELU_CONSTANT) * cube_x) + 
x)); - auto ans = - half * (one + tanh_out + - (one - tanh_out * tanh_out) * (x * kAlpha + kBeta * cube_x)); - return static_cast(ans * dout); - } -}; - -template -struct GeluWithoutApproximateGradFunctor { - using MPType = typename details::MPTypeTrait::Type; - inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { - MPType x = static_cast(arg_x); - MPType dout = static_cast(arg_dout); - constexpr MPType kBeta = M_2_SQRTPI * M_SQRT1_2 * static_cast(0.5); - const MPType cdf = normcdf(x); - const MPType pdf = exp(static_cast(-0.5) * x * x) * kBeta; - return static_cast(dout * (cdf + x * pdf)); - } -}; - -template -class GeluGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dout = - context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - auto approximate = context.Attr("approximate"); - dx->mutable_data(dout->place()); - - std::vector ins = {x, dout}; - std::vector outs = {dx}; - const auto& dev_ctx = - context.template device_context(); - if (approximate) { -#ifdef __NVCC__ - if (std::is_same::value) { - size_t n = x->numel(); - const auto* x_ptr = reinterpret_cast(x->data()); - const auto* y_g_ptr = reinterpret_cast(dout->data()); - auto* x_g_ptr = reinterpret_cast<__half*>(dx->data()); - if (TryLaunchFP16FastGeluBwdVectorizeCUDAKernel(dev_ctx, x_ptr, y_g_ptr, - x_g_ptr, n)) { - return; - } - } -#endif - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor()); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, 0, GeluWithoutApproximateGradFunctor()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - gelu, ops::GeluKernel, - ops::GeluKernel, - ops::GeluKernel); -REGISTER_OP_CUDA_KERNEL( - gelu_grad, ops::GeluGradKernel, - ops::GeluGradKernel, - ops::GeluGradKernel); diff --git a/paddle/fluid/operators/gelu_op.h b/paddle/fluid/operators/gelu_op.h deleted file mode 100644 index d4fed8a868f..00000000000 --- a/paddle/fluid/operators/gelu_op.h +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#ifndef _USE_MATH_DEFINES -#define _USE_MATH_DEFINES -#endif -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_helper.h" -#endif - -namespace paddle { -namespace operators { - -#define GELU_CONSTANT 0.044715 - -template -struct GeluFunctor { - template - void operator()(Device d, X x, Out out, bool approximate) const { - if (approximate) { - // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3}))) - if (std::is_same::value) { - VLOG(4) << "cast from float16 to float before computing"; - auto casted_x = x.template cast(); - auto temp = - (static_cast(M_2_SQRTPI * M_SQRT1_2) * - (casted_x + static_cast(GELU_CONSTANT) * casted_x.cube())) - .tanh(); - out.device(d) = (casted_x * static_cast(0.5) * - (static_cast(1) + temp)) - .template cast(); - } else { - auto temp = (static_cast(M_2_SQRTPI * M_SQRT1_2) * - (x + static_cast(GELU_CONSTANT) * x.cube())) - .tanh(); - out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); - } - } else { -#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ - !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) - auto x_data = x.data(); - auto out_data = out.data(); - int n = std::min(x.size(), out.size()); - - std::memset(out_data, 0, n * sizeof(T)); - phi::funcs::CBlas::AXPY(n, static_cast(M_SQRT1_2), x_data, 1, - out_data, 1); - phi::funcs::CBlas::VMERF(n, out_data, out_data, VML_LA); - for (int i = 0; i < n; i++) { - out_data[i] += static_cast(1); - } - phi::funcs::CBlas::VMUL(n, x_data, out_data, out_data); - for (int i = 0; i < n; i++) { - out_data[i] *= static_cast(0.5); - } -#else - // gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))) - if (std::is_same::value) { - VLOG(4) << "cast from float16 to float before computing"; - auto casted_x = x.template cast(); - auto temp = (casted_x * static_cast(M_SQRT1_2)).erf(); - out.device(d) = (casted_x * static_cast(0.5) * - (static_cast(1) + temp)) - .template cast(); - } else { - auto temp = (x * static_cast(M_SQRT1_2)).erf(); - out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); - } -#endif - } - } -}; - -template -struct GeluGradFunctor { - template - void operator()(Device d, X x, dOut dout, dX dx, bool approximate) const { - if (approximate) { - if (std::is_same::value) { - VLOG(4) << "cast from float16 to float before computing"; - auto casted_x = x.template cast(); - auto casted_dout = dout.template cast(); - - const float kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - const float kBeta = - kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); - const auto y = - (kAlpha * - ((static_cast(GELU_CONSTANT) * casted_x.cube()) + casted_x)) - .tanh(); - dx.device(d) = (static_cast(0.5) * casted_dout * - (static_cast(1) + y + - (casted_x - casted_x * y.square()) * - (kAlpha + kBeta * casted_x.square()))) - .template cast(); - } else { - const T kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); - const T kBeta = - kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); - const auto y = - (kAlpha * ((static_cast(GELU_CONSTANT) * x.cube()) + x)).tanh(); - dx.device(d) = static_cast(0.5) * dout * - (static_cast(1) + y + - (x - x * y.square()) * (kAlpha + kBeta * x.square())); - } - } else { -#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ - !defined(__OSX__) && 
!defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) - auto x_data = x.data(); - auto dx_data = dx.data(); - auto dout_data = dout.data(); - int n = std::min(x.size(), dx.size()); - - auto first = static_cast(std::malloc(n * sizeof(T))); - std::memset(first, 0, n * sizeof(T)); - auto second = static_cast(std::malloc(n * sizeof(T))); - std::memset(second, 0, n * sizeof(T)); - - // first = (0.5 * (1 + erf(x / sqrt(2)))) - phi::funcs::CBlas::AXPY(n, static_cast(M_SQRT1_2), x_data, 1, first, - 1); - phi::funcs::CBlas::VMERF(n, first, first, VML_LA); - for (int i = 0; i < n; i++) { - first[i] += static_cast(1); - } - phi::funcs::CBlas::SCAL(n, static_cast(0.5), first, 1); - - // second = (0.5 * 2/sqrt(pi) * 1/sqrt(2) * x * exp(-0.5 * x^2)) - phi::funcs::CBlas::VSQUARE(n, x_data, second); - phi::funcs::CBlas::SCAL(n, -static_cast(0.5), second, 1); - phi::funcs::CBlas::VEXP(n, second, second); - phi::funcs::CBlas::VMUL(n, x_data, second, second); - phi::funcs::CBlas::SCAL( - n, static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2), second, 1); - - // dx = dout * (first + second); - phi::funcs::CBlas::VADD(n, first, second, first); - phi::funcs::CBlas::VMUL(n, dout_data, first, dx_data); - - std::free(first); - std::free(second); -#else - // gelu_grad(x) = dout * 0.5 * (1 + erf(x / sqrt(2)) + x * sqrt(2 / pi) * - // exp(- x^2 / 2) - if (std::is_same::value) { - VLOG(4) << "cast from float16 to float before computing"; - auto casted_x = x.template cast(); - auto casted_dout = dout.template cast(); - auto first = static_cast(0.5) * - (static_cast(1) + - ((casted_x * static_cast(M_SQRT1_2)).erf())); - auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * - casted_x * - (-static_cast(0.5) * casted_x.square()).exp(); - dx.device(d) = (casted_dout * (first + second)).template cast(); - } else { - auto first = - static_cast(0.5) * - (static_cast(1) + ((x * static_cast(M_SQRT1_2)).erf())); - - auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * x * - (-static_cast(0.5) * x.square()).exp(); - dx.device(d) = dout * (first + second); - } -#endif - } - } -}; - -template -class GeluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* in = context.Input("X"); - auto approximate = context.Attr("approximate"); - out->mutable_data(in->place()); - - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto& place = - *context.template device_context().eigen_device(); - - GeluFunctor functor; - functor(place, eigen_in, eigen_out, approximate); - } -}; - -template -class GeluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dout = - context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - auto approximate = context.Attr("approximate"); - dx->mutable_data(dout->place()); - - auto eigen_x = framework::EigenVector::Flatten(*x); - auto eigen_dout = framework::EigenVector::Flatten(*dout); - auto eigen_dx = framework::EigenVector::Flatten(*dx); - auto& place = - *context.template device_context().eigen_device(); - - GeluGradFunctor functor; - functor(place, eigen_x, eigen_dout, eigen_dx, approximate); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc index 18bbc7f4929..c5297dd9cd4 
100644 --- a/paddle/fluid/operators/gelu_op_npu.cc +++ b/paddle/fluid/operators/gelu_op_npu.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/gelu_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc index f3ac5313832..b132b317075 100644 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ b/paddle/fluid/operators/gelu_op_npu_test.cc @@ -30,7 +30,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(gelu); +USE_OP_ITSELF(gelu); USE_OP_DEVICE_KERNEL(gelu, NPU); template diff --git a/paddle/fluid/operators/gelu_op_xpu.cc b/paddle/fluid/operators/gelu_op_xpu.cc index b8c2e9becf2..559d2448ad9 100644 --- a/paddle/fluid/operators/gelu_op_xpu.cc +++ b/paddle/fluid/operators/gelu_op_xpu.cc @@ -14,9 +14,9 @@ limitations under the License. */ #include #include - -#include "paddle/fluid/operators/gelu_op.h" - +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/log_softmax_op.cc b/paddle/fluid/operators/log_softmax_op.cc index 0e69b397e04..da38f906b9b 100644 --- a/paddle/fluid/operators/log_softmax_op.cc +++ b/paddle/fluid/operators/log_softmax_op.cc @@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/log_softmax_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -24,10 +27,6 @@ class LogSoftmaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - return UnaryOpUnchangedInferShapeCheckAxis(ctx); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -123,18 +122,11 @@ class LogSoftmaxGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; - +DECLARE_INFER_SHAPE_FUNCTOR(log_softmax, LogSoftmaxInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMetaCheckAxis)); REGISTER_OPERATOR(log_softmax, ops::LogSoftmaxOp, ops::LogSoftmaxOpMaker, ops::LogSoftmaxOpInferVarType, ops::LogSoftmaxGradOpMaker, - ops::LogSoftmaxGradOpMaker); + ops::LogSoftmaxGradOpMaker, + LogSoftmaxInferShapeFunctor); REGISTER_OPERATOR(log_softmax_grad, ops::LogSoftmaxGradOp); - -REGISTER_OP_CPU_KERNEL( - log_softmax, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); -REGISTER_OP_CPU_KERNEL( - log_softmax_grad, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu deleted file mode 100644 index 26b6ce43303..00000000000 --- a/paddle/fluid/operators/log_softmax_op.cu +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
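For context on the log_softmax registration above (the deleted operator kernels follow): the CPU functor being removed shifts logits by the per-axis maximum before exponentiating, and the migrated phi kernels are expected to keep the same numerically stable form log_softmax(x_i) = (x_i - max(x)) - log(sum_j exp(x_j - max(x))). A minimal scalar sketch of that identity (illustrative only, independent of the Eigen and cuDNN paths in this patch):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Numerically stable log-softmax over a single row: shift by the row maximum
// so no exponential can overflow, then subtract the log of the shifted sum.
std::vector<float> LogSoftmaxRow(const std::vector<float>& x) {
  const float x_max = *std::max_element(x.begin(), x.end());
  float sum = 0.0f;
  for (float v : x) sum += std::exp(v - x_max);
  const float log_sum = std::log(sum);
  std::vector<float> y(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) y[i] = (x[i] - x_max) - log_sum;
  return y;
}

The ValueClip threshold of -64 in the deleted functor additionally clamps extremely negative shifted logits, presumably to sidestep underflow in the subsequent exp.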
- -#include "paddle/fluid/operators/log_softmax_op.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class LogSoftmaxKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int input_axis = ctx.Attr("axis"); - auto &dev_ctx = ctx.template device_context(); - phi::SoftmaxForwardCUDAKernelDriver(dev_ctx, *x, input_axis, out); - } -}; - -template -class LogSoftmaxGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *out = ctx.Input("Out"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - int input_axis = ctx.Attr("axis"); - auto &dev_ctx = ctx.template device_context(); - phi::SoftmaxBackwardCUDAKernelDriver(dev_ctx, *out, *dout, - input_axis, dx); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#ifdef PADDLE_WITH_HIP -REGISTER_OP_CUDA_KERNEL( - log_softmax, ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); -REGISTER_OP_CUDA_KERNEL( - log_softmax_grad, ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - log_softmax, ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); -REGISTER_OP_CUDA_KERNEL( - log_softmax_grad, ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); -#endif diff --git a/paddle/fluid/operators/log_softmax_op.h b/paddle/fluid/operators/log_softmax_op.h deleted file mode 100644 index 162087a7566..00000000000 --- a/paddle/fluid/operators/log_softmax_op.h +++ /dev/null @@ -1,197 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -static inline int CanonicalAxis(const int axis, const int rank) { - if (axis < 0) { - return axis + rank; - } - return axis; -} - -static inline size_t SizeToAxis(const int axis, const framework::DDim dims) { - size_t size = 1; - for (int i = 0; i < axis; i++) { - size *= dims[i]; - } - return size; -} - -static inline size_t SizeFromAxis(const int axis, const framework::DDim dims) { - size_t size = 1; - for (int i = axis; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -template -struct ValueClip { - HOSTDEVICE T operator()(const T& x) const { - const T kThreshold = static_cast(-64.); - return x < kThreshold ? 
kThreshold : x; - } -}; - -template -struct LogSoftmaxFunctor { - void operator()(const DeviceContext& context, const framework::Tensor* X, - framework::Tensor* Y, const int axis) { - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - constexpr int kAxisDim = 1; - - int axis_dim = X->dims()[axis]; - const int n = SizeToAxis(axis, X->dims()); - const int d = SizeFromAxis(axis, X->dims()); - framework::DDim dim_2d{n, d}; - - auto logits = EigenMatrix::From(*X, dim_2d); - auto log_softmax = EigenMatrix::From(*Y, dim_2d); - - const int batch_size = logits.dimension(kBatchDim); - const int num_classes = logits.dimension(kClassDim); - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes along_axis(kAxisDim); - Eigen::DSizes batch_classes(batch_size, num_classes); - Eigen::DSizes batch_by_one(batch_size, 1); - Eigen::DSizes one_by_class(1, num_classes); - Eigen::DSizes batch_one_remain(batch_size, 1, num_remain); - Eigen::DSizes one_axis_one(1, axis_dim, 1); - Eigen::DSizes one_axis(1, axis_dim); - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - - // For numerical stability, logits should be shifted by maximum number along - // axis, calculate shifted_logits into log_softmax tensor for memory reuse. - if (num_remain == 1) { - // axis == -1, axis and class in same dimension, calculate along - // class dimension directly for higher performance - log_softmax.device(*context.eigen_device()) = - (logits - - logits.maximum(along_axis) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)) - .unaryExpr(ValueClip()); - } else { - // axis != -1, class dimension split into (axis, remain), max and sum - // should be calculated along axis dimension - log_softmax.device(*context.eigen_device()) = - (logits.reshape(batch_axis_remain) - - logits.reshape(batch_axis_remain) - .maximum(along_axis) - .eval() - .reshape(batch_one_remain) - .broadcast(one_axis_one) - .reshape(batch_classes)) - .unaryExpr(ValueClip()); - } - - log_softmax.device(*context.eigen_device()) = - log_softmax - - log_softmax.exp() - .eval() - .reshape(batch_axis_remain) - .sum(along_axis) - .log() - .broadcast(one_axis); - } -}; - -template -class LogSoftmaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Out = context.Output("Out"); - const int rank = X->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - // allocate memory on device. 
- Out->mutable_data(context.GetPlace()); - - if (X->numel() != 0) { - LogSoftmaxFunctor()( - context.template device_context(), X, Out, axis); - } - } -}; - -template -struct LogSoftmaxGradFunctor { - void operator()(const DeviceContext& context, const framework::Tensor* Y, - const framework::Tensor* dY, framework::Tensor* dX, - const int axis) { - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - - const int n = SizeToAxis(axis, Y->dims()); - const int d = SizeFromAxis(axis, Y->dims()); - framework::DDim dim_2d{n, d}; - - auto y = EigenMatrix::From(*Y, dim_2d); - auto dy = EigenMatrix::From(*dY, dim_2d); - auto dx = EigenMatrix::From(*dX, dim_2d); - - const int axis_dim = Y->dims()[axis]; - const int batch_size = y.dimension(kBatchDim); - const int num_classes = y.dimension(kClassDim); - const int num_remain = num_classes / axis_dim; - - Eigen::DSizes along_class(kClassDim); - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - Eigen::DSizes one_axis(1, axis_dim); - - dx.device(*context.eigen_device()) = - dy - - (y.exp()) * (dy.reshape(batch_axis_remain) - .sum(along_class) - .broadcast(one_axis)); - } -}; - -template -class LogSoftmaxGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* Out = context.Input("Out"); - auto* dOut = - context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); - const int rank = Out->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - // allocate memory on device. - dX->mutable_data(context.GetPlace()); - - if (Out->numel() != 0) { - LogSoftmaxGradFunctor()( - context.template device_context(), Out, dOut, dX, - axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_softmax_op_npu.cc b/paddle/fluid/operators/log_softmax_op_npu.cc index 5795f1dffac..6ce21aec921 100644 --- a/paddle/fluid/operators/log_softmax_op_npu.cc +++ b/paddle/fluid/operators/log_softmax_op_npu.cc @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/log_softmax_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -27,7 +28,7 @@ class LogSoftmaxNPUKernel : public framework::OpKernel { auto* X = ctx.Input("X"); auto* Out = ctx.Output("Out"); const int rank = X->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); Out->mutable_data(ctx.GetPlace()); if (X->numel() != 0) { @@ -47,7 +48,7 @@ class LogSoftmaxGradNPUKernel : public framework::OpKernel { auto* dOut = ctx.Input(framework::GradVarName("Out")); auto* dX = ctx.Output(framework::GradVarName("X")); const int rank = dOut->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); // allocate memory on device. dX->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 4d2a2e23b3f..de35f674058 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -9,14 +9,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/prelu_op.h" - #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + framework::OpKernelType innerGetKernelTypeForVar( const Tensor &tensor, const framework::OpKernelType &expected_kernel_type) { #ifdef PADDLE_WITH_MKLDNN @@ -44,95 +49,6 @@ class PReluOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "prelu"); - OP_INOUT_CHECK(ctx->HasInput("Alpha"), "Input", "Alpha", "prelu"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "prelu"); - - auto x_dim = ctx->GetInputDim("X"); - std::string mode = ctx->Attrs().Get("mode"); - if (mode == "all") { - PADDLE_ENFORCE_EQ(phi::product(ctx->GetInputDim("Alpha")), 1, - platform::errors::InvalidArgument( - "For mode 'all', size of weight Alpha must be one. " - "But recevied alpha's size: %d.", - product(ctx->GetInputDim("Alpha")))); - } else if (mode == "channel") { - auto x_rank = x_dim.size(); - PADDLE_ENFORCE_GE(x_rank, 2, - platform::errors::InvalidArgument( - "For mode 'channel', rank of input X must be " - "equal or larger than 2. But recevied X's " - "rank: %d", - x_rank)); - const std::string data_format_str = - ctx->Attrs().Get("data_format"); - PADDLE_ENFORCE_EQ(data_format_str == "NCHW" || data_format_str == "NHWC", - true, - platform::errors::InvalidArgument( - "For mode 'channel', data_format must be one of " - "NCHW and NHWC. But recevied data_format: %s", - data_format_str)); - if (data_format_str == "NCHW" || ctx->IsRunMKLDNNKernel()) { - PADDLE_ENFORCE_EQ( - product(ctx->GetInputDim("Alpha")) == x_dim[1], true, - platform::errors::InvalidArgument( - "For mode 'channel', size of weight Alpha must be " - "equal to the number of channels of input(x). But " - "recevied alpha's size: %d, x_dim[1]: %d", - product(ctx->GetInputDim("Alpha")), x_dim[1])); - } else { - PADDLE_ENFORCE_EQ( - product(ctx->GetInputDim("Alpha")) == x_dim[x_rank - 1], true, - platform::errors::InvalidArgument( - "For mode 'channel', size of weight Alpha must be " - "equal to the number of channels of input(x). But " - "recevied alpha's size: %d, x_dim[%d]: %d", - product(ctx->GetInputDim("Alpha")), x_rank - 1, - x_dim[x_rank - 1])); - } - - } else if (mode == "element") { - auto alpha_dim = ctx->GetInputDim("Alpha"); - auto alpha_rank = alpha_dim.size(); - auto x_rank = x_dim.size(); - PADDLE_ENFORCE_GE(x_rank, 1, - platform::errors::InvalidArgument( - "For mode 'element', rank of input X must be " - "equal or larger than 2. But recevied X's " - "rank: %d", - x_rank)); - PADDLE_ENFORCE_EQ( - alpha_rank, x_rank, - platform::errors::InvalidArgument( - "For mode 'element', rank of weight Alpha must be ", - "equal to the rank of input(x). 
But recevied alpha's rank: %d, " - "x's rank: %d.", - alpha_rank, x_rank)); - size_t x_product = 1; - size_t alpha_product = 1; - for (int64_t i = x_rank - 1; i > 0; i--) { - x_product *= x_dim[i]; - alpha_product *= alpha_dim[i]; - } - PADDLE_ENFORCE_EQ( - alpha_product, x_product, - platform::errors::InvalidArgument( - "For mode 'element', the size of weight Alpha must be " - "equal to the size of input(x). But recevied alpha's size: %d, " - "x's size: %d.", - alpha_product, x_product)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Attr(mode) of prelu must be one of 'all', 'channel', or 'element'. " - "But recevied " - "mode: '%s'.", - mode)); - } - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -268,13 +184,10 @@ class PReluGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(prelu, PReluInferShapeFunctor, + PD_INFER_META(phi::PReluInferMeta)); REGISTER_OPERATOR(prelu, ops::PReluOp, ops::PReluOpMaker, ops::PReluGradOpMaker, - ops::PReluGradOpMaker); + ops::PReluGradOpMaker, + PReluInferShapeFunctor); REGISTER_OPERATOR(prelu_grad, ops::PReluGradOp); -REGISTER_OP_CPU_KERNEL( - prelu, ops::PReluKernel, - ops::PReluKernel); -REGISTER_OP_CPU_KERNEL( - prelu_grad, ops::PReluGradKernel, - ops::PReluGradKernel); diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu deleted file mode 100644 index 12e55d042d7..00000000000 --- a/paddle/fluid/operators/prelu_op.cu +++ /dev/null @@ -1,208 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/prelu.h" -#include "paddle/fluid/operators/prelu_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#define CUDA_NUM_THREADS 1024 - -inline static int PADDLE_GET_BLOCKS(const int N) { - return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; -} - -template -class CUDAPReluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* out = context.Output("Out"); - - const T* x_ptr = x->data(); - T* o_ptr = out->mutable_data(context.GetPlace()); - - const T* alpha_ptr = alpha->data(); - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - auto x_rank = dim.size(); - - VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1] << ", dim[" - << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel; - - if (mode == "channel") { - bool channel_last = data_format == "NHWC"; - size_t channel = channel_last ? dim[x_rank - 1] : dim[1]; - math::PreluChannelWiseDirectCUDAFunctor prelu_channel_wise; - prelu_channel_wise(context.cuda_device_context().stream(), x_ptr, - alpha_ptr, o_ptr, dim[0], channel, channel_last, - numel); - } else if (mode == "element") { - math::PreluElementWiseDirectCUDAFunctor prelu_element_wise; - prelu_element_wise(context.cuda_device_context().stream(), x_ptr, - alpha_ptr, o_ptr, dim[0], numel); - } else { - math::PreluScalarDirectCUDAFunctor prelu_scalar; - prelu_scalar(context.cuda_device_context().stream(), x_ptr, alpha_ptr, - o_ptr, numel); - } - } -}; - -enum PRELU_MODE { Element, ChannelFirst, ChannelLast, Scalar }; - -template -__global__ void PReluOpGradKernel(const T* x_ptr, const T* alpha_ptr, - const T* dy_ptr, T* dx_ptr, T* dalpha_ptr, - size_t channel_num, size_t plane_size, - size_t spatial_size, size_t numel, - PRELU_MODE mode) { - CUDA_KERNEL_LOOP(index, numel) { - T scale; - if (mode == Element) { - size_t element_index = index % spatial_size; - scale = alpha_ptr[element_index]; - } else if (mode == ChannelFirst) { - size_t temp = index / plane_size; - size_t channel_index = temp % channel_num; - scale = alpha_ptr[channel_index]; - } else if (mode == ChannelLast) { - size_t channel_index = index % channel_num; - scale = alpha_ptr[channel_index]; - } else { - scale = alpha_ptr[0]; - } - T x = x_ptr[index]; - T dy = dy_ptr[index]; - T zero = static_cast(0); - if (dx_ptr != nullptr) dx_ptr[index] = (x > zero) ? dy : scale * dy; - if (dalpha_ptr != nullptr) dalpha_ptr[index] = (x > zero) ? zero : x * dy; - } -} - -template -class PreluOpGradFunctor { - public: - void operator()(gpuStream_t stream, const T* x, const T* alpha, const T* dy, - T* dx, T* dalpha, const framework::DDim& input_dims, - PRELU_MODE mode) { - size_t numel = 1; - for (size_t i = 0; i < input_dims.size(); ++i) { - numel *= input_dims[i]; - } - size_t plane_size = numel / input_dims[0] / input_dims[1]; - size_t spatial_size = numel / input_dims[0]; - size_t channel = - mode == ChannelLast ? 
input_dims[input_dims.size() - 1] : input_dims[1]; - - PReluOpGradKernel< - T><<>>( - x, alpha, dy, dx, dalpha, channel, plane_size, spatial_size, numel, - mode); - } -}; - -template -class CUDAPReluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* dx = context.Output(framework::GradVarName("X")); - auto* dy = context.Input(framework::GradVarName("Out")); - auto* dalpha = context.Output(framework::GradVarName("Alpha")); - - const T* x_ptr = x->data(); - const T* alpha_ptr = alpha->data(); - const T* dy_ptr = dy->data(); - T* dx_ptr = dx ? dx->mutable_data(context.GetPlace()) : nullptr; - T* dalpha_ptr = - dalpha ? dalpha->mutable_data(context.GetPlace()) : nullptr; - - if (!dx && !dalpha) return; - - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - auto x_rank = dim.size(); - std::vector input_shape = phi::vectorize(dim); - auto stream = context.cuda_device_context().stream(); - - T* dalpha_tmp_ptr; - Tensor dalpha_tmp; - if (dalpha_ptr == nullptr) { - dalpha_tmp_ptr = dalpha_ptr; - } else { - auto& dev_ctx = context.template device_context(); - dalpha_tmp = context.AllocateTmpTensor(dim, dev_ctx); - dalpha_tmp_ptr = dalpha_tmp.mutable_data(context.GetPlace()); - } - - PRELU_MODE m; - bool channel_last = false; - if (mode == "element") { - m = Element; - } else if (mode == "channel") { - channel_last = data_format == "NHWC"; - m = channel_last ? ChannelLast : ChannelFirst; - } else { - m = Scalar; - } - PreluOpGradFunctor prelu_grad; - prelu_grad(stream, x_ptr, alpha_ptr, dy_ptr, dx_ptr, dalpha_tmp_ptr, dim, - m); - - if (dalpha_tmp_ptr == nullptr) return; - - std::vector reduce_dims; - for (size_t i = 0; i < dim.size(); i++) { - if (mode == "channel" && !channel_last && i == 1) continue; - if (mode == "channel" && channel_last && i == dim.size() - 1) continue; - if (mode == "element" && i != 0) continue; - reduce_dims.push_back(i); - } - - TensorReduceImpl>( - context.cuda_device_context(), dalpha_tmp, dalpha, - kps::IdentityFunctor(), reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - prelu, ops::CUDAPReluKernel, - ops::CUDAPReluKernel, - ops::CUDAPReluKernel); -REGISTER_OP_CUDA_KERNEL( - prelu_grad, - ops::CUDAPReluGradKernel, - ops::CUDAPReluGradKernel, - ops::CUDAPReluGradKernel); diff --git a/paddle/fluid/operators/prelu_op.h b/paddle/fluid/operators/prelu_op.h deleted file mode 100644 index 384994eb37c..00000000000 --- a/paddle/fluid/operators/prelu_op.h +++ /dev/null @@ -1,172 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/transform.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using platform::Transform; - -template -class PReluKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* alpha = context.Input("Alpha"); - auto* out = context.Output("Out"); - - const T* x_ptr = x->data(); - T* o_ptr = out->mutable_data(context.GetPlace()); - - const T* alpha_ptr = alpha->data(); - auto& mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - - int numel = x->numel(); - auto dim = x->dims(); - int index = 0; - int i = 0; - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i]; - } - } - } -}; - -template -class PReluGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* dx = context.Output(framework::GradVarName("X")); - auto* dout = context.Input(framework::GradVarName("Out")); - auto* dalpha = context.Output(framework::GradVarName("Alpha")); - auto* alpha = context.Input("Alpha"); - const T* alpha_ptr = alpha->data(); - const T* x_ptr = x->data(); - const T* dout_ptr = dout->data(); - std::string mode = context.Attr("mode"); - auto& data_format = context.Attr("data_format"); - int numel = x->numel(); - auto dim = x->dims(); - int index = 0; - int i = 0; - if (dx) { - T* dx_ptr = dx->mutable_data(context.GetPlace()); - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - dx_ptr[i] = - x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - dx_ptr[i] = x_ptr[i] > 0 ? 
dout_ptr[i] : alpha_ptr[0] * dout_ptr[i]; - } - } - } - - index = 0; - if (dalpha) { - T* dalpha_ptr = dalpha->mutable_data(context.GetPlace()); - memset(dalpha_ptr, 0, sizeof(T) * dalpha->numel()); - - if (mode == "channel") { - if (data_format == "NCHW") { - int temp = 1; - for (int j = 2; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = (i / temp) % dim[1]; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - index = i % dim[dim.size() - 1]; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } - } else if (mode == "element") { - int temp = 1; - for (int j = 1; j < dim.size(); j++) { - temp *= dim[j]; - } - for (i = 0; i < numel; i++) { - index = i % temp; - dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } else { - for (i = 0; i < numel; i++) { - dalpha_ptr[0] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; - } - } - } - - // TODO(Guanzhong): add GPU kernels - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index b7a7a4ec231..f09e8789478 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -918,6 +918,103 @@ void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { out->share_lod(x); } +void PReluInferMeta(const MetaTensor& x, + const MetaTensor& alpha, + const std::string& mode, + const std::string& data_format, + MetaTensor* out, + MetaConfig config) { + auto x_dim = x.dims(); + if (mode == "all") { + PADDLE_ENFORCE_EQ(phi::product(alpha.dims()), + 1, + phi::errors::InvalidArgument( + "For mode 'all', size of weight Alpha must be one. " + "But recevied alpha's size: %d.", + product(alpha.dims()))); + } else if (mode == "channel") { + auto x_rank = x_dim.size(); + PADDLE_ENFORCE_GE(x_rank, + 2, + phi::errors::InvalidArgument( + "For mode 'channel', rank of input X must be " + "equal or larger than 2. But recevied X's " + "rank: %d", + x_rank)); + PADDLE_ENFORCE_EQ(data_format == "NCHW" || data_format == "NHWC", + true, + phi::errors::InvalidArgument( + "For mode 'channel', data_format must be one of " + "NCHW and NHWC. But recevied data_format: %s", + data_format)); + if (data_format == "NCHW" || config.is_run_mkldnn_kernel) { + PADDLE_ENFORCE_EQ(product(alpha.dims()) == x_dim[1], + true, + phi::errors::InvalidArgument( + "For mode 'channel', size of weight Alpha must be " + "equal to the number of channels of input(x). But " + "recevied alpha's size: %d, x_dim[1]: %d", + product(alpha.dims()), + x_dim[1])); + } else { + PADDLE_ENFORCE_EQ(product(alpha.dims()) == x_dim[x_rank - 1], + true, + phi::errors::InvalidArgument( + "For mode 'channel', size of weight Alpha must be " + "equal to the number of channels of input(x). But " + "recevied alpha's size: %d, x_dim[%d]: %d", + product(alpha.dims()), + x_rank - 1, + x_dim[x_rank - 1])); + } + } else if (mode == "element") { + auto alpha_dim = alpha.dims(); + auto alpha_rank = alpha_dim.size(); + auto x_rank = x_dim.size(); + PADDLE_ENFORCE_GE(x_rank, + 1, + phi::errors::InvalidArgument( + "For mode 'element', rank of input X must be " + "equal or larger than 2. But recevied X's " + "rank: %d", + x_rank)); + PADDLE_ENFORCE_EQ( + alpha_rank, + x_rank, + phi::errors::InvalidArgument( + "For mode 'element', rank of weight Alpha must be ", + "equal to the rank of input(x). 
But recevied alpha's rank: %d, " + "x's rank: %d.", + alpha_rank, + x_rank)); + size_t x_product = 1; + size_t alpha_product = 1; + for (int64_t i = x_rank - 1; i > 0; i--) { + x_product *= x_dim[i]; + alpha_product *= alpha_dim[i]; + } + PADDLE_ENFORCE_EQ( + alpha_product, + x_product, + phi::errors::InvalidArgument( + "For mode 'element', the size of weight Alpha must be " + "equal to the size of input(x). But recevied alpha's size: %d, " + "x's size: %d.", + alpha_product, + x_product)); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Attr(mode) of prelu must be one of 'all', 'channel', or 'element'. " + "But recevied " + "mode: '%s'.", + mode)); + } + out->set_dims(x_dim); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); +} + void SearchsortedInferMeta(const MetaTensor& sorted_sequence, const MetaTensor& value, bool out_int32, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index cb680415e7d..cb7a83f39a4 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -146,6 +146,13 @@ void MatmulInferMeta(const MetaTensor& x, void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out); +void PReluInferMeta(const MetaTensor& x, + const MetaTensor& alpha, + const std::string& mode, + const std::string& data_format, + MetaTensor* out, + MetaConfig config); + void SearchsortedInferMeta(const MetaTensor& sorted_sequence, const MetaTensor& value, bool out_int32, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index baa5b39670f..03029550c2a 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1650,7 +1650,7 @@ void UnchangedInferMetaCheckAxis(const MetaTensor& x, PADDLE_ENFORCE_GE( axis, -rank, - errors::InvalidArgument( + phi::errors::InvalidArgument( "Attr(axis) value should be in range [-R, R-1], " "R is the rank of Input(X). But received axis: %d, R: %d.", axis, diff --git a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc new file mode 100644 index 00000000000..254c4ea5716 --- /dev/null +++ b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc @@ -0,0 +1,146 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
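+// Reference identities for the functor and kernel below (a sketch of the
+// math the kernel relies on, not a full derivation):
+//   gelu(x)  = 0.5 * x * (1 + erf(x / sqrt(2)))  = x * Phi(x)
+//   gelu'(x) = Phi(x) + x * phi(x),  with phi(x) = exp(-x^2 / 2) / sqrt(2 * pi)
+// Tanh approximation (GELU_CONSTANT = 0.044715), differentiated in the
+// `approximate` branch:
+//   gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
+// Minimal call sketch (tensor names are illustrative; the signature is the
+// one declared in paddle/phi/kernels/gelu_grad_kernel.h):
+//   phi::GeluGradKernel<float>(cpu_ctx, x, out_grad, /*approximate=*/false, &x_grad);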
+ +#include "paddle/phi/kernels/gelu_grad_kernel.h" + +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas_impl.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/gelu_kernel.h" + +namespace phi { + +template +struct GeluGradFunctor { + template + void operator()(Device d, X x, dOut dout, dX dx, bool approximate) const { + if (approximate) { + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto casted_dout = dout.template cast(); + + const float kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + const float kBeta = + kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); + const auto y = + (kAlpha * + ((static_cast(GELU_CONSTANT) * casted_x.cube()) + casted_x)) + .tanh(); + dx.device(d) = (static_cast(0.5) * casted_dout * + (static_cast(1) + y + + (casted_x - casted_x * y.square()) * + (kAlpha + kBeta * casted_x.square()))) + .template cast(); + } else { + const T kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + const T kBeta = + kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); + const auto y = + (kAlpha * ((static_cast(GELU_CONSTANT) * x.cube()) + x)).tanh(); + dx.device(d) = static_cast(0.5) * dout * + (static_cast(1) + y + + (x - x * y.square()) * (kAlpha + kBeta * x.square())); + } + } else { +#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ + !defined(PADDLE_WITH_HIP) + auto x_data = x.data(); + auto dx_data = dx.data(); + auto dout_data = dout.data(); + int n = std::min(x.size(), dx.size()); + + auto first = static_cast(std::malloc(n * sizeof(T))); + std::memset(first, 0, n * sizeof(T)); + auto second = static_cast(std::malloc(n * sizeof(T))); + std::memset(second, 0, n * sizeof(T)); + + // first = (0.5 * (1 + erf(x / sqrt(2)))) + phi::funcs::CBlas::AXPY( + n, static_cast(M_SQRT1_2), x_data, 1, first, 1); + phi::funcs::CBlas::VMERF(n, first, first, VML_LA); + for (int i = 0; i < n; i++) { + first[i] += static_cast(1); + } + phi::funcs::CBlas::SCAL(n, static_cast(0.5), first, 1); + + // second = (0.5 * 2/sqrt(pi) * 1/sqrt(2) * x * exp(-0.5 * x^2)) + phi::funcs::CBlas::VSQUARE(n, x_data, second); + phi::funcs::CBlas::SCAL(n, -static_cast(0.5), second, 1); + phi::funcs::CBlas::VEXP(n, second, second); + phi::funcs::CBlas::VMUL(n, x_data, second, second); + phi::funcs::CBlas::SCAL( + n, static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2), second, 1); + + // dx = dout * (first + second); + phi::funcs::CBlas::VADD(n, first, second, first); + phi::funcs::CBlas::VMUL(n, dout_data, first, dx_data); + + std::free(first); + std::free(second); +#else + // gelu_grad(x) = dout * 0.5 * (1 + erf(x / sqrt(2)) + x * sqrt(2 / pi) * + // exp(- x^2 / 2) + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto casted_dout = dout.template cast(); + auto first = static_cast(0.5) * + (static_cast(1) + + ((casted_x * static_cast(M_SQRT1_2)).erf())); + auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * + casted_x * + (-static_cast(0.5) * casted_x.square()).exp(); + dx.device(d) = (casted_dout * (first + second)).template cast(); + } else { + auto first = + static_cast(0.5) * + (static_cast(1) + ((x * 
static_cast(M_SQRT1_2)).erf())); + + auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * x * + (-static_cast(0.5) * x.square()).exp(); + dx.device(d) = dout * (first + second); + } +#endif + } + } +}; + +template +void GeluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + bool approximate, + DenseTensor* x_grad) { + dev_ctx.template Alloc(x_grad); + auto eigen_x = EigenVector::Flatten(x); + auto eigen_out_grad = EigenVector::Flatten(out_grad); + auto eigen_x_grad = EigenVector::Flatten(*x_grad); + auto& dev = *dev_ctx.eigen_device(); + + GeluGradFunctor functor; + functor(dev, eigen_x, eigen_out_grad, eigen_x_grad, approximate); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + gelu_grad, CPU, ALL_LAYOUT, phi::GeluGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/gelu_kernel.cc b/paddle/phi/kernels/cpu/gelu_kernel.cc new file mode 100644 index 00000000000..d7af2205745 --- /dev/null +++ b/paddle/phi/kernels/cpu/gelu_kernel.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gelu_kernel.h" +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas_impl.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +struct GeluFunctor { + template + void operator()(Device d, X x, Out out, bool approximate) const { + if (approximate) { + // gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / \pi) * (x + 0.044715 * x^{3}))) + if (std::is_same::value) { + VLOG(4) << "cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto temp = + (static_cast(M_2_SQRTPI * M_SQRT1_2) * + (casted_x + static_cast(GELU_CONSTANT) * casted_x.cube())) + .tanh(); + out.device(d) = (casted_x * static_cast(0.5) * + (static_cast(1) + temp)) + .template cast(); + } else { + auto temp = (static_cast(M_2_SQRTPI * M_SQRT1_2) * + (x + static_cast(GELU_CONSTANT) * x.cube())) + .tanh(); + out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); + } + } else { +#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ + !defined(PADDLE_WITH_HIP) + auto x_data = x.data(); + auto out_data = out.data(); + int n = std::min(x.size(), out.size()); + + std::memset(out_data, 0, n * sizeof(T)); + phi::funcs::CBlas::AXPY( + n, static_cast(M_SQRT1_2), x_data, 1, out_data, 1); + phi::funcs::CBlas::VMERF(n, out_data, out_data, VML_LA); + for (int i = 0; i < n; i++) { + out_data[i] += static_cast(1); + } + phi::funcs::CBlas::VMUL(n, x_data, out_data, out_data); + for (int i = 0; i < n; i++) { + out_data[i] *= static_cast(0.5); + } +#else + // gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))) + if (std::is_same::value) { + VLOG(4) << 
"cast from float16 to float before computing"; + auto casted_x = x.template cast(); + auto temp = (casted_x * static_cast(M_SQRT1_2)).erf(); + out.device(d) = (casted_x * static_cast(0.5) * + (static_cast(1) + temp)) + .template cast(); + } else { + auto temp = (x * static_cast(M_SQRT1_2)).erf(); + out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); + } +#endif + } + } +}; + +template +void GeluKernel(const Context& dev_ctx, + const DenseTensor& x, + bool approximate, + DenseTensor* out) { + dev_ctx.template Alloc(out); + auto eigen_out = EigenVector::Flatten(*out); + auto eigen_x = EigenVector::Flatten(x); + auto& dev = *dev_ctx.eigen_device(); + + GeluFunctor functor; + functor(dev, eigen_x, eigen_out, approximate); +} + +} // namespace phi + +PD_REGISTER_KERNEL(gelu, CPU, ALL_LAYOUT, phi::GeluKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc new file mode 100644 index 00000000000..5f344b9cc3f --- /dev/null +++ b/paddle/phi/kernels/cpu/log_softmax_grad_kernel.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_softmax_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +using EigenMatrixTemplate = EigenMatrix; + +template +struct LogSoftmaxGradFunctor { + void operator()(const Context& context, + const DenseTensor* Y, + const DenseTensor* dY, + DenseTensor* dX, + const int axis) { + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + + const int n = funcs::SizeToAxis(axis, Y->dims()); + const int d = funcs::SizeFromAxis(axis, Y->dims()); + phi::DDim dim_2d{n, d}; + + auto y = EigenMatrixTemplate::From(*Y, dim_2d); + auto dy = EigenMatrixTemplate::From(*dY, dim_2d); + auto dx = EigenMatrixTemplate::From(*dX, dim_2d); + + const int axis_dim = Y->dims()[axis]; + const int batch_size = y.dimension(kBatchDim); + const int num_classes = y.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + Eigen::DSizes one_axis(1, axis_dim); + + dx.device(*context.eigen_device()) = + dy - + (y.exp()) * (dy.reshape(batch_axis_remain) + .sum(along_class) + .broadcast(one_axis)); + } +}; + +template +void LogSoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad) { + const int rank = out.dims().size(); + const int canonical_axis = funcs::CanonicalAxis(axis, rank); + + dev_ctx.template Alloc(x_grad); + if (out.numel() != 0) { + LogSoftmaxGradFunctor()( + dev_ctx, &out, &out_grad, x_grad, canonical_axis); + } +} + +} // 
namespace phi + +PD_REGISTER_KERNEL(log_softmax_grad, + CPU, + ALL_LAYOUT, + phi::LogSoftmaxGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/log_softmax_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_kernel.cc new file mode 100644 index 00000000000..241742378cc --- /dev/null +++ b/paddle/phi/kernels/cpu/log_softmax_kernel.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_softmax_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +using EigenMatrixTemplate = EigenMatrix; + +template +struct ValueClip { + HOSTDEVICE T operator()(const T& x) const { + const T kThreshold = static_cast(-64.); + return x < kThreshold ? kThreshold : x; + } +}; + +template +struct LogSoftmaxFunctor { + void operator()(const Context& context, + const DenseTensor* X, + DenseTensor* Y, + const int axis) { + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + constexpr int kAxisDim = 1; + + int axis_dim = X->dims()[axis]; + const int n = funcs::SizeToAxis(axis, X->dims()); + const int d = funcs::SizeFromAxis(axis, X->dims()); + phi::DDim dim_2d{n, d}; + + auto logits = EigenMatrixTemplate::From(*X, dim_2d); + auto log_softmax = EigenMatrixTemplate::From(*Y, dim_2d); + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_axis(kAxisDim); + Eigen::DSizes batch_classes(batch_size, num_classes); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_one_remain(batch_size, 1, num_remain); + Eigen::DSizes one_axis_one(1, axis_dim, 1); + Eigen::DSizes one_axis(1, axis_dim); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + + // For numerical stability, logits should be shifted by maximum number along + // axis, calculate shifted_logits into log_softmax tensor for memory reuse. 
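+    // Equivalently, with m = max(x) taken along the axis dimension:
+    //   log_softmax(x)_i = (x_i - m) - log(sum_j exp(x_j - m))
+    // Both branches below compute the shifted logits; the shared
+    // log-sum-exp subtraction is applied after the if/else.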
+ if (num_remain == 1) { + // axis == -1, axis and class in same dimension, calculate along + // class dimension directly for higher performance + log_softmax.device(*context.eigen_device()) = + (logits - + logits.maximum(along_axis) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); + } else { + // axis != -1, class dimension split into (axis, remain), max and sum + // should be calculated along axis dimension + log_softmax.device(*context.eigen_device()) = + (logits.reshape(batch_axis_remain) - + logits.reshape(batch_axis_remain) + .maximum(along_axis) + .eval() + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) + .unaryExpr(ValueClip()); + } + + log_softmax.device(*context.eigen_device()) = + log_softmax - + log_softmax.exp() + .eval() + .reshape(batch_axis_remain) + .sum(along_axis) + .log() + .broadcast(one_axis); + } +}; + +template +void LogSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { + const int rank = x.dims().size(); + const int canonical_axis = funcs::CanonicalAxis(axis, rank); + + dev_ctx.template Alloc(out); + if (x.numel() != 0) { + LogSoftmaxFunctor()(dev_ctx, &x, out, canonical_axis); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + log_softmax, CPU, ALL_LAYOUT, phi::LogSoftmaxKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/prelu_grad_kernel.cc b/paddle/phi/kernels/cpu/prelu_grad_kernel.cc new file mode 100644 index 00000000000..97558cdb31f --- /dev/null +++ b/paddle/phi/kernels/cpu/prelu_grad_kernel.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/prelu_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void PReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& data_format, + DenseTensor* x_grad, + DenseTensor* alpha_grad) { + const T* alpha_ptr = alpha.data(); + const T* x_ptr = x.data(); + const T* out_grad_ptr = out_grad.data(); + int numel = x.numel(); + auto dim = x.dims(); + int index = 0; + int i = 0; + if (x_grad) { + T* x_grad_ptr = dev_ctx.template Alloc(x_grad); + if (mode == "channel") { + if (data_format == "NCHW") { + int temp = 1; + for (int j = 2; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = (i / temp) % dim[1]; + x_grad_ptr[i] = x_ptr[i] > 0 ? out_grad_ptr[i] + : alpha_ptr[index] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + index = i % dim[dim.size() - 1]; + x_grad_ptr[i] = x_ptr[i] > 0 ? 
out_grad_ptr[i] + : alpha_ptr[index] * out_grad_ptr[i]; + } + } + } else if (mode == "element") { + int temp = 1; + for (int j = 1; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = i % temp; + x_grad_ptr[i] = + x_ptr[i] > 0 ? out_grad_ptr[i] : alpha_ptr[index] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + x_grad_ptr[i] = + x_ptr[i] > 0 ? out_grad_ptr[i] : alpha_ptr[0] * out_grad_ptr[i]; + } + } + } + + index = 0; + if (alpha_grad) { + T* alpha_grad_ptr = dev_ctx.template Alloc(alpha_grad); + memset(alpha_grad_ptr, 0, sizeof(T) * alpha_grad->numel()); + + if (mode == "channel") { + if (data_format == "NCHW") { + int temp = 1; + for (int j = 2; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = (i / temp) % dim[1]; + alpha_grad_ptr[index] += + x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + index = i % dim[dim.size() - 1]; + alpha_grad_ptr[index] += + x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } + } else if (mode == "element") { + int temp = 1; + for (int j = 1; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = i % temp; + alpha_grad_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + alpha_grad_ptr[0] += x_ptr[i] > 0 ? 0 : x_ptr[i] * out_grad_ptr[i]; + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + prelu_grad, CPU, ALL_LAYOUT, phi::PReluGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/prelu_kernel.cc b/paddle/phi/kernels/cpu/prelu_kernel.cc new file mode 100644 index 00000000000..8f389ab9ff4 --- /dev/null +++ b/paddle/phi/kernels/cpu/prelu_kernel.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/prelu_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void PReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const std::string& mode, + const std::string& data_format, + DenseTensor* out) { + const T* x_ptr = x.data(); + const T* alpha_ptr = alpha.data(); + T* o_ptr = dev_ctx.template Alloc(out); + + int numel = x.numel(); + auto dim = x.dims(); + int index = 0; + int i = 0; + if (mode == "channel") { + if (data_format == "NCHW") { + int temp = 1; + for (int j = 2; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = (i / temp) % dim[1]; + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + index = i % dim[dim.size() - 1]; + o_ptr[i] = x_ptr[i] > 0 ? 
x_ptr[i] : alpha_ptr[index] * x_ptr[i]; + } + } + } else if (mode == "element") { + int temp = 1; + for (int j = 1; j < dim.size(); j++) { + temp *= dim[j]; + } + for (i = 0; i < numel; i++) { + index = i % temp; + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; + } + } else { + for (i = 0; i < numel; i++) { + o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i]; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(prelu, CPU, ALL_LAYOUT, phi::PReluKernel, float, double) {} diff --git a/paddle/phi/kernels/gelu_grad_kernel.h b/paddle/phi/kernels/gelu_grad_kernel.h new file mode 100644 index 00000000000..fd70e8d54bc --- /dev/null +++ b/paddle/phi/kernels/gelu_grad_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GeluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + bool approximate, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/gelu_kernel.h b/paddle/phi/kernels/gelu_kernel.h new file mode 100644 index 00000000000..bc106a04031 --- /dev/null +++ b/paddle/phi/kernels/gelu_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +#define GELU_CONSTANT 0.044715 + +template +void GeluKernel(const Context& dev_ctx, + const DenseTensor& x, + bool approximate, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h new file mode 100644 index 00000000000..2b9be7c6154 --- /dev/null +++ b/paddle/phi/kernels/gpu/gelu_funcs.h @@ -0,0 +1,176 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/flags.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" + +DECLARE_bool(use_fast_math); + +namespace phi { + +#ifdef __NVCC__ +template +static __device__ __forceinline__ float FP32FastTanh(float x) { +#if __CUDA_ARCH__ >= 750 && CUDA_VERSION >= 11000 + if (FastMode) { + float y; + asm("tanh.approx.f32 %0,%1; \n\t" : "=f"(y) : "f"(x)); + return y; + } +#endif + return tanhf(x); +} + +template +static __device__ __forceinline__ float FP32GeluFwd(float x) { + auto tanh_out = + FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); + return x * 0.5f * (1.0f + tanh_out); +} + +template +static __device__ __forceinline__ float FP32GeluBwd(float x, float y_g) { + auto tanh_out = + FP32FastTanh(0.79788456f * x * (1.0f + 0.044715f * x * x)); + auto tmp = 0.5f * x * ((1.0f - tanh_out * tanh_out) * + (0.79788456f + 0.1070322243f * x * x)) + + 0.5f * (1.0f + tanh_out); + return tmp * y_g; +} + +template +static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, + __half* y, + size_t n) { + size_t offset = + static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; + for (; offset < n; offset += stride) { + using ArrT = phi::AlignedVector<__half, VecSize>; + ArrT in_arr = *reinterpret_cast(x + offset); +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + float tmp = __half2float(in_arr[i]); + in_arr[i] = __float2half(FP32GeluFwd(tmp)); + } + *reinterpret_cast(y + offset) = in_arr; + } +} + +template +static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x, + const __half* y_g, + __half* x_g, + size_t n) { + size_t offset = + static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; + for (; offset < n; offset += stride) { + using ArrT = phi::AlignedVector<__half, VecSize>; + ArrT x_in_arr = *reinterpret_cast(x + offset); + ArrT y_g_in_arr = *reinterpret_cast(y_g + offset); +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + __half2 tmp_fp16_2; + tmp_fp16_2.x = x_in_arr[i]; + tmp_fp16_2.y = y_g_in_arr[i]; + float2 tmp_fp32_2 = __half22float2(tmp_fp16_2); + x_in_arr[i] = + __float2half(FP32GeluBwd(tmp_fp32_2.x, tmp_fp32_2.y)); + } + *reinterpret_cast(x_g + offset) = x_in_arr; + } +} + +static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( + const GPUContext& dev_ctx, const __half* x, __half* y, size_t n) { + auto is_aligned = [](const void* p, size_t alignment) { + return reinterpret_cast(p) % alignment == 0; + }; + +#define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \ + do { \ + constexpr auto kAlignment = \ + alignof(phi::AlignedVector<__half, __vec_size>); \ + if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ + is_aligned(y, kAlignment)) { \ + size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ + size_t block = (n / __vec_size + thread - 1) / thread; \ + block = std::min(block, 
dev_ctx.GetCUDAMaxGridDimSize()[0]); \ + VLOG(10) << "Use FP16 fast gelu fwd kernel, block = " << block \ + << " , thread = " << thread; \ + FP16FastGeluFwdCUDAKernel< \ + __vec_size, \ + __use_fast_math><<>>(x, y, n); \ + return true; \ + } \ + } while (0) + + if (FLAGS_use_fast_math) { + PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, true); + } else { + PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(8, false); + } + +#undef PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL + return false; +} + +static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( + const GPUContext& dev_ctx, + const __half* x, + const __half* y_g, + __half* x_g, + size_t n) { + auto is_aligned = [](const void* p, size_t alignment) { + return reinterpret_cast(p) % alignment == 0; + }; + +#define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \ + do { \ + constexpr auto kAlignment = \ + alignof(phi::AlignedVector<__half, __vec_size>); \ + if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ + is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \ + is_aligned(x_g, kAlignment)) { \ + size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ + size_t block = (n / __vec_size + thread - 1) / thread; \ + block = std::min(block, dev_ctx.GetCUDAMaxGridDimSize()[0]); \ + VLOG(10) << "Use FP16 fast gelu bwd kernel, block = " << block \ + << " , thread = " << thread; \ + FP16FastGeluBwdCUDAKernel< \ + __vec_size, \ + __use_fast_math><<>>( \ + x, y_g, x_g, n); \ + return true; \ + } \ + } while (0) + + if (FLAGS_use_fast_math) { + PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, true); + } else { + PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(8, false); + } + +#undef PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL + return false; +} +#endif + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu new file mode 100644 index 00000000000..1e21f8d4267 --- /dev/null +++ b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu @@ -0,0 +1,100 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
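+// Note on the FP16 fast path used below (see gelu_funcs.h): the vectorized
+// __half kernel is only launched when the element count is a multiple of the
+// vector width (8) and x, out_grad and x_grad are all aligned to
+// AlignedVector<__half, 8>. FLAGS_use_fast_math additionally switches the
+// tanh to the tanh.approx.f32 PTX instruction on sm_75+ with CUDA 11+.
+// All other cases fall back to the generic BroadcastKernel path.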
+ +#include "paddle/phi/kernels/gelu_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/gpu/gelu_funcs.h" + +DECLARE_bool(use_fast_math); + +namespace phi { + +template +struct GeluWithApproximateGradFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { + MPType x = static_cast(arg_x); + MPType dout = static_cast(arg_dout); + MPType one = static_cast(1); + MPType half = static_cast(0.5); + MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + MPType kBeta = + kAlpha * static_cast(GELU_CONSTANT) * static_cast(3); + auto cube_x = x * x * x; + auto tanh_out = + tanh(kAlpha * ((static_cast(GELU_CONSTANT) * cube_x) + x)); + auto ans = + half * (one + tanh_out + + (one - tanh_out * tanh_out) * (x * kAlpha + kBeta * cube_x)); + return static_cast(ans * dout); + } +}; + +template +struct GeluWithoutApproximateGradFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x, T arg_dout) { + MPType x = static_cast(arg_x); + MPType dout = static_cast(arg_dout); + constexpr MPType kBeta = M_2_SQRTPI * M_SQRT1_2 * static_cast(0.5); + const MPType cdf = normcdf(x); + const MPType pdf = exp(static_cast(-0.5) * x * x) * kBeta; + return static_cast(dout * (cdf + x * pdf)); + } +}; + +template +void GeluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + bool approximate, + DenseTensor* x_grad) { + dev_ctx.template Alloc(x_grad); + std::vector ins = {&x, &out_grad}; + std::vector outs = {x_grad}; + if (approximate) { +#ifdef __NVCC__ + if (std::is_same::value) { + size_t n = x.numel(); + const auto* x_ptr = reinterpret_cast(x.data()); + const auto* y_g_ptr = reinterpret_cast(out_grad.data()); + auto* x_g_ptr = reinterpret_cast<__half*>(x_grad->data()); + if (TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( + dev_ctx, x_ptr, y_g_ptr, x_g_ptr, n)) { + return; + } + } +#endif + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithApproximateGradFunctor()); + } else { + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithoutApproximateGradFunctor()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gelu_grad, + GPU, + ALL_LAYOUT, + phi::GeluGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/gelu_kernel.cu b/paddle/phi/kernels/gpu/gelu_kernel.cu new file mode 100644 index 00000000000..ce6dda2d6cc --- /dev/null +++ b/paddle/phi/kernels/gpu/gelu_kernel.cu @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/gelu_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/gpu/gelu_funcs.h" + +DECLARE_bool(use_fast_math); + +namespace phi { + +template +struct GeluWithApproximateFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x) { + // this function is tanh approximation of gelu + MPType x = static_cast(arg_x); + MPType one = static_cast(1); + MPType half = static_cast(0.5); + MPType kAlpha = static_cast(M_2_SQRTPI * M_SQRT1_2); + auto tanh_out = + tanh(kAlpha * x * (one + static_cast(GELU_CONSTANT) * x * x)); + MPType out = x * half * (one + tanh_out); + return static_cast(out); + } +}; + +template +struct GeluWithoutApproximateFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T arg_x) { + // actual gelu with approximation = false + MPType x = static_cast(arg_x); + return static_cast(x * normcdf(x)); + } +}; + +template +void GeluKernel(const Context& dev_ctx, + const DenseTensor& x, + bool approximate, + DenseTensor* out) { + dev_ctx.template Alloc(out); + std::vector ins = {&x}; + std::vector outs = {out}; + if (approximate) { +#ifdef __NVCC__ + if (std::is_same::value) { + size_t n = x.numel(); + const auto* in_ptr = reinterpret_cast(x.data()); + auto* out_ptr = reinterpret_cast<__half*>(out->data()); + if (TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( + dev_ctx, in_ptr, out_ptr, n)) { + return; + } + } +#endif + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithApproximateFunctor()); + } else { + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, 0, GeluWithoutApproximateFunctor()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(gelu, + GPU, + ALL_LAYOUT, + phi::GeluKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu new file mode 100644 index 00000000000..f7b28253655 --- /dev/null +++ b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -0,0 +1,53 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/log_softmax_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +namespace phi { + +template +void LogSoftmaxGradKernel(const Context &dev_ctx, + const DenseTensor &out, + const DenseTensor &out_grad, + int axis, + DenseTensor *x_grad) { + dev_ctx.template Alloc(x_grad); + phi::SoftmaxBackwardCUDAKernelDriver( + dev_ctx, out, out_grad, axis, x_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(log_softmax_grad, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(log_softmax_grad, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu new file mode 100644 index 00000000000..d7e34c6c14e --- /dev/null +++ b/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_softmax_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +namespace phi { + +template +void LogSoftmaxKernel(const Context &dev_ctx, + const DenseTensor &x, + int axis, + DenseTensor *out) { + dev_ctx.template Alloc(out); + phi::SoftmaxForwardCUDAKernelDriver(dev_ctx, x, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(log_softmax, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +PD_REGISTER_KERNEL(log_softmax, + GPU, + ALL_LAYOUT, + phi::LogSoftmaxKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/gpu/prelu_funcs.h b/paddle/phi/kernels/gpu/prelu_funcs.h new file mode 100644 index 00000000000..76ee9439a20 --- /dev/null +++ b/paddle/phi/kernels/gpu/prelu_funcs.h @@ -0,0 +1,183 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +#define CUDA_NUM_THREADS 1024 + +inline static int PADDLE_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__global__ void PReluChannelFirstWiseKernel(const T *input, + const T *alpha, + T *output, + size_t channel_num, + size_t plane_size, + size_t numel) { + CUDA_KERNEL_LOOP(index, numel) { + size_t temp = index / plane_size; + size_t channel_index = temp % channel_num; + T scale = alpha[channel_index]; + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +__global__ void PReluChannelLastWiseKernel(const T *input, + const T *alpha, + T *output, + size_t channel_num, + size_t numel) { + CUDA_KERNEL_LOOP(index, numel) { + size_t channel_index = index % channel_num; + T scale = alpha[channel_index]; + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +__global__ void PReluElementWiseKernel(const T *input, + const T *alpha, + T *output, + size_t spatial_size, + size_t numel) { + CUDA_KERNEL_LOOP(index, numel) { + size_t element_index = index % spatial_size; + T scale = alpha[element_index]; + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +__global__ void PReluScalarKernel(const T *input, + const T *alpha, + T *output, + size_t numel) { + T scale = alpha[0]; + CUDA_KERNEL_LOOP(index, numel) { + T x = input[index]; + T zero = static_cast(0); + output[index] = (x > zero) ? x : scale * x; + } +} + +template +class PreluChannelWiseDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t channel, + bool channel_last, + size_t numel); +}; + +template +class PreluElementWiseDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t numel); +}; + +template +class PreluScalarDirectCUDAFunctor { + public: + void operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t numel); +}; + +template +void PreluChannelWiseDirectCUDAFunctor::operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t channel, + bool channel_last, + size_t numel) { + if (channel_last) { + PReluChannelLastWiseKernel<<>>( + input, alpha, output, channel, numel); + } else { + PReluChannelFirstWiseKernel<<>>( + input, alpha, output, channel, numel / batch_size / channel, numel); + } +} + +template +void PreluElementWiseDirectCUDAFunctor::operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t batch_size, + size_t numel) { + PReluElementWiseKernel<<>>( + input, alpha, output, numel / batch_size, numel); +} + +template +void PreluScalarDirectCUDAFunctor::operator()(gpuStream_t stream, + const T *input, + const T *alpha, + T *output, + size_t numel) { + PReluScalarKernel<<>>( + input, alpha, output, numel); +} + +template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; + +template class PreluElementWiseDirectCUDAFunctor; +template class PreluElementWiseDirectCUDAFunctor; +template class PreluElementWiseDirectCUDAFunctor; + +template class PreluScalarDirectCUDAFunctor; +template class 
PreluScalarDirectCUDAFunctor; +template class PreluScalarDirectCUDAFunctor; + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/prelu_grad_kernel.cu b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu new file mode 100644 index 00000000000..d8661268e82 --- /dev/null +++ b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu @@ -0,0 +1,183 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/prelu_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/gpu/prelu_funcs.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" + +namespace phi { + +enum PRELU_MODE { Element, ChannelFirst, ChannelLast, PRELU_Scalar }; + +template +__global__ void PReluOpGradKernel(const T* x_ptr, + const T* alpha_ptr, + const T* out_grad_ptr, + T* x_grad_ptr, + T* alpha_grad_ptr, + size_t channel_num, + size_t plane_size, + size_t spatial_size, + size_t numel, + PRELU_MODE mode) { + CUDA_KERNEL_LOOP(index, numel) { + T scale; + if (mode == Element) { + size_t element_index = index % spatial_size; + scale = alpha_ptr[element_index]; + } else if (mode == ChannelFirst) { + size_t temp = index / plane_size; + size_t channel_index = temp % channel_num; + scale = alpha_ptr[channel_index]; + } else if (mode == ChannelLast) { + size_t channel_index = index % channel_num; + scale = alpha_ptr[channel_index]; + } else { + scale = alpha_ptr[0]; + } + T x = x_ptr[index]; + T out_grad = out_grad_ptr[index]; + T zero = static_cast(0); + if (x_grad_ptr != nullptr) + x_grad_ptr[index] = (x > zero) ? out_grad : scale * out_grad; + if (alpha_grad_ptr != nullptr) + alpha_grad_ptr[index] = (x > zero) ? zero : x * out_grad; + } +} + +template +class PreluOpGradFunctor { + public: + void operator()(gpuStream_t stream, + const T* x, + const T* alpha, + const T* out_grad, + T* x_grad, + T* alpha_grad, + const DDim& input_dims, + PRELU_MODE mode) { + size_t numel = 1; + for (size_t i = 0; i < input_dims.size(); ++i) { + numel *= input_dims[i]; + } + size_t plane_size = numel / input_dims[0] / input_dims[1]; + size_t spatial_size = numel / input_dims[0]; + size_t channel = + mode == ChannelLast ? input_dims[input_dims.size() - 1] : input_dims[1]; + + PReluOpGradKernel< + T><<>>( + x, + alpha, + out_grad, + x_grad, + alpha_grad, + channel, + plane_size, + spatial_size, + numel, + mode); + } +}; + +template +void PReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& data_format, + DenseTensor* x_grad, + DenseTensor* alpha_grad) { + dev_ctx.template Alloc(x_grad); + + const T* x_ptr = x.data(); + const T* alpha_ptr = alpha.data(); + const T* out_grad_ptr = out_grad.data(); + T* x_grad_ptr = x_grad ? 
dev_ctx.template Alloc(x_grad) : nullptr; + T* alpha_grad_ptr = + alpha_grad ? dev_ctx.template Alloc(alpha_grad) : nullptr; + + if (!x_grad && !alpha_grad) return; + + int numel = x.numel(); + auto dim = x.dims(); + auto x_rank = dim.size(); + std::vector input_shape = phi::vectorize(dim); + auto stream = dev_ctx.stream(); + + T* alpha_grad_tmp_ptr; + DenseTensor alpha_grad_tmp; + if (alpha_grad_ptr == nullptr) { + alpha_grad_tmp_ptr = alpha_grad_ptr; + } else { + DenseTensorMeta alpha_grad_meta( + alpha_grad->dtype(), dim, alpha_grad->layout()); + alpha_grad_tmp = phi::Empty(dev_ctx, std::move(alpha_grad_meta)); + alpha_grad_tmp_ptr = alpha_grad_tmp.data(); + } + + PRELU_MODE m; + bool channel_last = false; + if (mode == "element") { + m = Element; + } else if (mode == "channel") { + channel_last = data_format == "NHWC"; + m = channel_last ? ChannelLast : ChannelFirst; + } else { + m = PRELU_Scalar; + } + PreluOpGradFunctor prelu_grad; + prelu_grad(stream, + x_ptr, + alpha_ptr, + out_grad_ptr, + x_grad_ptr, + alpha_grad_tmp_ptr, + dim, + m); + + if (alpha_grad_tmp_ptr == nullptr) return; + + std::vector reduce_dims; + for (size_t i = 0; i < dim.size(); i++) { + if (mode == "channel" && !channel_last && i == 1) continue; + if (mode == "channel" && channel_last && i == dim.size() - 1) continue; + if (mode == "element" && i != 0) continue; + reduce_dims.push_back(i); + } + + phi::funcs::ReduceKernel>( + static_cast(dev_ctx), + alpha_grad_tmp, + alpha_grad, + kps::IdentityFunctor(), + reduce_dims); +} + +} // namespace phi + +PD_REGISTER_KERNEL(prelu_grad, + GPU, + ALL_LAYOUT, + phi::PReluGradKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/gpu/prelu_kernel.cu b/paddle/phi/kernels/gpu/prelu_kernel.cu new file mode 100644 index 00000000000..8255a7ba2ed --- /dev/null +++ b/paddle/phi/kernels/gpu/prelu_kernel.cu @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/prelu_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/prelu_funcs.h" + +namespace phi { + +template +void PReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const std::string& mode, + const std::string& data_format, + DenseTensor* out) { + const T* x_ptr = x.data(); + T* o_ptr = dev_ctx.template Alloc(out); + + const T* alpha_ptr = alpha.data(); + int numel = x.numel(); + auto dim = x.dims(); + auto x_rank = dim.size(); + + VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1] << ", dim[" + << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel; + + if (mode == "channel") { + bool channel_last = data_format == "NHWC"; + size_t channel = channel_last ? 
dim[x_rank - 1] : dim[1]; + PreluChannelWiseDirectCUDAFunctor prelu_channel_wise; + prelu_channel_wise(dev_ctx.stream(), + x_ptr, + alpha_ptr, + o_ptr, + dim[0], + channel, + channel_last, + numel); + } else if (mode == "element") { + PreluElementWiseDirectCUDAFunctor prelu_element_wise; + prelu_element_wise( + dev_ctx.stream(), x_ptr, alpha_ptr, o_ptr, dim[0], numel); + } else { + PreluScalarDirectCUDAFunctor prelu_scalar; + prelu_scalar(dev_ctx.stream(), x_ptr, alpha_ptr, o_ptr, numel); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(prelu, + GPU, + ALL_LAYOUT, + phi::PReluKernel, + float, + phi::dtype::float16, + double) {} diff --git a/paddle/phi/kernels/log_softmax_grad_kernel.h b/paddle/phi/kernels/log_softmax_grad_kernel.h new file mode 100644 index 00000000000..6336bc14105 --- /dev/null +++ b/paddle/phi/kernels/log_softmax_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LogSoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/log_softmax_kernel.h b/paddle/phi/kernels/log_softmax_kernel.h new file mode 100644 index 00000000000..2caaa86d46c --- /dev/null +++ b/paddle/phi/kernels/log_softmax_kernel.h @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void LogSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/kernels/prelu_grad_kernel.h b/paddle/phi/kernels/prelu_grad_kernel.h new file mode 100644 index 00000000000..15917e2e1f0 --- /dev/null +++ b/paddle/phi/kernels/prelu_grad_kernel.h @@ -0,0 +1,31 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
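A note on the forward dispatch just above: the mode/data_format pair only changes which alpha coefficient is paired with each element. The host-side sketch below (illustration only, not Paddle API) mirrors the index arithmetic of the three CUDA kernels in prelu_funcs.h: channel mode uses (i / plane_size) % C for NCHW and i % C for NHWC, element mode uses one coefficient per in-sample offset, and scalar mode always uses alpha[0].

// Host-side reference of the alpha-selection logic, sketch only.
#include <cstddef>
#include <string>
#include <vector>

std::vector<float> PreluReference(const std::vector<float>& x,
                                  const std::vector<float>& alpha,
                                  std::size_t batch, std::size_t channels,
                                  std::size_t plane_size,  // H * W for NCHW
                                  const std::string& mode,
                                  bool channel_last) {
  const std::size_t numel = batch * channels * plane_size;
  const std::size_t spatial_size = numel / batch;  // per-sample size, as in the element-wise kernel
  std::vector<float> out(numel);
  for (std::size_t i = 0; i < numel; ++i) {
    float scale;
    if (mode == "channel") {
      std::size_t c = channel_last ? i % channels                  // NHWC: channel is the fastest dim
                                   : (i / plane_size) % channels;  // NCHW
      scale = alpha[c];
    } else if (mode == "element") {
      scale = alpha[i % spatial_size];  // one coefficient per in-sample offset
    } else {
      scale = alpha[0];  // scalar mode
    }
    out[i] = x[i] > 0.f ? x[i] : scale * x[i];
  }
  return out;
}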
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PReluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& data_format, + DenseTensor* x_grad, + DenseTensor* alpha_grad); +} // namespace phi diff --git a/paddle/phi/kernels/prelu_kernel.h b/paddle/phi/kernels/prelu_kernel.h new file mode 100644 index 00000000000..251332a8158 --- /dev/null +++ b/paddle/phi/kernels/prelu_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void PReluKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& alpha, + const std::string& mode, + const std::string& data_format, + DenseTensor* out); +} // namespace phi diff --git a/paddle/phi/ops/compat/gelu_sig.cc b/paddle/phi/ops/compat/gelu_sig.cc new file mode 100644 index 00000000000..bf4b47bcf5f --- /dev/null +++ b/paddle/phi/ops/compat/gelu_sig.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
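The two headers above follow the usual phi layout: a backend-agnostic template declaration that the backend-specific .cc/.cu files instantiate and register. A minimal sketch of calling such a kernel directly from C++ (the style the dev-API tests in this series use) is shown below; the CPU context header path and the existence of a registered CPU instantiation of PReluKernel are assumptions, since only the GPU kernels appear in this patch.

// Sketch only: the Context template argument selects the backend; the same
// declaration serves the CPU and GPU registrations.
#include "paddle/phi/backends/cpu/cpu_context.h"  // assumed header path
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/prelu_kernel.h"

void RunPReluOnCpu(const phi::CPUContext& dev_ctx,
                   const phi::DenseTensor& x,
                   const phi::DenseTensor& alpha,
                   phi::DenseTensor* out) {
  phi::PReluKernel<float, phi::CPUContext>(
      dev_ctx, x, alpha, /*mode=*/"channel", /*data_format=*/"NCHW", out);
}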
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GeluOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("gelu", {"X"}, {"approximate"}, {"Out"}); +} + +KernelSignature GeluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("gelu_grad", + {"X", GradVarName("Out")}, + {"approximate"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(gelu_grad, phi::GeluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(gelu, phi::GeluOpArgumentMapping); diff --git a/paddle/phi/ops/compat/log_softmax_sig.cc b/paddle/phi/ops/compat/log_softmax_sig.cc new file mode 100644 index 00000000000..b1ecc6d5676 --- /dev/null +++ b/paddle/phi/ops/compat/log_softmax_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature LogSoftmaxGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("log_softmax_grad", + {"Out", GradVarName("Out")}, + {"axis"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(log_softmax_grad, + phi::LogSoftmaxGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/prelu_sig.cc b/paddle/phi/ops/compat/prelu_sig.cc new file mode 100644 index 00000000000..bd296c5e953 --- /dev/null +++ b/paddle/phi/ops/compat/prelu_sig.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature PReluGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("prelu_grad", + {"X", "Alpha", GradVarName("Out")}, + {"mode", "data_format"}, + {GradVarName("X"), GradVarName("Alpha")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(prelu_grad, phi::PReluGradOpArgumentMapping); -- GitLab From e52ffb704e835aae2e52dbb6f266d532d467719e Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Fri, 18 Mar 2022 19:03:03 +0800 Subject: [PATCH 169/176] Add Sparse OP Maxpool (#40569) sparse maxpool; kernel_registry support sparse tensor --- paddle/phi/core/kernel_registry.h | 32 ++ paddle/phi/kernels/funcs/pooling.h | 2 +- paddle/phi/kernels/funcs/sparse/convolution.h | 20 + .../sparse/cpu/sparse_pool_grad_kernel.cc | 73 ++++ .../kernels/sparse/cpu/sparse_pool_kernel.cc | 108 +++++ .../sparse/gpu/sparse_pool_grad_kernel.cu | 120 ++++++ .../kernels/sparse/gpu/sparse_pool_kernel.cu | 140 +++++++ .../kernels/sparse/sparse_pool_grad_kernel.h | 49 +++ .../phi/kernels/sparse/sparse_pool_kernel.h | 53 +++ paddle/phi/tests/kernels/CMakeLists.txt | 1 + .../tests/kernels/test_sparse_pool_dev_api.cc | 391 ++++++++++++++++++ 11 files changed, 988 insertions(+), 1 deletion(-) create mode 100644 paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc create mode 100644 paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc create mode 100644 paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu create mode 100644 paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu create mode 100644 paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h create mode 100644 paddle/phi/kernels/sparse/sparse_pool_kernel.h create mode 100644 paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index d9ed68593cd..c3356eadcbd 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -98,6 +98,28 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); + } else if (arg_type == std::type_index(typeid(const SparseCooTensor&))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); + } else if (arg_type == std::type_index(typeid( + paddle::optional))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); + } else if (arg_type == std::type_index(typeid(const SparseCsrTensor&))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); + } else if (arg_type == std::type_index(typeid( + paddle::optional))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); } else if (arg_type == std::type_index(typeid(DenseTensor*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, @@ -114,6 +136,16 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); + } else if (arg_type == std::type_index(typeid(SparseCooTensor*))) { + args_def->AppendOutput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); + } else if (arg_type == std::type_index(typeid(SparseCsrTensor*))) { + args_def->AppendOutput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); } else { // Attribute deal with // TODO(chenweihang): now here allow any types of attribute, maybe diff --git 
a/paddle/phi/kernels/funcs/pooling.h b/paddle/phi/kernels/funcs/pooling.h index 19c6d52c4c9..fa285dc69d1 100644 --- a/paddle/phi/kernels/funcs/pooling.h +++ b/paddle/phi/kernels/funcs/pooling.h @@ -43,7 +43,7 @@ template class MaxPool { public: DEVICE inline T initial() { return static_cast(-FLT_MAX); } - DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; } + HOSTDEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; } DEVICE inline void finalize(const T& pool_field, T* y) {} }; diff --git a/paddle/phi/kernels/funcs/sparse/convolution.h b/paddle/phi/kernels/funcs/sparse/convolution.h index d82d793e534..19f1f3d3cd2 100644 --- a/paddle/phi/kernels/funcs/sparse/convolution.h +++ b/paddle/phi/kernels/funcs/sparse/convolution.h @@ -165,6 +165,26 @@ inline void SubmPreProcess(const Context& dev_ctx, x_grad_ptr); } +inline const std::vector PoolResetKernel( + const std::vector& kernel_sizes, + const int in_channels, + const int out_channels) { + std::vector res(kernel_sizes); + res.resize(5); + res[3] = in_channels; + res[4] = out_channels; + return res; +} + +inline void PrefixSum(const int* counter, int* offsets, const int n) { + int offset = 0; + for (int i = 0; i < n; i++) { + offsets[i] = offset; + offset += counter[i]; + } + offsets[n] = offset; +} + } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc new file mode 100644 index 00000000000..3010d480b55 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" + +namespace phi { +namespace sparse { + +template +void MaxPoolGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const SparseCooTensor& out, + const DenseTensor& out_grad, + const std::vector& kernel_sizes, + DenseTensor* x_grad) { + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const int channels = x.dims()[4]; + int rulebook_len = rulebook.dims()[1]; + const int* rulebook_ptr = rulebook.data(); + std::vector offsets(kernel_size + 1), counter(kernel_size, 0); + for (int i = 0; i < rulebook_len; i++) { + counter[rulebook_ptr[i]] += 1; + } + phi::funcs::sparse::PrefixSum(&counter[0], &offsets[0], kernel_size); + + const T* in_features_ptr = x.non_zero_elements().data(); + const T* out_features_ptr = out.non_zero_elements().data(); + const T* out_grad_ptr = out_grad.data(); + T* x_grad_ptr = x_grad->data(); + memset(x_grad_ptr, 0, sizeof(T) * x_grad->numel()); + + phi::funcs::MaxPoolGrad grad_functor; + for (int i = 0; i < kernel_size; i++) { + for (int j = 0; j < counter[i]; j++) { + int in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; + int out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; + for (int c = 0; c < channels; c++) { + grad_functor.compute(in_features_ptr[in_i * channels + c], + out_features_ptr[out_i * channels + c], + out_grad_ptr[out_i * channels + c], + 1, + &x_grad_ptr[in_i * channels + c]); + } + } + } +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool_grad, + CPU, + ALL_LAYOUT, + phi::sparse::MaxPoolGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc new file mode 100644 index 00000000000..86971242df5 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" +#include "paddle/phi/kernels/sparse/cpu/convolution.h" + +namespace phi { +namespace sparse { + +/** + * x: (N, D, H, W, C) + * kernel: (D, H, W, C, OC) + * out: (N, D, H, W, OC) +**/ +template +void MaxPoolKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook) { + const auto& x_dims = x.dims(); + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const std::vector& real_kernel_sizes = + phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]); + DDim out_dims = {1, 1, 1, 1, 1}; + phi::funcs::sparse::GetOutShape( + x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); + const int in_channels = real_kernel_sizes[3]; + + DenseTensorMeta counter_meta( + DataType::INT32, {kernel_size}, DataLayout::NCHW); + DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + + const T* in_features_ptr = x.non_zero_elements().data(); + // 1. product rule book + ProductRuleBook(dev_ctx, + x, + real_kernel_sizes, + paddings, + dilations, + strides, + out_dims, + false, + rulebook, + &counter_per_kernel); + + UpdateRulebookAndOutIndex( + dev_ctx, x, kernel_size, in_channels, out_dims, rulebook, out); + + int rulebook_len = rulebook->dims()[1]; + const int* rulebook_ptr = rulebook->data(); + const int* counter_ptr = counter_per_kernel.data(); + + std::vector offsets(kernel_size + 1); + phi::funcs::sparse::PrefixSum(counter_ptr, &offsets[0], kernel_size); + std::vector out_flags(out->nnz(), false); + + // 2. max pool + T* out_features_ptr = out->mutable_non_zero_elements()->data(); + phi::funcs::MaxPool max_pool_functor; + for (int i = 0; i < kernel_size; i++) { + for (int j = 0; j < counter_ptr[i]; j++) { + int in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; + int out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; + if (!out_flags[out_i]) { + out_flags[out_i] = true; + memcpy(&out_features_ptr[out_i * in_channels], + &in_features_ptr[in_i * in_channels], + in_channels * sizeof(T)); + } else { + for (int c = 0; c < in_channels; c++) { + max_pool_functor.compute(in_features_ptr[in_i * in_channels + c], + &out_features_ptr[out_i * in_channels + c]); + } + } + } + } +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool, + CPU, + ALL_LAYOUT, + phi::sparse::MaxPoolKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu new file mode 100644 index 00000000000..1048dd1be0c --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu @@ -0,0 +1,120 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" + +#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" + +namespace phi { +namespace sparse { + +template +__global__ void MaxPoolGradCudaKernel(const T* in_features_ptr, + const T* out_features_ptr, + const T* out_grad_ptr, + const int* rulebook_ptr, + const int n, + const int rulebook_len, + const int channels, + T* x_grad_ptr) { + phi::funcs::MaxPoolGrad grad_functor; + CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) { + int real_i = i / channels; + int c = i - real_i * channels; + int in_i = rulebook_ptr[real_i]; + int out_i = rulebook_ptr[real_i + rulebook_len]; + grad_functor.compute(in_features_ptr[in_i * channels + c], + out_features_ptr[out_i * channels + c], + out_grad_ptr[out_i * channels + c], + 1, + &x_grad_ptr[in_i * channels + c]); + } +} + +template +void MaxPoolGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const SparseCooTensor& out, + const DenseTensor& out_grad, + const std::vector& kernel_sizes, + DenseTensor* x_grad) { + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const int in_channels = x.dims()[4]; + int rulebook_len = rulebook.dims()[1]; + const int* rulebook_ptr = rulebook.data(); + std::vector offsets(kernel_size + 1), counter(kernel_size, 0), + h_counter(kernel_size); + phi::backends::gpu::GpuMemcpyAsync(&h_counter[0], + rulebook_ptr, + rulebook_len * sizeof(int), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + + dev_ctx.stream()); + dev_ctx.Wait(); + for (int i = 0; i < rulebook_len; i++) { + counter[h_counter[i]] += 1; + } + phi::funcs::sparse::PrefixSum(&counter[0], &offsets[0], kernel_size); + + const T* in_features_ptr = x.non_zero_elements().data(); + const T* out_features_ptr = out.non_zero_elements().data(); + const T* out_grad_ptr = out_grad.data(); + T* x_grad_ptr = x_grad->data(); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, x_grad, static_cast(0.0f)); + + for (int i = 0; i < kernel_size; i++) { + if (counter[i] <= 0) { + continue; + } + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, counter[i] * in_channels, 1); + MaxPoolGradCudaKernel<<>>( + in_features_ptr, + out_features_ptr, + out_grad_ptr, + rulebook_ptr + offsets[i] + rulebook_len, + counter[i], + rulebook_len, + in_channels, + x_grad_ptr); + } +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool_grad, + GPU, + ALL_LAYOUT, + phi::sparse::MaxPoolGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu new file mode 100644 index 00000000000..0f6a0d13b1d --- /dev/null +++ 
b/paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu @@ -0,0 +1,140 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/kernels/funcs/pooling.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" +#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" +#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" + +namespace phi { +namespace sparse { + +template +__global__ void MaxPoolCudaKernel(const T* in_features_ptr, + const int* rulebook_ptr, + const int n, + const int rulebook_len, + const int channels, + T* out_features_ptr) { + phi::funcs::MaxPool max_pool_functor; + CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) { + int real_i = i / channels; + int channel_i = i - real_i * channels; + int in_i = rulebook_ptr[real_i]; + int out_i = rulebook_ptr[real_i + rulebook_len]; + max_pool_functor.compute(in_features_ptr[in_i * channels + channel_i], + &out_features_ptr[out_i * channels + channel_i]); + } +} + +/** + * x: (N, D, H, W, C) + * kernel: (D, H, W, C, OC) + * out: (N, D, H, W, OC) +**/ +template +void MaxPoolKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook) { + const auto& x_dims = x.dims(); + int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; + const std::vector& real_kernel_sizes = + phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]); + DDim out_dims = {1, 1, 1, 1, 1}; + phi::funcs::sparse::GetOutShape( + x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); + const int in_channels = real_kernel_sizes[3]; + + std::vector offsets(kernel_size + 1), counter(kernel_size); + DenseTensorMeta counter_meta( + DataType::INT32, {kernel_size}, DataLayout::NCHW); + DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); + DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW); + DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); + DenseTensor unique_key = phi::Empty(dev_ctx, std::move(index_meta)); + DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); + + // 1. product rulebook + int rulebook_len = ProductRuleBook(dev_ctx, + x, + real_kernel_sizes, + paddings, + dilations, + strides, + out_dims, + false, + rulebook, + &counter_per_kernel, + &offsets_per_kernel, + &out_index, + &unique_key, + &unique_value, + out, + &counter, + &offsets); + + const int* rulebook_ptr = rulebook->data(); + + T* out_features_ptr = out->mutable_non_zero_elements()->data(); + const T* in_features_ptr = x.non_zero_elements().data(); +// 2. 
max pool +#ifdef PADDLE_WITH_HIP + thrust::fill(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::fill(thrust::cuda::par.on(dev_ctx.stream()), +#endif + out_features_ptr, + out_features_ptr + out->non_zero_elements().numel(), + static_cast(-FLT_MAX)); + // TODO(zhangkaihuo) Replacing multiple calls with one kernel may be faster + for (int i = 0; i < kernel_size; i++) { + if (counter[i] <= 0) { + continue; + } + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D( + dev_ctx, counter[i] * in_channels, 1); + MaxPoolCudaKernel<<>>( + in_features_ptr, + rulebook_ptr + offsets[i] + rulebook_len, + counter[i], + rulebook_len, + in_channels, + out_features_ptr); + } +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_maxpool, + GPU, + ALL_LAYOUT, + phi::sparse::MaxPoolKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h b/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h new file mode 100644 index 00000000000..572ade76281 --- /dev/null +++ b/paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void MaxPoolGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const SparseCooTensor& out, + const DenseTensor& out_grad, + const std::vector& kernel_sizes, + DenseTensor* x_grad); + +template +DenseTensor MaxPoolGrad(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const SparseCooTensor& out, + const DenseTensor& out_grad, + const std::vector& kernel_sizes) { + DenseTensor x_grad = phi::Empty( + dev_ctx, + DenseTensorMeta(x.dtype(), x.non_zero_elements().dims(), x.layout())); + MaxPoolGradKernel( + dev_ctx, x, rulebook, out, out_grad, kernel_sizes, &x_grad); + return x_grad; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_pool_kernel.h b/paddle/phi/kernels/sparse/sparse_pool_kernel.h new file mode 100644 index 00000000000..bfadbf72e30 --- /dev/null +++ b/paddle/phi/kernels/sparse/sparse_pool_kernel.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void MaxPoolKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + SparseCooTensor* out, + DenseTensor* rulebook); + +template +SparseCooTensor MaxPool(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + DenseTensor* rulebook) { + DenseTensor indices = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); + DenseTensor values = + phi::Empty(dev_ctx, DenseTensorMeta(x.dtype(), {1}, x.layout())); + SparseCooTensor coo(indices, values, x.dims()); + MaxPoolKernel( + dev_ctx, x, kernel_sizes, paddings, dilations, strides, &coo, rulebook); + return coo; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/tests/kernels/CMakeLists.txt b/paddle/phi/tests/kernels/CMakeLists.txt index 317dcce92c8..3897c182e48 100644 --- a/paddle/phi/tests/kernels/CMakeLists.txt +++ b/paddle/phi/tests/kernels/CMakeLists.txt @@ -14,6 +14,7 @@ cc_test(test_concat_dev_api SRCS test_concat_dev_api.cc DEPS phi phi_api_utils) cc_test(test_split_dev_api SRCS test_split_dev_api.cc DEPS phi phi_api_utils) cc_test(test_sparse_utils_dev_api SRCS test_sparse_utils_dev_api.cc DEPS phi phi_api_utils) cc_test(test_sparse_conv3d_dev_api SRCS test_sparse_conv3d_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_sparse_pool_dev_api SRCS test_sparse_pool_dev_api.cc DEPS phi phi_api_utils) cc_test(test_math_function SRCS test_math_function.cc DEPS math_function) if(WITH_GPU) diff --git a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc new file mode 100644 index 00000000000..27673704168 --- /dev/null +++ b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc @@ -0,0 +1,391 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" +#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" + +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +namespace tests { + +template +std::vector cast(const std::vector& in) { + std::vector out(in.size()); + for (uint64_t i = 0; i < in.size(); i++) { + out[i] = static_cast(in[i]); + } + return out; +} +template +void TestMaxPoolBase(const std::vector& indices, + const std::vector& features, + const DDim& x_dims, + const std::vector& correct_out_indices, + const std::vector& correct_out_features, + const DDim& correct_out_dims, + const int non_zero_num, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const float diff = 1e-3, + const bool backward = false, + const std::vector features_grad = {}) { + phi::CPUContext dev_ctx_cpu; + dev_ctx_cpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx_cpu.Init(); + + const int in_channels = x_dims[4]; + const int out_channels = in_channels; + + DenseTensor indices_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW)); + memcpy( + indices_tensor.data(), indices.data(), indices.size() * sizeof(int)); + DenseTensor features_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + {non_zero_num, in_channels}, + DataLayout::NHWC)); + memcpy( + features_tensor.data(), features.data(), features.size() * sizeof(T)); + + SparseCooTensor x_tensor(indices_tensor, features_tensor, x_dims); + + auto f_verify = [&](const T* real_data, const std::vector& correct_data) { + for (uint64_t i = 0; i < correct_data.size(); i++) { + float tmp = std::fabs(static_cast(correct_data[i] - real_data[i])); + ASSERT_LT(tmp, diff); + } + }; + + if (!std::is_same::value) { + DenseTensor rulebook = phi::Empty( + dev_ctx_cpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); + SparseCooTensor out = sparse::MaxPool(dev_ctx_cpu, + x_tensor, + kernel_sizes, + paddings, + dilations, + strides, + &rulebook); + + ASSERT_EQ(correct_out_dims.size(), out.dims().size()); + for (int i = 0; i < correct_out_dims.size(); i++) { + ASSERT_EQ(correct_out_dims[i], out.dims()[i]); + } + ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, out.nnz()); + + int cmp_indices = memcmp(correct_out_indices.data(), + out.non_zero_indices().data(), + correct_out_indices.size() * sizeof(int)); + ASSERT_EQ(cmp_indices, 0); + + f_verify(out.non_zero_elements().data(), correct_out_features); + + if (backward) { + DenseTensor x_grad = sparse::MaxPoolGrad(dev_ctx_cpu, + x_tensor, + rulebook, + out, + out.non_zero_elements(), + kernel_sizes); + f_verify(x_grad.data(), features_grad); + } + } + +// test gpu +#if defined(PADDLE_WITH_CUDA) + phi::GPUContext dev_ctx_gpu; + dev_ctx_gpu.PartialInitWithoutAllocator(); + dev_ctx_gpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(dev_ctx_gpu.GetPlace(), dev_ctx_gpu.stream()) + .get()); + dev_ctx_gpu.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + 
.GetAllocator(phi::CPUPlace()) + .get()); + dev_ctx_gpu.PartialInitWithAllocator(); + + DenseTensor d_indices_tensor = phi::Empty( + dev_ctx_gpu, + DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW)); + phi::Copy( + dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor); + + DenseTensor d_features_tensor = phi::Empty( + dev_ctx_gpu, + DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + {non_zero_num, in_channels}, + DataLayout::NHWC)); + phi::Copy( + dev_ctx_gpu, features_tensor, phi::GPUPlace(), true, &d_features_tensor); + + SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims); + + DenseTensor d_rulebook = phi::Empty( + dev_ctx_gpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); + SparseCooTensor d_out = sparse::MaxPool(dev_ctx_gpu, + d_x_tensor, + kernel_sizes, + paddings, + dilations, + strides, + &d_rulebook); + + ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); + ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); + for (int i = 0; i < correct_out_dims.size(); i++) { + ASSERT_EQ(correct_out_dims[i], d_out.dims()[i]); + } + + DenseTensor h_indices_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(DataType::INT32, {4, d_out.nnz()}, DataLayout::NCHW)); + phi::Copy(dev_ctx_gpu, + d_out.non_zero_indices(), + phi::CPUPlace(), + true, + &h_indices_tensor); + + int cmp_indices2 = memcmp(correct_out_indices.data(), + h_indices_tensor.data(), + correct_out_indices.size() * sizeof(int)); + ASSERT_EQ(cmp_indices2, 0); + + DenseTensor h_features_tensor = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(paddle::experimental::CppTypeToDataType::Type(), + {d_out.nnz()}, + d_out.layout())); + + phi::Copy(dev_ctx_gpu, + d_out.non_zero_elements(), + phi::CPUPlace(), + true, + &h_features_tensor); + f_verify(h_features_tensor.data(), correct_out_features); + + if (backward) { + DenseTensor x_grad = sparse::MaxPoolGrad(dev_ctx_gpu, + d_x_tensor, + d_rulebook, + d_out, + d_out.non_zero_elements(), + kernel_sizes); + DenseTensor h_features_grad = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta(x_grad.dtype(), x_grad.dims(), x_grad.layout())); + phi::Copy(dev_ctx_gpu, x_grad, phi::CPUPlace(), true, &h_features_grad); + f_verify(h_features_grad.data(), features_grad); + } +#endif +} + +void TestMaxPool(const std::vector& indices, + const std::vector& features, + const DDim& x_dims, + const std::vector& correct_out_indices, + const std::vector& correct_out_features, + const DDim& correct_out_dims, + const int non_zero_num, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const float diff = 1e-3, + const bool backward = false, + const std::vector features_grad = {}) { + // test float + TestMaxPoolBase(indices, + features, + x_dims, + correct_out_indices, + correct_out_features, + correct_out_dims, + non_zero_num, + kernel_sizes, + paddings, + strides, + dilations, + diff, + backward, + features_grad); + // test double + TestMaxPoolBase(indices, + cast(features), + x_dims, + correct_out_indices, + cast(correct_out_features), + correct_out_dims, + non_zero_num, + kernel_sizes, + paddings, + strides, + dilations, + diff, + backward, + cast(features_grad)); +} + +TEST(DEV_API, sparse_maxpool) { + const int channels = 1; + DDim x_dims = {1, 1, 4, 4, channels}; + DDim out_dims = {1, 1, 2, 2, channels}; + std::vector kernel_sizes = {1, 3, 3}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector 
dilations = {1, 1, 1}; + + const int non_zero_num = 3; + std::vector indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2}; + std::vector features = {1, 2, 3}; + std::vector out_indices = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, + }; + std::vector out_features = {2, 2, 3, 3}; + std::vector x_grad = {0, 4, 6}; + + TestMaxPool(indices, + features, + x_dims, + out_indices, + out_features, + out_dims, + non_zero_num, + kernel_sizes, + paddings, + strides, + dilations, + 1e-6, + true, + x_grad); +} + +TEST(DEV_API, sparse_maxpool_stride) { + const int channels = 1; + DDim x_dims = {1, 1, 4, 4, channels}; + DDim out_dims = {1, 1, 1, 1, channels}; + std::vector kernel_sizes = {1, 3, 3}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {2, 2, 2}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 3; + std::vector indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2}; + std::vector features = {1, 2, 3}; + std::vector out_indices = {0, 0, 0, 0}; + std::vector out_features = {2}; + std::vector x_grad = {0, 2, 0}; + + TestMaxPool(indices, + features, + x_dims, + out_indices, + out_features, + out_dims, + non_zero_num, + kernel_sizes, + paddings, + strides, + dilations, + 1e-6, + true, + x_grad); +} + +TEST(DEV_API, sparse_maxpool_channel) { + const int channels = 2; + DDim x_dims = {1, 1, 4, 4, channels}; + DDim out_dims = {1, 1, 2, 2, channels}; + std::vector kernel_sizes = {1, 3, 3}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 3; + std::vector indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2}; + std::vector features = {1, 1, 2, 2, 3, 3}; + std::vector out_indices = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, + }; + std::vector out_features = {2, 2, 2, 2, 3, 3, 3, 3}; + std::vector x_grad = {0, 0, 4, 4, 6, 6}; + + TestMaxPool(indices, + features, + x_dims, + out_indices, + out_features, + out_dims, + non_zero_num, + kernel_sizes, + paddings, + strides, + dilations, + 1e-6, + true, + x_grad); +} + +TEST(DEV_API, sparse_maxpool3d) { + const int channels = 2; + DDim x_dims = {1, 5, 4, 4, channels}; + DDim out_dims = {1, 3, 2, 2, channels}; + std::vector kernel_sizes = {3, 3, 3}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 3; + std::vector indices = {0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 1, 2}; + std::vector features = {1, 1, 2, 2, 3, 3}; + std::vector out_indices = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, + }; + std::vector out_features = {2, 2, 2, 2, 3, 3, 3, 3}; + std::vector x_grad = {0, 0, 4, 4, 6, 6}; + + TestMaxPool(indices, + features, + x_dims, + out_indices, + out_features, + out_dims, + non_zero_num, + kernel_sizes, + paddings, + strides, + dilations, + 1e-6, + true, + x_grad); +} + +} // namespace tests +} // namespace phi -- GitLab From 34dfb0ec4401806c84a1f336b9ebb484e2dbe68a Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Fri, 18 Mar 2022 19:54:00 +0800 Subject: [PATCH 170/176] fix_sharding_grad_clip (#40601) --- .../fleet/meta_parallel/sharding/sharding_utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index 89b59254e5b..6a30276e02b 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ 
b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -89,7 +89,7 @@ class ShardingClipGrad: global_norm_fp16 = paddle.cast( global_norm_fp16, dtype=paddle.float32) - # global norm of non-distributed FP16 params_and_grads for slice parameter + # global norm of non-distributed FP16 params_and_grads for unslice parameter if len(unslice_params_fp16) == 0: global_unslice_fp16 = paddle.to_tensor([0.], dtype=paddle.float32) else: @@ -104,21 +104,20 @@ class ShardingClipGrad: [0.], dtype=paddle.float32) global_norm_fp32 = layers.reduce_sum(global_norm_fp32) - # global norm of non-distributed FP32 params_and_grads for slice parameter + # global norm of non-distributed FP32 params_and_grads for unslice parameter global_unslice_fp32 = layers.concat(unslice_params_fp32) if len( unslice_params_fp32) != 0 else paddle.to_tensor( [0.], dtype=paddle.float32) global_unslice_fp32 = layers.reduce_sum(global_unslice_fp32) global_unslice_var = global_unslice_fp16 + global_unslice_fp32 - global_norm_var = global_norm_fp16 + global_norm_fp32 + global_norm_var = global_norm_fp16 + global_norm_fp32 + 1.0 / self._group.nranks * global_unslice_var # add all reduce to get global norm of distributed params_and_grads dev_id = int(self._device.split(":")[1]) with device_guard(dev_id, "gpu"): paddle.distributed.all_reduce(global_norm_var, group=self._group) - global_norm_var += global_unslice_var global_norm_var = layers.sqrt(global_norm_var) max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) -- GitLab From 50fad3edc056d363dfdae165994bbf1eb29d169f Mon Sep 17 00:00:00 2001 From: huzhiqiang <912790387@qq.com> Date: Fri, 18 Mar 2022 07:15:47 -0500 Subject: [PATCH 171/176] update infrt script (#40670) --- paddle/infrt/dialect/infrt/common/types.cc | 10 +++ paddle/infrt/host_context/paddle_mlir.cc | 88 ++++++++++++++----- paddle/infrt/host_context/paddle_mlir.h | 3 + paddle/scripts/infrt_build.sh | 1 + ...rate_pd_op_dialect_from_paddle_op_maker.py | 35 ++++++-- tools/infrt/generate_phi_kernel_dialect.py | 3 +- 6 files changed, 110 insertions(+), 30 deletions(-) diff --git a/paddle/infrt/dialect/infrt/common/types.cc b/paddle/infrt/dialect/infrt/common/types.cc index 62419a19628..c10679b0134 100644 --- a/paddle/infrt/dialect/infrt/common/types.cc +++ b/paddle/infrt/dialect/infrt/common/types.cc @@ -30,6 +30,8 @@ llvm::Optional GetLayoutType(llvm::StringRef key) { return LayoutType::NCHW; else if (key.equals_insensitive("NHWC")) return LayoutType::NHWC; + else if (key.equals_insensitive("ANY")) + return LayoutType::ANY; else return llvm::None; } @@ -39,6 +41,8 @@ llvm::Optional GetPrecisionType(llvm::StringRef key) { return PrecisionType::FLOAT32; else if (key.equals_insensitive("FP16")) return PrecisionType::FLOAT16; + else if (key.equals_insensitive("UNK")) + return PrecisionType::UNK; else return llvm::None; } @@ -67,6 +71,9 @@ llvm::StringRef GetString(LayoutType type) { case (LayoutType::NHWC): str = "NHWC"; break; + case (LayoutType::ANY): + str = "ANY"; + break; default: str = "Unsupported"; } @@ -82,6 +89,9 @@ llvm::StringRef GetString(PrecisionType type) { case (PrecisionType::FLOAT16): str = "FP16"; break; + case (PrecisionType::UNK): + str = "UNK"; + break; default: str = "Unsupported"; } diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc index 4e7de9e2df1..29328520212 100644 --- a/paddle/infrt/host_context/paddle_mlir.cc +++ b/paddle/infrt/host_context/paddle_mlir.cc @@ -91,11 +91,15 @@ 
llvm::SmallVector MLIRModelGenImpl::GetModelInputsType( if (var_desc.name() == input_var_name) { std::vector dims = RepeatedToVector( var_desc.type().lod_tensor().tensor().dims()); - mlir::Type precision_; - ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), - builder_, - &precision_); - mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); + infrt::PrecisionType precision_; + ConvertDataTypeToPhi( + var_desc.type().lod_tensor().tensor().data_type(), &precision_); + mlir::Type type_ = + infrt::DenseTensorType::get(context_, + infrt::TargetType::CPU, + precision_, + infrt::LayoutType::ANY); + operandTypes.push_back(type_); } } @@ -117,11 +121,14 @@ llvm::SmallVector MLIRModelGenImpl::GetModelOutputsType( if (var_desc.name() == input_var_name) { std::vector dims = RepeatedToVector( var_desc.type().lod_tensor().tensor().dims()); - mlir::Type precision_; - ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), - builder_, - &precision_); - mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); + infrt::PrecisionType precision_; + ConvertDataTypeToPhi( + var_desc.type().lod_tensor().tensor().data_type(), &precision_); + mlir::Type type_ = + infrt::DenseTensorType::get(context_, + infrt::TargetType::CPU, + precision_, + infrt::LayoutType::ANY); resultTypes.push_back(type_); } } @@ -168,15 +175,11 @@ void MLIRModelGenImpl::UpdateModelParams( auto name = builder_.getStringAttr(var_desc.name()); std::vector dims = RepeatedToVector( var_desc.type().lod_tensor().tensor().dims()); - mlir::Type precision_; - ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), - builder_, - &precision_); - mlir::Type type_ = - infrt::DenseTensorType::get(context_, - infrt::TargetType::CPU, - infrt::PrecisionType::FLOAT32, - infrt::LayoutType::NCHW); + infrt::PrecisionType precision_; + ConvertDataTypeToPhi(var_desc.type().lod_tensor().tensor().data_type(), + &precision_); + mlir::Type type_ = infrt::DenseTensorType::get( + context_, infrt::TargetType::CPU, precision_, infrt::LayoutType::ANY); auto op = builder_.create( mlir::UnknownLoc::get(context_), type_, map, name); params_map_.insert(std::pair( @@ -262,11 +265,13 @@ llvm::SmallVector MLIRModelGenImpl::GetOpOutputType( if (var_desc.name() == var_name) { std::vector dims = RepeatedToVector( var_desc.type().lod_tensor().tensor().dims()); - mlir::Type precision_; - ConvertDataType(var_desc.type().lod_tensor().tensor().data_type(), - builder_, - &precision_); - mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); + infrt::PrecisionType precision_; + ConvertDataTypeToPhi(var_desc.type().lod_tensor().tensor().data_type(), + &precision_); + mlir::Type type_ = infrt::DenseTensorType::get(context_, + infrt::TargetType::CPU, + precision_, + infrt::LayoutType::ANY); resultTypes.push_back(type_); } } @@ -403,3 +408,38 @@ bool ConvertDataType(infrt::paddle::framework_proto::VarType::Type dtype, return false; } } + +bool ConvertDataTypeToPhi(infrt::paddle::framework_proto::VarType::Type dtype, + infrt::PrecisionType *type) { + switch (dtype) { + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_FP16: + *type = infrt::PrecisionType::FLOAT16; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_FP32: + *type = infrt::PrecisionType::FLOAT32; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_FP64: + *type = infrt::PrecisionType::FLOAT64; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_BOOL: + *type 
= infrt::PrecisionType::BOOL; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT8: + *type = infrt::PrecisionType::INT8; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT16: + *type = infrt::PrecisionType::INT16; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT32: + *type = infrt::PrecisionType::INT32; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_INT64: + *type = infrt::PrecisionType::INT64; + return true; + case infrt::paddle::framework_proto::VarType::Type::VarType_Type_UINT8: + *type = infrt::PrecisionType::UINT8; + return true; + default: + return false; + } +} diff --git a/paddle/infrt/host_context/paddle_mlir.h b/paddle/infrt/host_context/paddle_mlir.h index d5f1209b992..a351b5cf80e 100644 --- a/paddle/infrt/host_context/paddle_mlir.h +++ b/paddle/infrt/host_context/paddle_mlir.h @@ -102,4 +102,7 @@ inline std::vector RepeatedToVector( bool ConvertDataType(infrt::paddle::framework_proto::VarType::Type dtype, mlir::Builder builder, mlir::Type *type); +bool ConvertDataTypeToPhi(infrt::paddle::framework_proto::VarType::Type dtype, + infrt::PrecisionType *type); + #endif // PADDLE_INFRT_HOST_CONTEXT_PADDLE_MLIR_H_ diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index 850d4015abf..1b259023f94 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -45,6 +45,7 @@ function update_pd_ops() { python3 generate_pd_op_dialect_from_paddle_op_maker.py python3 generate_phi_kernel_dialect.py # generate test model + cd ${PADDLE_ROOT} python3 paddle/infrt/tests/model/abs_model.py ${PADDLE_ROOT}/build/paddle/infrt/tests/abs } diff --git a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py index 8855e1eee38..b0e420da64a 100644 --- a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py +++ b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py @@ -191,6 +191,21 @@ def generate_all_ops_inputs_outputs_map(op_descs): ops_inputs_outputs_head_file.write(cpp_style_ops_outputs_map_str) +def get_constraint(op_type, op_proto): + # 2.3.1 inputs + constraint = "NoSideEffect" + + optional_input_num_ = 0 + for input_ in op_proto[INPUTS]: + if op_proto[INPUTS][input_][EXTRA] != True and op_proto[INPUTS][input_][ + INTERMEDIATE] != True and op_proto[INPUTS][input_][ + DISPENSABLE] == True: + optional_input_num_ += 1 + if optional_input_num_ > 1: + constraint += ", AttrSizedOperandSegments" + return constraint + + # funtion to generate paddle op dialect file def convert_op_proto_into_mlir(op_descs): dst_dialect_file = "../../paddle/infrt/dialect/pd/ir/pd_ops.td" @@ -237,9 +252,11 @@ def convert_op_proto_into_mlir(op_descs): if (op_type in skipped_op_list) or (op_type not in original_ops_): continue automatically_generated_op_dialect.append(op_type) + constraint_ = get_constraint(op_type, op_proto) # 2.1 OpDef - HEAD = 'def PD_{op_type_capitalize}Op : PD_Op<"{op_type}", [NoSideEffect]> {left_brace}\n'.format( + HEAD = 'def PD_{op_type_capitalize}Op : PD_Op<"{op_type}", [{constraint}]> {left_brace}\n'.format( op_type_capitalize=op_type.capitalize(), + constraint=constraint_, op_type=op_type, left_brace="{") SUMMARY = ' let summary = "{} op";\n'.format(op_type) @@ -256,14 +273,22 @@ def convert_op_proto_into_mlir(op_descs): ARGUMENTS = "" if (len(op_proto[INPUTS]) > 0 or len(op_proto[ATTRS]) > 0): ARGUMENTS = " let arguments = (ins " + # 2.3.1 
inputs for input_ in op_proto[INPUTS]: if op_proto[INPUTS][input_][EXTRA] != True and op_proto[INPUTS][ input_][INTERMEDIATE] != True: - if op_proto[INPUTS][input_][DUPLICABLE] != "true": - ARGUMENTS = ARGUMENTS + " PD_Tensor:$" + input_ + "," + if op_proto[INPUTS][input_][DISPENSABLE] != True: + if op_proto[INPUTS][input_][DUPLICABLE] != True: + ARGUMENTS = ARGUMENTS + " PD_Tensor:$" + input_ + "," + else: + ARGUMENTS = ARGUMENTS + " PD_Tensor_Array:$" + input_ + "," else: - ARGUMENTS = ARGUMENTS + " PD_Tensor_Array:$" + input_ + "," + if op_proto[INPUTS][input_][DUPLICABLE] != True: + ARGUMENTS = ARGUMENTS + " Optional:$" + input_ + "," + else: + ARGUMENTS = ARGUMENTS + " Optional:$" + input_ + "," + # unsupported: BLOCK = 8; BLOCKS = 10; attr_mlir_converter = { 0: 'SI32Attr', @@ -332,7 +357,7 @@ def convert_op_proto_into_mlir(op_descs): for output_ in op_proto[OUTPUTS]: if op_proto[OUTPUTS][output_][EXTRA] != True and op_proto[ OUTPUTS][output_][INTERMEDIATE] != True: - if op_proto[OUTPUTS][output_][DUPLICABLE] != "true": + if op_proto[OUTPUTS][output_][DUPLICABLE] != True: outputs = outputs + "PD_Tensor:${},".format(output_) else: outputs = outputs + "PD_Tensor_Array:${},".format( diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py index 36561d4e71d..f632c9a9dba 100644 --- a/tools/infrt/generate_phi_kernel_dialect.py +++ b/tools/infrt/generate_phi_kernel_dialect.py @@ -43,7 +43,8 @@ precision_type_converter = { "float64": "FLOAT64", "complex64": "COMPLEX64", "complex128": "COMPLEX128", - "bool": "BOOL" + "bool": "BOOL", + "Undefined": "UNK" } kernel_types_info_file = "./kernels.json" -- GitLab From 8e612903d342f4f717ff195bac3ebc77a2672a10 Mon Sep 17 00:00:00 2001 From: pangyoki Date: Sat, 19 Mar 2022 00:21:38 +0800 Subject: [PATCH 172/176] support inplace in dygraph eager_fluid state (#40400) * [Eager] Support eager grad interface, draft version * Support eager grad interface with allow_unused and multi startup_op * Fix code format * Fix allow_unused case, return PyNone if tensor not initialize * Support output's stop_gradient related to create_graph * Support grad exception case in eager mode, fix coverage CI * Update ToPyObject, return PyNone if not initialize * AccumulationNode add FLAGS_retain_grad_for_all_tensor * Fix ci issue * Fix CI issue * fix, use core.eager.Tensor * Add func SetBufferSlotRankZeros for GradTensorHolder * Support retain_graph by using ClearTensorWrappers * Support retain_graph by using ClearTensorWrappers * Update retain_graph and no_grad_vars related test case * Update code gen logic for ClearTensorWrappers * Fix by override statement * fix override func args * Support retain_graph, update unit tests * Updated ClearTensorWrappers logic * fix grad python interface * Use deep copy and update unit tests * Polish code * Polish code * Fix CI issue, Deep copy only use when user set grad_tensors * Fix CI, use Backward instead RunBackward * Fix CI, Declare kernel explicitly in test file * Polish, remove vector of TensorWrapper * Refactor the logic of grad/backward, polish codes * Update code after merge upstream develop * Polish after merge upstream develop * Update to adapt new GradNodeBase superclass * Fix error introduced during conflict resolution * support inplace strategy in eager_fluid state * solve conflict * nothing * Update purify potential_startup_nodes logic * Fix errors * Polish code * Remove useless args for ToPyObject * Remove useless TensorWrappersSet * fix record conflict * Fix code-format, re-install 
pre-commit * fix tensor_wrapper bug * Fix pre-process logic for potential_startup_ops * Update unit tests, use eager mode * Fix conflicts * fix unittest timeout * little change Co-authored-by: Weilong Wu --- paddle/fluid/eager/api/utils/tensor_utils.cc | 3 +- .../auto_code_generator/eager_generator.cc | 396 ++++++++++++----- paddle/fluid/eager/tensor_wrapper.h | 48 +++ paddle/fluid/eager/utils.cc | 21 + paddle/fluid/eager/utils.h | 17 + paddle/fluid/pybind/eager_method.cc | 11 + .../pybind/eager_op_function_generator.cc | 73 +++- paddle/fluid/pybind/eager_utils.cc | 16 + paddle/fluid/pybind/eager_utils.h | 44 ++ paddle/fluid/pybind/op_function_common.cc | 25 ++ paddle/fluid/pybind/op_function_common.h | 5 + paddle/phi/api/include/tensor.h | 16 +- paddle/phi/api/lib/tensor.cc | 31 ++ .../fluid/tests/unittests/CMakeLists.txt | 1 + .../unittests/test_inplace_eager_fluid.py | 397 ++++++++++++++++++ 15 files changed, 991 insertions(+), 113 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index 77c39d1b0a3..b485beca57a 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -30,7 +30,8 @@ namespace egr_utils_api { bool IsLeafTensor(const paddle::experimental::Tensor& target) { std::shared_ptr grad_node = EagerUtils::grad_node(target); - if (std::dynamic_pointer_cast(grad_node)) { + if (!grad_node || + std::dynamic_pointer_cast(grad_node)) { return true; } diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index d15c413339a..b8d59e8dd8b 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -979,7 +979,9 @@ static bool CollectGradInformationFromOpInfo( /* --------------------------------------------------- */ static std::string GenerateGradNodeCreationContent( const ForwardGenerationInfo& fwd_info, - const GradNodeGenerationInfo& bwd_info) { + const GradNodeGenerationInfo& bwd_info, + const std::string& trace_op_body_str, + std::map inplace_map = {}) { VLOG(6) << "Generating GradNode Creation codes"; const std::string& op_type = fwd_info.GetOpType(); @@ -998,7 +1000,8 @@ static std::string GenerateGradNodeCreationContent( // If single output slotname and not duplicable, // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")" - std::string get_autograd_meta_str = " // Prepare Autograd Meta \n"; + std::string get_input_autograd_meta_str = " // Prepare Autograd Meta \n"; + std::string get_output_autograd_meta_str = ""; // If single output slotname and not duplicable, // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" @@ -1006,22 +1009,39 @@ static std::string GenerateGradNodeCreationContent( const std::string& output_name = output.name(); const std::string& output_autograd_name = "p_autograd_" + output_name; + // output autograd_meta should be got after running TraceOP. 
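    // A minimal illustrative sketch (names hypothetical) of what the templates
    // below expand to for a single non-duplicable output "Out":
    //   egr::AutogradMeta* p_autograd_Out = egr::EagerUtils::autograd_meta(&Out);
    // and, for an inplace op whose inplace_map holds {"Out": "X"}, the input's
    // autograd_meta is reused instead:
    //   p_autograd_X = egr::EagerUtils::autograd_meta(&X);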
if (output.duplicable()) { const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = - " std::vector %s = " + " std::vector %s = " "egr::EagerUtils::autograd_meta(&%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( + get_output_autograd_meta_str += paddle::string::Sprintf( GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); } else { - const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = - " egr::AutogradMeta* %s = " - "egr::EagerUtils::autograd_meta(&%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); + // In inplace op, the case where output is duplicable is not considered. + // Replace output directly with input in inplace op. + if (!inplace_map.empty() && inplace_map.count(output_name)) { + auto inplace_input_name = inplace_map[output_name]; + const std::string& inplace_input_autograd_name = + "p_autograd_" + inplace_input_name; + const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = + " %s = egr::EagerUtils::autograd_meta(&%s);\n"; + get_output_autograd_meta_str += paddle::string::Sprintf( + GET_SINGLE_AUTOGRAD_META_TEMPLATE, inplace_input_autograd_name, + inplace_input_name); + } else { + const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = + " egr::AutogradMeta* %s = " + "egr::EagerUtils::autograd_meta(&%s);\n"; + get_output_autograd_meta_str += + paddle::string::Sprintf(GET_SINGLE_AUTOGRAD_META_TEMPLATE, + output_autograd_name, output_name); + } } } VLOG(6) << "Generated outputs autograd_meta"; + // input autograd_meta should be got before running TraceOP (for checking + // inplace). for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); const std::string& input_autograd_name = "p_autograd_" + input_name; @@ -1030,28 +1050,46 @@ static std::string GenerateGradNodeCreationContent( const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = " std::vector %s = " "egr::EagerUtils::nullable_autograd_meta(%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( + get_input_autograd_meta_str += paddle::string::Sprintf( GET_MULTI_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); } else if (input.dispensable()) { const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta* %s = " "egr::EagerUtils::nullable_autograd_meta(%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( + get_input_autograd_meta_str += paddle::string::Sprintf( GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); } else { const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta* %s = " "egr::EagerUtils::nullable_autograd_meta(%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( + get_input_autograd_meta_str += paddle::string::Sprintf( GET_SINGLE_AUTOGRAD_META_TEMPLATE, input_autograd_name, input_name); } } VLOG(6) << "Generated inputs autograd_meta"; + // check inplace input to avoid inplace operations on leaf nodes with + // stop_gradient=False. + std::string check_inplace_str = ""; + if (!inplace_map.empty()) { + const char* CHECKING_INPLACE_TEMPLATE = + " // Check Inplace\n" + " egr::EagerUtils::CheckInplace(%s, p_autograd_%s, " + "require_any_grad);\n"; + for (auto& inplace_pair : inplace_map) { + std::string inplace_name = inplace_pair.second; + check_inplace_str += paddle::string::Sprintf(CHECKING_INPLACE_TEMPLATE, + inplace_name, inplace_name); + } + VLOG(6) << "Check Inplace Input"; + } + std::string prepare_autograd_meta_str = ""; - prepare_autograd_meta_str += get_autograd_meta_str; + // only generate input autograd_meta in temporary. 
+ // output autograd_meta will be generated after running TraceOP. + prepare_autograd_meta_str += get_input_autograd_meta_str; prepare_autograd_meta_str += "\n"; // [GradOpNode] GetTraceBackward @@ -1066,7 +1104,7 @@ static std::string GenerateGradNodeCreationContent( size_t bwd_in_slot_num = out_vars.size(); size_t bwd_out_slot_num = in_vars.size(); const char* GRAD_OP_NODE_TEMPLATE = - " auto grad_node = std::make_shared(%d, %d);\n"; + " auto grad_node = std::make_shared(%d, %d);\n"; grad_node_creation_str += " // Create GradOpNode\n"; grad_node_creation_str += paddle::string::Sprintf( GRAD_OP_NODE_TEMPLATE, op_type, bwd_in_slot_num, bwd_out_slot_num); @@ -1075,14 +1113,14 @@ static std::string GenerateGradNodeCreationContent( VLOG(6) << "Generated GradOpNode construction"; // [GradOpNode] Set Attrs - grad_node_creation_str += " // Set Attributes\n"; - grad_node_creation_str += " grad_node->SetAttrMap(std::move(attrs));\n"; + grad_node_creation_str += " // Set Attributes\n"; + grad_node_creation_str += " grad_node->SetAttrMap(std::move(attrs));\n"; grad_node_creation_str += - " grad_node->SetDefaultAttrMap(std::move(default_attrs));\n"; + " grad_node->SetDefaultAttrMap(std::move(default_attrs));\n"; grad_node_creation_str += "\n"; // [GradOpNode] Set TensorWrappers - grad_node_creation_str += " // Set Tensor Wrappers\n"; + grad_node_creation_str += " // Set Tensor Wrappers\n"; for (const auto& iter : op_base_infos) { const std::map& grad_ins_fwd_slotname_map = iter.GetGradInsFwdSlotnameMap(); @@ -1094,10 +1132,18 @@ static std::string GenerateGradNodeCreationContent( full_reserved = "true"; } const char* SET_TENSOR_WRAPPER_TEMPLATE = - " grad_node->SetTensorWrapper%s(%s, %s);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, tensor_wrapper_name, - full_reserved); + " grad_node->SetTensorWrapper%s(%s, %s);\n"; + // Replace output directly with input in inplace op. 
+ if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) { + auto inplace_input_name = inplace_map[tensor_wrapper_name]; + grad_node_creation_str += paddle::string::Sprintf( + SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, + inplace_input_name, full_reserved); + } else { + grad_node_creation_str += paddle::string::Sprintf( + SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, + tensor_wrapper_name, full_reserved); + } } } grad_node_creation_str += "\n"; @@ -1115,12 +1161,12 @@ static std::string GenerateGradNodeCreationContent( size_t input_position = fwd_inputs_name_pos_map.at(input_name); const char* SET_GRAD_OUT_META_TEMPLATE = - " grad_node->SetGradOutMeta(%s, %d);\n"; + " grad_node->SetGradOutMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( SET_GRAD_OUT_META_TEMPLATE, input_name, input_position); const char* ADD_EDGES_TEMPLATE = - " if(%s) grad_node->AddEdges(%s, %d);\n"; + " if(%s) grad_node->AddEdges(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf(ADD_EDGES_TEMPLATE, input_autograd_name, input_autograd_name, input_position); @@ -1129,11 +1175,11 @@ static std::string GenerateGradNodeCreationContent( size_t input_position = fwd_inputs_name_pos_map.at(input_name); const char* SET_GRAD_OUT_META_TEMPLATE = - " grad_node->SetGradOutMeta(%s, %d);\n"; + " grad_node->SetGradOutMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( SET_GRAD_OUT_META_TEMPLATE, input_name, input_position); - const char* ADD_EDGES_TEMPLATE = " grad_node->AddEdges(&%s, %d);\n"; + const char* ADD_EDGES_TEMPLATE = " grad_node->AddEdges(&%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( ADD_EDGES_TEMPLATE, input_autograd_name, input_position); } @@ -1145,73 +1191,125 @@ static std::string GenerateGradNodeCreationContent( std::string pass_stop_gradient_args = "false"; for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); - const std::string& output_autograd_name = "p_autograd_" + output_name; - size_t output_position = fwd_outputs_name_pos_map.at(output_name); - - // Intermediate Tensor does not require SetHistory, nor RetainGrad - - if (output.duplicable()) { - pass_stop_gradient_args += ", &" + output_autograd_name; + // Replace output directly with input in inplace op. 
+ if (!inplace_map.empty() && inplace_map.count(output_name)) { + auto inplace_input_name = inplace_map[output_name]; + const std::string& inplace_input_autograd_name = + "p_autograd_" + inplace_input_name; + size_t output_position = fwd_outputs_name_pos_map.at(output_name); + + // Intermediate Tensor does not require SetHistory, nor RetainGrad + pass_stop_gradient_args += ", " + inplace_input_autograd_name; const char* SET_OUT_RANK_TEMPLATE = - " egr::EagerUtils::SetOutRankWithSlot(&%s, %d);\n"; + " egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + SET_OUT_RANK_TEMPLATE, inplace_input_autograd_name, output_position); // Intermediate Tensor does not require SetHistory if (!output.intermediate()) { const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + " egr::EagerUtils::SetHistory(%s, grad_node);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_HISTORY_TEMPLATE, inplace_input_autograd_name); } const char* SET_GRAD_IN_META_TEMPLATE = - " grad_node->SetGradInMeta(%s, %d);\n"; + " grad_node->SetGradInMeta(%s, %d);\n"; grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, output_name, output_position); + SET_GRAD_IN_META_TEMPLATE, inplace_input_name, output_position); + // Intermediate Tensor does not require CheckAndRetainGrad + if (!output.intermediate()) { + VLOG(6) << "Generated Call RetainGradForTensor"; + const char* RETAIN_GRAD_TEMPLATE = + " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; + grad_node_creation_str += + paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, inplace_input_name); + } } else { - pass_stop_gradient_args += ", " + output_autograd_name; - const char* SET_OUT_RANK_TEMPLATE = - " egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + const std::string& output_autograd_name = "p_autograd_" + output_name; + size_t output_position = fwd_outputs_name_pos_map.at(output_name); - // Intermediate Tensor does not require SetHistory + // Intermediate Tensor does not require SetHistory, nor RetainGrad + + if (output.duplicable()) { + pass_stop_gradient_args += ", &" + output_autograd_name; + const char* SET_OUT_RANK_TEMPLATE = + " egr::EagerUtils::SetOutRankWithSlot(&%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " egr::EagerUtils::SetHistory(&%s, grad_node);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_HISTORY_TEMPLATE, output_autograd_name); + } + const char* SET_GRAD_IN_META_TEMPLATE = + " grad_node->SetGradInMeta(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_GRAD_IN_META_TEMPLATE, output_name, output_position); + + } else { + pass_stop_gradient_args += ", " + output_autograd_name; + const char* SET_OUT_RANK_TEMPLATE = + " egr::EagerUtils::SetOutRankWithSlot(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_OUT_RANK_TEMPLATE, output_autograd_name, output_position); + + // Intermediate Tensor does not require SetHistory + if (!output.intermediate()) { + const char* SET_HISTORY_TEMPLATE = + " 
egr::EagerUtils::SetHistory(%s, grad_node);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_HISTORY_TEMPLATE, output_autograd_name); + } + const char* SET_GRAD_IN_META_TEMPLATE = + " grad_node->SetGradInMeta(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_GRAD_IN_META_TEMPLATE, output_name, output_position); + } + + // Intermediate Tensor does not require CheckAndRetainGrad if (!output.intermediate()) { - const char* SET_HISTORY_TEMPLATE = - " egr::EagerUtils::SetHistory(%s, grad_node);\n"; + VLOG(6) << "Generated Call RetainGradForTensor"; + const char* RETAIN_GRAD_TEMPLATE = + " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; grad_node_creation_str += - paddle::string::Sprintf(SET_HISTORY_TEMPLATE, output_autograd_name); + paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); } - const char* SET_GRAD_IN_META_TEMPLATE = - " grad_node->SetGradInMeta(%s, %d);\n"; - grad_node_creation_str += paddle::string::Sprintf( - SET_GRAD_IN_META_TEMPLATE, output_name, output_position); - } - - // Intermediate Tensor does not require CheckAndRetainGrad - if (!output.intermediate()) { - VLOG(6) << "Generated Call RetainGradForTensor"; - const char* RETAIN_GRAD_TEMPLATE = - " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; - grad_node_creation_str += - paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); } } VLOG(6) << "Generated SetGradIn/OutMeta"; // [Generation] GradNode Creation + // After getting require_any_grad, firstly use CheckInplace method for inplace + // op. + // Then execute TraceOp and generate output autograd_meta. + // Finally, Construct GradNode. (Replace output directly with input in inplace + // op.) + // Add event record + std::string event_name = op_type + " node_creation"; const char* GRAD_NODE_CREATION_TEMPLATE = - " %s" + "%s" " bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n" - " if(require_any_grad) {\n" - " VLOG(6) << \" Construct Grad for %s \"; \n" - " egr::EagerUtils::PassStopGradient(%s);\n" - "%s\n }"; + "%s\n" + "%s" + " {\n" + " paddle::platform::RecordEvent node_creation_record_event(\"%s\", " + "paddle::platform::TracerEventType::Operator, 1);\n" + "%s" + " if(require_any_grad) {\n" + " VLOG(6) << \" Construct Grad for %s \"; \n" + " egr::EagerUtils::PassStopGradient(%s);\n" + " %s\n" + " }\n" + " }"; std::string grad_node_creation_body_str = paddle::string::Sprintf( GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str, - compute_require_grad_args, op_type, pass_stop_gradient_args, - grad_node_creation_str); + compute_require_grad_args, check_inplace_str, trace_op_body_str, + event_name, get_output_autograd_meta_str, op_type, + pass_stop_gradient_args, grad_node_creation_str); return grad_node_creation_body_str; } @@ -1221,7 +1319,8 @@ static std::string GenerateGradNodeCreationContent( /* -------------------------------- */ static std::pair GenerateForwardFunctionContents( const ForwardGenerationInfo& fwd_info, - const GradNodeGenerationInfo& bwd_info) { + const GradNodeGenerationInfo& bwd_info, + std::map inplace_map = {}) { /* --- Process Forward Info ---*/ const std::string& op_type = fwd_info.GetOpType(); const std::unordered_map& fwd_inputs_name_pos_map = @@ -1301,8 +1400,21 @@ static std::pair GenerateForwardFunctionContents( core_ops_args_type_info[op_type][input_position] = "list"; } else { - const char* FWD_INS_ARG_TEMPLATE = - "const paddle::experimental::Tensor& %s"; + // inplace tensor can't be const + const char* FWD_INS_ARG_TEMPLATE; + bool flag_find_input_name = false; + if 
(!inplace_map.empty()) { + for (auto& inplace_pair : inplace_map) { + if (inplace_pair.second == input_name) { + flag_find_input_name = true; + FWD_INS_ARG_TEMPLATE = "paddle::experimental::Tensor& %s"; + break; + } + } + } + if (!flag_find_input_name) { + FWD_INS_ARG_TEMPLATE = "const paddle::experimental::Tensor& %s"; + } input_args_str_list[input_position] = paddle::string::Sprintf(FWD_INS_ARG_TEMPLATE, input_name); @@ -1362,6 +1474,7 @@ static std::pair GenerateForwardFunctionContents( // [Generation] Get Outs Map std::string outs_contents_str = ""; + std::string inplace_mapping_str = ""; for (const proto::OpProto::Var& output : out_vars) { const std::string& output_name = output.name(); std::string outnum = "1"; @@ -1404,6 +1517,22 @@ static std::pair GenerateForwardFunctionContents( } core_ops_args_info[op_type].push_back(output_var_name); + } else if (!inplace_map.empty() && inplace_map.count(output_name)) { + // In inplace op, replace the output with the input directly. + PADDLE_ENFORCE_NE( + inplace_map[output_name], "", + paddle::platform::errors::InvalidArgument( + "Inplace op %s has no input corresponding to output %s.", op_type, + output_name)); + const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", ins[\"%s\"] },"; + auto inplace_input_name = inplace_map[output_name]; + outs_contents_str += paddle::string::Sprintf( + FWD_OUTS_CONTENT_TEMPLATE, output_name, inplace_input_name); + + // inplace_map used in TraceOp. + const char* INPLACE_MAPPING_TEMPLATE = R"({"%s", "%s"},)"; + inplace_mapping_str += paddle::string::Sprintf( + INPLACE_MAPPING_TEMPLATE, inplace_input_name, output_name); } else { if (output.duplicable()) { outnum = output_name + "Num"; @@ -1430,6 +1559,8 @@ static std::pair GenerateForwardFunctionContents( } if (outs_contents_str.size() > 0) outs_contents_str.pop_back(); // Remove trailing "," + if (inplace_mapping_str.size() > 0) + inplace_mapping_str.pop_back(); // Remove trailing "," const char* FWD_OUTS_MAP_TEMPLATE = " std::map GenerateForwardFunctionContents( dygraph_function_args_str += ", const paddle::framework::AttributeMap& attr_map"; + /* --------- Generate TraceOp ----- */ + // TraceOp should be run after compute require_any_grad. (for checking + // inplace) + // `trace_op_body_str` will be passed as a parameter to + // `GenerateGradNodeCreationContent`. 
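  // Rough ordering of the emitted forward function for an inplace op
  // (an illustrative sketch, not literal generated code):
  //   1. collect input autograd_meta and compute require_any_grad;
  //   2. egr::EagerUtils::CheckInplace(...) rejects inplacing a leaf tensor
  //      that does not stop gradient;
  //   3. TraceOp runs with the inplace_map, so the output slot reuses the
  //      input variable and the input's inplace version is bumped;
  //   4. output autograd_meta is fetched and the GradNode is constructed.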
+ std::string trace_op_body_str = ""; // [Generation] Get TraceOp const char* FWD_TRACE_OP_TEMPLATE = " paddle::framework::AttributeMap attrs = attr_map;\n" @@ -1470,11 +1607,12 @@ static std::pair GenerateForwardFunctionContents( " egr::Controller::Instance().GetCurrentTracer()->TraceOp(\"%s\", ins, " "outs, attrs, \n" " egr::Controller::Instance().GetExpectedPlace(),\n" - " &default_attrs, true, {});\n"; - std::string trace_op_str = - paddle::string::Sprintf(FWD_TRACE_OP_TEMPLATE, op_type); - generated_function_body += trace_op_str; - generated_function_body += "\n"; + " &default_attrs, true, {%s});\n"; + std::string trace_op_str = paddle::string::Sprintf( + FWD_TRACE_OP_TEMPLATE, op_type, inplace_mapping_str); + + trace_op_body_str += trace_op_str; + trace_op_body_str += "\n"; VLOG(6) << "Generated AttrMap & TraceOp"; @@ -1539,48 +1677,64 @@ static std::pair GenerateForwardFunctionContents( output_varname, output_var_args_name); } } else { - const char* FWD_OUT_TENSOR_TEMPLATE = - " paddle::experimental::Tensor %s;\n" - " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n"; - out_tensor_str = - paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname, - output_name, output_varname); + if (!inplace_map.empty() && inplace_map.count(output_name)) { + // Modify meta info of inplace tensor. + // Bump inplace version of inplace tensor. + auto inplace_input_name = inplace_map[output_name]; + const char* FWD_OUT_TENSOR_TEMPLATE = + " egr::EagerUtils::ModifyInplaceInput(outs[\"%s\"][0], &%s);\n" + " %s.bump_inplace_version();\n" + " VLOG(3) << \"Tensor(\" << %s.name() << \") uses Inplace " + "Strategy.\";\n"; + out_tensor_str = paddle::string::Sprintf( + FWD_OUT_TENSOR_TEMPLATE, output_name, inplace_input_name, + inplace_input_name, inplace_input_name); + } else { + const char* FWD_OUT_TENSOR_TEMPLATE = + " paddle::experimental::Tensor %s;\n" + " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n"; + out_tensor_str = + paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname, + output_name, output_varname); + } } return_types[return_position] = "paddle::experimental::Tensor"; } - return_contents[return_position] = output_varname; - generated_function_body += out_tensor_str; + if (!inplace_map.empty() && inplace_map.count(output_name)) { + // Replace output directly with input in inplace op. 
+ return_contents[return_position] = inplace_map[output_name]; + } else { + return_contents[return_position] = output_varname; + } + trace_op_body_str += out_tensor_str; } - generated_function_body += "\n"; + trace_op_body_str += "\n"; VLOG(6) << "Converted Output VarBase to EagerVariable(s)"; + /* ------ END Generate TraceOp ----- */ // [Generation] Handle core_ops_returns_info - core_ops_returns_info[op_type] = return_contents; + // avoid inplace op changing core_ops_returns_info + if (core_ops_returns_info.empty() || !core_ops_returns_info.count(op_type)) { + core_ops_returns_info[op_type] = return_contents; + } // [Generation] ComputeRequireGrad -> GradNodeCreation if (!bwd_info.GenerateForwardOnly()) { - std::string grad_node_creation_body_str = - GenerateGradNodeCreationContent(fwd_info, bwd_info); - - // Add event record - std::string event_name = op_type + " node_creation"; - const char* NODE_CREATION_TEMPLATE = - "{\n" - " paddle::platform::RecordEvent node_creation_record_event(\"%s\", " - "paddle::platform::TracerEventType::Operator, 1);\n" - " %s\n" - "}"; - - grad_node_creation_body_str = paddle::string::Sprintf( - NODE_CREATION_TEMPLATE, event_name, grad_node_creation_body_str); + // If GradNode needs to be generated, pass `trace_op_body_str` + // into `GenerateGradNodeCreationContent`. + std::string grad_node_creation_body_str = GenerateGradNodeCreationContent( + fwd_info, bwd_info, trace_op_body_str, inplace_map); generated_function_body += grad_node_creation_body_str; generated_function_body += "\n"; // [Generation] Call RetainGradForTensor VLOG(6) << "Generated GradNode Creation codes"; + } else { + // If GradNode doesn't need to be generated, generate TraceOP directly. + generated_function_body += trace_op_body_str; } // [Generation] Handle return: Tuple/Vector/Tensor @@ -1627,7 +1781,13 @@ static std::pair GenerateForwardFunctionContents( VLOG(6) << "Generated return codes"; // [Generation] Get Full Function - std::string function_name = op_type + "_dygraph_function"; + std::string function_name; + if (inplace_map.empty()) { + function_name = op_type + "_dygraph_function"; + } else { + // change function_name for inplace op. 
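    // e.g. for op_type == "reshape" this yields "reshape_dygraph_function"
    // for the out-of-place path and "reshape__dygraph_function" for the
    // inplace path, which the pybind layer exposes as the Python-level
    // `reshape_` API (example op chosen for illustration).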
+ function_name = op_type + "__dygraph_function"; + } if (dygraph_function_args_str.size() > 0) { auto iter = dygraph_function_args_str.begin(); @@ -1635,15 +1795,15 @@ static std::pair GenerateForwardFunctionContents( } const char* DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE = - "paddle::platform::RecordEvent dygraph_entrance_record_event(\"%s\", " + " paddle::platform::RecordEvent dygraph_entrance_record_event(\"%s\", " "paddle::platform::TracerEventType::Operator, 1);"; std::string event_name = op_type + " dygraph"; std::string fwd_record_event_str = paddle::string::Sprintf( DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE, event_name); const char* FWD_FUNCTION_TEMPLATE = "%s %s(%s) {\n\n" - " %s\n" - " %s\n" + "%s\n" + "%s\n" "}\n\n"; std::string fwd_function_str = paddle::string::Sprintf( FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name, @@ -2426,7 +2586,7 @@ static void DygraphCodeGeneration(const std::string& output_dir) { /* --------------------------- */ VLOG(6) << "-------- GenerateForwardFunctionContents -------"; std::pair body_and_declaration = - GenerateForwardFunctionContents(fwd_info, bwd_info); + GenerateForwardFunctionContents(fwd_info, bwd_info, {}); fwd_function_str += body_and_declaration.first + "\n"; @@ -2434,6 +2594,30 @@ static void DygraphCodeGeneration(const std::string& output_dir) { std::string fwd_function_declare_str = body_and_declaration.second; dygraph_forward_api_str += fwd_function_declare_str; + auto& infer_inplace = + paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_; + std::map inplace_map; + // Inplace Function Generator. + // `sum` op has duplicate input. Don't consider adding inplace strategy + // for `sum` in temporary. + if (op_type != "sum" && infer_inplace) { + auto in_to_outs = infer_inplace(true); + for (auto& inplace_pair : in_to_outs) { + inplace_map[inplace_pair.second] = inplace_pair.first; + } + + VLOG(6) << "-------- GenerateInplaceForwardFunctionContents -------"; + std::pair inplace_body_and_declaration = + GenerateForwardFunctionContents(fwd_info, bwd_info, inplace_map); + + fwd_function_str += inplace_body_and_declaration.first + "\n"; + + VLOG(6) << "-------- GenerateInplaceDygraphForwardAPIContents -------"; + std::string inplace_fwd_function_declare_str = + inplace_body_and_declaration.second; + dygraph_forward_api_str += inplace_fwd_function_declare_str; + } + if (bwd_info.GenerateForwardOnly()) continue; VLOG(6) << "-------- GenerateGradNodeHeaderContents -------"; diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 0e11444b815..8da27f3bb8a 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -36,6 +36,15 @@ class TensorWrapper { explicit TensorWrapper(const paddle::experimental::Tensor& tensor, bool full_reserved = false, bool no_need_buffer = false) { + // set inplace_version_snapshot_ according to tensor's current inplace + // version. + if (tensor.impl() && phi::DenseTensor::classof(tensor.impl().get())) { + phi::DenseTensor* dense_tensor = + static_cast(tensor.impl().get()); + auto& inplace_version_counter = dense_tensor->InplaceVersionCounter(); + inplace_version_snapshot_ = inplace_version_counter.CurrentVersion(); + } + /** * Normally, we should fully reserved all non-output or non-leaf fwd tensor * here. 
And for fwd output tensor, we should not reserve its autogradmeta, @@ -49,6 +58,7 @@ class TensorWrapper { } // shallow copy tensor_impl here + no_need_buffer_ = no_need_buffer; if (no_need_buffer) { if (phi::DenseTensor::classof(tensor.impl().get())) { // Only Copy Meta @@ -86,6 +96,7 @@ class TensorWrapper { // if it's full_reserved just return the full copy of tensor if (full_reserved_) { + check_inplace_version(); return intermidiate_tensor_; } else { std::shared_ptr new_grad_node = grad_node; @@ -94,15 +105,52 @@ class TensorWrapper { intermidiate_tensor_.set_autograd_meta( std::static_pointer_cast( p_ab_autograd_meta)); + check_inplace_version(); return intermidiate_tensor_; } } + void check_inplace_version() { + if (no_need_buffer_) { + VLOG(6) << "There's no need to check inplace_version because " + "no_need_buffer_ is true."; + return; + } + if (intermidiate_tensor_.impl() && + phi::DenseTensor::classof(intermidiate_tensor_.impl().get())) { + phi::DenseTensor* dense_tensor = + static_cast(intermidiate_tensor_.impl().get()); + auto& inplace_version_counter = dense_tensor->InplaceVersionCounter(); + + uint32_t current_inplace_version = + inplace_version_counter.CurrentVersion(); + PADDLE_ENFORCE_EQ( + current_inplace_version, inplace_version_snapshot_, + paddle::platform::errors::PermissionDenied( + "Tensor '%s' used in gradient computation has been " + "modified by an inplace operation. " + "Its version is %d but the expected version is %d. " + "Please fix your code to void calling an inplace operator " + "after using the Tensor which will used in gradient " + "computation.", + intermidiate_tensor_.name(), current_inplace_version, + inplace_version_snapshot_)); + VLOG(6) << " The inplace_version_snapshot_ of Tensor '" + << intermidiate_tensor_.name() << "' is [ " + << inplace_version_snapshot_ << " ]"; + VLOG(6) << " The current_inplace_version of Tensor '" + << intermidiate_tensor_.name() << "' is [ " + << current_inplace_version << " ]"; + } + } + void clear() { intermidiate_tensor_.reset(); } private: bool full_reserved_ = false; + bool no_need_buffer_ = false; std::pair out_rank_info_; paddle::experimental::Tensor intermidiate_tensor_; + uint32_t inplace_version_snapshot_ = 0; }; } // namespace egr diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 8a57d269453..048087903a4 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -212,6 +212,27 @@ std::vector> EagerUtils::CreateVars( return res; } +void EagerUtils::ModifyInplaceInput( + const std::shared_ptr& inplace_variable, + paddle::experimental::Tensor* inplace_tensor) { + // Only modify the meta information of the inplace tensor, because + // EagerVariable cannot modify Tensor's meta information after inplace + // op (such as ``reshape``) is executed. + PADDLE_ENFORCE_NOT_NULL(inplace_tensor, + paddle::platform::errors::Fatal( + "Inplace Tensor is null and cannot be modified. 
" + "We are tring to Modify Inplace Input from its " + "shared_ptr, this error may indicate the inplace " + " input is nullptr")); + if (phi::DenseTensor::classof(inplace_variable->GetTensorBase().get())) { + phi::DenseTensor* variable_dense_tensor = + static_cast(inplace_variable->GetTensorBase().get()); + phi::DenseTensor* tensor_dense_tensor = + static_cast(inplace_tensor->impl().get()); + tensor_dense_tensor->set_meta(variable_dense_tensor->meta()); + } +} + std::vector EagerUtils::GetOutputs( const std::vector>& outs) { std::vector res; diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index fa5735e6f32..fbd080ef70e 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/fluid/eager/api/utils/tensor_utils.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" @@ -144,6 +145,19 @@ class EagerUtils { iter.apply(std::forward(args)...); } + static void CheckInplace(const paddle::experimental::Tensor& target, + const AutogradMeta* autograd_meta, + bool require_any_grad) { + if (require_any_grad && autograd_meta) { + PADDLE_ENFORCE_EQ(!autograd_meta->StopGradient() && + egr::egr_utils_api::IsLeafTensor(target), + false, paddle::platform::errors::InvalidArgument( + "Leaf Var (%s) that doesn't stop gradient " + "can't use inplace strategy.", + target.name())); + } + } + // TensorWrapper Utils static paddle::experimental::Tensor RecoverTensorWrapper( TensorWrapper* tw, const std::shared_ptr& grad_node); @@ -171,6 +185,9 @@ class EagerUtils { static std::vector> CreateVars( const size_t num); // Construct Tensor From var + static void ModifyInplaceInput( + const std::shared_ptr& inplace_variable, + paddle::experimental::Tensor* inplace_tensor); static std::vector GetOutputs( const std::vector>& outs); static paddle::experimental::Tensor GetOutput( diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index d4bbfa0e66e..e0a3931c3e3 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -718,6 +718,15 @@ static PyObject* set_grad_type(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor__inplace_version(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + uint32_t inplace_version = self->tensor.current_inplace_version(); + + return ToPyObject(inplace_version); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_methods[] = { {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -766,6 +775,8 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"_set_grad_type", (PyCFunction)(void (*)(void))set_grad_type, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_inplace_version", (PyCFunction)(void (*)(void))tensor__inplace_version, + METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; } // namespace pybind diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index 102cdbb91ab..685e20aef25 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -162,17 +162,22 @@ static inline std::string TempName(const std::string& name) { std::string GenerateOpFunctionsBody( const paddle::framework::proto::OpProto* op_proto, std::string func_name, - bool use_inplace_strategy = false, std::map inplace_map = {}) { 
auto& op_type = op_proto->type(); std::string input_args = ""; - std::string call_api_str = "auto out = " + op_type + "_dygraph_function("; + std::string call_api_str = ""; std::string ins_initializer_with_null = ""; std::string py_arg = ""; int arg_idx = 0; int input_args_num = 0; std::string ins_cast_str = ""; std::string view_strategy_str = ""; + if (!inplace_map.empty()) { + // change call_api_str for inplace op + call_api_str = "auto out = " + op_type + "__dygraph_function("; + } else { + call_api_str = "auto out = " + op_type + "_dygraph_function("; + } for (auto& input : op_proto->inputs()) { auto& in_name = input.name(); // skip those dispensable inputs, like ResidualData in conv2d @@ -288,8 +293,31 @@ std::string GenerateOpFunctionsBody( HANDLE_VIEW_BETWEEN_INPUT_AND_OUTPUT, viwe_input_name, viwe_output_name, viwe_input_name, viwe_output_name); } - - return_str = "return ToPyObject(out);"; + if (!inplace_map.empty()) { + // For inplace op, Use the input PyObject directly. + for (auto& inplace_pair : inplace_map) { + // Find index of inplace tensor, and directly use input PyObject. + std::string inplace_arg_name = inplace_pair.second; + std::string inplace_return_name = inplace_pair.first; + const char* RETURN_INPLACE_TENSOR_TEMPLATE = + "ssize_t arg_id = GetIdxFromCoreOpsInfoMap(core_ops_args_info, " + "\"%s\", \"%s\");\n" + " ssize_t return_id = " + "GetIdxFromCoreOpsInfoMap(core_ops_returns_info, \"%s\", \"%s\");\n" + " return ToPyObject(out, return_id, args, arg_id);"; + return_str = paddle::string::Sprintf(RETURN_INPLACE_TENSOR_TEMPLATE, + op_type, inplace_arg_name, op_type, + inplace_return_name); + // only support one inplace_var in temporary. + PADDLE_ENFORCE_EQ( + inplace_map.size(), 1, + paddle::platform::errors::InvalidArgument( + "size of inplace_map must be 1, but got %d", inplace_map.size())); + break; + } + } else { + return_str = "return ToPyObject(out);"; + } std::string function_args = ""; if (input_args == "") { @@ -383,7 +411,8 @@ GenerateOpFunctions() { continue; } std::string func_name = "eager_api_" + op_type; - std::string op_function_str = GenerateOpFunctionsBody(op_proto, func_name); + std::string op_function_str = + GenerateOpFunctionsBody(op_proto, func_name, {}); // generate pybind item auto bind_function_str = paddle::string::Sprintf( @@ -391,6 +420,40 @@ GenerateOpFunctions() { op_function_list.emplace_back(std::move(op_function_str)); bind_function_list.emplace_back(std::move(bind_function_str)); + + // NOTE(pangyoki): Inplace Strategy. + // In this case, output will reuse input varbase. + // Dygraph mode needs to be aligned with the in-place strategy in static + // mode, and the mapping relationships between output and input that have + // been defined in static mode should be used in dygraph mode. + // Find which ops need to use Inplace strategy in static mode, and get the + // mapping relationship between Inplace output and input. + auto& infer_inplace = + paddle::framework::OpInfoMap::Instance().Get(op_type).infer_inplace_; + std::map inplace_map; + // `sum` op has duplicate input. Don't consider adding inplace strategy + // for `sum` in temporary. + if (op_type != "sum" && infer_inplace) { + // Inplace OP: op_type_. + // The inplace OP needs a new implementation method. 
+ auto in_to_outs = infer_inplace(true); + for (auto& inplace_pair : in_to_outs) { + inplace_map[inplace_pair.second] = inplace_pair.first; + } + + std::string inplace_op_type = op_type + "_"; + std::string inplace_func_name = "eager_api_" + inplace_op_type; + std::string inplace_op_function_str = + GenerateOpFunctionsBody(op_proto, inplace_func_name, inplace_map); + + // generate pybind item + auto inplace_bind_function_str = + paddle::string::Sprintf(PYBIND_ITEM_TEMPLATE, inplace_op_type, + inplace_func_name, inplace_op_type); + + op_function_list.emplace_back(std::move(inplace_op_function_str)); + bind_function_list.emplace_back(std::move(inplace_bind_function_str)); + } } if (append_custom_head_file) { op_function_list.emplace_back(CUSTOM_HANDWRITE_OP_FUNC_FILE); diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 97bb32630d7..a23bb1230e1 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -417,6 +417,8 @@ PyObject* ToPyObject(bool value) { PyObject* ToPyObject(int value) { return PyLong_FromLong(value); } +PyObject* ToPyObject(uint32_t value) { return PyLong_FromUnsignedLong(value); } + PyObject* ToPyObject(int64_t value) { return PyLong_FromLongLong(value); } PyObject* ToPyObject(float value) { return PyLong_FromDouble(value); } @@ -442,6 +444,20 @@ PyObject* ToPyObject(const paddle::experimental::Tensor& value) { return obj; } +PyObject* ToPyObject(const paddle::experimental::Tensor& value, + ssize_t value_idx, PyObject* args, ssize_t arg_idx) { + // For inplace op, directly return the input PyObject of the inplace tensor. + // [Parameter] + // value: Useless parameter. + // value_idx: Useless parameter. + // args: Input PyObject. + // arg_idx: Index of inplace PyObject in input args. Used to find the input + // inplace PyObject. 
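  // For example (illustrative): when Python runs `out = x.reshape_([-1])`,
  // the object returned here is the very PyObject that was passed in for `x`,
  // so `id(out) == id(x)` holds on the Python side after the inplace call.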
+ PyObject* obj = PyTuple_GET_ITEM(args, arg_idx); + Py_INCREF(obj); + return obj; +} + PyObject* ToPyObject(const std::vector& value) { PyObject* result = PyList_New((Py_ssize_t)value.size()); diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 1c4e2ab69a5..fba1485bcf4 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -56,6 +56,7 @@ framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, ssize_t arg_pos); PyObject* ToPyObject(int value); +PyObject* ToPyObject(uint32_t value); PyObject* ToPyObject(bool value); PyObject* ToPyObject(int64_t value); PyObject* ToPyObject(float value); @@ -63,6 +64,8 @@ PyObject* ToPyObject(double value); PyObject* ToPyObject(const char* value); PyObject* ToPyObject(const std::string& value); PyObject* ToPyObject(const paddle::experimental::Tensor& value); +PyObject* ToPyObject(const paddle::experimental::Tensor& value, + ssize_t value_idx, PyObject* args, ssize_t arg_idx); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); PyObject* ToPyObject(const std::vector& value); @@ -84,6 +87,17 @@ struct TupleTensorResult { TupleTensorResult::Run(out, result); PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out))); } + + static void Run(const Tuple& out, PyObject* result, ssize_t value_idx, + PyObject* args, ssize_t arg_idx) { + TupleTensorResult::Run(out, result, value_idx, args, arg_idx); + if (N - 1 == value_idx) { + PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out), + value_idx, args, arg_idx)); + } else { + PyTuple_SET_ITEM(result, N - 1, ToPyObject(std::get(out))); + } + } }; template @@ -91,6 +105,16 @@ struct TupleTensorResult { static void Run(const Tuple& out, PyObject* result) { PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out))); } + + static void Run(const Tuple& out, PyObject* result, ssize_t value_idx, + PyObject* args, ssize_t arg_idx) { + if (value_idx == 0) { + PyTuple_SET_ITEM(result, 0, + ToPyObject(std::get<0>(out), value_idx, args, arg_idx)); + } else { + PyTuple_SET_ITEM(result, 0, ToPyObject(std::get<0>(out))); + } + } }; template @@ -103,6 +127,26 @@ PyObject* ToPyObject(const std::tuple& out) { return result; } +template +PyObject* ToPyObject(const std::tuple& out, ssize_t value_idx, + PyObject* args, ssize_t arg_idx) { + // For inplace op, directly return the input PyObject of the inplace tensor. + // [Parameter] + // out: Outputs tuple after executing op. + // value_idx: Index of inplace tensor in outputs tuple. Used to find the + // output inplace tensor. + // args: Input PyObject. + // arg_idx: Index of inplace PyObject in input args. Used to find the input + // inplace PyObject. + auto len = sizeof...(Args); + PyObject* result = PyTuple_New(len); + + TupleTensorResult::Run(out, result, value_idx, + args, arg_idx); + + return result; +} + paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, const std::string& op_type, ssize_t arg_pos); diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 09c3cea398b..1d483abd774 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -854,5 +854,30 @@ void InitOpsAttrTypeMap() { } } +ssize_t GetIdxFromCoreOpsInfoMap( + const std::unordered_map>& + core_ops_info_map, + const std::string& op_type, const std::string& name) { + // `core_ops_info_map` can be `core_ops_args_info` or `core_ops_returns_info`. 
+ // `core_ops_args_info`: get index from core_ops_args_info[op_type] according + // to input name. + // `core_ops_returns_info`: get index from core_ops_returns_info[op_type] + // according to return name. + if (!core_ops_info_map.count(op_type)) { + PADDLE_THROW(platform::errors::Fatal( + "Op %s is not found in core_ops_*_info map.", op_type)); + } else { + auto args_list = core_ops_info_map.at(op_type); + auto it = std::find(args_list.begin(), args_list.end(), name); + if (it == args_list.end()) { + PADDLE_THROW(platform::errors::Fatal("%s is not found in op %s's args.", + name, op_type)); + } else { + return std::distance(args_list.begin(), it); + } + } + return -1; +} + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/op_function_common.h b/paddle/fluid/pybind/op_function_common.h index 7ead9852667..33d0e242a02 100644 --- a/paddle/fluid/pybind/op_function_common.h +++ b/paddle/fluid/pybind/op_function_common.h @@ -146,5 +146,10 @@ unsigned long GetUnsignedLongFromArgs( // NOLINT void InitOpsAttrTypeMap(); +ssize_t GetIdxFromCoreOpsInfoMap( + const std::unordered_map>& + core_ops_info_map, + const std::string& op_type, const std::string& name); + } // namespace pybind } // namespace paddle diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index ce40627bb0d..eae8d12fb37 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -481,7 +481,21 @@ class PADDLE_API Tensor final { */ void set_autograd_meta(std::shared_ptr autograd_meta); - /* Part 9: Auto generated Tensor methods */ + /* Part 9: Inplace methods */ + + /** + * @brief Increase inplace version + */ + void bump_inplace_version(); + + /** + * @brief Get current inplace version + * + * @return uint32_t + */ + uint32_t current_inplace_version(); + + /* Part 10: Auto generated Tensor methods */ private: /** diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 6be85d72000..6090e6a400a 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -347,5 +347,36 @@ void Tensor::set_autograd_meta( autograd_meta_ = std::move(autograd_meta); } +void Tensor::bump_inplace_version() { + if (is_dense_tensor()) { + auto &inplace_version_counter = + std::dynamic_pointer_cast(impl_) + ->InplaceVersionCounter(); + VLOG(3) << "yoki: before bump inplace version: " + << inplace_version_counter.CurrentVersion(); + inplace_version_counter.Bump(); + VLOG(3) << "yoki: after bump inplace version: " + << inplace_version_counter.CurrentVersion(); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "bump_inplace_version is only supported on DenseTensor now.")); + } +} + +uint32_t Tensor::current_inplace_version() { + if (is_dense_tensor()) { + auto &inplace_version_counter = + std::dynamic_pointer_cast(impl_) + ->InplaceVersionCounter(); + VLOG(3) << "yoki: print version: " + << inplace_version_counter.CurrentVersion(); + return inplace_version_counter.CurrentVersion(); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "current_inplace_version is only supported on DenseTensor now.")); + } + return 0; +} + } // namespace experimental } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index c82172780b7..44e6f8e8f2a 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -960,6 +960,7 @@ set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120) 
set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 120) set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_profiler PROPERTIES TIMEOUT 120) +set_tests_properties(test_inplace_eager_fluid PROPERTIES TIMEOUT 120) set_tests_properties(test_inplace_softmax_with_cross_entropy PROPERTIES TIMEOUT 120) set_tests_properties(test_cross_entropy2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_fetch_unmerged PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py b/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py new file mode 100644 index 00000000000..a434c562000 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py @@ -0,0 +1,397 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard + + +class TestDygraphInplace(unittest.TestCase): + def setUp(self): + self.init_data() + self.set_np_compare_func() + + def init_data(self): + self.input_var_numpy = np.random.uniform(-5, 5, [10, 20, 1]) + self.dtype = "float32" + + def set_np_compare_func(self): + self.np_compare = np.array_equal + + def non_inplace_api_processing(self, var): + return paddle.squeeze(var) + + def inplace_api_processing(self, var): + return paddle.squeeze_(var) + + def test_inplace_api(self): + with _test_eager_guard(): + var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype) + inplace_var = self.inplace_api_processing(var) + self.assertTrue(id(var) == id(inplace_var)) + + inplace_var.exp_() + self.assertTrue(np.array_equal(var.numpy(), inplace_var.numpy())) + + def test_forward_version(self): + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype) + self.assertEqual(var.inplace_version, 0) + + inplace_var = self.inplace_api_processing(var) + self.assertEqual(var.inplace_version, 1) + + inplace_var.exp_() + self.assertEqual(var.inplace_version, 2) + + inplace_var = self.inplace_api_processing(inplace_var) + self.assertEqual(var.inplace_version, 3) + + def test_leaf_inplace_var_error(self): + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var = paddle.to_tensor(self.input_var_numpy).astype(self.dtype) + var.stop_gradient = False + + def leaf_inplace_error(): + self.inplace_api_processing(var) + + self.assertRaises(ValueError, leaf_inplace_error) + + def test_backward_error(self): + # It raises an error because the inplace operator will result + # in incorrect gradient computation. 
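        # [Editor's note, not part of the patch] Sketch of the mechanism implied by the
        # assertion below: when var_c is built from var_b, autograd presumably records a
        # snapshot of var_b's inplace_version; each inplace API call bumps that version
        # (see bump_inplace_version above), so backward() finds snapshot != current
        # version and raises the RuntimeError checked here.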
+ with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + + # Here, the gradient computation will use the value of var_b + var_c = var_b**2 + self.inplace_api_processing(var_b) + + loss = paddle.nn.functional.relu(var_c) + with self.assertRaisesRegexp( + RuntimeError, + "received current_inplace_version:{} != inplace_version_snapshot_:{}". + format(1, 0)): + loss.backward() + + def test_backward_success_1(self): + # var_b is modified inplace before using it, the inplace operator doesn't result + # in incorrect gradient computation. + grad_var_a, grad_var_a_inplace = 0, 1 + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + var_c = self.inplace_api_processing( + var_b) # var_b is modified inplace before using it + + # Here, the gradient computation will use the value of var_b + var_d = var_c**2 + loss = var_d.sum() + loss.backward() + grad_var_a_inplace = var_a.grad.numpy() + + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + var_c = self.non_inplace_api_processing(var_b) + var_d = var_c**2 + loss = var_d.sum() + loss.backward() + grad_var_a = var_a.grad.numpy() + + self.assertTrue(self.np_compare(grad_var_a_inplace, grad_var_a)) + + def test_backward_success_2(self): + # Although var_b is modified inplace after using it, it does not used in gradient computation. + # The inplace operator doesn't result in incorrect gradient computation. + grad_var_a, grad_var_a_inplace = 0, 1 + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + + var_c = self.inplace_api_processing( + var_b) # var_b is modified inplace before using it + + var_d = var_c + var_c # Here, the grad op of sum doesn't use the value of var_b + loss = var_d.sum() + + loss.backward() + grad_var_a_inplace = var_a.grad.numpy() + + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.to_tensor(self.input_var_numpy).astype( + self.dtype) + var_a.stop_gradient = False + + var_b = var_a**2 + + var_c = self.non_inplace_api_processing( + var_b) # var_b is modified inplace before using it + + var_d = var_c + var_c # Here, the grad op of sum doesn't use the value of var_b + loss = var_d.sum() + + loss.backward() + grad_var_a = var_a.grad.numpy() + self.assertTrue(np.array_equal(grad_var_a_inplace, grad_var_a)) + + +class TestDygraphInplaceUnsqueeze(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.unsqueeze(var, -1) + + def inplace_api_processing(self, var): + return paddle.unsqueeze_(var, -1) + + +class TestDygraphInplaceReshape(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.reshape(var, [-1]) + + def inplace_api_processing(self, var): + return paddle.reshape_(var, [-1]) + + +class TestDygraphInplaceFlatten(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.flatten() + + def inplace_api_processing(self, var): + return var.flatten_() + + +class TestDygraphInplaceScatter(TestDygraphInplace): + def init_data(self): + self.input_var_numpy = np.array([[1, 1], [2, 2], [3, 3]]) + self.dtype = 
"float32" + + def non_inplace_api_processing(self, var): + index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') + updates = paddle.to_tensor( + [[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') + + return paddle.scatter(var, index, updates, overwrite=False) + + def inplace_api_processing(self, var): + index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') + updates = paddle.to_tensor( + [[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') + + return paddle.scatter_(var, index, updates, overwrite=False) + + +class TestDygraphInplaceElu(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.nn.functional.elu(var) + + def inplace_api_processing(self, var): + return paddle.nn.functional.elu_(var) + + +class TestDygraphInplaceRelu(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.nn.functional.relu(var) + + def inplace_api_processing(self, var): + return paddle.nn.functional.relu_(var) + + +class TestDygraphInplaceSoftmax(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.nn.functional.softmax(var) + + def inplace_api_processing(self, var): + return paddle.nn.functional.softmax_(var) + + +class TestDygraphInplaceTanh(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return paddle.tanh(var) + + def inplace_api_processing(self, var): + return paddle.tanh_(var) + + +class TestDygraphInplaceCeil(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.ceil() + + def inplace_api_processing(self, var): + return var.ceil_() + + +class TestDygraphInplaceFloor(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.floor() + + def inplace_api_processing(self, var): + return var.floor_() + + +class TestDygraphInplaceExp(TestDygraphInplace): + def set_np_compare_func(self): + self.np_compare = np.allclose + + def non_inplace_api_processing(self, var): + return var.exp() + + def inplace_api_processing(self, var): + return var.exp_() + + +class TestDygraphInplaceReciprocal(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.reciprocal() + + def inplace_api_processing(self, var): + return var.reciprocal_() + + +class TestDygraphInplaceRound(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.round() + + def inplace_api_processing(self, var): + return var.round_() + + +class TestDygraphInplaceSqrt(TestDygraphInplace): + def init_data(self): + self.input_var_numpy = np.random.uniform(0, 5, [10, 20, 1]) + self.dtype = "float32" + + def non_inplace_api_processing(self, var): + return var.sqrt() + + def inplace_api_processing(self, var): + return var.sqrt_() + + +class TestDygraphInplaceRsqrt(TestDygraphInplaceSqrt): + def non_inplace_api_processing(self, var): + return var.rsqrt() + + def inplace_api_processing(self, var): + return var.rsqrt_() + + +class TestDygraphInplaceClip(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.clip(0.6, 1.5) + + def inplace_api_processing(self, var): + return var.clip_(0.6, 1.5) + + +class TestDygraphInplaceScale(TestDygraphInplace): + def non_inplace_api_processing(self, var): + return var.scale(scale=2.0, bias=3.0) + + def inplace_api_processing(self, var): + return var.scale_(scale=2.0, bias=3.0) + + +class TestDygraphInplaceAdd(TestDygraphInplace): + def init_data(self): + self.input_var_numpy = np.random.rand(2, 3, 4) + self.dtype = "float32" + self.input_var_numpy_2 = np.random.rand(2, 3, 4).astype(self.dtype) + + def 
non_inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.add(input_var_2) + + def inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.add_(input_var_2) + + +class TestDygraphInplaceSubtract(TestDygraphInplaceAdd): + def non_inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.subtract(input_var_2) + + def inplace_api_processing(self, var): + input_var_2 = paddle.to_tensor(self.input_var_numpy_2) + return var.subtract_(input_var_2) + + +class TestLossIsInplaceVar(unittest.TestCase): + def test_loss_is_inplace_var(self): + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.ones((2, 2)) + var_a.stop_gradient = False + + var_b = var_a * 2 + loss = var_b.tanh_() + + loss.backward() + inplace_grad_var_a = var_a.grad.numpy() + + with paddle.fluid.dygraph.guard(): + with _test_eager_guard(): + var_a = paddle.ones((2, 2)) + var_a.stop_gradient = False + + var_b = var_a * 2 + loss = var_b.tanh() + + loss.backward() + grad_var_a = var_a.grad.numpy() + + self.assertTrue(np.array_equal(inplace_grad_var_a, grad_var_a)) + + +class TestContinuouslyInplace(unittest.TestCase): + def test_continuously_inplace(self): + with _test_eager_guard(): + a = paddle.rand([2, 3]) + a.stop_gradient = False + b = a * 2 + + b.reshape_([-1]) + b.reshape_([2, 3]) + b.reshape_([-1]) + + b.backward() + + +if __name__ == '__main__': + unittest.main() -- GitLab From 8e4e19ab3003c8d8a29a319ada9cd3422b64a999 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Sat, 19 Mar 2022 12:40:58 +0800 Subject: [PATCH 173/176] Add infer meta (#40544) * add infer meta; test=develop * add histogram infer meta; test=develop * fix unitest bug; test=develop * format; test=develop * format; test=develop * bn not use new infer meta; test=develop * add infer meta; test=develop * fixbug; test=develop * fix bug; * recover unitest; test=develop --- paddle/fluid/operators/batch_norm_op.cc | 763 +----------------- paddle/fluid/operators/batch_norm_op.h | 18 - paddle/fluid/operators/conv_op.cc | 9 + .../fluid/operators/detection/yolo_box_op.cc | 7 +- paddle/fluid/operators/dropout_op.cc | 19 +- .../fluid/operators/fused/conv_fusion_op.cc | 136 ++++ paddle/fluid/operators/histogram_op.cc | 30 +- paddle/fluid/operators/inplace_abn_op.cc | 1 + paddle/fluid/operators/masked_select_op.cc | 19 +- paddle/fluid/operators/norm_op.cc | 22 +- paddle/fluid/operators/sync_batch_norm_op.cc | 1 + paddle/phi/infermeta/binary.cc | 277 +++++++ paddle/phi/infermeta/binary.h | 32 + paddle/phi/infermeta/multiary.cc | 112 +++ paddle/phi/infermeta/multiary.h | 20 + paddle/phi/infermeta/unary.cc | 49 ++ paddle/phi/infermeta/unary.h | 10 + .../test_mkldnn_conv_gelu_fuse_pass.py | 2 + 18 files changed, 692 insertions(+), 835 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 174207deb08..5194c8772e4 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -21,6 +21,9 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/multiary.h" + namespace paddle { namespace operators { @@ -297,184 +300,6 @@ The required data format for this layer is one of the following: )DOC"); } -template -class BatchNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - bool test_mode = is_test && (!trainable_stats); - - bool global_stats = test_mode || use_global_stats; - - const std::string data_layout_str = ctx.Attr("data_layout"); - DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be larger than 1." - "But received: the size of input X's dimensions is [%d]", - x_dims.size())); - PADDLE_ENFORCE_LE( - x_dims.size(), 5, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be less than 6." - "But received: the size of input X's dimensionss is [%d]", - x_dims.size())); - const int N = x_dims[0]; - const int C = - (data_layout == DataLayout::kNCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); - const int sample_size = x->numel() / N / C; - - auto *y = ctx.Output("Y"); - - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - - // alloc memory - y->mutable_data(ctx.GetPlace()); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); - - // input dimension is 2 and the format is NCHW. The input can be regarded - // as NHWC format - if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { - data_layout = DataLayout::kNHWC; - } - - if (!global_stats) { - // saved_xx is use just in this batch of data - EigenVectorArrayMap saved_mean_e( - saved_mean->mutable_data(ctx.GetPlace()), C); - EigenVectorArrayMap saved_variance_e( - saved_variance->mutable_data(ctx.GetPlace()), C); - saved_mean_e.setZero(); - saved_variance_e.setZero(); - - EigenVectorArrayMap running_mean_arr( - mean_out->mutable_data(ctx.GetPlace()), C); - EigenVectorArrayMap running_var_arr( - variance_out->mutable_data(ctx.GetPlace()), C); - - if ((N * sample_size) == 1) { - // Only 1 element in normalization dimension, - // we skip the batch norm calculation, let y = x. 
- framework::TensorCopy(*x, ctx.GetPlace(), y); - return; - } - - switch (data_layout) { - case DataLayout::kNCHW: { - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - saved_mean_e(nc % C) += x_arr.col(nc).sum(); - } - saved_mean_e /= N * sample_size; - for (int nc = 0; nc < N * C; ++nc) { - saved_variance_e(nc % C) += - (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); - } - saved_variance_e /= N * sample_size; - break; - } - case DataLayout::kNHWC: { - ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); - for (int i = 0; i < N * sample_size; ++i) { - saved_mean_e += x_arr.col(i); - } - saved_mean_e /= N * sample_size; - for (int i = 0; i < N * sample_size; ++i) { - saved_variance_e += - (x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e); - } - saved_variance_e /= N * sample_size; - break; - } - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown storage order: %s", data_layout_str)); - } - - // if MomentumTensor is set, use MomentumTensor value, momentum - // is only used in this training branch - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - momentum = mom_tensor->data()[0]; - } - - running_mean_arr = - running_mean_arr * momentum + saved_mean_e * (1. - momentum); - running_var_arr = - running_var_arr * momentum + saved_variance_e * (1. - momentum); - } - - // use SavedMean and SavedVariance to do normalize - Eigen::Array inv_std(C); - if (global_stats) { - ConstEigenVectorArrayMap var_arr( - ctx.Input("Variance")->data(), C); - inv_std = (var_arr + epsilon).sqrt().inverse(); - } else { - EigenVectorArrayMap saved_inv_std( - ctx.Output("SavedVariance")->data(), C); - // inverse SavedVariance first, gradient will use it too. - saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt(); - inv_std = saved_inv_std; - } - ConstEigenVectorArrayMap mean_arr( - global_stats ? 
ctx.Input("Mean")->data() - : ctx.Output("SavedMean")->data(), - C); - - // ((x - est_mean) * (inv_var) * scale + bias - // formula transform ====> - // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - ConstEigenVectorArrayMap scale_arr(scale->data(), C); - ConstEigenVectorArrayMap bias_arr(bias->data(), C); - Eigen::Array new_scale = inv_std * scale_arr; - Eigen::Array new_bias = - bias_arr - mean_arr * inv_std * scale_arr; - - switch (data_layout) { - case DataLayout::kNCHW: { - EigenArrayMap y_arr(y->mutable_data(ctx.GetPlace()), sample_size, - N * C); - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); - } - break; - } - case DataLayout::kNHWC: { - EigenArrayMap(y->mutable_data(ctx.GetPlace()), C, - N * sample_size) = - (ConstEigenArrayMap(x->data(), C, N * sample_size).colwise() * - new_scale) - .colwise() + - new_bias; - break; - } - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown storage order: %d", data_layout)); - } - } -}; - void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const { // check input OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "BatchNormGrad"); @@ -585,261 +410,6 @@ framework::OpKernelType BatchNormGradOp::GetKernelTypeForVar( tensor.place(), tensor.layout()); } -template -class BatchNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *saved_mean = ctx.Input("SavedMean"); - // SavedVariance have been reverted in forward operator - const auto *saved_inv_variance = ctx.Input("SavedVariance"); - const std::string data_layout_str = ctx.Attr("data_layout"); - bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - const float epsilon = ctx.Attr("epsilon"); - DataLayout data_layout = framework::StringToDataLayout(data_layout_str); - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - use_global_stats = is_test || use_global_stats; - - // batch_norm with inplace as false will take X as grad input, which - // is same as cuDNN batch_norm backward calculation, batch_norm - // with inplace as true only take Y as input and X should be calculate - // by inverse operation of batch_norm on Y - const Tensor *x; - bool is_inplace; - if (ctx.HasInput("Y")) { - x = ctx.Input("Y"); - is_inplace = true; - // if the input of batch norm is stop_gradient, d_x is null. - if (d_x) { - PADDLE_ENFORCE_EQ(d_x, d_y, - platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD not inplace in inplace mode")); - } - } else { - x = ctx.Input("X"); - is_inplace = false; - if (d_x) { - PADDLE_ENFORCE_NE( - d_x, d_y, platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD inplaced in non-inplace mode")); - } - } - - // Get the size for each dimension. - // NCHW [batch_size, in_channels, in_height, in_width] - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be larger than 1." 
- "But received: the size of input X's dimensions is [%d]", - x_dims.size())); - PADDLE_ENFORCE_LE( - x_dims.size(), 5, - platform::errors::InvalidArgument( - "The size of input X's dimensions should be less than 6." - "But received: the size of input X's dimensions is [%d]", - x_dims.size())); - const int N = x_dims[0]; - const int C = - (data_layout == DataLayout::kNCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); - const int sample_size = x->numel() / N / C; - - // input dimension is 2 and the format is NCHW. The input can be regarded as - // NHWC format - if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { - data_layout = DataLayout::kNHWC; - } - - // init output - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - } - - const T *mean_data = saved_mean->data(); - const T *inv_var_data = saved_inv_variance->data(); - Tensor inv_var_tensor; - if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - mean_data = running_mean->data(); - inv_var_tensor.Resize({C}); - T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); - ConstEigenVectorArrayMap var_arr(running_variance->data(), C); - - inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); - inv_var_data = running_inv_var_data; - } - - ConstEigenVectorArrayMap scale_arr(scale->data(), C); - ConstEigenVectorArrayMap bias_arr(bias->data(), C); - ConstEigenVectorArrayMap mean_arr(mean_data, C); - ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); - - T *d_bias_data = nullptr; - T *d_scale_data = nullptr; - if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - d_bias_data = d_bias->mutable_data(ctx.GetPlace()); - d_scale_data = d_scale->mutable_data(ctx.GetPlace()); - } - - // d_bias = np.sum(d_y, axis=0) - // d_scale = np.sum((X - mean) / inv_std * dy, axis=0) - // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0) - // - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0)) - EigenVectorArrayMap d_bias_arr(d_bias_data, C); - EigenVectorArrayMap d_scale_arr(d_scale_data, C); - - if (d_scale && d_bias) { - d_bias_arr.setZero(); - d_scale_arr.setZero(); - } - - if (d_x && (N * sample_size) == 1 && !use_global_stats) { - framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); - return; - } - - int scale_coefff = use_global_stats ? 
1 : N * sample_size; - const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff; - - Tensor dy_sum; - dy_sum.Resize({C}); - dy_sum.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap dy_sum_arr(dy_sum.mutable_data(ctx.GetPlace()), - C); - - Tensor dy_mul_x_sub_mean_mul_invstd_sum; - dy_mul_x_sub_mean_mul_invstd_sum.Resize({C}); - dy_mul_x_sub_mean_mul_invstd_sum.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap dy_mul_x_sub_mean_mul_invstd_sum_arr( - dy_mul_x_sub_mean_mul_invstd_sum.mutable_data(ctx.GetPlace()), C); - - dy_sum_arr.setZero(); - dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero(); - - // inplace calculation - // Y: ((x - est_mean) * (inv_var) * scale + bias - // formula transform ====> - // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) - // X: (y - bias) / scale / (inv_var) + est_mean - // formula transform ====> - // (y - bias) / (scale * inv_var) + est_mean - switch (data_layout) { - case DataLayout::kNCHW: { - if (is_inplace) { - auto px = *x; - EigenArrayMap x_data(px.mutable_data(ctx.GetPlace()), - sample_size, N * C); - ConstEigenArrayMap y_data(x->data(), sample_size, N * C); - for (int nc = 0; nc < N * C; ++nc) { - x_data.col(nc) = (y_data.col(nc) - bias_arr(nc % C)) / - scale_inv_var_nhw(nc % C) / scale_coefff + - mean_arr(nc % C); - } - } - ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); - ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); - - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - dy_sum_arr(c) += d_y_arr.col(nc).sum(); - dy_mul_x_sub_mean_mul_invstd_sum_arr(c) += - ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc)) - .sum(); - } - - if (d_scale && d_bias) { - d_bias_arr = dy_sum_arr; - d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; - } - - if (d_x) { - EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), - sample_size, N * C); - if (!use_global_stats) { - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - d_x_arr.col(nc) = - scale_inv_var_nhw(c) * - (d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) - - (x_arr.col(nc) - mean_arr[c]) * - dy_mul_x_sub_mean_mul_invstd_sum_arr(c) * - inv_var_arr(c)); - } - } else { - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - d_x_arr.col(nc) = scale_inv_var_nhw(c) * d_y_arr.col(nc); - } - } - } - break; - } - case DataLayout::kNHWC: { - if (is_inplace) { - auto px = *x; - EigenArrayMap x_data(px.mutable_data(ctx.GetPlace()), C, - N * sample_size); - ConstEigenArrayMap y_data(x->data(), C, N * sample_size); - for (int nhw = 0; nhw < N * sample_size; nhw++) { - x_data.col(nhw) = (y_data.col(nhw) - bias_arr) / scale_inv_var_nhw / - scale_coefff + - mean_arr; - } - } - ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); - ConstEigenArrayMap d_y_arr(d_y->data(), C, N * sample_size); - - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - dy_sum_arr += d_y_arr.col(nhw); - dy_mul_x_sub_mean_mul_invstd_sum_arr += - (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); - } - - if (d_scale && d_bias) { - d_bias_arr = dy_sum_arr; - d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; - } - - if (d_x) { - EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, - N * sample_size); - if (!use_global_stats) { - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - d_x_arr.col(nhw) = - scale_inv_var_nhw * - (d_y_arr.col(nhw) * N * sample_size - dy_sum_arr - - (x_arr.col(nhw) - mean_arr) * - dy_mul_x_sub_mean_mul_invstd_sum_arr * inv_var_arr); - } - } else { - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - d_x_arr.col(nhw) = 
scale_inv_var_nhw * d_y_arr.col(nhw); - } - } - } - break; - } - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Unknown storage order: %s", data_layout_str)); - } - } -}; - template void BatchNormGradMaker::Apply(GradOpPtr op) const { op->SetType(this->ForwardOpType() + "_grad"); @@ -951,335 +521,16 @@ framework::OpKernelType BatchNormDoubleGradOp::GetExpectedKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } -template -class BatchNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const float epsilon = ctx.Attr("epsilon"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - dX->mutable_data(ctx.GetPlace()); - ddY->mutable_data(ctx.GetPlace()); - - auto &dev_ctx = ctx.template device_context(); - - const auto &x_dims = X->dims(); - const int C = - (data_layout == DataLayout::kNCHW ? 
x_dims[1] - : x_dims[x_dims.size() - 1]); - const int sample_size = X->numel() / C; - phi::funcs::SetConstant set_constant; - - const T *mean_data = Saved_mean->data(); - const T *inv_var_data = Saved_variance->data(); - - Tensor inv_var_tensor; - if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - mean_data = running_mean->data(); - inv_var_tensor.Resize({C}); - - T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); - EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); - ConstEigenVectorArrayMap var_arr(running_variance->data(), C); - - inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); - inv_var_data = running_inv_var_data; - } - - // transpose NCHW -> NHWC for easy calculate - Tensor transformed_x(X->type()); - Tensor transformed_dy(dY->type()); - Tensor transformed_ddx(ddX->type()); - - Tensor transformed_dx(dX->type()); - Tensor transformed_ddy(ddY->type()); - if (data_layout == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; - // Input Tensor - ResizeToChannelLast(ctx, X, - &transformed_x); - TransToChannelLast(ctx, X, &transformed_x); - ResizeToChannelLast(ctx, dY, - &transformed_dy); - TransToChannelLast(ctx, dY, - &transformed_dy); - ResizeToChannelLast(ctx, ddX, - &transformed_ddx); - TransToChannelLast(ctx, ddX, - &transformed_ddx); - // Output Tensor - ResizeToChannelLast(ctx, dX, - &transformed_dx); - ResizeToChannelLast(ctx, ddY, - &transformed_ddy); - } else { - transformed_x.ShareDataWith(*X); - transformed_dy.ShareDataWith(*dY); - transformed_ddx.ShareDataWith(*ddX); - - transformed_dx.ShareDataWith(*dX); - transformed_ddy.ShareDataWith(*ddY); - } - - ConstEigenArrayMap x_arr(transformed_x.data(), C, sample_size); - ConstEigenVectorArrayMap mean_arr(mean_data, C); - ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); - - Tensor mean_tile; - mean_tile.Resize({C, sample_size}); - mean_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap mean_tile_data(mean_tile.mutable_data(ctx.GetPlace()), - C, sample_size); - - Tensor inv_var_tile; - inv_var_tile.Resize({C, sample_size}); - inv_var_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap inv_var_tile_data( - inv_var_tile.mutable_data(ctx.GetPlace()), C, sample_size); - - mean_tile_data = mean_arr.replicate(1, sample_size); - inv_var_tile_data = inv_var_arr.replicate(1, sample_size); - - Tensor Scale_data; - if (!Scale) { - Scale_data.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &Scale_data, static_cast(1)); - } - ConstEigenVectorArrayMap scale_arr( - Scale ? 
Scale->data() : Scale_data.data(), C); - - Tensor scale_tile; - scale_tile.Resize({C, sample_size}); - scale_tile.mutable_data(ctx.GetPlace()); - EigenArrayMap scale_tile_data(scale_tile.mutable_data(ctx.GetPlace()), - C, sample_size); - scale_tile_data = scale_arr.replicate(1, sample_size); - - ConstEigenArrayMap dy_arr(transformed_dy.data(), C, sample_size); - ConstEigenArrayMap ddx_arr(transformed_ddx.data(), C, sample_size); - - Tensor x_sub_mean_mul_invstd; - x_sub_mean_mul_invstd.Resize({C, sample_size}); - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()); - EigenArrayMap x_sub_mean_mul_invstd_arr( - x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()), C, sample_size); - x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; - - if (dX) { - dX->mutable_data(ctx.GetPlace()); - EigenArrayMap dx_arr(transformed_dx.mutable_data(ctx.GetPlace()), C, - sample_size); - dx_arr.setZero(); - if (use_global_stats) { - // math: dx = (ddscale * dy) * inv_var - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); - - dx_arr = dy_arr * ddscale_tile_data * inv_var_tile_data; - } - } else { - // math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx, - // axis=(n,h,w)) * - // np.sum(dy, axis=(n,h,w)) - - // np.sum(dy * ddx, axis=(n,h,w)) + 3 * np.mean(dy * (x - - // mean), - // axis=(n,h,w)) * inv_var.pow(2) * - // np.sum(ddx * (x - mean), axis=(n,h,w))) + inv_var.pow(3) / - // NxHxW * - // np.sum(ddx * (x - mean)) * - // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * - // np.sum(dy, - // axis=(n,h,w)) * (x - mean) * - // (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var - - // inv_var - // * - // np.mean(dy, axis=(n,h,w)) - - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), - // axis=(n,h,w))) - - if (ddX) { - dx_arr += - (x_sub_mean_mul_invstd_arr * inv_var_tile_data * - inv_var_tile_data / sample_size) - .colwise() * - (ddx_arr.rowwise().sum() * dy_arr.rowwise().sum() / sample_size - - (dy_arr * ddx_arr).rowwise().sum() + - 3. 
* (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() * - (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / - sample_size); - - dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * - (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / - sample_size * - (dy_arr.rowwise().sum() / sample_size - dy_arr); - - dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * - (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / - sample_size * - (ddx_arr.rowwise().sum() / sample_size - ddx_arr); - - dx_arr = scale_tile_data * dx_arr; - } - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); - - dx_arr += (dy_arr * inv_var_tile_data - - (dy_arr.rowwise().sum().replicate(1, sample_size) / - sample_size) * - inv_var_tile_data - - x_sub_mean_mul_invstd_arr * inv_var_tile_data * - (dy_arr * x_sub_mean_mul_invstd_arr) - .rowwise() - .sum() - .replicate(1, sample_size) / - sample_size) * - ddscale_tile_data; - } - } - if (data_layout == DataLayout::kNCHW) { - VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; - TransToChannelFirst( - ctx, &transformed_dx, dX); - } - } - if (dScale) { - dScale->mutable_data(ctx.GetPlace()); - EigenVectorArrayMap dscale_arr(dScale->mutable_data(ctx.GetPlace()), - C); - dscale_arr.setZero(); - if (use_global_stats) { - // math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var - if (ddX) { - dscale_arr = (ddx_arr * dy_arr * inv_var_tile_data).rowwise().sum(); - } - } else { - // math: dscale = inv_var * (dy - np.mean(dy, axis=(n,h,w) - (x-mean) * - // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) * - // ddx - if (ddX) { - Tensor first_grad; - first_grad.Resize({C, sample_size}); - EigenArrayMap first_grad_arr( - first_grad.mutable_data(ctx.GetPlace()), C, sample_size); - first_grad_arr.setZero(); - - first_grad_arr += - inv_var_tile_data * - (dy_arr - - dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size - - x_sub_mean_mul_invstd_arr * - (dy_arr * x_sub_mean_mul_invstd_arr) - .rowwise() - .sum() - .replicate(1, sample_size) / - sample_size); - dscale_arr = (first_grad_arr * ddx_arr).rowwise().sum(); - } - } - } - - if (ddY) { - ddY->mutable_data(ctx.GetPlace()); - EigenArrayMap ddy_arr(transformed_ddy.mutable_data(ctx.GetPlace()), - C, sample_size); - ddy_arr.setZero(); - if (use_global_stats) { - // math: ddy = r * ddx * inv_var + ddbias + - // ddscale * (x - mean) * inv_var - if (ddX) { - ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data; - } - } else { - // math: ddy = (x - mean) * inv_var * ddscale + ddbias + - // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * - // np.mean(ddx * (x - mean), axis=(n,h,w))) - if (ddX) { - ddy_arr += - scale_tile_data * inv_var_tile_data * - (ddx_arr - - ddx_arr.rowwise().sum().replicate(1, sample_size) / sample_size - - x_sub_mean_mul_invstd_arr * - (ddx_arr * x_sub_mean_mul_invstd_arr) - .rowwise() - .sum() - .replicate(1, sample_size) / - sample_size); - } - } - if (ddScale) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); - - ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; - } - - if (ddBias) 
{ - ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); - Tensor ddbias_tile; - ddbias_tile.Resize({C, sample_size}); - EigenArrayMap ddbias_tile_data( - ddbias_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddbias_tile_data = ddbias_arr.replicate(1, sample_size); - - ddy_arr += ddbias_tile_data; - } - - if (data_layout == DataLayout::kNCHW) { - VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; - TransToChannelFirst( - ctx, &transformed_ddy, ddY); - } - } - } -}; - DECLARE_INPLACE_OP_INFERER(BatchNormDoubleGradOpInplaceInferer, {"DY", "DDY"}); } // namespace operators } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(batch_norm, BatchNormInferShapeFunctor, + PD_INFER_META(phi::BatchNormInferMeta)); + REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, ops::BatchNormOpInferVarType, ops::BatchNormGradMaker, diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index f8d37d685b9..d274e8d2c00 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -113,23 +113,5 @@ class BatchNormOpInferVarType } }; -template -class BatchNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - -template -class BatchNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - -template -class BatchNormDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 8213e877f72..9be63a85fc0 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -27,6 +27,9 @@ limitations under the License. 
*/ #endif #include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -841,6 +844,8 @@ framework::OpKernelType ConvOpDoubleGrad::GetExpectedKernelType( } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(conv2d, Conv2dInferShapeFunctor, + PD_INFER_META(phi::ConvInferMeta)); REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker, ops::ConvOpInferVarType, ops::Conv2DGradMaker, @@ -851,6 +856,8 @@ REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad, REGISTER_OPERATOR(conv2d_grad_grad, ops::ConvOpDoubleGrad); // depthwise convolution op +DECLARE_INFER_SHAPE_FUNCTOR(depthwise_conv2d, DepthwiseConv2dInferShapeFunctor, + PD_INFER_META(phi::ConvInferMeta)); REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, ops::ConvOpInferVarType, ops::Conv2DGradMaker, @@ -860,6 +867,8 @@ REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad, ops::Conv2DDoubleGradMaker); REGISTER_OPERATOR(depthwise_conv2d_grad_grad, ops::ConvOpDoubleGrad); +DECLARE_INFER_SHAPE_FUNCTOR(conv3d, Conv3dInferShapeFunctor, + PD_INFER_META(phi::ConvInferMeta)); REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker, ops::ConvOpInferVarType, ops::Conv3DGradMaker, diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 0d9fbf612f7..35e38909017 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -9,8 +9,10 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -235,10 +237,13 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(yolo_box, YoloBoxInferShapeFunctor, + PD_INFER_META(phi::YoloBoxInferMeta)); REGISTER_OPERATOR( yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + YoloBoxInferShapeFunctor); REGISTER_OP_VERSION(yolo_box) .AddCheckpoint( diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 6d52ce45c4c..3d9950902ac 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -14,7 +14,9 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -25,17 +27,6 @@ class DropoutOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Dropout"); - - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", x_dims); - if (ctx->Attrs().Get("is_test") == false) { - ctx->SetOutputDim("Mask", x_dims); - } - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -173,7 +164,11 @@ class DropoutGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(dropout, DropoutInferShapeFunctor, + PD_INFER_META(phi::DropoutInferMeta)); + REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, ops::DropoutGradOpMaker, - ops::DropoutGradOpMaker); + ops::DropoutGradOpMaker, + DropoutInferShapeFunctor); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cc b/paddle/fluid/operators/fused/conv_fusion_op.cc index c445a28c084..e60fc44e9a6 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cc +++ b/paddle/fluid/operators/fused/conv_fusion_op.cc @@ -120,6 +120,142 @@ class Conv2DFusionOp : public operators::ConvOp { ctx->SetOutputsDim("Outputs", output_shapes); } } + + std::vector ComputeOutputShape( + framework::InferShapeContext* ctx) const { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Conv"); + OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "Conv"); + + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + std::string padding_algorithm = + ctx->Attrs().Get("padding_algorithm"); + int groups = ctx->Attrs().Get("groups"); + std::vector dilations = + ctx->Attrs().Get>("dilations"); + int dilation_size = dilations.size(); + for (int i = 0; i < dilation_size; ++i) { + PADDLE_ENFORCE_GT( + dilations[i], 0, + platform::errors::InvalidArgument( + "The dilation of Op(Conv) should be larget than 0, but received " + "dilation is %d.", + dilations[i])); + } + const std::string data_format = + ctx->Attrs().Get("data_format"); + + // MKL-DNN Kernels are using NCHW order of dims description + // so we ignore data_format consideration for MKL-DNN kernel + const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) && + (data_format == "NHWC" || data_format == "NDHWC"); + + PADDLE_ENFORCE_EQ( + in_dims.size() == 4 || in_dims.size() == 5, true, + platform::errors::InvalidArgument( + "The input of Op(Conv) should be a 4-D or 5-D Tensor. But " + "received: input's dimension is %u, input's shape is [%s].", + in_dims.size(), in_dims)); + + PADDLE_ENFORCE_EQ( + in_dims.size(), filter_dims.size(), + platform::errors::InvalidArgument( + "The input's dimension and filter's dimension of " + "Op(Conv) should be equal. 
But received: the input's shape is " + "[%s], " + "the input's dimension is %d; the filter's shape is [%s], " + "the filter's dimension is %d.", + in_dims, in_dims.size(), filter_dims, filter_dims.size())); + + int stride_size = strides.size(); + for (int i = 0; i < stride_size; ++i) { + PADDLE_ENFORCE_GT( + strides[i], 0, + platform::errors::InvalidArgument( + "The stride of Op(Conv) should be larget than 0, but received " + "stride is %d.", + strides[i])); + } + + int in_sub_stride_size = in_dims.size() - stride_size; + PADDLE_ENFORCE_EQ( + in_dims.size(), strides.size() + 2U, + platform::errors::InvalidArgument( + "The difference of input's dimension and Attr(strides)'s " + "length must be euqal to 2 for Op(Conv). " + "But received: input's dimension is %d, input's shape is [%s]; " + "Attr(stride)'s length is %d, Attr(stride) is [%s]; " + "difference of input's dimention and Attr(strides)'s length = %u.", + in_dims.size(), in_dims, strides.size(), phi::make_ddim(strides), + in_sub_stride_size)); + + const auto input_channels = + channel_last ? in_dims[in_dims.size() - 1] : in_dims[1]; + + PADDLE_ENFORCE_EQ( + input_channels, filter_dims[1] * groups, + platform::errors::InvalidArgument( + "The number of input's channels should be equal to filter's " + "channels " + "* groups for Op(Conv). But received: the input's channels is %d, " + "the input's shape is [%s]; the filter's channels is %d, the " + "filter's shape is [%s]; the groups is %d, the data_format is %s. " + "The error may come from wrong data_format setting.", + input_channels, in_dims, filter_dims[1], filter_dims, groups, + data_format)); + PADDLE_ENFORCE_EQ( + filter_dims[0] % groups, 0, + platform::errors::InvalidArgument( + "The number of output's channels (filter's first dimension) of " + "Op(Conv) should be divided by groups. But received: " + "the output channels is %d, the filter's shape is [%s], " + "the groups is %d.", + filter_dims[0], filter_dims, groups)); + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_GT( + filter_dims[0], 0, + platform::errors::InvalidArgument( + "the size of filter at axis 0 should be greater than 0")); + } + + framework::DDim in_data_dims; + if (channel_last) { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } else { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } + + framework::DDim filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + std::vector output_shape({in_dims[0]}); + if (!channel_last) { + output_shape.push_back(filter_dims[0]); + } + for (int i = 0; i < in_data_dims.size(); ++i) { + if ((!ctx->IsRuntime()) && + (in_data_dims[i] <= 0 || filter_dims[i + 2] <= 0)) { + output_shape.push_back(-1); + } else { + output_shape.push_back( + ConvOutputSize(in_data_dims[i], filter_data_dims[i], dilations[i], + paddings[2 * i], paddings[2 * i + 1], strides[i])); + } + } + if (channel_last) { + output_shape.push_back(filter_dims[0]); + } + + return output_shape; + } }; // TODO(qingqing): add gradient operator for conv2d_fusion diff --git a/paddle/fluid/operators/histogram_op.cc b/paddle/fluid/operators/histogram_op.cc index 92cc6077def..c9fd75651b5 100644 --- a/paddle/fluid/operators/histogram_op.cc +++ b/paddle/fluid/operators/histogram_op.cc @@ -16,7 +16,9 @@ limitations under the License. 
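[Editor's note] `Conv2DFusionOp::ComputeOutputShape` above and `phi::ConvInferMeta` near the end of this patch reduce each spatial dimension with the same arithmetic. The standalone program below restates it so the formula can be checked in isolation; `ConvOutSize` and the sample numbers are illustrative, not Paddle symbols.

#include <cstdio>

// Per-dimension output size of a dilated convolution, matching the expression in
// phi::ConvInferMeta: dkernel = dilation * (ksize - 1) + 1,
//                     out     = (in + pad_left + pad_right - dkernel) / stride + 1.
int ConvOutSize(int in, int ksize, int dilation, int pad_left, int pad_right, int stride) {
  const int dkernel = dilation * (ksize - 1) + 1;
  return (in + pad_left + pad_right - dkernel) / stride + 1;
}

int main() {
  // 224x224 input, 3x3 filter, padding 1 on both sides, stride 2, dilation 1 -> 112.
  std::printf("%d\n", ConvOutSize(224, 3, 1, 1, 1, 2));
  return 0;
}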
*/ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,27 +30,6 @@ class HistogramOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "histogram"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "histogram"); - const auto &nbins = ctx->Attrs().Get("bins"); - const auto &minval = ctx->Attrs().Get("min"); - const auto &maxval = ctx->Attrs().Get("max"); - - PADDLE_ENFORCE_GE(nbins, 1, - platform::errors::InvalidArgument( - "The bins should be greater than or equal to 1." - "But received nbins is %d", - nbins)); - PADDLE_ENFORCE_GE(maxval, minval, platform::errors::InvalidArgument( - "max must be larger or equal to min." - "But received max is %d, min is %d", - maxval, minval)); - - ctx->SetOutputDim("Out", phi::make_ddim({nbins})); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); @@ -81,7 +62,12 @@ class HistogramOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(histogram, HistogramInferShapeFunctor, + PD_INFER_META(phi::HistogramInferMeta)); + REGISTER_OPERATOR( histogram, ops::HistogramOp, ops::HistogramOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + HistogramInferShapeFunctor); diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index 7f513696998..77951ff394e 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -323,6 +323,7 @@ class InplaceABNGradKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; + REGISTER_OPERATOR(inplace_abn, ops::InplaceABNOp, ops::InplaceABNOpMaker, ops::BatchNormOpInferVarType, ops::InplaceABNOpGradMaker, diff --git a/paddle/fluid/operators/masked_select_op.cc b/paddle/fluid/operators/masked_select_op.cc index a6eb535c693..1887bbcfb7e 100644 --- a/paddle/fluid/operators/masked_select_op.cc +++ b/paddle/fluid/operators/masked_select_op.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,16 +23,6 @@ class MaskedSelectOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "Input", "MaskedSelect"); - OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "MaskedSelect"); - OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Out", "MaskedSelect"); - - // output will only be a 1-D Tensor - ctx->SetOutputDim("Y", phi::make_ddim({-1})); - ctx->ShareLoD("X", /*->*/ "Y"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -100,8 +92,13 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(MaskedSelectedGradNoNeedBufferVarsInferer, } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(masked_select, MaksedSelectInferShapeFunctor, + PD_INFER_META(phi::MaskedSelectInferMeta)); + REGISTER_OPERATOR(masked_select, ops::MaskedSelectOp, ops::MaskedSelectOpMaker, ops::MaskedSelectGradOpMaker, - ops::MaskedSelectGradOpMaker); + ops::MaskedSelectGradOpMaker, + MaksedSelectInferShapeFunctor); REGISTER_OPERATOR(masked_select_grad, ops::MaskedSelectOpGrad, ops::MaskedSelectedGradNoNeedBufferVarsInferer); diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index 5d394424d54..51daccce0e8 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -57,21 +59,7 @@ where, $\sum {x^2}$ is calculated along the `axis` dimension. 
}; class NormOp : public framework::OperatorWithKernel { - public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NormOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NormOp"); - auto xdim = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", xdim); - - if (ctx->Attrs().Get("is_test") == false) { - int axis = ctx->Attrs().Get("axis"); - if (axis < 0) axis = xdim.size() + axis; - xdim[axis] = 1; - ctx->SetOutputDim("Norm", xdim); - } - } }; class NormOpGrad : public framework::OperatorWithKernel { @@ -111,7 +99,11 @@ class NormOpGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(norm, NormInferShapeFunctor, + PD_INFER_META(phi::NormInferMeta)); + REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker, ops::NormOpGradOpMaker, - ops::NormOpGradOpMaker); + ops::NormOpGradOpMaker, + NormInferShapeFunctor); REGISTER_OPERATOR(norm_grad, ops::NormOpGrad); diff --git a/paddle/fluid/operators/sync_batch_norm_op.cc b/paddle/fluid/operators/sync_batch_norm_op.cc index d198992abde..0c178b02d03 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cc +++ b/paddle/fluid/operators/sync_batch_norm_op.cc @@ -50,6 +50,7 @@ class SyncBatchNormGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + REGISTER_OPERATOR(sync_batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, ops::BatchNormOpInferVarType, ops::SyncBatchNormGradMaker, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index f09e8789478..aabb944db30 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -21,6 +21,8 @@ limitations under the License. */ #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/kernels/funcs/common_shape.h" +#include "paddle/phi/kernels/cpu/conv_util.h" + namespace phi { namespace detail { @@ -355,6 +357,161 @@ void CrossInferMeta(const MetaTensor& x, out->share_lod(x); } +void ConvInferMeta(const MetaTensor& input, + const MetaTensor& filter, + const std::vector& strides, + const std::vector& paddings_t, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_t, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + MetaTensor* out, + MetaConfig config) { + std::vector paddings = paddings_t; + std::vector dilations = dilations_t; + auto in_dims = input.dims(); + auto filter_dims = filter.dims(); + int dilation_size = dilations.size(); + for (int i = 0; i < dilation_size; ++i) { + PADDLE_ENFORCE_GT( + dilations[i], + 0, + phi::errors::InvalidArgument( + "The dilation of Op(Conv) should be larget than 0, but received " + "dilation is %d.", + dilations[i])); + } + const bool channel_last = (config.is_run_mkldnn_kernel == false) && + (data_format == "NHWC" || data_format == "NDHWC"); + + PADDLE_ENFORCE_EQ( + in_dims.size() == 4 || in_dims.size() == 5, + true, + phi::errors::InvalidArgument( + "The input of Op(Conv) should be a 4-D or 5-D Tensor. But " + "received: input's dimension is %u, input's shape is [%s].", + in_dims.size(), + in_dims)); + + PADDLE_ENFORCE_EQ( + in_dims.size(), + filter_dims.size(), + phi::errors::InvalidArgument( + "The input's dimension and filter's dimension of " + "Op(Conv) should be equal. 
But received: the input's shape is [%s], " + "the input's dimension is %d; the filter's shape is [%s], " + "the filter's dimension is %d.", + in_dims, + in_dims.size(), + filter_dims, + filter_dims.size())); + + int stride_size = strides.size(); + for (int i = 0; i < stride_size; ++i) { + PADDLE_ENFORCE_GT( + strides[i], + 0, + phi::errors::InvalidArgument( + "The stride of Op(Conv) should be larget than 0, but received " + "stride is %d.", + strides[i])); + } + + int in_sub_stride_size = in_dims.size() - stride_size; + PADDLE_ENFORCE_EQ( + in_dims.size(), + strides.size() + 2U, + phi::errors::InvalidArgument( + "The difference of input's dimension and Attr(strides)'s " + "length must be euqal to 2 for Op(Conv). " + "But received: input's dimension is %d, input's shape is [%s]; " + "Attr(stride)'s length is %d, Attr(stride) is [%s]; " + "difference of input's dimention and Attr(strides)'s length = %u.", + in_dims.size(), + in_dims, + strides.size(), + phi::make_ddim(strides), + in_sub_stride_size)); + + const auto input_channels = + channel_last ? in_dims[in_dims.size() - 1] : in_dims[1]; + + PADDLE_ENFORCE_EQ( + input_channels, + filter_dims[1] * groups, + phi::errors::InvalidArgument( + "The number of input's channels should be equal to filter's channels " + "* groups for Op(Conv). But received: the input's channels is %d, " + "the input's shape is [%s]; the filter's channels is %d, the " + "filter's shape is [%s]; the groups is %d, the data_format is %s. " + "The error may come from wrong data_format setting.", + input_channels, + in_dims, + filter_dims[1], + filter_dims, + groups, + data_format)); + PADDLE_ENFORCE_EQ( + filter_dims[0] % groups, + 0, + phi::errors::InvalidArgument( + "The number of output's channels (filter's first dimension) of " + "Op(Conv) should be divided by groups. 
But received: " + "the output channels is %d, the filter's shape is [%s], " + "the groups is %d.", + filter_dims[0], + filter_dims, + groups)); + + if (config.is_runtime) { + PADDLE_ENFORCE_GT( + filter_dims[0], + 0, + phi::errors::InvalidArgument( + "the size of filter at axis 0 should be greater than 0")); + } + + DDim in_data_dims; + if (channel_last) { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } else { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } + + DDim filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); + + std::vector ksize = phi::vectorize(filter_data_dims); + phi::UpdatePaddingAndDilation( + &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); + + std::vector output_shape({in_dims[0]}); + if (!channel_last) { + output_shape.push_back(filter_dims[0]); + } + for (int i = 0; i < in_data_dims.size(); ++i) { + if ((!config.is_runtime) && + (in_data_dims[i] <= 0 || filter_dims[i + 2] <= 0)) { + output_shape.push_back(-1); + } else { + const int dkernel = dilations[i] * (filter_data_dims[i] - 1) + 1; + int output_size = + (in_data_dims[i] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) / + strides[i] + + 1; + output_shape.push_back(output_size); + } + } + if (channel_last) { + output_shape.push_back(filter_dims[0]); + } + + out->set_dims(make_ddim(output_shape)); + out->set_dtype(input.dtype()); +} + void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, @@ -815,6 +972,13 @@ void LogLossInferMeta(const MetaTensor& input, out->share_lod(input); } +void MaskedSelectInferMeta(const MetaTensor& x, + const MetaTensor& mask, + MetaTensor* out) { + out->set_dims({-1}); // can not infer + out->set_dtype(x.dtype()); +} + void MatmulInferMeta(const MetaTensor& x, const MetaTensor& y, bool trans_x, @@ -1188,6 +1352,118 @@ void TriangularSolveInferMeta(const MetaTensor& x, out->share_lod(y); } +void YoloBoxInferMeta(const MetaTensor& x, + const MetaTensor& img_size, + const std::vector& anchors, + int class_num, + float conf_thresh, + int downsample_ratio, + bool clip_bbox, + float scale_x_y, + bool iou_aware, + float iou_aware_factor, + MetaTensor* boxes, + MetaTensor* scores, + MetaConfig config) { + auto dim_x = x.dims(); + auto dim_imgsize = img_size.dims(); + int anchor_num = anchors.size() / 2; + + PADDLE_ENFORCE_EQ( + dim_x.size(), + 4, + phi::errors::InvalidArgument("Input(X) should be a 4-D tensor." + "But received X dimension(%s)", + dim_x.size())); + if (iou_aware) { + PADDLE_ENFORCE_EQ( + dim_x[1], + anchor_num * (6 + class_num), + phi::errors::InvalidArgument( + "Input(X) dim[1] should be equal to (anchor_mask_number * (6 " + "+ class_num)) while iou_aware is true." + "But received dim[1](%s) != (anchor_mask_number * " + "(6+class_num)(%s).", + dim_x[1], + anchor_num * (6 + class_num))); + PADDLE_ENFORCE_GE( + iou_aware_factor, + 0, + phi::errors::InvalidArgument( + "Attr(iou_aware_factor) should greater than or equal to 0." + "But received iou_aware_factor (%s)", + iou_aware_factor)); + PADDLE_ENFORCE_LE( + iou_aware_factor, + 1, + phi::errors::InvalidArgument( + "Attr(iou_aware_factor) should less than or equal to 1." + "But received iou_aware_factor (%s)", + iou_aware_factor)); + } else { + PADDLE_ENFORCE_EQ( + dim_x[1], + anchor_num * (5 + class_num), + phi::errors::InvalidArgument( + "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " + "+ class_num))." 
+ "But received dim[1](%s) != (anchor_mask_number * " + "(5+class_num)(%s).", + dim_x[1], + anchor_num * (5 + class_num))); + } + PADDLE_ENFORCE_EQ( + dim_imgsize.size(), + 2, + phi::errors::InvalidArgument("Input(ImgSize) should be a 2-D tensor." + "But received Imgsize size(%s)", + dim_imgsize.size())); + if ((dim_imgsize[0] > 0 && dim_x[0] > 0) || config.is_runtime) { + PADDLE_ENFORCE_EQ( + dim_imgsize[0], + dim_x[0], + phi::errors::InvalidArgument( + "Input(ImgSize) dim[0] and Input(X) dim[0] should be same.")); + } + PADDLE_ENFORCE_EQ( + dim_imgsize[1], + 2, + phi::errors::InvalidArgument("Input(ImgSize) dim[1] should be 2." + "But received imgsize dim[1](%s).", + dim_imgsize[1])); + PADDLE_ENFORCE_GT(anchors.size(), + 0, + phi::errors::InvalidArgument( + "Attr(anchors) length should be greater than 0." + "But received anchors length(%s).", + anchors.size())); + PADDLE_ENFORCE_EQ(anchors.size() % 2, + 0, + phi::errors::InvalidArgument( + "Attr(anchors) length should be even integer." + "But received anchors length (%s)", + anchors.size())); + PADDLE_ENFORCE_GT(class_num, + 0, + phi::errors::InvalidArgument( + "Attr(class_num) should be an integer greater than 0." + "But received class_num (%s)", + class_num)); + + int box_num; + if ((dim_x[2] > 0 && dim_x[3] > 0) || config.is_runtime) { + box_num = dim_x[2] * dim_x[3] * anchor_num; + } else { + box_num = -1; + } + std::vector dim_boxes({dim_x[0], box_num, 4}); + boxes->set_dims(phi::make_ddim(dim_boxes)); + boxes->set_dtype(x.dtype()); + + std::vector dim_scores({dim_x[0], box_num, class_num}); + scores->set_dims(phi::make_ddim(dim_scores)); +} + void ValueCompareInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out, @@ -1201,3 +1477,4 @@ void ValueCompareInferMeta(const MetaTensor& x, } // namespace phi PD_REGISTER_INFER_META_FN(add_raw, phi::ElementwiseRawInferMeta); +PD_REGISTER_INFER_META_FN(conv2d, phi::ConvInferMeta); diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index cb7a83f39a4..d770a096de7 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -69,6 +69,20 @@ void CompareInferMeta(const MetaTensor& x, int axis, MetaTensor* out); +void ConvInferMeta(const MetaTensor& input, + const MetaTensor& filter, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void CrossInferMeta(const MetaTensor& x, const MetaTensor& y, int axis, @@ -138,6 +152,10 @@ void LogLossInferMeta(const MetaTensor& input, MetaTensor* out, MetaConfig config = MetaConfig()); +void MaskedSelectInferMeta(const MetaTensor& x, + const MetaTensor& mask, + MetaTensor* out); + void MatmulInferMeta(const MetaTensor& x, const MetaTensor& y, bool trans_x, @@ -180,6 +198,20 @@ void TriangularSolveInferMeta(const MetaTensor& x, bool unitriangular, MetaTensor* out); +void YoloBoxInferMeta(const MetaTensor& x, + const MetaTensor& img_size, + const std::vector& anchors, + int class_num, + float conf_thresh, + int downsample_ratio, + bool clip_bbox, + float scale_x_y, + bool iou_aware, + float iou_aware_factor, + MetaTensor* boxes, + MetaTensor* scores, + MetaConfig config = MetaConfig()); + void ValueCompareInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc 
index 3f77a20af22..3e9da9a217a 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -14,7 +14,9 @@ limitations under the License. */ #include "paddle/phi/infermeta/multiary.h" #include +#include "paddle/phi/common/layout.h" #include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" namespace phi { @@ -200,6 +202,114 @@ void AucInferMeta(const MetaTensor& input, } } +void BatchNormInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& bias, + const MetaTensor& mean, + const MetaTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + MetaTensor* y, + MetaTensor* mean_out, + MetaTensor* variance_out, + MetaTensor* saved_mean, + MetaTensor* saved_variance, + MetaTensor* reserve_space, + MetaConfig config) { + const auto x_dims = x.dims(); + for (int i = 0; i < x_dims.size(); i++) { + PADDLE_ENFORCE_EQ( + (x_dims[i] == -1) || (x_dims[i] > 0), + true, + phi::errors::InvalidArgument( + "Each dimension of input tensor is expected to be -1 or a " + "positive number, but recieved %d. Input's shape is [%s].", + x_dims[i], + x_dims)); + } + + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::InvalidArgument( + "ShapeError: the dimension of input " + "X must greater than or equal to 2. But received: the shape of input " + "X = [%s], the dimension of input X =[%d]", + x_dims, + x_dims.size())); + PADDLE_ENFORCE_LE( + x_dims.size(), + 5, + phi::errors::InvalidArgument( + "ShapeError: the dimension of input X " + "must smaller than or equal to 5. But received: the shape of input X " + "= [%s], the dimension of input X = [%d]", + x_dims, + x_dims.size())); + + const int64_t C = ((config.is_run_mkldnn_kernel == true) || + (data_layout == DataLayout::kNCHW) + ? x_dims[1] + : x_dims[x_dims.size() - 1]); + auto scale_dim = scale.dims(); + auto bias_dim = bias.dims(); + + PADDLE_ENFORCE_EQ( + scale_dim.size(), + 1UL, + phi::errors::InvalidArgument( + "ShapeError: the dimension of scale must equal to 1." + "But received: the shape of scale is [%s], the dimension " + "of scale is [%d]", + scale_dim, + scale_dim.size())); + PADDLE_ENFORCE_EQ(bias_dim.size(), + 1UL, + phi::errors::InvalidArgument( + "ShapeError: the dimension of bias must equal to 1." 
+ "But received: the shape of bias is [%s],the dimension " + "of bias is [%d]", + bias_dim, + bias_dim.size())); + + bool check = true; + if ((!config.is_runtime) && + (phi::product(scale_dim) <= 0 || phi::product(bias_dim) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(scale_dim[0], + C, + phi::errors::InvalidArgument( + "ShapeError: the shape of scale must equal to [%d]" + "But received: the shape of scale is [%d]", + C, + scale_dim[0])); + PADDLE_ENFORCE_EQ(bias_dim[0], + C, + phi::errors::InvalidArgument( + "ShapeError: the shape of bias must equal to [%d]" + "But received: the shape of bias is [%d]", + C, + bias_dim[0])); + } + y->set_dims(x_dims); + mean_out->set_dims({C}); + variance_out->set_dims({C}); + saved_mean->set_dims({C}); + saved_variance->set_dims({C}); + y->share_lod(x); +} + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, @@ -577,3 +687,5 @@ void WhereInferMeta(const MetaTensor& condition, } } // namespace phi + +PD_REGISTER_INFER_META_FN(batch_norm, phi::BatchNormInferMeta); diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index a712ca31de7..068766c0e11 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -72,6 +72,26 @@ void AucInferMeta(const MetaTensor& input, MetaTensor* stat_neg_out, MetaConfig config = MetaConfig()); +void BatchNormInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& bias, + const MetaTensor& mean, + const MetaTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + MetaTensor* y, + MetaTensor* mean_out, + MetaTensor* variance_out, + MetaTensor* saved_mean, + MetaTensor* saved_variance, + MetaTensor* reserve_space, + MetaConfig config = MetaConfig()); + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 03029550c2a..0f518395531 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -304,6 +304,17 @@ void DiagonalInferMeta(const MetaTensor& input, out->set_dims(phi::make_ddim(out_dims)); } +void DropoutInferMeta(const MetaTensor& x, MetaTensor* out, MetaTensor* mask) { + auto x_dims = x.dims(); + out->set_dims(x_dims); + out->share_lod(x); + out->set_dtype(x.dtype()); + + if (mask != nullptr) { + mask->set_dims(x_dims); + } +} + void EighInferMeta(const MetaTensor& x, const std::string& uplo, MetaTensor* out_w, @@ -392,6 +403,26 @@ void GumbelSoftmaxInferMeta(const MetaTensor& x, UnchangedInferMetaCheckAxis(x, axis, out); } +void HistogramInferMeta( + const MetaTensor& input, int64_t bins, int min, int max, MetaTensor* out) { + PADDLE_ENFORCE_GE(bins, + 1, + phi::errors::InvalidArgument( + "The bins should be greater than or equal to 1." + "But received nbins is %d", + bins)); + PADDLE_ENFORCE_GE( + max, + min, + phi::errors::InvalidArgument("max must be larger or equal to min." 
+ "But received max is %d, min is %d", + max, + min)); + + out->set_dims({bins}); + out->share_lod(input); +} + void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out) { PADDLE_ENFORCE_EQ( product(x.dims()), @@ -787,6 +818,24 @@ void MultinomialInferMeta(const MetaTensor& x, out->set_dtype(DataType::INT64); } +void NormInferMeta(const MetaTensor& x, + int axis, + float epsilon, + bool is_test, + MetaTensor* out, + MetaTensor* norm) { + auto xdim = x.dims(); + out->set_dims(x.dims()); + out->set_dtype(x.dtype()); + + if (is_test == false) { + if (axis < 0) axis = xdim.size() + axis; + xdim[axis] = 1; + norm->set_dims(xdim); + norm->set_dtype(x.dtype()); + } +} + void PadInferMeta(const MetaTensor& input, const std::vector& paddings, float pad_value, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 00026f8598b..2d51bac995d 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -74,6 +74,8 @@ void DiagInferMeta(const MetaTensor& x, void DiagonalInferMeta( const MetaTensor& input, int offset, int axis1, int axis2, MetaTensor* out); +void DropoutInferMeta(const MetaTensor& x, MetaTensor* out, MetaTensor* mask); + void EighInferMeta(const MetaTensor& x, const std::string& uplo, MetaTensor* out_w, @@ -89,6 +91,8 @@ void GumbelSoftmaxInferMeta(const MetaTensor& x, bool hard, int axis, MetaTensor* out); +void HistogramInferMeta( + const MetaTensor& input, int64_t bins, int min, int max, MetaTensor* out); void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out); @@ -130,6 +134,12 @@ void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, MetaTensor* out); +void NormInferMeta(const MetaTensor& x, + int axis, + float epsilon, + bool is_test, + MetaTensor* out, + MetaTensor* norm); void PadInferMeta(const MetaTensor& input, const std::vector& paddings, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py index 33df4283888..81bb182802e 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py @@ -19,6 +19,7 @@ import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set import unittest +import paddle import hypothesis from hypothesis import given, settings, seed, example, assume @@ -104,4 +105,5 @@ class TestConvGeluMkldnnFusePass(PassAutoScanTest): if __name__ == "__main__": + paddle.enable_static() unittest.main() -- GitLab From c46f2ddb55ea075226e144b0222a38eba5352f53 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sat, 19 Mar 2022 13:05:01 +0800 Subject: [PATCH 174/176] fix python hook mem leak (#40716) --- paddle/fluid/pybind/imperative.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 85427a8455b..3a2c93309f3 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -119,7 +119,11 @@ class PyVariableWrapperHook : public imperative::VariableWrapperHook { return var; } - return PyObjectCast>(res)->SharedVar(); + auto res_varbase = PyObjectCast>(res); + // Here the reference count of `res` is 2, so we decreases the reference + // count manually to avoid memory leaks + Py_DECREF(res); + return res_varbase->SharedVar(); } private: -- 
GitLab From a8e5c9be8fc16940842e31719f9e85eb7e21de58 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Sat, 19 Mar 2022 13:29:07 +0800 Subject: [PATCH 175/176] move deformable_conv forward kernel to phi (#40700) --- paddle/fluid/operators/deformable_conv_op.cc | 2 - paddle/fluid/operators/deformable_conv_op.cu | 105 ----------- paddle/fluid/operators/deformable_conv_op.h | 96 ---------- .../phi/kernels/cpu/deformable_conv_kernel.cc | 146 +++++++++++++++ paddle/phi/kernels/cumsum_kernel.h | 2 +- paddle/phi/kernels/deformable_conv_kernel.h | 35 ++++ .../phi/kernels/gpu/deformable_conv_kernel.cu | 160 ++++++++++++++++ .../impl/deformable_conv_kernel_impl.h | 173 ++++++++++++++++++ paddle/phi/ops/compat/cumprod_sig.cc | 1 - paddle/phi/ops/compat/deformable_conv_sig.cc | 34 ++++ 10 files changed, 549 insertions(+), 205 deletions(-) create mode 100644 paddle/phi/kernels/cpu/deformable_conv_kernel.cc create mode 100644 paddle/phi/kernels/deformable_conv_kernel.h create mode 100644 paddle/phi/kernels/gpu/deformable_conv_kernel.cu create mode 100644 paddle/phi/kernels/impl/deformable_conv_kernel_impl.h create mode 100644 paddle/phi/ops/compat/deformable_conv_sig.cc diff --git a/paddle/fluid/operators/deformable_conv_op.cc b/paddle/fluid/operators/deformable_conv_op.cc index b15efc5f84b..6e15fd090b8 100644 --- a/paddle/fluid/operators/deformable_conv_op.cc +++ b/paddle/fluid/operators/deformable_conv_op.cc @@ -338,8 +338,6 @@ REGISTER_OPERATOR(deformable_conv, ops::DeformableConvOp, REGISTER_OPERATOR(deformable_conv_grad, ops::DeformableConvGradOp); -REGISTER_OP_CPU_KERNEL(deformable_conv, ops::DeformableConvCPUKernel, - ops::DeformableConvCPUKernel); REGISTER_OP_CPU_KERNEL(deformable_conv_grad, ops::DeformableConvGradCPUKernel, ops::DeformableConvGradCPUKernel); diff --git a/paddle/fluid/operators/deformable_conv_op.cu b/paddle/fluid/operators/deformable_conv_op.cu index 2c7d905c79b..ad10abf9c64 100644 --- a/paddle/fluid/operators/deformable_conv_op.cu +++ b/paddle/fluid/operators/deformable_conv_op.cu @@ -446,108 +446,6 @@ __global__ void FilterGradAddupGpuKernel(const int nthreads, const int n, } } -template -class DeformableConvCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("Input"); - const Tensor offset = *ctx.Input("Offset"); - const Tensor mask = *ctx.Input("Mask"); - Tensor filter = *ctx.Input("Filter"); - Tensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.cuda_device_context(); - - const int groups = ctx.Attr("groups"); - const int deformable_groups = ctx.Attr("deformable_groups"); - const int im2col_step = ctx.Attr("im2col_step"); - const std::vector strides = ctx.Attr>("strides"); - const std::vector paddings = ctx.Attr>("paddings"); - const std::vector dilations = ctx.Attr>("dilations"); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - std::vector output_shape_vec(phi::vectorize(output->dims())); - - // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} - std::vector col_buffer_shape_vec(filter_shape_vec.size()); - col_buffer_shape_vec[0] = - input->dims()[1] * filter.dims()[2] * filter.dims()[3]; - col_buffer_shape_vec[1] = im2col_step; - for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { - col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec)); - std::vector 
output_buffer_shape_vec(1); - output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * - output_shape_vec[2] * output_shape_vec[3]; - framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec)); - Tensor col_buffer; - Tensor output_buffer; - col_buffer = ctx.AllocateTmpTensor(col_shape, dev_ctx); - output_buffer = - ctx.AllocateTmpTensor(output_shape, dev_ctx); - - int64_t M = output_shape_vec[1] / groups; - int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; - int64_t K = - input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; - - Tensor weight_3d; - weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K})); - Tensor col_buffer_3d; - col_buffer_3d.ShareDataWith(col_buffer) - .Resize(phi::make_ddim({groups, K, N})); - Tensor output_4d; - output_4d.ShareDataWith(output_buffer) - .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); - output_4d.mutable_data(ctx.GetPlace()); - framework::DDim input_shape = - phi::slice_ddim(input->dims(), 1, input->dims().size()); - std::vector input_shape_vec = phi::vectorize(input_shape); - - int input_dim = input->numel() / input->dims()[0]; - int input_offset_dim = offset.numel() / offset.dims()[0]; - int input_mask_dim = mask.numel() / mask.dims()[0]; - - auto blas = phi::funcs::GetBlas(dev_ctx); - - const T* input_ptr = input->data(); - const T* offset_ptr = offset.data(); - const T* mask_ptr = mask.data(); - col_buffer.mutable_data(ctx.GetPlace()); - T* col_buffer_ptr = col_buffer.data(); - - for (int i = 0; i < batch_size / im2col_step; ++i) { - ModulatedDeformableIm2col( - ctx.device_context(), input_ptr + i * im2col_step * input_dim, - offset_ptr + i * im2col_step * input_offset_dim, - mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec, - col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations, - deformable_groups, col_buffer_ptr); - - Tensor output_3d = output_4d.Slice(i, i + 1).Resize( - phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); - for (int g = 0; g < groups; ++g) { - Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); - Tensor col_buffer_3d_slice = - col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( - col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); - Tensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size())); - - blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0), - &output_3d_slice, T(0.0)); - } - } - output->ShareDataWith(output_buffer) - .Resize(phi::make_ddim(output_shape_vec)); - } -}; - template class DeformableConvGradCUDAKernel : public framework::OpKernel { public: @@ -740,9 +638,6 @@ class DeformableConvGradCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(deformable_conv, - ops::DeformableConvCUDAKernel, - ops::DeformableConvCUDAKernel); REGISTER_OP_CUDA_KERNEL(deformable_conv_grad, ops::DeformableConvGradCUDAKernel, ops::DeformableConvGradCUDAKernel); diff --git a/paddle/fluid/operators/deformable_conv_op.h b/paddle/fluid/operators/deformable_conv_op.h index 66961655ee6..1176b96987e 100644 --- a/paddle/fluid/operators/deformable_conv_op.h +++ b/paddle/fluid/operators/deformable_conv_op.h @@ -318,102 +318,6 @@ void FilterGradAddupCPUKernel(const int nthreads, const int n, const int height, } } -template -class DeformableConvCPUKernel : public 
framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* offset = ctx.Input("Offset"); - auto* mask = ctx.Input("Mask"); - Tensor filter = *ctx.Input("Filter"); - Tensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - - const int groups = ctx.Attr("groups"); - const int deformable_groups = ctx.Attr("deformable_groups"); - const int im2col_step = ctx.Attr("im2col_step"); - const std::vector strides = ctx.Attr>("strides"); - const std::vector paddings = ctx.Attr>("paddings"); - const std::vector dilations = ctx.Attr>("dilations"); - - const int batch_size = static_cast(input->dims()[0]); - - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - std::vector output_shape_vec(phi::vectorize(output->dims())); - - // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} - std::vector col_buffer_shape_vec(filter_shape_vec.size()); - col_buffer_shape_vec[0] = - input->dims()[1] * filter.dims()[2] * filter.dims()[3]; - col_buffer_shape_vec[1] = im2col_step; - for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { - col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_buffer_shape_vec)); - std::vector output_buffer_shape_vec(1); - output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * - output_shape_vec[2] * output_shape_vec[3]; - framework::DDim output_shape(phi::make_ddim(output_buffer_shape_vec)); - Tensor col_buffer; - Tensor output_buffer; - col_buffer = ctx.AllocateTmpTensor(col_shape, dev_ctx); - output_buffer = - ctx.AllocateTmpTensor(output_shape, dev_ctx); - int64_t M = output_shape_vec[1] / groups; - int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; - int64_t K = - input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; - - Tensor weight_3d; - weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K})); - Tensor col_buffer_3d; - col_buffer_3d.ShareDataWith(col_buffer) - .Resize(phi::make_ddim({groups, K, N})); - Tensor output_4d; - output_4d.ShareDataWith(output_buffer) - .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); - output_4d.mutable_data(ctx.GetPlace()); - framework::DDim input_shape = - phi::slice_ddim(input->dims(), 1, input->dims().size()); - std::vector input_shape_vec = phi::vectorize(input_shape); - int input_dim = input->numel() / input->dims()[0]; - int input_offset_dim = offset->numel() / offset->dims()[0]; - int input_mask_dim = mask->numel() / mask->dims()[0]; - auto blas = phi::funcs::GetBlas(dev_ctx); - const T* input_ptr = input->data(); - const T* offset_ptr = offset->data(); - const T* mask_ptr = mask->data(); - col_buffer.mutable_data(ctx.GetPlace()); - T* col_buffer_ptr = col_buffer.data(); - for (int i = 0; i < batch_size / im2col_step; ++i) { - ModulatedDeformableIm2colCPU( - dev_ctx, input_ptr + i * im2col_step * input_dim, - offset_ptr + i * im2col_step * input_offset_dim, - mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec, - col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations, - deformable_groups, col_buffer_ptr); - Tensor output_3d = output_4d.Slice(i, i + 1).Resize( - phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); - // get the product of pixel and weight - for (int g = 0; g < groups; ++g) { - Tensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(weight_3d.dims(), 1, 
weight_3d.dims().size())); - Tensor col_buffer_3d_slice = - col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( - col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); - Tensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( - phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size())); - blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0), - &output_3d_slice, T(0.0)); - } - } - output->ShareDataWith(output_buffer) - .Resize(phi::make_ddim(output_shape_vec)); - } -}; - template class DeformableConvGradCPUKernel : public framework::OpKernel { public: diff --git a/paddle/phi/kernels/cpu/deformable_conv_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_kernel.cc new file mode 100644 index 00000000000..0d61f7be68a --- /dev/null +++ b/paddle/phi/kernels/cpu/deformable_conv_kernel.cc @@ -0,0 +1,146 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/deformable_conv_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/deformable_conv_kernel_impl.h" + +namespace phi { + +template +inline void ModulatedDeformableIm2colCPUKernel( + const int num_kernels, + const T* data_im, + const T* data_offset, + const T* data_mask, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int num_channels, + const int deformable_group, + const int height_col, + const int width_col, + T* data_col) { + for (int i = 0; i < num_kernels; i++) { + const int w_col = i % width_col; + const int h_col = (i / width_col) % height_col; + const int b_col = (i / width_col) / height_col % batch_size; + const int c_im = (i / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + T* data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T* data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const T* data_offset_ptr = + data_offset + + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + const T* data_mask_ptr = + data_mask + + (b_col * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * 
kernel_w + j) * height_col + h_col) * width_col + w_col; + + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +template +void ModulatedDeformableIm2col(const Context& dev_ctx, + const T* data_im, + const T* data_offset, + const T* data_mask, + const std::vector& im_shape, + const std::vector& col_shape, + const std::vector& filter_shape, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const int deformable_groups, + T* data_col) { + int channel_per_deformable_group = im_shape[0] / deformable_groups; + int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; + + // get outputs of im2col with offset by bilinear interpolation + ModulatedDeformableIm2colCPUKernel(num_kernels, + data_im, + data_offset, + data_mask, + im_shape[1], + im_shape[2], + filter_shape[2], + filter_shape[3], + paddings[0], + paddings[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + channel_per_deformable_group, + col_shape[1], + im_shape[0], + deformable_groups, + col_shape[2], + col_shape[3], + data_col); +} + +} // namespace phi + +PD_REGISTER_KERNEL(deformable_conv, + CPU, + ALL_LAYOUT, + phi::DeformableConvKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cumsum_kernel.h b/paddle/phi/kernels/cumsum_kernel.h index fd90c7b8f5e..f105c94d559 100644 --- a/paddle/phi/kernels/cumsum_kernel.h +++ b/paddle/phi/kernels/cumsum_kernel.h @@ -18,7 +18,7 @@ namespace phi { -template +template void CumsumKernel(const Context& dev_ctx, const DenseTensor& x, int axis, diff --git a/paddle/phi/kernels/deformable_conv_kernel.h b/paddle/phi/kernels/deformable_conv_kernel.h new file mode 100644 index 00000000000..3886e6801a3 --- /dev/null +++ b/paddle/phi/kernels/deformable_conv_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
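Editor's note, not part of the patch: the im2col kernels above write into a column buffer shaped {C_in * k_h * k_w, im2col_step, out_h, out_w} (see the col_shape_vec comments elsewhere in this patch). A small worked example of that shape arithmetic, with my own helper name:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// First extent: the unrolled receptive field per output location; the rest
// index the im2col_step images handled per GEMM and their output grid.
std::vector<int64_t> ColBufferShape(int64_t c_in, int64_t k_h, int64_t k_w,
                                    int64_t im2col_step, int64_t out_h,
                                    int64_t out_w) {
  return {c_in * k_h * k_w, im2col_step, out_h, out_w};
}

int main() {
  // 64 input channels, 3x3 kernel, im2col_step 1, 56x56 output grid.
  auto shape = ColBufferShape(64, 3, 3, 1, 56, 56);
  assert(shape[0] == 576 && shape[1] == 1 && shape[2] == 56 && shape[3] == 56);
  return 0;
}
```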
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void DeformableConvKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& offset, + const DenseTensor& filter, + const DenseTensor& mask, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + int deformable_groups, + int groups, + int im2col_step, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu new file mode 100644 index 00000000000..1db6e1b7cf7 --- /dev/null +++ b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu @@ -0,0 +1,160 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/deformable_conv_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/impl/deformable_conv_kernel_impl.h" + +namespace phi { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +template +__global__ void ModulatedDeformableIm2colGpuKernel( + const int nthreads, + const T* data_im, + const T* data_offset, + const T* data_mask, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int num_channels, + const int deformable_group, + const int height_col, + const int width_col, + T* data_col) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + const int w_col = i % width_col; + const int h_col = (i / width_col) % height_col; + const int b_col = (i / width_col) / height_col % batch_size; + const int c_im = (i / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + T* data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T* data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const T* data_offset_ptr = + data_offset + + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + const T* data_mask_ptr = + data_mask + + (b_col * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + 
const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +template +void ModulatedDeformableIm2col(const Context& dev_ctx, + const T* data_im, + const T* data_offset, + const T* data_mask, + const std::vector& im_shape, + const std::vector& col_shape, + const std::vector& filter_shape, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const int deformable_groups, + T* data_col) { + int channel_per_deformable_group = im_shape[0] / deformable_groups; + int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; + + int blocks = NumBlocks(num_kernels); + int threads = kNumCUDAThreads; + + ModulatedDeformableIm2colGpuKernel< + T><<>>(num_kernels, + data_im, + data_offset, + data_mask, + im_shape[1], + im_shape[2], + filter_shape[2], + filter_shape[3], + paddings[0], + paddings[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + channel_per_deformable_group, + col_shape[1], + im_shape[0], + deformable_groups, + col_shape[2], + col_shape[3], + data_col); +} + +} // namespace phi + +PD_REGISTER_KERNEL(deformable_conv, + GPU, + ALL_LAYOUT, + phi::DeformableConvKernel, + float, + double) {} diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h new file mode 100644 index 00000000000..d8795808a64 --- /dev/null +++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h @@ -0,0 +1,173 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { + +template +HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data, + const int data_width, + const int height, + const int width, + T h, + T w) { + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh; + T hw = 1 - lw; + + T v1 = + (h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0; + T v2 = (h_low >= 0 && w_high <= width - 1) + ? 
bottom_data[h_low * data_width + w_high] + : 0; + T v3 = (h_high <= height - 1 && w_low >= 0) + ? bottom_data[h_high * data_width + w_low] + : 0; + T v4 = (h_high <= height - 1 && w_high <= width - 1) + ? bottom_data[h_high * data_width + w_high] + : 0; + + T w1 = hh * hw; + T w2 = hh * lw; + T w3 = lh * hw; + T w4 = lh * lw; + + return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; +} + +template +void ModulatedDeformableIm2col(const Context& dev_ctx, + const T* data_im, + const T* data_offset, + const T* data_mask, + const std::vector& im_shape, + const std::vector& col_shape, + const std::vector& filter_shape, + const std::vector& paddings, + const std::vector& strides, + const std::vector& dilations, + const int deformable_groups, + T* data_col); + +template +void DeformableConvKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& offset, + const DenseTensor& filter, + const DenseTensor& mask, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + int deformable_groups, + int groups, + int im2col_step, + DenseTensor* out) { + const int batch_size = static_cast(x.dims()[0]); + + std::vector filter_shape_vec(phi::vectorize(filter.dims())); + std::vector output_shape_vec(phi::vectorize(out->dims())); + + // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} + std::vector col_buffer_shape_vec(filter_shape_vec.size()); + col_buffer_shape_vec[0] = x.dims()[1] * filter.dims()[2] * filter.dims()[3]; + col_buffer_shape_vec[1] = im2col_step; + for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { + col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; + } + + std::vector output_buffer_shape_vec(1); + output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * + output_shape_vec[2] * output_shape_vec[3]; + + DenseTensor col_buffer = Empty(dev_ctx, col_buffer_shape_vec); + DenseTensor output_buffer = Empty(dev_ctx, output_buffer_shape_vec); + + int64_t M = output_shape_vec[1] / groups; + int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; + int64_t K = x.dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; + + DenseTensor weight_3d; + weight_3d.ShareDataWith(filter).Resize(phi::make_ddim({groups, M, K})); + + DenseTensor col_buffer_3d; + col_buffer_3d.ShareDataWith(col_buffer) + .Resize(phi::make_ddim({groups, K, N})); + + DenseTensor output_4d; + output_4d.ShareDataWith(output_buffer) + .Resize(phi::make_ddim({batch_size / im2col_step, groups, M, N})); + + DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size()); + std::vector input_shape_vec = phi::vectorize(input_shape); + + int input_dim = x.numel() / x.dims()[0]; + int input_offset_dim = offset.numel() / offset.dims()[0]; + int input_mask_dim = mask.numel() / mask.dims()[0]; + + auto blas = phi::funcs::GetBlas(dev_ctx); + + const T* input_ptr = x.data(); + const T* offset_ptr = offset.data(); + const T* mask_ptr = mask.data(); + T* col_buffer_ptr = col_buffer.data(); + + for (int i = 0; i < batch_size / im2col_step; ++i) { + ModulatedDeformableIm2col(dev_ctx, + input_ptr + i * im2col_step * input_dim, + offset_ptr + i * im2col_step * input_offset_dim, + mask_ptr + i * im2col_step * input_mask_dim, + input_shape_vec, + col_buffer_shape_vec, + filter_shape_vec, + paddings, + strides, + dilations, + deformable_groups, + col_buffer_ptr); + DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize( + phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); + // get the product of pixel and weight + for (int g = 0; g < groups; ++g) { + 
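Editor's note, not part of the patch: DmcnIm2colBilinear above is plain bilinear sampling with zero contribution outside the feature map. A standalone restatement with a tiny numeric check; the function name is mine:

```cpp
#include <cassert>
#include <cmath>

// Blend the four neighbouring pixels by the fractional offsets; samples that
// fall outside the map contribute zero.
float BilinearSample(const float* data, int width, int height, float h,
                     float w) {
  const int h_low = static_cast<int>(std::floor(h));
  const int w_low = static_cast<int>(std::floor(w));
  const int h_high = h_low + 1, w_high = w_low + 1;
  const float lh = h - h_low, lw = w - w_low, hh = 1 - lh, hw = 1 - lw;
  auto at = [&](int y, int x) -> float {
    return (y >= 0 && y < height && x >= 0 && x < width) ? data[y * width + x]
                                                         : 0.f;
  };
  return hh * hw * at(h_low, w_low) + hh * lw * at(h_low, w_high) +
         lh * hw * at(h_high, w_low) + lh * lw * at(h_high, w_high);
}

int main() {
  const float img[4] = {0.f, 1.f, 2.f, 3.f};  // 2x2 feature map
  // Sampling at the centre averages all four pixels: (0+1+2+3)/4 = 1.5.
  assert(std::fabs(BilinearSample(img, 2, 2, 0.5f, 0.5f) - 1.5f) < 1e-6f);
  return 0;
}
```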
DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize( + phi::slice_ddim(weight_3d.dims(), 1, weight_3d.dims().size())); + DenseTensor col_buffer_3d_slice = + col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim( + col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); + DenseTensor output_3d_slice = output_3d.Slice(g, g + 1).Resize( + phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size())); + blas.MatMul(weight_3d_slice, + false, + col_buffer_3d_slice, + false, + T(1.0), + &output_3d_slice, + T(0.0)); + } + } + out->ShareDataWith(output_buffer).Resize(phi::make_ddim(output_shape_vec)); +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/cumprod_sig.cc b/paddle/phi/ops/compat/cumprod_sig.cc index 59b4eabfa47..01084e764ed 100644 --- a/paddle/phi/ops/compat/cumprod_sig.cc +++ b/paddle/phi/ops/compat/cumprod_sig.cc @@ -1,4 +1,3 @@ - // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/paddle/phi/ops/compat/deformable_conv_sig.cc b/paddle/phi/ops/compat/deformable_conv_sig.cc new file mode 100644 index 00000000000..e2a21673634 --- /dev/null +++ b/paddle/phi/ops/compat/deformable_conv_sig.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
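Editor's note, not part of the patch: DeformableConvKernel above runs one GEMM per group and per im2col_step batch, with the M, N, K extents defined in the kernel. A worked restatement of that arithmetic; the struct and function names are mine:

```cpp
#include <cassert>
#include <cstdint>

// Per-group GEMM extents: the weight slice is MxK, the im2col buffer slice is
// KxN, and each output slice is MxN.
struct GemmDims { int64_t M, N, K; };

GemmDims DeformableConvGemmDims(int64_t in_c, int64_t out_c, int64_t k_h,
                                int64_t k_w, int64_t out_h, int64_t out_w,
                                int64_t groups, int64_t im2col_step) {
  return {out_c / groups, im2col_step * out_h * out_w,
          in_c * k_h * k_w / groups};
}

int main() {
  // 64 -> 128 channels, 3x3 kernel, 28x28 output, 2 groups, im2col_step 1.
  GemmDims d = DeformableConvGemmDims(64, 128, 3, 3, 28, 28, 2, 1);
  assert(d.M == 64 && d.N == 784 && d.K == 288);
  return 0;
}
```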
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature DeformableConvOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("deformable_conv", + {"Input", "Offset", "Filter", "Mask"}, + {"strides", + "paddings", + "dilations", + "deformable_groups", + "groups", + "im2col_step"}, + {"Output"}); +} + +} // namespace phi +PD_REGISTER_ARG_MAPPING_FN(deformable_conv, + phi::DeformableConvOpArgumentMapping); -- GitLab From 95fbbc5b47a08bb5bb62d0161f795c9ff2ffb813 Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Sat, 19 Mar 2022 18:05:19 +0800 Subject: [PATCH 176/176] Call sparse op from python (#40608) * call sparse api from python --- .../final_state_generator/eager_gen.py | 17 +-- .../final_state_generator/python_c_gen.py | 2 +- paddle/fluid/pybind/eager_method.cc | 114 ++++++++++++++++++ paddle/phi/api/include/tensor.h | 16 +++ paddle/phi/api/lib/sparse_api_custom_impl.cc | 56 ++++----- paddle/phi/api/lib/sparse_api_custom_impl.h | 8 +- paddle/phi/api/lib/tensor.cc | 8 ++ paddle/phi/tests/api/test_sparse_utils_api.cc | 15 +-- .../tests/unittests/test_sparse_utils_op.py | 60 +++++++++ python/paddle/tensor/to_string.py | 51 ++++++-- python/paddle/utils/code_gen/sparse_api.yaml | 13 +- .../paddle/utils/code_gen/sparse_api_gen.py | 4 +- .../utils/code_gen/sparse_bw_api_gen.py | 4 +- 13 files changed, 298 insertions(+), 70 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_sparse_utils_op.py diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 588fe312a3c..1685b6f3cb5 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -730,7 +730,7 @@ def GenerateNodeCreationCodes( else: # Tuple api_result if IsPlainTensorType(rtype): - output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);" + output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));" else: assert IsVectorTensorType(rtype) output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);\n" @@ -767,8 +767,11 @@ def GenerateNodeCreationCodes( else: set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" else: - if IsVectorTensorType(atype): - tw_name = f"api_result[{pos}]" + if num_fwd_outputs > 1: + # Aligned with forward output position + assert name in forward_outputs_position_map.keys() + fwd_output_pos = forward_outputs_position_map[name][1] + tw_name = f"std::get<{fwd_output_pos}>(api_result)" else: tw_name = f"api_result" @@ -805,8 +808,8 @@ def GenerateNodeCreationCodes( set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);" set_grad_in_meta = f" grad_node->SetGradInMeta(api_result, {pos});" else: - set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result[{pos}]);" - set_grad_in_meta = f" grad_node->SetGradInMeta(api_result[{pos}], {pos});" + set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));" + set_grad_in_meta = f" grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});" set_out_rank_list.append(set_out_rank) set_history_list.append(set_history) @@ -934,7 +937,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, returns_list[0] 
= f"api_result" else: # Tuple api_result - returns_list[pos] = f"api_result[{pos}]" + returns_list[pos] = f"std::get<{pos}>(api_result)" if IsPlainTensorType(rtype): returns_type_list[pos] = "paddle::experimental::Tensor" @@ -1084,7 +1087,7 @@ def GenerateNodeCCFile(filepath, node_definition_str): #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" #include "paddle/fluid/eager/to_static/run_program_op_node.h" -#include "paddle/phi/api/include/sparse_api.h" +#include "paddle/phi/api/backward/sparse_bw_api.h" """ file_contents += node_definition_str with open(filepath, 'a') as f: diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 753c8ca3aaf..e1c2cf871ea 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -337,7 +337,7 @@ class PythonCSingleFunctionGenerator: "paddle::experimental::", namespace, forward_api_name) else: fwd_function_name = FUNCTION_NAME_TEMPLATE.format( - "", namespace, GetForwardFunctionName(forward_api_name)) + "::", namespace, GetForwardFunctionName(forward_api_name)) # Generate Record Event for performance profiling pythonc_record_event_str = RECORD_EVENT_TEMPLATE.format( diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index e0a3931c3e3..49745e5679d 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -36,6 +36,8 @@ limitations under the License. */ #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" namespace paddle { namespace pybind { @@ -718,6 +720,98 @@ static PyObject* set_grad_type(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_method_get_non_zero_indices(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + PADDLE_ENFORCE(self->tensor.is_sparse_coo_tensor(), + paddle::platform::errors::Fatal( + "this method is only effective for SparseCooTensor")); + auto sparse_coo_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + paddle::experimental::Tensor tensor(std::make_shared( + sparse_coo_tensor->non_zero_indices())); + return ToPyObject(tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_get_non_zero_elements(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + PADDLE_ENFORCE( + self->tensor.is_sparse_coo_tensor() || + self->tensor.is_sparse_csr_tensor(), + paddle::platform::errors::Fatal("this method is only effective for " + "SparseCooTensor or SparseCsrTensor")); + if (self->tensor.is_sparse_coo_tensor()) { + auto sparse_coo_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + paddle::experimental::Tensor tensor(std::make_shared( + sparse_coo_tensor->non_zero_elements())); + return ToPyObject(tensor); + } else { + auto sparse_csr_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + paddle::experimental::Tensor tensor(std::make_shared( + sparse_csr_tensor->non_zero_elements())); + return ToPyObject(tensor); + } + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_get_non_zero_crows(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + 
PADDLE_ENFORCE(self->tensor.is_sparse_csr_tensor(), + paddle::platform::errors::Fatal( + "this method is only effective for SparseCsrTensor")); + auto sparse_csr_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + paddle::experimental::Tensor tensor( + std::make_shared(sparse_csr_tensor->non_zero_crows())); + return ToPyObject(tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_get_non_zero_cols(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + PADDLE_ENFORCE(self->tensor.is_sparse_csr_tensor(), + paddle::platform::errors::Fatal( + "this method is only effective for SparseCsrTensor")); + auto sparse_csr_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + paddle::experimental::Tensor tensor( + std::make_shared(sparse_csr_tensor->non_zero_cols())); + return ToPyObject(tensor); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_is_sparse(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + return ToPyObject(self->tensor.is_sparse_coo_tensor() || + self->tensor.is_sparse_csr_tensor()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_is_sparse_coo(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + return ToPyObject(self->tensor.is_sparse_coo_tensor()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_method_is_sparse_csr(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + return ToPyObject(self->tensor.is_sparse_csr_tensor()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* tensor__inplace_version(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY @@ -775,6 +869,26 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"_set_grad_type", (PyCFunction)(void (*)(void))set_grad_type, METH_VARARGS | METH_KEYWORDS, NULL}, + /***the method of sparse tensor****/ + {"non_zero_indices", + (PyCFunction)(void (*)(void))tensor_method_get_non_zero_indices, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"non_zero_elements", + (PyCFunction)(void (*)(void))tensor_method_get_non_zero_elements, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"non_zero_crows", + (PyCFunction)(void (*)(void))tensor_method_get_non_zero_crows, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"non_zero_cols", + (PyCFunction)(void (*)(void))tensor_method_get_non_zero_cols, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"is_sparse", (PyCFunction)(void (*)(void))tensor_method_is_sparse, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"is_sparse_coo", (PyCFunction)(void (*)(void))tensor_method_is_sparse_coo, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"is_sparse_csr", (PyCFunction)(void (*)(void))tensor_method_is_sparse_csr, + METH_VARARGS | METH_KEYWORDS, NULL}, + /***the method of sparse tensor****/ {"_inplace_version", (PyCFunction)(void (*)(void))tensor__inplace_version, METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index eae8d12fb37..c58ebe69523 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -225,6 +225,22 @@ class PADDLE_API Tensor final { */ bool is_selected_rows() const; + /** + * @brief Determine whether tensor is SparseCooTensor + * + * @return true + * @return false + */ + bool is_sparse_coo_tensor() const; + + /** + * @brief Determine whether tensor is SparseCsrTensor + * + * @return true + * @return false + */ + bool is_sparse_csr_tensor() const; + /* Part 3: Device and Backend methods */ 
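Editor's note, not part of the patch: a worked example of what the non_zero_crows, non_zero_cols and non_zero_elements buffers exposed above hold for a small SparseCsrTensor; the concrete values below are mine.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Dense 3x4 matrix:
  //   [[1 0 0 2]
  //    [0 0 3 0]
  //    [0 4 0 0]]
  std::vector<int64_t> crows = {0, 2, 3, 4};  // row i spans [crows[i], crows[i+1])
  std::vector<int64_t> cols = {0, 3, 2, 1};   // column index of each stored value
  std::vector<float> values = {1.f, 2.f, 3.f, 4.f};  // non_zero_elements
  assert(crows.size() == 3 + 1);
  assert(cols.size() == values.size());
  // Row 1 holds one value: 3 at column 2.
  assert(values[crows[1]] == 3.f && cols[crows[1]] == 2);
  return 0;
}
```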
/** diff --git a/paddle/phi/api/lib/sparse_api_custom_impl.cc b/paddle/phi/api/lib/sparse_api_custom_impl.cc index 832c19361e5..8f8de02e49b 100644 --- a/paddle/phi/api/lib/sparse_api_custom_impl.cc +++ b/paddle/phi/api/lib/sparse_api_custom_impl.cc @@ -25,25 +25,24 @@ namespace paddle { namespace experimental { namespace sparse { -Tensor to_sparse_coo_impl(const Tensor& x, - Backend backend, - const int64_t sparse_dim) { +Tensor to_sparse_coo_impl(const Tensor& x, const int64_t sparse_dim) { if (x.layout() == phi::DataLayout::SPARSE_COO) { return x; } + // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); std::string kernel_name = "dense_to_sparse_coo"; if (x.layout() == phi::DataLayout::SPARSE_CSR) { kernel_name = "sparse_csr_to_coo"; } + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( kernel_name, kernel_key); - VLOG(6) << "to API kernel key: " << kernel_key; + VLOG(6) << "add API kernel key: " << kernel_key; VLOG(6) << "to API kernel: " << kernel; // 2. Get Device Context @@ -62,18 +61,18 @@ Tensor to_sparse_coo_impl(const Tensor& x, // 4. InferMeta auto indices_meta = - phi::DenseTensorMeta(phi::DataType::INT64, {-1}, phi::DataLayout::NCHW); - auto elements_meta = phi::DenseTensorMeta(x.dtype(), {-1}, x.layout()); + phi::DenseTensorMeta(phi::DataType::INT64, {1}, phi::DataLayout::NCHW); + auto elements_meta = phi::DenseTensorMeta(x.dtype(), {1}, x.layout()); // 5. Prepare outputs // create empty SparseCooTensor phi::DenseTensor non_zero_indices( phi::make_intrusive( - phi::TransToPhiPlace(backend)), + phi::TransToPhiPlace(kernel_key.backend())), std::move(indices_meta)); phi::DenseTensor non_zero_elements( phi::make_intrusive( - phi::TransToPhiPlace(backend)), + phi::TransToPhiPlace(kernel_key.backend())), std::move(elements_meta)); auto coo = std::make_shared( non_zero_indices, non_zero_elements, x.dims()); @@ -88,23 +87,23 @@ Tensor to_sparse_coo_impl(const Tensor& x, return out; } -Tensor to_sparse_csr_impl(const Tensor& x, Backend backend) { +Tensor to_sparse_csr_impl(const Tensor& x) { if (x.layout() == phi::DataLayout::SPARSE_CSR) { return x; } // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); std::string kernel_name = "dense_to_sparse_csr"; if (x.layout() == phi::DataLayout::SPARSE_COO) { kernel_name = "sparse_coo_to_csr"; } + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( kernel_name, kernel_key); - VLOG(6) << "to API kernel key: " << kernel_key; + VLOG(6) << "add API kernel key: " << kernel_key; VLOG(6) << "to API kernel: " << kernel; // 2. Get Device Context @@ -122,24 +121,24 @@ Tensor to_sparse_csr_impl(const Tensor& x, Backend backend) { // 4. 
InferMeta auto crows_meta = - phi::DenseTensorMeta(phi::DataType::INT64, {-1}, phi::DataLayout::NCHW); + phi::DenseTensorMeta(phi::DataType::INT64, {1}, phi::DataLayout::NCHW); auto cols_meta = - phi::DenseTensorMeta(phi::DataType::INT64, {-1}, phi::DataLayout::NCHW); - auto elements_meta = phi::DenseTensorMeta(x.dtype(), {-1}, x.layout()); + phi::DenseTensorMeta(phi::DataType::INT64, {1}, phi::DataLayout::NCHW); + auto elements_meta = phi::DenseTensorMeta(x.dtype(), {1}, x.layout()); // 5. Prepare outputs // create empty SparseCooTensor phi::DenseTensor non_zero_crows( phi::make_intrusive( - phi::TransToPhiPlace(backend)), + phi::TransToPhiPlace(kernel_key.backend())), std::move(crows_meta)); phi::DenseTensor non_zero_cols( phi::make_intrusive( - phi::TransToPhiPlace(backend)), + phi::TransToPhiPlace(kernel_key.backend())), std::move(cols_meta)); phi::DenseTensor non_zero_elements( phi::make_intrusive( - phi::TransToPhiPlace(backend)), + phi::TransToPhiPlace(kernel_key.backend())), std::move(elements_meta)); auto csr = std::make_shared( non_zero_crows, non_zero_cols, non_zero_elements, x.dims()); @@ -154,24 +153,25 @@ Tensor to_sparse_csr_impl(const Tensor& x, Backend backend) { return out; } -Tensor to_dense_impl(const Tensor& x, Backend backend) { +Tensor to_dense_impl(const Tensor& x) { if (x.layout() != phi::DataLayout::SPARSE_CSR && x.layout() != phi::DataLayout::SPARSE_COO) { return x; } + // 1. Get kernel signature and kernel - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); std::string kernel_name = "sparse_coo_to_dense"; if (x.layout() == phi::DataLayout::SPARSE_CSR) { kernel_name = "sparse_csr_to_dense"; } + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( kernel_name, kernel_key); - VLOG(6) << "to API kernel key: " << kernel_key; + VLOG(6) << "add API kernel key: " << kernel_key; VLOG(6) << "to API kernel: " << kernel; // 2. Get Device Context @@ -194,7 +194,7 @@ Tensor to_dense_impl(const Tensor& x, Backend backend) { // create empty SparseCooTensor auto dense_out = std::make_shared( phi::make_intrusive( - phi::TransToPhiPlace(backend)), + phi::TransToPhiPlace(kernel_key.backend())), std::move(dense_meta)); kernel_context.EmplaceBackOutput(dense_out.get()); diff --git a/paddle/phi/api/lib/sparse_api_custom_impl.h b/paddle/phi/api/lib/sparse_api_custom_impl.h index 293b2cfa3d3..6053d281f0f 100644 --- a/paddle/phi/api/lib/sparse_api_custom_impl.h +++ b/paddle/phi/api/lib/sparse_api_custom_impl.h @@ -21,13 +21,11 @@ namespace paddle { namespace experimental { namespace sparse { -Tensor to_dense_impl(const Tensor& x, Backend backend); +Tensor to_dense_impl(const Tensor& x); -Tensor to_sparse_coo_impl(const Tensor& x, - Backend backend, - const int64_t sparse_dim); +Tensor to_sparse_coo_impl(const Tensor& x, const int64_t sparse_dim); -Tensor to_sparse_csr_impl(const Tensor& x, Backend backend); +Tensor to_sparse_csr_impl(const Tensor& x); } // namespace sparse } // namespace experimental diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 6090e6a400a..066287d4244 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -25,6 +25,8 @@ limitations under the License. 
*/ #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/selected_rows.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_utils.h" @@ -132,6 +134,12 @@ bool Tensor::is_dense_tensor() const { bool Tensor::is_selected_rows() const { return phi::SelectedRows::classof(impl_.get()); } +bool Tensor::is_sparse_coo_tensor() const { + return phi::SparseCooTensor::classof(impl_.get()); +} +bool Tensor::is_sparse_csr_tensor() const { + return phi::SparseCsrTensor::classof(impl_.get()); +} /* Part 3: Device and Backend methods */ PlaceType Tensor::place() const { diff --git a/paddle/phi/tests/api/test_sparse_utils_api.cc b/paddle/phi/tests/api/test_sparse_utils_api.cc index 8595782be35..da66334ced7 100644 --- a/paddle/phi/tests/api/test_sparse_utils_api.cc +++ b/paddle/phi/tests/api/test_sparse_utils_api.cc @@ -53,8 +53,7 @@ TEST(API, to_sparse_coo) { // 1. test dense_to_sparse_coo paddle::experimental::Tensor x(dense_x); - auto out = paddle::experimental::sparse::to_sparse_coo( - x, phi::Backend::CPU, sparse_dim); + auto out = paddle::experimental::sparse::to_sparse_coo(x, sparse_dim); auto coo = std::dynamic_pointer_cast(out.impl()); ASSERT_EQ(coo->nnz(), non_zero_num); int cmp_indices = memcmp(coo->non_zero_indices().data(), @@ -91,8 +90,7 @@ TEST(API, to_sparse_coo) { auto csr = std::make_shared(crows, cols, values, dense_dims); paddle::experimental::Tensor csr_x(csr); - auto out2 = paddle::experimental::sparse::to_sparse_coo( - csr_x, phi::Backend::CPU, sparse_dim); + auto out2 = paddle::experimental::sparse::to_sparse_coo(csr_x, sparse_dim); auto coo2 = std::dynamic_pointer_cast(out.impl()); ASSERT_EQ(coo2->nnz(), non_zero_num); @@ -132,7 +130,7 @@ TEST(API, to_sparse_csr) { // 1. 
test dense_to_sparse_csr paddle::experimental::Tensor x(dense_x); - auto out = paddle::experimental::sparse::to_sparse_csr(x, phi::Backend::CPU); + auto out = paddle::experimental::sparse::to_sparse_csr(x); auto csr = std::dynamic_pointer_cast(out.impl()); auto check = [&](const phi::SparseCsrTensor& csr) { ASSERT_EQ(csr.non_zero_cols().numel(), non_zero_num); @@ -170,8 +168,7 @@ TEST(API, to_sparse_csr) { auto coo = std::make_shared(indices, values, dense_dims); paddle::experimental::Tensor coo_x(coo); - auto out2 = - paddle::experimental::sparse::to_sparse_csr(coo_x, phi::Backend::CPU); + auto out2 = paddle::experimental::sparse::to_sparse_csr(coo_x); auto csr2 = std::dynamic_pointer_cast(out.impl()); check(*csr2); @@ -212,7 +209,7 @@ TEST(API, to_dense) { std::make_shared(indices, values, dense_dims); paddle::experimental::Tensor coo_x(coo); - auto out = paddle::experimental::sparse::to_dense(coo_x, phi::Backend::CPU); + auto out = paddle::experimental::sparse::to_dense(coo_x); auto dense_out = std::dynamic_pointer_cast(out.impl()); int cmp1 = memcmp(dense_out->data(), &dense_data[0][0], 9 * sizeof(float)); @@ -237,7 +234,7 @@ TEST(API, to_dense) { auto csr = std::make_shared(crows, cols, values, dense_dims); paddle::experimental::Tensor csr_x(csr); - auto out2 = paddle::experimental::sparse::to_dense(csr_x, phi::Backend::CPU); + auto out2 = paddle::experimental::sparse::to_dense(csr_x); auto dense_out2 = std::dynamic_pointer_cast(out.impl()); int cmp2 = diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py new file mode 100644 index 00000000000..8284771920e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -0,0 +1,60 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
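The unit test that follows compares the converted components against hard-coded arrays. As an aside for readers of that test, the CSR triple for its 3x4 input can be reproduced with a short numpy reference; the helper below (dense_to_csr_reference is an illustrative name, not part of Paddle) only demonstrates the expected compressed-row layout and is not the kernel implementation:

    import numpy as np

    def dense_to_csr_reference(x):
        # Row-major reference layout for a 2-D matrix: compressed row
        # pointers, column indices and the stored non-zero values.
        rows, cols = np.nonzero(x)
        crows = np.zeros(x.shape[0] + 1, dtype='int64')
        for r in rows:
            crows[r + 1] += 1
        return np.cumsum(crows), cols.astype('int64'), x[rows, cols]

    x = np.array([[0, 1, 0, 2],
                  [0, 0, 3, 0],
                  [4, 5, 0, 0]], dtype='float32')
    crows, cols, values = dense_to_csr_reference(x)
    print(crows)   # [0 2 3 5]
    print(cols)    # [1 3 2 0 1]
    print(values)  # [1. 2. 3. 4. 5.]

The COO expectations in the same test fall out of np.nonzero directly: row-major indices [[0 0 1 2 2], [1 3 2 0 1]] and values [1 2 3 4 5].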
+ +from __future__ import print_function +import unittest +import numpy as np +import paddle +from paddle import _C_ops +from paddle.fluid.framework import _test_eager_guard + + +class TestSparseUtils(unittest.TestCase): + def test_to_sparse_coo(self): + with _test_eager_guard(): + x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]] + non_zero_indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] + non_zero_elements = [1, 2, 3, 4, 5] + dense_x = paddle.to_tensor(x) + #TODO(zhangkaihuo): change to test the corresponding API + out = _C_ops.final_state_to_sparse_coo(dense_x, 2) + print(out) + assert np.array_equal(out.non_zero_indices().numpy(), + non_zero_indices) + assert np.array_equal(out.non_zero_elements().numpy(), + non_zero_elements) + + dense_tensor = _C_ops.final_state_to_dense(out) + assert np.array_equal(dense_tensor.numpy(), x) + + def test_to_sparse_csr(self): + with _test_eager_guard(): + x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]] + non_zero_crows = [0, 2, 3, 5] + non_zero_cols = [1, 3, 2, 0, 1] + non_zero_elements = [1, 2, 3, 4, 5] + dense_x = paddle.to_tensor(x) + out = _C_ops.final_state_to_sparse_csr(dense_x) + print(out) + assert np.array_equal(out.non_zero_crows().numpy(), non_zero_crows) + assert np.array_equal(out.non_zero_cols().numpy(), non_zero_cols) + assert np.array_equal(out.non_zero_elements().numpy(), + non_zero_elements) + + dense_tensor = _C_ops.final_state_to_dense(out) + assert np.array_equal(dense_tensor.numpy(), x) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py index 85672ec7a36..f164bbc466f 100644 --- a/python/paddle/tensor/to_string.py +++ b/python/paddle/tensor/to_string.py @@ -263,14 +263,7 @@ def to_string(var, prefix='Tensor'): data=data) -def tensor_to_string(tensor, prefix='Tensor'): - indent = len(prefix) + 1 - - _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient},\n{indent}{data})" - - if not tensor._is_initialized(): - return "Tensor(Not initialized)" - +def _format_dense_tensor(tensor, indent): np_tensor = tensor.numpy() if len(tensor.shape) == 0: @@ -288,6 +281,26 @@ def tensor_to_string(tensor, prefix='Tensor'): data = _format_tensor( np_tensor, sumary, indent=indent, max_width=max_width, signed=signed) + return data + + +def sparse_tensor_to_string(tensor, prefix='Tensor'): + indent = len(prefix) + 1 + _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient}, \n{indent}{data})" + if tensor.is_sparse_coo(): + indices_tensor = tensor.non_zero_indices() + elements_tensor = tensor.non_zero_elements() + indices_data = _format_dense_tensor(indices_tensor, indent) + elements_data = _format_dense_tensor(elements_tensor, indent) + data = 'non_zero_indices=' + indices_data + ',\nnon_zero_elements=' + elements_data + else: + crows_tensor = tensor.non_zero_crows() + cols_tensor = tensor.non_zero_cols() + elements_tensor = tensor.non_zero_elements() + crows_data = _format_dense_tensor(crows_tensor, indent) + cols_data = _format_dense_tensor(cols_tensor, indent) + elements_data = _format_dense_tensor(elements_tensor, indent) + data = 'non_zero_crows=' + crows_data + ',\nnon_zero_cols=' + cols_data + ',\nnon_zero_elements=' + elements_data return _template.format( prefix=prefix, @@ -297,3 +310,25 @@ def tensor_to_string(tensor, prefix='Tensor'): stop_gradient=tensor.stop_gradient, indent=' ' * indent, data=data) + + +def tensor_to_string(tensor, prefix='Tensor'): + indent = len(prefix) + 1 + + 
_template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient},\n{indent}{data})" + + if not tensor._is_initialized(): + return "Tensor(Not initialized)" + + if tensor.is_sparse(): + return sparse_tensor_to_string(tensor, prefix) + else: + data = _format_dense_tensor(tensor, indent) + return _template.format( + prefix=prefix, + shape=tensor.shape, + dtype=tensor.dtype, + place=tensor._place_str, + stop_gradient=tensor.stop_gradient, + indent=' ' * indent, + data=data) diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index 9c859022e8a..2d1fe78b559 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -4,18 +4,19 @@ kernel : func : sparse_conv3d layout : x + backward : conv3d_grad - api : to_dense - args : (Tensor x, Backend backend) + args : (Tensor x) output : Tensor(out@DenseTensor) - invoke : to_dense_impl(x, backend) + invoke : to_dense_impl(x) - api : to_sparse_coo - args : (Tensor x, Backend backend, int64 sparse_dim) + args : (Tensor x, int64 sparse_dim) output : Tensor(out@SparseCooTensor) - invoke : to_sparse_coo_impl(x, backend, sparse_dim) + invoke : to_sparse_coo_impl(x, sparse_dim) - api : to_sparse_csr - args : (Tensor x, Backend backend) + args : (Tensor x) output : Tensor(out@SparseCsrTensor) - invoke : to_sparse_csr_impl(x, backend) + invoke : to_sparse_csr_impl(x) diff --git a/python/paddle/utils/code_gen/sparse_api_gen.py b/python/paddle/utils/code_gen/sparse_api_gen.py index dd22e16dc64..b4fc7638622 100644 --- a/python/paddle/utils/code_gen/sparse_api_gen.py +++ b/python/paddle/utils/code_gen/sparse_api_gen.py @@ -192,9 +192,7 @@ def source_include(header_file_path): def api_register(): - return """ -PD_REGISTER_API(Test); -""" + return "" def api_namespace(): diff --git a/python/paddle/utils/code_gen/sparse_bw_api_gen.py b/python/paddle/utils/code_gen/sparse_bw_api_gen.py index 561e198a41b..5dac7c8c483 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api_gen.py +++ b/python/paddle/utils/code_gen/sparse_bw_api_gen.py @@ -115,9 +115,7 @@ def source_include(header_file_path): def api_register(): - return """ -PD_REGISTER_API(Test); -""" + return "" def api_namespace(): -- GitLab
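Taken together with the sparse_api.yaml change above, the user-facing conversion APIs no longer take a Backend argument: the kernel and the output place are derived from the input tensor, and printing now routes through sparse_tensor_to_string. A short end-to-end sketch of the resulting behaviour, again assuming eager mode and the final-state op names used by the new unit test (illustrative only):

    import numpy as np
    import paddle
    from paddle import _C_ops
    from paddle.fluid.framework import _test_eager_guard

    with _test_eager_guard():
        x = np.array([[0., 1., 0.], [2., 0., 3.]], dtype='float32')
        dense = paddle.to_tensor(x)

        # No Backend argument: the conversion follows the place of `dense`.
        coo = _C_ops.final_state_to_sparse_coo(dense, 2)

        # tensor_to_string detects is_sparse() and delegates to
        # sparse_tensor_to_string, so the repr shows non_zero_indices and
        # non_zero_elements rather than a dense data block.
        print(coo)

        # The round trip back to a dense tensor is lossless.
        assert np.array_equal(_C_ops.final_state_to_dense(coo).numpy(), x)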