From b4f67757ff40f8dfc5dbcf5c55df2771d3f58d82 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 16 Aug 2022 05:43:37 -0500 Subject: [PATCH] [Phi] Move amp ops into phi (#45079) * move check finite and unscale kernel into phi * move infershape into phi * move update_loss_scaling kernel into phi * remove original kernels * move update loss scaling infershape into phi * add header for xpu and npu * solve coverage failed * fix npu test failed * remove mutable data in cu file * fix new executor failed * add valid check for meta tensor output --- paddle/fluid/framework/infershape_utils.cc | 22 ++ .../framework/new_executor/data_transfer.cc | 17 +- paddle/fluid/framework/operator.cc | 4 + paddle/fluid/imperative/prepared_operator.h | 4 + .../amp/check_finite_and_unscale_op.cc | 73 +--- .../amp/check_finite_and_unscale_op.cu | 172 --------- .../amp/check_finite_and_unscale_op.h | 32 -- .../amp/check_finite_and_unscale_op_mlu.cc | 2 +- .../amp/check_finite_and_unscale_op_npu.cc | 2 +- .../check_finite_and_unscale_op_npu_test.cc | 2 +- .../amp/check_finite_and_unscale_op_xpu.cc | 2 +- .../operators/amp/update_loss_scaling_op.cc | 117 +----- .../operators/amp/update_loss_scaling_op.cu | 210 ---------- .../operators/amp/update_loss_scaling_op.h | 214 ----------- .../amp/update_loss_scaling_op_npu.cc | 31 +- .../amp/update_loss_scaling_op_xpu.cc | 3 +- paddle/phi/core/meta_tensor.cc | 62 ++- paddle/phi/infermeta/multiary.cc | 241 +++++++----- paddle/phi/infermeta/multiary.h | 59 +-- paddle/phi/kernels/amp_kernel.h | 48 +++ paddle/phi/kernels/cpu/amp_kernel.cc | 115 ++++++ paddle/phi/kernels/gpu/amp_kernel.cu | 358 ++++++++++++++++++ paddle/phi/kernels/impl/amp_kernel_impl.h | 183 +++++++++ .../phi/ops/compat/update_loss_scaling_sig.cc | 47 +++ .../unittests/test_update_loss_scaling_op.py | 4 +- 25 files changed, 1053 insertions(+), 971 deletions(-) delete mode 100644 paddle/fluid/operators/amp/check_finite_and_unscale_op.cu delete mode 100644 paddle/fluid/operators/amp/check_finite_and_unscale_op.h delete mode 100644 paddle/fluid/operators/amp/update_loss_scaling_op.cu delete mode 100644 paddle/fluid/operators/amp/update_loss_scaling_op.h create mode 100644 paddle/phi/kernels/amp_kernel.h create mode 100644 paddle/phi/kernels/cpu/amp_kernel.cc create mode 100644 paddle/phi/kernels/gpu/amp_kernel.cu create mode 100644 paddle/phi/kernels/impl/amp_kernel_impl.h create mode 100644 paddle/phi/ops/compat/update_loss_scaling_sig.cc diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index eb988d59a2a..debe43fab82 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -127,7 +127,15 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { const InferShapeContext& ctx_; }; +static inline void ValidCheck(const phi::MetaTensor& meta_tensor) { + PADDLE_ENFORCE_EQ(meta_tensor.initialized(), + true, + phi::errors::InvalidArgument( + "The current CompatMetaTensor is not initialized.")); +} + int64_t CompatMetaTensor::numel() const { + ValidCheck(*this); if (is_runtime_) { auto* var = PADDLE_GET_CONST(Variable*, var_); return var->Get().numel(); @@ -138,6 +146,7 @@ int64_t CompatMetaTensor::numel() const { } DDim CompatMetaTensor::dims() const { + ValidCheck(*this); if (is_runtime_) { auto* var = PADDLE_GET_CONST(Variable*, var_); if (var->IsType()) { @@ -162,6 +171,7 @@ DDim CompatMetaTensor::dims() const { } phi::DataType CompatMetaTensor::dtype() const { + ValidCheck(*this); if 
(is_runtime_) { auto* var = PADDLE_GET_CONST(Variable*, var_); if (var->IsType()) { @@ -183,6 +193,7 @@ phi::DataType CompatMetaTensor::dtype() const { } DataLayout CompatMetaTensor::layout() const { + ValidCheck(*this); if (is_runtime_) { auto* var = PADDLE_GET_CONST(Variable*, var_); if (var->IsType()) { @@ -206,6 +217,7 @@ DataLayout CompatMetaTensor::layout() const { } void CompatMetaTensor::set_dims(const DDim& dims) { + ValidCheck(*this); if (is_runtime_) { auto* var = PADDLE_GET(Variable*, var_); if (var->IsType()) { @@ -236,6 +248,7 @@ void CompatMetaTensor::set_dims(const DDim& dims) { } void CompatMetaTensor::set_dtype(phi::DataType dtype) { + ValidCheck(*this); if (is_runtime_) { auto* var = PADDLE_GET(Variable*, var_); if (var->IsType()) { @@ -258,6 +271,7 @@ void CompatMetaTensor::set_dtype(phi::DataType dtype) { } void CompatMetaTensor::set_layout(DataLayout layout) { + ValidCheck(*this); if (is_runtime_) { auto* var = PADDLE_GET(Variable*, var_); if (var->IsType()) { @@ -281,6 +295,8 @@ void CompatMetaTensor::set_layout(DataLayout layout) { } void CompatMetaTensor::share_lod(const MetaTensor& meta_tensor) { + ValidCheck(*this); + ValidCheck(meta_tensor); if (is_runtime_) { auto* var = PADDLE_GET(Variable*, var_); if (var->IsType()) { @@ -299,6 +315,8 @@ void CompatMetaTensor::share_lod(const MetaTensor& meta_tensor) { } void CompatMetaTensor::share_dims(const MetaTensor& meta_tensor) { + ValidCheck(*this); + ValidCheck(meta_tensor); set_dims(meta_tensor.dims()); if (is_runtime_) { auto* var = PADDLE_GET(Variable*, var_); @@ -472,6 +490,10 @@ CompatInferMetaContext BuildInferMetaContext(InferShapeContext* ctx, infer_meta_context.EmplaceBackAttr( phi::Scalar(PADDLE_GET_CONST(std::string, attr))); break; + case framework::proto::AttrType::BOOLEAN: + infer_meta_context.EmplaceBackAttr( + phi::Scalar(PADDLE_GET_CONST(bool, attr))); + break; default: PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` to Scalar when construct " diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index 33e50b249ad..32277ed54bb 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -135,21 +135,14 @@ void DataTranferHelper::RunAndConstructOpFuncNode( bool run_phi_kernel = false; // check if phi kernel exists - auto phi_kernel_map = - phi::KernelFactory::Instance().SelectKernelMap(op_with_kernel->Type()); - if (phi_kernel_map.size() > 0) { + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel( + op_with_kernel->Type())) { auto phi_kernel_key = op_with_kernel->ChoosePhiKernel(exec_ctx); VLOG(6) << "phi_kernel_key " << phi_kernel_key << "\n"; - // this function is used to construct data transfer op - // we expect that it always has a valid phi kernel - // so no need to fallback to cpu kernel - PADDLE_ENFORCE_EQ( - op_with_kernel->PhiKernel()->IsValid(), - true, - platform::errors::PreconditionNotMet( - "the %s op has no valid phi kernel.", op_with_kernel->Type())); - run_phi_kernel = true; + if (op_with_kernel->PhiKernel()->IsValid()) { + run_phi_kernel = true; + } } // 3. 
Execute transfer op and construct OpFuncNode diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b4ef3efb821..4fb7b0e018d 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2752,6 +2752,10 @@ void OperatorWithKernel::BuildPhiKernelContext( phi_kernel_context->EmplaceBackAttr(std::move(phi::Scalar( PADDLE_GET_CONST(std::string, attr_iter->second)))); break; + case proto::AttrType::BOOLEAN: + phi_kernel_context->EmplaceBackAttr(std::move( + phi::Scalar(PADDLE_GET_CONST(bool, attr_iter->second)))); + break; default: PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` to Scalar when construct " diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index c2b23f31d1d..d043b4a5aad 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -420,6 +420,10 @@ void BuildDygraphPhiKernelContext(const phi::KernelSignature& kernel_signature, kernel_ctx->EmplaceBackAttr( std::move(phi::Scalar(PADDLE_GET_CONST(std::string, attr)))); break; + case framework::proto::AttrType::BOOLEAN: + kernel_ctx->EmplaceBackAttr( + std::move(phi::Scalar(PADDLE_GET_CONST(bool, attr)))); + break; default: PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` to Scalar when construct " diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc index 8fc582c1984..3404209063e 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -25,23 +28,6 @@ class CheckFiniteAndUnscaleOp : public framework::OperatorWithKernel { const framework::AttributeMap& attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext* ctx) const override { - if (ctx->HasInputs("X") || ctx->HasOutputs("Out")) { - PADDLE_ENFORCE_EQ( - ctx->Inputs("X").size(), - ctx->Outputs("Out").size(), - platform::errors::InvalidArgument( - "The input(X) and output(Out) should have same size in " - "Operator(check_finite_and_unscale), size of input(X) is %d " - "and size of output(Out) is %d.", - ctx->Inputs("X").size(), - ctx->Outputs("Out").size())); - auto x_dims = ctx->GetInputsDim("X"); - ctx->SetOutputsDim("Out", x_dims); - } - ctx->SetOutputDim("FoundInfinite", {1}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -91,59 +77,18 @@ Otherwise, FoundInfinite will be 0 (False). 
} }; -template -class CheckFiniteAndUnscaleCpuKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - const auto xs = ctx.MultiInput("X"); - const auto* scale = ctx.Input("Scale"); - auto outs = ctx.MultiOutput("Out"); - auto* found_inf = ctx.Output("FoundInfinite"); - - const T* scale_data = scale->data(); - bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); - - *found_inf_data = false; - framework::Tensor is_finite = - ctx.AllocateTmpTensor({1}, dev_ctx); - bool* is_finite_data = is_finite.template data(); - - auto& dev = *ctx.template device_context().eigen_device(); - - T inverse_scale = Inverse(*scale_data); - for (size_t i = 0; i < xs.size(); ++i) { - const auto* x = xs[i]; - auto* out = outs[i]; - out->mutable_data(dev_ctx.GetPlace()); - if (!(*found_inf_data)) { - framework::TensorIsfinite(*x, &is_finite); - *found_inf_data = !(*is_finite_data); - } - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*x); - if (!(*found_inf_data)) { - eigen_out.device(dev) = eigen_in * inverse_scale; - } else { - eigen_out.device(dev) = eigen_in * static_cast(0); - } - } - return; - } -}; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(check_finite_and_unscale, + CheckFiniteAndUnscaleInferShapeFunctor, + PD_INFER_META(phi::CheckFiniteAndUnscaleInferMeta)); REGISTER_OPERATOR( check_finite_and_unscale, ops::CheckFiniteAndUnscaleOp, ops::CheckFiniteAndUnscaleOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(check_finite_and_unscale, - ops::CheckFiniteAndUnscaleCpuKernel, - ops::CheckFiniteAndUnscaleCpuKernel); + paddle::framework::EmptyGradOpMaker, + CheckFiniteAndUnscaleInferShapeFunctor); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu deleted file mode 100644 index 35b667825af..00000000000 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ /dev/null @@ -1,172 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -__global__ void InverseAndMemset(const T* s, T* o, bool* found_inf) { - *o = Inverse(*s); - *found_inf = false; -} - -template -__global__ void CheckFiniteAndUnscale(const T** xs, - const MT* scale, - int64_t size, - int64_t* starts, - bool* found_inf, - T** outs) { - const int64_t tid = threadIdx.x + blockIdx.x * blockDim.x; - - // copy starts array from global memory to shared memory - extern __shared__ int64_t s_starts[]; - for (int i = threadIdx.x; i <= size; i += blockDim.x) { - s_starts[i] = starts[i]; - } - __syncthreads(); - - const int64_t num = s_starts[size]; - int xs_index = 0; - bool local_found_inf = false; - const MT local_scale = *scale; - for (int64_t idx = tid; idx < num; idx += gridDim.x * blockDim.x) { - // get the "out" index of "id" - // For example: - // idx = 15, starts = [0, 10, 10, 20, 30] - // because 10 <= idx < 20 ==> - // the idx element locate in the 3rd tensor (notice the 2nd tensor size is - // 0) - int next_xs_index = xs_index; - while (idx >= s_starts[next_xs_index]) next_xs_index++; - xs_index = next_xs_index - 1; - - // get in data and out data - const T* in = xs[xs_index]; - T* out = outs[xs_index]; - int64_t in_idx = idx - s_starts[xs_index]; - - // Unscale - MT val = static_cast(in[in_idx]) * local_scale; - T narrow_val = static_cast(val); - out[in_idx] = narrow_val; - - // CheckFinite - if (!isfinite(narrow_val)) { - local_found_inf = true; - } - } - if (local_found_inf) { - *found_inf = true; - } -} - -template -class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - const auto xs = ctx.MultiInput("X"); - const auto* scale = ctx.Input("Scale"); - auto outs = ctx.MultiOutput("Out"); - auto* found_inf = ctx.Output("FoundInfinite"); - - const MPDType* scale_data = scale->data(); - bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); - - framework::Tensor inverse_scale = - ctx.AllocateTmpTensor({1}, dev_ctx); - MPDType* inverse_scale_v = inverse_scale.template data(); - - InverseAndMemset<<<1, 1, 0, dev_ctx.stream()>>>( - scale_data, inverse_scale_v, found_inf_data); - - size_t xs_size = xs.size(); - if (xs_size == 0) return; - - const auto& cpu_place = platform::CPUPlace(); - // calculate each tensor's start index and copy to device - auto h_starts_tensor = - memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); - int64_t* h_starts = reinterpret_cast(h_starts_tensor->ptr()); - - auto d_starts_tensor = - memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); - int64_t* d_starts = reinterpret_cast(d_starts_tensor->ptr()); - - // the start index value of each tensor is - // the sum of previous tensor's size. 
For example: - // xs = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30] - h_starts[0] = 0; - for (int i = 1; i <= xs_size; i++) { - h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); - } - int64_t total_num = h_starts[xs_size]; - memory::Copy(dev_ctx.GetPlace(), - d_starts, - cpu_place, - h_starts, - (xs_size + 1) * sizeof(int64_t), - dev_ctx.stream()); - - // copy each tensor's data address to device - auto h_mem = memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*)); - const T** h_xs = reinterpret_cast(h_mem->ptr()); - T** h_outs = reinterpret_cast(h_mem->ptr()) + xs_size; - - auto d_mem = memory::Alloc(dev_ctx, 2 * xs_size * sizeof(T*)); - const T** d_xs = reinterpret_cast(d_mem->ptr()); - T** d_outs = reinterpret_cast(d_mem->ptr()) + xs_size; - - for (size_t i = 0; i < xs_size; ++i) { - h_xs[i] = xs[i]->data(); - h_outs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); - } - memory::Copy(dev_ctx.GetPlace(), - d_xs, - cpu_place, - h_xs, - 2 * xs_size * sizeof(T*), - dev_ctx.stream()); - - // Launch Kernel - int threads_per_block = std::min(static_cast(1024), total_num); - int elements_per_block = - threads_per_block * 20; // each thread deal with 20 number - int blocks_per_grid = - (total_num + elements_per_block - 1) / elements_per_block; - VLOG(3) << "launch kernel"; - CheckFiniteAndUnscale<<>>( - d_xs, inverse_scale_v, xs_size, d_starts, found_inf_data, d_outs); - VLOG(3) << "finish kernel"; - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(check_finite_and_unscale, - ops::CheckFiniteAndUnscaleGpuKernel, - ops::CheckFiniteAndUnscaleGpuKernel, - ops::CheckFiniteAndUnscaleGpuKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.h b/paddle/fluid/operators/amp/check_finite_and_unscale_op.h deleted file mode 100644 index 31647f71038..00000000000 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.h +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "paddle/fluid/operators/isfinite_op.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -template -inline HOSTDEVICE T Inverse(T s) { - return 1.0 / s; -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc index 7659d67400b..41ba11ac046 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc index 24d55ef103c..98768afa936 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc index ba7b7d900f5..cc60476c269 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -33,7 +33,7 @@ namespace p = paddle::platform; using Tensor = paddle::framework::Tensor; -USE_OP(check_finite_and_unscale); +USE_OP_ITSELF(check_finite_and_unscale); USE_OP_DEVICE_KERNEL(check_finite_and_unscale, NPU); struct InputVars { diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc index d904e8f7936..7d46b4c7f76 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc index 3bae775d308..c8dc8217ef2 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" - #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -27,55 +28,6 @@ class UpdateLossScalingOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("FoundInfinite"), - "Input", - "FoundInfinite", - "update_loss_scaling"); - OP_INOUT_CHECK(ctx->HasInput("PrevLossScaling"), - "Input", - "PrevLossScaling", - "update_loss_scaling"); - OP_INOUT_CHECK(ctx->HasInput("InGoodSteps"), - "Input", - "InGoodSteps", - "update_loss_scaling"); - OP_INOUT_CHECK(ctx->HasInput("InBadSteps"), - "Input", - "InBadSteps", - "update_loss_scaling"); - OP_INOUT_CHECK(ctx->HasOutput("LossScaling"), - "Output", - "LossScaling", - "update_loss_scaling"); - OP_INOUT_CHECK(ctx->HasOutput("OutGoodSteps"), - "Output", - "OutGoodSteps", - "update_loss_scaling"); - OP_INOUT_CHECK(ctx->HasOutput("OutBadSteps"), - "Output", - "OutBadSteps", - "update_loss_scaling"); - - if (ctx->HasInputs("X") || ctx->HasOutputs("Out")) { - PADDLE_ENFORCE_EQ( - ctx->Inputs("X").size(), - ctx->Outputs("Out").size(), - platform::errors::InvalidArgument( - "The input(X) and output(Out) should have same size in " - "Operator(update_loss_scaling), size of input(X) is %d " - "and size of output(Out) is %d.", - ctx->Inputs("X").size(), - ctx->Outputs("Out").size())); - auto x_dims = ctx->GetInputsDim("X"); - ctx->SetOutputsDim("Out", x_dims); - } - - ctx->SetOutputDim("LossScaling", {1}); - ctx->SetOutputDim("OutGoodSteps", {1}); - ctx->SetOutputDim("OutBadSteps", {1}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -168,72 +120,19 @@ decr_every_n_nan_or_inf steps and each step some gradients are infinite. } }; -template -class UpdateLossScalingFunctor { - public: - void operator()(const phi::CPUContext& ctx, - const bool* found_inf_data, - const T* pre_loss_scaling_data, - const int* good_in_data, - const int* bad_in_data, - const int incr_every_n_steps, - const int decr_every_n_nan_or_inf, - const float incr_ratio, - const float decr_ratio, - T* updated_loss_scaling_data, - int* good_out_data, - int* bad_out_data) const { - PADDLE_ENFORCE_EQ( - IsFoundInfOnCPU, - true, - platform::errors::InvalidArgument( - "The Input(FoundInfinite) should be on the CPUPlace.")); - Update(found_inf_data, - pre_loss_scaling_data, - good_in_data, - bad_in_data, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - updated_loss_scaling_data, - good_out_data, - bad_out_data); - } -}; - -template -class LazyZeros { - public: - void operator()(const phi::CPUContext& dev_ctx, - const bool* found_inf_data, - const std::vector& xs, - const std::vector& outs) const { - for (size_t i = 0; i < xs.size(); ++i) { - auto* out = outs[i]; - T* out_data = out->mutable_data(dev_ctx.GetPlace()); - int num = out->numel(); - if (*found_inf_data) { - VLOG(1) << "-- UpdateLossScaling: Find infinite grads. 
--"; - std::memset(out_data, 0, num * sizeof(T)); - } - } - } -}; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; using CPU = phi::CPUContext; +DECLARE_INFER_SHAPE_FUNCTOR(update_loss_scaling, + UpdateLossScalingInferShapeFunctor, + PD_INFER_META(phi::UpdateLossScalingInferMeta)); REGISTER_OPERATOR( update_loss_scaling, ops::UpdateLossScalingOp, ops::UpdateLossScalingOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(update_loss_scaling, - ops::UpdateLossScalingKernel, - ops::UpdateLossScalingKernel); + paddle::framework::EmptyGradOpMaker, + UpdateLossScalingInferShapeFunctor); diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu deleted file mode 100644 index 4c927066892..00000000000 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu +++ /dev/null @@ -1,210 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -__global__ void GpuUpdateLossScaling(const FoundNanInfFlagT found_inf_data, - const T* pre_loss_scaling_data, - const int* good_in_data, - const int* bad_in_data, - const int incr_every_n_steps, - const int decr_every_n_nan_or_inf, - const float incr_ratio, - const float decr_ratio, - T* updated_loss_scaling_data, - int* good_out_data, - int* bad_out_data) { - Update(found_inf_data, - pre_loss_scaling_data, - good_in_data, - bad_in_data, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - updated_loss_scaling_data, - good_out_data, - bad_out_data); -} - -template -__global__ void FusedFillIf(T** outs, - const size_t xs_size, - const int64_t* starts, - const T value, - const bool* has_inf) { - if (!(*has_inf)) return; - - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - - // copy starts array from global memory to shared memory - extern __shared__ int64_t s_starts[]; - for (int i = threadIdx.x; i <= xs_size; i += blockDim.x) { - s_starts[i] = starts[i]; - } - __syncthreads(); - - const int64_t total_num = s_starts[xs_size]; - int out_index = 0; - - for (int64_t id = tid; id < total_num; id += blockDim.x * gridDim.x) { - // get the "out" index of "id" - // For example: - // id = 15, starts = [0, 10, 10, 20, 30] - // because 10 <= id < 20 ==> - // the id element locate in the 3rd tensor (notice the 2nd tensor size is 0) - int next_out_index = out_index; - while (id >= s_starts[next_out_index]) next_out_index++; - out_index = next_out_index - 1; - - // get data pointer and index - T* out_data = outs[out_index]; - int64_t idx = id - s_starts[out_index]; - - // set value - out_data[idx] = value; - } -} - -template -class UpdateLossScalingFunctor { - public: - void 
operator()(const phi::GPUContext& dev_ctx, - const bool* found_inf_data, - const T* pre_loss_scaling_data, - const int* good_in_data, - const int* bad_in_data, - const int incr_every_n_steps, - const int decr_every_n_nan_or_inf, - const float incr_ratio, - const float decr_ratio, - T* updated_loss_scaling_data, - int* good_out_data, - int* bad_out_data) const { - if (IsFoundInfOnCPU) { - GpuUpdateLossScaling - <<<1, 1, 0, dev_ctx.stream()>>>(*found_inf_data, - pre_loss_scaling_data, - good_in_data, - bad_in_data, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - updated_loss_scaling_data, - good_out_data, - bad_out_data); - } else { - GpuUpdateLossScaling - <<<1, 1, 0, dev_ctx.stream()>>>(found_inf_data, - pre_loss_scaling_data, - good_in_data, - bad_in_data, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - updated_loss_scaling_data, - good_out_data, - bad_out_data); - } - } -}; - -template -class LazyZeros { - public: - void operator()(const phi::GPUContext& dev_ctx, - const bool* found_inf_data, - const std::vector& xs, - const std::vector& outs) const { - size_t xs_size = xs.size(); - if (xs_size == 0) return; - - const auto& cpu_place = platform::CPUPlace(); - // alloc each tensor's start index and copy to device - auto h_in_starts_mem = - memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); - int64_t* h_starts = reinterpret_cast(h_in_starts_mem->ptr()); - - auto d_in_starts_mem = - memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); - int64_t* d_starts = reinterpret_cast(d_in_starts_mem->ptr()); - - // the start index value of each tensor is - // the sum of previous tensor's size. For example: - // outs = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30] - h_starts[0] = 0; - for (int i = 0; i < xs_size; i++) { - h_starts[i + 1] = h_starts[i] + outs[i]->numel(); - } - memory::Copy(dev_ctx.GetPlace(), - d_starts, - cpu_place, - h_starts, - (xs_size + 1) * sizeof(int64_t), - dev_ctx.stream()); - - // copy each tensor of "outs" data address array to device - auto h_out_addrs_mem = memory::Alloc(cpu_place, xs_size * sizeof(T*)); - T** h_out_addrs = reinterpret_cast(h_out_addrs_mem->ptr()); - - auto d_out_addrs_mem = memory::Alloc(dev_ctx, xs_size * sizeof(T*)); - T** d_out_addrs = reinterpret_cast(d_out_addrs_mem->ptr()); - - for (size_t i = 0; i < xs_size; ++i) { - h_out_addrs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); - } - memory::Copy(dev_ctx.GetPlace(), - d_out_addrs, - cpu_place, - h_out_addrs, - xs_size * sizeof(T*), - dev_ctx.stream()); - - // launch cuda kernel - int64_t total_num = h_starts[xs_size]; - int64_t threads_per_block = std::min(static_cast(1024), total_num); - int64_t elements_per_block = - threads_per_block * 50; // each thread deal with 50 data - int64_t blocks_per_grid = - (total_num + elements_per_block - 1) / elements_per_block; - FusedFillIf<<>>( - d_out_addrs, xs_size, d_starts, static_cast(0), found_inf_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -using GPU = phi::GPUContext; - -REGISTER_OP_CUDA_KERNEL(update_loss_scaling, - ops::UpdateLossScalingKernel, - ops::UpdateLossScalingKernel, - ops::UpdateLossScalingKernel); diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.h b/paddle/fluid/operators/amp/update_loss_scaling_op.h deleted file mode 100644 index 90c77a3a4fd..00000000000 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.h +++ /dev/null @@ -1,214 +0,0 @@ -// Copyright 
(c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#if defined(PADDLE_WITH_CUDA) && defined(__NVCC__) -#include -#endif // PADDLE_WITH_CUDA && __NVCC__ -#include -#include - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/errors.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -inline HOSTDEVICE bool check_finite(T value) { -#if defined(PADDLE_WITH_CUDA) && defined(__NVCC__) - return isfinite(value); -#else - return std::isfinite(value); -#endif -} - -inline HOSTDEVICE bool IsFoundNanInf(const bool found_nan_inf_data) { - return found_nan_inf_data; -} - -inline HOSTDEVICE bool IsFoundNanInf(const bool* found_nan_inf_data) { - return *found_nan_inf_data; -} - -template -inline HOSTDEVICE void Update(const FoundInfFlagT found_inf_data, - const T* pre_loss_scaling_data, - const int* good_in_data, - const int* bad_in_data, - const int incr_every_n_steps, - const int decr_every_n_nan_or_inf, - const float incr_ratio, - const float decr_ratio, - T* updated_loss_scaling_data, - int* good_out_data, - int* bad_out_data) { - if (IsFoundNanInf(found_inf_data)) { - *good_out_data = 0; - *bad_out_data = *bad_in_data + 1; - if (*bad_out_data == decr_every_n_nan_or_inf) { - T new_loss_scaling = *pre_loss_scaling_data * decr_ratio; - *updated_loss_scaling_data = new_loss_scaling < static_cast(1) - ? static_cast(1) - : new_loss_scaling; - *bad_out_data = 0; - } - } else { - *bad_out_data = 0; - *good_out_data = *good_in_data + 1; - if (*good_out_data == incr_every_n_steps) { - T new_loss_scaling = *pre_loss_scaling_data * incr_ratio; - *updated_loss_scaling_data = check_finite(new_loss_scaling) - ? 
new_loss_scaling - : *pre_loss_scaling_data; - *good_out_data = 0; - } - } -} - -template -class UpdateLossScalingFunctor { - public: - void operator()(const DeviceContext& dev_ctx, - const bool* found_inf_data, - const T* pre_loss_scaling_data, - const int* good_in_data, - const int* bad_in_data, - const int incr_every_n_steps, - const int decr_every_n_nan_or_inf, - const float incr_ratio, - const float decr_ratio, - T* updated_loss_scaling_data, - int* good_out_data, - int* bad_out_data) const; -}; - -template -class LazyZeros { - public: - void operator()(const DeviceContext& dev_ctx, - const bool* found_inf_data, - const std::vector& xs, - const std::vector& outs) const; -}; - -template -class UpdateLossScalingKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - - const auto xs = ctx.MultiInput("X"); - auto outs = ctx.MultiOutput("Out"); - const auto* found_inf = ctx.Input("FoundInfinite"); - PADDLE_ENFORCE_EQ(found_inf->numel(), - 1, - platform::errors::InvalidArgument( - "FoundInfinite must has only one element.")); - const bool* found_inf_data = found_inf->data(); - bool is_found_inf_on_cpu = platform::is_cpu_place(found_inf->place()); - - if (is_found_inf_on_cpu) { - if (*found_inf_data) { - phi::funcs::SetConstant set_constant; - for (auto* out : outs) { - out->mutable_data(dev_ctx.GetPlace()); - set_constant(dev_ctx, out, static_cast(0)); - } - } - } else { - LazyZeros{}(dev_ctx, found_inf_data, xs, outs); - } - - const auto* stop_update_tensor = ctx.Input("StopUpdate"); - bool stop_update = false; - if (stop_update_tensor && stop_update_tensor->IsInitialized()) { - if (platform::is_cpu_place(stop_update_tensor->place())) { - stop_update = stop_update_tensor->data()[0]; - } else { - framework::Tensor tmp_tensor; - framework::TensorCopySync( - *stop_update_tensor, platform::CPUPlace(), &tmp_tensor); - stop_update = tmp_tensor.data()[0]; - } - } - stop_update |= ctx.Attr("stop_update"); - if (stop_update) { - return; - } - - const auto* pre_loss_scaling = ctx.Input("PrevLossScaling"); - const auto* good_in = ctx.Input("InGoodSteps"); - const auto* bad_in = ctx.Input("InBadSteps"); - auto* updated_loss_scaling = ctx.Output("LossScaling"); - auto* good_out = ctx.Output("OutGoodSteps"); - auto* bad_out = ctx.Output("OutBadSteps"); - const MPDType* pre_loss_scaling_data = pre_loss_scaling->data(); - const int* good_in_data = good_in->data(); - const int* bad_in_data = bad_in->data(); - - MPDType* updated_loss_scaling_data = - updated_loss_scaling->mutable_data(dev_ctx.GetPlace()); - int* good_out_data = good_out->mutable_data(dev_ctx.GetPlace()); - int* bad_out_data = bad_out->mutable_data(dev_ctx.GetPlace()); - - const int incr_every_n_steps = ctx.Attr("incr_every_n_steps"); - const int decr_every_n_nan_or_inf = - ctx.Attr("decr_every_n_nan_or_inf"); - const float incr_ratio = ctx.Attr("incr_ratio"); - const float decr_ratio = ctx.Attr("decr_ratio"); - if (is_found_inf_on_cpu) { - UpdateLossScalingFunctor{}( - dev_ctx, - found_inf_data, - pre_loss_scaling_data, - good_in_data, - bad_in_data, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - updated_loss_scaling_data, - good_out_data, - bad_out_data); - } else { - UpdateLossScalingFunctor{}( - dev_ctx, - found_inf_data, - pre_loss_scaling_data, - good_in_data, - bad_in_data, - incr_every_n_steps, - decr_every_n_nan_or_inf, - 
incr_ratio, - decr_ratio, - updated_loss_scaling_data, - good_out_data, - bad_out_data); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index 61bbdf4e0fc..24784803f17 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" DECLARE_int32(min_loss_scaling); @@ -150,9 +150,7 @@ void Update(const platform::NPUDeviceContext& ctx, } template -class UpdateLossScalingFunctor { +class UpdateLossScalingFunctor { public: void operator()(const platform::NPUDeviceContext& dev_ctx, const std::vector found_inf_vec, @@ -270,19 +268,18 @@ class UpdateLossScalingNPUKernel : public framework::OpKernel { ctx.Attr("decr_every_n_nan_or_inf"); const float incr_ratio = ctx.Attr("incr_ratio"); const float decr_ratio = ctx.Attr("decr_ratio"); - UpdateLossScalingFunctor{}( - dev_ctx, - found_inf_vec, - pre_loss_scaling, - good_in, - bad_in, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - updated_loss_scaling, - good_out, - bad_out); + UpdateLossScalingFunctor{}(dev_ctx, + found_inf_vec, + pre_loss_scaling, + good_in, + bad_in, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + updated_loss_scaling, + good_out, + bad_out); } }; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc index 1eefe564156..2348c10ef55 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc @@ -19,12 +19,13 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" #include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class UpdateLossScalingXPUKernel : public framework::OpKernel { using MPDType = typename details::MPTypeTrait::Type; diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc index f0cd841235e..9a008e429da 100644 --- a/paddle/phi/core/meta_tensor.cc +++ b/paddle/phi/core/meta_tensor.cc @@ -25,15 +25,35 @@ limitations under the License. 
*/ namespace phi { -int64_t MetaTensor::numel() const { return tensor_->numel(); } +static inline void ValidCheck(const MetaTensor& meta_tensor) { + PADDLE_ENFORCE_EQ(meta_tensor.initialized(), + true, + phi::errors::InvalidArgument( + "The current MetaTensor is not initialized.")); +} -DDim MetaTensor::dims() const { return tensor_->dims(); } +int64_t MetaTensor::numel() const { + ValidCheck(*this); + return tensor_->numel(); +} -DataType MetaTensor::dtype() const { return tensor_->dtype(); } +DDim MetaTensor::dims() const { + ValidCheck(*this); + return tensor_->dims(); +} -DataLayout MetaTensor::layout() const { return tensor_->layout(); } +DataType MetaTensor::dtype() const { + ValidCheck(*this); + return tensor_->dtype(); +} + +DataLayout MetaTensor::layout() const { + ValidCheck(*this); + return tensor_->layout(); +} void MetaTensor::set_dims(const DDim& dims) { + ValidCheck(*this); if (phi::DenseTensor::classof(tensor_)) { DenseTensorUtils::GetMutableMeta(static_cast(tensor_))->dims = dims; @@ -51,6 +71,7 @@ void MetaTensor::set_dims(const DDim& dims) { } void MetaTensor::set_dtype(DataType dtype) { + ValidCheck(*this); if (phi::DenseTensor::classof(tensor_)) { DenseTensorUtils::GetMutableMeta(static_cast(tensor_)) ->dtype = dtype; @@ -67,6 +88,7 @@ void MetaTensor::set_dtype(DataType dtype) { } void MetaTensor::set_layout(DataLayout layout) { + ValidCheck(*this); if (phi::DenseTensor::classof(tensor_)) { DenseTensorUtils::GetMutableMeta(static_cast(tensor_)) ->layout = layout; @@ -83,6 +105,8 @@ void MetaTensor::set_layout(DataLayout layout) { } void MetaTensor::share_lod(const MetaTensor& meta_tensor) { + ValidCheck(*this); + ValidCheck(meta_tensor); if (meta_tensor.lod().size() == 0) { // no need share return; @@ -101,18 +125,8 @@ void MetaTensor::share_lod(const MetaTensor& meta_tensor) { } } -const LoD& MetaTensor::lod() const { - if (phi::DenseTensor::classof(tensor_)) { - return static_cast(tensor_)->lod(); - } else if (phi::SelectedRows::classof(tensor_)) { - return static_cast(tensor_)->value().lod(); - } else { - PADDLE_THROW(phi::errors::Unimplemented("Unsupported getting lod of `%s`.", - tensor_->type_info().name())); - } -} - void MetaTensor::share_meta(const MetaTensor& meta_tensor) { + ValidCheck(*this); if (phi::DenseTensor::classof(tensor_) || phi::SelectedRows::classof(tensor_)) { share_dims(meta_tensor); @@ -125,9 +139,8 @@ void MetaTensor::share_meta(const MetaTensor& meta_tensor) { } } -TensorBase* MetaTensor::tensor() const { return tensor_; } - void MetaTensor::share_dims(const MetaTensor& meta_tensor) { + ValidCheck(*this); bool is_dense_tensor = phi::DenseTensor::classof(tensor_); bool is_selected_rows = phi::SelectedRows::classof(tensor_); if (is_dense_tensor || is_selected_rows) { @@ -152,4 +165,19 @@ void MetaTensor::share_dims(const MetaTensor& meta_tensor) { bool MetaTensor::initialized() const { return tensor_ != nullptr; } +// Private Member Methods + +const LoD& MetaTensor::lod() const { + if (phi::DenseTensor::classof(tensor_)) { + return static_cast(tensor_)->lod(); + } else if (phi::SelectedRows::classof(tensor_)) { + return static_cast(tensor_)->value().lod(); + } else { + PADDLE_THROW(phi::errors::Unimplemented("Unsupported getting lod of `%s`.", + tensor_->type_info().name())); + } +} + +TensorBase* MetaTensor::tensor() const { return tensor_; } + } // namespace phi diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 27c3d20d853..12c5ba109ab 100644 --- a/paddle/phi/infermeta/multiary.cc +++ 
b/paddle/phi/infermeta/multiary.cc @@ -764,6 +764,27 @@ void BroadcastTensorsInferMeta(const std::vector& x, } } +void CheckFiniteAndUnscaleInferMeta(const std::vector& xs, + const MetaTensor& scale, + std::vector outs, + MetaTensor* found_infinite) { + PADDLE_ENFORCE_EQ( + xs.size(), + outs.size(), + phi::errors::InvalidArgument( + "The input(X) and output(Out) should have same size in " + "Operator(check_finite_and_unscale), size of input(X) is %d " + "and size of output(Out) is %d.", + xs.size(), + outs.size())); + for (size_t i = 0; i < xs.size(); ++i) { + outs[i]->set_dims(xs[i]->dims()); + outs[i]->set_dtype(xs[i]->dtype()); + } + found_infinite->set_dims({1}); + found_infinite->set_dtype(DataType::BOOL); +} + void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, @@ -1109,6 +1130,102 @@ void GenerateProposalsV2InferMeta(const MetaTensor& scores, rpn_roi_probs->set_dims(phi::make_ddim({-1, 1})); } +void GraphReindexInferMeta(const MetaTensor& x, + const MetaTensor& neighbors, + const MetaTensor& count, + const MetaTensor& hashtable_value, + const MetaTensor& hashtable_index, + bool flag_buffer_hashtable, + MetaTensor* reindex_src, + MetaTensor* reindex_dst, + MetaTensor* out_nodes) { + auto GraphReindexShapeCheck = [](const phi::DDim& dims, + std::string tensor_name) { + if (dims.size() == 2) { + PADDLE_ENFORCE_EQ( + dims[1], + 1, + phi::errors::InvalidArgument("The last dim of %s should be 1 when it " + "is 2D, but we get %d", + tensor_name, + dims[1])); + } else { + PADDLE_ENFORCE_EQ( + dims.size(), + 1, + phi::errors::InvalidArgument( + "The %s should be 1D, when it is not 2D, but we get %d", + tensor_name, + dims.size())); + } + }; + + GraphReindexShapeCheck(x.dims(), "X"); + GraphReindexShapeCheck(neighbors.dims(), "Neighbors"); + GraphReindexShapeCheck(count.dims(), "Count"); + if (flag_buffer_hashtable) { + GraphReindexShapeCheck(hashtable_value.dims(), "HashTable_Value"); + GraphReindexShapeCheck(hashtable_index.dims(), "HashTable_Index"); + } + + reindex_src->set_dims({-1}); + reindex_src->set_dtype(neighbors.dtype()); + reindex_dst->set_dims({-1}); + reindex_dst->set_dtype(neighbors.dtype()); + out_nodes->set_dims({-1}); + out_nodes->set_dtype(x.dtype()); +} + +void GraphSampleNeighborsInferMeta(const MetaTensor& row, + const MetaTensor& col_ptr, + const MetaTensor& x, + const MetaTensor& eids, + const MetaTensor& perm_buffer, + int sample_size, + bool return_eids, + bool flag_perm_buffer, + MetaTensor* out, + MetaTensor* out_count, + MetaTensor* out_eids) { + // GSN: GraphSampleNeighbors + auto GSNShapeCheck = [](const phi::DDim& dims, std::string tensor_name) { + if (dims.size() == 2) { + PADDLE_ENFORCE_EQ( + dims[1], + 1, + phi::errors::InvalidArgument("The last dim of %s should be 1 when it " + "is 2D, but we get %d", + tensor_name, + dims[1])); + } else { + PADDLE_ENFORCE_EQ( + dims.size(), + 1, + phi::errors::InvalidArgument( + "The %s should be 1D, when it is not 2D, but we get %d", + tensor_name, + dims.size())); + } + }; + + GSNShapeCheck(row.dims(), "Row"); + GSNShapeCheck(col_ptr.dims(), "Col_Ptr"); + GSNShapeCheck(x.dims(), "X"); + if (return_eids) { + GSNShapeCheck(eids.dims(), "Eids"); + out_eids->set_dims({-1}); + out_eids->set_dtype(row.dtype()); + } + if (flag_perm_buffer) { + GSNShapeCheck(perm_buffer.dims(), "Perm_Buffer"); + } + + out->set_dims({-1}); + out->set_dtype(row.dtype()); + out_count->set_dims({-1}); + out_count->set_dtype(DataType::INT32); +} + void HierarchicalSigmoidInferMeta(const MetaTensor& x, const 
MetaTensor& w, const MetaTensor& label, @@ -2294,6 +2411,34 @@ void UnchangedMultiInferMeta(const std::vector& x, } } +void UpdateLossScalingInferMeta(const std::vector& xs, + const MetaTensor& found_infinite, + const MetaTensor& prev_loss_scaling, + const MetaTensor& in_good_steps, + const MetaTensor& in_bad_steps, + std::vector outs, + MetaTensor* loss_scaling, + MetaTensor* out_good_steps, + MetaTensor* out_bad_steps) { + PADDLE_ENFORCE_EQ(xs.size(), + outs.size(), + phi::errors::InvalidArgument( + "The input(X) and output(Out) should have same size in " + "Operator(update_loss_scaling), size of input(X) is %d " + "and size of output(Out) is %d.", + xs.size(), + outs.size())); + for (size_t i = 0; i < xs.size(); ++i) { + outs[i]->set_dims(xs[i]->dims()); + outs[i]->set_dtype(xs[i]->dtype()); + } + loss_scaling->set_dims({1}); + out_good_steps->set_dims({1}); + out_good_steps->set_dtype(DataType::INT32); + out_bad_steps->set_dims({1}); + out_bad_steps->set_dtype(DataType::INT32); +} + void WarpctcInferMeta(const MetaTensor& logits, const MetaTensor& label, const MetaTensor& logits_length, @@ -2356,102 +2501,6 @@ void WhereInferMeta(const MetaTensor& condition, out->share_meta(x); } -void GraphReindexInferMeta(const MetaTensor& x, - const MetaTensor& neighbors, - const MetaTensor& count, - const MetaTensor& hashtable_value, - const MetaTensor& hashtable_index, - bool flag_buffer_hashtable, - MetaTensor* reindex_src, - MetaTensor* reindex_dst, - MetaTensor* out_nodes) { - auto GraphReindexShapeCheck = [](const phi::DDim& dims, - std::string tensor_name) { - if (dims.size() == 2) { - PADDLE_ENFORCE_EQ( - dims[1], - 1, - phi::errors::InvalidArgument("The last dim of %s should be 1 when it " - "is 2D, but we get %d", - tensor_name, - dims[1])); - } else { - PADDLE_ENFORCE_EQ( - dims.size(), - 1, - phi::errors::InvalidArgument( - "The %s should be 1D, when it is not 2D, but we get %d", - tensor_name, - dims.size())); - } - }; - - GraphReindexShapeCheck(x.dims(), "X"); - GraphReindexShapeCheck(neighbors.dims(), "Neighbors"); - GraphReindexShapeCheck(count.dims(), "Count"); - if (flag_buffer_hashtable) { - GraphReindexShapeCheck(hashtable_value.dims(), "HashTable_Value"); - GraphReindexShapeCheck(hashtable_index.dims(), "HashTable_Index"); - } - - reindex_src->set_dims({-1}); - reindex_src->set_dtype(neighbors.dtype()); - reindex_dst->set_dims({-1}); - reindex_dst->set_dtype(neighbors.dtype()); - out_nodes->set_dims({-1}); - out_nodes->set_dtype(x.dtype()); -} - -void GraphSampleNeighborsInferMeta(const MetaTensor& row, - const MetaTensor& col_ptr, - const MetaTensor& x, - const MetaTensor& eids, - const MetaTensor& perm_buffer, - int sample_size, - bool return_eids, - bool flag_perm_buffer, - MetaTensor* out, - MetaTensor* out_count, - MetaTensor* out_eids) { - // GSN: GraphSampleNeighbors - auto GSNShapeCheck = [](const phi::DDim& dims, std::string tensor_name) { - if (dims.size() == 2) { - PADDLE_ENFORCE_EQ( - dims[1], - 1, - phi::errors::InvalidArgument("The last dim of %s should be 1 when it " - "is 2D, but we get %d", - tensor_name, - dims[1])); - } else { - PADDLE_ENFORCE_EQ( - dims.size(), - 1, - phi::errors::InvalidArgument( - "The %s should be 1D, when it is not 2D, but we get %d", - tensor_name, - dims.size())); - } - }; - - GSNShapeCheck(row.dims(), "Row"); - GSNShapeCheck(col_ptr.dims(), "Col_Ptr"); - GSNShapeCheck(x.dims(), "X"); - if (return_eids) { - GSNShapeCheck(eids.dims(), "Eids"); - out_eids->set_dims({-1}); - out_eids->set_dtype(row.dtype()); - } - if (flag_perm_buffer) { - 
GSNShapeCheck(perm_buffer.dims(), "Perm_Buffer"); - } - - out->set_dims({-1}); - out->set_dtype(row.dtype()); - out_count->set_dims({-1}); - out_count->set_dtype(DataType::INT32); -} - void Yolov3LossInferMeta(const MetaTensor& x, const MetaTensor& gt_box, const MetaTensor& gt_label, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 05c834cebf3..b01a23b9ee3 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -196,6 +196,11 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, void BroadcastTensorsInferMeta(const std::vector& x, std::vector out); +void CheckFiniteAndUnscaleInferMeta(const std::vector& xs, + const MetaTensor& scale, + std::vector outs, + MetaTensor* found_infinite); + void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, @@ -237,6 +242,28 @@ void GenerateProposalsV2InferMeta(const MetaTensor& scores, MetaTensor* rpn_roi_probs, MetaTensor* rpn_rois_num); +void GraphReindexInferMeta(const MetaTensor& x, + const MetaTensor& neighbors, + const MetaTensor& count, + const MetaTensor& hashtable_value, + const MetaTensor& hashtable_index, + bool flag_buffer_hashtable, + MetaTensor* reindex_src, + MetaTensor* reindex_dst, + MetaTensor* out_nodes); + +void GraphSampleNeighborsInferMeta(const MetaTensor& row, + const MetaTensor& col_ptr, + const MetaTensor& x, + const MetaTensor& eids, + const MetaTensor& perm_buffer, + int sample_size, + bool return_eids, + bool flag_perm_buffer, + MetaTensor* out, + MetaTensor* out_count, + MetaTensor* out_eids); + void HierarchicalSigmoidInferMeta(const MetaTensor& x, const MetaTensor& w, const MetaTensor& label, @@ -415,6 +442,16 @@ void StackInferMeta(const std::vector& x, void UnchangedMultiInferMeta(const std::vector& x, std::vector out); +void UpdateLossScalingInferMeta(const std::vector& xs, + const MetaTensor& found_infinite, + const MetaTensor& prev_loss_scaling, + const MetaTensor& in_good_steps, + const MetaTensor& in_bad_steps, + std::vector outs, + MetaTensor* loss_scaling, + MetaTensor* out_good_steps, + MetaTensor* out_bad_steps); + void WarpctcInferMeta(const MetaTensor& logits, const MetaTensor& label, const MetaTensor& logits_length, @@ -429,28 +466,6 @@ void WhereInferMeta(const MetaTensor& condition, const MetaTensor& y, MetaTensor* out); -void GraphReindexInferMeta(const MetaTensor& x, - const MetaTensor& neighbors, - const MetaTensor& count, - const MetaTensor& hashtable_value, - const MetaTensor& hashtable_index, - bool flag_buffer_hashtable, - MetaTensor* reindex_src, - MetaTensor* reindex_dst, - MetaTensor* out_nodes); - -void GraphSampleNeighborsInferMeta(const MetaTensor& row, - const MetaTensor& col_ptr, - const MetaTensor& x, - const MetaTensor& eids, - const MetaTensor& perm_buffer, - int sample_size, - bool return_eids, - bool flag_perm_buffer, - MetaTensor* out, - MetaTensor* out_count, - MetaTensor* out_eids); - void Yolov3LossInferMeta(const MetaTensor& x, const MetaTensor& gt_box, const MetaTensor& gt_label, diff --git a/paddle/phi/kernels/amp_kernel.h b/paddle/phi/kernels/amp_kernel.h new file mode 100644 index 00000000000..e22a730db67 --- /dev/null +++ b/paddle/phi/kernels/amp_kernel.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <vector> + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template <typename T, typename Context> +void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, + const std::vector<const DenseTensor*>& xs, + const DenseTensor& scale, + std::vector<DenseTensor*> outs, + DenseTensor* found_infinite); + +template <typename T, typename Context> +void UpdateLossScalingKernel(const Context& dev_ctx, + const std::vector<const DenseTensor*>& xs, + const DenseTensor& found_infinite, + const DenseTensor& prev_loss_scaling, + const DenseTensor& in_good_steps, + const DenseTensor& in_bad_steps, + int incr_every_n_steps, + int decr_every_n_nan_or_inf, + float incr_ratio, + float decr_ratio, + const Scalar& stop_update, + std::vector<DenseTensor*> outs, + DenseTensor* loss_scaling, + DenseTensor* out_good_steps, + DenseTensor* out_bad_steps); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/amp_kernel.cc b/paddle/phi/kernels/cpu/amp_kernel.cc new file mode 100644 index 00000000000..d15c91dc8dd --- /dev/null +++ b/paddle/phi/kernels/cpu/amp_kernel.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
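+// CPU implementations of the AMP (auto mixed precision) kernels: +// check_finite_and_unscale and update_loss_scaling.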
+ +#include "paddle/phi/kernels/amp_kernel.h" + +#include + +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/impl/amp_kernel_impl.h" + +#include "paddle/fluid/framework/tensor_util.h" + +namespace phi { + +// Utils + +template +class UpdateLossScalingFunctor { + public: + void operator()(const phi::CPUContext& ctx, + const bool* found_inf_data, + const T* pre_loss_scaling_data, + const int* good_in_data, + const int* bad_in_data, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, + const float incr_ratio, + const float decr_ratio, + T* updated_loss_scaling_data, + int* good_out_data, + int* bad_out_data) const { + PADDLE_ENFORCE_EQ( + IsFoundInfOnCPU, + true, + phi::errors::InvalidArgument( + "The Input(FoundInfinite) should be on the CPUPlace.")); + Update(found_inf_data, + pre_loss_scaling_data, + good_in_data, + bad_in_data, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + updated_loss_scaling_data, + good_out_data, + bad_out_data); + } +}; + +// Kernels + +template +void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, + const std::vector& xs, + const DenseTensor& scale, + std::vector outs, + DenseTensor* found_infinite) { + const T* scale_data = scale.data(); + bool* found_inf_data = dev_ctx.template Alloc(found_infinite); + + *found_inf_data = false; + DenseTensor is_finite = Empty(dev_ctx, {1}); + bool* is_finite_data = is_finite.template data(); + + auto& dev = *dev_ctx.eigen_device(); + + T inverse_scale = 1.0 / *scale_data; + for (size_t i = 0; i < xs.size(); ++i) { + const auto* x = xs[i]; + auto* out = outs[i]; + dev_ctx.template Alloc(out); + if (!(*found_inf_data)) { + paddle::framework::TensorIsfinite(*x, &is_finite); + *found_inf_data = !(*is_finite_data); + } + auto eigen_out = EigenVector::Flatten(*out); + auto eigen_in = EigenVector::Flatten(*x); + if (!(*found_inf_data)) { + eigen_out.device(dev) = eigen_in * inverse_scale; + } else { + eigen_out.device(dev) = eigen_in * static_cast(0); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(check_finite_and_unscale, + CPU, + ALL_LAYOUT, + phi::CheckFiniteAndUnscaleKernel, + float, + double) {} + +PD_REGISTER_KERNEL(update_loss_scaling, + CPU, + ALL_LAYOUT, + phi::UpdateLossScalingKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/amp_kernel.cu b/paddle/phi/kernels/gpu/amp_kernel.cu new file mode 100644 index 00000000000..b0b0f9c5e79 --- /dev/null +++ b/paddle/phi/kernels/gpu/amp_kernel.cu @@ -0,0 +1,358 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/amp_kernel.h" + +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/impl/amp_kernel_impl.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/memory/memory.h" + +namespace phi { + +// Utils + +template +__global__ void InverseAndMemset(const T* s, T* o, bool* found_inf) { + *o = 1.0 / *s; + *found_inf = false; +} + +template +__global__ void CheckFiniteAndUnscale(const T** xs, + const MT* scale, + int64_t size, + int64_t* starts, + bool* found_inf, + T** outs) { + const int64_t tid = threadIdx.x + blockIdx.x * blockDim.x; + + // copy starts array from global memory to shared memory + extern __shared__ int64_t s_starts[]; + for (int i = threadIdx.x; i <= size; i += blockDim.x) { + s_starts[i] = starts[i]; + } + __syncthreads(); + + const int64_t num = s_starts[size]; + int xs_index = 0; + bool local_found_inf = false; + const MT local_scale = *scale; + for (int64_t idx = tid; idx < num; idx += gridDim.x * blockDim.x) { + // get the "out" index of "id" + // For example: + // idx = 15, starts = [0, 10, 10, 20, 30] + // because 10 <= idx < 20 ==> + // the idx element locate in the 3rd tensor (notice the 2nd tensor size is + // 0) + int next_xs_index = xs_index; + while (idx >= s_starts[next_xs_index]) next_xs_index++; + xs_index = next_xs_index - 1; + + // get in data and out data + const T* in = xs[xs_index]; + T* out = outs[xs_index]; + int64_t in_idx = idx - s_starts[xs_index]; + + // Unscale + MT val = static_cast(in[in_idx]) * local_scale; + T narrow_val = static_cast(val); + out[in_idx] = narrow_val; + + // CheckFinite + if (!isfinite(narrow_val)) { + local_found_inf = true; + } + } + if (local_found_inf) { + *found_inf = true; + } +} + +template +__global__ void GpuUpdateLossScaling(const FoundNanInfFlagT found_inf_data, + const T* pre_loss_scaling_data, + const int* good_in_data, + const int* bad_in_data, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, + const float incr_ratio, + const float decr_ratio, + T* updated_loss_scaling_data, + int* good_out_data, + int* bad_out_data) { + Update(found_inf_data, + pre_loss_scaling_data, + good_in_data, + bad_in_data, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + updated_loss_scaling_data, + good_out_data, + bad_out_data); +} + +template +__global__ void FusedFillIf(T** outs, + const size_t xs_size, + const int64_t* starts, + const T value, + const bool* has_inf) { + if (!(*has_inf)) return; + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + // copy starts array from global memory to shared memory + extern __shared__ int64_t s_starts[]; + for (int i = threadIdx.x; i <= xs_size; i += blockDim.x) { + s_starts[i] = starts[i]; + } + __syncthreads(); + + const int64_t total_num = s_starts[xs_size]; + int out_index = 0; + + for (int64_t id = tid; id < total_num; id += blockDim.x * gridDim.x) { + // get the "out" index of "id" + // For example: + // id = 15, starts = [0, 10, 10, 20, 30] + // because 10 <= id < 20 ==> + // the id element locate in the 3rd tensor (notice the 2nd tensor size is 0) + int next_out_index = out_index; + while (id >= s_starts[next_out_index]) next_out_index++; + out_index = next_out_index - 1; + + // get data pointer and index + T* out_data = outs[out_index]; + int64_t idx = id - s_starts[out_index]; + + // set value + out_data[idx] = value; + } +} + +template +class LazyZeros { + public: + 
void operator()(const phi::GPUContext& dev_ctx, + const bool* found_inf_data, + const std::vector& xs, + const std::vector& outs) { + size_t xs_size = xs.size(); + if (xs_size == 0) return; + + const auto& cpu_place = phi::CPUPlace(); + // alloc each tensor's start index and copy to device + auto h_in_starts_mem = + paddle::memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); + int64_t* h_starts = reinterpret_cast(h_in_starts_mem->ptr()); + + auto d_in_starts_mem = + paddle::memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + int64_t* d_starts = reinterpret_cast(d_in_starts_mem->ptr()); + + // the start index value of each tensor is + // the sum of previous tensor's size. For example: + // outs = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30] + h_starts[0] = 0; + for (int i = 0; i < xs_size; i++) { + h_starts[i + 1] = h_starts[i] + outs[i]->numel(); + } + paddle::memory::Copy(dev_ctx.GetPlace(), + d_starts, + cpu_place, + h_starts, + (xs_size + 1) * sizeof(int64_t), + dev_ctx.stream()); + + // copy each tensor of "outs" data address array to device + auto h_out_addrs_mem = + paddle::memory::Alloc(cpu_place, xs_size * sizeof(T*)); + T** h_out_addrs = reinterpret_cast(h_out_addrs_mem->ptr()); + + auto d_out_addrs_mem = paddle::memory::Alloc(dev_ctx, xs_size * sizeof(T*)); + T** d_out_addrs = reinterpret_cast(d_out_addrs_mem->ptr()); + + for (size_t i = 0; i < xs_size; ++i) { + h_out_addrs[i] = dev_ctx.Alloc(outs[i]); + } + paddle::memory::Copy(dev_ctx.GetPlace(), + d_out_addrs, + cpu_place, + h_out_addrs, + xs_size * sizeof(T*), + dev_ctx.stream()); + + // launch cuda kernel + int64_t total_num = h_starts[xs_size]; + int64_t threads_per_block = std::min(static_cast(1024), total_num); + int64_t elements_per_block = + threads_per_block * 50; // each thread deal with 50 data + int64_t blocks_per_grid = + (total_num + elements_per_block - 1) / elements_per_block; + FusedFillIf<<>>( + d_out_addrs, xs_size, d_starts, static_cast(0), found_inf_data); + } +}; + +template +class UpdateLossScalingFunctor { + public: + void operator()(const phi::GPUContext& dev_ctx, + const bool* found_inf_data, + const T* pre_loss_scaling_data, + const int* good_in_data, + const int* bad_in_data, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, + const float incr_ratio, + const float decr_ratio, + T* updated_loss_scaling_data, + int* good_out_data, + int* bad_out_data) const { + if (IsFoundInfOnCPU) { + GpuUpdateLossScaling + <<<1, 1, 0, dev_ctx.stream()>>>(*found_inf_data, + pre_loss_scaling_data, + good_in_data, + bad_in_data, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + updated_loss_scaling_data, + good_out_data, + bad_out_data); + } else { + GpuUpdateLossScaling + <<<1, 1, 0, dev_ctx.stream()>>>(found_inf_data, + pre_loss_scaling_data, + good_in_data, + bad_in_data, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + updated_loss_scaling_data, + good_out_data, + bad_out_data); + } + } +}; + +// Kernels + +template +void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, + const std::vector& xs, + const DenseTensor& scale, + std::vector outs, + DenseTensor* found_infinite) { + using MPDType = typename phi::dtype::MPTypeTrait::Type; + + const MPDType* scale_data = scale.data(); + bool* found_inf_data = dev_ctx.template Alloc(found_infinite); + + DenseTensor inverse_scale = Empty(dev_ctx, {1}); + MPDType* inverse_scale_v = inverse_scale.template data(); + + InverseAndMemset<<<1, 1, 0, dev_ctx.stream()>>>( + scale_data, 
inverse_scale_v, found_inf_data); + + size_t xs_size = xs.size(); + if (xs_size == 0) return; + + const auto& cpu_place = phi::CPUPlace(); + // calculate each tensor's start index and copy to device + auto h_starts_tensor = + paddle::memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); + int64_t* h_starts = reinterpret_cast(h_starts_tensor->ptr()); + + auto d_starts_tensor = + paddle::memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + int64_t* d_starts = reinterpret_cast(d_starts_tensor->ptr()); + + // the start index value of each tensor is + // the sum of previous tensor's size. For example: + // x = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30] + h_starts[0] = 0; + for (int i = 1; i <= xs_size; i++) { + h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); + } + int64_t total_num = h_starts[xs_size]; + paddle::memory::Copy(dev_ctx.GetPlace(), + d_starts, + cpu_place, + h_starts, + (xs_size + 1) * sizeof(int64_t), + dev_ctx.stream()); + + // copy each tensor's data address to device + auto h_mem = paddle::memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*)); + const T** h_xs = reinterpret_cast(h_mem->ptr()); + T** h_outs = reinterpret_cast(h_mem->ptr()) + xs_size; + + auto d_mem = paddle::memory::Alloc(dev_ctx, 2 * xs_size * sizeof(T*)); + const T** d_xs = reinterpret_cast(d_mem->ptr()); + T** d_outs = reinterpret_cast(d_mem->ptr()) + xs_size; + + for (size_t i = 0; i < xs_size; ++i) { + h_xs[i] = xs[i]->data(); + h_outs[i] = dev_ctx.template Alloc(outs[i]); + } + paddle::memory::Copy(dev_ctx.GetPlace(), + d_xs, + cpu_place, + h_xs, + 2 * xs_size * sizeof(T*), + dev_ctx.stream()); + + // Launch Kernel + int threads_per_block = std::min(static_cast(1024), total_num); + int elements_per_block = + threads_per_block * 20; // each thread deal with 20 number + int blocks_per_grid = + (total_num + elements_per_block - 1) / elements_per_block; + CheckFiniteAndUnscale<<>>( + d_xs, inverse_scale_v, xs_size, d_starts, found_inf_data, d_outs); +} + +} // namespace phi + +PD_REGISTER_KERNEL(check_finite_and_unscale, + GPU, + ALL_LAYOUT, + phi::CheckFiniteAndUnscaleKernel, + float, + double, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(update_loss_scaling, + GPU, + ALL_LAYOUT, + phi::UpdateLossScalingKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/amp_kernel_impl.h b/paddle/phi/kernels/impl/amp_kernel_impl.h new file mode 100644 index 00000000000..4684a924cf0 --- /dev/null +++ b/paddle/phi/kernels/impl/amp_kernel_impl.h @@ -0,0 +1,183 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
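+
+// Device-independent pieces of the update_loss_scaling kernel: the Update
+// rule below and the shared UpdateLossScalingKernel template. On an
+// overflow, the good-step counter resets and the bad-step counter grows;
+// after decr_every_n_nan_or_inf consecutive bad steps the loss scaling is
+// multiplied by decr_ratio (but never reduced below 1). On a finite step,
+// the bad-step counter resets and the good-step counter grows; after
+// incr_every_n_steps consecutive good steps the loss scaling is multiplied
+// by incr_ratio, provided the result stays finite.
+//
+// Illustrative numbers only: with incr_every_n_steps = 1000 and
+// incr_ratio = 2.0, a scaling of 32768 becomes 65536 after 1000 finite
+// steps in a row; with decr_every_n_nan_or_inf = 2 and decr_ratio = 0.5,
+// two overflowing steps in a row halve it to 16384.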
+ +#pragma once + +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/amp_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" + +namespace phi { + +template +inline HOSTDEVICE bool CheckFinite(T value) { +#if defined(PADDLE_WITH_CUDA) && defined(__NVCC__) + return isfinite(value); +#else + return std::isfinite(value); +#endif +} + +inline HOSTDEVICE bool IsFoundNanInf(const bool found_nan_inf_data) { + return found_nan_inf_data; +} + +inline HOSTDEVICE bool IsFoundNanInf(const bool* found_nan_inf_data) { + return *found_nan_inf_data; +} + +template +inline HOSTDEVICE void Update(const FoundInfFlagT found_inf_data, + const T* pre_loss_scaling_data, + const int* good_in_data, + const int* bad_in_data, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, + const float incr_ratio, + const float decr_ratio, + T* updated_loss_scaling_data, + int* good_out_data, + int* bad_out_data) { + if (IsFoundNanInf(found_inf_data)) { + *good_out_data = 0; + *bad_out_data = *bad_in_data + 1; + if (*bad_out_data == decr_every_n_nan_or_inf) { + T new_loss_scaling = *pre_loss_scaling_data * decr_ratio; + *updated_loss_scaling_data = new_loss_scaling < static_cast(1) + ? static_cast(1) + : new_loss_scaling; + *bad_out_data = 0; + } + } else { + *bad_out_data = 0; + *good_out_data = *good_in_data + 1; + if (*good_out_data == incr_every_n_steps) { + T new_loss_scaling = *pre_loss_scaling_data * incr_ratio; + *updated_loss_scaling_data = CheckFinite(new_loss_scaling) + ? new_loss_scaling + : *pre_loss_scaling_data; + *good_out_data = 0; + } + } +} + +template +class LazyZeros { + public: + void operator()(const DeviceContext& dev_ctx, + const bool* found_inf_data, + const std::vector& xs, + const std::vector& outs) const {} +}; + +template +class UpdateLossScalingFunctor { + public: + void operator()(const DeviceContext& dev_ctx, + const bool* found_inf_data, + const T* pre_loss_scaling_data, + const int* good_in_data, + const int* bad_in_data, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, + const float incr_ratio, + const float decr_ratio, + T* updated_loss_scaling_data, + int* good_out_data, + int* bad_out_data) const; +}; + +template +void UpdateLossScalingKernel(const Context& dev_ctx, + const std::vector& xs, + const DenseTensor& found_infinite, + const DenseTensor& prev_loss_scaling, + const DenseTensor& in_good_steps, + const DenseTensor& in_bad_steps, + int incr_every_n_steps, + int decr_every_n_nan_or_inf, + float incr_ratio, + float decr_ratio, + const Scalar& stop_update, + std::vector outs, + DenseTensor* loss_scaling, + DenseTensor* out_good_steps, + DenseTensor* out_bad_steps) { + using MPDType = typename phi::dtype::MPTypeTrait::Type; + + PADDLE_ENFORCE_EQ( + found_infinite.numel(), + 1, + phi::errors::InvalidArgument("FoundInfinite must has only one element.")); + const bool* found_inf_data = found_infinite.data(); + bool is_found_inf_on_cpu = + found_infinite.place().GetType() == AllocationType::CPU; + + if (is_found_inf_on_cpu) { + if (*found_inf_data) { + for (auto* out : outs) { + Full(dev_ctx, vectorize(out->dims()), static_cast(0), out); + } + } + } else { + LazyZeros{}(dev_ctx, found_inf_data, xs, outs); + } + + auto stop_update_val = stop_update.to(); + if (stop_update_val) { + return; + } + + const MPDType* pre_loss_scaling_data = prev_loss_scaling.data(); + const int* good_in_data = in_good_steps.data(); + const int* bad_in_data = in_bad_steps.data(); + + MPDType* 
updated_loss_scaling_data = + dev_ctx.template Alloc(loss_scaling); + int* good_out_data = dev_ctx.template Alloc(out_good_steps); + int* bad_out_data = dev_ctx.template Alloc(out_bad_steps); + + if (is_found_inf_on_cpu) { + UpdateLossScalingFunctor{}( + dev_ctx, + found_inf_data, + pre_loss_scaling_data, + good_in_data, + bad_in_data, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + updated_loss_scaling_data, + good_out_data, + bad_out_data); + } else { + UpdateLossScalingFunctor{}( + dev_ctx, + found_inf_data, + pre_loss_scaling_data, + good_in_data, + bad_in_data, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + updated_loss_scaling_data, + good_out_data, + bad_out_data); + } +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/update_loss_scaling_sig.cc b/paddle/phi/ops/compat/update_loss_scaling_sig.cc new file mode 100644 index 00000000000..8223d0c7dfd --- /dev/null +++ b/paddle/phi/ops/compat/update_loss_scaling_sig.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature UpdateLossScalingOpArgumentMapping( + const ArgumentMappingContext& ctx) { + if (ctx.HasInput("StopUpdate")) { + return KernelSignature( + "update_loss_scaling", + {"X", "FoundInfinite", "PrevLossScaling", "InGoodSteps", "InBadSteps"}, + {"incr_every_n_steps", + "decr_every_n_nan_or_inf", + "incr_ratio", + "decr_ratio", + "StopUpdate"}, + {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}); + } else { + return KernelSignature( + "update_loss_scaling", + {"X", "FoundInfinite", "PrevLossScaling", "InGoodSteps", "InBadSteps"}, + {"incr_every_n_steps", + "decr_every_n_nan_or_inf", + "incr_ratio", + "decr_ratio", + "stop_update"}, + {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(update_loss_scaling, + phi::UpdateLossScalingOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py index 0d31dad8199..7337d320e7d 100644 --- a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py +++ b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py @@ -50,6 +50,7 @@ class TestUpdateLossScalingOp(OpTest): self.num_good_steps = np.array([999], dtype=np.int32) self.num_bad_steps = np.array([1], dtype=np.int32) self.zero_steps = np.array([0], dtype=np.int32) + self.stop_update = np.array([False], dtype=np.bool) self.attrs = { 'incr_every_n_steps': 1000, 'decr_every_n_nan_or_inf': 2, @@ -77,7 +78,8 @@ class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp): 'FoundInfinite': found_inf, 'PrevLossScaling': self.prev_loss_scaling, 'InGoodSteps': self.num_good_steps, - 'InBadSteps': self.num_bad_steps + 'InBadSteps': self.num_bad_steps, + 'StopUpdate': self.stop_update } self.outputs = { -- GitLab