diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index a2d824877ea528b40262f8a260fe9b05d4e5d6c0..42de34fb52061af23eee28377659ed4cbbb4de0a 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -35,7 +35,7 @@ ELSE () ENDIF() SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") -SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210625") +SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210701") SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..210f3e098f95f490f9c5d4adf53d9ee4f20f3e97 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc @@ -0,0 +1,170 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/platform/float16.h" +namespace paddle { +namespace operators { +template +class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + using XPUTyp = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto& dev_ctx = ctx.template device_context(); + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); + + const MPDType* scale_data = scale->data(); + bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); + + // cpy to cpu + bool cpu_found_inf_data = false; + + MPDType cpu_scale_data; + if (platform::is_xpu_place(scale->place())) { + xpu_memcpy(&cpu_scale_data, scale_data, sizeof(MPDType), + XPUMemcpyKind::XPU_DEVICE_TO_HOST); + } else { + cpu_scale_data = (*scale_data); + } + MPDType inverse_scale = 1.0 / cpu_scale_data; + for (size_t i = 0; i < xs.size(); ++i) { + const auto* x = xs[i]; + auto* out = outs[i]; + out->mutable_data(dev_ctx.GetPlace()); + framework::Tensor is_finite = + ctx.AllocateTmpTensor(x->dims(), + dev_ctx); + framework::Tensor is_nan = + ctx.AllocateTmpTensor(x->dims(), + dev_ctx); + framework::Tensor is_finite_and_nan = + ctx.AllocateTmpTensor(x->dims(), + dev_ctx); + if (cpu_found_inf_data == false) { + int r = xpu::isfinite(dev_ctx.x_context(), + reinterpret_cast(x->data()), + is_finite.data(), x->numel()); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(isfinite) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + r = xpu::logical_not(dev_ctx.x_context(), reinterpret_cast( + is_finite.data()), + is_finite.data(), x->numel()); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(logical_not) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + r = xpu::isnan(dev_ctx.x_context(), + reinterpret_cast(x->data()), + is_nan.data(), x->numel()); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(isnan) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + r = xpu::logical_or(dev_ctx.x_context(), is_finite.data(), + is_nan.data(), is_finite.data(), + x->numel()); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(logical_or) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + r = xpu::any(dev_ctx.x_context(), is_finite.data(), + found_inf_data, x->numel()); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(any) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + memory::Copy(platform::CPUPlace(), &cpu_found_inf_data, + BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), + found_inf_data, sizeof(bool)); + } + + if (cpu_found_inf_data) { + inverse_scale = 0.0; + } + auto dev_env = XPUEnv::getenv("XPUSIM_DEVICE_MODEL"); + + if (std::is_same::value && + (dev_env == nullptr || std::strcmp(dev_env, "KUNLUN1"))) { + framework::Tensor float_x; + framework::Tensor float_out; + float_x.mutable_data(dev_ctx.GetPlace(), + x->numel() * sizeof(MPDType)); + float_out.mutable_data(dev_ctx.GetPlace(), + out->numel() * sizeof(MPDType)); + int r = xpu::cast_v2(dev_ctx.x_context(), + reinterpret_cast(x->data()), + float_x.data(), x->numel()); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(cast_v2) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + + r = xpu::scale(dev_ctx.x_context(), float_x.data(), + float_out.data(), x->numel(), false, + inverse_scale, 0.0); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(scale) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + + r = xpu::cast_v2(dev_ctx.x_context(), float_out.data(), + reinterpret_cast(out->data()), + out->numel()); + + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(cast_v2) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + if (dev_ctx.x_context()->xpu_stream) { + dev_ctx.Wait(); + } + + } else { + int r = xpu::scale(dev_ctx.x_context(), + reinterpret_cast(x->data()), + reinterpret_cast(out->data()), + x->numel(), false, inverse_scale, 0.0); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(scale) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + } + } + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), + found_inf_data, platform::CPUPlace(), &cpu_found_inf_data, + sizeof(bool)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_XPU_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleXPUKernel, + ops::CheckFiniteAndUnscaleXPUKernel); + +#endif diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..1f05e5f246d9c564dbf53b121b07ff4beb84c686 --- /dev/null +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc @@ -0,0 +1,166 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +class UpdateLossScalingXPUKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + using XPUTyp = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const auto xs = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); + const auto* found_inf = ctx.Input("FoundInfinite"); + PADDLE_ENFORCE_EQ(found_inf->numel(), 1, + platform::errors::InvalidArgument( + "FoundInfinite must has only one element.")); + const bool* found_inf_data = found_inf->data(); + bool cpu_found_inf_data = false; + if (platform::is_xpu_place(found_inf->place())) { + xpu_memcpy(&cpu_found_inf_data, found_inf_data, sizeof(bool), + XPUMemcpyKind::XPU_DEVICE_TO_HOST); + } else { + cpu_found_inf_data = (*found_inf_data); + } + + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + T* out_data = out->mutable_data(dev_ctx.GetPlace()); + int num = out->numel(); + if (cpu_found_inf_data) { + VLOG(1) << "-- UpdateLossScaling: Find infinite grads. --"; + int r = 0; + r = xpu::constant(dev_ctx.x_context(), + reinterpret_cast(out_data), num, + XPUTyp(0.0)); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(constant) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + } + } + const bool stop_update = ctx.Attr("stop_update"); + if (stop_update) { + return; + } + + const auto* pre_loss_scaling = ctx.Input("PrevLossScaling"); + const auto* good_in = ctx.Input("InGoodSteps"); + const auto* bad_in = ctx.Input("InBadSteps"); + auto* updated_loss_scaling = ctx.Output("LossScaling"); + auto* good_out = ctx.Output("OutGoodSteps"); + auto* bad_out = ctx.Output("OutBadSteps"); + const MPDType* pre_loss_scaling_data = pre_loss_scaling->data(); + const int* good_in_data = good_in->data(); + const int* bad_in_data = bad_in->data(); + + MPDType* updated_loss_scaling_data = + updated_loss_scaling->mutable_data(dev_ctx.GetPlace()); + int* good_out_data = good_out->mutable_data(dev_ctx.GetPlace()); + int* bad_out_data = bad_out->mutable_data(dev_ctx.GetPlace()); + + const int incr_every_n_steps = ctx.Attr("incr_every_n_steps"); + const int decr_every_n_nan_or_inf = + ctx.Attr("decr_every_n_nan_or_inf"); + const float incr_ratio = ctx.Attr("incr_ratio"); + const float decr_ratio = ctx.Attr("decr_ratio"); + + int cpu_bad_in_data; + int cpu_good_in_data; + MPDType cpu_pre_loss_scaling_data; + if (platform::is_xpu_place(bad_in->place())) { + xpu_memcpy(&cpu_bad_in_data, bad_in_data, sizeof(int), + XPUMemcpyKind::XPU_DEVICE_TO_HOST); + } else { + cpu_bad_in_data = (*bad_in_data); + } + + if (platform::is_xpu_place(good_in->place())) { + xpu_memcpy(&cpu_good_in_data, good_in_data, sizeof(int), + XPUMemcpyKind::XPU_DEVICE_TO_HOST); + } else { + cpu_good_in_data = (*good_in_data); + } + + if (platform::is_xpu_place(pre_loss_scaling->place())) { + xpu_memcpy(&cpu_pre_loss_scaling_data, pre_loss_scaling_data, + sizeof(MPDType), XPUMemcpyKind::XPU_DEVICE_TO_HOST); + } else { + cpu_pre_loss_scaling_data = (*pre_loss_scaling_data); + } + + int cpu_good_out_data = 0; + int cpu_bad_out_data = 0; + MPDType cpu_updated_loss_scaling_data; + + if (cpu_found_inf_data) { + cpu_good_out_data = 0; + cpu_bad_out_data = cpu_bad_in_data + 1; + if (cpu_bad_out_data == decr_every_n_nan_or_inf) { + MPDType new_loss_scaling = cpu_pre_loss_scaling_data * decr_ratio; + cpu_updated_loss_scaling_data = + (new_loss_scaling < static_cast(1)) + ? (static_cast(1)) + : (new_loss_scaling); + cpu_bad_out_data = 0; + } + } else { + cpu_bad_out_data = 0; + cpu_good_out_data = cpu_good_in_data + 1; + if (cpu_good_out_data == incr_every_n_steps) { + MPDType new_loss_scaling = cpu_pre_loss_scaling_data * incr_ratio; + cpu_updated_loss_scaling_data = (std::isfinite(new_loss_scaling)) + ? new_loss_scaling + : cpu_pre_loss_scaling_data; + cpu_good_out_data = 0; + } + } + + // copy to host + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), + bad_out_data, platform::CPUPlace(), &cpu_bad_out_data, + sizeof(int)); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), + good_out_data, platform::CPUPlace(), &cpu_good_out_data, + sizeof(int)); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), + updated_loss_scaling_data, platform::CPUPlace(), + &cpu_updated_loss_scaling_data, sizeof(MPDType)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(update_loss_scaling, + ops::UpdateLossScalingXPUKernel, + ops::UpdateLossScalingXPUKernel); +#endif diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc index f5d831fa24012031897eca2ce5a1cd9004f5a03b..79d239074845ad29f4f40e64a7d1ecc9f19168bb 100644 --- a/paddle/fluid/operators/dropout_op_xpu.cc +++ b/paddle/fluid/operators/dropout_op_xpu.cc @@ -16,11 +16,11 @@ namespace paddle { namespace operators { #ifdef PADDLE_WITH_XPU -static std::map mask_data_tables; -static const int max_data_size = 32 * 1024 * 1024; -static std::mutex s_mask_data_table_lock; + template class DropoutXPUKernel : public framework::OpKernel { + using XPUTyp = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { auto* x = context.Input("X"); @@ -30,93 +30,70 @@ class DropoutXPUKernel : public framework::OpKernel { float dropout_prob = context.Attr("dropout_prob"); auto dropout_implementation = context.Attr("dropout_implementation"); - float* mask_data_table = nullptr; + auto& dev_ctx = context.template device_context(); + PADDLE_ENFORCE_EQ(!context.HasInput("Seed"), true, platform::errors::InvalidArgument( ("Input(Seed) not supported on XPU"))); + int is_upscale = (dropout_implementation == "upscale_in_train"); + if (!context.Attr("is_test")) { - int dev_id = - BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()).GetDeviceId(); - int prop = static_cast(dropout_prob * 100); - int is_upscale = (dropout_implementation == "upscale_in_train"); - /* mask_data_tables key contains 3 part: - * | 31-16 | 15-8 | 7-0 | - * | dev_id | prob | is_upscale | - */ - int index = (dev_id << 16) + (prop << 8) + is_upscale; - std::lock_guard lock(s_mask_data_table_lock); - if (mask_data_tables.find(index) == mask_data_tables.end()) { - float* mask_data_host = new float[max_data_size]; - std::random_device rnd; - std::minstd_rand engine; - int seed = - context.Attr("fix_seed") ? context.Attr("seed") : rnd(); - engine.seed(seed); - std::uniform_real_distribution dist(0, 1); - for (size_t i = 0; i < max_data_size; ++i) { - if (dist(engine) < dropout_prob) { - mask_data_host[i] = 0.0f; - } else { - if (is_upscale) { - mask_data_host[i] = 1.0f / static_cast(1.0f - dropout_prob); - } else { - mask_data_host[i] = 1.0; - } - } - } - PADDLE_ENFORCE_EQ( - xpu_malloc(reinterpret_cast(&mask_data_table), - max_data_size * sizeof(float)), - XPU_SUCCESS, - platform::errors::ResourceExhausted( - "\n\nOut of memory error on XPU, Cannot" - "allocate %s memory on XPU. \n\nPlease " - "check whether there is any other process " - "using XPU.\n", - string::HumanReadableSize(max_data_size * sizeof(void*)))); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), - mask_data_table, platform::CPUPlace(), mask_data_host, - max_data_size * sizeof(float)); - mask_data_tables[index] = mask_data_table; - free(mask_data_host); + std::random_device rnd; + // int seed = (context.Attr("fix_seed")) ? + // int(context.Attr("seed")) : (rnd()); + int seed = 0; + if (context.Attr("fix_seed") == true) { + seed = static_cast(context.Attr("seed")); } else { - mask_data_table = mask_data_tables[index]; + seed = rnd(); } - } - if (!context.Attr("is_test")) { // Train + auto* mask = context.Output("Mask"); auto* mask_data = mask->mutable_data(context.GetPlace()); - size_t size = framework::product(mask->dims()); - auto& dev_ctx = context.template device_context(); - int r = xpu::dropout(dev_ctx.x_context(), mask_data_table, x_data, - mask_data, y_data, max_data_size, size); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External( - "XPU dropout return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); - } else { // Infer - float scale = 0.0f; - if (dropout_implementation == "upscale_in_train") { - scale = 1.0f; - } else { - scale = static_cast(1.0f - dropout_prob); + // Special case when dropout_prob is 1.0 + if (dropout_prob == 1.0f) { + int r = xpu::constant(dev_ctx.x_context(), + reinterpret_cast(y_data), y->numel(), + XPUTyp(0)); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(constant) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + r = xpu::constant(dev_ctx.x_context(), + reinterpret_cast(mask_data), mask->numel(), + XPUTyp(0)); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(constant) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + return; } - auto& dev_ctx = context.template device_context(); - int r = xpu::scale(dev_ctx.x_context(), x->numel(), scale, 0.0f, 0, - x_data, y_data); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External( - "XPU dropout return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); + int r = xpu::dropout(dev_ctx.x_context(), + reinterpret_cast(x->data()), + reinterpret_cast(y->data()), + reinterpret_cast(mask_data), seed, + mask->numel(), is_upscale, dropout_prob); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(dropout) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + } else { + float scale = + (is_upscale) ? (1.0) : (static_cast(1.0f - dropout_prob)); + int r = xpu::scale( + dev_ctx.x_context(), reinterpret_cast(x_data), + reinterpret_cast(y_data), x->numel(), false, scale, 0.0f); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(scale) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); } } }; template class DropoutGradXPUKernel : public framework::OpKernel { + using XPUTyp = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { PADDLE_ENFORCE_EQ(!context.Attr("is_test"), true, @@ -127,23 +104,47 @@ class DropoutGradXPUKernel : public framework::OpKernel { auto* mask = context.Input("Mask"); grad_x->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - int r = xpu::elementwise_mul(dev_ctx.x_context(), grad_y->data(), - mask->data(), grad_x->data(), - grad_y->numel()); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External( - "XPU dropout return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); + auto& dropout_implementation = + context.Attr("dropout_implementation"); + float dropout_prob = context.Attr("dropout_prob"); + const T* mask_data = mask->data(); + framework::Tensor mask_new; + if (dropout_implementation == "upscale_in_train") { + mask_new = context.AllocateTmpTensor( + mask->dims(), dev_ctx); + float scale = + (dropout_prob == 1.0f) ? (1.0f) : (1.0f / (1.0f - dropout_prob)); + int r = xpu::scale(dev_ctx.x_context(), + reinterpret_cast(mask->data()), + reinterpret_cast(mask_new.data()), + mask->numel(), false, scale, 0.0f); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(scale) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + mask_data = mask_new.data(); + } + + int r = xpu::mul( + dev_ctx.x_context(), reinterpret_cast(grad_y->data()), + reinterpret_cast(mask_data), + reinterpret_cast(grad_x->data()), grad_y->numel()); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External("XPU API(mul) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); } }; } // namespace operators } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_XPU_KERNEL( - dropout, ops::DropoutXPUKernel); + dropout, ops::DropoutXPUKernel, + ops::DropoutXPUKernel); REGISTER_OP_XPU_KERNEL( dropout_grad, - ops::DropoutGradXPUKernel); + ops::DropoutGradXPUKernel, + ops::DropoutGradXPUKernel); #endif diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc index 8b902acebb4c5d4a8f739c9fe0e5a6f40c31ee9f..2e902bd277b1e4d016d0c3190579c409c8d361f3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc @@ -122,33 +122,50 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel { axis)); std::vector x_dims_vec(max_dim, 1); std::vector y_dims_vec(max_dim, 1); + int x_len = 1; + int y_len = 1; if (x_dims.size() == max_dim) { for (int i = 0; i < max_dim; i++) { x_dims_vec[i] = x_dims[i]; + x_len *= x_dims_vec[i]; } } else { for (int i = 0; i < x_dims.size(); i++) { x_dims_vec[i + axis] = x_dims[i]; + x_len *= x_dims_vec[i]; } } if (y_dims.size() == max_dim) { for (int i = 0; i < max_dim; i++) { y_dims_vec[i] = y_dims[i]; + y_len *= y_dims_vec[i]; } } else { for (int i = 0; i < y_dims.size(); i++) { y_dims_vec[i + axis] = y_dims[i]; + y_len *= y_dims_vec[i]; } } const T* dz_data = dz->data(); + framework::Tensor dx_local_tensor; + framework::Tensor dy_local_tensor; + bool need_wait = false; T* dx_data = nullptr; T* dy_data = nullptr; if (dx) { dx_data = dx->mutable_data(ctx.GetPlace()); + } else { + dx_data = + dx_local_tensor.mutable_data(ctx.GetPlace(), x_len * sizeof(T)); + need_wait = true; } if (dy) { dy_data = dy->mutable_data(ctx.GetPlace()); + } else { + dy_data = + dy_local_tensor.mutable_data(ctx.GetPlace(), y_len * sizeof(T)); + need_wait = true; } auto& dev_ctx = @@ -161,6 +178,9 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel { platform::errors::External( "XPU kernel Elementwise occur error in XPUElementwise error code ", ret, XPUAPIErrorMsg[ret])); + if (need_wait && dev_ctx.x_context()->xpu_stream) { + dev_ctx.Wait(); + } } }; diff --git a/python/paddle/fluid/tests/unittests/xpu/test_amp_check_finite_and_scale_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_amp_check_finite_and_scale_op_xpu.py new file mode 100644 index 0000000000000000000000000000000000000000..9a2976f82a46019bcafb40c35265ce5a936ff67e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_amp_check_finite_and_scale_op_xpu.py @@ -0,0 +1,99 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +sys.path.append("..") +import paddle +import unittest +import numpy as np +from op_test_xpu import XPUOpTest +from op_test import OpTest, skip_check_grad_ci +import paddle.fluid as fluid +paddle.enable_static() + + +class TestCheckFiniteAndUnscaleOp(XPUOpTest): + def setUp(self): + self.op_type = "check_finite_and_unscale" + self.init_dtype() + x = np.random.random((1024, 1024)).astype(self.dtype) + scale = np.random.random((1)).astype(self.dtype) + # self.attrs = {'stop_gradient': True} + self.inputs = {'X': [('x0', x)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([0]), + 'Out': [('out0', x / scale)], + } + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + +# class TestCheckFiniteAndUnscaleOpWithNan(XPUOpTest): +# def setUp(self): +# self.op_type = "check_finite_and_unscale" +# self.init_dtype() +# x = np.random.random((1024, 1024)).astype(self.dtype) +# x[128][128] = np.nan +# print("x shape = ", x.shape) +# print(x) +# scale = np.random.random((1)).astype(self.dtype) + +# self.inputs = {'X': [('x0', x)], 'Scale': scale} +# self.outputs = { +# 'FoundInfinite': np.array([1]), +# 'Out': [('out0', x)], +# } + +# def init_dtype(self): +# self.dtype = np.float32 + +# def test_check_output(self): +# # When input contains nan, do not check the output, +# # since the output may be nondeterministic and will be discarded. +# if paddle.is_compiled_with_xpu(): +# place = paddle.XPUPlace(0) +# self.check_output_with_place(place, no_check_set=['Out']) + +# class TestCheckFiniteAndUnscaleOpWithInf(XPUOpTest): +# def setUp(self): +# self.op_type = "check_finite_and_unscale" +# self.init_dtype() +# x = np.random.random((1024, 1024)).astype(self.dtype) +# x[128][128] = np.inf +# scale = np.random.random((1)).astype(self.dtype) + +# self.inputs = {'X': [('x0', x)], 'Scale': scale} +# self.outputs = { +# 'FoundInfinite': np.array([1]), +# 'Out': [('out0', x)], +# } + +# def init_dtype(self): +# self.dtype = np.float32 + +# def test_check_output(self): +# # When input contains inf, do not check the output, +# # since the output may be nondeterministic and will be discarded. +# if paddle.is_compiled_with_xpu(): +# place = paddle.XPUPlace(0) +# self.check_output_with_place(place, no_check_set=['Out']) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py index 6c3368c3b6bfc4f8105107df06fd0aa38c18a2db..ca3b3a418abf6c968612fcca7997960f01945ce9 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py @@ -22,9 +22,11 @@ from op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid from paddle.fluid import Program, program_guard +from op_test_xpu import XPUOpTest +paddle.enable_static() -class TestDropoutOp(OpTest): +class TestDropoutOp(XPUOpTest): def setUp(self): self.op_type = "dropout" self.inputs = {'X': np.random.random((32, 64)).astype("float32")} @@ -47,7 +49,7 @@ class TestDropoutOp(OpTest): self.check_grad_with_place(place, ['X'], 'Out') -class TestDropoutOpInput1d(OpTest): +class TestDropoutOpInput1d(XPUOpTest): def setUp(self): self.op_type = "dropout" self.inputs = {'X': np.random.random((2000, )).astype("float32")} diff --git a/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py new file mode 100644 index 0000000000000000000000000000000000000000..33b13081b54420841a521afd7573c0cb8788ecb6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py @@ -0,0 +1,245 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import sys +sys.path.append("..") +import numpy as np +from op_test import OpTest +from op_test_xpu import XPUOpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.contrib.mixed_precision.amp_nn as amp_nn + +paddle.enable_static() + + +class TestUpdateLossScalingOp(XPUOpTest): + def setUp(self): + self.op_type = "update_loss_scaling" + self.init() + found_inf = np.array([False], dtype=np.bool) + x = np.random.random((1024, 1024)).astype(self.dtype) + + self.inputs = { + 'X': [('x0', x)], + 'FoundInfinite': found_inf, + 'PrevLossScaling': self.prev_loss_scaling, + 'InGoodSteps': self.num_good_steps, + 'InBadSteps': self.num_bad_steps + } + + self.outputs = { + 'Out': [('out0', x)], + 'LossScaling': self.prev_loss_scaling * self.incr_ratio, + 'OutGoodSteps': self.zero_steps, + 'OutBadSteps': self.zero_steps + } + + def init(self): + self.incr_ratio = 2.0 + self.decr_ratio = 0.8 + self.dtype = np.float32 + self.prev_loss_scaling = np.array([2048]).astype(self.dtype) + self.num_good_steps = np.array([999], dtype=np.int32) + self.num_bad_steps = np.array([1], dtype=np.int32) + self.zero_steps = np.array([0], dtype=np.int32) + self.attrs = { + 'incr_every_n_steps': 1000, + 'decr_every_n_nan_or_inf': 2, + 'incr_ratio': self.incr_ratio, + 'decr_ratio': self.decr_ratio, + } + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, no_check_set=['Out']) + + +class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp): + def setUp(self): + self.op_type = "update_loss_scaling" + self.init() + found_inf = np.array([True], dtype=np.bool) + x = np.random.random((1024, 1024)).astype(self.dtype) + i = np.random.randint(0, 1024, 1) + j = np.random.randint(0, 1024, 1) + x[i[0]][j[0]] = np.inf + + self.inputs = { + 'X': [('x0', x)], + 'FoundInfinite': found_inf, + 'PrevLossScaling': self.prev_loss_scaling, + 'InGoodSteps': self.num_good_steps, + 'InBadSteps': self.num_bad_steps + } + + self.outputs = { + 'Out': [('out0', np.zeros_like(x))], + 'LossScaling': self.prev_loss_scaling * self.decr_ratio, + 'OutGoodSteps': self.zero_steps, + 'OutBadSteps': self.zero_steps + } + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + #self.check_output() + + +class TestUpdateLossScalingLayer(unittest.TestCase): + def loss_scaling_check(self, scope=fluid.Scope()): + a = fluid.data(name="a", shape=[1024, 1024], dtype='float32') + b = fluid.data(name="b", shape=[512, 128], dtype='float32') + x = [a, b] + found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool') + prev_loss_scaling = fluid.data( + name="prev_loss_scaling", shape=[1], dtype='float32') + num_good_steps = fluid.data( + name="num_good_steps", shape=[1], dtype='int32') + num_bad_steps = fluid.data( + name="num_bad_steps", shape=[1], dtype='int32') + + a_v = np.random.random([1024, 1024]).astype('float32') + b_v = np.random.random([512, 128]).astype('float32') + found_inf_v = np.array([False]).astype('bool') + prev_loss_scaling_v = np.array([2048]).astype('float32') + num_good_steps_v = np.array([999], dtype=np.int32) + num_bad_steps_v = np.array([1], dtype=np.int32) + + incr_every_n_steps = 1000 + decr_every_n_nan_or_inf = 2 + incr_ratio = 2 + decr_ratio = 0.8 + + result = amp_nn.update_loss_scaling( + x, + found_inf, + prev_loss_scaling, + num_good_steps, + num_bad_steps, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + name="update_loss_scaling") + + place = fluid.XPUPlace(0) + exe = fluid.Executor(place) + with fluid.scope_guard(scope): + exe.run(fluid.default_startup_program()) + result_v = exe.run(feed={ + 'a': a_v, + 'b': b_v, + 'found_inf': found_inf_v, + 'prev_loss_scaling': prev_loss_scaling_v, + 'num_good_steps': num_good_steps_v, + 'num_bad_steps': num_bad_steps_v + }, + fetch_list=[ + result, x, found_inf, prev_loss_scaling, + num_good_steps, num_bad_steps + ]) + assert np.array_equal(result_v[0], a_v) + assert np.array_equal(result_v[1], b_v) + assert np.array_equal(result_v[0], result_v[2]) + assert np.array_equal(result_v[1], result_v[3]) + assert np.array_equal(result_v[4], found_inf_v) + assert np.array_equal(result_v[5], prev_loss_scaling_v * incr_ratio) + assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v)) + assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v)) + + def loss_scaling_check_inf(self, use_cuda=True, scope=fluid.Scope()): + a = fluid.data(name="a", shape=[1024, 1024], dtype='float32') + b = fluid.data(name="b", shape=[512, 128], dtype='float32') + x = [a, b] + found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool') + prev_loss_scaling = fluid.data( + name="prev_loss_scaling", shape=[1], dtype='float32') + num_good_steps = fluid.data( + name="num_good_steps", shape=[1], dtype='int32') + num_bad_steps = fluid.data( + name="num_bad_steps", shape=[1], dtype='int32') + + a_v = np.random.random([1024, 1024]).astype('float32') + b_v = np.random.random([512, 128]).astype('float32') + i = np.random.randint(0, 1024, 1) + j = np.random.randint(0, 1024, 1) + a_v[i[0]][j[0]] = np.inf + found_inf_v = np.array([True]).astype('bool') + prev_loss_scaling_v = np.array([2048]).astype('float32') + num_good_steps_v = np.array([999], dtype=np.int32) + num_bad_steps_v = np.array([1], dtype=np.int32) + + incr_every_n_steps = 1000 + decr_every_n_nan_or_inf = 2 + incr_ratio = 2 + decr_ratio = 0.8 + + result = amp_nn.update_loss_scaling( + x, + found_inf, + prev_loss_scaling, + num_good_steps, + num_bad_steps, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + name="update_loss_scaling") + + place = fluid.XPUPlace(0) + exe = fluid.Executor(place) + with fluid.scope_guard(scope): + exe.run(fluid.default_startup_program()) + result_v = exe.run(feed={ + 'a': a_v, + 'b': b_v, + 'found_inf': found_inf_v, + 'prev_loss_scaling': prev_loss_scaling_v, + 'num_good_steps': num_good_steps_v, + 'num_bad_steps': num_bad_steps_v + }, + fetch_list=[ + result, x, found_inf, prev_loss_scaling, + num_good_steps, num_bad_steps + ]) + assert np.array_equal(result_v[0], np.zeros_like(a_v)) + assert np.array_equal(result_v[1], np.zeros_like(b_v)) + assert np.array_equal(result_v[2], np.zeros_like(a_v)) + assert np.array_equal(result_v[3], np.zeros_like(b_v)) + assert np.array_equal(result_v[4], found_inf_v) + assert np.array_equal(result_v[5], prev_loss_scaling_v * decr_ratio) + assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v)) + assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v)) + + def test_loss_scaling(self): + main = fluid.Program() + startup = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + self.loss_scaling_check() + + def test_loss_scaling_inf(self): + main = fluid.Program() + startup = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + self.loss_scaling_check_inf() + + +if __name__ == '__main__': + unittest.main()