diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index d5ccf1297922f5dfb08993aa37200db194be9a71..2c7f28b3a522311244f54df589f11c22c40fb8ba 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -9,7 +9,7 @@ SET(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220425") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220510") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() @@ -17,7 +17,7 @@ endif() # ubuntu and centos: use output by XDNN API team if(NOT DEFINED XPU_XDNN_BASE_URL) SET(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220425") + SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220510") else() SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 14b5662b24aeb9d5a608b3a4da371248be45f6f6..c4ea6a3c6bc669dc2c5154b9a74fa0fe745269db 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -18,6 +18,10 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/platform/transform.h" +#if defined(PADDLE_WITH_XPU) +#include "paddle/fluid/platform/device/device_wrapper.h" +#endif + namespace paddle { namespace framework { @@ -28,6 +32,49 @@ struct CastDataTypeFunctor { } }; +#if defined(PADDLE_WITH_XPU) + +template +static void XPUCastData(const framework::Tensor& in, framework::Tensor* out, + const platform::XPUDeviceContext* dev_ctx) { + using XPUInTDType = typename XPUTypeTrait::Type; + using XPUOutTDType = typename XPUTypeTrait::Type; + int r = xpu::cast_v2( + dev_ctx->x_context(), + reinterpret_cast(in.data()), + reinterpret_cast(out->mutable_data(in.place())), + in.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + dev_ctx->Wait(); +} + +template +static void XPUTransDataType( + const framework::Tensor& in, framework::Tensor* out, + const paddle::framework::proto::VarType::Type& dst_type, + const platform::DeviceContext* ctx) { + auto* context = static_cast(ctx); + +#define XPUCastCallback(cpp_type, proto_type) \ + do { \ + if (dst_type == proto_type) { \ + XPUCastData(in, out, context); \ + } \ + } while (0) + + if (dst_type == proto::VarType::FP32 || dst_type == proto::VarType::FP16 || + dst_type == proto::VarType::BOOL || dst_type == proto::VarType::INT16 || + dst_type == proto::VarType::INT32 || dst_type == proto::VarType::INT64) { + _ForEachDataType_(XPUCastCallback); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported in XPU when casting data type.", + DataTypeToString(dst_type))); + } +} + +#endif + template struct CastDataType { CastDataType(const framework::Tensor& in, framework::Tensor* out, @@ -88,6 +135,34 @@ void TransDataType(const Tensor& in, auto dst_type = type; auto ctx = pool.Get(in.place()); +#if defined(PADDLE_WITH_XPU) + switch (src_type) { + case proto::VarType::FP16: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::FP32: + 
XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::BOOL: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT16: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT32: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT64: + XPUTransDataType(in, out, dst_type, ctx); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported in XPU when casting data type.", + DataTypeToString(src_type))); + } + +#else + switch (src_type) { case proto::VarType::FP16: framework::VisitDataType(dst_type, @@ -123,6 +198,7 @@ void TransDataType(const Tensor& in, "Data type (%s) is not supported when casting data type.", DataTypeToString(src_type))); } +#endif } void TransComplexToReal(const proto::VarType::Type& dst_type, @@ -131,7 +207,6 @@ void TransComplexToReal(const proto::VarType::Type& dst_type, auto& pool = platform::DeviceContextPool::Instance(); auto* ctx = pool.Get(in.place()); out->Resize(in.dims()); - // complex -> real switch (src_type) { case proto::VarType::COMPLEX64: diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc index aa5fdd86745d6932052347f3dc11b14e3d447d20..ead6f94417b6ea0353fb42c08f239eeca38c6196 100644 --- a/paddle/fluid/operators/log_loss_op_xpu.cc +++ b/paddle/fluid/operators/log_loss_op_xpu.cc @@ -21,58 +21,67 @@ template class LogLossXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* predict = ctx.Input("Predicted"); - auto* labels = ctx.Input("Labels"); - auto* loss = ctx.Output("Loss"); - auto epsilon = static_cast(ctx.Attr("epsilon")); - loss->mutable_data(ctx.GetPlace()); - int n = predict->numel(); - auto& dev_ctx = ctx.template device_context(); - int r = - xpu::log_loss_fwd(dev_ctx.x_context(), n, epsilon, predict->data(), - labels->data(), loss->data()); - PADDLE_ENFORCE_EQ( - r, 
xpu::Error_t::SUCCESS, - platform::errors::External( - "XPU log_loss kernel return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); + /*** TODO wait XDNN new interface + auto* predict = ctx.Input("Predicted"); + auto* labels = ctx.Input("Labels"); + auto* loss = ctx.Output("Loss"); + auto epsilon = static_cast(ctx.Attr("epsilon")); + loss->mutable_data(ctx.GetPlace()); + int n = predict->numel(); + auto& dev_ctx = ctx.template device_context(); + int r = + xpu::log_loss_fwd(dev_ctx.x_context(), n, epsilon, + predict->data(), + labels->data(), loss->data()); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU log_loss kernel return wrong value[%d], please check + whether " + "Baidu Kunlun Card is properly installed.", + r)); + ***/ } }; template class LogLossGradXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* predict = ctx.Input("Predicted"); - auto* labels = ctx.Input("Labels"); - auto* dloss = ctx.Input(framework::GradVarName("Loss")); - auto* dpred = ctx.Output(framework::GradVarName("Predicted")); - if (!dpred) { - return; - } - auto epsilon = static_cast(ctx.Attr("epsilon")); - dpred->mutable_data(ctx.GetPlace()); - int n = predict->numel(); - auto& dev_ctx = ctx.template device_context(); - int r = xpu::log_loss_bwd(dev_ctx.x_context(), n, epsilon, - predict->data(), labels->data(), - dloss->data(), dpred->data()); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External( - "XPU log_loss kernel return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); + /*** TODO wait XDNN new interface + + auto* predict = ctx.Input("Predicted"); + auto* labels = ctx.Input("Labels"); + auto* dloss = ctx.Input(framework::GradVarName("Loss")); + auto* dpred = ctx.Output(framework::GradVarName("Predicted")); + if (!dpred) { + return; + } + auto epsilon = 
static_cast(ctx.Attr("epsilon")); + dpred->mutable_data(ctx.GetPlace()); + int n = predict->numel(); + auto& dev_ctx = ctx.template device_context(); + int r = xpu::log_loss_bwd(dev_ctx.x_context(), n, epsilon, + predict->data(), labels->data(), + dloss->data(), dpred->data()); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU log_loss kernel return wrong value[%d], please check + whether " + "Baidu Kunlun Card is properly installed.", + r)); + ***/ } }; } // namespace operators } // namespace paddle -namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - log_loss, ops::LogLossXPUKernel); -REGISTER_OP_XPU_KERNEL( - log_loss_grad, - ops::LogLossGradXPUKernel); +// namespace ops = paddle::operators; +// REGISTER_OP_XPU_KERNEL( +// log_loss, ops::LogLossXPUKernel); +// REGISTER_OP_XPU_KERNEL( +// log_loss_grad, +// ops::LogLossGradXPUKernel); #endif diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index 3cc1be4de8a82ff263824ab4852178f735596d45..82e4b90468a38c5b539fda9cb6f911c5080d1297 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { @@ -42,68 +42,26 @@ class AccuracyXPUKernel : public framework::OpKernel { if (num_samples == 0) { return; } - size_t indices_int32_size = num_samples * class_dim * sizeof(int); - size_t indices_int64_size = num_samples * class_dim * sizeof(int64_t); - size_t label_int32_size = num_samples * sizeof(int); - size_t label_int64_size = num_samples * sizeof(int64_t); auto& dev_ctx = ctx.template device_context(); - int* indices_int32_device = NULL; - PADDLE_ENFORCE_EQ( - xpu_malloc(reinterpret_cast(&indices_int32_device), - indices_int32_size), - XPU_SUCCESS, - platform::errors::ResourceExhausted( - "\n\nOut of memory error on XPU, Cannot allocate %s memory" - " on XPU. \n\nPlease check whether there is any other process " - "using XPU.\n", - string::HumanReadableSize(indices_int32_size))); - int* label_int32_device = NULL; - PADDLE_ENFORCE_EQ( - xpu_malloc(reinterpret_cast(&label_int32_device), - label_int32_size), - XPU_SUCCESS, - platform::errors::ResourceExhausted( - "\n\nOut of memory error on XPU, Cannot allocate %s memory" - " on XPU. 
\n\nPlease check whether there is any other process " - "using XPU.\n", - string::HumanReadableSize(label_int32_size))); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int size = num_samples * class_dim; + int* indices_int32_ptr = RAII_GUARD.alloc_l3_or_gm(size); + PADDLE_ENFORCE_XDNN_NOT_NULL(indices_int32_ptr); + int* label_int32_ptr = RAII_GUARD.alloc_l3_or_gm(num_samples); + PADDLE_ENFORCE_XDNN_NOT_NULL(label_int32_ptr); - int* indices_int32_host = - reinterpret_cast(std::malloc(indices_int32_size)); - int64_t* indices_int64_host = - reinterpret_cast(std::malloc(indices_int64_size)); - int* label_int32_host = - reinterpret_cast(std::malloc(label_int32_size)); - int64_t* label_int64_host = - reinterpret_cast(std::malloc(label_int64_size)); - dev_ctx.Wait(); - memory::Copy(platform::CPUPlace(), indices_int64_host, ctx.GetPlace(), - indices_data, indices_int64_size); - memory::Copy(platform::CPUPlace(), label_int64_host, ctx.GetPlace(), - label_data, label_int64_size); - for (size_t i = 0; i < num_samples; ++i) { - label_int32_host[i] = label_int64_host[i]; - for (size_t j = 0; j < class_dim; ++j) { - indices_int32_host[i * class_dim + j] = - indices_int64_host[i * class_dim + j]; - } - } - memory::Copy(ctx.GetPlace(), indices_int32_device, platform::CPUPlace(), - indices_int32_host, indices_int32_size); - memory::Copy(ctx.GetPlace(), label_int32_device, platform::CPUPlace(), - label_int32_host, label_int32_size); - int r = xpu::accuracy(dev_ctx.x_context(), indices_int32_device, - label_int32_device, num_samples, class_dim, - correct_data, total_data, accuracy_data); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Fatal("XPU accuracy kernel error!")); - dev_ctx.Wait(); - xpu_free(indices_int32_device); - xpu_free(label_int32_device); - std::free(indices_int32_host); - std::free(indices_int64_host); - std::free(label_int32_host); - std::free(label_int64_host); + int r = xpu::cast_v2(dev_ctx.x_context(), indices_data, + indices_int32_ptr, size); + 
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + + r = xpu::cast_v2(dev_ctx.x_context(), label_data, + label_int32_ptr, num_samples); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + + r = xpu::accuracy(dev_ctx.x_context(), indices_int32_ptr, label_int32_ptr, + num_samples, class_dim, correct_data, total_data, + accuracy_data); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "accuracy"); } }; diff --git a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc index e7cbe4aa8dd4b36983dca5413ccdcb8ceac63a3c..643f70b260206c786ce7c6782ab9abd2f76a6de5 100644 --- a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc @@ -25,101 +25,111 @@ template class LambOpXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using paddle::framework::LoDTensor; - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); + /*** TODO wait XDNN new interface + using paddle::framework::LoDTensor; + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); - using paddle::framework::LoDTensor; + using paddle::framework::LoDTensor; - // inputs - T epsilon = static_cast(ctx.Attr("epsilon")); - T weight_decay = static_cast(ctx.Attr("weight_decay")); - T beta1 = static_cast(ctx.Attr("beta1")); - T beta2 = static_cast(ctx.Attr("beta2")); - auto& param = GET_DATA_SAFELY(ctx.Input("Param"), "Input", - "Param", "Lamb"); - auto* grad_var = ctx.InputVar("Grad"); - auto& mom1 = GET_DATA_SAFELY(ctx.Input("Moment1"), "Input", 
- "Moment1", "Lamb"); - auto& mom2 = GET_DATA_SAFELY(ctx.Input("Moment2"), "Input", - "Moment2", "Lamb"); - auto& lr = GET_DATA_SAFELY(ctx.Input("LearningRate"), "Input", - "LearningRate", "Lamb"); + // inputs + T epsilon = static_cast(ctx.Attr("epsilon")); + T weight_decay = static_cast(ctx.Attr("weight_decay")); + T beta1 = static_cast(ctx.Attr("beta1")); + T beta2 = static_cast(ctx.Attr("beta2")); + auto& param = GET_DATA_SAFELY(ctx.Input("Param"), "Input", + "Param", "Lamb"); + auto* grad_var = ctx.InputVar("Grad"); + auto& mom1 = GET_DATA_SAFELY(ctx.Input("Moment1"), "Input", + "Moment1", "Lamb"); + auto& mom2 = GET_DATA_SAFELY(ctx.Input("Moment2"), "Input", + "Moment2", "Lamb"); + auto& lr = GET_DATA_SAFELY(ctx.Input("LearningRate"), + "Input", + "LearningRate", "Lamb"); - auto& beta1_pow = GET_DATA_SAFELY(ctx.Input("Beta1Pow"), "Input", - "Beta1Pow", "Lamb"); - auto& beta2_pow = GET_DATA_SAFELY(ctx.Input("Beta2Pow"), "Input", - "Beta2Pow", "Lamb"); + auto& beta1_pow = GET_DATA_SAFELY(ctx.Input("Beta1Pow"), + "Input", + "Beta1Pow", "Lamb"); + auto& beta2_pow = GET_DATA_SAFELY(ctx.Input("Beta2Pow"), + "Input", + "Beta2Pow", "Lamb"); - auto& param_out = GET_DATA_SAFELY(ctx.Output("ParamOut"), - "Output", "ParamOut", "Lamb"); - auto& mom1_out = GET_DATA_SAFELY(ctx.Output("Moment1Out"), - "Output", "Moment1Out", "Lamb"); - auto& mom2_out = GET_DATA_SAFELY(ctx.Output("Moment2Out"), - "Output", "Moment2Out", "Lamb"); - auto& beta1_pow_out = GET_DATA_SAFELY(ctx.Output("Beta1PowOut"), - "Output", "Beta1PowOut", "Lamb"); - auto& beta2_pow_out = GET_DATA_SAFELY(ctx.Output("Beta2PowOut"), - "Output", "Beta2PowOut", "Lamb"); - auto& dev_ctx = ctx.template device_context(); + auto& param_out = GET_DATA_SAFELY(ctx.Output("ParamOut"), + "Output", "ParamOut", "Lamb"); + auto& mom1_out = GET_DATA_SAFELY(ctx.Output("Moment1Out"), + "Output", "Moment1Out", "Lamb"); + auto& mom2_out = GET_DATA_SAFELY(ctx.Output("Moment2Out"), + "Output", "Moment2Out", "Lamb"); + auto& 
beta1_pow_out = + GET_DATA_SAFELY(ctx.Output("Beta1PowOut"), + "Output", "Beta1PowOut", "Lamb"); + auto& beta2_pow_out = + GET_DATA_SAFELY(ctx.Output("Beta2PowOut"), + "Output", "Beta2PowOut", "Lamb"); + auto& dev_ctx = ctx.template device_context(); - if (grad_var->IsType()) { - auto& grad = *ctx.Input("Grad"); - int r = xpu::lamb(dev_ctx.x_context(), grad.template data(), - mom1.template data(), mom2.template data(), - param.template data(), beta1_pow.template data(), - beta2_pow.template data(), beta1, beta2, epsilon, - weight_decay, lr.template data(), - mom1_out.template mutable_data(ctx.GetPlace()), - mom2_out.template mutable_data(ctx.GetPlace()), - param_out.template mutable_data(ctx.GetPlace()), - beta1_pow_out.template mutable_data(ctx.GetPlace()), - beta2_pow_out.template mutable_data(ctx.GetPlace()), - param.numel()); + if (grad_var->IsType()) { + auto& grad = *ctx.Input("Grad"); + int r = xpu::lamb(dev_ctx.x_context(), grad.template data(), + mom1.template data(), mom2.template data(), + param.template data(), beta1_pow.template + data(), + beta2_pow.template data(), beta1, beta2, epsilon, + weight_decay, lr.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2_out.template mutable_data(ctx.GetPlace()), + param_out.template mutable_data(ctx.GetPlace()), + beta1_pow_out.template + mutable_data(ctx.GetPlace()), + beta2_pow_out.template + mutable_data(ctx.GetPlace()), + param.numel()); - if (r == xpu::Error_t::INVALID_PARAM) { - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::InvalidArgument( - "XPU kernel error of LambOp, error message: INVALID_PARAM, " - "please check your input & output.")); - } else if (r == xpu::Error_t::RUNTIME_ERROR) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Unavailable( - "XPU kernel error of LambOp, error message: " - "RUNTIME_ERROR, please check whether Baidu " - "Kunlun Card is properly installed.")); - } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { - 
PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of LambOp, error " - "message: NO_ENOUGH_WORKSPACE, XPU " - "has no enough memory.")); - } else { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of LambOp, error " - "message: OTHER " - "XPU API returns error code: %d.", - r)); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Variable type not supported by lamb_op. Expect LoDTensor, " - "but got %s", - framework::ToTypeName(param_var->Type()))); - } + if (r == xpu::Error_t::INVALID_PARAM) { + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::InvalidArgument( + "XPU kernel error of LambOp, error message: INVALID_PARAM, " + "please check your input & output.")); + } else if (r == xpu::Error_t::RUNTIME_ERROR) { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Unavailable( + "XPU kernel error of LambOp, error message: " + "RUNTIME_ERROR, please check whether Baidu " + "Kunlun Card is properly installed.")); + } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::ResourceExhausted( + "XPU kernel error of LambOp, error " + "message: NO_ENOUGH_WORKSPACE, XPU " + "has no enough memory.")); + } else { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::ResourceExhausted( + "XPU kernel error of LambOp, error " + "message: OTHER " + "XPU API returns error code: %d.", + r)); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Variable type not supported by lamb_op. 
Expect LoDTensor, " + "but got %s", + framework::ToTypeName(param_var->Type()))); + } + **/ } }; } // namespace operators } // namespace paddle -namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - lamb, ops::LambOpXPUKernel); +// namespace ops = paddle::operators; +// REGISTER_OP_XPU_KERNEL( +// lamb, ops::LambOpXPUKernel); #endif diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc index 85c2d42c841f020e44994546ea3dafb86de0c8f8..873056c7f67fe12aa285d2280072df82e90e8e31 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc @@ -40,113 +40,122 @@ template class RmspropOpXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using paddle::framework::LoDTensor; - - // check Param & Grad tensor type - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "Tensor holds the wrong type,Expected Var(%s)'s " - "type is LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - - const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - platform::errors::InvalidArgument( - "Tensor holds the wrong type,Expected Var(%s)'s " - "type is LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type()))); - - // inputs - auto& param = GET_DATA_SAFELY(ctx.Input("Param"), "Input", - "Param", "Rmsprop"); - auto& meanSquare = GET_DATA_SAFELY(ctx.Input("MeanSquare"), - "Input", "MeanSquare", "Rmsprop"); - auto& grad = GET_DATA_SAFELY(ctx.Input("Grad"), "Input", "Grad", - "Rmsprop"); - auto& mom = GET_DATA_SAFELY(ctx.Input("Moment"), "Input", - "Moment", "Rmsprop"); - - auto* learning_rate = ctx.Input("LearningRate"); - 
PADDLE_ENFORCE_EQ(learning_rate->dims().size(), 1, - platform::errors::InvalidArgument( - "learining rate should have dimension = 1." - " But received learning rate dim [%s] ", - learning_rate->dims().size())); - T lr = static_cast(GetAttrFromTensor(learning_rate)); - - // constants - T epsilon = static_cast(ctx.Attr("epsilon")); - T decay = static_cast(ctx.Attr("decay")); - T momentum = static_cast(ctx.Attr("momentum")); - - // outputs - auto& param_out = GET_DATA_SAFELY(ctx.Output("ParamOut"), - "Output", "ParamOut", "Rmsprop"); - auto& mom_out = GET_DATA_SAFELY(ctx.Output("MomentOut"), - "Output", "MomentOut", "Rmsprop"); - auto& mom_sqrt_out = GET_DATA_SAFELY(ctx.Output("MeanSquareOut"), - "Output", "MeanSquareOut", "Rmsprop"); - auto& dev_ctx = ctx.template device_context(); - - ///// rmsprop优化算法 - /// - /// ms_out[i] = rho * ms[i] + (1 - rho) * (g[i] * g[i]); - /// - /// mom_out[i] = momentum * mom[i] + lr * - /// (g[i] / ((float)sqrt(ms_out[i] + epsilon))); - /// - /// p_out[i] = p[i] - mom_out[i]; - /// DLL_EXPORT int rmsprop(Context* ctx, const float* p, - /// const float* ms, const float* g, const float* mom, - /// float epsilon, float rho, float momentum, float lr, - /// float *ms_out, float *mom_out, float *p_out, int n) - int r = xpu::rmsprop(dev_ctx.x_context(), param.template data(), - meanSquare.template data(), grad.template data(), - mom.template data(), epsilon, decay, momentum, lr, - mom_sqrt_out.template mutable_data(ctx.GetPlace()), - mom_out.template mutable_data(ctx.GetPlace()), - param_out.template mutable_data(ctx.GetPlace()), - param.numel()); - - if (r == xpu::Error_t::INVALID_PARAM) { - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::InvalidArgument( - "XPU kernel error of RmspropOp, error message: INVALID_PARAM, " - "please check your input & output.")); - } else if (r == xpu::Error_t::RUNTIME_ERROR) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Unavailable( - "XPU kernel error of RmspropOp, 
error message: " - "RUNTIME_ERROR, please check whether Baidu " - "Kunlun Card is properly installed.")); - } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of RmspropOp, error " - "message: NO_ENOUGH_WORKSPACE, XPU " - "has no enough memory.")); - } else { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of RmspropOp, error " - "message: OTHER " - "XPU API returns error code: %d.", - r)); - } + /*** TODO wait XDNN new interface + using paddle::framework::LoDTensor; + + // check Param & Grad tensor type + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "Tensor holds the wrong type,Expected Var(%s)'s " + "type is LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + + const auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + platform::errors::InvalidArgument( + "Tensor holds the wrong type,Expected Var(%s)'s " + "type is LoDTensor, " + "but the received is %s", + ctx.InputNames("Grad").front(), + framework::ToTypeName(grad_var->Type()))); + + // inputs + auto& param = GET_DATA_SAFELY(ctx.Input("Param"), "Input", + "Param", "Rmsprop"); + auto& meanSquare = GET_DATA_SAFELY(ctx.Input("MeanSquare"), + "Input", "MeanSquare", "Rmsprop"); + auto& grad = GET_DATA_SAFELY(ctx.Input("Grad"), "Input", + "Grad", + "Rmsprop"); + auto& mom = GET_DATA_SAFELY(ctx.Input("Moment"), "Input", + "Moment", "Rmsprop"); + + auto* learning_rate = ctx.Input("LearningRate"); + PADDLE_ENFORCE_EQ(learning_rate->dims().size(), 1, + platform::errors::InvalidArgument( + "learining rate should have dimension = 1." 
+ " But received learning rate dim [%s] ", + learning_rate->dims().size())); + T lr = static_cast(GetAttrFromTensor(learning_rate)); + + // constants + T epsilon = static_cast(ctx.Attr("epsilon")); + T decay = static_cast(ctx.Attr("decay")); + T momentum = static_cast(ctx.Attr("momentum")); + + // outputs + auto& param_out = GET_DATA_SAFELY(ctx.Output("ParamOut"), + "Output", "ParamOut", "Rmsprop"); + auto& mom_out = GET_DATA_SAFELY(ctx.Output("MomentOut"), + "Output", "MomentOut", "Rmsprop"); + auto& mom_sqrt_out = + GET_DATA_SAFELY(ctx.Output("MeanSquareOut"), + "Output", "MeanSquareOut", + "Rmsprop"); + auto& dev_ctx = ctx.template device_context(); + + ///// rmsprop优化算法 + /// + /// ms_out[i] = rho * ms[i] + (1 - rho) * (g[i] * g[i]); + /// + /// mom_out[i] = momentum * mom[i] + lr * + /// (g[i] / ((float)sqrt(ms_out[i] + epsilon))); + /// + /// p_out[i] = p[i] - mom_out[i]; + /// DLL_EXPORT int rmsprop(Context* ctx, const float* p, + /// const float* ms, const float* g, const float* mom, + /// float epsilon, float rho, float momentum, float lr, + /// float *ms_out, float *mom_out, float *p_out, int n) + int r = xpu::rmsprop(dev_ctx.x_context(), param.template data(), + meanSquare.template data(), grad.template + data(), + mom.template data(), epsilon, decay, momentum, + lr, + mom_sqrt_out.template + mutable_data(ctx.GetPlace()), + mom_out.template mutable_data(ctx.GetPlace()), + param_out.template mutable_data(ctx.GetPlace()), + param.numel()); + + if (r == xpu::Error_t::INVALID_PARAM) { + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::InvalidArgument( + "XPU kernel error of RmspropOp, error message: INVALID_PARAM, + " + "please check your input & output.")); + } else if (r == xpu::Error_t::RUNTIME_ERROR) { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Unavailable( + "XPU kernel error of RmspropOp, error message: " + "RUNTIME_ERROR, please check whether Baidu " + "Kunlun Card is properly installed.")); + } else if (r == 
xpu::Error_t::NO_ENOUGH_WORKSPACE) { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::ResourceExhausted( + "XPU kernel error of RmspropOp, error " + "message: NO_ENOUGH_WORKSPACE, XPU " + "has no enough memory.")); + } else { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::ResourceExhausted( + "XPU kernel error of RmspropOp, error " + "message: OTHER " + "XPU API returns error code: %d.", + r)); + } + ***/ } }; } // namespace operators } // namespace paddle -namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - rmsprop, - ops::RmspropOpXPUKernel); +// namespace ops = paddle::operators; +// REGISTER_OP_XPU_KERNEL( +// rmsprop, +// ops::RmspropOpXPUKernel); #endif diff --git a/paddle/fluid/operators/optimizers/sgd_op_xpu.cc b/paddle/fluid/operators/optimizers/sgd_op_xpu.cc index 9dabca1b66a771ed62431715e7a69d285774297e..e7c03be95cae1e1cfb01ab5ec42252f1e888e55e 100644 --- a/paddle/fluid/operators/optimizers/sgd_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/sgd_op_xpu.cc @@ -14,11 +14,15 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/optimizers/sgd_op.h" #include +#include "paddle/fluid/platform/device/device_wrapper.h" + namespace paddle { namespace operators { template class SGDOpXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *learning_rate = ctx.Input("LearningRate"); @@ -48,40 +52,31 @@ class SGDOpXPUKernel : public framework::OpKernel { "numel = [%s], ParamOut's numel = [%s]", grad->numel(), sz)); - const T *lr = learning_rate->data(); + const T *lr_t = learning_rate->data(); + auto &dev_ctx = ctx.template device_context(); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + const float *lr = nullptr; + if (std::is_same::value) { + float *lr_float = + RAII_GUARD.alloc_l3_or_gm(learning_rate->numel()); + int r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(lr_t), + lr_float, learning_rate->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + lr = lr_float; + } else { + lr = reinterpret_cast(lr_t); + } + const T *param_data = param->data(); const T *grad_data = grad->data(); T *out_data = param_out->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); - int r = xpu::sgd(dev_ctx.x_context(), sz, grad_data, param_data, lr, - out_data); - if (r == xpu::Error_t::INVALID_PARAM) { - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::InvalidArgument( - "XPU kernel error of SgdOp, error message: INVALID_PARAM, " - "please check your input & output.")); - } else if (r == xpu::Error_t::RUNTIME_ERROR) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Unavailable( - "XPU kernel error of SgdOp, error message: " - "RUNTIME_ERROR, please check whether Baidu " - "Kunlun Card is properly installed.")); - } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU 
kernel error of SgdOp, error " - "message: NO_ENOUGH_WORKSPACE, XPU " - "has no enough memory.")); - } - } else { - PADDLE_ENFORCE_EQ(false, true, - platform::errors::PermissionDenied( - "Unsupported Variable Type of Param & Grad in " - "SgdOp-XPU. Excepted " - "LodTensor, But received [%s] and [%s]", - paddle::framework::ToTypeName(param_var->Type()))); + int r = xpu::sgd(dev_ctx.x_context(), + reinterpret_cast(grad_data), + reinterpret_cast(param_data), lr, + reinterpret_cast(out_data), sz); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sgd"); } } }; @@ -90,6 +85,8 @@ class SGDOpXPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_XPU_KERNEL( - sgd, ops::SGDOpXPUKernel); + sgd, ops::SGDOpXPUKernel, + ops::SGDOpXPUKernel); #endif diff --git a/paddle/fluid/platform/device/xpu/xpu1_op_list.h b/paddle/fluid/platform/device/xpu/xpu1_op_list.h index a76bdd4ae967987748abe4aefa144ce3ac83a545..e8c3eee5b538ba326986e78148aa6a18f7bb392e 100644 --- a/paddle/fluid/platform/device/xpu/xpu1_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu1_op_list.h @@ -145,7 +145,6 @@ XPUOpMap& get_kl1_ops() { {"hard_switch", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"iou_similarity", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lamb", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"layer_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -175,9 +174,6 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"log_loss_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"log_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"logsumexp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"log", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"lookup_table_v2_grad", @@ -236,7 +232,6 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"roi_align_grad", diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 0dcab845bc9ca1b9d4dc7ae02e8f9b4c63ac4d83..99f8e5ace9c0088cd304bc3735ceb1696984dc3a 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -328,6 +328,8 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT64, XPUPlace())})}, {"scatter", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sgd", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"sigmoid_cross_entropy_with_logits_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sigmoid_cross_entropy_with_logits", diff --git a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py index 7aaa78856811f260eb27663026f1c7ed4c3301a0..b0bb9a37c16bd70c28b548203202be7015ed6243 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py @@ -23,41 +23,52 @@ import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard import paddle +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + paddle.enable_static() -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not 
compiled with XPU") -class TestXPUAccuracyOp(OpTest): - def setUp(self): - self.op_type = "accuracy" - self.init_dtype() - n = 8192 - infer = np.random.random((n, 1)).astype(self.dtype) - indices = np.random.randint(0, 2, (n, 1)).astype('int64') - label = np.random.randint(0, 2, (n, 1)).astype('int64') - self.inputs = {'Out': infer, 'Indices': indices, "Label": label} - num_correct = 0 - for rowid in range(n): - for ele in indices[rowid]: - if ele == label[rowid]: - num_correct += 1 - break - self.outputs = { - 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), - 'Correct': np.array([num_correct]).astype("int32"), - 'Total': np.array([n]).astype("int32") - } - self.attrs = {'use_xpu': True} - - def init_dtype(self): - self.dtype = np.float32 - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) +class XPUTestAccuracyOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'accuracy' + self.use_dynamic_create_class = False + + class TestXPUAccuracyOp(XPUOpTest): + def setUp(self): + self.op_type = "accuracy" + self.init_dtype() + n = 8192 + infer = np.random.random((n, 1)).astype(self.dtype) + indices = np.random.randint(0, 2, (n, 1)).astype('int64') + label = np.random.randint(0, 2, (n, 1)).astype('int64') + self.inputs = {'Out': infer, 'Indices': indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break + self.outputs = { + 'Accuracy': + np.array([num_correct / float(n)]).astype(self.dtype), + 'Correct': np.array([num_correct]).astype("int32"), + 'Total': np.array([n]).astype("int32") + } + self.attrs = {'use_xpu': True} + + def init_dtype(self): + self.dtype = self.in_type + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + +support_types = get_xpu_op_support_types('accuracy') +for stype 
in support_types: + create_test_class(globals(), XPUTestAccuracyOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py index c29150ef921c2dc3c9d94ca767c5f1263c15b00d..67fd9f871207b2fcdc74e57a6223ee9904dcc2ce 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py @@ -25,30 +25,43 @@ import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.op import Operator +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper -class TestSGDOp(OpTest): - def setUp(self): - self.op_type = "sgd" - self.conf() - w = np.random.random((self.h, self.w)).astype("float32") - g = np.random.random((self.h, self.w)).astype("float32") - lr = np.array([0.1]).astype("float32") - self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr} - self.outputs = {'ParamOut': w - lr * g} +class XPUTestSgdOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'sgd' + self.use_dynamic_create_class = False - def conf(self): - self.h = 102 - self.w = 105 + class TestSGDOp(XPUOpTest): + def setUp(self): + self.op_type = "sgd" + self.dtype = self.in_type + self.conf() + w = np.random.random((self.h, self.w)).astype(self.dtype) + g = np.random.random((self.h, self.w)).astype(self.dtype) + lr = np.array([0.1]).astype(self.dtype) - def test_check_output_with_place(self): - self.check_output_with_place(paddle.XPUPlace(0)) + self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr} + self.outputs = {'ParamOut': w - lr * g} + def conf(self): + self.h = 102 + self.w = 105 -class TestSGDOpCase8X(TestSGDOp): - def conf(self): - self.h = 10 - self.w = 64 + def test_check_output_with_place(self): + self.check_output_with_place(paddle.XPUPlace(0)) + + class TestSGDOpCase8X(TestSGDOp): + def conf(self): + self.h 
= 10 + self.w = 64 + + +support_types = get_xpu_op_support_types('sgd') +for stype in support_types: + create_test_class(globals(), XPUTestSgdOp, stype) class TestSGDOpWithLargeInput(unittest.TestCase):