diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake index 2e4a67093dc54115d9f91998bf21c0e91656771b..e8db13a694f5578e314dc1a7c95ed24ad88bad02 100644 --- a/cmake/external/gloo.cmake +++ b/cmake/external/gloo.cmake @@ -32,7 +32,7 @@ cache_third_party(extern_gloo TAG ${GLOO_TAG} DIR GLOO_SOURCE_DIR) -if(WITH_ASCEND) + if(WITH_ASCEND OR WITH_ASCEND_CL) ExternalProject_Add( extern_gloo ${EXTERNAL_PROJECT_LOG_ARGS} diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 82d64fd022883a75a6d334d3443dc43b4b06a904..c108c05368c915f6d4998d46713cda315dfb93ff 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -242,7 +242,7 @@ endif() ) ENDFUNCTION() -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) SET(PROTOBUF_VERSION 3.8.0) else() SET(PROTOBUF_VERSION 3.1.0) diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake index 0eabdb4e127bdf9e64883e3e15d6cd96753f9b44..f9cb3a9075a821025129c1f6acb479a4ad6ac95c 100644 --- a/cmake/external/threadpool.cmake +++ b/cmake/external/threadpool.cmake @@ -16,7 +16,7 @@ INCLUDE(ExternalProject) SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool) SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool) -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git) else() SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index a4367510ac703f6c5904cba2c5765c784b7afc8a..100b9153394690f6d872a4f16fb0a1ee5827b89f 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -43,7 +43,7 @@ cache_third_party(extern_warpctc TAG ${WARPCTC_TAG} DIR WARPCTC_SOURCE_DIR) -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 
868d920f13ca8299fb53c0d0506d7e448fd152a3..85af9e50087024246aa88346127ef4005621fd75 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -135,6 +135,7 @@ void TensorFromArray(const T* src, const size_t& array_size, } #endif } + template void TensorFromVector(const std::vector& src, const platform::DeviceContext& ctx, Tensor* dst) { @@ -167,6 +168,49 @@ void TensorFromVector(const std::vector& src, #endif } +// The fully specialized function should be inline to avoid +// multi-definition. +template <> +inline void TensorFromVector(const std::vector& src, + const platform::DeviceContext& ctx, Tensor* dst) { + // vector has no data() member, use array instead. + // See details: + // https://stackoverflow.com/questions/46115669/why-does-stdvectorbool-have-no-data/46115714 + bool* array = new bool[src.size()]; + for (unsigned int i = 0; i < src.size(); i++) { + array[i] = static_cast(src[i]); + } + + auto dst_place = ctx.GetPlace(); + auto src_ptr = static_cast(array); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(bool); + + if (platform::is_cpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(dst_place)) { // NOLINT + memory::Copy( + BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place, + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(dst_place)) { // NOLINT + memory::Copy( + BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place, + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif + delete[] array; +} + template void TensorFromVector(const std::vector& src, Tensor* dst) { platform::CPUPlace dst_place = platform::CPUPlace(); @@ -179,6 +223,23 @@ void 
TensorFromVector(const std::vector& src, Tensor* dst) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } +template <> +inline void TensorFromVector(const std::vector& src, Tensor* dst) { + bool* array = new bool[src.size()]; + for (unsigned int i = 0; i < src.size(); i++) { + array[i] = static_cast(src[i]); + } + platform::CPUPlace dst_place = platform::CPUPlace(); + auto src_ptr = static_cast(array); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(bool); + + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + delete[] array; +} + template void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, std::vector* dst) { @@ -212,6 +273,46 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, #endif } +template <> +inline void TensorToVector(const Tensor& src, + const platform::DeviceContext& ctx, + std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(bool); + + bool* array = new bool[src.numel()]; + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(array); + + if (platform::is_cpu_place(src.place())) { + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, + size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif + for (unsigned int i = 0; i < src.numel(); i++) { + (*dst)[i] = static_cast(array[i]); + } + delete[] array; +} + template void 
TensorToVector(const Tensor& src, std::vector* dst) { auto src_ptr = static_cast(src.data()); @@ -231,6 +332,32 @@ void TensorToVector(const Tensor& src, std::vector* dst) { BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size); } +template <> +inline void TensorToVector(const Tensor& src, std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(bool); + + bool* array = new bool[src.numel()]; + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(array); + + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(src.place()), true, + platform::errors::InvalidArgument( + "The input tensor should be CPU device, but actually it is in %s.", + src.place())); + + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size); + + for (unsigned int i = 0; i < src.numel(); i++) { + (*dst)[i] = static_cast(array[i]); + } + delete[] array; +} + std::ostream& operator<<(std::ostream& os, const Tensor& t); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index c32efd0a470be201344fa8d7f817792315b7e6ef..8587ee8d1e91969be86ab50d18e70b6a0d034e98 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -242,6 +242,61 @@ TEST(TensorToVector, Tensor) { #endif } +TEST(TensorToVector, Tensor_bool) { + { + paddle::framework::Tensor src; + bool* src_ptr = + src.mutable_data({3, 3}, paddle::platform::CPUPlace()); + for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = static_cast(i % 2); + } + + paddle::platform::CPUPlace place; + std::vector dst; + paddle::framework::TensorToVector(src, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); + } + } +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = { + false, true, false, true, false, true, false, true, false, + }; + paddle::framework::Tensor gpu_tensor; + 
paddle::platform::CUDAPlace place; + paddle::platform::CUDADeviceContext gpu_ctx(place); + paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(gpu_tensor, gpu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + { + std::vector src_vec = { + false, true, false, true, false, true, false, true, false, + }; + paddle::framework::Tensor npu_tensor; + paddle::platform::NPUPlace place(0); + paddle::platform::NPUDeviceContext npu_ctx(place); + paddle::framework::TensorFromVector(src_vec, npu_ctx, &npu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(npu_tensor, npu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +} + TEST(TensorFromDLPack, Tensor) { { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index a2b5a98401e2363bbfe98b375807ba91e7b5a2ae..e43cccfe648165ce962b779cb513effe990d0ab3 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -45,6 +45,17 @@ using Attribute = boost::variant< using AttributeMap = std::unordered_map; +#ifdef PADDLE_WITH_ASCEND_CL +using NPUAttribute = + boost::variant, + std::vector, std::vector, bool, + std::vector, BlockDesc*, int64_t, + std::vector, std::vector, + std::vector, std::vector>>; + +using NPUAttributeMap = std::unordered_map; +#endif + using OpCreator = std::function; diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index d9a4503cc1e5f7cebbac9062f38739b02c64890b..1eb0535831bb19ea6ddf62d7994db5a88a902466 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -206,8 +206,16 @@ void Copy(platform::NPUPlace dst_place, if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(dst_place.device); + + // NOTE(ascendrc): NPU memcpy async from host to 
device is a "real" async, + // which is different from CUDA. In Paddle, when async is called, "sync" + // is run actually, which means Paddle doesn't fully support async. + // TODO(ascendrc): Support NPU memcpy async for better performance. + stream = nullptr; + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by thream(" << stream << ")"; + if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU"); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); @@ -226,8 +234,16 @@ void Copy(platform::CPUPlace dst_place, if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(src_place.device); + + // NOTE(ascendrc): NPU memcpy async from device to host is a "real" async, + // which is different from CUDA. In Paddle, when async is called, "sync" + // is run actually, which means Paddle doesn't fully support async. + // TODO(ascendrc): Support NPU memcpy async for better performance. + stream = nullptr; + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by thream(" << stream << ")"; + if (stream) { platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index dac8c7b03e5174fe5c6354c3f882bec8fa4b3085..b475f75990f736e08fa9c91a66e138a05a759e9c 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -124,6 +124,7 @@ if (WITH_ASCEND) endif() if (WITH_ASCEND_CL) + cc_test(assign_op_npu_test SRCS assign_op_npu_test.cc DEPS assign_op) cc_library(npu_op_runner SRCS npu_op_runner.cc DEPS operator npu_info) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} npu_op_runner) endif() @@ -141,8 +142,8 @@ set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS}) set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") 
cc_test(test_common_infer_shape_functions SRCS test_common_infer_shape_functions.cc DEPS common_infer_shape_functions ${COMMON_OP_DEPS} activation_op elementwise_add_op softmax_op softmax) -cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op) cc_test(gather_test SRCS gather_test.cc DEPS tensor) +cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) @@ -163,10 +164,19 @@ if (WITH_PYTHON) cc_library(py_func_op SRCS py_func_op.cc DEPS op_registry python pybind) endif() +if (WITH_ASCEND_CL) + cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor) + cc_test(lookup_table_v2_op_npu_test SRCS lookup_table_v2_op_npu_test.cc DEPS op_registry lookup_table_v2_op scope device_context enforce executor compare_op) +endif() + set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") add_subdirectory(benchmark) cc_test(op_debug_string_test SRCS op_debug_string_test.cc DEPS elementwise_add_op) +if (WITH_ASCEND_CL) + cc_test(transpose_op_npu_test SRCS transpose_op_npu_test.cc DEPS op_registry transpose_op scope device_context enforce executor) +endif() + if(WITH_MKLDNN) include(mkldnn/inplace_op_tests.cmake) @@ -180,3 +190,7 @@ if(WITH_UNITY_BUILD) # The specified link dependency needs to be displayed here. 
target_link_libraries(paddle_operators_unity ${OP_HEADER_DEPS} ${COMMON_OP_DEPS}) endif() + +if(WITH_ASCEND_CL) +cc_test(gelu_op_npu_test SRCS gelu_op_npu_test.cc DEPS op_registry gelu_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..923b581af287d1aa400858d9325821e510e4e6b1 --- /dev/null +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -0,0 +1,368 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class PowNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto factor = ctx.Attr("factor"); + + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Power", {*x}, {*out}, + {{"power", factor}, + {"scale", static_cast(1.0)}, + {"shift", static_cast(0.0)}}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class PowGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto factor = ctx.Attr("factor"); + + auto x_dims = x->dims(); + + auto place = ctx.GetPlace(); + auto stream = + ctx.template device_context() + .stream(); + + // NOTE(liym27): dx = dout * factor * x.pow(factor-1) + + // Step1: Compute x_pow = x.pow(factor-1) + Tensor x_pow(x->type()); + x_pow.mutable_data(x->dims(), place); + auto runner_pow = NpuOpRunner("Power", {*x}, {x_pow}, + {{"power", factor - static_cast(1)}}); + runner_pow.Run(stream); + + // Step 2: Construct a broadcast factor, which has the same shape with x. + + // 2.1 Get a factor tensor with shape [1]. + Tensor factor_tensor(framework::proto::VarType::FP32); + factor_tensor.mutable_data({1}, place); + TensorFromVector(std::vector{factor}, ctx.device_context(), + &factor_tensor); + + // 2.2 Get the factor which has the shape with x and the same value with + // factor. 
+ Tensor factor_bc_tensor(framework::proto::VarType::FP32); + factor_bc_tensor.mutable_data(x_dims, place); + auto runner_bc = NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor}, + {{"dims", framework::vectorize(x_dims)}}); + runner_bc.Run(stream); + + // Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1) + Tensor x_power_mul_factor(x->type()); + x_power_mul_factor.mutable_data(x->dims(), place); + auto runner_mul_1 = + NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {}); + runner_mul_1.Run(stream); + + // Step 4: Compute dx = dout * factor * x.pow(factor-1) + dx->mutable_data(place); + auto runner_mul_2 = + NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {}); + runner_mul_2.Run(stream); + } +}; + +template +class ReluNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Relu", + { + *x, + }, + {*out}, {}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class ReluGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto stream = + ctx.template device_context() + .stream(); + + dx->mutable_data(ctx.GetPlace()); + auto runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {}); + + runner.Run(stream); + } +}; + +template +class SqrtNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner 
= NpuOpRunner("Sqrt", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class SqrtGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto dx_runner = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {}); + dx_runner.Run(stream); + } +}; + +template +class LogNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + Tensor one(x->type()); + one.mutable_data(x->dims(), place); + auto one_runner = NpuOpRunner("OnesLike", {*x}, {one}, {}); + one_runner.Run(stream); + + Tensor sub(x->type()); + sub.mutable_data(x->dims(), place); + auto sub_runner = NpuOpRunner("Sub", {*x, one}, {sub}, {}); + sub_runner.Run(stream); + + auto out_runner = NpuOpRunner("Log1p", {sub}, {*out}, {}); + out_runner.Run(stream); + } +}; + +template +class LogGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + auto runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {}); + runner.Run(stream); + } +}; + +template +class TanhNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x 
= ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Tanh", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class TanhGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = ctx.Input("Out"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto dx_runner = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {}); + dx_runner.Run(stream); + } +}; + +template +class SquareNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Square", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + pow, ops::PowNPUKernel, + ops::PowNPUKernel); + +REGISTER_OP_NPU_KERNEL( + pow_grad, ops::PowGradNPUKernel, + ops::PowGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + relu, ops::ReluNPUKernel, + ops::ReluNPUKernel); + +REGISTER_OP_NPU_KERNEL( + relu_grad, + ops::ReluGradNPUKernel, + ops::ReluGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + sqrt, ops::SqrtNPUKernel, + ops::SqrtNPUKernel); + +REGISTER_OP_NPU_KERNEL( + sqrt_grad, + ops::SqrtGradNPUKernel, + ops::SqrtGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + log, ops::LogNPUKernel, + ops::LogNPUKernel); + +REGISTER_OP_NPU_KERNEL( + log_grad, ops::LogGradNPUKernel, + ops::LogGradNPUKernel); + 
+REGISTER_OP_NPU_KERNEL( + tanh, ops::TanhNPUKernel, + ops::TanhNPUKernel); + +REGISTER_OP_NPU_KERNEL( + tanh_grad, + ops::TanhGradNPUKernel, + ops::TanhGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + square, ops::SquareNPUKernel, + ops::SquareNPUKernel, + ops::SquareNPUKernel); diff --git a/paddle/fluid/operators/amp/CMakeLists.txt b/paddle/fluid/operators/amp/CMakeLists.txt index b3ff52a7ae119ded7a305c97f3365d1a72d50acf..2ea8bbcbc61df84eb445b1a512653d66f600c46a 100644 --- a/paddle/fluid/operators/amp/CMakeLists.txt +++ b/paddle/fluid/operators/amp/CMakeLists.txt @@ -4,3 +4,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() register_operators() + +if(WITH_ASCEND_CL) + cc_test(check_finite_and_unscale_op_npu_test SRCS check_finite_and_unscale_op_npu_test.cc DEPS op_registry check_finite_and_unscale_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..46f9f7ff0894487e86ecf5740bb90bfcc0edc347 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); + + found_inf->mutable_data(ctx.GetPlace()); + + bool found_inf_data = false; + + auto stream = + ctx.template device_context() + .stream(); + + // step1: inverse scale(RealDiv) + Tensor const_tensor; + const_tensor.mutable_data({1}, ctx.GetPlace()); + TensorFromVector(std::vector{static_cast(1.0)}, ctx.device_context(), + &const_tensor); + + ctx.template device_context().Wait(); + + // Inverse(1.0/scale) + Tensor* tmp_inverse_out = const_cast(scale); + Tensor inverse_out(scale->type()); + inverse_out.Resize(scale->dims()); + inverse_out.mutable_data(ctx.GetPlace()); + auto runner_inverse = + NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {}); + runner_inverse.Run(stream); + tmp_inverse_out = &inverse_out; + + size_t x_size = xs.size(); + for (size_t i = 0; i < x_size; ++i) { + found_inf_data = true; + const auto* x = xs[i]; + auto* out = outs[i]; + out->mutable_data(ctx.GetPlace()); + + // step2: CheckNumerics + // CheckNumerics runs on the Ascend AI CPU, which delivers poor + // performance. 
+ Tensor check_xout(x->type()); + check_xout.Resize(x->dims()); + check_xout.mutable_data(ctx.GetPlace()); + try { + auto runner_checknumerics = + NpuOpRunner("CheckNumerics", {*x}, {check_xout}, + {{"message", std::string("check_nan_and_inf")}}); + runner_checknumerics.Run(stream); + } catch (platform::EnforceNotMet& exception) { + LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; + found_inf_data = true; + } catch (...) { + LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; + found_inf_data = true; + } + + if (!found_inf_data) { + // MatMul + auto runner_matmul = + NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); + runner_matmul.Run(stream); + } else { + // ZerosLike + auto runner_zeroslike = NpuOpRunner("ZerosLike", {*x}, {*out}, {}); + runner_zeroslike.Run(stream); + } // end if + } // end for + + // set found_inf to true + if (found_inf_data) { + Tensor found_inf_tensor; + found_inf_tensor.Resize({1}); + bool* is_found_inf = + found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); + *is_found_inf = true; + framework::TensorCopySync(found_inf_tensor, ctx.GetPlace(), found_inf); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_NPU_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleNPUKernel, + ops::CheckFiniteAndUnscaleNPUKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..99e81a4757d0e0b8e6ae8c70926d3c0660b99bb9 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/enforce.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +using Tensor = paddle::framework::Tensor; + +USE_OP(check_finite_and_unscale); +USE_OP_DEVICE_KERNEL(check_finite_and_unscale, NPU); + +struct InputVars { + std::string name; + f::LoDTensor *tensor; +}; + +template +void Compare(f::Scope *scope, const p::DeviceContext &ctx) { + const f::DDim dims = f::make_ddim({2, 2}); + auto place = ctx.GetPlace(); + + // init input + std::vector input_names = { + {"x", scope->Var("x")->GetMutable()}, + {"x1", scope->Var("x1")->GetMutable()}}; + + auto *scale = scope->Var("scale")->GetMutable(); + + // init output + auto *out = scope->Var("out")->GetMutable(); + auto *out1 = scope->Var("out1")->GetMutable(); + auto *found_inf = scope->Var("found_inf")->GetMutable(); + + // Initialize input data + const int num_inputs = input_names.size(); + size_t numel = static_cast(f::product(dims)); + + for (int i = 0; i < num_inputs; ++i) { + std::vector init_xs; + for (size_t j = 0; j < numel; ++j) { + if (j == 0) { + init_xs.push_back(static_cast(NAN)); + } else { + init_xs.push_back(static_cast(j + 1)); + } + } + f::TensorFromVector(init_xs, ctx, input_names[i].tensor); + 
input_names[i].tensor->Resize(dims); + } + + f::TensorFromVector(std::vector{static_cast(0.5)}, ctx, scale); + + ctx.Wait(); + + // run + f::AttributeMap attrs; + auto op = f::OpRegistry::CreateOp( + "check_finite_and_unscale", {{"X", {"x", "x1"}}, {"Scale", {"scale"}}}, + {{"Out", {"out", "out1"}}, {"FoundInfinite", {"found_inf"}}}, attrs); + op->Run(*scope, place); + ctx.Wait(); + + // out0 + std::vector out_vec; + f::TensorToVector(*out, ctx, &out_vec); + EXPECT_EQ(out_vec.size(), static_cast(4)); + for (size_t j = 0; j < out_vec.size(); ++j) { + VLOG(3) << "out_vec[" << j << "]:" << out_vec[j]; + } + + ctx.Wait(); + + // out0 + std::vector out1_vec; + f::TensorToVector(*out1, ctx, &out1_vec); + EXPECT_EQ(out1_vec.size(), static_cast(4)); + for (size_t j = 0; j < out1_vec.size(); ++j) { + VLOG(3) << "out1_vec[" << j << "]:" << out1_vec[j]; + } + + ctx.Wait(); + + // out found_inf + Tensor found_inf_tensor; + found_inf_tensor.Resize({1}); + bool *is_finite_data = + found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); + f::TensorCopy(*found_inf, place, &found_inf_tensor); + EXPECT_FALSE(*is_finite_data); + + ctx.Wait(); +} + +TEST(check_finite_and_unscale, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} + +TEST(check_finite_and_unscale, NPU_fp16) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..dd6dbfd5c0b65381b468e147bd873a1775cfb2ec --- /dev/null +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -0,0 +1,219 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +void Update(const platform::NPUDeviceContext& ctx, + const std::vector found_inf_vec, + const Tensor* pre_loss_scaling_tensor, const Tensor* good_in_tensor, + const Tensor* bad_in_tensor, const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, Tensor* updated_loss_scaling_tensor, + Tensor* good_out_tensor, Tensor* bad_out_tensor) { + auto place = ctx.GetPlace(); + auto stream = ctx.stream(); + if (found_inf_vec[0]) { + // good_out_data = 0 + auto g = good_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + good_out_tensor->numel() * sizeof(int), stream); + // bad_out_data = bad_in_data + 1 + Tensor factor_tensor(bad_out_tensor->type()); + factor_tensor.mutable_data({1}, place); + TensorFromVector(std::vector{1}, ctx, &factor_tensor); + auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor}, + {*bad_out_tensor}, {}); + runner_p2.Run(stream); + + std::vector bad_out_data; + TensorToVector(*bad_out_tensor, ctx, &bad_out_data); + if (bad_out_data[0] == decr_every_n_nan_or_inf) { + auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", decr_ratio}, + {"shift", static_cast(0)}}); + + runner_p3.Run(stream); + + 
std::vector new_loss_scaling; + TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); + if (new_loss_scaling[0] < static_cast(1)) { + // updated_loss_scaling_data = 1 + auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(0)}, + {"shift", static_cast(1)}}); + + runner_p4.Run(stream); + } + + // bad_out_data = 0 + auto b = bad_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(b), 0, + bad_out_tensor->numel() * sizeof(int), stream); + } + } else { + // bad_out_data = 0 + auto b = bad_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(b), 0, + bad_out_tensor->numel() * sizeof(int), stream); + + // good_out_data = good_in_data + 1 + Tensor factor_tensor(good_out_tensor->type()); + factor_tensor.mutable_data({1}, place); + TensorFromVector(std::vector{1}, ctx, &factor_tensor); + auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor}, + {*good_out_tensor}, {}); + runner_p2.Run(stream); + + std::vector good_out_data; + TensorToVector(*good_out_tensor, ctx, &good_out_data); + + if (good_out_data[0] == incr_every_n_steps) { + auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", incr_ratio}, + {"shift", static_cast(0)}}); + runner_p3.Run(stream); + + std::vector new_loss_scaling; + TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); + if (!std::isfinite(new_loss_scaling[0])) { + // updated_loss_scaling_data = pre_loss_scaling_data + auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(1)}, + {"shift", static_cast(0)}}); + + runner_p4.Run(stream); + } + // good_out_data = 0 + auto g = good_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + good_out_tensor->numel() * 
sizeof(int), stream); + } + } +} + +template +class UpdateLossScalingFunctor { + public: + void operator()(const platform::NPUDeviceContext& dev_ctx, + const std::vector found_inf_vec, + const Tensor* pre_loss_scaling_tensor, + const Tensor* good_in_tensor, const Tensor* bad_in_tensor, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, Tensor* updated_loss_scaling_tensor, + Tensor* good_out_tensor, Tensor* bad_out_tensor) const { + Update(dev_ctx, found_inf_vec, pre_loss_scaling_tensor, good_in_tensor, + bad_in_tensor, incr_every_n_steps, decr_every_n_nan_or_inf, + incr_ratio, decr_ratio, updated_loss_scaling_tensor, + good_out_tensor, bad_out_tensor); + } +}; + +template +class LazyZerosNPU { + public: + void operator()(const platform::NPUDeviceContext& dev_ctx, + const std::vector found_inf_vec, + const std::vector& xs, + const std::vector& outs) const { + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + if (found_inf_vec[0]) { + VLOG(4) << "-- UpdateLossScaling: Find infinite grads. 
--"; + + auto place = dev_ctx.GetPlace(); + auto stream = dev_ctx.stream(); + auto g = out->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + out->numel() * sizeof(T), stream); + } + } + } +}; + +template +class UpdateLossScalingNPUKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const auto xs = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); + const auto* found_inf = ctx.Input("FoundInfinite"); + PADDLE_ENFORCE_EQ(found_inf->numel(), 1, + platform::errors::InvalidArgument( + "FoundInfinite must has only one element.")); + + std::vector found_inf_vec; + TensorToVector(*found_inf, ctx.device_context(), &found_inf_vec); + + LazyZerosNPU{}(dev_ctx, found_inf_vec, xs, outs); + const bool stop_update = ctx.Attr("stop_update"); + if (stop_update) { + return; + } + + const auto* pre_loss_scaling = ctx.Input("PrevLossScaling"); + const auto* good_in = ctx.Input("InGoodSteps"); + const auto* bad_in = ctx.Input("InBadSteps"); + auto* updated_loss_scaling = ctx.Output("LossScaling"); + auto* good_out = ctx.Output("OutGoodSteps"); + auto* bad_out = ctx.Output("OutBadSteps"); + + updated_loss_scaling->mutable_data(dev_ctx.GetPlace()); + good_out->mutable_data(dev_ctx.GetPlace()); + bad_out->mutable_data(dev_ctx.GetPlace()); + + const int incr_every_n_steps = ctx.Attr("incr_every_n_steps"); + const int decr_every_n_nan_or_inf = + ctx.Attr("decr_every_n_nan_or_inf"); + const float incr_ratio = ctx.Attr("incr_ratio"); + const float decr_ratio = ctx.Attr("decr_ratio"); + UpdateLossScalingFunctor{}( + dev_ctx, found_inf_vec, pre_loss_scaling, good_in, bad_in, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling, good_out, bad_out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + 
+REGISTER_OP_NPU_KERNEL( + update_loss_scaling, + ops::UpdateLossScalingNPUKernel, + ops::UpdateLossScalingNPUKernel); diff --git a/paddle/fluid/operators/assign_op_npu.cc b/paddle/fluid/operators/assign_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..93689d5e495f33484d2f05b04d25734a8c5ab07e --- /dev/null +++ b/paddle/fluid/operators/assign_op_npu.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/assign_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace framework { +class OpDesc; +class Variable; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace paddle { +namespace operators { +template +class AssignNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace 
plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + assign, ops::AssignNPUKernel, + ops::AssignNPUKernel, + ops::AssignNPUKernel) diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..5cf1303a229a9041d0caaea7f7fa9db61579ffb2 --- /dev/null +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(assign); +USE_OP_DEVICE_KERNEL(assign, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + std::vector init; + init.push_back(static_cast(1.0)); + init.push_back(static_cast(2.0)); + init.push_back(static_cast(3.0)); + init.push_back(static_cast(4.0)); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({4}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + auto op = + f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}}, {{"Out", {"Out"}}}, {}); + + op->Run(*scope, place); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + ctx.Wait(); + + EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)4); + EXPECT_EQ(out_vec[0], static_cast(1.0)); + EXPECT_EQ(out_vec[1], static_cast(2.0)); + EXPECT_EQ(out_vec[2], static_cast(3.0)); + EXPECT_EQ(out_vec[3], static_cast(4.0)); +} + +TEST(assign, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx, "assign"); +} diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..20b33c4e4e05a658e9f1f80d0c98562d1485aea2 --- /dev/null +++ b/paddle/fluid/operators/cast_op_npu.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include + +#include "paddle/fluid/operators/cast_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +static std::map + DTYPE_2_ACL_DTYPE = { + {framework::proto::VarType::BOOL, ACL_BOOL}, + {framework::proto::VarType::INT16, ACL_INT16}, + {framework::proto::VarType::INT32, ACL_INT32}, + {framework::proto::VarType::INT64, ACL_INT64}, + {framework::proto::VarType::FP16, ACL_FLOAT16}, + {framework::proto::VarType::FP32, ACL_FLOAT}, + {framework::proto::VarType::FP64, ACL_DOUBLE}, +}; + +using Tensor = framework::Tensor; + +template +class CastNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + int dtype = ctx.Attr("out_dtype"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + auto iter = DTYPE_2_ACL_DTYPE.find( + static_cast(dtype)); + int aclDtype = iter->second; + + if (dtype == framework::proto::VarType::FP32) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::FP16) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::INT16) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::INT32) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::INT64) { + out->mutable_data(place); + } else if (dtype == 
framework::proto::VarType::FP64) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::BOOL) { + out->mutable_data(place); + } + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Cast", {*x}, {*out}, + {{"dst_type", static_cast(aclDtype)}}); + runner.Run(stream); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + cast, ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel); +#endif diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..87bb3397ca2672ce377b74682cb0445e31b03677 --- /dev/null +++ b/paddle/fluid/operators/concat_op_npu.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class ConcatNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + framework::LoDTensor* out = ctx.Output("Out"); + PADDLE_ENFORCE_NOT_NULL(ins[0], + platform::errors::NotFound( + "The first input tensor is not initalized.")); + auto axis = ctx.Attr("axis"); + + if (ctx.HasInput("AxisTensor")) { + PADDLE_THROW(platform::errors::NotFound( + "The AxisTensor is not supported on NPU now.")); + } + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + + auto place = ctx.GetPlace(); + out->mutable_data(place); + + std::vector inputs; + std::vector names; + for (size_t i = 0; i < ins.size(); ++i) { + if (ins[i] && ins[i]->numel() > 0) { + inputs.push_back(*ins[i]); + names.push_back("x" + std::to_string(i)); + } else { + continue; + } + } + auto stream = + ctx.template device_context() + .stream(); + auto runner = NpuOpRunner( + "ConcatD", {inputs}, {*out}, + {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}); + runner.AddInputNames(names); + runner.Run(stream); + } +}; + +template +class ConcatGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto ins = ctx.MultiInput("X"); + auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); + auto outs = + ctx.MultiOutput(framework::GradVarName("X")); + + PADDLE_ENFORCE_NOT_NULL(ins[0], + platform::errors::NotFound( + "The first input tensor is not initalized.")); + + auto axis = ctx.Attr("axis"); + + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + + int offset = 0; + auto stream = + ctx.template device_context() + .stream(); + for (size_t j = 0; j < 
outs.size(); ++j) { + // For stop gradient + // get output tensor that the name is not kEmptyVarName + if (out_var_names[j] != framework::kEmptyVarName && + outs[j]->numel() != 0UL) { + outs[j]->mutable_data(ctx.GetPlace()); + std::vector offsets; + std::vector sizes; + for (int dim = 0; dim < ins[j]->dims().size(); ++dim) { + if (dim == axis) { + offsets.push_back(offset); + sizes.push_back(ins[j]->dims()[dim]); + } else { + offsets.push_back(0); + sizes.push_back(ins[j]->dims()[dim]); + } + } + auto runner = NpuOpRunner("SliceD", {*out_grad}, {*outs[j]}, + {{"offsets", offsets}, {"size", sizes}}); + runner.Run(stream); + } + if (ins[j]->numel() != 0UL) { + offset += ins[j]->dims()[axis]; + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL(concat, ops::ConcatNPUKernel, + ops::ConcatNPUKernel, + ops::ConcatNPUKernel); + +REGISTER_OP_NPU_KERNEL(concat_grad, ops::ConcatGradNPUKernel, + ops::ConcatGradNPUKernel, + ops::ConcatGradNPUKernel); diff --git a/paddle/fluid/operators/controlflow/compare_op_npu.cc b/paddle/fluid/operators/controlflow/compare_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..591fb55936734ffc675dad5c6912e7cbf4e80471 --- /dev/null +++ b/paddle/fluid/operators/controlflow/compare_op_npu.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#ifdef PADDLE_WITH_ASCEND_CL + +namespace paddle { +namespace operators { + +template +class EqualNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class LessThanNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + // int axis = context.Attr("axis"); + z->mutable_data(ctx.GetPlace()); // allocate + auto runner = NpuOpRunner("Less", {*x, *y}, {*z}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(equal, ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel); + +REGISTER_OP_NPU_KERNEL( + less_than, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel); + +#endif diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index b9ea2ade6cb90b71d423ba977215ab693f19b562..6513bae839e9894ee2b342c5d61fcbf9191a4123 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -78,6 +78,13 @@ 
class ConditionalOp : public framework::OperatorBase { framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); res = cpu_tensor.data()[0]; +#endif + } else if (platform::is_npu_place(ips[0]->place())) { +#ifdef PADDLE_WITH_ASCEND_CL + framework::LoDTensor cpu_tensor; + framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); + platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); + res = cpu_tensor.data()[0]; #endif } else { res = ips[0]->data()[0]; diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index d86b6b48422d94604724303de72f401bfba2e23e..fdd1b776bd8fa3f24fb596af29512f1f781dce4c 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -44,6 +44,11 @@ static void DataCopy(const framework::LoDTensor &src_item, TensorCopySync(src_item, platform::CPUPlace(), dst_item); } #else +#ifdef PADDLE_WITH_ASCEND_CL + if (platform::is_npu_place(src_item.place())) { + platform::DeviceContextPool::Instance().Get(src_item.place())->Wait(); + } +#endif TensorCopySync(src_item, platform::CPUPlace(), dst_item); #endif } else { diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..1b0c0e444347af0a90f8244590b84199dc97f931 --- /dev/null +++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include + +#include "paddle/fluid/operators/controlflow/logical_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class LogicalNotNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("LogicalNot", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + logical_not, + ops::LogicalNotNPUKernel); + +#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc index 1e7e5e02c0181f8828a59b9403ac24f40347f8b6..5b8d08a8943ddeed29731a7b6660619f9a7d4ef3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc @@ -12,17 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef PADDLE_WITH_ASCEND_CL #include #include +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; -template +template class ElementwiseAddNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -39,12 +40,127 @@ class ElementwiseAddNPUKernel : public framework::OpKernel { } }; +template +class ElementwiseAddGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + + auto stream = + ctx.template device_context() + .stream(); + + // NOTE(zhiqiu): It seems Ascend Add follows the broadcast semantics with + // default axis=-1? + // So, the add_grad should do reduce if needed. + // For example, the shape of each variable in elementwise_add: + // x, dx: [2, 3, 5] + // y, dy: [1, 5] + // out, dout: [2, 3, 5] + // Then, out = x + y => dx = dout, dy = dout + // And, the shape of dy can be computed by a two-stage reduce, + // 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false. + // 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true. 
+ + if (dx) { + dx->mutable_data(ctx.GetPlace()); + // For dx + // stage 1 + auto reduce_ndim = dout->dims().size() - dx->dims().size(); + std::vector axes; + for (auto i = 0; i < reduce_ndim; ++i) { + axes.push_back(i); + } + Tensor* tmp_dout = const_cast(dout); + Tensor reduced_dout(dx->type()); + if (axes.size() != 0) { + std::vector reduced_dout_dims; + for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { + reduced_dout_dims.push_back(dout->dims()[i]); + } + reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); + reduced_dout.mutable_data(ctx.GetPlace()); + auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, + {{"axes", axes}, {"keep_dims", false}}); + runner.Run(stream); + tmp_dout = &reduced_dout; + } + + // stage 2 + axes.clear(); + for (auto i = 0; i < dx->dims().size(); ++i) { + if (dx->dims()[i] == 1) { + axes.push_back(i); + } + } + if (axes.size() != 0) { + auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx}, + {{"axes", axes}, {"keep_dims", true}}); + runner.Run(stream); + } else { + ctx.template device_context() + .Wait(); + framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx); + } + } + + if (dy) { + // For dy + // stage 1 + auto reduce_ndim = dout->dims().size() - dy->dims().size(); + std::vector axes; + for (auto i = 0; i < reduce_ndim; ++i) { + axes.push_back(i); + } + Tensor* tmp_dout = const_cast(dout); + Tensor reduced_dout(dout->type()); + if (axes.size() != 0) { + std::vector reduced_dout_dims; + for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { + reduced_dout_dims.push_back(dout->dims()[i]); + } + reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); + reduced_dout.mutable_data(ctx.GetPlace()); + auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, + {{"axes", axes}, {"keep_dims", false}}); + runner.Run(stream); + tmp_dout = &reduced_dout; + ctx.template device_context() + .Wait(); + } + + // stage 2 + axes.clear(); + for (auto i = 0; i < dy->dims().size(); ++i) { + if 
(dy->dims()[i] == 1) { + axes.push_back(i); + } + } + if (axes.size() != 0) { + dy->mutable_data(ctx.GetPlace()); + auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dy}, + {{"axes", axes}, {"keep_dims", true}}); + runner.Run(stream); + } else { + ctx.template device_context() + .Wait(); + framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dy); + } + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(elementwise_add, ops::ElementwiseAddNPUKernel, + ops::ElementwiseAddNPUKernel); -REGISTER_OP_NPU_KERNEL( - elementwise_add, - ops::ElementwiseAddNPUKernel); -#endif +REGISTER_OP_NPU_KERNEL(elementwise_add_grad, + ops::ElementwiseAddGradNPUKernel, + ops::ElementwiseAddGradNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..8852f3a419adc51d311178175fd6f71a8c628540 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc @@ -0,0 +1,143 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ElementwiseDivNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Div", {*x, *y}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class ElementwiseDivGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + + auto place = ctx.GetPlace(); + + auto stream = + ctx.template device_context() + .stream(); + + Tensor y_power(y->type()); + y_power.mutable_data(y->dims(), place); + auto y_power_runner = NpuOpRunner("Power", {*y}, {y_power}, + {{"power", static_cast(-1)}}); + y_power_runner.Run(stream); + + if (dx) { + dx->mutable_data(place); + + Tensor tensor_zeros(x->type()); + tensor_zeros.mutable_data(x->dims(), place); + auto tensor_zeros_runner = + NpuOpRunner("ZerosLike", {*x}, {tensor_zeros}, {}); + tensor_zeros_runner.Run(stream); + + Tensor x_zero(paddle::framework::proto::VarType::BOOL); + x_zero.mutable_data(x->dims(), place); + auto x_zero_runner = + NpuOpRunner("Equal", {*x, tensor_zeros}, {x_zero}, {}); + x_zero_runner.Run(stream); + + Tensor x_nozero(paddle::framework::proto::VarType::BOOL); + x_nozero.mutable_data(x->dims(), place); + auto x_nozero_runner = + 
NpuOpRunner("LogicalNot", {x_zero}, {x_nozero}, {}); + x_nozero_runner.Run(stream); + + Tensor x_nozero_f(x->type()); + x_nozero_f.mutable_data(x->dims(), place); + auto x_nozero_f_runner = + NpuOpRunner("Cast", {x_nozero}, {x_nozero_f}, + {{"dst_type", static_cast(0)}}); + x_nozero_f_runner.Run(stream); + + Tensor x_grad_w(x->type()); + x_grad_w.mutable_data(x->dims(), place); + auto x_grad_w_runner = + NpuOpRunner("Mul", {x_nozero_f, y_power}, {x_grad_w}, {}); + x_grad_w_runner.Run(stream); + + auto x_grad_runner = NpuOpRunner("Mul", {x_grad_w, *dout}, {*dx}, {}); + x_grad_runner.Run(stream); + } + + if (dy) { + dy->mutable_data(place); + + Tensor neg_out(y->type()); + neg_out.mutable_data(y->dims(), place); + auto neg_out_runner = NpuOpRunner("Neg", {*out}, {neg_out}, {}); + neg_out_runner.Run(stream); + + Tensor y_grad_w(y->type()); + y_grad_w.mutable_data(y->dims(), place); + auto y_grad_w_runner = NpuOpRunner("Div", {neg_out, *y}, {y_grad_w}, {}); + y_grad_w_runner.Run(stream); + + auto y_grad_runner = NpuOpRunner("Mul", {y_grad_w, *dout}, {*dy}, {}); + y_grad_runner.Run(stream); + } + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + elementwise_div, + ops::ElementwiseDivNPUKernel, + ops::ElementwiseDivNPUKernel); + +REGISTER_OP_NPU_KERNEL( + elementwise_div_grad, + ops::ElementwiseDivGradNPUKernel, + ops::ElementwiseDivGradNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..da0116114747fa2e44045b75f3bd9bd0dc73d980 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ElementwiseFloorDivNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("FloorDiv", {*x, *y}, {*out}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL(elementwise_floordiv, + ops::ElementwiseFloorDivNPUKernel, + ops::ElementwiseFloorDivNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..3cdb6420e8ee1d159ecd525ab6a2360544ca5323 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/elementwise/elementwise_max_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ElementwiseMaxNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Maximum", {*x, *y}, {*out}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + elementwise_max, + ops::ElementwiseMaxNPUKernel, + ops::ElementwiseMaxNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..987c250d651475d44da7e2ebf88222b74e5b5af0 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/elementwise/elementwise_min_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ElementwiseMinNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Minimum", {*x, *y}, {*out}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + elementwise_min, + ops::ElementwiseMinNPUKernel, + ops::ElementwiseMinNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..08df6d4e27af0a79123f26ad2064ee0203cc1b28 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include + +#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ElementwiseMulNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class ElementwiseMulGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + + auto place = ctx.GetPlace(); + + auto stream = + ctx.template device_context() + .stream(); + + if (dx) { + dx->mutable_data(place); + auto dx_runner = NpuOpRunner("Mul", {*dout, *y}, {*dx}, {}); + dx_runner.Run(stream); + } + + if (dy) { + dy->mutable_data(place); + auto dy_runner = NpuOpRunner("Mul", {*x, *dout}, {*dy}, {}); + dy_runner.Run(stream); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + 
elementwise_mul, + ops::ElementwiseMulNPUKernel, + ops::ElementwiseMulNPUKernel); + +REGISTER_OP_NPU_KERNEL( + elementwise_mul_grad, + ops::ElementwiseMulGradNPUKernel, + ops::ElementwiseMulGradNPUKernel); +#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index 3a2a21647083bfda097b75326814fd34d2bdd689..df6fae6c8484a016a3589339a3a7820d20d7dcca 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -74,6 +74,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, {{"Out", {"Out"}}}, attrs); op->Run(*scope, place); + ctx.Wait(); std::vector out_vec; TensorToVector(*tensor_out, ctx, &out_vec); @@ -131,6 +132,7 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, auto place = ctx.GetPlace(); op->Run(*scope, place); + ctx.Wait(); std::vector dx_vec; TensorToVector(*tensor_dx, ctx, &dx_vec); @@ -179,3 +181,9 @@ TEST(elementwise_sub_grad, NPU) { p::NPUDeviceContext ctx(p::NPUPlace(0)); CompareGrad(&scope, ctx, "elementwise_sub_grad"); } + +TEST(elementwise_add_grad, NPU) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + CompareGrad(&scope, ctx, "elementwise_add_grad"); +} diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..26cc925b869c647d5a02215c8c8621782cdf2303 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/elementwise/elementwise_pow_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ElementwisePowNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Pow", {*x, *y}, {*out}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + elementwise_pow, + ops::ElementwisePowNPUKernel, + ops::ElementwisePowNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index e47c38daee8ba028668f88736ca5e7266ee4bb00..809445c2862035c182e827840d6e8440f80d47c4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef PADDLE_WITH_ASCEND_CL #include #include @@ -24,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class ElementwiseSubNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -43,7 +42,7 @@ class ElementwiseSubNPUKernel : public framework::OpKernel { } }; -template +template class ElementwiseSubGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -51,8 +50,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); - dx->mutable_data(ctx.GetPlace()); - dy->mutable_data(ctx.GetPlace()); + auto stream = + ctx.template device_context() + .stream(); // NOTE(zhiqiu): It seems Ascend Sub follow the broadcast sematics with // default axis=-1? @@ -66,89 +66,92 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { // 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false. // 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true. 
- auto stream = - ctx.template device_context() - .stream(); - // For dx - // stage 1 - auto reduce_ndim = dout->dims().size() - dx->dims().size(); - std::vector axes; - for (auto i = 0; i < reduce_ndim; ++i) { - axes.push_back(i); - } - auto tmp_dout = dout; - Tensor reduced_dout(dx->type()); - if (axes.size() != 0) { - std::vector reduced_dout_dims; - for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { - reduced_dout_dims.push_back(dout->dims()[i]); - } - reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); - reduced_dout.mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); - tmp_dout = &reduced_dout; - } - - // stage 2 - axes.clear(); - for (auto i = 0; i < dx->dims().size(); ++i) { - if (dx->dims()[i] == 1) { + if (dx) { + dx->mutable_data(ctx.GetPlace()); + // For dx + // stage 1 + auto reduce_ndim = dout->dims().size() - dx->dims().size(); + std::vector axes; + for (auto i = 0; i < reduce_ndim; ++i) { axes.push_back(i); } - } - if (axes.size() != 0) { - auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx}, - {{"axes", axes}, {"keep_dims", true}}); - runner.Run(stream); - } else { - framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx); - } - - // For dy - // stage 1 - reduce_ndim = dout->dims().size() - dy->dims().size(); - axes.clear(); - for (auto i = 0; i < reduce_ndim; ++i) { - axes.push_back(i); - } - tmp_dout = dout; - Tensor reduced_dy(dy->type()); + Tensor* tmp_dout = const_cast(dout); + Tensor reduced_dout(dx->type()); + if (axes.size() != 0) { + std::vector reduced_dout_dims; + for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { + reduced_dout_dims.push_back(dout->dims()[i]); + } + reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); + reduced_dout.mutable_data(ctx.GetPlace()); + auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, + {{"axes", axes}, {"keep_dims", false}}); + 
runner.Run(stream); + tmp_dout = &reduced_dout; + } - if (axes.size() != 0) { - std::vector reduced_dout_dims; - for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { - reduced_dout_dims.push_back(dout->dims()[i]); + // stage 2 + axes.clear(); + for (auto i = 0; i < dx->dims().size(); ++i) { + if (dx->dims()[i] == 1) { + axes.push_back(i); + } + } + if (axes.size() != 0) { + auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx}, + {{"axes", axes}, {"keep_dims", true}}); + runner.Run(stream); + } else { + framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx); } - reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); - reduced_dout.mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); - tmp_dout = &reduced_dout; } - - // stage 2 - axes.clear(); - auto* tmp_dy = tmp_dout; - for (auto i = 0; i < dy->dims().size(); ++i) { - if (dy->dims()[i] == 1) { + if (dy) { + dy->mutable_data(ctx.GetPlace()); + // For dy + // stage 1 + auto reduce_ndim = dout->dims().size() - dy->dims().size(); + std::vector axes; + for (auto i = 0; i < reduce_ndim; ++i) { axes.push_back(i); } - } - if (axes.size() != 0) { - reduced_dy.Resize(dy->dims()); - reduced_dy.mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy}, - {{"axes", axes}, {"keep_dims", true}}); + Tensor* tmp_dout = const_cast(dout); + Tensor reduced_dy(dy->type()); + Tensor reduced_dout(dy->type()); + + if (axes.size() != 0) { + std::vector reduced_dout_dims; + for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { + reduced_dout_dims.push_back(dout->dims()[i]); + } + reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); + reduced_dout.mutable_data(ctx.GetPlace()); + auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, + {{"axes", axes}, {"keep_dims", false}}); + runner.Run(stream); + tmp_dout = &reduced_dout; + } + + // stage 2 + 
axes.clear(); + Tensor* tmp_dy = tmp_dout; + for (auto i = 0; i < dy->dims().size(); ++i) { + if (dy->dims()[i] == 1) { + axes.push_back(i); + } + } + if (axes.size() != 0) { + reduced_dy.Resize(dy->dims()); + reduced_dy.mutable_data(ctx.GetPlace()); + auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy}, + {{"axes", axes}, {"keep_dims", true}}); + runner.Run(stream); + tmp_dy = &reduced_dy; + } + + // stage 3, negative + auto runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {}); runner.Run(stream); - tmp_dy = &reduced_dy; } - - // stage 3, negative - auto runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {}); - runner.Run(stream); } }; @@ -156,16 +159,11 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(elementwise_sub, ops::ElementwiseSubNPUKernel, + ops::ElementwiseSubNPUKernel); -REGISTER_OP_NPU_KERNEL( - elementwise_sub, - ops::ElementwiseSubNPUKernel, - ops::ElementwiseSubNPUKernel); - -REGISTER_OP_NPU_KERNEL( - elementwise_sub_grad, - ops::ElementwiseSubGradNPUKernel, - ops::ElementwiseSubGradNPUKernel); -#endif +REGISTER_OP_NPU_KERNEL(elementwise_sub_grad, + ops::ElementwiseSubGradNPUKernel, + ops::ElementwiseSubGradNPUKernel); diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index e1a1ce0a8171ee571138b5eff9ced865af154aff..e566d69096595ce5ea9e753b58a2bf3e923a9c10 100755 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -64,6 +64,12 @@ inline std::vector get_expand_times( TensorCopySync(*expand_tensor, platform::CPUPlace(), &cpu_expand_tensor); expand_data = cpu_expand_tensor.data(); } +#ifdef PADDLE_WITH_ASCEND_CL + if (platform::is_npu_place(expand_tensor->place())) { + TensorCopySync(*expand_tensor, platform::CPUPlace(), &cpu_expand_tensor); + expand_data = cpu_expand_tensor.data(); + } +#endif #ifdef PADDLE_WITH_XPU if 
(platform::is_xpu_place(expand_tensor->place())) { TensorCopySync(*expand_tensor, platform::CPUPlace(), &cpu_expand_tensor); diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f4ae1785b024f5b6595fee9cad26fd85394f78c4 --- /dev/null +++ b/paddle/fluid/operators/expand_op_npu.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/expand_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class ExpandNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto rank = context.Input("X")->dims().size(); + PADDLE_ENFORCE_GE( + rank, 1, + platform::errors::InvalidArgument( + "The number of dimensions of the input 'x' for Op(expand) " + "must be greater than or equal to 1, but the value received is %d.", + rank)); + PADDLE_ENFORCE_LE( + rank, MAX_RANK_SUPPORTED, + platform::errors::InvalidArgument( + "The number of dimensions of the input 'x' for Op(expand) " + "must be less than or equal to %d, but the value received is %d.", + MAX_RANK_SUPPORTED, rank)); + switch (rank) { REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED) } + } + + protected: + template + void Expand(const framework::ExecutionContext& context) const { + auto* in0 = context.Input("X"); + auto in_dims = in0->dims(); + auto expand_times = get_expand_times(context); + PADDLE_ENFORCE_EQ( + static_cast(in_dims.size()), expand_times.size(), + platform::errors::InvalidArgument( + "The number of elements (%d) of 'expand_times' for " + "Op(expand) must be equal to the number " + "of dimensions (%d) of the input.", + expand_times.size(), static_cast(in_dims.size()))); + auto* out0 = context.Output("Out"); + framework::DDim out_dims(in_dims); + for (size_t i = 0; i < expand_times.size(); ++i) { + out_dims[i] *= expand_times[i]; + } + out0->Resize(out_dims); + out0->mutable_data(context.device_context().GetPlace()); + auto runner = + NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}}); + auto stream = + context.template device_context() + .stream(); + runner.Run(stream); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = 
paddle::operators; +REGISTER_OP_NPU_KERNEL( + expand, ops::ExpandNPUKernel, + ops::ExpandNPUKernel); + +#endif diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..95f7865a8a3a4ee22600e4a64c7f2e7bf0fa2a2c --- /dev/null +++ b/paddle/fluid/operators/expand_op_npu_test.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(expand); +USE_OP_DEVICE_KERNEL(expand, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto in = scope->Var("X"); + auto expand_times = scope->Var("ExpandTimes"); + auto out = scope->Var("Out"); + auto in_t = in->GetMutable(); + auto out_t = out->GetMutable(); + auto expand_times_t = expand_times->GetMutable(); + + auto place = ctx.GetPlace(); + TensorFromVector(std::vector(3 * 1 * 7, 1), ctx, in_t); + TensorFromVector(std::vector({1, 10, 1}), ctx, expand_times_t); + + in_t->Resize(f::make_ddim({3, 1, 7})); + expand_times_t->Resize(f::make_ddim({3})); + out_t->Resize(f::make_ddim({3, 10, 7})); + out_t->mutable_data(place); + + f::AttributeMap attrs = {{}}; + auto op = f::OpRegistry::CreateOp( + "expand", {{"X", {"X"}}, {"ExpandTimes", {"ExpandTimes"}}}, + {{"Out", {"Out"}}}, attrs); + op->Run(*scope, place); + ctx.Wait(); + + auto out_dim = out_t->dims(); + EXPECT_EQ(out_dim.at(0), 3); + EXPECT_EQ(out_dim.at(1), 10); + EXPECT_EQ(out_dim.at(2), 7); +} + +TEST(expand, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..9d5499e00c82f6c562c0bfd115ae89753bf37c6e --- /dev/null +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2021 PaddlePaddle 
Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/fill_constant_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/utils.h" + +namespace paddle { +namespace operators { + +template +class FillConstantNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto data_type = + static_cast(ctx.Attr("dtype")); + auto str_value = ctx.Attr("str_value"); + auto float_value = ctx.Attr("value"); + + auto* out_var = ctx.Output("Out"); + auto place = ctx.GetPlace(); + auto stream = + ctx.template device_context() + .stream(); + + T value; + if (str_value.empty()) { + value = static_cast(float_value); + } else { + // handle NaN/Inf first, which cannot be read from stream. 
+ if (str_value == "inf") { + value = static_cast(std::numeric_limits::infinity()); + } else if (str_value == "-inf") { + value = static_cast(-std::numeric_limits::infinity()); + } else if (str_value == "nan") { + value = static_cast(std::numeric_limits::quiet_NaN()); + } else { + std::stringstream convert_stream(str_value); + if (std::is_same::value) { + int64_t tmp_value; + convert_stream >> tmp_value; + value = static_cast(tmp_value); + } else { + double tmp_value; + convert_stream >> tmp_value; + value = static_cast(tmp_value); + } + } + } + auto shape = GetShape(ctx); + + Tensor tensor_tmp(data_type); + tensor_tmp.mutable_data({1}, ctx.GetPlace()); + TensorFromVector(std::vector{value}, ctx.device_context(), &tensor_tmp); + + out_var->mutable_data(shape, place); + auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var}, + {{"dims", framework::vectorize(shape)}}); + runner.Run(stream); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + fill_constant, + ops::FillConstantNPUKernel, + ops::FillConstantNPUKernel, + ops::FillConstantNPUKernel, + ops::FillConstantNPUKernel); diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..8a487234ad94acd294193e26019e087dc3a7854c --- /dev/null +++ b/paddle/fluid/operators/gather_op_npu.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/gather_op.h" +#include +#include +#include +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/kron_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/platform/npu_info.h" + +namespace paddle { +namespace operators { + +template +class GatherOpNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + auto runner = NpuOpRunner("Gather", {*x, *index}, {*out}, + {{"validate_indices", true}}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class GatherGradOpNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *index = ctx.Input("Index"); + auto *x = ctx.Input("X"); + auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); + + // step1: Unsqueeze index + framework::Tensor tmp_tensor(index->type()); + const auto index_dims = index->dims(); + if (index_dims.size() == 1) { + tmp_tensor.ShareDataWith(*index); + std::vector new_dim = {index_dims[0], 1}; + tmp_tensor.Resize(framework::make_ddim(new_dim)); + index = &tmp_tensor; + } + + auto stream = + ctx.template device_context() + .stream(); + + // step2: ZerosLike x in device + Tensor zeroslike_xout(x->type()); + zeroslike_xout.Resize(x->dims()); + auto p = zeroslike_xout.mutable_data(ctx.GetPlace()); + + platform::NPUMemsetAsync(static_cast(p), 0, + zeroslike_xout.numel() * sizeof(T), stream); + + // step3: scatter(x_grad) + dx->mutable_data(ctx.GetPlace()); + auto runner_scatter = NpuOpRunner( + "TensorScatterUpdate", {zeroslike_xout, 
*index, *dout}, {*dx}, {}); + runner_scatter.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_NPU_KERNEL( + gather, ops::GatherOpNPUKernel, + ops::GatherOpNPUKernel, + ops::GatherOpNPUKernel); + +REGISTER_OP_NPU_KERNEL( + gather_grad, + ops::GatherGradOpNPUKernel, + ops::GatherGradOpNPUKernel, + ops::GatherGradOpNPUKernel); diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..de067e45585d91ce0efa2269909f9a1052a895ac --- /dev/null +++ b/paddle/fluid/operators/gather_op_npu_test.cc @@ -0,0 +1,169 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/gather_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(gather); +USE_OP_DEVICE_KERNEL(gather, NPU); +USE_OP(gather_grad); +USE_OP_DEVICE_KERNEL(gather_grad, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + auto index = scope->Var("Index"); + auto tensor_index = index->GetMutable(); + + std::vector init_x; + for (int64_t i = 1; i < 7; ++i) { + // 1,2,3,4,5,6 + init_x.push_back(static_cast(i)); + } + + // [[1, 2],[3, 4],[5, 6]] + TensorFromVector(init_x, ctx, tensor_x); + tensor_x->Resize(paddle::framework::make_ddim({3, 2})); + + std::vector init_index = {1, 2}; + paddle::framework::TensorFromVector(init_index, ctx, tensor_index); + tensor_index->Resize(paddle::framework::make_ddim({2})); + + ctx.Wait(); + + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + // run + f::AttributeMap attrs = {{"validate_indices", true}}; + auto op = f::OpRegistry::CreateOp( + op_type, {{"X", {"X"}}, {"Index", {"Index"}}}, {{"Out", {"Out"}}}, attrs); + + auto place = ctx.GetPlace(); + op->Run(*scope, place); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + ctx.Wait(); + + // ref:https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/tensor/manipulation/gather_cn.html#gather + for (int i = 0; i < static_cast(out_vec.size()); ++i) { + VLOG(3) << "out_vec[" << i << "] : " << out_vec[i]; + } + uint32_t expected_size = 4; + 
EXPECT_EQ((uint32_t)out_vec.size(), expected_size); + + // {3, 4, 5, 6} + std::vector expected_out_vec; + for (int64_t i = 3; i < 7; ++i) { + expected_out_vec.push_back(static_cast(i)); + } + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], expected_out_vec[i]); + } +} + +template +void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto index = scope->Var("Index"); + auto tensor_index = index->GetMutable(); + + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + auto dout = scope->Var("DOut"); + auto tensor_dout = dout->GetMutable(); + + std::vector init_index = {0, 1}; + paddle::framework::TensorFromVector(init_index, ctx, tensor_index); + tensor_index->Resize(paddle::framework::make_ddim({2})); + + std::vector init_x = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + TensorFromVector(init_x, ctx, tensor_x); + tensor_x->Resize(paddle::framework::make_ddim({3, 2})); + + std::vector init_dout = {5.0, 10.0, 2.0, 3.0}; + TensorFromVector(init_dout, ctx, tensor_dout); + tensor_dout->Resize(paddle::framework::make_ddim({2, 2})); + + ctx.Wait(); + + auto dx = scope->Var("DX"); + auto tensor_dx = dx->GetMutable(); + + // run + f::AttributeMap attrs; + auto op = f::OpRegistry::CreateOp( + op_type, {{"X", {"X"}}, {"Index", {"Index"}}, {"Out@GRAD", {"DOut"}}}, + {{"X@GRAD", {"DX"}}}, attrs); + + auto place = ctx.GetPlace(); + op->Run(*scope, place); + + std::vector dx_vec; + TensorToVector(*tensor_dx, ctx, &dx_vec); + + ctx.Wait(); + + uint32_t expected_size = 3 * 2; + EXPECT_EQ((uint32_t)dx_vec.size(), expected_size); + + std::vector expected_dx_vec = {5.0, 10.0, 2.0, 3.0, 0.0, 0.0}; + for (uint32_t i = 0; i < dx_vec.size(); i++) { + VLOG(3) << "dx_vec[i]=" << dx_vec[i]; + EXPECT_EQ(dx_vec[i], expected_dx_vec[i]); + } +} + +TEST(gather, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx, "gather"); +} + +TEST(gather, NPU_fp16) { + f::Scope scope; + 
p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx, "gather"); +} + +TEST(gather_grad, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + CompareGrad(&scope, ctx, "gather_grad"); +} diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..56aa509177cfd3e5ecfd521e0b66fd72fc708c38 --- /dev/null +++ b/paddle/fluid/operators/gelu_op_npu.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/operators/gelu_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class GeluNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Gelu", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class GeluGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + Tensor out(x->type()); + out.mutable_data(x->dims(), place); + auto out_runner = NpuOpRunner("Gelu", {*x}, {out}, {}); + out_runner.Run(stream); + + auto dx_runner = NpuOpRunner("GeluGrad", {*dout, *x, out}, {*dx}, {}); + dx_runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + gelu, ops::GeluNPUKernel, + ops::GeluNPUKernel); + +REGISTER_OP_NPU_KERNEL( + gelu_grad, + ops::GeluGradNPUKernel, + ops::GeluGradNPUKernel); diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f11812ce3bb2198d674796f84020fd4ac8d16e74 --- /dev/null +++ b/paddle/fluid/operators/gelu_op_npu_test.cc @@ -0,0 +1,168 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(gelu); +USE_OP_DEVICE_KERNEL(gelu, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + std::vector init_x; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_x.push_back(static_cast(1.0)); + } + + TensorFromVector(init_x, ctx, tensor_x); + tensor_x->Resize({10, 10}); + + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + f::AttributeMap attrs; + + ctx.Wait(); + + // run + auto place = ctx.GetPlace(); + + auto op = f::OpRegistry::CreateOp("gelu", {{"X", {"X"}}}, {{"Out", {"Out"}}}, + attrs); + op->Run(*scope, place); + + ctx.Wait(); + + // eval time + struct timeval start, end; + gettimeofday(&start, NULL); + + for (int i = 0; i < 100; i++) { + op->Run(*scope, place); + } + + ctx.Wait(); + + gettimeofday(&end, NULL); + int micros = + (((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec); + 
printf("used time: %d\n", micros / 100); + + // eval value + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + float expected = 0.841192; + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_FLOAT_EQ(out_vec[i], static_cast(expected)); + } +} + +template +void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { + auto dout = scope->Var("DOut"); + auto tensor_dout = dout->GetMutable(); + + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + std::vector init_dout; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_dout.push_back(static_cast(1.0)); + } + + std::vector init_x; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_x.push_back(static_cast(1.0)); + } + + TensorFromVector(init_dout, ctx, tensor_dout); + tensor_dout->Resize({10, 10}); + TensorFromVector(init_x, ctx, tensor_x); + tensor_x->Resize({10, 10}); + + auto dx = scope->Var("DX"); + auto tensor_dx = dx->GetMutable(); + + f::AttributeMap attrs; + + ctx.Wait(); + + // run + auto place = ctx.GetPlace(); + + auto op = f::OpRegistry::CreateOp("gelu_grad", + {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}}, + {{"X@GRAD", {"DX"}}}, attrs); + op->Run(*scope, place); + + ctx.Wait(); + + // eval time + struct timeval start, end; + gettimeofday(&start, NULL); + + for (int i = 0; i < 100; i++) { + op->Run(*scope, place); + } + + ctx.Wait(); + + gettimeofday(&end, NULL); + int micros = + (((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec); + printf("used time: %d\n", micros / 100); + + // eval value + std::vector dx_vec; + TensorToVector(*tensor_dx, ctx, &dx_vec); + + float expected = 1.082964; + for (uint32_t i = 0; i < dx_vec.size(); i++) { + EXPECT_FLOAT_EQ(dx_vec[i], static_cast(expected)); + } +} + +TEST(gelu, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} + +TEST(gelu_grad, NPU) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + CompareGrad(&scope, ctx); +} diff --git 
a/paddle/fluid/operators/increment_op_npu.cc b/paddle/fluid/operators/increment_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..c1859bce02c904f15596798ccd7e6845e81f13a6 --- /dev/null +++ b/paddle/fluid/operators/increment_op_npu.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/increment_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace framework { +class OpDesc; +class Variable; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + +namespace paddle { +namespace operators { + +template +class IncrementalNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x_tensor = context.Input("X"); + auto* out_tensor = context.Output("Out"); + float step = context.Attr("step"); + out_tensor->mutable_data(context.GetPlace()); + + Tensor step_tensor(x_tensor->type()); + std::vector step_vec; + step_vec.push_back(static_cast(step)); + framework::TensorFromVector(step_vec, context.device_context(), + &step_tensor); + + auto runner = + NpuOpRunner("Add", {*x_tensor, step_tensor}, {*out_tensor}, {}); + + auto stream = + context.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} 
// namespace operators +} // namespace paddle + +namespace plat = paddle::platform; +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + increment, + ops::IncrementalNPUKernel, + ops::IncrementalNPUKernel, + ops::IncrementalNPUKernel, + ops::IncrementalNPUKernel, + ops::IncrementalNPUKernel) diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b466ae275dd1c177802d883a32b3c1a942df0e0a --- /dev/null +++ b/paddle/fluid/operators/increment_op_npu_test.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(increment); +USE_OP_DEVICE_KERNEL(increment, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + std::vector init; + init.push_back(static_cast(1.0)); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({1}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + f::AttributeMap attr_input = {{"step", static_cast(2.0)}}; + auto op = f::OpRegistry::CreateOp("increment", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attr_input); + + op->Run(*scope, place); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + ctx.Wait(); + + EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)1); + EXPECT_EQ(out_vec[0], static_cast(3.0)); +} + +TEST(increment, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx, "increment"); +} + +TEST(increment, NPU_fp64) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx, "increment"); +} diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..95549319cd2096c02de70b521d142ad221f84cbc --- /dev/null +++ b/paddle/fluid/operators/layer_norm_op_npu.cc @@ -0,0 +1,390 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/layer_norm_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +using DataLayout = framework::DataLayout; + +template +class NormDataType; + +template <> +class NormDataType { + public: + // The scaling param type is float for HALF and FLOAT tensors + using ScalingParamType = const float; + using BatchNormParamType = float; +}; + +template <> +class NormDataType { + public: + using ScalingParamType = const float; + using BatchNormParamType = float; +}; + +template +using NormDataType = NormDataType; +template +using LayerNormParamType = typename NormDataType::BatchNormParamType; + +template +class LayerNormNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using U = LayerNormParamType; + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + const auto epsilon = ctx.Attr("epsilon"); + const auto* x = ctx.Input("X"); + const auto* scale = ctx.Input("Scale"); + const auto* bias = ctx.Input("Bias"); + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("Mean"); + auto* variance = ctx.Output("Variance"); + const auto& x_dims = x->dims(); + std::vector axes; + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int right = static_cast(matrix_dim[1]); + + // The shape of scale and bias should 
be equal to x.shape[begin_norm_axis:], + // required by Ascend. + for (auto i = begin_norm_axis; i < x_dims.size(); ++i) { + axes.push_back(x_dims[i]); + } + + auto place = ctx.GetPlace(); + auto stream = + ctx.template device_context() + .stream(); + + Tensor default_scale(x->type()); + if (!scale) { + default_scale.mutable_data(framework::make_ddim(axes), place); + Tensor value(x->type()); + value.mutable_data({1}, place); + TensorFromVector(std::vector{static_cast(1.0)}, + ctx.device_context(), &value); + auto runner = + NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); + runner.Run(stream); + scale = &default_scale; + } else { + const_cast(scale)->Resize(framework::make_ddim(axes)); + } + + Tensor default_bias(x->type()); + if (!bias) { + default_bias.mutable_data(framework::make_ddim(axes), place); + Tensor value(x->type()); + value.mutable_data({1}, place); + TensorFromVector(std::vector{static_cast(0)}, ctx.device_context(), + &value); + auto runner = + NpuOpRunner("FillD", {value}, {default_bias}, {{"dims", axes}}); + runner.Run(stream); + bias = &default_bias; + } else { + const_cast(bias)->Resize(framework::make_ddim(axes)); + } + + // cast scale from LayerNormParamType to T if needed + Tensor cast_scale(x->type()); + if (x->type() == framework::proto::VarType::FP16 && + scale->type() == framework::proto::VarType::FP32) { + cast_scale.Resize(scale->dims()); + cast_scale.mutable_data(ctx.GetPlace()); + auto dst_dtype = ConvertToNpuDtype(x->type()); + auto runner_cast_scale = + NpuOpRunner("Cast", {*scale}, {cast_scale}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_scale.Run(stream); + } else { + cast_scale.ShareDataWith(*scale); + } + + // cast bias from LayerNormParamType to T if needed + Tensor cast_bias(x->type()); + if (x->type() == framework::proto::VarType::FP16 && + bias->type() == framework::proto::VarType::FP32) { + cast_bias.Resize(bias->dims()); + cast_bias.mutable_data(ctx.GetPlace()); + auto dst_dtype = 
ConvertToNpuDtype(x->type()); + auto runner_cast_bias = + NpuOpRunner("Cast", {*bias}, {cast_bias}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_bias.Run(stream); + } else { + cast_bias.ShareDataWith(*bias); + } + + y->mutable_data(ctx.GetPlace()); + + // mean should be of U type + Tensor* tmp_mean = mean; + Tensor cast_mean(x->type()); + if (x->type() == framework::proto::VarType::FP16 && + (scale->type() == framework::proto::VarType::FP32 || + bias->type() == framework::proto::VarType::FP32)) { + cast_mean.Resize(mean->dims()); + cast_mean.mutable_data(ctx.GetPlace()); + tmp_mean = &cast_mean; + mean->mutable_data(ctx.GetPlace()); + } else { + mean->mutable_data(ctx.GetPlace()); + } + + // same for variance + Tensor* tmp_variance = variance; + Tensor cast_variance(x->type()); + if (x->type() == framework::proto::VarType::FP16 && + (scale->type() == framework::proto::VarType::FP32 || + bias->type() == framework::proto::VarType::FP32)) { + cast_variance.Resize(variance->dims()); + cast_variance.mutable_data(ctx.GetPlace()); + tmp_variance = &cast_variance; + variance->mutable_data(ctx.GetPlace()); + } else { + variance->mutable_data(ctx.GetPlace()); + } + + auto runner = NpuOpRunner("LayerNorm", {*x, cast_scale, cast_bias}, + {*y, *tmp_mean, *tmp_variance}, + {{"begin_norm_axis", begin_norm_axis}, + {"begin_params_axis", begin_norm_axis}, + {"epsilon", epsilon}}); + runner.Run(stream); + + // cast back from FP16 to FP32 + if (x->type() == framework::proto::VarType::FP16 && + mean->type() == framework::proto::VarType::FP32) { + auto dst_dtype = ConvertToNpuDtype(mean->type()); + auto runner_cast_mean = + NpuOpRunner("Cast", {*tmp_mean}, {*mean}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_mean.Run(stream); + } + // same for variance + if (x->type() == framework::proto::VarType::FP16 && + variance->type() == framework::proto::VarType::FP32) { + auto dst_dtype = ConvertToNpuDtype(variance->type()); + auto runner_cast_variance = + 
NpuOpRunner("Cast", {*tmp_variance}, {*variance}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_variance.Run(stream); + } + + // revert shape of scale and bias + // TODO(zhiqiu): better implementation, use tmp tensor to avoid write input + // tensor. + const_cast(scale)->Resize(framework::make_ddim({right})); + const_cast(bias)->Resize(framework::make_ddim({right})); + } +}; + +template +class LayerNormGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using U = LayerNormParamType; + const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); + const auto* x = ctx.Input("X"); + const auto& x_dims = x->dims(); + const auto* mean = ctx.Input("Mean"); + const auto* variance = ctx.Input("Variance"); + const auto* scale = ctx.Input("Scale"); + const auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dscale = ctx.Output(framework::GradVarName("Scale")); + auto* dbias = ctx.Output(framework::GradVarName("Bias")); + + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int right = static_cast(matrix_dim[1]); + + std::vector axes; + for (auto i = begin_norm_axis; i < x_dims.size(); ++i) { + axes.push_back(x_dims[i]); + } + + auto place = ctx.GetPlace(); + auto stream = + ctx.template device_context() + .stream(); + + // No need to compute any gradient, jusr return + if (!dx && !dscale && !dbias) { + return; + } + + // The rank of mean should be equal to x, required by Ascend. 
+ std::vector new_shape; + for (auto i = 0; i < begin_norm_axis; ++i) { + new_shape.push_back(x_dims[i]); + } + for (auto i = begin_norm_axis; i < x_dims.size(); ++i) { + new_shape.push_back(1); + } + + auto mean_dims = mean->dims(); + const_cast(mean)->Resize(framework::make_ddim({new_shape})); + const_cast(variance)->Resize(framework::make_ddim({new_shape})); + + Tensor default_scale(x->type()); + if (!scale) { + default_scale.mutable_data(framework::make_ddim(axes), place); + Tensor value(x->type()); + value.mutable_data({1}, place); + TensorFromVector(std::vector{static_cast(1.0)}, + ctx.device_context(), &value); + auto runner = + NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); + runner.Run(stream); + scale = &default_scale; + } else { + const_cast(scale)->Resize(framework::make_ddim(axes)); + } + + // cast scale from LayerNormParamType to T if needed + Tensor cast_scale(x->type()); + if (x->type() == framework::proto::VarType::FP16 && + scale->type() == framework::proto::VarType::FP32) { + cast_scale.Resize(scale->dims()); + cast_scale.mutable_data(ctx.GetPlace()); + auto dst_dtype = ConvertToNpuDtype(x->type()); + auto runner_cast_scale = + NpuOpRunner("Cast", {*scale}, {cast_scale}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_scale.Run(stream); + } else { + cast_scale.ShareDataWith(*scale); + } + + // cast mean from LayerNormParamType to T if needed + Tensor cast_mean(x->type()); + if (x->type() == framework::proto::VarType::FP16 && + mean->type() == framework::proto::VarType::FP32) { + cast_mean.Resize(mean->dims()); + cast_mean.mutable_data(ctx.GetPlace()); + auto dst_dtype = ConvertToNpuDtype(x->type()); + auto runner_cast_mean = + NpuOpRunner("Cast", {*mean}, {cast_mean}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_mean.Run(stream); + } else { + cast_mean.ShareDataWith(*mean); + } + + // cast variance from LayerNormParamType to T if needed + Tensor cast_variance(x->type()); + if (x->type() == 
framework::proto::VarType::FP16 && + variance->type() == framework::proto::VarType::FP32) { + cast_variance.Resize(variance->dims()); + cast_variance.mutable_data(ctx.GetPlace()); + auto dst_dtype = ConvertToNpuDtype(x->type()); + auto runner_cast_variance = + NpuOpRunner("Cast", {*variance}, {cast_variance}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_variance.Run(stream); + } else { + cast_variance.ShareDataWith(*variance); + } + + Tensor dx_(dy->type()), dscale_(dy->type()), dbias_(dy->type()); + dx = (dx == nullptr) ? &dx_ : dx; + dscale = (dscale == nullptr) ? &dscale_ : dscale; + dbias = (dbias == nullptr) ? &dbias_ : dbias; + + dx->Resize(x->dims()); + dx->mutable_data(ctx.GetPlace()); + + dscale->Resize(framework::make_ddim(axes)); + + dbias->Resize(framework::make_ddim(axes)); + + // dscale should be of U type + Tensor* tmp_dscale = dscale; + Tensor cast_dscale(x->type()); + if (x->type() == framework::proto::VarType::FP16 && + (mean->type() == framework::proto::VarType::FP32 || + variance->type() == framework::proto::VarType::FP32)) { + cast_dscale.Resize(dscale->dims()); + cast_dscale.mutable_data(ctx.GetPlace()); + tmp_dscale = &cast_dscale; + dscale->mutable_data(ctx.GetPlace()); + } else { + dscale->mutable_data(ctx.GetPlace()); + } + + // same for dbias + Tensor* tmp_dbias = dbias; + Tensor cast_dbias(x->type()); + if (x->type() == framework::proto::VarType::FP16 && + (mean->type() == framework::proto::VarType::FP32 || + variance->type() == framework::proto::VarType::FP32)) { + cast_dbias.Resize(dbias->dims()); + cast_dbias.mutable_data(ctx.GetPlace()); + tmp_dbias = &cast_dbias; + dbias->mutable_data(ctx.GetPlace()); + } else { + dbias->mutable_data(ctx.GetPlace()); + } + + auto runner = NpuOpRunner("LayerNormGrad", + {*dy, *x, cast_variance, cast_mean, cast_scale}, + {*dx, *tmp_dscale, *tmp_dbias}, {}); + runner.Run(stream); + + // cast back from FP16 to FP32 + if (x->type() == framework::proto::VarType::FP16 && + dscale->type() == 
framework::proto::VarType::FP32) { + auto dst_dtype = ConvertToNpuDtype(dscale->type()); + auto runner_cast_dscale = + NpuOpRunner("Cast", {*tmp_dscale}, {*dscale}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_dscale.Run(stream); + } + // same for dbias + if (x->type() == framework::proto::VarType::FP16 && + dbias->type() == framework::proto::VarType::FP32) { + auto dst_dtype = ConvertToNpuDtype(dbias->type()); + auto runner_cast_dbias = + NpuOpRunner("Cast", {*tmp_dbias}, {*dbias}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_dbias.Run(stream); + } + + const_cast(mean)->Resize(mean_dims); + const_cast(variance)->Resize(mean_dims); + const_cast(scale)->Resize(framework::make_ddim({right})); + dscale->Resize(framework::make_ddim({right})); + dbias->Resize(framework::make_ddim({right})); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(layer_norm, ops::LayerNormNPUKernel, + ops::LayerNormNPUKernel); +REGISTER_OP_NPU_KERNEL(layer_norm_grad, ops::LayerNormGradNPUKernel, + ops::LayerNormGradNPUKernel); diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..fab2d7f7aa0542d9d5a2143c72c1551d3884592d --- /dev/null +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class LookupTableV2NPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *ids_t = ctx.Input("Ids"); // int tensor + auto *output_t = ctx.Output("Out"); // float tensor + auto *table_t = ctx.Input("W"); + auto *table_var = ctx.InputVar("W"); + PADDLE_ENFORCE_EQ( + table_var->IsType(), true, + platform::errors::InvalidArgument("npu only accept LoDTensor")); + output_t->mutable_data(ctx.GetPlace()); + framework::NPUAttributeMap attr_input = {{"validate_indices", false}}; + + auto runner = + NpuOpRunner("Gather", {*table_t, *ids_t}, {*output_t}, attr_input); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class LookupTableV2GradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *ids_t = ctx.Input("Ids"); + auto *output_grad_t = + ctx.Input(framework::GradVarName("Out")); + auto *table_grad_t = + ctx.Output(framework::GradVarName("W")); + table_grad_t->mutable_data(ctx.GetPlace()); + + auto stream = + ctx.template device_context() + .stream(); + + // step2: ZerosLike x in device + Tensor zeroslike_w(table_grad_t->type()); + zeroslike_w.Resize(table_grad_t->dims()); + auto p = zeroslike_w.mutable_data(ctx.GetPlace()); + + platform::NPUMemsetAsync(static_cast(p), 0, + zeroslike_w.numel() * sizeof(T), stream); + + table_grad_t->mutable_data(ctx.GetPlace()); + auto runner_scatter = + NpuOpRunner("ScatterAdd", {zeroslike_w, *ids_t, *output_grad_t}, + {*table_grad_t}, {}); + runner_scatter.Run(stream); + } +}; +} // namespace 
operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + lookup_table_v2, + ops::LookupTableV2NPUKernel, + ops::LookupTableV2NPUKernel); + +REGISTER_OP_NPU_KERNEL( + lookup_table_v2_grad, ops::LookupTableV2GradNPUKernel, + ops::LookupTableV2GradNPUKernel); diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu_test.cc b/paddle/fluid/operators/lookup_table_v2_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f37915834bd75677cc433c7a67a408082c43201e --- /dev/null +++ b/paddle/fluid/operators/lookup_table_v2_op_npu_test.cc @@ -0,0 +1,142 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(lookup_table_v2); +USE_OP_DEVICE_KERNEL(lookup_table_v2, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto ids = scope->Var("Ids"); + auto out = scope->Var("Out"); + auto w = scope->Var("W"); + + auto ids_t = ids->GetMutable(); + auto out_t = out->GetMutable(); + auto w_t = w->GetMutable(); + int bsz = 10; + int dim = 32; + int seqlen = 8; + int vocab_size = 100; + TensorFromVector(std::vector(bsz * seqlen, 3), ctx, ids_t); + std::vector val(vocab_size * dim, 10.); + TensorFromVector(val, ctx, w_t); + ids_t->Resize({bsz, seqlen}); + w_t->Resize({vocab_size, dim}); + out_t->Resize({bsz, seqlen, dim}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + out_t->mutable_data(place); + f::AttributeMap attrs = {{}}; + auto op = f::OpRegistry::CreateOp("lookup_table_v2", + {{"W", {"W"}}, {"Ids", {"Ids"}}}, + {{"Out", {"Out"}}}, attrs); + op->Run(*scope, place); + std::vector out_v; + TensorToVector(*out_t, ctx, &out_v); + ctx.Wait(); + EXPECT_EQ(out_t->numel(), bsz * seqlen * dim); + T res = std::accumulate(out_v.begin(), out_v.end(), 0.); + float eps = 1.e-6; + EXPECT_LT(fabs(res - bsz * seqlen * dim * 10.), eps); +} + +template +void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto w = scope->Var("W"); + auto ids = scope->Var("Ids"); + auto out = scope->Var("DOut"); + auto dw = scope->Var("DW"); + + auto w_t = w->GetMutable(); + auto ids_t = ids->GetMutable(); + 
auto out_t = out->GetMutable(); + auto dw_t = dw->GetMutable(); + + int bsz = 2; + int dim = 2; + int seqlen = 2; + int vocab_size = 4; + + std::vector val_int(bsz * seqlen, 3); + std::vector val(vocab_size * dim, 0.); + std::vector val_out(bsz * seqlen * dim, 1.); + + TensorFromVector(val_int, ctx, ids_t); + TensorFromVector(val, ctx, w_t); + TensorFromVector(val, ctx, dw_t); + TensorFromVector(val_out, ctx, out_t); + + w_t->Resize({vocab_size, dim}); + ids_t->Resize({bsz, seqlen}); + out_t->Resize({bsz, seqlen, dim}); + dw_t->Resize({vocab_size, dim}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + out_t->mutable_data(place); + w_t->mutable_data(place); + dw_t->mutable_data(place); + f::AttributeMap attrs = {{}}; + auto op = f::OpRegistry::CreateOp( + "lookup_table_v2_grad", + {{"Ids", {"Ids"}}, {"W", {"W"}}, {"Out@GRAD", {"DOut"}}}, + {{"W@GRAD", {"DW"}}}, attrs); + op->Run(*scope, place); + ctx.Wait(); + std::vector w_v; + TensorToVector(*dw_t, ctx, &w_v); + ctx.Wait(); + EXPECT_EQ(dw_t->numel(), vocab_size * dim); + T res = std::accumulate(w_v.begin(), w_v.end(), 0.); + float eps = 1.e-6; + EXPECT_LT(fabs(res - bsz * seqlen * dim), eps); +} + +TEST(lookup_table_v2, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} + +TEST(lookup_table_v2_grad, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + CompareGrad(&scope, ctx); +} diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..d3022056a47ded99e63aa05c1aca8e9b31ccc3fe --- /dev/null +++ b/paddle/fluid/operators/matmul_v2_op_npu.cc @@ -0,0 +1,160 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/matmul_v2_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class MatMulV2NPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + bool transpose_x = ctx.Attr("trans_x"); + bool transpose_y = ctx.Attr("trans_y"); + + if (x->dims().size() == 2) { + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner( + "MatMul", {*x, *y}, {*out}, + {{"transpose_x1", transpose_x}, {"transpose_x2", transpose_y}}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + + } else if (x->dims().size() > 2) { + out->mutable_data(ctx.GetPlace()); + + auto runner = + NpuOpRunner("BatchMatMul", {*x, *y}, {*out}, + {{"adj_x1", transpose_x}, {"adj_x2", transpose_y}}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } + } +}; + +template +class MatMulV2GradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + bool transpose_y = ctx.Attr("trans_y"); + auto stream = + ctx.template device_context() + .stream(); + + if (x->dims().size() == 2) { + if 
(transpose_y) { + if (dx) { + dx->mutable_data(ctx.GetPlace()); + auto runner_dx = + NpuOpRunner("MatMul", {*dout, *y}, {*dx}, + {{"transpose_x1", false}, {"transpose_x2", false}}); + + runner_dx.Run(stream); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + auto runner_dy = + NpuOpRunner("MatMul", {*dout, *x}, {*dy}, + {{"transpose_x1", true}, {"transpose_x2", false}}); + + runner_dy.Run(stream); + } + + } else { + if (dx) { + dx->mutable_data(ctx.GetPlace()); + auto runner_dx = + NpuOpRunner("MatMul", {*dout, *y}, {*dx}, + {{"transpose_x1", false}, {"transpose_x2", true}}); + + runner_dx.Run(stream); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + auto runner_dy = + NpuOpRunner("MatMul", {*x, *dout}, {*dy}, + {{"transpose_x1", true}, {"transpose_x2", false}}); + + runner_dy.Run(stream); + } + } + } else if (x->dims().size() > 2) { + if (transpose_y) { + if (dx) { + dx->mutable_data(ctx.GetPlace()); + auto runner_dx = NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, + {{"adj_x1", false}, {"adj_x2", false}}); + + runner_dx.Run(stream); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + auto runner_dy = NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy}, + {{"adj_x1", true}, {"adj_x2", false}}); + + runner_dy.Run(stream); + } + } else { + if (dx) { + dx->mutable_data(ctx.GetPlace()); + auto runner_dx = NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, + {{"adj_x1", false}, {"adj_x2", true}}); + + runner_dx.Run(stream); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + auto runner_dy = NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy}, + {{"adj_x1", true}, {"adj_x2", false}}); + runner_dy.Run(stream); + } + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + matmul_v2, + ops::MatMulV2NPUKernel, + ops::MatMulV2NPUKernel); +REGISTER_OP_NPU_KERNEL( + matmul_v2_grad, + ops::MatMulV2GradNPUKernel, + ops::MatMulV2GradNPUKernel); diff --git a/paddle/fluid/operators/mean_op_npu.cc 
b/paddle/fluid/operators/mean_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..676086bd08063353863fc5eb256d49682bcdd12c --- /dev/null +++ b/paddle/fluid/operators/mean_op_npu.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/mean_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +class MeanNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + std::vector axes; + + framework::NPUAttributeMap attr_input = {{"keep_dims", false}, + {"axes", axes}}; + + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("ReduceMeanD", {*x}, {*out}, attr_input); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class MeanGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto stream = + context.template device_context() + .stream(); + + auto grad = context.Input(framework::GradVarName("Out")); + + PADDLE_ENFORCE_EQ(grad->numel(), 1, + platform::errors::InvalidArgument( + "Mean Gradient Input Tensor len should be 1. 
But " + "received Out@Grad's elements num is %d.", + grad->numel())); + + auto IG = context.Output(framework::GradVarName("X")); + IG->mutable_data(context.GetPlace()); + + // ones + Tensor ones(grad->type()); + ones.mutable_data(IG->dims(), context.GetPlace()); + auto runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {}); + runner_ones.Run(stream); + + // means + Tensor mean_tensor(grad->type()); + mean_tensor.Resize({1}); + mean_tensor.mutable_data(context.GetPlace()); + std::vector mean_vec; + mean_vec.push_back(1.0 / static_cast(IG->numel())); + framework::TensorFromVector(mean_vec, context.device_context(), + &mean_tensor); + + // means mul ones + Tensor mean_ma(grad->type()); + mean_ma.Resize(IG->dims()); + mean_ma.mutable_data(context.GetPlace()); + auto runner_mul_1 = NpuOpRunner("Mul", {mean_tensor, ones}, {mean_ma}, {}); + runner_mul_1.Run(stream); + + // and mul grad + auto runner_mul_2 = NpuOpRunner("Mul", {mean_ma, *grad}, {*IG}, {}); + runner_mul_2.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_NPU_KERNEL( + mean, ops::MeanNPUKernel, + ops::MeanNPUKernel, + ops::MeanNPUKernel, + ops::MeanNPUKernel) + +REGISTER_OP_NPU_KERNEL( + mean_grad, ops::MeanGradNPUKernel, + ops::MeanGradNPUKernel, + ops::MeanGradNPUKernel, + ops::MeanGradNPUKernel) diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ffcbaf55314a46888e15572e8477054b23ae2bb --- /dev/null +++ b/paddle/fluid/operators/metrics/accuracy_op_npu.cc @@ -0,0 +1,124 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class AccuracyNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* pred = ctx.Input("Out"); + auto* label = ctx.Input("Label"); + // auto* logits = ctx.Input("Indices"); + + auto* acc = ctx.Output("Accuracy"); + auto* correct = ctx.Output("Correct"); + auto* total = ctx.Output("Total"); + auto stream = + ctx.template device_context() + .stream(); + + // cast pred + Tensor tmp_pred(pred->type()); + tmp_pred.Resize(pred->dims()); + tmp_pred.mutable_data(ctx.GetPlace()); + auto runner_cast_pred = + NpuOpRunner("Cast", {*pred}, {tmp_pred}, + {{"dst_type", static_cast(ACL_INT32)}}); + runner_cast_pred.Run(stream); + + // cast label + Tensor tmp_label(label->type()); + tmp_label.Resize(label->dims()); + tmp_label.mutable_data(ctx.GetPlace()); + auto runner_cast_label = + NpuOpRunner("Cast", {*label}, {tmp_label}, + {{"dst_type", static_cast(ACL_INT32)}}); + runner_cast_label.Run(stream); + + // equal + Tensor tmp_equal(label->type()); + tmp_equal.Resize(label->dims()); + tmp_equal.mutable_data(ctx.GetPlace()); + auto runner_equal = + NpuOpRunner("Equal", {tmp_pred, tmp_label}, {tmp_equal}, {}); + runner_equal.Run(stream); + + // cast equal + Tensor tmp_equal_cast(label->type()); + tmp_equal_cast.Resize(label->dims()); + 
tmp_equal_cast.mutable_data(ctx.GetPlace()); + auto runner_cast_equal = + NpuOpRunner("Cast", {tmp_equal}, {tmp_equal_cast}, + {{"dst_type", static_cast(ACL_FLOAT)}}); + runner_cast_equal.Run(stream); + + // acc + acc->mutable_data(ctx.GetPlace()); + std::vector axes_vec_1; + auto runner_acc = NpuOpRunner("ReduceMeanD", {tmp_equal_cast}, {*acc}, + {{"keep_dims", false}, {"axes", axes_vec_1}}); + runner_acc.Run(stream); + + // correct + correct->mutable_data(ctx.GetPlace()); + std::vector axes_vec_2; + auto runner_correct = + NpuOpRunner("ReduceSumD", {tmp_equal_cast}, {*correct}, + {{"keep_dims", false}, {"axes", axes_vec_2}}); + runner_correct.Run(stream); + + // ones_tensor + Tensor ones_tensor(label->type()); + ones_tensor.Resize(label->dims()); + ones_tensor.mutable_data(ctx.GetPlace()); + auto runner_oneslike = + NpuOpRunner("OnesLike", {tmp_label}, {ones_tensor}, {}); + runner_oneslike.Run(stream); + + // ones_tensor_cast + Tensor ones_tensor_cast(label->type()); + ones_tensor_cast.Resize(label->dims()); + ones_tensor_cast.mutable_data(ctx.GetPlace()); + auto runner_ones_cast = + NpuOpRunner("Cast", {ones_tensor}, {ones_tensor_cast}, + {{"dst_type", static_cast(ACL_FLOAT)}}); + runner_ones_cast.Run(stream); + + // total + total->mutable_data(ctx.GetPlace()); + std::vector axes_vec_3; + auto runner_total = + NpuOpRunner("ReduceSumD", {ones_tensor_cast}, {*total}, + {{"keep_dims", false}, {"axes", axes_vec_3}}); + runner_total.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + accuracy, ops::AccuracyNPUKernel, + ops::AccuracyNPUKernel, + ops::AccuracyNPUKernel, + ops::AccuracyNPUKernel); diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e0736239d40f289a11a1e1fd8380fcbad904a667 --- /dev/null +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -0,0 +1,237 @@ +/* Copyright (c) 
2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/mul_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class MulNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + int x_num_col_dims = ctx.Attr("x_num_col_dims"); + int y_num_col_dims = ctx.Attr("y_num_col_dims"); + auto stream = + ctx.template device_context() + .stream(); + if (x_num_col_dims == 1 && y_num_col_dims == 1) { + if (x->dims().size() == 2 && y->dims().size() == 2) { + out->mutable_data(ctx.GetPlace()); + auto runner = + NpuOpRunner("MatMul", {*x, *y}, {*out}, + {{"transpose_x1", false}, {"transpose_x2", false}}); + + runner.Run(stream); + } else if (x->dims().size() == 3 && y->dims().size() == 2) { + // reshape + Tensor tmp_x(x->type()); + int64_t sec_dim = x->dims()[1] * x->dims()[2]; + int64_t first_dim = x->dims()[0]; + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); + tmp_x.mutable_data(ctx.GetPlace()); + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), &tmp_x); + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); + out->mutable_data(ctx.GetPlace()); + // matmul + auto runner = + NpuOpRunner("MatMul", {tmp_x, *y}, {*out}, + {{"transpose_x1", 
false}, {"transpose_x2", false}}); + runner.Run(stream); + } else { + PADDLE_THROW( + platform::errors::InvalidArgument("npu error: not suppert dims")); + } + // to do other + } else if (x->dims().size() == 3 && y->dims().size() == 2) { + // for example: x.shape=[2, 3, 4] y.shape=[4, 5], expect [2, 3, 5] + PADDLE_ENFORCE_EQ(x_num_col_dims, 2, + platform::errors::InvalidArgument( + "now only support x_num_col_dims == 2: but got %d", + x_num_col_dims)); + // flatten => x.shape=[6, 4] + Tensor tmp_x(x->type()); + int64_t first_dim = x->dims()[0] * x->dims()[1]; + int64_t sec_dim = x->dims()[2]; + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); + tmp_x.mutable_data(ctx.GetPlace()); + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), &tmp_x); + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); + + // matmul [6,4] , [4, 5] => [6, 5] + Tensor tmp_matmul(x->type()); + tmp_matmul.Resize(framework::make_ddim({first_dim, y->dims()[1]})); + tmp_matmul.mutable_data(ctx.GetPlace()); + + auto runner_matmul = + NpuOpRunner("MatMul", {tmp_x, *y}, {tmp_matmul}, + {{"transpose_x1", false}, {"transpose_x2", false}}); + + runner_matmul.Run(stream); + // reshape [6, 5] => [2, 3, 5] + (*out).Resize( + framework::make_ddim({x->dims()[0], x->dims()[1], y->dims()[1]})); + out->mutable_data(ctx.GetPlace(), x->type()); + framework::TensorCopy( + tmp_matmul, ctx.GetPlace(), + ctx.template device_context(), out); + (*out).Resize( + framework::make_ddim({x->dims()[0], x->dims()[1], y->dims()[1]})); + } + } +}; + +template +class MulGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int x_num_col_dims = ctx.Attr("x_num_col_dims"); + int y_num_col_dims = 
ctx.Attr("y_num_col_dims"); + auto stream = + ctx.template device_context() + .stream(); + if (x_num_col_dims == 1 && y_num_col_dims == 1) { + if (x->dims().size() == 2 && y->dims().size() == 2) { + if (dx) { + dx->mutable_data(ctx.GetPlace()); + auto runner_dx = + NpuOpRunner("MatMul", {*dout, *y}, {*dx}, + {{"transpose_x1", false}, {"transpose_x2", true}}); + + runner_dx.Run(stream); + } + + if (dy) { + dy->mutable_data(ctx.GetPlace()); + auto runner_dy = + NpuOpRunner("MatMul", {*x, *dout}, {*dy}, + {{"transpose_x1", true}, {"transpose_x2", false}}); + + runner_dy.Run(stream); + } + } else if (x->dims().size() == 3 && y->dims().size() == 2) { + // flatten => x.shape=[6, 4] + // matmul + if (dx) { + // matmul [2, 5] * [12, 5] => [2, 12] + dx->mutable_data(ctx.GetPlace()); + auto dx_dims = dx->dims(); + dx->Resize(framework::make_ddim({dout->dims()[0], y->dims()[0]})); + auto runner_matmul = + NpuOpRunner("MatMul", {*dout, *y}, {*dx}, + {{"transpose_x1", false}, {"transpose_x2", true}}); + runner_matmul.Run(stream); + // reshape [2, 12] => [2, 3, 4] + dx->Resize(dx_dims); + } + + if (dy) { + // flatten + Tensor tmp_x(x->type()); + int64_t sec_dim = x->dims()[1] * x->dims()[2]; + int64_t first_dim = x->dims()[0]; + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); + tmp_x.mutable_data(ctx.GetPlace()); + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), &tmp_x); + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); + dy->mutable_data(ctx.GetPlace()); + auto runner_dy = + NpuOpRunner("MatMul", {tmp_x, *dout}, {*dy}, + {{"transpose_x1", true}, {"transpose_x2", false}}); + + runner_dy.Run(stream); + } + } + } else if (x->dims().size() == 3 && y->dims().size() == 2) { + // for example: x.shape=[2, 3, 4] y.shape=[4, 5], expect [2, 3, 5] + PADDLE_ENFORCE_EQ(x_num_col_dims, 2, + platform::errors::InvalidArgument( + "now only support x_num_col_dims == 2: but got %d", + x_num_col_dims)); + // tmp_dout both used by dx and dy + Tensor 
tmp_dout(x->type()); + int64_t dout_first_dim = dout->dims()[0] * dout->dims()[1]; + int64_t dout_sec_dim = dout->dims()[2]; + tmp_dout.Resize(framework::make_ddim({dout_first_dim, dout_sec_dim})); + tmp_dout.mutable_data(ctx.GetPlace()); + framework::TensorCopy( + *dout, ctx.GetPlace(), + ctx.template device_context(), &tmp_dout); + tmp_dout.Resize(framework::make_ddim({dout_first_dim, dout_sec_dim})); + + if (dx) { + // tmp_dout * y [6,5] * [4,5] => [6, 4] + dx->mutable_data(ctx.GetPlace()); + auto dx_dims = dx->dims(); + dx->Resize(framework::make_ddim({dout_first_dim, y->dims()[0]})); + auto runner_matmul = + NpuOpRunner("MatMul", {tmp_dout, *y}, {*dx}, + {{"transpose_x1", false}, {"transpose_x2", true}}); + runner_matmul.Run(stream); + // reshape [2, 12] => [2, 3, 4] + dx->Resize(dx_dims); + } + if (dy) { + // flatten x.shape [2,3,4] => [6, 4] + Tensor tmp_x(x->type()); + int64_t first_dim = x->dims()[0] * x->dims()[1]; + int64_t sec_dim = x->dims()[2]; + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); + tmp_x.mutable_data(ctx.GetPlace()); + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), &tmp_x); + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); + // mamtul [6,4] [6,5] =>[4,5] + dy->mutable_data(ctx.GetPlace()); + auto runner_dy = + NpuOpRunner("MatMul", {tmp_x, tmp_dout}, {*dy}, + {{"transpose_x1", true}, {"transpose_x2", false}}); + runner_dy.Run(stream); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + mul, ops::MulNPUKernel, + ops::MulNPUKernel); +REGISTER_OP_NPU_KERNEL( + mul_grad, ops::MulGradNPUKernel, + ops::MulGradNPUKernel); diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc index 7af6de5224145b991b9f4f17eebbf4c3748fac59..aa0c4d2dfd274e2d834655429a600326f3af9fe2 100644 --- a/paddle/fluid/operators/npu_op_runner.cc +++ b/paddle/fluid/operators/npu_op_runner.cc @@ -64,13 
+64,21 @@ aclFormat ConvertToNpuFormat(DataLayout layout) { return iter->second; } +aclrtStream GetCurrentNPUStream() { + int device_id = platform::GetCurrentNPUDeviceId(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto *dev_ctx = static_cast( + pool.Get(platform::NPUPlace(device_id))); + return dev_ctx->stream(); +} + NpuOpRunner::NpuOpRunner(std::string op_type) : op_type_(op_type) { attr_ = aclopCreateAttr(); } NpuOpRunner::NpuOpRunner(std::string op_type, const std::vector &inputs, const std::vector &outputs, - const AttributeMap &attrs) + const NPUAttributeMap &attrs) : op_type_(op_type) { attr_ = aclopCreateAttr(); AddInputs(inputs); @@ -85,7 +93,7 @@ NpuOpRunner::~NpuOpRunner() { const std::string &NpuOpRunner::Type() { return op_type_; } NpuOpRunner &NpuOpRunner::AddAttr(const std::string &name, - const Attribute &attr) { + const NPUAttribute &attr) { if (attr.type() == typeid(bool)) { PADDLE_ENFORCE_NPU_SUCCESS( aclopSetAttrBool(attr_, name.c_str(), BOOST_GET_CONST(bool, attr))); @@ -135,6 +143,16 @@ NpuOpRunner &NpuOpRunner::AddAttr(const std::string &name, } PADDLE_ENFORCE_NPU_SUCCESS( aclopSetAttrListString(attr_, name.c_str(), s.size(), s.data())); + } else if (attr.type() == typeid(std::vector>)) { + auto a = BOOST_GET_CONST(std::vector>, attr); + std::vector data; + std::vector num; + for (auto &&v : a) { + data.push_back(v.data()); + num.push_back(v.size()); + } + PADDLE_ENFORCE_NPU_SUCCESS(aclopSetAttrListListInt( + attr_, name.c_str(), data.size(), num.data(), data.data())); } else { PADDLE_THROW(platform::errors::Unimplemented( "Can not convert attribubte '%s' to convert to aclopAttr", name)); @@ -142,7 +160,7 @@ NpuOpRunner &NpuOpRunner::AddAttr(const std::string &name, return *this; } -NpuOpRunner &NpuOpRunner::AddAttrs(const AttributeMap &attrs) { +NpuOpRunner &NpuOpRunner::AddAttrs(const NPUAttributeMap &attrs) { for (const auto &pair : attrs) { AddAttr(pair.first, pair.second); } @@ -175,6 +193,21 @@ 
NpuOpRunner &NpuOpRunner::AddInputs(const std::vector &tensors) { return *this; } +// NOTE(zhiqiu): For operators whose input is a list (such as concat, stack), +// It is needed to set the name of each input tensor. +NpuOpRunner &NpuOpRunner::AddInputNames(const std::vector &names) { + PADDLE_ENFORCE_EQ(names.size(), input_descs_.size(), + platform::errors::InvalidArgument( + "The size of input names should be " + "equal to the size of input descs, but got the size " + "of input names is %d, the size of input descs is %d.", + names.size(), input_descs_.size())); + for (size_t i = 0; i < names.size(); ++i) { + aclSetTensorDescName(input_descs_[i], names[i].c_str()); + } + return *this; +} + NpuOpRunner &NpuOpRunner::AddOutputs(const std::vector &tensors) { for (auto tensor : tensors) { // create aclTensorDesc @@ -224,18 +257,22 @@ aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) { auto format = ConvertToNpuFormat(tensor.layout()); auto dims = framework::vectorize(tensor.dims()); - VLOG(4) << dtype << " " << dims.size() << " " << dims[0] << "," << dims[1] - << " " << format; + VLOG(4) << "NPU dtype:" << dtype << " " + << "rank:" << dims.size() << " dims:" << tensor.dims() + << " format:" << format; auto *desc = aclCreateTensorDesc(dtype, dims.size(), dims.data(), format); PADDLE_ENFORCE_NOT_NULL( desc, platform::errors::External("Call aclCreateTensorDesc failed.")); + PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorStorageFormat(desc, format)); + PADDLE_ENFORCE_NPU_SUCCESS( + aclSetTensorStorageShape(desc, dims.size(), dims.data())); return desc; } aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) { void *ptr = tensor.data(); - VLOG(4) << "ptr: " << ptr << ", size: " << tensor.memory_size(); + VLOG(4) << "NPU ptr: " << ptr << ", size: " << tensor.memory_size(); auto *buffer = aclCreateDataBuffer(ptr, tensor.memory_size()); PADDLE_ENFORCE_NOT_NULL( buffer, platform::errors::External("Call aclCreateDataBuffer failed.")); @@ -243,11 +280,17 @@ aclDataBuffer 
*NpuOpRunner::CreateDataBuffer(Tensor tensor) { } void NpuOpRunner::Run(aclrtStream stream) { + if (!stream) { + VLOG(4) << "Run with default current npu stream: " << stream; + stream = GetCurrentNPUStream(); + } + VLOG(4) << "op_type: " << op_type_; VLOG(4) << "input_desc.size: " << input_descs_.size(); VLOG(4) << "output_desc.size: " << output_descs_.size(); - VLOG(4) << "stream: " << stream; VLOG(4) << "attr: " << attr_; + VLOG(4) << "stream: " << stream; + aclError ret = aclopCompileAndExecute( op_type_.c_str(), input_descs_.size(), input_descs_.data(), input_buffers_.data(), output_descs_.size(), output_descs_.data(), diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h index c69d8441e5def8b24aea0b094560103bf21a7442..e178f7fc6e96d8c7cae94ac907ae383c9582303c 100644 --- a/paddle/fluid/operators/npu_op_runner.h +++ b/paddle/fluid/operators/npu_op_runner.h @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#ifdef PADDLE_WITH_ASCEND_CL #pragma once #include +#include #include #include @@ -26,8 +28,8 @@ namespace operators { using Tensor = framework::Tensor; using DataLayout = framework::DataLayout; -using Attribute = framework::Attribute; -using AttributeMap = framework::AttributeMap; +using NPUAttribute = framework::NPUAttribute; +using NPUAttributeMap = framework::NPUAttributeMap; class NpuOpRunner { public: @@ -35,15 +37,15 @@ class NpuOpRunner { explicit NpuOpRunner(std::string op_type, const std::vector &inputs = {}, const std::vector &outputs = {}, - const AttributeMap &attrs = {}); + const NPUAttributeMap &attrs = {}); ~NpuOpRunner(); const std::string &Type(); - NpuOpRunner &AddAttr(const std::string &name, const Attribute &attr); + NpuOpRunner &AddAttr(const std::string &name, const NPUAttribute &attr); - NpuOpRunner &AddAttrs(const AttributeMap &attrs); + NpuOpRunner &AddAttrs(const NPUAttributeMap &attrs); NpuOpRunner &AddInput(const Tensor &tensor); @@ -51,6 +53,8 @@ class NpuOpRunner { NpuOpRunner &AddInputs(const std::vector &tensors); + NpuOpRunner &AddInputNames(const std::vector &names); + NpuOpRunner &AddOutputs(const std::vector &tensors); aclTensorDesc *GetInputDesc(size_t index); @@ -65,7 +69,7 @@ class NpuOpRunner { std::vector &GetOutputBuffers(); - void Run(aclrtStream stream); + void Run(aclrtStream stream = nullptr); private: aclTensorDesc *CreateTensorDesc(Tensor tensor); @@ -80,5 +84,8 @@ class NpuOpRunner { aclopAttr *attr_{nullptr}; }; +aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype); + } // namespace operators } // namespace paddle +#endif diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..134544c2f65bc397acc3cb6451990e6cee3b0990 --- /dev/null +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -0,0 +1,161 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/optimizers/adam_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class AdamNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + T epsilon = static_cast(ctx.Attr("epsilon")); + auto* param = ctx.Input("Param"); + auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + platform::errors::InvalidArgument( + "The Grad(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Grad").front(), + framework::ToTypeName(param_var->Type()))); + auto* grad = ctx.Input("Grad"); + auto* mom1 = ctx.Input("Moment1"); + auto* mom2 = ctx.Input("Moment2"); + auto* lr = ctx.Input("LearningRate"); + + auto* beta1_pow = ctx.Input("Beta1Pow"); + auto* beta2_pow = ctx.Input("Beta2Pow"); + + auto* param_out = ctx.Output("ParamOut"); + auto* mom1_out = ctx.Output("Moment1Out"); + auto* mom2_out = ctx.Output("Moment2Out"); + auto* beta1_pow_out = 
ctx.Output("Beta1PowOut"); + auto* beta2_pow_out = ctx.Output("Beta2PowOut"); + + param_out->mutable_data(ctx.GetPlace()); + mom1_out->mutable_data(ctx.GetPlace()); + mom2_out->mutable_data(ctx.GetPlace()); + beta1_pow_out->mutable_data(ctx.GetPlace()); + beta2_pow_out->mutable_data(ctx.GetPlace()); + + T beta1 = static_cast(ctx.Attr("beta1")); + if (ctx.HasInput("Beta1Tensor")) { + auto* beta1_tensor = ctx.Input("Beta1Tensor"); + PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(Beta1Tensor) size must be 1, but get %d", + beta1_tensor->numel())); + beta1 = static_cast(GetAttrFromTensor(beta1_tensor)); + } + T beta2 = static_cast(ctx.Attr("beta2")); + if (ctx.HasInput("Beta2Tensor")) { + auto* beta2_tensor = ctx.Input("Beta2Tensor"); + PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(Beta2Tensor) size must be 1, but get %d", + beta2_tensor->numel())); + beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); + } + VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() + << "beta2_pow.numel() : " << beta2_pow->numel(); + VLOG(3) << "param.numel(): " << param->numel(); + + PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + // reshape + Tensor beta1_tensor(framework::proto::VarType::FP32); + beta1_tensor.mutable_data({1}, ctx.GetPlace()); + TensorFromVector(std::vector{beta1}, ctx.device_context(), + &beta1_tensor); + Tensor beta2_tensor(framework::proto::VarType::FP32); + beta2_tensor.mutable_data({1}, ctx.GetPlace()); + TensorFromVector(std::vector{beta2}, ctx.device_context(), + &beta2_tensor); + + Tensor epsilon_tensor(framework::proto::VarType::FP32); + 
epsilon_tensor.mutable_data({1}, ctx.GetPlace()); + TensorFromVector(std::vector{epsilon}, ctx.device_context(), + &epsilon_tensor); + auto stream = + ctx.template device_context() + .stream(); + auto runner = + NpuOpRunner("ApplyAdamD", + { + *param, *mom1, *mom2, *beta1_pow, *beta2_pow, *lr, + beta1_tensor, beta2_tensor, epsilon_tensor, *grad, + }, + { + *param_out, *mom1_out, *mom2_out, + }, + {}); + runner.Run(stream); + + // NOTE(zhiqiu): ApplyAdamD updates params inplace, so + // if param and param_out is not same, we need to do copy. + if (param_out->data() != param->data()) { + ctx.template device_context().Wait(); + framework::TensorCopySync(*param, ctx.GetPlace(), param_out); + } + if (mom1_out->data() != mom1->data()) { + ctx.template device_context().Wait(); + framework::TensorCopySync(*mom1, ctx.GetPlace(), mom1_out); + } + if (mom2_out->data() != mom2->data()) { + ctx.template device_context().Wait(); + framework::TensorCopySync(*mom2, ctx.GetPlace(), mom2_out); + } + auto runner_m1 = + NpuOpRunner("Mul", {*beta1_pow, beta1_tensor}, {*beta1_pow_out}, {}); + runner_m1.Run(stream); + auto runner_m2 = + NpuOpRunner("Mul", {*beta2_pow, beta2_tensor}, {*beta2_pow_out}, {}); + runner_m2.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + adam, ops::AdamNPUKernel, + ops::AdamNPUKernel); diff --git a/paddle/fluid/operators/optimizers/sgd_op_npu.cc b/paddle/fluid/operators/optimizers/sgd_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..b7aaff5d45791807bd5dd53d755749ea239e402a --- /dev/null +++ b/paddle/fluid/operators/optimizers/sgd_op_npu.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/optimizers/sgd_op.h" + +namespace paddle { +namespace operators { + +template +class SGDNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* learning_rate = ctx.Input("LearningRate"); + auto* param_var = ctx.Input("Param"); + auto* grad_var = ctx.Input("Grad"); + auto* param_out = ctx.Output("ParamOut"); + + param_out->mutable_data(ctx.GetPlace()); + + auto runner = + NpuOpRunner("ApplyGradientDescent", + {*param_var, *learning_rate, *grad_var}, {*param_out}, {}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + + // NOTE(zhiqiu): ApplyGradientDescent updates params inplace, so + // if param and param_out is not same, we need to do copy. + if (param_out->data() != param_var->data()) { + ctx.template device_context().Wait(); + framework::TensorCopySync(*param_var, ctx.GetPlace(), param_out); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + sgd, ops::SGDNPUKernel, + ops::SGDNPUKernel, + ops::SGDNPUKernel); diff --git a/paddle/fluid/operators/range_op_npu.cc b/paddle/fluid/operators/range_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..228372e1e93e03c7748c4439067764e89298789c --- /dev/null +++ b/paddle/fluid/operators/range_op_npu.cc @@ -0,0 +1,76 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/range_op.h" +#include "paddle/fluid/operators/utils.h" + +namespace paddle { +namespace operators { + +template +class RangeNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* start_t = context.Input("Start"); + auto* end_t = context.Input("End"); + auto* step_t = context.Input("Step"); + auto* out = context.Output("Out"); + + framework::Tensor n; + framework::TensorCopySync(*start_t, platform::CPUPlace(), &n); + T start = n.data()[0]; + framework::TensorCopySync(*end_t, platform::CPUPlace(), &n); + T end = n.data()[0]; + framework::TensorCopySync(*step_t, platform::CPUPlace(), &n); + T step = n.data()[0]; + + int64_t size = 0; + GetSize(start, end, step, &size); + + out->Resize(framework::make_ddim({size})); + out->mutable_data(context.GetPlace()); + + std::vector odata; + T value = start; + for (int64_t i = 0; i < size; ++i) { + odata.push_back(value); + value += step; + } + + framework::TensorFromVector(odata, 
context.device_context(), out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + range, ops::RangeNPUKernel, + ops::RangeNPUKernel, + ops::RangeNPUKernel) + +#endif diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..562a560b2f154809f2435e4fa0b8380565639569 --- /dev/null +++ b/paddle/fluid/operators/range_op_npu_test.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(range); +USE_OP_DEVICE_KERNEL(range, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto start = scope->Var("Start"); + auto tensor_start = start->GetMutable(); + std::vector init_start; + init_start.push_back(static_cast(1)); + TensorFromVector(init_start, ctx, tensor_start); + tensor_start->Resize({1}); + + auto end = scope->Var("End"); + auto tensor_end = end->GetMutable(); + std::vector init_end; + init_end.push_back(static_cast(10)); + TensorFromVector(init_end, ctx, tensor_end); + tensor_end->Resize({1}); + + auto step = scope->Var("Step"); + auto tensor_step = step->GetMutable(); + std::vector init_step; + init_step.push_back(static_cast(2)); + TensorFromVector(init_step, ctx, tensor_step); + tensor_step->Resize({1}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + // run + auto op = f::OpRegistry::CreateOp( + op_type, {{"Start", {"Start"}}, {"End", {"End"}}, {"Step", {"Step"}}}, + {{"Out", {"Out"}}}, {}); + + op->Run(*scope, place); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + EXPECT_EQ(static_cast(out_vec.size()), static_cast(5)); + EXPECT_EQ(static_cast(out_vec[0]), static_cast(1.0)); + EXPECT_EQ(static_cast(out_vec[1]), static_cast(3.0)); + EXPECT_EQ(static_cast(out_vec[2]), static_cast(5.0)); + EXPECT_EQ(static_cast(out_vec[3]), static_cast(7.0)); + 
EXPECT_EQ(static_cast(out_vec[4]), static_cast(9.0)); +} + +TEST(range, NPU) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx, "range"); +} diff --git a/paddle/fluid/operators/reduce_ops/CMakeLists.txt b/paddle/fluid/operators/reduce_ops/CMakeLists.txt index 92107c9dc442ee53fd09f5d51a0d660049dc16f0..846d362fb522db730315dbdc5063a1ffcc035548 100644 --- a/paddle/fluid/operators/reduce_ops/CMakeLists.txt +++ b/paddle/fluid/operators/reduce_ops/CMakeLists.txt @@ -42,3 +42,7 @@ endif() if(WITH_ROCM) hip_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor) endif() + +if(WITH_ASCEND_CL) + cc_test(reduce_any_op_npu_test SRCS reduce_any_op_npu_test.cc DEPS op_registry reduce_any_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..39e74c908ae7ab5c420f07a559804d5aa5a9c216 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +template +class ReduceAnyNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + bool keep_dim = ctx.Attr("keep_dim"); + auto dims = ctx.Attr>("dim"); + + out->mutable_data(ctx.GetPlace()); + + // set attr + NPUAttributeMap attr = {{"keep_dims", keep_dim}, {"axes", dims}}; + + auto runner = NpuOpRunner("ReduceAnyD", {*x}, {*out}, attr); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(reduce_any, ops::ReduceAnyNPUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d408ff3988f030fcc63140deed52a26ba7e8c986 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +using Tensor = paddle::framework::Tensor; + +USE_OP(reduce_any); +USE_OP_DEVICE_KERNEL(reduce_any, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + std::vector init_x = {true, false, false, false}; + f::TensorFromVector(init_x, ctx, tensor_x); + tensor_x->Resize(paddle::framework::make_ddim({2})); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + // run + std::vector axes; + f::AttributeMap attrs = {{"axes", axes}, {"keep_dims", true}}; + auto op = f::OpRegistry::CreateOp("reduce_any", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + + op->Run(*scope, place); + + ctx.Wait(); + + std::vector out_vec; + f::TensorToVector(*tensor_out, ctx, &out_vec); + + ctx.Wait(); + + std::vector expected_vec = {true}; + EXPECT_EQ(out_vec.size(), expected_vec.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], expected_vec[i]); + } +} + +TEST(reduce_any, NPU) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f3b6e69a48bcb05563bc141e59863f95d6c17e30 --- /dev/null +++ 
b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc @@ -0,0 +1,149 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include "paddle/fluid/operators/unsqueeze_op.h" + +namespace paddle { +namespace operators { + +template +class ReduceSumNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + bool reduce_all = ctx.Attr("reduce_all"); + bool keep_dims = ctx.Attr("keep_dim"); + auto dims = ctx.Attr>("dim"); + + out->mutable_data(ctx.GetPlace()); + + // special case + if (x->dims().size() == 1 && keep_dims == false) { + keep_dims = true; + } + + auto stream = + ctx.template device_context() + .stream(); + + framework::Tensor cast_x; + framework::Tensor cast_out; + // NOTE: ReduceSumD only supports fp32 and fp16 + if (x->type() != framework::proto::VarType::FP32 && + x->type() != framework::proto::VarType::FP16) { + cast_x.Resize(x->dims()); + cast_x.mutable_data(ctx.GetPlace()); + auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::FP32); + auto runner_cast = NpuOpRunner( + "Cast", {*x}, {cast_x}, {{"dst_type", static_cast(dst_dtype)}}); + runner_cast.Run(stream); + + cast_out.Resize(out->dims()); + cast_out.mutable_data(ctx.GetPlace()); + } else { 
+ cast_x.ShareDataWith(*x); + cast_out.ShareDataWith(*out); + } + + if (reduce_all) { + std::vector dim_vec; + for (int i = 0; i < x->dims().size(); i++) { + dim_vec.push_back(i); + } + + auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out}, + {{"axes", dim_vec}, {"keep_dims", keep_dims}}); + runner.Run(stream); + + } else { + auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out}, + {{"axes", dims}, {"keep_dims", keep_dims}}); + runner.Run(stream); + } + + if (x->type() != framework::proto::VarType::FP32 && + x->type() != framework::proto::VarType::FP16) { + auto dst_dtype = ConvertToNpuDtype(out->type()); + auto runner_cast = + NpuOpRunner("Cast", {cast_out}, {*out}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast.Run(stream); + } + } +}; + +template +class ReduceSumGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* x_grad = ctx.Output(framework::GradVarName("X")); + bool reduce_all = ctx.Attr("reduce_all"); + bool keep_dims = ctx.Attr("keep_dim"); + auto dims = ctx.Attr>("dim"); + + x_grad->mutable_data(ctx.GetPlace()); + + auto stream = + ctx.template device_context() + .stream(); + if (keep_dims || reduce_all) { + auto runner = NpuOpRunner("BroadcastToD", {*out_grad}, {*x_grad}, + {{"shape", framework::vectorize(x->dims())}}); + runner.Run(stream); + } else { + framework::DDim out_dims; + out_dims = UnsqueezeKernel::GetOutputShape( + dims, out_grad->dims()); + + Tensor out_grad_tmp(out_grad->type()); + out_grad_tmp.Resize(out_dims); + out_grad_tmp.mutable_data(ctx.GetPlace()); + framework::TensorCopy( + *out_grad, ctx.GetPlace(), + ctx.template device_context(), + &out_grad_tmp); + out_grad_tmp.Resize(out_dims); + + auto runner = NpuOpRunner("BroadcastToD", {out_grad_tmp}, {*x_grad}, + {{"shape", framework::vectorize(x->dims())}}); + runner.Run(stream); + } + } +}; 
+} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + reduce_sum, + ops::ReduceSumNPUKernel, + ops::ReduceSumNPUKernel, + ops::ReduceSumNPUKernel); +REGISTER_OP_NPU_KERNEL( + reduce_sum_grad, + ops::ReduceSumGradNPUKernel, + ops::ReduceSumGradNPUKernel, + ops::ReduceSumGradNPUKernel); diff --git a/paddle/fluid/operators/reshape_op_npu.cc b/paddle/fluid/operators/reshape_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..79a4cd116f3b939cdbf11992a6386b196b0d77ff --- /dev/null +++ b/paddle/fluid/operators/reshape_op_npu.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class Reshape2NPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto list_new_shape_tensor = + ctx.MultiInput("ShapeTensor"); + if (list_new_shape_tensor.size() > 0) { + PADDLE_THROW(platform::errors::Unimplemented( + "Input(ShapeTensor) is not supported on NPU.")); + } + PADDLE_ENFORCE_EQ(ctx.Input("Shape"), nullptr, + platform::errors::Unimplemented( + "Input(Shape) is not supported on NPU.")); + auto shape = out->dims(); + out->mutable_data(ctx.GetPlace(), x->type()); + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), out); + out->Resize(shape); + } +}; + +template +class Reshape2GradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto in_dims = d_x->dims(); + + d_x->mutable_data(ctx.GetPlace(), d_out->type()); + framework::TensorCopy( + *d_out, ctx.GetPlace(), + ctx.template device_context(), d_x); + d_x->Resize(in_dims); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + reshape2, ops::Reshape2NPUKernel, + ops::Reshape2NPUKernel, + ops::Reshape2NPUKernel, + ops::Reshape2NPUKernel, + ops::Reshape2NPUKernel, + ops::Reshape2NPUKernel, + ops::Reshape2NPUKernel); +REGISTER_OP_NPU_KERNEL( + reshape2_grad, + ops::Reshape2GradNPUKernel, + ops::Reshape2GradNPUKernel, + ops::Reshape2GradNPUKernel, + ops::Reshape2GradNPUKernel, + ops::Reshape2GradNPUKernel, + ops::Reshape2GradNPUKernel, + ops::Reshape2GradNPUKernel); diff --git a/paddle/fluid/operators/scale_op_npu.cc 
b/paddle/fluid/operators/scale_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..ee7210a7784d72e1cec297ad8ba194b36fae8fba --- /dev/null +++ b/paddle/fluid/operators/scale_op_npu.cc @@ -0,0 +1,69 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/scale_op.h" + +namespace paddle { +namespace operators { + +template +class ScaleNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto scale = static_cast(ctx.Attr("scale")); + auto bias = static_cast(ctx.Attr("bias")); + auto bias_after_scale = ctx.Attr("bias_after_scale"); + auto stream = + ctx.template device_context() + .stream(); + float _power = 1.0; + if (bias_after_scale) { + out->mutable_data(ctx.GetPlace()); + auto runner = + NpuOpRunner("Power", {*x}, {*out}, + {{"power", _power}, {"scale", scale}, {"shift", bias}}); + + runner.Run(stream); + } else { + Tensor tmp_x(x->type()); + tmp_x.Resize(x->dims()); + tmp_x.mutable_data(ctx.GetPlace()); + auto runner_tmp = NpuOpRunner("Adds", {*x}, {tmp_x}, {{"value", bias}}); + runner_tmp.Run(stream); + + out->mutable_data(ctx.GetPlace()); + float _bias = 0.0; + auto runner = + NpuOpRunner("Power", {tmp_x}, {*out}, + {{"power", _power}, {"scale", scale}, 
{"shift", _bias}}); + runner.Run(stream); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + scale, ops::ScaleNPUKernel, + ops::ScaleNPUKernel); diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e2e49acb94c7b22120acbd614c2f0ac139540f3c --- /dev/null +++ b/paddle/fluid/operators/scatter_op_npu.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include + +#include "paddle/fluid/operators/kron_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/scatter_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ScatterNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* index = ctx.Input("Ids"); + auto* updates = ctx.Input("Updates"); + bool overwrite = ctx.Attr("overwrite"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + out->mutable_data(place); + + framework::Tensor tmp_tensor(index->type()); + const auto index_dims = index->dims(); + if (index_dims.size() == 1) { + tmp_tensor.ShareDataWith(*index); + std::vector new_dim = {index_dims[0], 1}; + tmp_tensor.Resize(framework::make_ddim(new_dim)); + index = &tmp_tensor; + } + + auto stream = + ctx.template device_context() + .stream(); + + if (overwrite) { + auto runner_update = NpuOpRunner("TensorScatterUpdate", + {*x, *index, *updates}, {*out}, {}); + runner_update.Run(stream); + } else { + auto runner_add = + NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*out}, {}); + runner_add.Run(stream); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + scatter, ops::ScatterNPUKernel, + ops::ScatterNPUKernel); +#endif diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..061849db6ada801cf7728af0c1158dfb75487948 --- /dev/null +++ b/paddle/fluid/operators/shape_op_npu.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/shape_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using SelectedRows = framework::SelectedRows; + +template +class ShapeNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_var = ctx.InputVar("Input"); + framework::DDim in_dims; + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); + } else { + in_dims = in_var->Get().dims(); + } + auto* out_t = ctx.Output("Out"); + out_t->Resize({in_dims.size()}); + // to do: cpuplace? + auto out_data = out_t->mutable_data(platform::CPUPlace()); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + shape, ops::ShapeNPUKernel, + ops::ShapeNPUKernel, + ops::ShapeNPUKernel, + ops::ShapeNPUKernel, + ops::ShapeNPUKernel, + ops::ShapeNPUKernel, + ops::ShapeNPUKernel); diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e5e0dafdae0b15ecc43fe0603688f097659aefd9 --- /dev/null +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -0,0 +1,134 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. */ + +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/slice_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +void UpdateAttr(const framework::DDim in_dims, const std::vector axes, + const std::vector starts, const std::vector ends, + std::vector* offsets, std::vector* size) { + int cnt = 0; + for (int i = 0; i < in_dims.size(); ++i) { + int start = 0; + int end = in_dims[i]; + int axis = axes[cnt]; + + if (axis == i) { + start = starts[cnt]; + if (start < 0) { + start = (start + in_dims[i]); + } + start = std::max(start, static_cast(0)); + end = ends[cnt]; + if (end < 0) { + end = (end + in_dims[i]); + } + end = std::min(end, static_cast(in_dims[i])); + cnt++; + } + + (*offsets)[i] = start; + (*size)[i] = end - start; + } +} + +template +class SliceNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* out = ctx.Output("Out"); + + auto axes = ctx.Attr>("axes"); + auto starts = ctx.Attr>("starts"); + auto ends = ctx.Attr>("ends"); + + out->mutable_data(ctx.GetPlace()); + + auto in_dims = input->dims(); + std::vector offsets(in_dims.size()); + std::vector size(in_dims.size()); + + UpdateAttr(in_dims, axes, starts, ends, &offsets, &size); + + auto runner = NpuOpRunner("SliceD", {*input}, {*out}, + {{"offsets", offsets}, {"size", size}}); + + auto stream = + ctx.template 
device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class SliceGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dinput = ctx.Output(framework::GradVarName("Input")); + + auto axes = ctx.Attr>("axes"); + auto starts = ctx.Attr>("starts"); + auto ends = ctx.Attr>("ends"); + + auto in_dims = input->dims(); + int rank = in_dims.size(); + + std::vector offsets(rank); + std::vector size(rank); + UpdateAttr(in_dims, axes, starts, ends, &offsets, &size); + + std::vector> paddings(rank, std::vector(2)); + for (int i = 0; i < rank; ++i) { + paddings[i][0] = static_cast(offsets[i]); + paddings[i][1] = static_cast(in_dims[i] - size[i] - offsets[i]); + } + + dinput->mutable_data(ctx.GetPlace()); + auto stream = + ctx.template device_context() + .stream(); + auto runner = + NpuOpRunner("PadD", {*dout}, {*dinput}, {{"paddings", paddings}}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + slice, ops::SliceNPUKernel, + ops::SliceNPUKernel); + +REGISTER_OP_NPU_KERNEL( + slice_grad, + ops::SliceGradNPUKernel, + ops::SliceGradNPUKernel); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index a21ef252c03f7c279f5edc8c557758e4f9e1e822..5e7244f4390d84fbcb31f833b5e11bff637f0e7f 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -83,11 +83,13 @@ class SoftmaxOp : public framework::OperatorWithKernel { } #endif +#ifndef PADDLE_WITH_ASCEND_CL if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, platform::errors::InvalidArgument( "float16 can only be used on GPU place")); } +#endif return framework::OpKernelType(input_data_type, ctx.GetPlace(), 
layout_, library_); @@ -207,9 +209,10 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { } #endif if (input_data_type == framework::proto::VarType::FP16) { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "float16 can only be used on GPU place")); + if (!(platform::is_gpu_place(ctx.GetPlace()) || + platform::is_npu_place(ctx.GetPlace()))) + PADDLE_THROW(platform::errors::InvalidArgument( + "float16 can only be used on GPU/NPU place")); } return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, diff --git a/paddle/fluid/operators/softmax_op_npu.cc b/paddle/fluid/operators/softmax_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..0e94f6af232f98e093953e1aee37306eb460211d --- /dev/null +++ b/paddle/fluid/operators/softmax_op_npu.cc @@ -0,0 +1,101 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/softmax_op.h" + +namespace paddle { +namespace operators { + +template +class SoftmaxNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto axis = ctx.Attr("axis"); + std::vector axes; + axes.push_back(axis); + framework::NPUAttributeMap attr_input = {{"axes", axes}}; + + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("SoftmaxV2", {*in}, {*out}, attr_input); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class SoftmaxGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + + auto* dX = ctx.Output(framework::GradVarName("X")); + + auto dims = dX->dims(); + const int rank = dims.size(); + const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + int64_t first_dim = 1; + int64_t sec_dim = 1; + for (int i = 0; i < axis; i++) { + first_dim *= dims[i]; + } + for (int i = axis; i < rank; i++) { + sec_dim *= dims[i]; + } + + Tensor tmp_out; + tmp_out.ShareDataWith(*out).Resize({first_dim, sec_dim}); + + Tensor tmp_dOut; + tmp_dOut.ShareDataWith(*dOut).Resize({first_dim, sec_dim}); + + dX->Resize(framework::make_ddim({first_dim, sec_dim})); + dX->mutable_data(ctx.GetPlace()); + + framework::NPUAttributeMap attr_input = {}; + auto runner = NpuOpRunner(std::string("SoftmaxGrad"), {tmp_out, tmp_dOut}, + {*dX}, attr_input); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + + dX->Resize(dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + softmax, 
ops::SoftmaxNPUKernel, + ops::SoftmaxNPUKernel, + ops::SoftmaxNPUKernel); + +REGISTER_OP_NPU_KERNEL( + softmax_grad, ops::SoftmaxGradNPUKernel, + ops::SoftmaxGradNPUKernel, + ops::SoftmaxGradNPUKernel); diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f06f59f3b4e0051d5a09cefe2013af812f2736dd --- /dev/null +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -0,0 +1,170 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(softmax); +USE_OP_DEVICE_KERNEL(softmax, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + std::vector init; + for (int i = 3; i < 9; ++i) { + init.push_back(static_cast(i)); + } + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({2, 3}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({2, 3}); + tensor_out->mutable_data(place); // allocate + + // run + int axis = 1; + f::AttributeMap attrs = { + {"axis", axis}, {"use_cudnn", false}, + {"use_mkldnn", false}, {"mkldnn_data_type", std::string("float32")}, + {"is_test", false}, + }; + + auto op = f::OpRegistry::CreateOp("softmax", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + + op->Run(*scope, place); + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + for (int i = 0; i < static_cast(out_vec.size()); ++i) { + VLOG(3) << "out_vec[" << i << "] : " << out_vec[i]; + } + + ctx.Wait(); + + EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)(6)); +} + +template +void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + std::vector out_init; + + out_init.push_back(static_cast(0.6670)); + out_init.push_back(static_cast(0.5888)); + 
out_init.push_back(static_cast(0.4543)); + out_init.push_back(static_cast(0.3330)); + out_init.push_back(static_cast(0.4112)); + out_init.push_back(static_cast(0.5457)); + + TensorFromVector(out_init, ctx, tensor_out); + tensor_out->Resize({2, 3}); + + ctx.Wait(); + + auto dout = scope->Var("DOut"); + auto tensor_dout = dout->GetMutable(); + + std::vector dout_init; + for (int i = 0; i < 6; ++i) { + dout_init.push_back(static_cast(1.0)); + } + + TensorFromVector(dout_init, ctx, tensor_dout); + tensor_dout->Resize({2, 3}); + + ctx.Wait(); + + auto dx = scope->Var("DX"); + auto tensor_dx = dx->GetMutable(); + + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs = { + {"name", std::string("softmax_grad")}, + {"axis", static_cast(0)}, + {"use_cudnn", false}, + {"use_mkldnn", false}, + {"mkldnn_data_type", std::string("float32")}, + {"is_test", false}, + {"data_format", std::string("AnyLayout")}, + }; + auto op = f::OpRegistry::CreateOp("softmax_grad", + {{"Out", {"Out"}}, {"Out@GRAD", {"DOut"}}}, + {{"X@GRAD", {"DX"}}}, attrs); + + auto place = ctx.GetPlace(); + op->Run(*scope, place); + ctx.Wait(); + + EXPECT_EQ((uint32_t)tensor_dx->dims()[0], (uint32_t)(2)); + EXPECT_EQ((uint32_t)tensor_dx->dims()[1], (uint32_t)(3)); + + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_dx, ctx, &out_vec); + + ctx.Wait(); + + EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)(6)); + EXPECT_NEAR((float)out_vec[0], (float)(-0.4737), 0.1); + EXPECT_NEAR((float)out_vec[1], (float)(-0.4181), 0.1); + EXPECT_NEAR((float)out_vec[2], (float)(-0.3226), 0.1); + EXPECT_NEAR((float)out_vec[3], (float)(-0.0965), 0.1); + EXPECT_NEAR((float)out_vec[4], (float)(-0.1192), 0.1); + EXPECT_NEAR((float)out_vec[5], (float)(-0.1582), 0.1); +} + +TEST(softmax, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} + +TEST(softmax_grad, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + CompareGrad(&scope, ctx); +} diff --git 
a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..c777a02f96bd9a2f1dd6fc5a74874e128e3aa359 --- /dev/null +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc @@ -0,0 +1,203 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/softmax.h" +#include +#include +#include "paddle/fluid/operators/math/cross_entropy.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/softmax_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* logits = ctx.Input("Logits"); + auto* labels = ctx.Input("Label"); + auto* softmax = ctx.Output("Softmax"); + auto* loss = ctx.Output("Loss"); + + int cls_num = logits->dims()[1]; + const int rank = logits->dims().size(); + const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + std::vector axes; + for (auto i = axis; i < logits->dims().size(); ++i) { + axes.push_back(i); + } + + auto stream = + ctx.template device_context() + .stream(); + + // softmax + softmax->mutable_data(ctx.GetPlace()); + auto runner_softmax = + NpuOpRunner("SoftmaxV2", {*logits}, {*softmax}, 
{{"axes", axes}}); + runner_softmax.Run(stream); + + // cast label from int64/int32 to int32 + Tensor tmp_labels(framework::proto::VarType::INT32); + if (labels->type() != framework::proto::VarType::INT32) { + tmp_labels.Resize(labels->dims()); + tmp_labels.mutable_data(ctx.GetPlace(), framework::proto::VarType::INT32); + auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); + auto runner_cast_label = + NpuOpRunner("Cast", {*labels}, {tmp_labels}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_label.Run(stream); + labels = &tmp_labels; + } + + // on and off + Tensor on_tensor(framework::proto::VarType::INT32); + on_tensor.mutable_data({1}, ctx.GetPlace()); + TensorFromVector(std::vector{static_cast(1)}, + ctx.device_context(), &on_tensor); + Tensor off_tensor(framework::proto::VarType::INT32); + off_tensor.mutable_data({1}, ctx.GetPlace()); + TensorFromVector(std::vector{static_cast(0)}, + ctx.device_context(), &off_tensor); + + // one_hot + Tensor tmp_onehot(on_tensor.type()); + tmp_onehot.Resize(logits->dims()); + tmp_onehot.mutable_data(ctx.GetPlace()); + + auto runner_onehot = + NpuOpRunner("OneHotD", {*labels, on_tensor, off_tensor}, {tmp_onehot}, + {{"axis", -1}, {"depth", cls_num}}); + runner_onehot.Run(stream); + + // cast one_hot from int32 to T + Tensor cast_onehot(logits->type()); + cast_onehot.Resize(tmp_onehot.dims()); + cast_onehot.mutable_data(ctx.GetPlace()); + auto dst_dtype = ConvertToNpuDtype(logits->type()); + auto runner_cast_onehot = + NpuOpRunner("Cast", {tmp_onehot}, {cast_onehot}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_onehot.Run(stream); + + // SoftmaxCrossEntropyWithLogits + Tensor backprop(logits->type()); + backprop.Resize(logits->dims()); + backprop.mutable_data(ctx.GetPlace()); + + loss->mutable_data(ctx.GetPlace()); + + // SoftmaxCrossEntropyWithLogits requires loss to be of shape [batch_size] + auto loss_dims = loss->dims(); + loss->Resize({loss_dims[0]}); + auto runner_s = 
NpuOpRunner("SoftmaxCrossEntropyWithLogits", + {*logits, cast_onehot}, {*loss, backprop}, {}); + runner_s.Run(stream); + loss->Resize(loss_dims); + } +}; + +template +class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* labels = ctx.Input("Label"); + auto* softmax = ctx.Input("Softmax"); + auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); + auto* logits_grad = ctx.Output(framework::GradVarName("Logits")); + + int cls_num = softmax->dims()[1]; + + auto stream = + ctx.template device_context() + .stream(); + + // cast label from int64/int32 to int32 + Tensor tmp_labels(framework::proto::VarType::INT32); + if (labels->type() != framework::proto::VarType::INT32) { + tmp_labels.Resize(labels->dims()); + tmp_labels.mutable_data(ctx.GetPlace(), framework::proto::VarType::INT32); + auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); + auto runner_cast_label = + NpuOpRunner("Cast", {*labels}, {tmp_labels}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_label.Run(stream); + labels = &tmp_labels; + } + + // on and off + Tensor on_tensor(framework::proto::VarType::INT32); + on_tensor.mutable_data({1}, ctx.GetPlace()); + TensorFromVector(std::vector{static_cast(1)}, + ctx.device_context(), &on_tensor); + Tensor off_tensor(framework::proto::VarType::INT32); + off_tensor.mutable_data({1}, ctx.GetPlace()); + TensorFromVector(std::vector{static_cast(0)}, + ctx.device_context(), &off_tensor); + + // one_hot + Tensor tmp_onehot(on_tensor.type()); + tmp_onehot.Resize(softmax->dims()); + tmp_onehot.mutable_data(ctx.GetPlace()); + + auto runner_onehot = + NpuOpRunner("OneHotD", {*labels, on_tensor, off_tensor}, {tmp_onehot}, + {{"axis", -1}, {"depth", cls_num}}); + runner_onehot.Run(stream); + + // cast one_hot from int32 to T + Tensor cast_onehot(softmax->type()); + cast_onehot.Resize(tmp_onehot.dims()); + 
cast_onehot.mutable_data(ctx.GetPlace()); + auto dst_dtype = ConvertToNpuDtype(softmax->type()); + auto runner_cast_onehot = + NpuOpRunner("Cast", {tmp_onehot}, {cast_onehot}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_onehot.Run(stream); + + // sub + Tensor tmp_sub(softmax->type()); + tmp_sub.Resize(softmax->dims()); + tmp_sub.mutable_data(ctx.GetPlace()); + auto runner_sub = + NpuOpRunner("Sub", {*softmax, cast_onehot}, {tmp_sub}, {}); + + runner_sub.Run(stream); + // mul + logits_grad->mutable_data(ctx.GetPlace()); + auto runner_mul = + NpuOpRunner("Mul", {*loss_grad, tmp_sub}, {*logits_grad}, {}); + runner_mul.Run(stream); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + softmax_with_cross_entropy, + ops::SoftmaxWithCrossEntropyNPUKernel, + ops::SoftmaxWithCrossEntropyNPUKernel); +REGISTER_OP_NPU_KERNEL( + softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradNPUKernel< + paddle::platform::NPUDeviceContext, float>, + ops::SoftmaxWithCrossEntropyGradNPUKernel< + paddle::platform::NPUDeviceContext, paddle::platform::float16>); diff --git a/paddle/fluid/operators/squeeze_op_npu.cc b/paddle/fluid/operators/squeeze_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..33c9273e3b6f50038a738744d47db1ae246d25f8 --- /dev/null +++ b/paddle/fluid/operators/squeeze_op_npu.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include + +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/squeeze_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + squeeze, ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel); +REGISTER_OP_NPU_KERNEL( + squeeze2, ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel); +#endif diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..22dc81cbd79e0ed5dd1f60c221edb59f65e8fa5d --- /dev/null +++ b/paddle/fluid/operators/squeeze_op_npu_test.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(squeeze); +USE_OP_DEVICE_KERNEL(squeeze, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + int dim0 = 1; + int dim1 = 10; + int dim2 = 1; + + std::vector init; + for (int64_t i = 0; i < dim0 * dim1 * dim2; ++i) { + init.push_back(static_cast(0.1)); + } + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({dim0, dim1, dim2}); + + ctx.Wait(); + + // run + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + std::vector axis; + axis.push_back(2); + f::AttributeMap attrs = {{"axes", axis}}; + + auto op = f::OpRegistry::CreateOp("squeeze", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + + op->Run(*scope, place); + ctx.Wait(); + + EXPECT_EQ((uint32_t)tensor_out->dims().size(), uint32_t(2)); + EXPECT_EQ((uint32_t)tensor_out->dims()[0], uint32_t(dim0)); + EXPECT_EQ((uint32_t)tensor_out->dims()[1], uint32_t(dim1)); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], static_cast(0.1)); + } + + ctx.Wait(); +} + +TEST(squeeze, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} diff --git a/paddle/fluid/operators/stack_op_npu.cc b/paddle/fluid/operators/stack_op_npu.cc new file mode 100644 index 
0000000000000000000000000000000000000000..958655b1f27c680655c20e8f795fc9e4bf37251d --- /dev/null +++ b/paddle/fluid/operators/stack_op_npu.cc @@ -0,0 +1,106 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include +#include + +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/stack_op.h" +#include "paddle/fluid/operators/unsqueeze_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class StackNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.MultiInput("X"); + int32_t N = x.size(); + + PADDLE_ENFORCE_GT( + N, 0, platform::errors::InvalidArgument("number of input Tensor <= 0")); + + std::vector x_list; + for (int i = 0; i < N; i++) { + x_list.push_back(*x[i]); + } + + int axis = ctx.Attr("axis"); + + if (axis < 0) { + axis = axis + x_list[0].dims().size() + 1; + } + auto* out = ctx.Output("Y"); + + auto place = ctx.GetPlace(); + + auto stream = + ctx.template device_context() + .stream(); + + out->mutable_data(place); + + if (axis != 0) { + auto x_dim = x_list[0].dims(); + std::vector vec_dim_tmp; + vec_dim_tmp.push_back(N); + for (auto i = 0; i < x_dim.size(); ++i) { + vec_dim_tmp.push_back(x_dim[i]); + } + + Tensor tmp_stack(out->type()); + 
tmp_stack.Resize(framework::make_ddim(vec_dim_tmp)); + tmp_stack.mutable_data(ctx.GetPlace()); + + auto runner = + NpuOpRunner("Pack", {x_list}, {tmp_stack}, {{"axis", 0}, {"N", N}}); + runner.Run(stream); + + std::vector vec_trans; + for (auto i = 1; i <= x_dim.size(); ++i) { + vec_trans.push_back(i); + if (i == axis) { + vec_trans.push_back(0); + } + } + + auto runner_trans_final = + NpuOpRunner("TransposeD", {tmp_stack}, {*out}, {{"perm", vec_trans}}); + runner_trans_final.Run(stream); + + } else { + auto runner = + NpuOpRunner("Pack", {x_list}, {*out}, {{"axis", axis}, {"N", N}}); + runner.Run(stream); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + stack, ops::StackNPUKernel, + ops::StackNPUKernel); + +#endif diff --git a/paddle/fluid/operators/sum_op_npu.cc b/paddle/fluid/operators/sum_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e3dc5faf46c81e71173c6f5a6ad7766067cad1c3 --- /dev/null +++ b/paddle/fluid/operators/sum_op_npu.cc @@ -0,0 +1,64 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/sum_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SumNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.MultiInput("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto place = ctx.GetPlace(); + + int n = static_cast(x.size()); + PADDLE_ENFORCE_EQ(n > 1, true, + platform::errors::InvalidArgument( + "The size of Input(x) list must larger or equal 2")); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Add", {*x[0], *x[1]}, {*out}, {}); + + runner.Run(stream); + for (int i = 2; i < n; i++) { + runner = NpuOpRunner("Add", {*out, *x[i]}, {*out}, {}); + runner.Run(stream); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + sum, ops::SumNPUKernel, + ops::SumNPUKernel); diff --git a/paddle/fluid/operators/top_k_op_npu.cc b/paddle/fluid/operators/top_k_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..684bd476b6ef21bf58a990c36b1ee6f820d82caf --- /dev/null +++ b/paddle/fluid/operators/top_k_op_npu.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/top_k_op.h" + +namespace paddle { +namespace operators { + +void gen_assist_seq(framework::Tensor* assit_tensor, int64_t dim, + const framework::ExecutionContext& ctx) { + const int64_t dimx2 = dim; + std::vector assit; + assit.resize(2 * dimx2); + for (int64_t i = 0; i < dimx2; i++) { + // for i in range [0, dim] + assit[i] = static_cast(i); + + // for i in range [dim, dimx2] + int64_t idx = + static_cast(static_cast(i)); + int64_t gap = i - idx; + assit[i + dim] = static_cast(gap); + } + framework::TensorFromVector(assit, ctx.device_context(), assit_tensor); +} + +template +class TopkNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // read input + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + + size_t k = static_cast(ctx.Attr("k")); + + output->mutable_data(ctx.GetPlace()); + indices->mutable_data(ctx.GetPlace()); + + // prepare assit + auto dim = input->dims().size(); + framework::Tensor assist_seq_tensor; + assist_seq_tensor.Resize({2 * dim}); + assist_seq_tensor.mutable_data(ctx.GetPlace()); + gen_assist_seq(&assist_seq_tensor, dim, ctx); + + framework::NPUAttributeMap attr_input = {{"sorted", "true"}, + {"k", static_cast(k)}, + {"dim", -1}, + {"largest", true}}; + + // run ascend + auto runner = NpuOpRunner("TopKD", {*input, assist_seq_tensor}, + {*output, *indices}, attr_input); + + auto stream = + ctx.template device_context() + .stream(); + + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +// Ascend Op TopKD only support input float 16 dtype +REGISTER_OP_NPU_KERNEL(top_k, + ops::TopkNPUKernel); diff --git a/paddle/fluid/operators/transpose_op_npu.cc b/paddle/fluid/operators/transpose_op_npu.cc new file mode 100644 index 
0000000000000000000000000000000000000000..994b8e534f85e2926481d3767f6e75892751d959 --- /dev/null +++ b/paddle/fluid/operators/transpose_op_npu.cc @@ -0,0 +1,81 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/expand_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class TransposeNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + std::vector axis = ctx.Attr>("axis"); + framework::NPUAttributeMap attr_input = {{"perm", axis}}; + out->mutable_data(ctx.device_context().GetPlace()); + auto runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class TransposeGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* x_grad = + ctx.Output(framework::GradVarName("X")); + std::vector axis = ctx.Attr>("axis"); + std::vector reversed_axis(axis); + for (size_t i = 0; i < axis.size(); i++) { + reversed_axis[axis[i]] = i; + } + x_grad->mutable_data(ctx.GetPlace()); + 
framework::NPUAttributeMap attr_input = {{"perm", reversed_axis}}; + auto runner = NpuOpRunner("TransposeD", {*out_grad}, {*x_grad}, attr_input); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + transpose2, + ops::TransposeNPUKernel, + ops::TransposeNPUKernel, + ops::TransposeNPUKernel, + ops::TransposeNPUKernel, + ops::TransposeNPUKernel); + +REGISTER_OP_NPU_KERNEL(transpose2_grad, ops::TransposeGradNPUKernel, + ops::TransposeGradNPUKernel, + ops::TransposeGradNPUKernel, + ops::TransposeGradNPUKernel, + ops::TransposeGradNPUKernel); diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..36f7a6953585114e03cc11ff03e1d2da7d8bcd0e --- /dev/null +++ b/paddle/fluid/operators/transpose_op_npu_test.cc @@ -0,0 +1,137 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(transpose2); +USE_OP_DEVICE_KERNEL(transpose2, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("X"); + auto out = scope->Var("Out"); + auto xshape = scope->Var("XShape"); + auto* x_t = x->GetMutable(); + auto* out_t = out->GetMutable(); + auto* xshape_t = xshape->GetMutable(); + auto place = ctx.GetPlace(); + + int dim0 = 2; + int dim1 = 3; + TensorFromVector(std::vector({0, 1, 2, 3, 4, 5}), ctx, x_t); + ctx.Wait(); + x_t->Resize({dim0, dim1}); + out_t->Resize({dim0, dim1}); + ctx.Wait(); + out_t->mutable_data(place); + ctx.Wait(); + xshape_t->Resize({dim0, dim1}); + xshape_t->mutable_data(place); + f::AttributeMap attrs = {{"axis", std::vector({1, 0})}, + {"data_format", std::string("AnyLayout")}}; + auto op = f::OpRegistry::CreateOp("transpose2", {{"X", {"X"}}}, + {{"Out", {"Out"}}, {"XShape", {"XShape"}}}, + attrs); + ctx.Wait(); + op->Run(*scope, place); + ctx.Wait(); + std::vector out_v; + TensorToVector(*out_t, ctx, &out_v); + ctx.Wait(); + + EXPECT_EQ(out_t->numel(), dim0 * dim1); + EXPECT_EQ(out_v[0], 0); + EXPECT_EQ(out_v[1], 3); + EXPECT_EQ(out_v[2], 1); + EXPECT_EQ(out_v[3], 4); + EXPECT_EQ(out_v[4], 2); + EXPECT_EQ(out_v[5], 5); +} + +template +void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto xshape = scope->Var("XShape"); + auto x_grad = scope->Var("X@GRAD"); + auto out_grad = scope->Var("Out@GRAD"); + + auto* x_grad_t 
= x_grad->GetMutable(); + auto* xshape_t = xshape->GetMutable(); + auto* out_grad_t = out_grad->GetMutable(); + + int dim0 = 2; + int dim1 = 3; + auto place = ctx.GetPlace(); + + TensorFromVector(std::vector({0, 1, 2, 3, 4, 5}), ctx, out_grad_t); + ctx.Wait(); + + x_grad_t->Resize({dim0, dim1}); + xshape_t->Resize( + {0, dim0, + dim1}); // NOTE(zhiqiu): 0 is needed, see its infershape function + out_grad_t->Resize({dim0, dim1}); + + f::AttributeMap attrs = {{"axis", std::vector({1, 0})}, + {"data_format", std::string("AnyLayout")}}; + + auto op = f::OpRegistry::CreateOp( + "transpose2_grad", {{"Out@GRAD", {"Out@GRAD"}}, {"XShape", {"XShape"}}}, + {{"X@GRAD", {"X@GRAD"}}}, attrs); + + op->Run(*scope, place); + ctx.Wait(); + std::vector out_v; + TensorToVector(*x_grad_t, ctx, &out_v); + ctx.Wait(); + + EXPECT_EQ(x_grad_t->numel(), dim0 * dim1); + EXPECT_EQ(out_v[0], 0); + EXPECT_EQ(out_v[1], 3); + EXPECT_EQ(out_v[2], 1); + EXPECT_EQ(out_v[3], 4); + EXPECT_EQ(out_v[4], 2); + EXPECT_EQ(out_v[5], 5); +} + +TEST(transpose2, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} + +TEST(transpose2_grad, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + CompareGrad(&scope, ctx); +} diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..4253187fdde74de37afac13adb6fa969efd08b99 --- /dev/null +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/truncated_gaussian_random_op.h" +#include +#include +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class TruncatedGaussianRandomNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // TODO(zhiqiu): support dynamic shape and call ParameterizedTruncatedNormal + std::vector shape = ctx.Attr>("shape"); + Tensor shape_tensor(framework::proto::VarType::INT32); + shape_tensor.mutable_data({static_cast(shape.size())}, + ctx.GetPlace()); + TensorFromVector(shape, ctx.device_context(), &shape_tensor); + float mean = ctx.Attr("mean"); + Tensor mean_tensor(framework::proto::VarType::FP32); + mean_tensor.mutable_data({1}, ctx.GetPlace()); + TensorFromVector(std::vector{mean}, ctx.device_context(), + &mean_tensor); + + float std = ctx.Attr("std"); + Tensor std_tensor(framework::proto::VarType::FP32); + std_tensor.mutable_data({1}, ctx.GetPlace()); + TensorFromVector(std::vector{std}, ctx.device_context(), + &std_tensor); + + int32_t seed_var = ctx.Attr("seed"); + + Tensor min_tensor(framework::proto::VarType::FP32); + min_tensor.mutable_data({1}, ctx.GetPlace()); + float min_value = mean - std * 2.0; + TensorFromVector(std::vector{min_value}, ctx.device_context(), + &min_tensor); + + Tensor max_tensor(framework::proto::VarType::FP32); + max_tensor.mutable_data({1}, ctx.GetPlace()); + float max_value = mean + std * 2.0; + TensorFromVector(std::vector{max_value}, 
ctx.device_context(), + &max_tensor); + + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + auto stream = + ctx.template device_context() + .stream(); + auto runner = NpuOpRunner( + "ParameterizedTruncatedNormal", + {shape_tensor, mean_tensor, std_tensor, min_tensor, max_tensor}, {*out}, + {{"seed", seed_var}}); + runner.Run(stream); + } +}; + +// NOTE(zhiqiu): actually, this is cpu version kernel, and we need to make the +// above +// npu version work in the future. +template +class NPUTruncatedGaussianRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + float mean = context.Attr("mean"); + float std = context.Attr("std"); + auto* tensor = context.Output("Out"); + tensor->mutable_data(context.GetPlace()); + + Tensor cpu_tensor(tensor->type()); + cpu_tensor.Resize(tensor->dims()); + T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); + std::uniform_real_distribution dist(std::numeric_limits::min(), + 1.0); + TruncatedNormal truncated_normal(mean, std); + int64_t size = tensor->numel(); + + unsigned int seed = static_cast(context.Attr("seed")); + auto engine = framework::GetCPURandomEngine(seed); + for (int64_t i = 0; i < size; ++i) { + cpu_data[i] = truncated_normal(dist(*engine)); + } + framework::TensorCopy( + cpu_tensor, context.GetPlace(), + context.template device_context(), tensor); + context.template device_context() + .Wait(); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL(truncated_gaussian_random, + ops::NPUTruncatedGaussianRandomKernel); diff --git a/paddle/fluid/operators/unsqueeze_op_npu.cc b/paddle/fluid/operators/unsqueeze_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..c3daeffc13d1a771e57f125939c91ebd060f3cee --- /dev/null +++ b/paddle/fluid/operators/unsqueeze_op_npu.cc @@ -0,0 +1,41 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include + +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/unsqueeze_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + unsqueeze, ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel); +REGISTER_OP_NPU_KERNEL( + unsqueeze2, ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel); +#endif diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9b4485047f05c1cd35fc534a615d594ab37be639 --- /dev/null +++ b/paddle/fluid/operators/unsqueeze_op_npu_test.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(unsqueeze); +USE_OP_DEVICE_KERNEL(unsqueeze, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + int dim0 = 5; + int dim1 = 10; + + std::vector init; + for (int64_t i = 0; i < dim0 * dim1; ++i) { + init.push_back(static_cast(0.1)); + } + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({dim0, dim1}); + + ctx.Wait(); + + // run + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + std::vector axis; + axis.push_back(1); + f::AttributeMap attrs = {{"axes", axis}}; + + auto op = f::OpRegistry::CreateOp("unsqueeze", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + + op->Run(*scope, place); + ctx.Wait(); + + EXPECT_EQ((uint32_t)tensor_out->dims().size(), uint32_t(3)); + EXPECT_EQ((uint32_t)tensor_out->dims()[0], uint32_t(5)); + EXPECT_EQ((uint32_t)tensor_out->dims()[1], uint32_t(1)); + EXPECT_EQ((uint32_t)tensor_out->dims()[2], uint32_t(10)); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, 
&out_vec); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], static_cast(0.1)); + } + + ctx.Wait(); +} + +TEST(unsqueeze, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} diff --git a/paddle/fluid/platform/npu_info.cc b/paddle/fluid/platform/npu_info.cc index 090945239a3a1fd7e74d5ed1826bbc73ccc795f6..3814faa7662fc556bc84d61802772b7e7db7ad74 100644 --- a/paddle/fluid/platform/npu_info.cc +++ b/paddle/fluid/platform/npu_info.cc @@ -163,8 +163,11 @@ size_t NPUInitAllocSize() { return NPUAllocSize(/* realloc = */ false); } size_t NPUReallocSize() { return NPUAllocSize(/* realloc = */ true); } size_t NPUMinChunkSize() { - // Allow to allocate the minimum chunk size is 256 bytes. - return 1 << 8; + // NOTE(zhiqiu): It seems the min chunk size should be 512 on NPU, + // though no document specify that explicitly. + // See https://gitee.com/zhiqiuchen/Ascend/tree/master/test_reduce_sum_d for + // details. + return 1 << 9; } size_t NPUMaxChunkSize() { diff --git a/python/paddle/distributed/fleet/ascend_utils.py b/python/paddle/distributed/fleet/ascend_utils.py index 23e812041c8b2d2230539bdf8e02252d720ba3d0..7a4a4a189c92e4844be1bcf0d93a63d832a25ec2 100644 --- a/python/paddle/distributed/fleet/ascend_utils.py +++ b/python/paddle/distributed/fleet/ascend_utils.py @@ -82,24 +82,25 @@ def _get_ascend_rankfile(rank_table_file_path): def get_cloud_cluster(rank_table_file=None, device_mode=DeviceMode.ASCEND_NPU, - devices_per_proc=None, start_port=6070): """ Args: rank_table_file: string, ascend npu rank file path device_mode: DeviceMode(Int) - devices_per_proc:list start_port: the start port of current runtime env """ if rank_table_file: # multi trainers node_ips, device_count = _get_ascend_rankfile(rank_table_file) - node_index = os.environ.get("PADDLE_TRAINER_ID") - node_ip = None - if node_index is None: - _, node_ip = get_host_name_ip() + if len(node_ips) == 1: + node_ip = node_ips[0] else: - node_ip = 
node_ips[int(node_index)] + node_index = os.environ.get("PADDLE_TRAINER_ID") + node_ip = None + if node_index: + node_ip = node_ips[int(node_index)] + else: + _, node_ip = get_host_name_ip() assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \ % (node_ip, node_ips) @@ -108,11 +109,8 @@ def get_cloud_cluster(rank_table_file=None, node_ips = ["127.0.0.1"] node_ip = node_ips[0] device_count = 1 - devices_per_proc = None - - if devices_per_proc is None: - devices_per_proc = [str(x) for x in range(device_count)] + devices_per_proc = [str(x) for x in range(device_count)] free_ports = [ x for x in range(start_port, start_port + len(devices_per_proc)) ] diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index bd5b67005ba92770ffebd466e3516e55ab7d2141..13b793d3ad170177e193152ca06c280ac911e8a7 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -115,15 +115,6 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra default="collective", help="run mode of job, can be:collective/ps/ps-heter") - base_group.add_argument( - "--ascend_npus", - type=str, - default=None, - help="It's for ascend npu training." - "For example:" - "--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one npu." 
- ) - if fluid.core.is_compiled_with_cuda(): base_group.add_argument( "--gpus", @@ -243,7 +234,6 @@ def launch_collective(args): cluster, pod = ascend_utils.get_cloud_cluster( rank_table_file=os.getenv("RANK_TABLE_FILE", None), device_mode=device_mode, - devices_per_proc=devices_per_proc, start_port=start_port) else: # trainers_num = 1 or not use paddlecloud ips="a,b" diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 9f6c186b353399fdbc4c1310337d43d1d314b681..b4d5c58abbf2e59343e0f2fd0088b792c435bddb 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -484,6 +484,11 @@ def start_local_trainers(cluster, proc_env["FLAGS_selected_gpus"] = "%s" % ",".join( [str(g) for g in t.accelerators]) + elif len(t. + accelerators) > 0 and pod.device_mode == DeviceMode.ASCEND_NPU: + proc_env["FLAGS_selected_npus"] = "%s" % ",".join( + [str(g) for g in t.accelerators]) + if len(t.accelerators) > 0: proc_env["FLAGS_selected_accelerators"] = "%s" % ",".join( [str(g) for g in t.accelerators]) @@ -589,17 +594,6 @@ def watch_local_trainers(procs, nranks): return alive -def get_ascend_npus(npus): - if npus is None: - count = fluid.core.NPUDevice.get_device_count() - if count <= 0: - return None - ret = [str(x) for x in range(count)] - else: - ret = [x.strip() for x in npus.split(',')] - return ret - - def get_gpus(gpus): if gpus is None: gpus_num = fluid.core.get_cuda_device_count() @@ -697,9 +691,7 @@ def get_device_proc_info(args): else: devices_per_proc = gpus elif device_mode == DeviceMode.ASCEND_NPU: - npus = get_ascend_npus(args.ascend_npus) - assert args.nproc_per_node is None, "ascend_npus need't nproc_per_node arguments" - devices_per_proc = npus + devices_per_proc = None elif device_mode == DeviceMode.XPU: xpus = get_xpus(args.xpus) if args.nproc_per_node is not None: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py 
index 565c134ae9d95f969b7a81245b7993b3b48ff5b8..e90af2a1e790c358f830be2d91d505fc91899a33 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9518,8 +9518,8 @@ def pow(x, factor=1.0, name=None): y_2 = fluid.layers.pow(x, factor=factor_tensor) # y_2 is x^{3.0} """ - check_variable_and_dtype(x, 'x', ['int32', 'int64', 'float32', 'float64'], - 'pow') + check_variable_and_dtype( + x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], 'pow') helper = LayerHelper('pow', **locals()) inputs = {'X': x} diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 89591646421edd8b9a6de065483fcd796a7f40c5..6b2765745169342863ce5b3d37fadd281047eeec 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -531,7 +531,7 @@ if(WITH_DISTRIBUTE) bash_test_modules(test_fleet_launch_async START_BASH test_fleet_launch_async.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_fleet_launch_cloud START_BASH test_fleet_launch_cloud.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_fleet_launch_nproc START_BASH test_fleet_launch_nproc.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) - if(WITH_ASCEND) + if(WITH_ASCEND OR WITH_ASCEND_CL) bash_test_modules(test_fleet_launch_ascend START_BASH test_fleet_launch_ascend.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_ascend_group START_BASH test_ascend_group.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) endif() diff --git a/python/paddle/fluid/tests/unittests/ascend_group.py b/python/paddle/fluid/tests/unittests/ascend_group.py index 78a3687b5ca3cd2b8687b6b425cad61318cb3671..851544e165980a8cf1ee750cd7b6a9417e00ed48 100644 --- a/python/paddle/fluid/tests/unittests/ascend_group.py +++ b/python/paddle/fluid/tests/unittests/ascend_group.py @@ -71,6 +71,24 @@ def init_communicator(startup_program, main_program, 
current_endpoint, OP_ROLE_KEY: OpRole.Forward, }) + # add input op for test + fill_var_name = "tensor@Filled" + fill_var = block.create_var( + name=fill_var_name, + shape=[10, 10], + dtype='float32', + persistable=False, + stop_gradient=True) + block.append_op( + type="fill_constant", + outputs={"Out": fill_var_name}, + attrs={ + "shape": [10, 10], + "dtype": fill_var.dtype, + "value": 1.0, + "place_type": 1 + }) + with fluid.program_guard(main_program): op_type = "c_allreduce_sum" data = fluid.layers.fill_constant(shape=[1], dtype='float32', value=2.5) @@ -120,10 +138,14 @@ def train(world_endpoints, world_device_ids, local_device_ids, local_rank): main_program = main_programs[local_rank] loss = Loss(Block(main_program)) optimizer = ascend_optimizer.AscendOptimizer(None, fetch_list=[]) - optimizer.minimize(loss, startup_program, auto_dp=True) + optimizer.minimize( + loss, + startup_program, + auto_dp=True, + rank_table_file=os.getenv("RANK_TABLE_FILE", None)) exe = paddle.static.Executor(paddle.CPUPlace()) - #exe.run(startup_program) + exe.run(startup_program) exe.run(main_program) diff --git a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py index 33e6f63ea10ceda243a6d11ddbe45f00bf03ad40..bb2180a733f818f81963e9702c854a78488a7592 100644 --- a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py +++ b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py @@ -19,6 +19,7 @@ import time def train(prefix): selected_accelerators = os.getenv("FLAGS_selected_accelerators") + selected_npus = os.getenv("FLAGS_selected_npus") trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") @@ -27,8 +28,8 @@ def train(prefix): device_ids = os.getenv("PADDLE_WORLD_DEVICE_IDS") current_device_id = os.getenv("PADDLE_LOCAL_DEVICE_IDS") - details = 
"selected_accelerators:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\ - .format(selected_accelerators, worker_endpoints, trainers_num, current_endpoint,trainer_id,device_ids, current_device_id) + details = "selected_accelerators:{} selected_npus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\ + .format(selected_accelerators, selected_npus, worker_endpoints, trainers_num, current_endpoint,trainer_id,device_ids, current_device_id) print(details) with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), diff --git a/python/paddle/fluid/tests/unittests/hccl_tools.py b/python/paddle/fluid/tests/unittests/hccl_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..3ae8f38dc64bd1da6ab5a46ddd60b239b5461ad9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hccl_tools.py @@ -0,0 +1,174 @@ +# -*- coding:UTF-8 -*- + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""generate hccl config file script""" +import os +import sys +import json +import socket +from argparse import ArgumentParser +from typing import Dict, Any + + +def parse_args(): + """ + parse args . + + Args: + + Returns: + args. + + Examples: + >>> parse_args() + """ + parser = ArgumentParser(description="mindspore distributed training launch " + "helper utilty that will generate hccl" + " config file") + parser.add_argument( + "--device_num", + type=str, + default="[0,8)", + help="The number of the Ascend accelerators used. please note that the Ascend accelerators" + "used must be continuous, such [0,4) means to use four chips " + "0,1,2,3; [0,1) means to use chip 0; The first four chips are" + "a group, and the last four chips are a group. 
In addition to" + "the [0,8) chips are allowed, other cross-group such as [3,6)" + "are prohibited.") + parser.add_argument( + "--visible_devices", + type=str, + default="0,1,2,3,4,5,6,7", + help="will use the visible devices sequentially") + parser.add_argument("--server_ip", type=str, default="", help="server ip") + args = parser.parse_args() + return args + + +def get_host_ip(): + """ + get host ip + """ + ip = None + + try: + hostname = socket.gethostname() + ip = socket.gethostbyname(hostname) + except EOFError: + pass + + return ip + + +def main(): + print("start", __file__) + args = parse_args() + + # visible_devices + visible_devices = args.visible_devices.split(',') + print('visible_devices:{}'.format(visible_devices)) + + # server_id + ip = get_host_ip() + if args.server_ip: + server_id = args.server_ip + elif ip: + server_id = ip + else: + raise ValueError("please input server ip!") + print('server_id:{}'.format(server_id)) + + # device_num + first_num = int(args.device_num[1]) + last_num = int(args.device_num[3]) + if first_num < 0 or last_num > 8: + raise ValueError("device num {} must be in range [0,8] !".format( + args.device_num)) + if first_num > last_num: + raise ValueError( + "First num {} of device num {} must less than last num {} !".format( + first_num, args.device_num, last_num)) + if first_num < 4: + if last_num > 4: + if first_num == 0 and last_num == 8: + pass + else: + raise ValueError( + "device num {} must be in the same group of [0,4] or [4,8] !". 
+ format(args.device_num)) + + device_num_list = list(range(first_num, last_num)) + print("device_num_list:", device_num_list) + + assert len(visible_devices) >= len(device_num_list) + + # construct hccn_table + device_ips = {} + with open('/etc/hccn.conf', 'r') as fin: + for hccn_item in fin.readlines(): + if hccn_item.strip().startswith('address_'): + device_id, device_ip = hccn_item.split('=') + device_id = device_id.split('_')[1] + device_ips[device_id] = device_ip.strip() + + hccn_table = {'version': '1.0', 'server_count': '1', 'server_list': []} + device_list = [] + rank_id = 0 + for instance_id in device_num_list: + device_id = visible_devices[instance_id] + device_ip = device_ips[device_id] + device = { + 'device_id': device_id, + 'device_ip': device_ip, + 'rank_id': str(rank_id) + } + print('rank_id:{}, device_id:{}, device_ip:{}'.format( + rank_id, device_id, device_ip)) + rank_id += 1 + device_list.append(device) + hccn_table['server_list'].append({ + 'server_id': server_id, + 'device': device_list, + 'host_nic_ip': 'reserve' + }) + hccn_table['status'] = 'completed' + + # save hccn_table to file + table_path = os.getcwd() + table_fn = os.path.join(table_path, 'hccl_{}p_{}_{}.json'.format( + len(device_num_list), "".join(map(str, device_num_list)), server_id)) + with open(table_fn, 'w') as table_fp: + json.dump(hccn_table, table_fp, indent=4) + sys.stdout.flush() + print("Completed: hccl file was save in :", table_fn) + + +if __name__ == "__main__": + main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_accuracy_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_accuracy_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..b5175bdb19c7e5bc2e981b7f76fc2b7471d73d6f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_accuracy_op_npu.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestAccuracy(OpTest): + def setUp(self): + self.op_type = "accuracy" + self.set_npu() + self.init_dtype() + np.random.seed(SEED) + pred = np.random.uniform(1, 2, [11, 1]).astype(self.dtype) + label = pred.copy() + accuracy = np.array([1]).astype(self.dtype) + correct = np.array([11 * 1]).astype(self.dtype) + total = np.array([11 * 1]).astype(self.dtype) + + self.inputs = { + "Out": OpTest.np_dtype_to_fluid_dtype(pred), + "Label": OpTest.np_dtype_to_fluid_dtype(label), + "Indices": OpTest.np_dtype_to_fluid_dtype(pred) + } + self.outputs = { + "Accuracy": accuracy, + "Correct": correct, + "Total": total + } + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + +class TestAccuracy2(TestAccuracy): + def setUp(self): + self.op_type = "accuracy" + self.set_npu() + self.init_dtype() + np.random.seed(SEED) + pred = np.random.uniform(1, 2, [11, 1]).astype(self.dtype) + label = np.random.uniform(4, 5, [11, 
1]).astype(self.dtype) + accuracy = np.array([0]).astype(self.dtype) + correct = np.array([11 * 0]).astype(self.dtype) + total = np.array([11 * 1]).astype(self.dtype) + + self.inputs = { + "Out": OpTest.np_dtype_to_fluid_dtype(pred), + "Label": OpTest.np_dtype_to_fluid_dtype(label), + "Indices": OpTest.np_dtype_to_fluid_dtype(pred) + } + self.outputs = { + "Accuracy": accuracy, + "Correct": correct, + "Total": total + } + + +class TestAccuracy3(TestAccuracy): + def setUp(self): + self.op_type = "accuracy" + self.set_npu() + self.init_dtype() + np.random.seed(SEED) + a = np.random.randint(1, 2, [5, 1]) + b = np.random.randint(0, 1, [5, 1]) + pred = np.row_stack((a, b)).astype(self.dtype) + label = np.random.randint(1, 2, [10, 1]).astype(self.dtype) + accuracy = np.array([0.5]).astype(self.dtype) + correct = np.array([5]).astype(self.dtype) + total = np.array([10 * 1]).astype(self.dtype) + + self.inputs = { + "Out": OpTest.np_dtype_to_fluid_dtype(pred), + "Label": OpTest.np_dtype_to_fluid_dtype(label), + "Indices": OpTest.np_dtype_to_fluid_dtype(pred) + } + self.outputs = { + "Accuracy": accuracy, + "Correct": correct, + "Total": total + } + + +class TestAccuracyInt(TestAccuracy): + def init_dtype(self): + self.dtype = np.int + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..ebf041388eeab9707ff9143de3002b11c7c6a94d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py @@ -0,0 +1,148 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from test_adam_op import adam_step + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestSGD(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "adam" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} + + param_out, moment1_out, \ + moment2_out = adam_step(self.inputs, self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def set_npu(self): + self.__class__.use_npu = True + + def 
init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5, check_dygraph=False) + + +''' +# TODO(zhiqiu): The following test may let 0-3 card down. +# we need to analyze it and open it. + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.pow(sum, 2.0) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + adam = fluid.optimizer.Adam(learning_rate=0.01) + adam.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + 
self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) +''' + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..4cda0ceeccf9c703bfebf86700e9f41d84c7b9c1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py @@ -0,0 +1,123 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestCheckFiniteAndUnscaleOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "check_finite_and_unscale" + self.place = paddle.NPUPlace(0) + self.init_dtype() + x = np.random.random((1024, 1024)).astype(self.dtype) + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([0]), + 'Out': [('out0', x / scale)], + } + + def set_npu(self): + self.__class__.use_npu = True + + def init_kernel_type(self): + self.use_mkldnn = False + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestCheckFiniteAndUnscaleOpWithNan(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "check_finite_and_unscale" + self.place = paddle.NPUPlace(0) + self.init_dtype() + x = np.random.random((1024, 1024)).astype(self.dtype) + x[128][128] = np.nan + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([1]), + 'Out': [('out0', x)], + } + + def set_npu(self): + self.__class__.use_npu = True + + def init_kernel_type(self): + self.use_mkldnn = False + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + # When input contains nan, do not check the output, + # since the output may be nondeterministic and will be discarded. 
+ self.check_output_with_place( + self.place, check_dygraph=False, no_check_set=['Out']) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestCheckFiniteAndUnscaleOpWithInf(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "check_finite_and_unscale" + self.place = paddle.NPUPlace(0) + self.init_dtype() + x = np.random.random((1024, 1024)).astype(self.dtype) + x[128][128] = np.inf + scale = np.random.random((1)).astype(self.dtype) + + self.inputs = {'X': [('x0', x)], 'Scale': scale} + self.outputs = { + 'FoundInfinite': np.array([1]), + 'Out': [('out0', x)], + } + + def set_npu(self): + self.__class__.use_npu = True + + def init_kernel_type(self): + self.use_mkldnn = False + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + # When input contains inf, do not check the output, + # since the output may be nondeterministic and will be discarded. + self.check_output_with_place( + self.place, check_dygraph=False, no_check_set=['Out']) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_cast_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_cast_op_npu.py new file mode 100755 index 0000000000000000000000000000000000000000..b39771e29c7b4769bab6548d34ebac24bf47506a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_cast_op_npu.py @@ -0,0 +1,77 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestCast1(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "cast" + self.place = paddle.NPUPlace(0) + + ipt = np.random.random(size=[10, 10]) + 1 + self.inputs = {'X': ipt.astype('float32')} + self.outputs = {'Out': ipt.astype('float16')} + + self.attrs = { + 'in_dtype': int(core.VarDesc.VarType.FP32), + 'out_dtype': int(core.VarDesc.VarType.FP16) + } + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + +class TestCast2(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "cast" + self.place = paddle.NPUPlace(0) + + ipt = np.random.random(size=[10, 10]) + 1 + self.inputs = {'X': ipt.astype('float16')} + self.outputs = {'Out': ipt.astype('float32')} + + self.attrs = { + 'in_dtype': int(core.VarDesc.VarType.FP16), + 'out_dtype': int(core.VarDesc.VarType.FP32) + } + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False, atol=1e-3) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_compare_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_compare_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..54a2c1e7163a9f122927bc2781eb1a13a84a124e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_compare_op_npu.py @@ -0,0 +1,145 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestEqual(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "equal" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + y = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = x == y # all elements are not equal + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestLessthan(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "less_than" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + y = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = x < y + + self.inputs = { + 'X': 
OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + +class TestEqual2(TestEqual): + def setUp(self): + self.set_npu() + self.op_type = "equal" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + y = x.copy() + y[0][1] = 1 + out = x == y # all elements are equal, except position [0][1] + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.outputs = {'Out': out} + + +class TestLessthan2(TestLessthan): + def setUp(self): + self.set_npu() + self.op_type = "less_than" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + y = x.copy() + y[0][1] = 1 + out = x < y # all elements are equal, except position [0][1] + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.outputs = {'Out': out} + + +class TestEqual2FP16(TestEqual2): + def init_dtype(self): + self.dtype = np.float16 + + +class TestEqual2Int(TestEqual2): + def init_dtype(self): + self.dtype = np.int32 + + +class TestLessthan2FP16(TestLessthan2): + def init_dtype(self): + self.dtype = np.float16 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..a2ec1c7a9eef6ebc516cdc94062dd628c77a1f81 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_concat_op_npu.py @@ -0,0 +1,115 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestConcat(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "concat" + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.init_test_data() + + self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} + self.attrs = {'axis': self.axis} + if self.axis < 0: + self.actual_axis = self.axis + len(self.x0.shape) + self.actual_axis = self.actual_axis if self.actual_axis > 0 else 0 + else: + self.actual_axis = self.axis + + self.outputs = { + 'Out': np.concatenate( + (self.x0, self.x1, self.x2), axis=self.actual_axis) + } + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + def init_test_data(self): + self.x0 = np.random.random((1, 4, 50)).astype(self.dtype) + self.x1 = np.random.random((2, 4, 50)).astype(self.dtype) + self.x2 = np.random.random((3, 4, 50)).astype(self.dtype) + self.axis = 0 + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['x0', 'x2'], 'Out', check_dygraph=False) + 
self.check_grad_with_place( + self.place, ['x1'], 'Out', check_dygraph=False) + self.check_grad_with_place( + self.place, ['x2'], 'Out', check_dygraph=False) + + +class TestConcatFP16(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "concat" + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.init_test_data() + + self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} + self.attrs = {'axis': self.axis} + if self.axis < 0: + self.actual_axis = self.axis + len(self.x0.shape) + self.actual_axis = self.actual_axis if self.actual_axis > 0 else 0 + else: + self.actual_axis = self.axis + + self.outputs = { + 'Out': np.concatenate( + (self.x0, self.x1, self.x2), axis=self.actual_axis) + } + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + def init_test_data(self): + self.x0 = np.random.random((1, 4, 50)).astype(self.dtype) + self.x1 = np.random.random((2, 4, 50)).astype(self.dtype) + self.x2 = np.random.random((3, 4, 50)).astype(self.dtype) + self.axis = 0 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py index 47da4fdb23ec49924fbfb1b5cc4b02e2355d287e..6a82157faaec41d9abaffa9b68e3a3e80b6b2fb3 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py @@ -64,28 +64,28 @@ class TestElementwiseAddOp(OpTest): def test_check_output(self): self.check_output_with_place(self.place, check_dygraph=False) - # TODO(ascendrc): Test grad op after it is implemented. 
- # def test_check_grad_normal(self): - # self.check_grad_with_place( - # self.place, ['X', 'Y'], - # 'Out', - # max_relative_error=0.006, - # check_dygraph=False) - # - # def test_check_grad_ingore_x(self): - # self.check_grad_with_place( - # self.place, ['Y'], - # 'Out', - # no_grad_set=set("X"), - # max_relative_error=0.006, - # check_dygraph=False) - # - # def test_check_grad_ingore_y(self): - # self.check_grad_with_place( - # self.place, ['X'], - # 'Out', - # no_grad_set=set("Y"), - # max_relative_error=0.006,check_dygraph=False) + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['X', 'Y'], + 'Out', + max_relative_error=0.006, + check_dygraph=False) + + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + self.place, ['Y'], + 'Out', + no_grad_set=set("X"), + max_relative_error=0.006, + check_dygraph=False) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place( + self.place, ['X'], + 'Out', + no_grad_set=set("Y"), + max_relative_error=0.006, + check_dygraph=False) @unittest.skipIf(not paddle.is_compiled_with_npu(), @@ -133,10 +133,6 @@ class TestAddAPI(unittest.TestCase): True, msg="z_value = {}, but expected {}".format(z_value, z_expected)) - def test_backward(self): - # TODO(ascendrc): Test backward after add grad npu op implemented. - pass - @unittest.skipIf(not paddle.is_compiled_with_npu(), "core is not compiled with NPU") diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_div_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_div_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..0ae2678d10b47c8998882e3ee00d177e86236a06 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_div_op_npu.py @@ -0,0 +1,183 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwiseDiv(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_div" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + y = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = np.divide(x, y) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['X', 'Y'], + 'Out', + max_relative_error=0.007, + check_dygraph=False) + + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + self.place, ['Y'], + 'Out', + max_relative_error=0.007, + no_grad_set=set("X"), + check_dygraph=False) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', no_grad_set=set("Y"), check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class 
TestElementwiseDivFp16(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_div" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) + y = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) + out = np.divide(x, y) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False, atol=1e-5) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwiseDivNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.uniform(1, 2, [32, 32]).astype('float32') + b_np = np.random.uniform(1, 2, [32, 32]).astype('float32') + c_np = np.random.uniform(1, 2, [32, 32]).astype('float32') + d_np = np.random.uniform(1, 2, [32, 32]).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + c = paddle.static.data(name="c", shape=[32, 32], dtype='float32') + d = paddle.static.data(name="d", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + e = paddle.multiply(a, b) + f = paddle.multiply(c, d) + f.stop_gradient = True + g = fluid.layers.elementwise_div(e, f) + + fc_1 = fluid.layers.fc(input=g, size=128) + prediction = 
fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run(main_prog, + feed={ + "a": a_np, + "b": b_np, + "c": c_np, + "d": d_np, + "label": label_np + }, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_floordiv_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_floordiv_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..93538e938670f07ab78f33e0c9749b702854b7d6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_floordiv_op_npu.py @@ -0,0 +1,67 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwiseFloorDiv(OpTest): + def setUp(self): + self.op_type = "elementwise_floordiv" + self.set_npu() + self.init_dtype() + self.init_input_output() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.attrs = {} + self.outputs = {'Out': self.out} + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def init_input_output(self): + self.x = np.random.uniform(1, 1000, [10, 10]).astype(self.dtype) + self.y = np.random.uniform(1, 1000, [10, 10]).astype(self.dtype) + self.out = np.floor_divide(self.x, self.y) + + def init_dtype(self): + self.dtype = "int64" + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwiseFloorDiv2(TestElementwiseFloorDiv): + def init_dtype(self): + self.dtype = "int32" + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..b4d9c7285b2b556f76f94241cc0b9373319f7753 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py @@ -0,0 +1,161 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwiseMin(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_min" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + y = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = np.minimum(x, y) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + # TODO(ascendrc): Min grad test + # def test_check_grad(self): + # if self.dtype == np.float16: + # return + # self.check_grad(['X'], 'Out') + # + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwiseMinFp16(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_min" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) + y = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) + out = np.minimum(x, y) + + 
self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False, atol=1e-5) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwiseMinNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + c = paddle.minimum(a, b) + + fc_1 = fluid.layers.fc(input=c, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return 
pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..9bfb7e033e7ea454223c683877bb30f02506be75 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py @@ -0,0 +1,171 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwiseMul(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + y = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = np.multiply(x, y) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + # TODO(ascendrc): Mul grad test + # def test_check_grad(self): + # if self.dtype == np.float16: + # return + # self.check_grad(['X'], 'Out') + # + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwiseMulFp16(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) + y = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) + out = np.multiply(x, y) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False, atol=1e-5) + + 
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestElementwiseMulNet(unittest.TestCase):
    """Trains a small multiply + fc network on CPU and on NPU and compares them."""

    def _test(self, run_npu=True):
        """Build the network, train it for 100 steps and return the last fetch.

        Args:
            run_npu: run on ``paddle.NPUPlace(0)`` when true, else on CPU.

        Returns:
            Tuple ``(pred_res, loss_res)`` fetched on the final step.
        """
        main_prog = paddle.static.Program()
        startup_prog = paddle.static.Program()
        for prog in (main_prog, startup_prog):
            prog.random_seed = SEED
        np.random.seed(SEED)

        # Draw order (a, b, c, d, label) is fixed so every call sees the same data.
        feed = {}
        for key in ("a", "b", "c", "d"):
            feed[key] = np.random.random(size=(32, 32)).astype('float32')
        feed["label"] = np.random.randint(2, size=(32, 1)).astype('int64')

        with paddle.static.program_guard(main_prog, startup_prog):
            a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
            b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
            c = paddle.static.data(name="c", shape=[32, 32], dtype='float32')
            d = paddle.static.data(name="d", shape=[32, 32], dtype='float32')
            label = paddle.static.data(
                name="label", shape=[32, 1], dtype='int64')

            ab = paddle.multiply(a, b)
            cd = paddle.multiply(c, d)
            # The second product is excluded from back-propagation.
            cd.stop_gradient = True
            product = paddle.multiply(ab, cd)

            hidden = fluid.layers.fc(input=product, size=128)
            prediction = fluid.layers.fc(input=hidden, size=2, act='softmax')

            cost = fluid.layers.cross_entropy(input=prediction, label=label)
            loss = fluid.layers.reduce_mean(cost)
            fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

        place = paddle.NPUPlace(0) if run_npu else paddle.CPUPlace()

        exe = paddle.static.Executor(place)
        exe.run(startup_prog)

        print("Start run on {}".format(place))
        for epoch in range(100):
            pred_res, loss_res = exe.run(main_prog,
                                         feed=feed,
                                         fetch_list=[prediction, loss])
            if epoch % 10 == 0:
                print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                    epoch, pred_res[0], loss_res))

        return pred_res, loss_res

    def test_npu(self):
        cpu_pred, cpu_loss = self._test(False)
        npu_pred, npu_loss = self._test(True)

        for npu_val, cpu_val in ((npu_pred, cpu_pred), (npu_loss, cpu_loss)):
            self.assertTrue(np.allclose(npu_val, cpu_val))


if __name__ == '__main__':
    unittest.main()
# diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..862c546b8e05ebe2046e9c5aeb52178fa47f59ab
# --- /dev/null
# +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_pow_op_npu.py
# @@ -0,0 +1,161 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwisePow(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_pow" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + y = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = np.power(x, y) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + # TODO(ascendrc): Pow grad test + # def test_check_grad(self): + # if self.dtype == np.float16: + # return + # self.check_grad(['X'], 'Out') + # + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwisePowFp16(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_pow" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) + y = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) + out = np.power(x, y) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False, atol=1e-5) + + 
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestElementwisePowNet(unittest.TestCase):
    """Trains a small pow + fc network on CPU and on NPU and compares them."""

    def _test(self, run_npu=True):
        """Build the network, train it for 100 steps and return the last fetch.

        Args:
            run_npu: run on ``paddle.NPUPlace(0)`` when true, else on CPU.

        Returns:
            Tuple ``(pred_res, loss_res)`` fetched on the final step.
        """
        main_prog = paddle.static.Program()
        startup_prog = paddle.static.Program()
        for prog in (main_prog, startup_prog):
            prog.random_seed = SEED
        np.random.seed(SEED)

        # Draw order (a, b, label) is fixed so every call sees the same data.
        feed = {}
        for key in ("a", "b"):
            feed[key] = np.random.random(size=(32, 32)).astype('float32')
        feed["label"] = np.random.randint(2, size=(32, 1)).astype('int64')

        with paddle.static.program_guard(main_prog, startup_prog):
            a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
            b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
            label = paddle.static.data(
                name="label", shape=[32, 1], dtype='int64')

            powered = paddle.pow(a, b)

            hidden = fluid.layers.fc(input=powered, size=128)
            prediction = fluid.layers.fc(input=hidden, size=2, act='softmax')

            cost = fluid.layers.cross_entropy(input=prediction, label=label)
            loss = fluid.layers.reduce_mean(cost)
            fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

        place = paddle.NPUPlace(0) if run_npu else paddle.CPUPlace()

        exe = paddle.static.Executor(place)
        exe.run(startup_prog)

        print("Start run on {}".format(place))
        for epoch in range(100):
            pred_res, loss_res = exe.run(
                main_prog, feed=feed, fetch_list=[prediction, loss])
            if epoch % 10 == 0:
                print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                    epoch, pred_res[0], loss_res))

        return pred_res, loss_res

    def test_npu(self):
        cpu_pred, cpu_loss = self._test(False)
        npu_pred, npu_loss = self._test(True)

        for npu_val, cpu_val in ((npu_pred, cpu_pred), (npu_loss, cpu_loss)):
            self.assertTrue(np.allclose(npu_val, cpu_val))


if __name__ == '__main__':
    unittest.main()
# diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py
new file mode 100644 index 0000000000000000000000000000000000000000..f6a84d3be5c100c7324f2ed62ce8c934f4318b7f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py @@ -0,0 +1,144 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestExpand(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "expand" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.randn(3, 1, 7).astype(self.dtype) + out = np.tile(x, [1, 10, 1]) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {'expand_times': [1, 10, 1]} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + # TODO(ascendrc): Add grad test + # def test_check_grad(self): + # if self.dtype == np.float16: + # return + # self.check_grad(['X'], 'Out') + # + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class 
TestExpandV2(TestExpand): + def setUp(self): + self.set_npu() + self.op_type = "expand" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.randn(3, 1, 7).astype(self.dtype) + out = np.tile(x, [1, 10, 1]) + expand_times = np.array([1, 10, 1]).astype(np.int32) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'ExpandTimes': OpTest.np_dtype_to_fluid_dtype(expand_times) + } + self.attrs = {} + self.outputs = {'Out': out} + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestExpandFp16(TestExpand): + no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestExpandNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 1)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 1], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + res = paddle.fluid.layers.expand(a, [1, 32]) + loss = res.sum() + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + for epoch in range(100): + + loss_res = exe.run(main_prog, + feed={"a": a_np, + "label": label_np}, + fetch_list=[loss]) + if epoch % 10 == 0: + print("Epoch {} | Loss: {}".format(epoch, loss)) + + return loss_res + + def test_npu(self): + cpu_loss = self._test(False) + npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +if 
__name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_fill_constant_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_fill_constant_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..6e619bfd11fb901994ad3a91187a716b014dab41 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_fill_constant_op_npu.py @@ -0,0 +1,102 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestFillConstant(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "fill_constant" + self.init_dtype() + + self.inputs = {} + self.attrs = {'shape': [123, 92], 'value': 3.8} + self.outputs = {'Out': np.full((123, 92), 3.8)} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + +class TestFillConstantInt(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "fill_constant" + + self.inputs = {} + self.attrs = { + 'shape': [123, 92], + 'value': 1, + 'dtype': core.VarDesc.VarType.INT32 + } + self.outputs = {'Out': np.full((123, 92), 1).astype(self.dtype)} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.int32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + +class TestFillConstantFP16(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "fill_constant" + + self.inputs = {} + self.attrs = { + 'shape': [123, 92], + 'value': 1.0, + 'dtype': core.VarDesc.VarType.FP16 + } + self.outputs = {'Out': np.full((123, 92), 1.0).astype(self.dtype)} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False, atol=1e-3) + + +if __name__ == '__main__': + unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..008422ffd21188327fa938734928a0dc62187824 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py @@ -0,0 +1,177 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.framework import core + +paddle.enable_static() +SEED = 2021 + + +def gather_numpy(x, index, axis): + x_transpose = np.swapaxes(x, 0, axis) + tmp_gather = x_transpose[index, ...] 
+ gather = np.swapaxes(tmp_gather, 0, axis) + return gather + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestGatherOp(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "gather" + self.config() + xnp = np.random.random(self.x_shape).astype(self.x_type) + self.inputs = { + 'X': xnp, + 'Index': np.array(self.index).astype(self.index_type) + } + self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]} + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], + 'Out', + max_relative_error=0.006, + check_dygraph=False) + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (10, 20) + self.x_type = "float32" + self.index = [1, 3, 5] + self.index_type = "int32" + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestCase1(TestGatherOp): + def config(self): + """ + For one dimension input + """ + self.x_shape = (100) + self.x_type = "float32" + self.index = [1, 3, 5] + self.index_type = "int32" + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class API_TestGather(unittest.TestCase): + def test_out1(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + data1 = fluid.layers.data('data1', shape=[-1, 2], dtype='float32') + index = fluid.layers.data('index', shape=[-1, 1], dtype='int32') + out = paddle.fluid.layers.gather(data1, index) + place = paddle.NPUPlace(0) + exe = fluid.Executor(place) + input = np.array([[1, 2], [3, 4], [5, 6]]) + index_1 = np.array([1, 2]) + result, = exe.run(feed={"data1": input, + "index": index_1}, + fetch_list=[out]) + expected_output = np.array([[3, 4], [5, 6]]) + self.assertTrue(np.allclose(result, expected_output)) + + def test_out2(self): 
+ with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + x = paddle.fluid.data('x', shape=[-1, 2], dtype='float32') + index = paddle.fluid.data('index', shape=[-1, 1], dtype='int32') + out = paddle.gather(x, index) + place = paddle.NPUPlace(0) + exe = paddle.static.Executor(place) + x_np = np.array([[1, 2], [3, 4], [5, 6]]).astype('float32') + index_np = np.array([1, 1]).astype('int32') + result, = exe.run(feed={"x": x_np, + "index": index_np}, + fetch_list=[out]) + expected_output = gather_numpy(x_np, index_np, axis=0) + self.assertTrue(np.allclose(result, expected_output)) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestGatherGrad(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(8192, 768)).astype('float32') + index_np = np.random.randint(0, 8192, size=(1232, 1)).astype('int32') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[8192, 768], dtype='float32') + index = paddle.static.data( + name="index", shape=[1232, 1], dtype='int32') + a.stop_gradient = False + b = paddle.gather(a, index) + + loss = fluid.layers.reduce_mean(b) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run(main_prog, + feed={"a": a_np, + "index": index_np}, + fetch_list=[b, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res[0])) + + return pred_res, loss_res + + def test_npu(self): + npu_pred, npu_loss = self._test(True) + 
cpu_pred, cpu_loss = self._test(False) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..efa1918206b035005dd12939b7001933266c107c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_gelu_op_npu.py @@ -0,0 +1,160 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +from scipy import special +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +def np_gelu(x): + y = 0.5 * x * (1 + special.erf(x / np.sqrt(2))) + return y + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestGelu(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "gelu" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = np_gelu(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False, atol=1e-3) + + # TODO(ascendrc): Add grad test + # def test_check_grad(self): + # if self.dtype == np.float16: + # return + # self.check_grad(['X'], 'Out') + # + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestGeluFp16(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "gelu" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) + out = np_gelu(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False, atol=1e-3) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestGeluNet(unittest.TestCase): + def _test(self, 
run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + c = paddle.multiply(a, b) + d = fluid.layers.gelu(c) + + fc_1 = fluid.layers.fc(input=d, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred, atol=1e-3)) + self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-3)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_increment_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_increment_op_npu.py new file mode 100644 index 
0000000000000000000000000000000000000000..a102f3d9ce185f7b219ce391b56cb5fcb30e99d4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_increment_op_npu.py @@ -0,0 +1,119 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core + +paddle.enable_static() +SEED = 2021 + +NPUPlace = 5 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestIncrement(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(NPUPlace) + self.op_type = "increment" + self.init_dtype() + + self.inputs = { + 'X': + OpTest.np_dtype_to_fluid_dtype(np.array([1]).astype(self.dtype)), + } + + self.attrs = {"Step": 1} + self.outputs = {'Out': np.array([2])} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.int64 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestIncrementFP16(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(NPUPlace) + self.op_type = "increment" + self.init_dtype() + + 
self.inputs = { + 'X': + OpTest.np_dtype_to_fluid_dtype(np.array([1]).astype(self.dtype)), + } + self.pre_input_id = id(self.inputs['X']) + + self.attrs = {"Step": 1} + self.outputs = {'Out': np.array([2])} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestIncrementInplace(unittest.TestCase): + def test_npu(self): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.array([1]).astype('float32') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[1], dtype='float32') + b = fluid.layers.increment(a) + + place = paddle.NPUPlace(NPUPlace) + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + b_value = exe.run(main_prog, feed={"a": a_np, }, fetch_list=[b]) + + print('input a id is : {}'.format(id(a))) + print('input b id is : {}'.format(id(b))) + + self.assertEqual(id(a), id(b)) + self.assertEqual(b_value[0], 2) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_layer_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_layer_norm_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..d447dfb8d4d031e6f29fdfedab285066d4dea565 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_layer_norm_op_npu.py @@ -0,0 +1,203 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import numpy as np
import unittest
import sys
sys.path.append("..")
from op_test import OpTest
from functools import reduce
from operator import mul
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from test_layer_norm_op import _reference_layer_norm_naive, _reference_layer_norm_grad

paddle.enable_static()

SEED = 2021
EPOCH = 100

from op_test import _set_use_system_allocator

_set_use_system_allocator(False)


@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestLayerNormOp(unittest.TestCase):
    """Forward and backward check of the ``layer_norm`` op on NPU device 0.

    A program holding one ``layer_norm`` op and its generated grad op is run,
    and the fetched tensors are compared with the NumPy references imported
    from ``test_layer_norm_op``.
    """

    def setUp(self):
        # NOTE(review): use_cudnn is not read anywhere in this class.
        self.use_cudnn = True
        self.set_npu()
        self.init_dtype()

    def set_npu(self):
        self.__class__.use_npu = True
        self.place = paddle.NPUPlace(0)

    def init_dtype(self):
        # dtype of x / y@GRAD and the tolerance used for y and bias_grad.
        self.dtype = np.float32
        self.atol = 1e-4

    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
        """Assert ``tensor`` (cast to ``np_array``'s dtype) is close to ``np_array``."""
        self.assertTrue(
            np.allclose(
                np.array(tensor).astype(np_array.dtype), np_array, atol=atol),
            msg)

    def check_forward_backward(self,
                               shape,
                               begin_norm_axis,
                               has_scale=True,
                               has_bias=True,
                               y_grad_scale=1.0,
                               use_mkldnn=False):
        """Run layer_norm forward and backward on ``self.place`` and verify them.

        Args:
            shape: shape of the input ``x``.
            begin_norm_axis: first axis that is normalized over.
            has_scale: whether a ``Scale`` input is created and checked.
            has_bias: whether a ``Bias`` input is created and checked.
            y_grad_scale: factor applied to the random upstream gradient.
            use_mkldnn: forwarded to the op's ``use_mkldnn`` attribute.
        """

        def test_with_place(place,
                            shape,
                            begin_norm_axis,
                            use_mkldnn=use_mkldnn):
            # attr
            epsilon = 0.00001
            x_shape = shape
            # D is the number of elements in the normalized trailing axes.
            D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
            scale_shape = [D]

            np.random.seed(123)
            x = np.random.random_sample(x_shape).astype(self.dtype)
            # scale and bias are always float32, independent of self.dtype.
            scale = np.random.random_sample(scale_shape).astype(
                np.float32) if has_scale else None
            bias = np.random.random_sample(scale_shape).astype(
                np.float32) if has_bias else None
            y_grad = (np.random.random_sample(x_shape) *
                      y_grad_scale).astype(self.dtype)

            # reference forward & backward
            y, mean, variance = _reference_layer_norm_naive(
                x, scale, bias, epsilon, begin_norm_axis)
            x_grad, scale_grad, bias_grad = _reference_layer_norm_grad(
                x, y_grad, scale, bias, mean, variance, begin_norm_axis)

            # The locals above are looked up by name below; do not rename them.
            var_dict = locals()
            var_dict['y@GRAD'] = y_grad
            var_names = ['x', 'mean', 'variance', 'y', 'y@GRAD']
            if has_scale:
                var_names += ['scale']
            if has_bias:
                var_names += ['bias']
            ground_truth = {name: var_dict[name] for name in var_names}

            program = fluid.Program()
            with fluid.program_guard(program):
                block = program.global_block()
                # Every variable is declared with self.dtype, scale/bias included.
                for name in ground_truth:
                    block.create_var(
                        name=name,
                        dtype=self.dtype,
                        shape=ground_truth[name].shape)
                inputs = {"X": block.var('x')}
                fetch_list = [
                    'y',
                    'mean',
                    'variance',
                    'x@GRAD',
                ]
                if has_scale:
                    inputs["Scale"] = block.var('scale')
                    fetch_list += ['scale@GRAD']
                if has_bias:
                    inputs["Bias"] = block.var('bias')
                    fetch_list += ['bias@GRAD']
                layer_norm_op = block.append_op(
                    type="layer_norm",
                    inputs=inputs,
                    outputs={
                        "Y": block.var('y'),
                        "Mean": block.var('mean'),  # share the same memory
                        "Variance":
                        block.var('variance'),  # share the same memory
                    },
                    attrs={
                        "epsilon": epsilon,
                        "begin_norm_axis": begin_norm_axis,
                        "use_mkldnn": use_mkldnn
                    })
                # generate backward op_desc
                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
                    layer_norm_op.desc, set(), [])
                grad_op_desc = grad_op_desc_list[0]
                new_op_desc = block.desc.append_op()
                new_op_desc.copy_from(grad_op_desc)
                for var_name in grad_op_desc.output_arg_names():
                    block.desc.var(var_name.encode("ascii"))
                grad_op_desc.infer_var_type(block.desc)
                grad_op_desc.infer_shape(block.desc)
                # Gradient outputs are forced to FP32 regardless of self.dtype.
                for arg in grad_op_desc.output_arg_names():
                    grad_var = block.desc.find_var(arg.encode("ascii"))
                    grad_var.set_dtype(core.VarDesc.VarType.FP32)

                program._sync_with_cpp()
                exe = fluid.Executor(place)
                # NOTE(review): 'scale' and 'bias' are fed even when they are
                # None (has_scale / has_bias false); confirm the executor
                # accepts that.
                out = exe.run(program,
                              feed={
                                  name: var_dict[name]
                                  for name in ['x', 'scale', 'bias', 'y@GRAD']
                              },
                              fetch_list=fetch_list)
                self.__assert_close(y, out[0], "y", self.atol)
                self.__assert_close(mean, out[1], "mean")
                self.__assert_close(variance, out[2], "variance", 1e-3)
                self.__assert_close(x_grad, out[3], "x_grad", 1e-2)
                if has_scale:
                    self.__assert_close(scale_grad,
                                        out[fetch_list.index('scale@GRAD')],
                                        "scale_grad", 1e-2)
                if has_bias:
                    self.__assert_close(bias_grad,
                                        out[fetch_list.index('bias@GRAD')],
                                        "bias_grad", self.atol)

        test_with_place(self.place, shape, begin_norm_axis)

    def test_check_forward_backward_with_scale_and_bias(self):
        # All four scale/bias combinations at axis 1, then one run at axis 3.
        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
        self.check_forward_backward(
            shape=[2, 3, 4, 5],
            begin_norm_axis=1,
            has_scale=False,
            has_bias=True)
        self.check_forward_backward(
            shape=[2, 3, 4, 5],
            begin_norm_axis=1,
            has_scale=True,
            has_bias=False)
        self.check_forward_backward(
            shape=[2, 3, 4, 5],
            begin_norm_axis=1,
            has_scale=False,
            has_bias=False)
        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)


@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestLayerNormOpFP16(TestLayerNormOp):
    """``TestLayerNormOp`` with float16 data and a looser 1e-2 tolerance."""

    def init_dtype(self):
        self.dtype = np.float16
        self.atol = 1e-2


if __name__ == '__main__':
    unittest.main()
# diff --git a/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..3cdd2448628a0b0f1900cc8b15d884d578a445ca
# --- /dev/null
# +++ b/python/paddle/fluid/tests/unittests/npu/test_log_op_npu.py
# @@ -0,0 +1,154 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestLog(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "log" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = np.log(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + # TODO(ascendrc): Add grad test + # def test_check_grad(self): + # if self.dtype == np.float16: + # return + # self.check_grad(['X'], 'Out') + # + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestLogFp16(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "log" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) + out = np.log(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = 
{'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False, atol=1e-5) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestLogNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + c = paddle.multiply(a, b) + d = paddle.log(c) + + fc_1 = fluid.layers.fc(input=d, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + 
self.assertTrue(np.allclose(npu_pred, cpu_pred, atol=1e-4)) + self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-4)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_logical_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_logical_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..a2b54be3a1482326f78039e78d4dae026f8445f9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_logical_op_npu.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestLogicalNot(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "logical_not" + self.place = paddle.NPUPlace(4) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = np.logical_not(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.bool + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + # TODO(ascendrc): Add grad test + # def test_check_grad(self): + # if self.dtype == np.float16: + # return + # self.check_grad(['X'], 'Out') + # + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestLogcialNotNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('bool') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='bool') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + c = paddle.logical_not(a) + d = paddle.cast(c, dtype="float32") + + fc_1 = fluid.layers.fc(input=d, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = 
fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(4) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run(main_prog, + feed={"a": a_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..400ddd9d4aab0775af6007da36475db72561136f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py @@ -0,0 +1,89 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestLookupTableV2(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "lookup_table_v2" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + bsz = 6 + seqlen = 8 + vocab = 10 + dim = 20 + w = np.ones([vocab, dim]).astype(self.dtype) + x = np.random.randint(0, vocab, size=(bsz, seqlen)).astype(np.int64) + out = np.ones([bsz, seqlen, dim]).astype(self.dtype) + + self.inputs = { + 'W': OpTest.np_dtype_to_fluid_dtype(w), + 'Ids': OpTest.np_dtype_to_fluid_dtype(x) + } + self.attrs = { + 'is_sparse': False, + 'is_distributed': False, + 'remote_prefetch': False, + 'padding_idx': -1 + } + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad_with_place( + self.place, ['W'], 'Out', check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestLookupTableV2FP16(TestLookupTableV2): + no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..b27b9c0b9756072c42fa7269f73821c18a7cc37e --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py @@ -0,0 +1,210 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): + """Reference forward implementation using np.matmul.""" + # np.matmul does not support the transpose flags, so we manually + # transpose X and Y appropriately. + if transpose_X: + if X.ndim == 1: + X = X.reshape((X.size, )) + elif X.ndim == 2: + X = X.T + else: + dim = [i for i in range(len(X.shape))] + dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] + X = np.transpose(X, tuple(dim)) + if transpose_Y: + if Y.ndim == 1: + Y = Y.reshape((Y.size, )) + else: + dim = [i for i in range(len(Y.shape))] + dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] + Y = np.transpose(Y, tuple(dim)) + + Out = np.matmul(X, Y) + if not Out.shape: + # We do not support 0-dimensional Tensors (scalars). So where + # np.matmul outputs a scalar, we must convert to a Tensor of + # shape (1, ) instead. + # Everywhere else, we are compatible with np.matmul. 
+ Out = np.array([Out], dtype="float64") + return Out + + +class TestMatMul(OpTest): + def config(self): + self.x_shape = (100, 24) + self.y_shape = (24, 100) + self.trans_x = False + self.trans_y = False + + def setUp(self): + self.set_npu() + self.op_type = "matmul_v2" + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.config() + np.random.seed(SEED) + x = np.random.random(self.x_shape).astype(self.dtype) + y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + x = -0.1 + 0.2 * x + y = -0.1 + 0.2 * y + result = reference_matmul(x, y, self.trans_x, self.trans_y) + result = result.astype(self.dtype) + self.inputs = { + 'X': x, + 'Y': y, + } + self.attrs = {'trans_x': self.trans_x, 'trans_y': self.trans_y} + self.outputs = {'Out': result} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False, atol=1e-5) + + + # TODO(ascendrc): Add grad test + # def test_check_grad(self): + # if self.dtype == np.float16: + # return + # self.check_grad(['X'], 'Out') + # +class TestMatMul2(TestMatMul): + """ + case 2 + """ + + def config(self): + self.x_shape = (32, 24) + self.y_shape = (32, 24) + self.trans_x = False + self.trans_y = True + + +class TestMatMul3(TestMatMul): + """ + case 3 + """ + + def init_dtype(self): + self.dtype = np.float16 + + +class TestMatMul4(TestMatMul): + """ + case 4 dim=3 + """ + + def config(self): + self.x_shape = (2, 3, 4) + self.y_shape = (2, 4, 3) + self.trans_x = False + self.trans_y = False + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestMatMulNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = 
np.random.random(size=(2, 3)).astype('float32') + b_np = np.random.random(size=(2, 3)).astype('float32') + c_np = np.random.random(size=(3, 2)).astype('float32') + d_np = np.random.random(size=(3, 2)).astype('float32') + label_np = np.random.randint(2, size=(2, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[2, 3], dtype='float32') + b = paddle.static.data(name="b", shape=[2, 3], dtype='float32') + c = paddle.static.data(name="c", shape=[3, 2], dtype='float32') + d = paddle.static.data(name="d", shape=[3, 2], dtype='float32') + label = paddle.static.data( + name="label", shape=[2, 1], dtype='int64') + + sum_1 = paddle.add(a, b) + sum_2 = paddle.add(c, d) + result = paddle.matmul(sum_1, sum_2) + + fc_1 = fluid.layers.fc(input=result, size=8) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run(main_prog, + feed={ + "a": a_np, + "b": b_np, + "c": c_np, + "d": d_np, + "label": label_np + }, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_mean_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mean_op_npu.py new file mode 
100644 index 0000000000000000000000000000000000000000..6e8f99a9dbb19785094ad6a94d9f371fe409fc69 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_mean_op_npu.py @@ -0,0 +1,88 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestMean(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "mean" + self.init_dtype() + + x = np.random.random([1, 100]).astype(self.dtype) + self.inputs = {'X': x} + + self.attrs = {} + np_out = np.mean(x) + self.outputs = {'Out': np_out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestMeanFP16(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "mean" + 
self.init_dtype() + + x = np.random.random([3, 200]).astype(self.dtype) + self.inputs = {'X': x} + + self.attrs = {} + np_out = np.mean(x) + self.outputs = {'Out': np_out} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..e65a3dac73928cd48c43e0d6eb4ebcc2a84e9d2d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py @@ -0,0 +1,326 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +class TestMul(OpTest): + def config(self): + self.x_shape = (32, 5) + self.y_shape = (5, 100) + + def setUp(self): + self.set_npu() + self.op_type = "mul" + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.config() + np.random.seed(SEED) + self.inputs = { + 'X': np.random.random(self.x_shape).astype(self.dtype), + 'Y': np.random.random(self.y_shape).astype(self.dtype) + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False, atol=1e-5) + + + # +class TestMulFP16(TestMul): + """ + case 2 + """ + + def init_dtype(self): + self.dtype = np.float16 + + +class TestMul3(TestMul): + """ + case 3 + """ + + def config(self): + self.x_shape = (2, 2, 5) + self.y_shape = (10, 5) + + def setUp(self): + self.set_npu() + self.op_type = "mul" + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.config() + np.random.seed(SEED) + self.inputs = { + 'X': np.random.random(self.x_shape).astype(self.dtype), + 'Y': np.random.random(self.y_shape).astype(self.dtype) + } + self.outputs = { + 'Out': np.dot(self.inputs['X'].reshape(2, 10), self.inputs['Y']) + } + + +class TestMul4(TestMul): + """ + case 4 + """ + + def config(self): + self.x_shape = (2, 3, 4) + self.y_shape = (4, 5) + + def setUp(self): + self.set_npu() + self.op_type = "mul" + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.config() + np.random.seed(SEED) + self.inputs = { + 'X': np.random.random(self.x_shape).astype(self.dtype), + 'Y': np.random.random(self.y_shape).astype(self.dtype) + } + self.attrs = 
{"x_num_col_dims": 2} + self.outputs = {'Out': np.matmul(self.inputs['X'], self.inputs['Y'])} + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestMulNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(2, 3)).astype('float32') + b_np = np.random.random(size=(2, 3)).astype('float32') + c_np = np.random.random(size=(3, 2)).astype('float32') + d_np = np.random.random(size=(3, 2)).astype('float32') + label_np = np.random.randint(2, size=(2, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[2, 3], dtype='float32') + b = paddle.static.data(name="b", shape=[2, 3], dtype='float32') + c = paddle.static.data(name="c", shape=[3, 2], dtype='float32') + d = paddle.static.data(name="d", shape=[3, 2], dtype='float32') + label = paddle.static.data( + name="label", shape=[2, 1], dtype='int64') + + sum_1 = paddle.add(a, b) + sum_2 = paddle.add(c, d) + result = paddle.fluid.layers.mul(sum_1, sum_2) + + fc_1 = fluid.layers.fc(input=result, size=8) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("TestMulNet Start run on {} . 
".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run(main_prog, + feed={ + "a": a_np, + "b": b_np, + "c": c_np, + "d": d_np, + "label": label_np + }, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestMulNet3_2(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(2, 3, 4)).astype('float32') + b_np = np.random.random(size=(2, 3, 4)).astype('float32') + c_np = np.random.random(size=(12, 5)).astype('float32') + d_np = np.random.random(size=(12, 5)).astype('float32') + label_np = np.random.randint(2, size=(2, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[2, 3, 4], dtype='float32') + b = paddle.static.data(name="b", shape=[2, 3, 4], dtype='float32') + c = paddle.static.data(name="c", shape=[12, 5], dtype='float32') + d = paddle.static.data(name="d", shape=[12, 5], dtype='float32') + label = paddle.static.data( + name="label", shape=[2, 1], dtype='int64') + + sum_1 = paddle.add(a, b) + sum_2 = paddle.add(c, d) + result = paddle.fluid.layers.mul(sum_1, sum_2) + + fc_1 = fluid.layers.fc(input=result, size=8) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) 
+ + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("testMulNet3_2 tart run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run(main_prog, + feed={ + "a": a_np, + "b": b_np, + "c": c_np, + "d": d_np, + "label": label_np + }, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestMulNet3_2_xc2(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(2, 3, 4)).astype('float32') + b_np = np.random.random(size=(2, 3, 4)).astype('float32') + c_np = np.random.random(size=(4, 5)).astype('float32') + d_np = np.random.random(size=(4, 5)).astype('float32') + label_np = np.random.randint(2, size=(2, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[2, 3, 4], dtype='float32') + b = paddle.static.data(name="b", shape=[2, 3, 4], dtype='float32') + c = paddle.static.data(name="c", shape=[4, 5], dtype='float32') + d = paddle.static.data(name="d", shape=[4, 5], dtype='float32') + label = paddle.static.data( + name="label", shape=[2, 1], dtype='int64') + + sum_1 = paddle.add(a, b) + sum_2 = paddle.add(c, d) + result = paddle.fluid.layers.mul(sum_1, sum_2, x_num_col_dims=2) + result_re = paddle.reshape(result, shape=[2, 15]) + + fc_1 = 
fluid.layers.fc(input=result_re, size=8) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("TestMulNet3_2_xc2. Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run(main_prog, + feed={ + "a": a_np, + "b": b_np, + "c": c_np, + "d": d_np, + "label": label_np + }, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_pow_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_pow_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..8c67766b31184a36446c4fa39f64f760fa23912c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_pow_op_npu.py @@ -0,0 +1,151 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestPow(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "pow" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = np.power(x, 3) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {'factor': 3.0} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestPowFp16(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "pow" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) + out = np.power(x, 2) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {'factor': 2.0} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False, atol=1e-5) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestPowNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = 
paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.pow(sum, 2.0) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..087256b298088663c68b6de62fcff8746adaead2 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")  # same path setup the sibling NPU tests use before importing op_test
+from op_test import OpTest, skip_check_grad_ci
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard
+from paddle.fluid.framework import convert_np_dtype_to_dtype_
+
+paddle.enable_static()
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestAny8DOp(OpTest):  # reduce_any over three axes of an 8-D bool tensor
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "reduce_any"
+        self.place = paddle.NPUPlace(0)
+        self.inputs = {
+            'X': np.random.randint(0, 2,
+                                   (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
+        }
+        self.attrs = {'dim': (3, 5, 4)}
+        self.outputs = {'Out': self.inputs['X'].any(axis=self.attrs['dim'])}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestAnyOpWithDim(OpTest):  # reduce_any over a single axis of a 3-D bool tensor
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "reduce_any"
+        self.place = paddle.NPUPlace(0)
+        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
+        self.attrs = {'dim': [1]}
+        self.outputs = {'Out': self.inputs['X'].any(axis=1)}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestAny8DOpWithDim(OpTest):  # reduce_any over two axes of an 8-D bool tensor
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "reduce_any"
+        self.place = paddle.NPUPlace(0)
+        self.inputs = {
+            'X': np.random.randint(0, 2,
+                                   (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
+        }
+        self.attrs = {'dim': (3, 6)}
+        self.outputs = {'Out': self.inputs['X'].any(axis=self.attrs['dim'])}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestAnyOpWithKeepDim(OpTest):  # keep_dim=True: the reduced axis stays with size 1
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "reduce_any"
+        self.place = paddle.NPUPlace(0)
+        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
+        self.attrs = {'dim': (1, ), 'keep_dim': True}
+        self.outputs = {
+            'Out': np.expand_dims(
+                self.inputs['X'].any(axis=self.attrs['dim']), axis=1)
+        }
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestAny8DOpWithKeepDim(OpTest):  # guarded like its siblings: setUp needs paddle.NPUPlace(0)
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "reduce_any"
+        self.place = paddle.NPUPlace(0)
+        self.inputs = {
+            'X': np.random.randint(0, 2,
+                                   (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool")
+        }
+        self.attrs = {'dim': (1, ), 'keep_dim': True}
+        self.outputs = {
+            'Out': np.expand_dims(
+                self.inputs['X'].any(axis=self.attrs['dim']), axis=1)
+        }
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git
a/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3861bf0780cb58f7362ff9dbd05c99a222bc21b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py
@@ -0,0 +1,206 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+
+paddle.enable_static()
+SEED = 2021
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestReduceSum(OpTest):  # base case: float32 reduce_sum over axis 0 of a (5, 6) input
+    def setUp(self):
+        np.random.seed(SEED)
+        self.set_npu()
+        self.init_dtype()
+        self.place = paddle.NPUPlace(0)
+        self.init_op_type()
+        self.initTestCase()
+
+        self.use_mkldnn = False
+        self.attrs = {
+            'dim': self.axis,
+            'keep_dim': self.keep_dim,
+            'reduce_all': self.reduce_all
+        }
+        self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)}
+        if self.attrs['reduce_all']:
+            self.outputs = {'Out': self.inputs['X'].sum()}
+        else:
+            self.outputs = {
+                'Out': self.inputs['X'].sum(axis=self.axis,
+                                            keepdims=self.attrs['keep_dim'])
+            }
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def init_dtype(self):  # hook overridden by dtype variants
+        self.dtype = np.float32
+
+    def init_op_type(self):
+        self.op_type = "reduce_sum"
+        self.use_mkldnn = False
+        self.keep_dim = False
+        self.reduce_all = False
+
+    def initTestCase(self):
+        self.shape = (5, 6)
+        self.axis = (0, )
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+    # TODO(ascendrc): Add grad test
+    # def test_check_grad(self):
+    #     if self.dtype == np.float16:
+    #         return
+    #     self.check_grad(['X'], 'Out')
+    #
+
+
+class TestReduceSum2(TestReduceSum):  # int32 variant; must derive from TestReduceSum to inherit setUp, the test and the NPU skip guard
+    def init_dtype(self):
+        self.dtype = np.int32
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestReduceSumNet(unittest.TestCase):  # trains the same small net on CPU and NPU and compares results
+    def set_reduce_sum_function(self, x):
+        # keep_dim = False
+        return paddle.fluid.layers.reduce_sum(x, dim=-1)
+
+    def _test(self, run_npu=True):
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = SEED
+        startup_prog.random_seed = SEED
+        np.random.seed(SEED)
+
+        a_np = np.random.random(size=(2, 3, 4)).astype('float32')
+        b_np = np.random.random(size=(2, 3, 4)).astype('float32')
+        label_np = np.random.randint(2, size=(2, 1)).astype('int64')
+
+        with paddle.static.program_guard(main_prog, startup_prog):
+            a = paddle.static.data(name="a", shape=[2, 3, 4], dtype='float32')
+            b = paddle.static.data(name="b", shape=[2, 3, 4], dtype='float32')
+            label = paddle.static.data(
+                name="label", shape=[2, 1], dtype='int64')
+
+            a_1 = fluid.layers.fc(input=a, size=4, num_flatten_dims=2, act=None)
+            b_1 = fluid.layers.fc(input=b, size=4, num_flatten_dims=2, act=None)
+            z = paddle.add(a_1, b_1)
+            z_1 = self.set_reduce_sum_function(z)
+
+            prediction = fluid.layers.fc(input=z_1, size=2, act='softmax')
+
+            cost = fluid.layers.cross_entropy(input=prediction, label=label)
+            loss = fluid.layers.reduce_mean(cost)
+            sgd = fluid.optimizer.SGD(learning_rate=0.01)
+            sgd.minimize(loss)
+
+        if run_npu:
+            place = paddle.NPUPlace(0)
+        else:
+            place = paddle.CPUPlace()
+
+        exe = paddle.static.Executor(place)
+        exe.run(startup_prog)
+
+        print("Start run on {}".format(place))
+        for epoch in range(100):
+
+            pred_res, loss_res = exe.run(
+                main_prog,
+                feed={"a": a_np,
+                      "b": b_np,
+                      "label": label_np},
+                fetch_list=[prediction, loss])
+            if epoch % 10 == 0:
+                print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
+                    epoch, pred_res[0], loss_res))
+
+        return pred_res, loss_res  # values fetched in the final epoch
+
+    def test_npu(self):
+        cpu_pred, cpu_loss = self._test(False)
+        npu_pred, npu_loss = self._test(True)
+
+        self.assertTrue(np.allclose(npu_pred, cpu_pred))
+        self.assertTrue(np.allclose(npu_loss, cpu_loss))
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestReduceSumNet2(TestReduceSumNet):
+    def set_reduce_sum_function(self, x):
+        # keep_dim = True
+        return paddle.fluid.layers.reduce_sum(x, dim=-1, keep_dim=True)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestReduceSumNet3(TestReduceSumNet):  # reduce_sum with no dim argument, used directly as the loss
+    def _test(self, run_npu=True):
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = SEED
+        startup_prog.random_seed = SEED
+        np.random.seed(SEED)
+
+        a_np = np.random.random(size=(2, 3, 4)).astype('float32')
+        b_np = np.random.random(size=(2, 3, 4)).astype('float32')
+
+        with paddle.static.program_guard(main_prog, startup_prog):
+            a = paddle.static.data(name="a", shape=[2, 3, 4], dtype='float32')
+            b = paddle.static.data(name="b", shape=[2, 3, 4], dtype='float32')
+
+            z = paddle.add(a, b)
+            loss = fluid.layers.reduce_sum(z)
+            sgd = fluid.optimizer.SGD(learning_rate=0.01)
+            sgd.minimize(loss)
+
+        if run_npu:
+            place = paddle.NPUPlace(0)
+        else:
+            place = paddle.CPUPlace()
+
+        exe = paddle.static.Executor(place)
+        exe.run(startup_prog)
+
+        print("Start run on {}".format(place))
+        for epoch in range(100):
+
+            loss_res = exe.run(main_prog,
+                               feed={"a": a_np,
+                                     "b": b_np},
+                               fetch_list=[loss])
+            if epoch % 10 == 0:
+                print("Epoch {} | Loss: {}".format(epoch, loss_res))
+
+        return loss_res, loss_res  # loss returned twice to match the (pred, loss) pair test_npu unpacks
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py
new file mode 100644
index 0000000000000000000000000000000000000000..9273d01299d8f564ee0ae575b47bb30e939c3d76
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_relu_op_npu.py
@@ -0,0 +1,176 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestRelu(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "relu" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.rand(3, 2).astype(self.dtype) + out = x + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestReluFp16(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "relu" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.rand(3, 2).astype(self.dtype) + out = x + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False, atol=1e-5) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestReluNeg(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "relu" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.array([0.1, -0.1, -1.0]).astype(self.dtype) + out = np.array([0.1, 0.0, 0.0]).astype(self.dtype) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': 
out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + +# +# +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestReluNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.nn.functional.relu(sum) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, 
cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_reshape2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reshape2_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..885c990c702bd35d2052b3cb79abf11a74b3efc2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_reshape2_op_npu.py @@ -0,0 +1,77 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestReshape2(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "reshape2" + self.place = paddle.NPUPlace(0) + + self.init_data() + self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")} + self.attrs = {"shape": self.new_shape} + self.outputs = { + "Out": self.inputs["X"].reshape(self.infered_shape), + 'XShape': np.random.random(self.ori_shape).astype("float32") + } + + def set_npu(self): + self.__class__.use_npu = True + + def init_data(self): + self.ori_shape = (2, 100) + self.new_shape = (20, 10) + self.infered_shape = (20, 10) + + def test_check_output(self): + self.check_output_with_place( + self.place, check_dygraph=False, no_check_set=['XShape']) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', check_dygraph=False) + + +class TestReshape2_case2(TestReshape2): + def init_data(self): + self.ori_shape = (2, 100) + self.new_shape = (-1, 10) + self.infered_shape = (20, 10) + + +class TestReshape2_case3(TestReshape2): + def init_data(self): + self.ori_shape = (100, 5, 6) + self.new_shape = (-1, 0, 3) + self.infered_shape = (200, 5, 3) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_scale_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_scale_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..9b4547bc24474afccf2454f992a1c92c3dd22605 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_scale_op_npu.py @@ -0,0 +1,89 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+
+paddle.enable_static()
+SEED = 2021
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestScale(OpTest):  # scale op with bias 0 applied after scaling, float32
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "scale"
+        self.place = paddle.NPUPlace(0)
+        self.init_dtype()
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(
+                np.random.random((10, 10)).astype(self.dtype))
+        }
+        self.attrs = {'scale': -2.3, 'bias': 0, 'bias_after_scale': True}
+        self.outputs = {
+            'Out': self.inputs['X'] * self.dtype(self.attrs['scale'])
+        }
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def init_dtype(self):  # hook overridden by dtype variants
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+class TestFP16Scale(TestScale):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+class TestBiasAfterScale(TestScale):  # derive from TestScale so the NPU skip guard is inherited
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "scale"
+        self.place = paddle.NPUPlace(0)
+        self.init_dtype()
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(
+                np.random.random((10, 10)).astype(self.dtype))
+        }
+        self.attrs = {'scale': -2.3, 'bias': 0, 'bias_after_scale': False}
+        self.outputs = {
+            'Out': self.inputs['X'] * self.dtype(self.attrs['scale'])
+        }
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/npu/test_scatter_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_scatter_op_npu.py
new file mode 100755
index 0000000000000000000000000000000000000000..c3e52c9bfad533bdef724cff7e447f991fa2d6b2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_scatter_op_npu.py
@@ -0,0 +1,126 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+
+paddle.enable_static()
+SEED = 2021
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestCast1(OpTest):  # scatter, overwrite=True, float32, one index
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "scatter"
+        self.place = paddle.NPUPlace(0)
+
+        ref_np = np.ones((3, 2)).astype("float32")
+        index_np = np.array([1]).astype("int32")
+        updates_np = np.random.random((1, 2)).astype("float32")
+
+        output_np = np.copy(ref_np)
+        output_np[index_np] = updates_np
+        self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
+        self.outputs = {'Out': output_np}
+        self.attrs = {'overwrite': True}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+class TestCast2(TestCast1):  # int32 data; derives from TestCast1 so the NPU skip guard is inherited
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "scatter"
+        self.place = paddle.NPUPlace(0)
+
+        ref_np = np.ones((3, 2)).astype("int32")
+        index_np = np.array([1]).astype("int32")
+        updates_np = np.zeros((1, 2)).astype("int32")
+
+        output_np = np.copy(ref_np)
+        output_np[index_np] = updates_np
+        self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
+        self.outputs = {'Out': output_np}
+        self.attrs = {'overwrite': True}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+class TestCast3(TestCast1):  # overwrite=False; derives from TestCast1 so the NPU skip guard is inherited
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "scatter"
+        self.place = paddle.NPUPlace(0)
+
+        ref_np = np.ones((3, 2)).astype("float32")
+        index_np = np.array([1]).astype("int32")
+        updates_np = np.random.random((1, 2)).astype("float32")
+
+        output_np = np.copy(ref_np)
+        output_np[index_np] += updates_np
+        self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
+        self.outputs = {'Out': output_np}
+        self.attrs = {'overwrite': False}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+class TestCast4(TestCast1):  # two indices; derives from TestCast1 so the NPU skip guard is inherited
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "scatter"
+        self.place = paddle.NPUPlace(0)
+
+        ref_np = np.ones((3, 2)).astype("float32")
+        index_np = np.array([1, 2]).astype("int32")
+        updates_np = np.random.random((2, 2)).astype("float32")
+
+        output_np = np.copy(ref_np)
+        output_np[1] = updates_np[0]
+        output_np[2] = updates_np[1]
+        self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
+        self.outputs = {'Out': output_np}
+        self.attrs = {'overwrite': True}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/npu/test_sgd_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sgd_op_npu.py
new file mode 100644
index 0000000000000000000000000000000000000000..af0dea4776d23fdebe26f68b5c84c7d3d07d2940
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_sgd_op_npu.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestSGD(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "sgd" + self.conf() + w = np.random.random((self.h, self.w)).astype("float32") + g = np.random.random((self.h, self.w)).astype("float32") + lr = np.array([0.1]).astype("float32") + + self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr} + self.outputs = {'ParamOut': w - lr * g} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def conf(self): + self.h = 12 + self.w = 15 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.pow(sum, 2.0) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = 
fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_shape_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_shape_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..7b9a74b2be98dee86f2b3192d746cc56895ca1d9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_shape_op_npu.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestShape(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "shape" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [5, 10]).astype(self.dtype) + out = np.array([5, 10]) + + self.inputs = {'Input': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..500618f509f682b00be715ea8214cddaaf892b2e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py @@ -0,0 +1,158 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestSliceNet(unittest.TestCase):
    """Trains a small add -> slice -> fc network on CPU and on NPU and
    compares the final prediction and loss of the two runs."""

    def _test(self, run_npu=True):
        """Build and train the network on one device.

        Args:
            run_npu: run on `paddle.NPUPlace(0)` when true, otherwise on
                `paddle.CPUPlace()`.

        Returns:
            The `(prediction, loss)` values fetched in the last epoch.
        """
        train_program = paddle.static.Program()
        init_program = paddle.static.Program()
        for program in (train_program, init_program):
            program.random_seed = SEED
        np.random.seed(SEED)

        batch_size = 32
        data_shape = (32, 32)
        lhs_np = np.random.random(size=data_shape).astype('float32')
        rhs_np = np.random.random(size=data_shape).astype('float32')
        label_np = np.random.randint(2, size=(batch_size, 1)).astype('int64')

        with paddle.static.program_guard(train_program, init_program):
            lhs = paddle.static.data(
                name="a", shape=data_shape, dtype='float32')
            rhs = paddle.static.data(
                name="b", shape=data_shape, dtype='float32')
            label = paddle.static.data(
                name="label", shape=[batch_size, 1], dtype='int64')

            # Note that ends[0] (33) is larger than the 32 rows that are fed.
            sliced = paddle.slice(
                paddle.add(lhs, rhs), axes=[0, 1], starts=[0, 0], ends=[33, 2])

            prediction = paddle.static.nn.fc(sliced,
                                             size=2,
                                             activation='softmax')

            loss = paddle.mean(
                paddle.nn.functional.cross_entropy(
                    input=prediction, label=label))
            paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)

        place = paddle.NPUPlace(0) if run_npu else paddle.CPUPlace()

        exe = paddle.static.Executor(place)
        exe.run(init_program)
        print("Start run on {}".format(place))

        feed = {"a": lhs_np, "b": rhs_np, "label": label_np}
        fetches = [prediction, loss]
        for epoch in range(EPOCH):
            pred_res, loss_res = exe.run(train_program,
                                         feed=feed,
                                         fetch_list=fetches)
            # Progress is reported on every tenth epoch only.
            if epoch % 10:
                continue
            print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                epoch, pred_res[0], loss_res))

        return pred_res, loss_res

    def test_npu(self):
        # The CPU run comes first; both runs use the same seed.
        cpu_pred, cpu_loss = self._test(False)
        npu_pred, npu_loss = self._test(True)

        for npu_value, cpu_value in ((npu_pred, cpu_pred),
                                     (npu_loss, cpu_loss)):
            self.assertTrue(np.allclose(npu_value, cpu_value))
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestSoftmaxNet(unittest.TestCase):
    """Trains a multiply -> sqrt -> fc -> fc -> softmax network on CPU and
    on NPU and compares the results with a relative tolerance of 1e-2."""

    def _test(self, run_npu=True):
        """Build and train the network on one device.

        Args:
            run_npu: run on `paddle.NPUPlace(0)` when true, otherwise on
                `paddle.CPUPlace()`.

        Returns:
            The `(prediction, loss)` values fetched in the last epoch;
            `prediction` is the output of the second fc layer, taken before
            the softmax.
        """
        train_program = paddle.static.Program()
        init_program = paddle.static.Program()
        for program in (train_program, init_program):
            program.random_seed = SEED
        np.random.seed(SEED)

        lhs_np = np.random.random(size=(4, 32)).astype('float32')
        rhs_np = np.random.random(size=(4, 32)).astype('float32')
        label_np = np.random.randint(2, size=(4, 1)).astype('int64')

        with paddle.static.program_guard(train_program, init_program):
            lhs = paddle.static.data(name="a", shape=[4, 32], dtype='float32')
            rhs = paddle.static.data(name="b", shape=[4, 32], dtype='float32')
            label = paddle.static.data(
                name="label", shape=[4, 1], dtype='int64')

            root = paddle.sqrt(paddle.multiply(lhs, rhs))

            # 4 x 128
            hidden = fluid.layers.fc(input=root, size=128)
            # 4 x 2
            prediction = fluid.layers.fc(input=hidden, size=2)

            # 4 x 2
            prob = fluid.layers.softmax(prediction, axis=1)

            loss = fluid.layers.mean(
                fluid.layers.cross_entropy(
                    input=prob, label=label))
            fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

        place = paddle.NPUPlace(0) if run_npu else paddle.CPUPlace()

        exe = paddle.static.Executor(place)
        exe.run(init_program)

        print("Start run on {}".format(place))
        feed = {"a": lhs_np, "b": rhs_np, "label": label_np}
        fetches = [prediction, loss]
        for epoch in range(100):
            pred_res, loss_res = exe.run(train_program,
                                         feed=feed,
                                         fetch_list=fetches)
            # Progress is reported on every tenth epoch only.
            if epoch % 10:
                continue
            print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                epoch, pred_res[0], loss_res))

        return pred_res, loss_res

    def test_npu(self):
        # The CPU run comes first; both runs use the same seed.
        cpu_pred, cpu_loss = self._test(False)
        npu_pred, npu_loss = self._test(True)

        for npu_value, cpu_value in ((npu_pred, cpu_pred),
                                     (npu_loss, cpu_loss)):
            self.assertTrue(np.allclose(npu_value, cpu_value, rtol=1e-2))
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestSoftmaxWithCrossEntropyOp(OpTest):
    """Forward check of `softmax_with_cross_entropy` on NPU device 0.

    Expected values come from the `stable_softmax` and `cross_entropy`
    helpers imported from the sibling CPU test modules.
    """

    def set_npu(self):
        # Flag is stored on the class, not on the instance.
        self.__class__.use_npu = True

    def init_dtype(self):
        # Element type of the logits and of the expected outputs.
        self.dtype = np.float32

    def initParams(self):
        # Device / operator selection.
        self.set_npu()
        self.op_type = "softmax_with_cross_entropy"
        self.place = paddle.NPUPlace(0)
        self.init_dtype()

        # Operator configuration used by setUp.
        self.numeric_stable_mode = False
        self.soft_label = False
        self.axis = -1
        self.ignore_index = -1
        self.shape = [41, 37]

        np.random.seed(SEED)

    def setUp(self):
        self.initParams()

        # The random default is drawn unconditionally, even when a `logits`
        # attribute is present, so the RNG stream is consumed either way.
        random_logits = np.random.uniform(0.1, 1.0,
                                          self.shape).astype(self.dtype)
        logits = getattr(self, "logits", random_logits)
        softmax = np.apply_along_axis(stable_softmax, self.axis, logits)

        if not self.soft_label:
            # Hard labels: one integer index per sample, so `self.shape` is
            # collapsed to size 1 along `axis` before drawing.
            axis_dim = self.shape[self.axis]
            self.shape[self.axis] = 1
            labels = np.random.randint(0, axis_dim, self.shape, dtype="int64")
        else:
            # Soft labels: random weights normalised to sum to 1 along `axis`.
            labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)
            labels /= np.sum(labels, axis=self.axis, keepdims=True)

        loss = cross_entropy(softmax, labels, self.soft_label, self.axis,
                             self.ignore_index)

        self.inputs = {"Logits": logits, "Label": labels}
        self.outputs = {
            "Softmax": softmax.astype(self.dtype),
            "Loss": loss.astype(self.dtype)
        }

        op_attrs = {
            "numeric_stable_mode": self.numeric_stable_mode,
            "soft_label": self.soft_label,
            "ignore_index": self.ignore_index,
        }
        # `axis` is only passed explicitly when it is not -1.
        if self.axis != -1:
            op_attrs['axis'] = self.axis
        self.attrs = op_attrs

    def test_check_output(self):
        # Static-graph check only; dygraph checking is switched off.
        self.check_output_with_place(self.place, check_dygraph=False)

    # TODO(ascendrc): Add grad test
    # def test_check_grad(self):
    #     if self.dtype == np.float16:
    #         return
    #     self.check_grad(['X'], 'Out')
    #
paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..556fa76424b8b60f2efff371c833f57bdc341e40 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_sqrt_op_npu.py @@ -0,0 +1,154 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestSqrtNet(unittest.TestCase):
    """Trains a multiply -> sqrt -> fc -> fc(softmax) network on CPU and on
    NPU and compares the final prediction and loss of the two runs."""

    def _test(self, run_npu=True):
        """Build and train the network on one device.

        Args:
            run_npu: run on `paddle.NPUPlace(0)` when true, otherwise on
                `paddle.CPUPlace()`.

        Returns:
            The `(prediction, loss)` values fetched in the last epoch.
        """
        train_program = paddle.static.Program()
        init_program = paddle.static.Program()
        for program in (train_program, init_program):
            program.random_seed = SEED
        np.random.seed(SEED)

        lhs_np = np.random.random(size=(32, 32)).astype('float32')
        rhs_np = np.random.random(size=(32, 32)).astype('float32')
        label_np = np.random.randint(2, size=(32, 1)).astype('int64')

        with paddle.static.program_guard(train_program, init_program):
            lhs = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
            rhs = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
            label = paddle.static.data(
                name="label", shape=[32, 1], dtype='int64')

            root = paddle.sqrt(paddle.multiply(lhs, rhs))

            hidden = fluid.layers.fc(input=root, size=128)
            prediction = fluid.layers.fc(input=hidden, size=2, act='softmax')

            loss = fluid.layers.reduce_mean(
                fluid.layers.cross_entropy(
                    input=prediction, label=label))
            fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

        place = paddle.NPUPlace(0) if run_npu else paddle.CPUPlace()

        exe = paddle.static.Executor(place)
        exe.run(init_program)

        print("Start run on {}".format(place))
        feed = {"a": lhs_np, "b": rhs_np, "label": label_np}
        fetches = [prediction, loss]
        for epoch in range(100):
            pred_res, loss_res = exe.run(train_program,
                                         feed=feed,
                                         fetch_list=fetches)
            # Progress is reported on every tenth epoch only.
            if epoch % 10:
                continue
            print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                epoch, pred_res[0], loss_res))

        return pred_res, loss_res

    def test_npu(self):
        # The CPU run comes first; both runs use the same seed.
        cpu_pred, cpu_loss = self._test(False)
        npu_pred, npu_loss = self._test(True)

        for npu_value, cpu_value in ((npu_pred, cpu_pred),
                                     (npu_loss, cpu_loss)):
            self.assertTrue(np.allclose(npu_value, cpu_value))
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestSquare(OpTest):
    """Forward-only check of the `square` op on NPU device 0, with
    `np.square` as the reference."""

    def setUp(self):
        # Operator under test and the device it is executed on.
        self.op_type = "square"
        self.place = paddle.NPUPlace(0)
        self.set_npu()
        self.init_dtype()

        # Seed first so the generated input is reproducible.
        np.random.seed(SEED)
        data = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
        expected = np.square(data)

        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(data)}
        self.outputs = {'Out': expected}
        self.attrs = {}

    def set_npu(self):
        # Flag is stored on the class, not on the instance.
        self.__class__.use_npu = True

    def init_dtype(self):
        # Element type of the generated input.
        self.dtype = np.float32

    def test_check_output(self):
        # Static-graph check only; dygraph checking is switched off.
        self.check_output_with_place(self.place, check_dygraph=False)

    # TODO(ascendrc): Add grad test
    # def test_check_grad(self):
    #     if self.dtype == np.float16:
    #         return
    #     self.check_grad(['X'], 'Out')
    #
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestSquareNet(unittest.TestCase):
    """Trains a multiply -> square -> fc -> fc(softmax) network on CPU and
    on NPU and compares the final prediction and loss of the two runs."""

    def _test(self, run_npu=True):
        """Build and train the network on one device.

        Args:
            run_npu: run on `paddle.NPUPlace(0)` when true, otherwise on
                `paddle.CPUPlace()`.

        Returns:
            The `(prediction, loss)` values fetched in the last epoch.
        """
        train_program = paddle.static.Program()
        init_program = paddle.static.Program()
        for program in (train_program, init_program):
            program.random_seed = SEED
        np.random.seed(SEED)

        lhs_np = np.random.random(size=(32, 32)).astype('float32')
        rhs_np = np.random.random(size=(32, 32)).astype('float32')
        label_np = np.random.randint(2, size=(32, 1)).astype('int64')

        with paddle.static.program_guard(train_program, init_program):
            lhs = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
            rhs = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
            label = paddle.static.data(
                name="label", shape=[32, 1], dtype='int64')

            squared = paddle.square(paddle.multiply(lhs, rhs))

            hidden = fluid.layers.fc(input=squared, size=128)
            prediction = fluid.layers.fc(input=hidden, size=2, act='softmax')

            loss = fluid.layers.reduce_mean(
                fluid.layers.cross_entropy(
                    input=prediction, label=label))
            fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

        place = paddle.NPUPlace(0) if run_npu else paddle.CPUPlace()

        exe = paddle.static.Executor(place)
        exe.run(init_program)

        print("Start run on {}".format(place))
        feed = {"a": lhs_np, "b": rhs_np, "label": label_np}
        fetches = [prediction, loss]
        for epoch in range(100):
            pred_res, loss_res = exe.run(train_program,
                                         feed=feed,
                                         fetch_list=fetches)
            # Progress is reported on every tenth epoch only.
            if epoch % 10:
                continue
            print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                epoch, pred_res[0], loss_res))

        return pred_res, loss_res

    def test_npu(self):
        # The CPU run comes first; both runs use the same seed.
        cpu_pred, cpu_loss = self._test(False)
        npu_pred, npu_loss = self._test(True)

        for npu_value, cpu_value in ((npu_pred, cpu_pred),
                                     (npu_loss, cpu_loss)):
            self.assertTrue(np.allclose(npu_value, cpu_value))
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestStack1(OpTest):
    """Forward-only check of the `stack` op on NPU device 0: four
    (5, 6, 7) float32 inputs stacked along axis 0, with `np.stack` as the
    reference."""

    def initDefaultParameters(self):
        # Number, shape and dtype of the inputs, and the stacking axis.
        self.num_inputs = 4
        self.input_dim = (5, 6, 7)
        self.axis = 0
        self.dtype = 'float32'

    def get_x_names(self):
        # Input variable names: x0, x1, ...
        return ['x{}'.format(idx) for idx in range(self.num_inputs)]

    def setUp(self):
        self.initDefaultParameters()
        self.set_npu()
        self.op_type = "stack"
        self.place = paddle.NPUPlace(0)

        self.x = [
            np.random.random(size=self.input_dim).astype(self.dtype)
            for _ in range(self.num_inputs)
        ]

        # The multi-input slot 'X' is given as a list of (name, array) pairs.
        named_inputs = list(zip(self.get_x_names(), self.x))

        self.inputs = {'X': named_inputs}
        self.outputs = {'Y': np.stack(self.x, axis=self.axis)}
        self.attrs = {'axis': self.axis}

    def set_npu(self):
        # Flag is stored on the class, not on the instance.
        self.__class__.use_npu = True

    def test_check_output(self):
        # Static-graph check only; dygraph checking is switched off.
        self.check_output_with_place(self.place, check_dygraph=False)
# Skip on builds without NPU support: setUp constructs paddle.NPUPlace(0),
# so this case needs the same guard that TestStack1 already has.
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestStack3(OpTest):
    """Forward-only check of the `stack` op on NPU device 0: four
    (2, 3, 4) float32 inputs stacked along axis 1, with `np.stack` as the
    reference."""

    def initDefaultParameters(self):
        # Number, shape and dtype of the inputs, and the stacking axis.
        self.num_inputs = 4
        self.input_dim = (2, 3, 4)
        self.axis = 1
        self.dtype = 'float32'

    def get_x_names(self):
        # Input variable names: x0, x1, ...
        return ['x{}'.format(i) for i in range(self.num_inputs)]

    def setUp(self):
        self.initDefaultParameters()
        self.set_npu()
        self.op_type = "stack"
        self.place = paddle.NPUPlace(0)

        self.x = [
            np.random.random(size=self.input_dim).astype(self.dtype)
            for _ in range(self.num_inputs)
        ]

        # The multi-input slot 'X' is given as a list of (name, array) pairs.
        self.inputs = {'X': list(zip(self.get_x_names(), self.x))}
        self.outputs = {'Y': np.stack(self.x, axis=self.axis)}
        self.attrs = {'axis': self.axis}

    def set_npu(self):
        # Flag is stored on the class, not on the instance.
        self.__class__.use_npu = True

    def test_check_output(self):
        # Static-graph check only; dygraph checking is switched off.
        self.check_output_with_place(self.place, check_dygraph=False)
# Skip on builds without NPU support: setUp constructs paddle.NPUPlace(0),
# so this case needs the same guard that TestSum1 already has.
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestSum2(OpTest):
    """Forward-only check of the `sum` op on NPU device 0 with four
    (3, 3) float16 inputs; the reference is their elementwise NumPy sum."""

    def setUp(self):
        self.set_npu()
        self.init_dtype()
        self.op_type = "sum"
        self.place = paddle.NPUPlace(0)

        x0 = np.random.random((3, 3)).astype(self.dtype)
        x1 = np.random.random((3, 3)).astype(self.dtype)
        x2 = np.random.random((3, 3)).astype(self.dtype)
        x3 = np.random.random((3, 3)).astype(self.dtype)
        # The multi-input slot 'X' is given as a list of (name, array) pairs.
        self.inputs = {'X': [("x0", x0), ("x1", x1), ("x2", x2), ("x3", x3)]}
        y = x0 + x1 + x2 + x3
        self.outputs = {'Out': y}

        self.attrs = {'use_mkldnn': False}

    def init_dtype(self):
        # Element type of the generated inputs.
        self.dtype = np.float16

    def set_npu(self):
        # Flag is stored on the class, not on the instance.
        self.__class__.use_npu = True

    def test_check_output(self):
        # Static-graph check only; dygraph checking is switched off.
        self.check_output_with_place(self.place, check_dygraph=False)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestTanhNet(unittest.TestCase):
    """Trains a multiply -> tanh -> fc -> fc(softmax) network on CPU and on
    NPU and compares the final prediction and loss of the two runs."""

    def _test(self, run_npu=True):
        """Build and train the network on one device.

        Args:
            run_npu: run on `paddle.NPUPlace(0)` when true, otherwise on
                `paddle.CPUPlace()`.

        Returns:
            The `(prediction, loss)` values fetched in the last epoch.
        """
        train_program = paddle.static.Program()
        init_program = paddle.static.Program()
        for program in (train_program, init_program):
            program.random_seed = SEED
        np.random.seed(SEED)

        lhs_np = np.random.random(size=(32, 32)).astype('float32')
        rhs_np = np.random.random(size=(32, 32)).astype('float32')
        label_np = np.random.randint(2, size=(32, 1)).astype('int64')

        with paddle.static.program_guard(train_program, init_program):
            lhs = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
            rhs = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
            label = paddle.static.data(
                name="label", shape=[32, 1], dtype='int64')

            activated = paddle.tanh(paddle.multiply(lhs, rhs))

            hidden = fluid.layers.fc(input=activated, size=128)
            prediction = fluid.layers.fc(input=hidden, size=2, act='softmax')

            loss = fluid.layers.reduce_mean(
                fluid.layers.cross_entropy(
                    input=prediction, label=label))
            fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

        place = paddle.NPUPlace(0) if run_npu else paddle.CPUPlace()

        exe = paddle.static.Executor(place)
        exe.run(init_program)

        print("Start run on {}".format(place))
        feed = {"a": lhs_np, "b": rhs_np, "label": label_np}
        fetches = [prediction, loss]
        for epoch in range(100):
            pred_res, loss_res = exe.run(train_program,
                                         feed=feed,
                                         fetch_list=fetches)
            # Progress is reported on every tenth epoch only.
            if epoch % 10:
                continue
            print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
                epoch, pred_res[0], loss_res))

        return pred_res, loss_res

    def test_npu(self):
        # The CPU run comes first; both runs use the same seed.
        cpu_pred, cpu_loss = self._test(False)
        npu_pred, npu_loss = self._test(True)

        for npu_value, cpu_value in ((npu_pred, cpu_pred),
                                     (npu_loss, cpu_loss)):
            self.assertTrue(np.allclose(npu_value, cpu_value))
@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "core is not compiled with NPU")
class TestTopkV2(OpTest):
    """Forward-only check of the `top_k` op on NPU device 0 with k=2 on a
    fixed 3x3 float16 input; expected values and indices are hard-coded."""

    def setUp(self):
        self.set_npu()
        self.place = paddle.NPUPlace(0)
        self.op_type = "top_k"
        self.init_dtype()

        # Fixed input, one row per sample.
        data = np.array([[0.78104149, 0.88745828, 0.32362268],
                         [0.82196718, 0.48763277, 0.42826136],
                         [0.96527182, 0.34851612, 0.12959783]]).astype(
                             self.dtype)

        # Two largest entries of each row, largest first, and their columns.
        top_values = np.array([[0.88745828, 0.78104149],
                               [0.82196718, 0.48763277],
                               [0.96527182, 0.34851612]]).astype(self.dtype)
        top_indices = np.array([[1, 0], [0, 1], [0, 1]])

        self.inputs = {'X': data}
        self.attrs = {'k': 2, "axis": -1}
        self.outputs = {'Out': top_values, 'Indices': top_indices}

    def set_npu(self):
        # Both flags are stored on the class, not on the instance.
        cls = self.__class__
        cls.use_npu = True
        cls.no_need_check_grad = True

    def init_dtype(self):
        # Element type of the input and of the expected values.
        self.dtype = np.float16

    def test_check_output(self):
        # Static-graph check only; dygraph checking is switched off.
        self.check_output_with_place(self.place, check_dygraph=False)
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest, _set_use_system_allocator +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestTransposeOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "transpose2" + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x)} + self.attrs = {'axis': [0, 2, 1, 3], 'data_format': 'AnyLayout'} + self.outputs = {'Out': self.out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_kernel_type(self): + self.use_mkldnn = False + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [8, 512, 12, 64]).astype(self.dtype) + self.out = np.transpose(self.x, [0, 2, 1, 3]) + + def init_dtype(self): + self.dtype = np.float32 + + def init_axis(self): + self.axis = -1 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestTransposeOpFP16(TestTransposeOp): + no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_truncated_gaussian_random_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_truncated_gaussian_random_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..ff89508d196235a8e50678908938ba0fc24d6981 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_truncated_gaussian_random_op_npu.py @@ -0,0 +1,71 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.executor import Executor + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestTruncatedNormal(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + scope = paddle.fluid.core.Scope() + + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + paddle.seed(SEED) + + with fluid.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + weight_attr = paddle.framework.ParamAttr( + name="linear_weight", + initializer=paddle.nn.initializer.TruncatedNormal( + mean=0.0, std=2.0)) + linear = paddle.nn.Linear( + 2, 2, weight_attr=weight_attr, bias_attr=False) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + w = exe.run(startup_prog, fetch_list=['linear_weight']) + return w + + def test_npu(self): + cpu_w = self._test(False) + npu_w = self._test(True) + + self.assertTrue(np.allclose(npu_w, cpu_w)) + + +if __name__ == '__main__': + unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py
new file mode 100644
index 0000000000000000000000000000000000000000..1060e67078f8d827618c782c8b413e861bf4d68a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_update_loss_scaling_op_npu.py
@@ -0,0 +1,268 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.contrib.mixed_precision.amp_nn as amp_nn
+
+paddle.enable_static()
+SEED = 2021
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestUpdateLossScalingOp(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "update_loss_scaling"
+        self.place = paddle.NPUPlace(0)
+
+        self.init()
+        found_inf = np.array([False], dtype=np.bool_)  # np.bool alias was removed in NumPy 1.24
+        x = np.random.random((1024, 1024)).astype(self.dtype)
+
+        self.inputs = {
+            'X': [('x0', x)],
+            'FoundInfinite': found_inf,
+            'PrevLossScaling': self.prev_loss_scaling,
+            'InGoodSteps': self.num_good_steps,
+            'InBadSteps': self.num_bad_steps
+        }
+
+        self.outputs = {
+            'Out': [('out0', x)],
+            'LossScaling': self.prev_loss_scaling * self.incr_ratio,
+            'OutGoodSteps': self.zero_steps,
+            'OutBadSteps': self.zero_steps
+        }
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def init(self):
+        self.incr_ratio = 2.0
+        self.decr_ratio = 0.8
+        self.dtype = np.float32
+        self.prev_loss_scaling = np.array([2048]).astype(self.dtype)
+        self.num_good_steps = np.array([999], dtype=np.int32)
+        self.num_bad_steps = np.array([1], dtype=np.int32)
+        self.zero_steps = np.array([0], dtype=np.int32)
+        self.attrs = {
+            'incr_every_n_steps': 1000,
+            'decr_every_n_nan_or_inf': 2,
+            'incr_ratio': self.incr_ratio,
+            'decr_ratio': self.decr_ratio,
+        }
+
+    def test_check_output(self):
+        self.check_output_with_place(
+            self.place, check_dygraph=False, no_check_set=['Out'])
+
+
+class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp):
+    def setUp(self):
+        self.set_npu()
+        self.op_type = "update_loss_scaling"
+        self.place = paddle.NPUPlace(0)
+
+        self.init()
+        found_inf = np.array([True], dtype=np.bool_)  # np.bool alias was removed in NumPy 1.24
+        x = np.random.random((1024, 1024)).astype(self.dtype)
+        i = np.random.randint(0, 1024, 1)
+        j = np.random.randint(0, 1024, 1)
+        x[i[0]][j[0]] = np.inf
+
+        self.inputs = {
+            'X': [('x0', x)],
+            'FoundInfinite': found_inf,
+            'PrevLossScaling': self.prev_loss_scaling,
+            'InGoodSteps': self.num_good_steps,
+            'InBadSteps': self.num_bad_steps
+        }
+
+        self.outputs = {
+            'Out': [('out0', np.zeros_like(x))],
+            'LossScaling': self.prev_loss_scaling * self.decr_ratio,
+            'OutGoodSteps': self.zero_steps,
+            'OutBadSteps': self.zero_steps
+        }
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestUpdateLossScalingLayer(unittest.TestCase):
+    def loss_scaling_check(self, use_npu=True, scope=fluid.Scope()):
+        a = fluid.data(name="a", shape=[1024, 1024], dtype='float32')
+        b = fluid.data(name="b", shape=[512, 128], dtype='float32')
+        x = [a, b]
+        found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
+        prev_loss_scaling = fluid.data(
+            name="prev_loss_scaling", shape=[1], dtype='float32')
+        num_good_steps = fluid.data(
+            name="num_good_steps", shape=[1], dtype='int32')
+        num_bad_steps = fluid.data(
+            name="num_bad_steps", shape=[1], dtype='int32')
+
+        a_v = np.random.random([1024, 1024]).astype('float32')
+        b_v = np.random.random([512, 128]).astype('float32')
+        found_inf_v = np.array([False]).astype('bool')
+        prev_loss_scaling_v = np.array([2048]).astype('float32')
+        num_good_steps_v = np.array([999], dtype=np.int32)
+        num_bad_steps_v = np.array([1], dtype=np.int32)
+
+        incr_every_n_steps = 1000
+        decr_every_n_nan_or_inf = 2
+        incr_ratio = 2
+        decr_ratio = 0.8
+
+        result = amp_nn.update_loss_scaling(
+            x,
+            found_inf,
+            prev_loss_scaling,
+            num_good_steps,
+            num_bad_steps,
+            incr_every_n_steps,
+            decr_every_n_nan_or_inf,
+            incr_ratio,
+            decr_ratio,
+            name="update_loss_scaling")
+
+        place = paddle.NPUPlace(0) if use_npu else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        with fluid.scope_guard(scope):
+            exe.run(fluid.default_startup_program())
+            result_v = exe.run(feed={
+                'a': a_v,
+                'b': b_v,
+                'found_inf': found_inf_v,
+                'prev_loss_scaling': prev_loss_scaling_v,
+                'num_good_steps': num_good_steps_v,
+                'num_bad_steps': num_bad_steps_v
+            },
+                               fetch_list=[
+                                   result, x, found_inf, prev_loss_scaling,
+                                   num_good_steps, num_bad_steps
+                               ])
+        assert np.array_equal(result_v[0], a_v)
+        assert np.array_equal(result_v[1], b_v)
+        assert np.array_equal(result_v[0], result_v[2])
+        assert np.array_equal(result_v[1], result_v[3])
+        assert np.array_equal(result_v[4], found_inf_v)
+        assert np.array_equal(result_v[5], prev_loss_scaling_v * incr_ratio)
+        assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v))
+        assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v))
+
+    def loss_scaling_check_inf(self, use_npu=True, scope=fluid.Scope()):
+        a = fluid.data(name="a", shape=[1024, 1024], dtype='float32')
+        b = fluid.data(name="b", shape=[512, 128], dtype='float32')
+        x = [a, b]
+        found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
+        prev_loss_scaling = fluid.data(
+            name="prev_loss_scaling", shape=[1], dtype='float32')
+        num_good_steps = fluid.data(
+            name="num_good_steps", shape=[1], dtype='int32')
+        num_bad_steps = fluid.data(
+            name="num_bad_steps", shape=[1], dtype='int32')
+
+        a_v = np.random.random([1024, 1024]).astype('float32')
+        b_v = np.random.random([512, 128]).astype('float32')
+        i = np.random.randint(0, 1024, 1)
+        j = np.random.randint(0, 1024, 1)
+        a_v[i[0]][j[0]] = np.inf
+        found_inf_v = np.array([True]).astype('bool')
+        prev_loss_scaling_v = np.array([2048]).astype('float32')
+        num_good_steps_v = np.array([999], dtype=np.int32)
+        num_bad_steps_v = np.array([1], dtype=np.int32)
+
+        incr_every_n_steps = 1000
+        decr_every_n_nan_or_inf = 2
+        incr_ratio = 2
+        decr_ratio = 0.8
+
+        result = amp_nn.update_loss_scaling(
+            x,
+            found_inf,
+            prev_loss_scaling,
+            num_good_steps,
+            num_bad_steps,
+            incr_every_n_steps,
+            decr_every_n_nan_or_inf,
+            incr_ratio,
+            decr_ratio,
+            name="update_loss_scaling")
+
+        place = paddle.NPUPlace(0) if use_npu else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        with fluid.scope_guard(scope):
+            exe.run(fluid.default_startup_program())
+            result_v = exe.run(feed={
+                'a': a_v,
+                'b': b_v,
+                'found_inf': found_inf_v,
+                'prev_loss_scaling': prev_loss_scaling_v,
+                'num_good_steps': num_good_steps_v,
+                'num_bad_steps': num_bad_steps_v
+            },
+                               fetch_list=[
+                                   result, x, found_inf, prev_loss_scaling,
+                                   num_good_steps, num_bad_steps
+                               ])
+        assert np.array_equal(result_v[0], np.zeros_like(a_v))
+        assert np.array_equal(result_v[1], np.zeros_like(b_v))
+        assert np.array_equal(result_v[2], np.zeros_like(a_v))
+        assert np.array_equal(result_v[3], np.zeros_like(b_v))
+        assert np.array_equal(result_v[4], found_inf_v)
+        assert np.array_equal(result_v[5], prev_loss_scaling_v * decr_ratio)
+        assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v))
+        assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v))
+
+    def test_loss_scaling_cpu(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, startup):
+                self.loss_scaling_check(use_npu=False)
+
+    def test_loss_scaling_cpu_inf(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, startup):
+                self.loss_scaling_check_inf(use_npu=False)
+
+    def test_loss_scaling_npu(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, startup):
+                self.loss_scaling_check(use_npu=True)
+
+    def test_loss_scaling_npu_inf(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, startup):
+                self.loss_scaling_check_inf(use_npu=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 569c4316880df6f148880d59a8110934cb93e234..583bd3994bd0a40a078ce34f02cab706f5da07a4 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -1449,9 +1449,18 @@ class OpTest(unittest.TestCase):
         if not type(output_names) is list:
             output_names = [output_names]
 
+        # FIXME: Replace numeric_place with place to calculate numeric_grads.
+        # NOTE(liym27): There is an unknown error when call op.run() on NPUPlace, which
+        # needs to be fixed.
+ if hasattr(self.__class__, + "use_npu") and self.__class__.use_npu == True: + numeric_place = paddle.CPUPlace() + else: + numeric_place = place + numeric_grads = user_defined_grads or [ get_numeric_gradient( - place, + numeric_place, self.scope, self.op, self.inputs, diff --git a/python/paddle/fluid/tests/unittests/test_ascend_group.sh b/python/paddle/fluid/tests/unittests/test_ascend_group.sh index 31c442e0962624622800bd588e0b98635df0032d..68cb075b90c3a1886c7464ee337739b70a1b2e23 100644 --- a/python/paddle/fluid/tests/unittests/test_ascend_group.sh +++ b/python/paddle/fluid/tests/unittests/test_ascend_group.sh @@ -16,15 +16,14 @@ set -e -cluster_node_ips="127.0.0.1" -export PADDLE_TRAINERS_NUM=4 -export POD_IP=127.0.0.1 -export PADDLE_TRAINERS=127.0.0.1 -export PADDLE_TRAINER_ID=0 +curr_host_ip=`hostname -i` +python hccl_tools.py --device_num "[0,4)" --server_ip ${curr_host_ip} -export PADDLE_PORT=35789 -export TRAINER_PORTS_NUM=4 +export RANK_TABLE_FILE="${PWD}/hccl_4p_0123_${curr_host_ip}.json" -distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1,2,3 --log_dir=testlog" +# use ascend +echo "begin test use ascend npu" + +distributed_args="--run_mode=collective --log_dir=testlog" python -m paddle.distributed.fleet.launch ${distributed_args} \ ascend_group.py fleetascendgroup diff --git a/python/paddle/fluid/tests/unittests/test_assign_op_npu.py b/python/paddle/fluid/tests/unittests/test_assign_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..f00a3c103c817cbfa9de39fec73fbbcb3a59af0c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_assign_op_npu.py @@ -0,0 +1,56 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+
+paddle.enable_static()
+SEED = 2021
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestAssign(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.place = paddle.NPUPlace(0)
+        self.op_type = "assign"
+        self.init_dtype()
+
+        x = np.random.random([3, 3])  # was np.rand.random: numpy has no `rand` module
+        self.inputs = {'X': x}
+
+        self.attrs = {}
+        self.outputs = {'Out': x}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+
+    def init_dtype(self):
+        self.dtype = np.int64
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_max_op_npu.py b/python/paddle/fluid/tests/unittests/test_elementwise_max_op_npu.py
new file mode 100644
index 0000000000000000000000000000000000000000..6475caf970cba7be5efad60b9a4c094e112175c3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_max_op_npu.py
@@ -0,0 +1,161 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwiseMax(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_max" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + y = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = np.maximum(x, y) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + # TODO(ascendrc): Max grad test + # def test_check_grad(self): + # if self.dtype == np.float16: + # return + # self.check_grad(['X'], 'Out') + # + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwiseMaxFp16(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_max" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) + y = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) + out = np.maximum(x, y) + + 
self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False, atol=1e-5) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwiseMaxNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + c = paddle.maximum(a, b) + + fc_1 = fluid.layers.fc(input=c, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return 
pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py b/python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b9d88a8e1155e4e15532aa98381500f04e50c86d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_ascend_utils.py @@ -0,0 +1,62 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import sys +import os +import time +import six +import copy +import json +import unittest +import paddle.fluid as fluid + +import paddle.distributed.fleet.ascend_utils as ascend_utils + +RANK_TABLE_JSON = { + "status": "completed", + "version": "1.0", + "server_count": "1", + "server_list": [{ + "server_id": "127.0.0.1", + "device": [{ + "device_id": "0", + "device_ip": "192.1.184.23", + "rank_id": "0" + }, { + "device_id": "1", + "device_ip": "192.2.21.93", + "rank_id": "1" + }] + }] +} + + +class TestAscendUtil(unittest.TestCase): + def test_get_cloud_cluster(self): + cluster, pod = ascend_utils.get_cloud_cluster() + self.assertTrue(cluster) + self.assertTrue(pod) + + with open('rank_table_file.json', 'w') as f: + json.dump(RANK_TABLE_JSON, f) + rank_table_file = "./rank_table_file.json" + cluster, pod = ascend_utils.get_cloud_cluster( + rank_table_file=rank_table_file) + self.assertTrue(cluster) + self.assertTrue(pod) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh index 0960083abf28ec7bd34445cf22bd62284c102452..a54334692214c8ac3ced731c450a51a54478104f 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh @@ -16,22 +16,43 @@ set -e -# use paddlecloud -echo "begin test use paddlecloud" -cluster_node_ips="127.0.0.1,127.0.0.2" -export PADDLE_TRAINERS_NUM=2 -export POD_IP=127.0.0.1 -export PADDLE_TRAINERS=127.0.0.1,127.0.0.2 -export PADDLE_TRAINER_ID=0 - -export PADDLE_PORT=35789 -export TRAINER_PORTS_NUM=2 - -distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1 --log_dir=testlog" +RANK_TABLE_FILE_NAME="rank_table_file.json" +cat > ${RANK_TABLE_FILE_NAME} < ${RANK_TABLE_FILE_NAME} <