Unverified commit cbe5c9f8, authored by Leo Chen, committed by GitHub

[NPU] cherry-pick gc/dataloader/save&load/optimization from ascendrc to develop (#32294)

* [NPU] support GarbageCollector for npu (#31874)

* support GarbageCollector for npu

* fix typo

* fix gather_grad

* disable NPUDefaultStreamGarbageCollector on NPU

* [NPU] support npu for memcpy op (#31808)

* support npu for memcpy op

* add ut

* fix ut

* fix typo

* [NPU] fix bug of using temp vector (#31963)

* fix bug when beta1_pow on cpu (#31995)

* [NPU] support npu profiler (#31684)

* support npu profiler

* add python api

* fix bugs

* add wrapper for incomplete type

* update profile proto

* record npu wait

* add xpu placeholder

* fix adam (#32016)

* [NPU] enable async copy and add wait before sync operation (#31956)

* enable async copy and add wait before sync operation

* remove unnecessary wait

* add FillNpuTensorWithConstant

* refine

* fix fill_constant

* make TensorFromVector/TensorToVector sync

* [NPU] Support dataloader on npu place. (#31867)

* [NPU] Wait on NPUPlace (#32086)

* [NPU] fix cast op (#32121)

* fix npu kernel of cast op to handle casting to same dtype

* add comments

* [NPU] support cann 20.3 (#32044)

* fix compile problem on cann 20.3

* fix ut

* fix test_mul

* fix check_finite_and_scale

* fix lookup_table_v2_grad

* fix cmake

* support print op

* [NPU] Support npu save load (#31893)

* support save load for NPU

* add save load npu unittest

* support np.array transform in NPU

* fix errors

* delete dygraph in unittest

* add Wait

* fix unittest

* fix review comment

* fix unittest problem

* fix little problem

* change aclrtSynchronizeDevice to aclrtSynchronizeStream for better performance (#32196)

* change aclrtSynchronizeDevice to aclrtSynchronizeStream for better performance

* refine code

* fix NPUDeviceContext in all c++ unittest (#32198)

* fix NPUDeviceContext in all c++ unittest

* refine log
Co-authored-by: pangyoki <pangyoki@126.com>

* [NPU] Remove TensorFromVector and avoid sync copy in npu op kernel for better performance (#31994)

* enable async copy and add wait before sync operation

* remove unnecessary wait

* add FillNpuTensorWithConstant

* refine

* fix fill_constant

* change TensorFromVector to FillNpuTensorWithConstant

* fix ignored api

* delete extra unittest

* fix little error

* fix update_loss_scaling_op_npu and check_finite_and_unscale_op_npu

* change TensorCopySync to TensorCopy

* delete useless Wait and add StreamWait

* fix npu_stream error

* fix check_finite_and_unscale_op_npu TensorCopy

* only save stream wait

* fix NPUDeviceContext in all c++ unittest

* delete wait
Co-authored-by: zhiqiu <chenqiuliang@baidu.com>

* delete useless unittest file (#32206)

* Fix op test (#32231)

* fix conditional block (#32243)

* fix adam bug again (#32246)

* fix compile

* fix ut

* fix ut
Co-authored-by: liym27 <33742067+liym27@users.noreply.github.com>
Co-authored-by: pangyoki <pangyoki@126.com>
Parent commit: ffd40860
......@@ -21,6 +21,11 @@ else()
set(ASCEND_DIR /usr/local/Ascend)
endif()
if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h)
# It means CANN 20.2 +
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()
if(WITH_ASCEND)
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
......@@ -43,9 +48,7 @@ if(WITH_ASCEND)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
......@@ -64,11 +67,13 @@ if(WITH_ASCEND_CL)
set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
message(STATUS "ASCEND_CL_INC_DIR ${ASCEND_CL_INC_DIR}")
message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR})
INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR})
INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR})
ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
......
......@@ -456,11 +456,22 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#endif
} else if (platform::is_npu_place(place_)) {
#ifdef PADDLE_WITH_ASCEND_CL
// TODO(ascendrc): Support garbage collector on NPUPlace
VLOG(4) << "Skip NPU gc because it is not implemented now.";
if (IsFastEagerDeletionModeEnabled()) {
VLOG(4) << "Use unsafe fast gc for NPU.";
gc.reset(new NPUUnsafeFastGarbageCollector(
BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Please set FLAGS_fast_eager_deletion_mode=true to use "
"GarbageCollector on NPU."));
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
VLOG(4) << "Use default stream gc for NPU.";
gc.reset(new NPUDefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"No NPU gc found in CPU/GPU/XPU paddle"));
PADDLE_THROW(
platform::errors::Unimplemented("No NPU gc found in CPU/NPU paddle"));
#endif
}
}
......
......@@ -122,6 +122,32 @@ void CUDAPinnedGarbageCollector::ClearCallback(
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
NPUDefaultStreamGarbageCollector::NPUDefaultStreamGarbageCollector(
const platform::NPUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void NPUDefaultStreamGarbageCollector::Wait() const {
static_cast<platform::NPUDeviceContext *>(this->dev_ctx_)
->WaitStreamCallback();
}
void NPUDefaultStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
static_cast<platform::NPUDeviceContext *>(this->dev_ctx_)
->AddStreamCallback(callback);
}
NPUUnsafeFastGarbageCollector::NPUUnsafeFastGarbageCollector(
const platform::NPUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void NPUUnsafeFastGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback();
}
#endif
int64_t GetEagerDeletionThreshold() {
return FLAGS_eager_delete_tensor_gb < 0
? -1
......
......@@ -131,6 +131,28 @@ class CUDAPinnedGarbageCollector : public GarbageCollector {
};
#endif
#ifdef PADDLE_WITH_ASCEND_CL
class NPUDefaultStreamGarbageCollector : public GarbageCollector {
public:
NPUDefaultStreamGarbageCollector(const platform::NPUPlace &place,
size_t max_memory_size);
void Wait() const override;
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
class NPUUnsafeFastGarbageCollector : public GarbageCollector {
public:
NPUUnsafeFastGarbageCollector(const platform::NPUPlace &place,
size_t max_memory_size);
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
#endif
template <typename Container>
void GarbageCollector::Add(Container &&objs) {
Add(std::forward<Container>(objs), []() {});
......
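The two collectors declared above differ only in how ClearCallback releases memory: NPUDefaultStreamGarbageCollector hands the deletion to the device context's stream-callback queue (so it runs after queued kernels finish), while NPUUnsafeFastGarbageCollector runs it immediately on the host thread, which is why the executor only selects it under fast eager deletion. A minimal host-only sketch of that contract, not part of this diff; the queue type below is a hypothetical stand-in for NPUDeviceContext:

#include <functional>
#include <iostream>
#include <vector>

// Hypothetical stand-in for the stream-callback queue of NPUDeviceContext.
struct FakeStreamCallbackQueue {
  std::vector<std::function<void()>> pending;
  void AddStreamCallback(const std::function<void()> &cb) { pending.push_back(cb); }
  // A real device context runs pending callbacks once earlier stream work is done.
  void Drain() {
    for (auto &cb : pending) cb();
    pending.clear();
  }
};

int main() {
  FakeStreamCallbackQueue stream_queue;
  auto release = [] { std::cout << "tensor memory released\n"; };

  // Default-stream collector: deletion is deferred behind in-flight kernels.
  stream_queue.AddStreamCallback(release);

  // Unsafe fast collector: deletion runs right away on the host thread; this is
  // only safe under the fast-eager-deletion assumption that no kernel still
  // reads the buffer.
  release();

  stream_queue.Drain();
  return 0;
}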
......@@ -343,6 +343,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
#define REGISTER_OP_NPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, NPU, ::paddle::platform::NPUPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
/**
* Macro to mark what Operator and Kernel
* we will use and tell the compiler to
......
......@@ -822,6 +822,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
#else
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
#endif
} else if (platform::is_npu_place(tensor.place())) {
#ifdef PADDLE_WITH_ASCEND_CL
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& npu_dev_ctx =
static_cast<const platform::NPUDeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
BOOST_GET_CONST(platform::NPUPlace, tensor.place()),
reinterpret_cast<const void*>(data), size_to_write,
npu_dev_ctx.stream());
npu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
#endif
} else {
os.write(static_cast<const char*>(data_ptr),
......@@ -877,9 +900,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
auto ctx = platform::CPUDeviceContext();
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace())) {
platform::is_xpu_place(dev_ctx.GetPlace()) ||
platform::is_npu_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL)
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(shape));
framework::VisitDataType(
......@@ -888,13 +912,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
is.read(static_cast<char*>(buf), size);
auto dst_place = dev_ctx.GetPlace();
framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
if (platform::is_npu_place(dev_ctx.GetPlace())) {
dev_ctx.Wait();
}
#else
if (platform::is_gpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported when not compiled with CUDA"));
} else {
} else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
}
#endif
} else {
......@@ -935,9 +965,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
auto ctx = platform::CPUDeviceContext();
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace())) {
platform::is_xpu_place(dev_ctx.GetPlace()) ||
platform::is_npu_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL)
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(dims));
framework::VisitDataType(
......@@ -946,13 +977,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
is.read(static_cast<char*>(buf), size);
auto dst_place = dev_ctx.GetPlace();
framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
if (platform::is_npu_place(dev_ctx.GetPlace())) {
dev_ctx.Wait();
}
#else
if (platform::is_gpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported when not compiled with CUDA"));
} else {
} else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
}
#endif
} else {
......
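TensorToStream above streams an NPU tensor to disk through a fixed 64 MB host staging buffer: copy one chunk device-to-host, wait for the copy, write it, advance. A host-only sketch of the same loop, not part of this diff; std::memcpy stands in for memory::Copy plus the Wait() on the NPU stream:

#include <algorithm>
#include <cstring>
#include <iostream>
#include <memory>
#include <ostream>
#include <sstream>
#include <vector>

// Write `size` bytes from `src` to `os` through a bounded staging buffer,
// mirroring the chunked NPU->CPU path in TensorToStream.
void ChunkedWrite(std::ostream &os, const char *src, size_t size) {
  constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB, as in the kernel above
  std::unique_ptr<char[]> buf(new char[kBufSize]);
  while (size != 0) {
    size_t size_to_write = std::min(kBufSize, size);
    // Device-to-host copy followed by npu_dev_ctx.Wait() in the real code.
    std::memcpy(buf.get(), src, size_to_write);
    os.write(buf.get(), size_to_write);
    src += size_to_write;
    size -= size_to_write;
  }
}

int main() {
  std::vector<char> data(1000, 'x');
  std::ostringstream os;
  ChunkedWrite(os, data.data(), data.size());
  std::cout << os.str().size() << " bytes written\n";
  return 0;
}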
......@@ -159,11 +159,15 @@ void TensorFromVector(const std::vector<T>& src,
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
// NOTE(zhiqiu): Be careful that aclrtMemcpyAsync is different from
// cudaMemcpyAsync.
// cudaMemcpyAsync is actually "sync" between cpu <-> gpu.
// aclrtMemcpyAsync is really "async" between cpu <-> npu.
// Since vector is on cpu, I think this function should be a "sync" operation,
// so pass nullptr as stream to memory::Copy().
else if (platform::is_npu_place(dst_place)) { // NOLINT
memory::Copy(
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place,
src_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size, nullptr);
}
#endif
}
......@@ -202,10 +206,8 @@ inline void TensorFromVector(const std::vector<bool>& src,
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(dst_place)) { // NOLINT
memory::Copy(
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place,
src_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size, nullptr);
}
#endif
delete[] array;
......@@ -265,10 +267,9 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()),
src_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
size, nullptr);
}
#endif
}
......@@ -301,10 +302,9 @@ inline void TensorToVector(const Tensor& src,
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()),
src_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
size, nullptr);
}
#endif
for (unsigned int i = 0; i < src.numel(); i++) {
......
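The NOTE in TensorFromVector above is the core of this change: aclrtMemcpyAsync is truly asynchronous for cpu <-> npu, so an async copy sourced from a caller's std::vector could still be reading the host buffer after it has gone out of scope; passing nullptr as the stream makes memory::Copy take its synchronous path. A host-only sketch of the hazard and the fix, not part of this diff:

#include <cstddef>
#include <cstring>
#include <functional>
#include <iostream>
#include <vector>

// Stands in for work queued on an NPU stream and executed later.
std::function<void()> g_deferred_copy;

// Unsafe pattern: queue an "async" copy that captures a pointer into a local
// vector; when the stream runs it, the vector has already been destroyed.
void EnqueueAsyncCopyFromLocalVector(char *dst, size_t n) {
  std::vector<char> host(n, 'a');  // dies at the end of this function
  const char *src = host.data();
  g_deferred_copy = [dst, src, n] { std::memcpy(dst, src, n); };  // dangling src
}

// Safe pattern used after this change: copy synchronously (stream == nullptr
// in memory::Copy) while the host vector is still alive.
void SyncCopyFromLocalVector(char *dst, size_t n) {
  std::vector<char> host(n, 'a');
  std::memcpy(dst, host.data(), n);
}

int main() {
  char dst[8] = {0};
  SyncCopyFromLocalVector(dst, sizeof(dst));
  std::cout << "sync copy wrote '" << dst[0] << "'\n";
  // EnqueueAsyncCopyFromLocalVector(dst, sizeof(dst)); g_deferred_copy();  // would read freed memory
  return 0;
}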
......@@ -207,12 +207,6 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
platform::SetNPUDeviceId(dst_place.device);
// NOTE(ascendrc): NPU memcpy async from host to device is a "real" async,
// which is different from CUDA. In Paddle, when async is called, "sync"
// is run actually, which means Paddle doesn't fully support async.
// TODO(ascendrc): Support NPU memcpy async for better performance.
stream = nullptr;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by stream(" << stream << ")";
......@@ -220,6 +214,12 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream);
} else {
// On NPU, an async operation after a sync operation is ok, while a sync
// operation after an async one is not ok, since the async operation may not
// be done yet. So, a wait is needed before the sync operation.
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
}
......@@ -235,12 +235,6 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
platform::SetNPUDeviceId(src_place.device);
// NOTE(ascendrc): NPU memcpy async from device to host is a "real" async,
// which is different from CUDA. In Paddle, when async is called, "sync"
// is run actually, which means Paddle doesn't fully support async.
// TODO(ascendrc): Support NPU memcpy async for better performance.
stream = nullptr;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by stream(" << stream << ")";
......@@ -248,6 +242,9 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream);
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
}
......@@ -270,6 +267,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
stream);
} else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
}
......@@ -284,6 +285,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
stream);
} else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
}
......
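The comment added in memcpy.cc states the ordering rule behind these changes: queuing async work after sync work is fine, but issuing a blocking sync memcpy while earlier async work is still in flight can observe incomplete data, so every sync path now calls Wait() on the corresponding NPUDeviceContext first. A host-only sketch of the rule, not part of this diff, with std::async standing in for work on the NPU stream:

#include <future>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> device_buf(1 << 20, 0);

  // "Async" work previously queued on the stream: fills the buffer later.
  auto stream_work = std::async(std::launch::async, [&] {
    for (auto &v : device_buf) v = 42;
  });

  // Wrong: reading synchronously right now may see a half-filled buffer.
  // int racy = device_buf.back();

  // Right (what the patched sync paths do): wait for in-flight stream work,
  // then perform the synchronous copy.
  stream_work.wait();               // dev_ctx->Wait() in the real code
  int value = device_buf.back();    // platform::NPUMemcpySync in the real code
  std::cout << "value after wait: " << value << "\n";
  return 0;
}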
......@@ -167,7 +167,7 @@ endif()
if (WITH_ASCEND_CL)
cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor)
cc_test(lookup_table_v2_op_npu_test SRCS lookup_table_v2_op_npu_test.cc DEPS op_registry lookup_table_v2_op scope device_context enforce executor compare_op)
cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op scope device_context enforce executor compare_op)
endif()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
......
......@@ -77,8 +77,7 @@ class PowGradNPUKernel : public framework::OpKernel<T> {
// 2.1 Get a factor tensor with shape [1].
Tensor factor_tensor(framework::proto::VarType::FP32);
factor_tensor.mutable_data<float>({1}, place);
TensorFromVector(std::vector<float>{factor}, ctx.device_context(),
&factor_tensor);
FillNpuTensorWithConstant<float>(&factor_tensor, factor);
// 2.2 Get the factor which has the shape with x and the same value with
// factor.
......
......@@ -44,10 +44,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
// step1: inverse scale(RealDiv)
Tensor const_tensor;
const_tensor.mutable_data<T>({1}, ctx.GetPlace());
TensorFromVector(std::vector<T>{static_cast<T>(1.0)}, ctx.device_context(),
&const_tensor);
ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
FillNpuTensorWithConstant<T>(&const_tensor, static_cast<T>(1.0));
// Inverse(1.0/scale)
Tensor* tmp_inverse_out = const_cast<Tensor*>(scale);
......@@ -61,7 +58,6 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
size_t x_size = xs.size();
for (size_t i = 0; i < x_size; ++i) {
found_inf_data = true;
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(ctx.GetPlace());
......@@ -77,6 +73,8 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
NpuOpRunner("CheckNumerics", {*x}, {check_xout},
{{"message", std::string("check_nan_and_inf")}});
runner_checknumerics.Run(stream);
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.Wait();
} catch (platform::EnforceNotMet& exception) {
LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
found_inf_data = true;
......@@ -104,7 +102,11 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
bool* is_found_inf =
found_inf_tensor.mutable_data<bool>(paddle::platform::CPUPlace());
*is_found_inf = true;
framework::TensorCopySync(found_inf_tensor, ctx.GetPlace(), found_inf);
framework::TensorCopy(
found_inf_tensor, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), found_inf);
ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
}
}
};
......
......@@ -110,22 +110,22 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx) {
// out found_inf
Tensor found_inf_tensor;
found_inf_tensor.Resize({1});
bool *is_finite_data =
bool *found_inf_data =
found_inf_tensor.mutable_data<bool>(paddle::platform::CPUPlace());
f::TensorCopy(*found_inf, place, &found_inf_tensor);
EXPECT_FALSE(*is_finite_data);
EXPECT_TRUE(*found_inf_data);
ctx.Wait();
}
TEST(check_finite_and_unscale, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx);
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx);
}
TEST(check_finite_and_unscale, NPU_fp16) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<p::float16>(&scope, ctx);
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<p::float16>(&scope, *ctx);
}
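Most of the C++ unit tests in this commit stop constructing a local p::NPUDeviceContext and instead fetch the context from DeviceContextPool, so the test and the op kernels share one context (and therefore one stream). A minimal sketch of why the pooled context is shared, not part of this diff; the pool below is a simplified stand-in:

#include <iostream>
#include <map>
#include <memory>

struct Context {
  int device_id;
};

// Simplified stand-in for platform::DeviceContextPool: one context per device,
// created once and handed out to every caller.
class ContextPool {
 public:
  static ContextPool &Instance() {
    static ContextPool pool;
    return pool;
  }
  Context *Get(int device_id) {
    auto &slot = contexts_[device_id];
    if (!slot) slot.reset(new Context{device_id});
    return slot.get();
  }

 private:
  std::map<int, std::unique_ptr<Context>> contexts_;
};

int main() {
  Context local_ctx{0};  // old test style: its own context and stream
  Context *pooled = ContextPool::Instance().Get(0);  // new test style
  bool shared = (pooled == ContextPool::Instance().Get(0));
  std::cout << "pooled context shared: " << std::boolalpha << shared
            << ", local device " << local_ctx.device_id << "\n";
  return 0;
}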
......@@ -41,7 +41,7 @@ void Update(const platform::NPUDeviceContext& ctx,
// bad_out_data = bad_in_data + 1
Tensor factor_tensor(bad_out_tensor->type());
factor_tensor.mutable_data<int>({1}, place);
TensorFromVector(std::vector<int>{1}, ctx, &factor_tensor);
FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1));
auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor},
{*bad_out_tensor}, {});
runner_p2.Run(stream);
......@@ -84,7 +84,7 @@ void Update(const platform::NPUDeviceContext& ctx,
// good_out_data = good_in_data + 1
Tensor factor_tensor(good_out_tensor->type());
factor_tensor.mutable_data<int>({1}, place);
TensorFromVector(std::vector<int>{1}, ctx, &factor_tensor);
FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1));
auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor},
{*good_out_tensor}, {});
runner_p2.Run(stream);
......
......@@ -75,6 +75,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
TEST(assign, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx, "assign");
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx, "assign");
}
......@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>
......@@ -41,11 +40,20 @@ class CastNPUKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X");
int dtype = ctx.Attr<int>("out_dtype");
auto* out = ctx.Output<Tensor>("Out");
auto place = ctx.GetPlace();
if (x->type() == dtype) {
// NOTE(zhiqiu): NPU cast op may result in wrong value, so
// add special case here.
VLOG(4) << "cast to same dtype:" << dtype;
out->mutable_data(place, x->type());
framework::TensorCopy(
*x, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), out);
return;
}
auto iter = DTYPE_2_ACL_DTYPE.find(
static_cast<framework::proto::VarType::Type>(dtype));
int aclDtype = iter->second;
......@@ -76,7 +84,7 @@ class CastNPUKernel : public framework::OpKernel<T> {
}
};
} // namespace operators
} // namespace paddleaclDtype
} // namespace paddle
namespace ops = paddle::operators;
......@@ -89,4 +97,3 @@ REGISTER_OP_NPU_KERNEL(
ops::CastNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::CastNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
#endif
......@@ -100,9 +100,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
{{"axes", axes}, {"keep_dims", true}});
runner.Run(stream);
} else {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.Wait();
framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
framework::TensorCopy(
*tmp_dout, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), dx);
}
}
......@@ -127,8 +127,6 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream);
tmp_dout = &reduced_dout;
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.Wait();
}
// stage 2
......@@ -144,9 +142,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
{{"axes", axes}, {"keep_dims", true}});
runner.Run(stream);
} else {
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.Wait();
framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dy);
framework::TensorCopy(
*tmp_dout, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), dy);
}
}
}
......
......@@ -38,7 +38,7 @@ USE_OP(elementwise_sub);
USE_OP_DEVICE_KERNEL(elementwise_sub, NPU);
template <typename T>
void Compare(f::Scope* scope, const p::DeviceContext& ctx,
void Compare(f::Scope *scope, const p::DeviceContext &ctx,
std::string op_type) {
// init
auto x = scope->Var("X");
......@@ -62,8 +62,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
TensorFromVector(init_y, ctx, tensor_y);
tensor_y->Resize({10, 10});
ctx.Wait();
auto place = ctx.GetPlace();
auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<f::LoDTensor>();
......@@ -74,7 +72,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
{{"Out", {"Out"}}}, attrs);
op->Run(*scope, place);
ctx.Wait();
std::vector<T> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
......@@ -93,7 +90,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
}
template <typename T>
void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
void CompareGrad(f::Scope *scope, const p::DeviceContext &ctx,
std::string op_type) {
// init
auto dout = scope->Var("DOut");
......@@ -122,8 +119,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
TensorFromVector(init_dout, ctx, tensor_dout);
tensor_dout->Resize({2, 3, 5});
ctx.Wait();
// run
f::AttributeMap attrs;
auto op = f::OpRegistry::CreateOp(
......@@ -132,7 +127,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
auto place = ctx.GetPlace();
op->Run(*scope, place);
ctx.Wait();
std::vector<T> dx_vec;
TensorToVector(*tensor_dx, ctx, &dx_vec);
......@@ -160,30 +154,30 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
TEST(elementwise_add, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx, "elementwise_add");
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx, "elementwise_add");
}
TEST(elementwise_sub, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx, "elementwise_sub");
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx, "elementwise_sub");
}
TEST(elementwise_sub, NPU_fp16) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<p::float16>(&scope, ctx, "elementwise_sub");
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<p::float16>(&scope, *ctx, "elementwise_sub");
}
TEST(elementwise_sub_grad, NPU) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
CompareGrad<float>(&scope, ctx, "elementwise_sub_grad");
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
CompareGrad<float>(&scope, *ctx, "elementwise_sub_grad");
}
TEST(elementwise_add_grad, NPU) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
CompareGrad<float>(&scope, ctx, "elementwise_add_grad");
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
CompareGrad<float>(&scope, *ctx, "elementwise_add_grad");
}
......@@ -102,7 +102,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
{{"axes", axes}, {"keep_dims", true}});
runner.Run(stream);
} else {
framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
framework::TensorCopy(
*tmp_dout, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), dx);
}
}
if (dy) {
......
......@@ -58,9 +58,11 @@ class ExpandNPUKernel : public framework::OpKernel<T> {
expand_times.size(), static_cast<size_t>(in_dims.size())));
auto* out0 = context.Output<framework::LoDTensor>("Out");
framework::DDim out_dims(in_dims);
for (size_t i = 0; i < expand_times.size(); ++i) {
out_dims[i] *= expand_times[i];
}
out0->Resize(out_dims);
out0->mutable_data<T>(context.device_context().GetPlace());
auto runner =
......
......@@ -69,6 +69,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
TEST(expand, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx);
}
......@@ -65,7 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel<T> {
Tensor tensor_tmp(data_type);
tensor_tmp.mutable_data<T>({1}, ctx.GetPlace());
TensorFromVector(std::vector<T>{value}, ctx.device_context(), &tensor_tmp);
FillNpuTensorWithConstant<T>(&tensor_tmp, value);
out_var->mutable_data<T>(shape, place);
auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var},
......
......@@ -50,6 +50,7 @@ class GatherGradOpNPUKernel : public framework::OpKernel<T> {
auto *x = ctx.Input<Tensor>("X");
auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
// step1: Unsqueeze index
framework::Tensor tmp_tensor(index->type());
......@@ -66,7 +67,7 @@ class GatherGradOpNPUKernel : public framework::OpKernel<T> {
.stream();
// step2: ZerosLike x in device
Tensor zeroslike_xout(x->type());
Tensor zeroslike_xout(dx->type());
zeroslike_xout.Resize(x->dims());
auto p = zeroslike_xout.mutable_data<T>(ctx.GetPlace());
......@@ -74,7 +75,6 @@ class GatherGradOpNPUKernel : public framework::OpKernel<T> {
zeroslike_xout.numel() * sizeof(T), stream);
// step3: scatter(x_grad)
dx->mutable_data<T>(ctx.GetPlace());
auto runner_scatter = NpuOpRunner(
"TensorScatterUpdate", {zeroslike_xout, *index, *dout}, {*dx}, {});
runner_scatter.Run(stream);
......
......@@ -152,18 +152,18 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
TEST(gather, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx, "gather");
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx, "gather");
}
TEST(gather, NPU_fp16) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<p::float16>(&scope, ctx, "gather");
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<p::float16>(&scope, *ctx, "gather");
}
TEST(gather_grad, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
CompareGrad<float>(&scope, ctx, "gather_grad");
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
CompareGrad<float>(&scope, *ctx, "gather_grad");
}
......@@ -157,12 +157,12 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) {
TEST(gelu, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx);
}
TEST(gelu_grad, NPU) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
CompareGrad<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
CompareGrad<float>(&scope, *ctx);
}
......@@ -39,10 +39,9 @@ class IncrementalNPUKernel : public framework::OpKernel<T> {
out_tensor->mutable_data<T>(context.GetPlace());
Tensor step_tensor(x_tensor->type());
std::vector<T> step_vec;
step_vec.push_back(static_cast<T>(step));
framework::TensorFromVector(step_vec, context.device_context(),
&step_tensor);
step_tensor.mutable_data<T>({1}, context.GetPlace());
FillNpuTensorWithConstant<T>(&step_tensor, static_cast<T>(step));
auto runner =
NpuOpRunner("Add", {*x_tensor, step_tensor}, {*out_tensor}, {});
......
......@@ -71,12 +71,12 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
TEST(increment, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx, "increment");
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx, "increment");
}
TEST(increment, NPU_fp64) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx, "increment");
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<double>(&scope, *ctx, "increment");
}
......@@ -80,8 +80,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> {
default_scale.mutable_data<T>(framework::make_ddim(axes), place);
Tensor value(x->type());
value.mutable_data<T>({1}, place);
TensorFromVector(std::vector<T>{static_cast<T>(1.0)},
ctx.device_context(), &value);
FillNpuTensorWithConstant<T>(&value, static_cast<T>(1.0));
auto runner =
NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}});
runner.Run(stream);
......@@ -95,8 +94,7 @@ class LayerNormNPUKernel : public framework::OpKernel<T> {
default_bias.mutable_data<T>(framework::make_ddim(axes), place);
Tensor value(x->type());
value.mutable_data<T>({1}, place);
TensorFromVector(std::vector<T>{static_cast<T>(0)}, ctx.device_context(),
&value);
FillNpuTensorWithConstant<T>(&value, static_cast<T>(0));
auto runner =
NpuOpRunner("FillD", {value}, {default_bias}, {{"dims", axes}});
runner.Run(stream);
......@@ -251,8 +249,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel<T> {
default_scale.mutable_data<T>(framework::make_ddim(axes), place);
Tensor value(x->type());
value.mutable_data<T>({1}, place);
TensorFromVector(std::vector<T>{static_cast<T>(1.0)},
ctx.device_context(), &value);
FillNpuTensorWithConstant<T>(&value, static_cast<T>(1.0));
auto runner =
NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}});
runner.Run(stream);
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/load_combine_op.h"
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
load_combine,
ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, float>,
ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, double>,
ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, int>,
ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, int8_t>,
ops::LoadCombineOpKernel<paddle::platform::NPUDeviceContext, int64_t>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/load_op.h"
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
load, ops::LoadOpKernel<paddle::platform::NPUDeviceContext, float>,
ops::LoadOpKernel<paddle::platform::NPUDeviceContext, double>,
ops::LoadOpKernel<paddle::platform::NPUDeviceContext, int>,
ops::LoadOpKernel<paddle::platform::NPUDeviceContext, int8_t>,
ops::LoadOpKernel<paddle::platform::NPUDeviceContext, int64_t>);
......@@ -28,6 +28,12 @@ class LookupTableV2NPUKernel : public framework::OpKernel<T> {
auto *ids_t = ctx.Input<framework::LoDTensor>("Ids"); // int tensor
auto *output_t = ctx.Output<framework::LoDTensor>("Out"); // float tensor
auto *table_t = ctx.Input<framework::LoDTensor>("W");
// It seems cann 20.1 accepts int64, but cann 20.2+ does not.
PADDLE_ENFORCE_EQ(ids_t->type(), framework::proto::VarType::INT32,
platform::errors::Unimplemented(
"The index of LookupTableV2 should be int32."));
auto *table_var = ctx.InputVar("W");
PADDLE_ENFORCE_EQ(
table_var->IsType<framework::LoDTensor>(), true,
......@@ -49,28 +55,26 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *ids_t = ctx.Input<framework::LoDTensor>("Ids");
auto *output_grad_t =
ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
auto *table_grad_t =
ctx.Output<framework::LoDTensor>(framework::GradVarName("W"));
table_grad_t->mutable_data<T>(ctx.GetPlace());
auto *p = table_grad_t->mutable_data<T>(ctx.GetPlace());
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
// step2: ZerosLike x in device
Tensor zeroslike_w(table_grad_t->type());
zeroslike_w.Resize(table_grad_t->dims());
auto p = zeroslike_w.mutable_data<T>(ctx.GetPlace());
platform::NPUMemsetAsync(static_cast<void *>(p), 0,
zeroslike_w.numel() * sizeof(T), stream);
table_grad_t->numel() * sizeof(T), stream);
table_grad_t->mutable_data<T>(ctx.GetPlace());
// NOTE(zhiqiu): It seems that in cann 20.1, the first input and output
// can be different tensors, but in cann 20.2+, it does an inplace operation.
// Thus, the first input and output should be the same tensor.
auto runner_scatter =
NpuOpRunner("ScatterAdd", {zeroslike_w, *ids_t, *output_grad_t},
{*table_grad_t}, {});
NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
{*table_grad_t}, {{"use_locking", true}});
runner_scatter.Run(stream);
}
};
......
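The NOTE in the LookupTableV2 grad kernel records a CANN behavior change: since 20.2, ScatterAdd works in place, so the kernel zero-fills the W gradient with NPUMemsetAsync and passes it as both the first input and the output. A host-only sketch of the resulting accumulation semantics, not part of this diff and simplified to one value per row:

#include <cstring>
#include <iostream>
#include <vector>

// In-place scatter-add: table[ids[i]] += updates[i], matching a kernel that
// feeds table_grad as both the data input and the output of ScatterAdd.
void ScatterAddInPlace(std::vector<float> *table, const std::vector<int> &ids,
                       const std::vector<float> &updates) {
  for (size_t i = 0; i < ids.size(); ++i) {
    (*table)[ids[i]] += updates[i];
  }
}

int main() {
  std::vector<float> table_grad(4);
  // NPUMemsetAsync in the kernel: the accumulator must start from zero.
  std::memset(table_grad.data(), 0, table_grad.size() * sizeof(float));

  std::vector<int> ids{3, 3, 1};           // repeated ids accumulate
  std::vector<float> dout{1.f, 2.f, 5.f};
  ScatterAddInPlace(&table_grad, ids, dout);

  for (float v : table_grad) std::cout << v << " ";  // prints: 0 5 0 3
  std::cout << "\n";
  return 0;
}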
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <cmath>
#include <iostream>
#include <numeric>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(lookup_table_v2);
USE_OP_DEVICE_KERNEL(lookup_table_v2, NPU);
template <typename T>
void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
// init
auto ids = scope->Var("Ids");
auto out = scope->Var("Out");
auto w = scope->Var("W");
auto ids_t = ids->GetMutable<f::LoDTensor>();
auto out_t = out->GetMutable<f::LoDTensor>();
auto w_t = w->GetMutable<f::LoDTensor>();
int bsz = 10;
int dim = 32;
int seqlen = 8;
int vocab_size = 100;
TensorFromVector(std::vector<int64_t>(bsz * seqlen, 3), ctx, ids_t);
std::vector<T> val(vocab_size * dim, 10.);
TensorFromVector(val, ctx, w_t);
ids_t->Resize({bsz, seqlen});
w_t->Resize({vocab_size, dim});
out_t->Resize({bsz, seqlen, dim});
ctx.Wait();
auto place = ctx.GetPlace();
out_t->mutable_data<T>(place);
f::AttributeMap attrs = {{}};
auto op = f::OpRegistry::CreateOp("lookup_table_v2",
{{"W", {"W"}}, {"Ids", {"Ids"}}},
{{"Out", {"Out"}}}, attrs);
op->Run(*scope, place);
std::vector<T> out_v;
TensorToVector(*out_t, ctx, &out_v);
ctx.Wait();
EXPECT_EQ(out_t->numel(), bsz * seqlen * dim);
T res = std::accumulate(out_v.begin(), out_v.end(), 0.);
float eps = 1.e-6;
EXPECT_LT(fabs(res - bsz * seqlen * dim * 10.), eps);
}
template <typename T>
void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) {
// init
auto w = scope->Var("W");
auto ids = scope->Var("Ids");
auto out = scope->Var("DOut");
auto dw = scope->Var("DW");
auto w_t = w->GetMutable<f::LoDTensor>();
auto ids_t = ids->GetMutable<f::LoDTensor>();
auto out_t = out->GetMutable<f::LoDTensor>();
auto dw_t = dw->GetMutable<f::LoDTensor>();
int bsz = 2;
int dim = 2;
int seqlen = 2;
int vocab_size = 4;
std::vector<int64_t> val_int(bsz * seqlen, 3);
std::vector<T> val(vocab_size * dim, 0.);
std::vector<T> val_out(bsz * seqlen * dim, 1.);
TensorFromVector(val_int, ctx, ids_t);
TensorFromVector(val, ctx, w_t);
TensorFromVector(val, ctx, dw_t);
TensorFromVector(val_out, ctx, out_t);
w_t->Resize({vocab_size, dim});
ids_t->Resize({bsz, seqlen});
out_t->Resize({bsz, seqlen, dim});
dw_t->Resize({vocab_size, dim});
ctx.Wait();
auto place = ctx.GetPlace();
out_t->mutable_data<T>(place);
w_t->mutable_data<T>(place);
dw_t->mutable_data<T>(place);
f::AttributeMap attrs = {{}};
auto op = f::OpRegistry::CreateOp(
"lookup_table_v2_grad",
{{"Ids", {"Ids"}}, {"W", {"W"}}, {"Out@GRAD", {"DOut"}}},
{{"W@GRAD", {"DW"}}}, attrs);
op->Run(*scope, place);
ctx.Wait();
std::vector<T> w_v;
TensorToVector(*dw_t, ctx, &w_v);
ctx.Wait();
EXPECT_EQ(dw_t->numel(), vocab_size * dim);
T res = std::accumulate(w_v.begin(), w_v.end(), 0.);
float eps = 1.e-6;
EXPECT_LT(fabs(res - bsz * seqlen * dim), eps);
}
TEST(lookup_table_v2, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx);
}
TEST(lookup_table_v2_grad, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
CompareGrad<float>(&scope, ctx);
}
......@@ -68,10 +68,8 @@ class MeanGradNPUKernel : public framework::OpKernel<T> {
Tensor mean_tensor(grad->type());
mean_tensor.Resize({1});
mean_tensor.mutable_data<T>(context.GetPlace());
std::vector<float> mean_vec;
mean_vec.push_back(1.0 / static_cast<float>(IG->numel()));
framework::TensorFromVector(mean_vec, context.device_context(),
&mean_tensor);
FillNpuTensorWithConstant<T>(
&mean_tensor, static_cast<T>(1.0 / static_cast<float>(IG->numel())));
// means mul ones
Tensor mean_ma(grad->type());
......
......@@ -105,16 +105,18 @@ class MemcpyOpProtoMaker : public framework::OpProtoAndCheckerMaker {
"is the same as input X.");
AddAttr<int>("dst_place_type",
"Determine the dst place of tensor copy. "
"By Now it ONLY support CUDAPlace and CUDAPinnedPlace. Other "
"place type is Unimplemented and will cause ERROR."
"By Now it ONLY support CUDAPlace <-> CUDAPinnedPlace or "
"NPUPlace <-> CPUPlace. "
"Other place type is Unimplemented and will cause ERROR."
"0: dst is on CPUPlace. "
"1: dst is on CUDAPlace. "
"2: dst is on CUDAPinnedPlace. "
"3: dst is on XPUPlace. ");
"3: dst is on XPUPlace. "
"4: dst is on NPUPlace. ");
AddComment(R"DOC(
Memcpy Operator.
By now, it ONLY supports the memcopy between CUDAPinnedPlace and CUDAPlace,
and used as an internal op by Recompute-Offload.
By now, it ONLY supports the memcpy between CUDAPinnedPlace <-> CUDAPlace or
NPUPlace <-> CPUPlace, and is used as an internal op by Recompute-Offload.
You would have to update it if you want more capabilities.
Out = X, when type in [LoDTensor]
......@@ -146,3 +148,11 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double,
ops::MemcpyKernel, plat::float16,
ops::MemcpyKernel);
#endif
#ifdef PADDLE_WITH_ASCEND_CL
REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double,
ops::MemcpyKernel, int, ops::MemcpyKernel,
int64_t, ops::MemcpyKernel, bool,
ops::MemcpyKernel, plat::float16,
ops::MemcpyKernel);
#endif
......@@ -51,7 +51,17 @@ class MemcpyFunctor {
} else if (dst_place_type_ == 1) {
framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
&out_tensor);
} else {
}
#ifdef PADDLE_WITH_ASCEND_CL
else if (dst_place_type_ == 0) { // NOLINT
framework::TensorCopy(lod_tensor, platform::CPUPlace(), dev_ctx_,
&out_tensor);
} else if (dst_place_type_ == 4) {
framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
&out_tensor);
}
#endif
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"memcpy dst_place_type: %d is not supported yet.", dst_place_type_));
}
......
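The memcpy op docs above define dst_place_type as a small integer protocol (0: CPUPlace, 1: CUDAPlace, 2: CUDAPinnedPlace, 3: XPUPlace, 4: NPUPlace), and MemcpyFunctor dispatches on it. A compact sketch of that mapping, not part of this diff:

#include <iostream>
#include <stdexcept>
#include <string>

// Mirrors the dst_place_type attribute handled by MemcpyFunctor.
std::string DstPlaceName(int dst_place_type) {
  switch (dst_place_type) {
    case 0: return "CPUPlace";          // NPU -> CPU path added in this commit
    case 1: return "CUDAPlace";
    case 2: return "CUDAPinnedPlace";
    case 3: return "XPUPlace";
    case 4: return "NPUPlace";          // CPU -> NPU path added in this commit
    default:
      throw std::runtime_error("memcpy dst_place_type " +
                               std::to_string(dst_place_type) +
                               " is not supported yet.");
  }
}

int main() {
  std::cout << "4 -> " << DstPlaceName(4) << ", 0 -> " << DstPlaceName(0) << "\n";
  return 0;
}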
......@@ -64,8 +64,10 @@ aclFormat ConvertToNpuFormat(DataLayout layout) {
return iter->second;
}
aclrtStream GetCurrentNPUStream() {
int device_id = platform::GetCurrentNPUDeviceId();
aclrtStream GetCurrentNPUStream(int device_id) {
if (device_id == -1) {
device_id = platform::GetCurrentNPUDeviceId();
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *dev_ctx = static_cast<platform::NPUDeviceContext *>(
pool.Get(platform::NPUPlace(device_id)));
......@@ -299,5 +301,6 @@ void NpuOpRunner::Run(aclrtStream stream) {
VLOG(4) << "after aclopCompileAndExecute: " << ret;
PADDLE_ENFORCE_NPU_SUCCESS(ret);
}
} // namespace operators
} // namespace paddle
......@@ -86,6 +86,44 @@ class NpuOpRunner {
aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype);
aclrtStream GetCurrentNPUStream(int device_id = -1);
template <typename T>
void FillNpuTensorWithConstant(Tensor *tensor, T val) {
PADDLE_ENFORCE_EQ(
tensor->IsInitialized(), true,
platform::errors::InvalidArgument("The tensor should be initialized."));
PADDLE_ENFORCE_EQ(
platform::is_npu_place(tensor->place()), true,
platform::errors::InvalidArgument("The tensor should be on NPUPlace."));
// do async for better performance
if (typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) {
Tensor tmp(tensor->type());
tmp.Resize(tensor->dims());
tmp.mutable_data<T>(tensor->place());
auto stream = GetCurrentNPUStream(
BOOST_GET_CONST(platform::NPUPlace, tensor->place()).device);
platform::NPUMemsetAsync(tmp.data<void>(), 0, tmp.numel() * sizeof(T),
stream);
auto runner = NpuOpRunner("Power", {tmp}, {*tensor},
{{"power", static_cast<float>(1)},
{"scale", static_cast<float>(0)},
{"shift", static_cast<float>(val)}});
runner.Run(stream);
} else {
T *array = new T[tensor->numel()];
for (unsigned int i = 0; i < tensor->numel(); ++i) {
array[i] = static_cast<T>(val);
}
std::vector<T> vec(tensor->numel(), static_cast<T>(val));
// do sync copy
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
tensor->data<void>(), platform::CPUPlace(), array,
tensor->numel() * sizeof(T), nullptr);
delete[] array;
}
}
} // namespace operators
} // namespace paddle
#endif
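FillNpuTensorWithConstant above is the helper that replaces the TensorFromVector(std::vector<T>{v}, ...) pattern across this commit: for float/float16 it zeroes a scratch tensor with NPUMemsetAsync and runs the Power op with power=1, scale=0, shift=val, i.e. out = (0*x + val)^1 = val, so the fill stays asynchronous on the stream; for other dtypes it fills a host array and does one synchronous copy. A host-only sketch of that dispatch, not part of this diff:

#include <cstring>
#include <iostream>
#include <type_traits>
#include <vector>

// Simplified stand-in: "device memory" is just a host vector here.
template <typename T>
void FillWithConstant(std::vector<T> *tensor, T val) {
  if (std::is_floating_point<T>::value) {
    // Float path of the real helper: memset a scratch tensor to zero, then run
    // the Power op asynchronously so every element becomes val.
    for (auto &v : *tensor) v = val;
  } else {
    // Other dtypes: build the constant on the host and copy it synchronously.
    std::vector<T> host(tensor->size(), val);
    std::memcpy(tensor->data(), host.data(), host.size() * sizeof(T));
  }
}

int main() {
  std::vector<float> scale(1);
  FillWithConstant(&scale, 1.0f);  // e.g. the 1.0 divisor in check_finite_and_unscale
  std::vector<int> step(1);
  FillWithConstant(&step, 1);      // e.g. the +1 step in update_loss_scaling
  std::cout << scale[0] << " " << step[0] << "\n";
  return 0;
}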
......@@ -61,8 +61,23 @@ class AdamNPUKernel : public framework::OpKernel<T> {
param_out->mutable_data<T>(ctx.GetPlace());
mom1_out->mutable_data<T>(ctx.GetPlace());
mom2_out->mutable_data<T>(ctx.GetPlace());
beta1_pow_out->mutable_data<T>(ctx.GetPlace());
beta2_pow_out->mutable_data<T>(ctx.GetPlace());
// NOTE(zhiqiu): beta1_pow and beta2_pow may be on CPU and are not transformed to the device place.
if (beta1_pow->place() == platform::CPUPlace()) {
T beta1 = *beta1_pow->data<T>();
// `mutable_data` operation needs to be done after getting data
beta1_pow_out->mutable_data<T>(ctx.GetPlace());
FillNpuTensorWithConstant<T>(beta1_pow_out, beta1);
} else {
beta1_pow_out->mutable_data<T>(ctx.GetPlace());
}
if (beta2_pow->place() == platform::CPUPlace()) {
T beta2 = *beta2_pow->data<T>();
beta2_pow_out->mutable_data<T>(ctx.GetPlace());
FillNpuTensorWithConstant<T>(beta2_pow_out, beta2);
} else {
beta2_pow_out->mutable_data<T>(ctx.GetPlace());
}
T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
if (ctx.HasInput("Beta1Tensor")) {
......@@ -100,18 +115,15 @@ class AdamNPUKernel : public framework::OpKernel<T> {
// reshape
Tensor beta1_tensor(framework::proto::VarType::FP32);
beta1_tensor.mutable_data<float>({1}, ctx.GetPlace());
TensorFromVector(std::vector<T>{beta1}, ctx.device_context(),
&beta1_tensor);
beta1_tensor.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&beta1_tensor, beta1);
Tensor beta2_tensor(framework::proto::VarType::FP32);
beta2_tensor.mutable_data<float>({1}, ctx.GetPlace());
TensorFromVector(std::vector<T>{beta2}, ctx.device_context(),
&beta2_tensor);
beta2_tensor.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&beta2_tensor, beta2);
Tensor epsilon_tensor(framework::proto::VarType::FP32);
epsilon_tensor.mutable_data<T>({1}, ctx.GetPlace());
TensorFromVector(std::vector<T>{epsilon}, ctx.device_context(),
&epsilon_tensor);
FillNpuTensorWithConstant<T>(&epsilon_tensor, epsilon);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
......@@ -130,16 +142,19 @@ class AdamNPUKernel : public framework::OpKernel<T> {
// NOTE(zhiqiu): ApplyAdamD updates params inplace, so
// if param and param_out is not same, we need to do copy.
if (param_out->data<T>() != param->data<T>()) {
ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
framework::TensorCopySync(*param, ctx.GetPlace(), param_out);
framework::TensorCopy(
*param, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), param_out);
}
if (mom1_out->data<T>() != mom1->data<T>()) {
ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
framework::TensorCopySync(*mom1, ctx.GetPlace(), mom1_out);
framework::TensorCopy(
*mom1, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), mom1_out);
}
if (mom2_out->data<T>() != mom2->data<T>()) {
ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
framework::TensorCopySync(*mom2, ctx.GetPlace(), mom2_out);
framework::TensorCopy(
*mom2, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), mom2_out);
}
auto runner_m1 =
NpuOpRunner("Mul", {*beta1_pow, beta1_tensor}, {*beta1_pow_out}, {});
......
......@@ -44,8 +44,9 @@ class SGDNPUKernel : public framework::OpKernel<T> {
// NOTE(zhiqiu): ApplyGradientDescent updates params inplace, so
// if param and param_out is not same, we need to do copy.
if (param_out->data<T>() != param_var->data<T>()) {
ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
framework::TensorCopySync(*param_var, ctx.GetPlace(), param_out);
framework::TensorCopy(
*param_var, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), param_out);
}
}
};
......
......@@ -39,11 +39,23 @@ class RangeNPUKernel : public framework::OpKernel<T> {
auto* out = context.Output<framework::Tensor>("Out");
framework::Tensor n;
framework::TensorCopySync(*start_t, platform::CPUPlace(), &n);
framework::TensorCopy(
*start_t, platform::CPUPlace(),
context.template device_context<platform::DeviceContext>(), &n);
context.template device_context<paddle::platform::NPUDeviceContext>()
.Wait();
T start = n.data<T>()[0];
framework::TensorCopySync(*end_t, platform::CPUPlace(), &n);
framework::TensorCopy(
*end_t, platform::CPUPlace(),
context.template device_context<platform::DeviceContext>(), &n);
context.template device_context<paddle::platform::NPUDeviceContext>()
.Wait();
T end = n.data<T>()[0];
framework::TensorCopySync(*step_t, platform::CPUPlace(), &n);
framework::TensorCopy(
*step_t, platform::CPUPlace(),
context.template device_context<platform::DeviceContext>(), &n);
context.template device_context<paddle::platform::NPUDeviceContext>()
.Wait();
T step = n.data<T>()[0];
int64_t size = 0;
......
......@@ -87,6 +87,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
TEST(range, NPU) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<int>(&scope, ctx, "range");
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<int>(&scope, *ctx, "range");
}
......@@ -53,9 +53,25 @@ BufferedReader::BufferedReader(
stream_ = platform::CudaStreamResourcePool::Instance().New(dev_idx);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(place_)) {
int dev_idx = BOOST_GET_CONST(platform::NPUPlace, place_).device;
compute_stream_ =
((platform::NPUDeviceContext *)(platform::DeviceContextPool::Instance()
.Get(place_)))
->stream();
events_.resize(buffer_size);
for (auto &event : events_) {
event = platform::NpuEventResourcePool::Instance().New(dev_idx);
}
stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx);
}
#endif
is_same_place_ = false;
cpu_buffer_.resize(buffer_size);
cuda_buffer_.resize(buffer_size);
npu_buffer_.resize(buffer_size);
ReadTillBufferFullAsync();
}
......@@ -196,7 +212,59 @@ void BufferedReader::ReadAsync(size_t i) {
#endif
}
}
#endif // @} End Group GPU Place
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(place_)) {
TensorVec &npu = npu_buffer_[i];
if (npu.empty()) {
npu.resize(cpu.size());
} else {
PADDLE_ENFORCE_EQ(
npu.size(), cpu.size(),
platform::errors::InvalidArgument(
"Input tensor number on NPU and CPU devices are not matched. "
"The number on NPU is %d, on CPU is %d",
npu.size(), cpu.size()));
}
std::vector<void *> npu_ptrs;
npu_ptrs.reserve(cpu.size());
for (size_t i = 0; i < cpu.size(); ++i) {
npu[i].Resize(cpu[i].dims());
npu[i].set_layout(cpu[i].layout());
npu_ptrs.emplace_back(npu[i].mutable_data(place_, cpu[i].type()));
}
platform::SetNPUDeviceId(
BOOST_GET_CONST(platform::NPUPlace, place_).device);
PADDLE_ENFORCE_NPU_SUCCESS(
aclrtRecordEvent(events_[i].get(), compute_stream_));
PADDLE_ENFORCE_NPU_SUCCESS(
aclrtStreamWaitEvent(stream_.get(), events_[i].get()));
platform::RecordEvent record_event("BufferedReader:MemoryCopy");
for (size_t i = 0; i < cpu.size(); ++i) {
auto cpu_place = cpu[i].place();
auto cpu_ptr = cpu[i].data<void>();
auto npu_ptr = npu_ptrs[i];
auto size =
cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
if ((platform::is_npu_place(cpu_place))) {
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place_), npu_ptr,
BOOST_GET_CONST(platform::NPUPlace, cpu_place), cpu_ptr,
size, stream_.get());
} else {
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place_), npu_ptr,
BOOST_GET_CONST(platform::CPUPlace, cpu_place), cpu_ptr,
size, stream_.get());
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream_.get()));
}
npu[i].set_lod(cpu[i].lod());
}
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream_.get()));
}
#endif
return i;
}));
}
......@@ -228,9 +296,13 @@ void BufferedReader::ReadNextImpl(std::vector<framework::LoDTensor> *out) {
return;
}
*out = std::move((platform::is_gpu_place(place_) && !is_same_place_)
? cuda_buffer_[i]
: cpu_buffer_[i]);
if (platform::is_gpu_place(place_) && !is_same_place_) {
*out = std::move(cuda_buffer_[i]);
} else if (platform::is_npu_place(place_) && !is_same_place_) {
*out = std::move(npu_buffer_[i]);
} else {
*out = std::move(cpu_buffer_[i]);
}
// Do not push current position into ReadAsync. Push the previous position
// Since all computation in fluid are async, change the data of
......
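The BufferedReader changes wire the NPU dataloader path the same way as the CUDA one: record an event on the compute stream, make the copy stream wait on that event so host-to-device copies cannot overtake pending kernels, issue the async copies, then synchronize the copy stream before handing the batch out. A host-only sketch of that event/stream ordering, not part of this diff, with a promise and std::async standing in for the ACL event and the two NPU streams:

#include <future>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> batch(8, 0);

  // "Compute stream": pretend some kernels are still using the buffers.
  std::promise<void> compute_done;                 // aclrtRecordEvent on compute_stream_
  std::shared_future<void> event(compute_done.get_future());

  // "Copy stream": must wait on the event before copying the next batch in.
  auto copy_stream = std::async(std::launch::async, [&] {
    event.wait();                                  // aclrtStreamWaitEvent(stream_, event)
    for (size_t i = 0; i < batch.size(); ++i) {    // memory::Copy CPU -> NPU on stream_
      batch[i] = static_cast<int>(i);
    }
  });

  // Compute stream finishes its queued work, then the recorded event fires.
  compute_done.set_value();

  copy_stream.wait();                              // aclrtSynchronizeStream(stream_)
  std::cout << "batch ready, last element = " << batch.back() << "\n";
  return 0;
}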
......@@ -25,7 +25,10 @@
#include "paddle/fluid/platform/cuda_resource_pool.h"
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/npu_info.h"
#include "paddle/fluid/platform/npu_resource_pool.h"
#endif
namespace paddle {
namespace operators {
namespace reader {
......@@ -67,12 +70,19 @@ class BufferedReader : public framework::DecoratedReader {
bool is_same_place_;
std::vector<TensorVec> cpu_buffer_;
std::vector<TensorVec> cuda_buffer_;
std::vector<TensorVec> npu_buffer_;
size_t prev_pos_{-1UL};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
gpuStream_t compute_stream_;
std::shared_ptr<platform::CudaStreamObject> stream_;
std::vector<std::shared_ptr<platform::CudaEventObject>> events_;
#endif
#ifdef PADDLE_WITH_ASCEND_CL
aclrtStream compute_stream_;
std::shared_ptr<platform::NpuStreamObject> stream_;
std::vector<std::shared_ptr<platform::NpuEventObject>> events_;
#endif
};
} // namespace reader
......
......@@ -78,6 +78,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
TEST(reduce_any, NPU) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<bool>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<bool>(&scope, *ctx);
}
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/save_combine_op.h"
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
save_combine,
ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, float>,
ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, double>,
ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, int>,
ops::SaveCombineOpKernel<paddle::platform::NPUDeviceContext, int64_t>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/save_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
save, ops::SaveOpKernel<paddle::platform::NPUDeviceContext, float>,
ops::SaveOpKernel<paddle::platform::NPUDeviceContext, double>,
ops::SaveOpKernel<paddle::platform::NPUDeviceContext, int>,
ops::SaveOpKernel<paddle::platform::NPUDeviceContext, uint8_t>,
ops::SaveOpKernel<paddle::platform::NPUDeviceContext, int8_t>,
ops::SaveOpKernel<paddle::platform::NPUDeviceContext, int64_t>,
ops::SaveOpKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
......@@ -159,12 +159,12 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) {
TEST(softmax, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx);
}
TEST(softmax_grad, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
CompareGrad<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
CompareGrad<float>(&scope, *ctx);
}
......@@ -67,12 +67,10 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel<T> {
// on and off
Tensor on_tensor(framework::proto::VarType::INT32);
on_tensor.mutable_data<int>({1}, ctx.GetPlace());
TensorFromVector(std::vector<int>{static_cast<int>(1)},
ctx.device_context(), &on_tensor);
FillNpuTensorWithConstant<int>(&on_tensor, static_cast<int>(1));
Tensor off_tensor(framework::proto::VarType::INT32);
off_tensor.mutable_data<int>({1}, ctx.GetPlace());
TensorFromVector(std::vector<int>{static_cast<int>(0)},
ctx.device_context(), &off_tensor);
FillNpuTensorWithConstant<int>(&off_tensor, static_cast<int>(0));
// one_hot
Tensor tmp_onehot(on_tensor.type());
......@@ -142,12 +140,10 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel<T> {
// on and off
Tensor on_tensor(framework::proto::VarType::INT32);
on_tensor.mutable_data<int>({1}, ctx.GetPlace());
TensorFromVector(std::vector<int>{static_cast<int>(1)},
ctx.device_context(), &on_tensor);
FillNpuTensorWithConstant<int>(&on_tensor, static_cast<int>(1));
Tensor off_tensor(framework::proto::VarType::INT32);
off_tensor.mutable_data<int>({1}, ctx.GetPlace());
TensorFromVector(std::vector<int>{static_cast<int>(0)},
ctx.device_context(), &off_tensor);
FillNpuTensorWithConstant<int>(&off_tensor, static_cast<int>(0));
// one_hot
Tensor tmp_onehot(on_tensor.type());
......
......@@ -85,6 +85,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
TEST(squeeze, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx);
}
......@@ -125,6 +125,11 @@ void TensorFormatter::FormatData(const framework::LoDTensor& print_tensor,
framework::LoDTensor cpu_tensor;
platform::CPUPlace cpu_place;
TensorCopy(print_tensor, cpu_place, &cpu_tensor);
#ifdef PADDLE_WITH_ASCEND_CL
if (platform::is_npu_place(print_tensor.place())) {
platform::DeviceContextPool::Instance().Get(print_tensor.place())->Wait();
}
#endif
data = cpu_tensor.data<T>();
}
......
......@@ -126,12 +126,12 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) {
TEST(transpose2, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx);
}
TEST(transpose2_grad, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
CompareGrad<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
CompareGrad<float>(&scope, *ctx);
}
......@@ -35,28 +35,24 @@ class TruncatedGaussianRandomNPUKernel : public framework::OpKernel<T> {
float mean = ctx.Attr<float>("mean");
Tensor mean_tensor(framework::proto::VarType::FP32);
mean_tensor.mutable_data<float>({1}, ctx.GetPlace());
TensorFromVector(std::vector<float>{mean}, ctx.device_context(),
&mean_tensor);
FillNpuTensorWithConstant<float>(&mean_tensor, mean);
float std = ctx.Attr<float>("std");
Tensor std_tensor(framework::proto::VarType::FP32);
std_tensor.mutable_data<float>({1}, ctx.GetPlace());
TensorFromVector(std::vector<float>{std}, ctx.device_context(),
&std_tensor);
FillNpuTensorWithConstant<float>(&std_tensor, std);
int32_t seed_var = ctx.Attr<int32_t>("seed");
Tensor min_tensor(framework::proto::VarType::FP32);
min_tensor.mutable_data<float>({1}, ctx.GetPlace());
float min_value = mean - std * 2.0;
TensorFromVector(std::vector<float>{min_value}, ctx.device_context(),
&min_tensor);
FillNpuTensorWithConstant<float>(&min_tensor, min_value);
Tensor max_tensor(framework::proto::VarType::FP32);
max_tensor.mutable_data<float>({1}, ctx.GetPlace());
float max_value = mean + std * 2.0;
TensorFromVector(std::vector<float>{max_value}, ctx.device_context(),
&max_tensor);
FillNpuTensorWithConstant<float>(&max_tensor, max_value);
auto* out = ctx.Output<framework::Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
......
......@@ -85,6 +85,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
TEST(unsqueeze, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx);
}
......@@ -143,6 +143,11 @@ if(WITH_GPU OR WITH_ROCM)
target_link_libraries(device_context cuda_resource_pool)
endif()
if(WITH_ASCEND_CL)
cc_library(npu_resource_pool SRCS npu_resource_pool.cc DEPS npu_info)
target_link_libraries(device_context npu_resource_pool)
endif()
cc_test(init_test SRCS init_test.cc DEPS device_context)
if(WITH_GPU)
......
......@@ -16,8 +16,8 @@ limitations under the License. */
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "glog/logging.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace memory {
......@@ -254,8 +254,9 @@ NPUDeviceContext::~NPUDeviceContext() {
}
void NPUDeviceContext::Wait() const {
NPUDeviceGuard guard(place_.device);
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
platform::RecordEvent record_event("NPUDeviceContext/wait");
VLOG(4) << "NPU context(" << this << ") Wait";
stream_->Wait();
}
aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }
......
......@@ -197,6 +197,13 @@ class NPUDeviceContext : public DeviceContext {
void set_hccl_context(HCCLContext_t context) { hccl_context_ = context; }
#endif
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
return stream_->AddCallback(callback);
}
void WaitStreamCallback() const { return stream_->WaitCallback(); }
private:
NPUPlace place_;
aclrtContext context_;
......
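The two methods added above hand a host callback to the device context's NPU stream and then block until it has run. A minimal sketch of how a caller might use them (an illustrative helper, not part of this patch; note the TODO in stream_callback_manager further below about aclrtLaunchCallback not yet succeeding):
#include "glog/logging.h"
#include "paddle/fluid/platform/device_context.h"
// Illustrative only: enqueue a host callback on the NPU compute stream of
// `place` and wait for it to execute.
void SyncViaStreamCallback(const paddle::platform::NPUPlace& place) {
  auto& pool = paddle::platform::DeviceContextPool::Instance();
  auto* ctx =
      static_cast<paddle::platform::NPUDeviceContext*>(pool.Get(place));
  ctx->AddStreamCallback([] { VLOG(3) << "NPU stream drained"; });
  ctx->WaitStreamCallback();  // returns after the callback has run
}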
......@@ -587,6 +587,8 @@ class DeviceTracerImpl : public DeviceTracer {
BOOST_GET_CONST(platform::CUDAPlace, r.place).GetDeviceId());
} else if (platform::is_cuda_pinned_place(r.place)) {
event->set_place(proto::MemEvent::CUDAPinnedPlace);
} else if (platform::is_npu_place(r.place)) {
event->set_place(proto::MemEvent::NPUPlace);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"The current place is not supported."));
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "acl/acl_prof.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_ASCEND_STRING
// For CANN 20.2+
// ACL_AICORE_ARITHMETIC_UTILIZATION = 0, record arithmetic stats
// ACL_AICORE_PIPE_UTILIZATION = 1, record pipeline
// ACL_AICORE_MEMORY_BANDWIDTH = 2, record memory
// ACL_AICORE_L0B_AND_WIDTH = 3, record internal memory
// ACL_AICORE_RESOURCE_CONFLICT_RATIO = 5, record pipeline ratio
constexpr aclprofAicoreMetrics default_metrics =
ACL_AICORE_ARITHMETIC_UTILIZATION;
#else
// For CANN 20.1
// ACL_AICORE_ARITHMATIC_THROUGHPUT = 0, record arithmetic stats
// ACL_AICORE_PIPELINE = 1, record pipeline
// ACL_AICORE_SYNCHRONIZATION = 2, record sync
// ACL_AICORE_MEMORY = 3, record memory
// ACL_AICORE_INTERNAL_MEMORY = 4, record internal memory
// ACL_AICORE_STALL = 5, record pipeline ratio
constexpr aclprofAicoreMetrics default_metrics =
ACL_AICORE_ARITHMATIC_THROUGHPUT;
#endif
// ACL_PROF_ACL_API, record ACL API stats
// ACL_PROF_TASK_TIME, record AI core stats
// ACL_PROF_AICORE_METRICS, must be included when collecting AI Core metrics
// ACL_PROF_AICPU_TRACE, record AICPU, not supported yet
constexpr uint64_t default_type =
ACL_PROF_ACL_API | ACL_PROF_AICORE_METRICS | ACL_PROF_TASK_TIME;
aclprofConfig *NPUProfilerCreateConfig(
std::vector<uint32_t> devices = {},
aclprofAicoreMetrics metrics = default_metrics, uint64_t c = default_type,
aclprofAicoreEvents *events = nullptr) {
if (devices.size() == 0) {
int device_id = GetCurrentNPUDeviceId();
devices.emplace_back(device_id);
}
aclprofConfig *config =
aclprofCreateConfig(devices.data(), devices.size(), metrics, events, c);
PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::External(
"Failed to create prof config for NPU"));
return config;
}
void NPUProfilerDestroyConfig(const aclprofConfig *config) {
PADDLE_ENFORCE_NPU_SUCCESS(aclprofDestroyConfig(config));
}
void NPUProfilerInit(std::string output_path) {
PADDLE_ENFORCE_NPU_SUCCESS(
aclprofInit(output_path.c_str(), output_path.size()));
}
void NPUProfilerStart(const aclprofConfig *config) {
if (config == nullptr) {
// NOTE(zhiqiu): support single device by default.
int device_id = GetCurrentNPUDeviceId();
std::vector<uint32_t> devices = {static_cast<uint32_t>(device_id)};
config = NPUProfilerCreateConfig(devices);
}
PADDLE_ENFORCE_NPU_SUCCESS(aclprofStart(config));
}
void NPUProfilerStop(const aclprofConfig *config) {
PADDLE_ENFORCE_NPU_SUCCESS(aclprofStop(config));
NPUProfilerDestroyConfig(config);
}
void NPUProfilerFinalize() { PADDLE_ENFORCE_NPU_SUCCESS(aclprofFinalize()); }
struct NPUProfConfigWrapper {
aclprofConfig *p_;
explicit NPUProfConfigWrapper(aclprofConfig *p) : p_(p) {}
aclprofConfig *ptr() { return p_; }
};
} // namespace platform
} // namespace paddle
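The helpers above are exposed to Python further below (core_noavx bindings and paddle.fluid.profiler.npu_profiler); on the C++ side the intended call order is init, create config, start, run the workload, stop, finalize. A minimal sketch, assuming an Ascend build (the output path and helper name are placeholders):
#include "paddle/fluid/platform/npu_profiler.h"
// Illustrative only: profile whatever NPU work happens between Start and Stop.
void ProfileNpuWorkload() {
  namespace platform = paddle::platform;
  platform::NPUProfilerInit("/tmp/npu_profiling");     // placeholder output dir
  auto* config = platform::NPUProfilerCreateConfig();  // current device, default metrics
  platform::NPUProfilerStart(config);
  // ... launch NPU ops here ...
  platform::NPUProfilerStop(config);                   // also destroys the config
  platform::NPUProfilerFinalize();
}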
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/npu_resource_pool.h"
#include "paddle/fluid/platform/npu_info.h"
namespace paddle {
namespace platform {
NpuStreamResourcePool::NpuStreamResourcePool() {
int dev_cnt = platform::GetNPUDeviceCount();
pool_.reserve(dev_cnt);
for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
auto creator = [dev_idx] {
platform::SetNPUDeviceId(dev_idx);
aclrtStream stream;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateStream(&stream));
return stream;
};
auto deleter = [dev_idx](aclrtStream stream) {
platform::SetNPUDeviceId(dev_idx);
PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyStream(stream));
};
pool_.emplace_back(ResourcePool<NpuStreamObject>::Create(creator, deleter));
}
}
NpuStreamResourcePool& NpuStreamResourcePool::Instance() {
static NpuStreamResourcePool pool;
return pool;
}
std::shared_ptr<NpuStreamObject> NpuStreamResourcePool::New(int dev_idx) {
PADDLE_ENFORCE_GE(
dev_idx, 0,
platform::errors::InvalidArgument(
"The dev_idx should be not less than 0, but got %d.", dev_idx));
PADDLE_ENFORCE_LT(
dev_idx, pool_.size(),
platform::errors::OutOfRange(
"The dev_idx should be less than device count %d, but got %d.",
pool_.size(), dev_idx));
return pool_[dev_idx]->New();
}
NpuEventResourcePool::NpuEventResourcePool() {
int dev_cnt = platform::GetNPUDeviceCount();
pool_.reserve(dev_cnt);
for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
auto creator = [dev_idx] {
platform::SetNPUDeviceId(dev_idx);
aclrtEvent event;
PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateEvent(&event));
return event;
};
auto deleter = [dev_idx](aclrtEvent event) {
platform::SetNPUDeviceId(dev_idx);
PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event));
};
pool_.emplace_back(ResourcePool<NpuEventObject>::Create(creator, deleter));
}
}
NpuEventResourcePool& NpuEventResourcePool::Instance() {
static NpuEventResourcePool pool;
return pool;
}
std::shared_ptr<NpuEventObject> NpuEventResourcePool::New(int dev_idx) {
PADDLE_ENFORCE_GE(
dev_idx, 0,
platform::errors::InvalidArgument(
"The dev_idx should be not less than 0, but got %d.", dev_idx));
PADDLE_ENFORCE_LT(
dev_idx, pool_.size(),
platform::errors::OutOfRange(
"The dev_idx should be less than device count %d, but got %d.",
pool_.size(), dev_idx));
return pool_[dev_idx]->New();
}
} // namespace platform
} // namespace paddle
#endif
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <type_traits>
#include <vector>
#include "acl/acl.h"
#include "paddle/fluid/platform/resource_pool.h"
namespace paddle {
namespace platform {
using NpuStreamObject = std::remove_pointer<aclrtStream>::type;
using NpuEventObject = std::remove_pointer<aclrtEvent>::type;
class NpuStreamResourcePool {
public:
std::shared_ptr<NpuStreamObject> New(int dev_idx);
static NpuStreamResourcePool &Instance();
private:
NpuStreamResourcePool();
DISABLE_COPY_AND_ASSIGN(NpuStreamResourcePool);
private:
std::vector<std::shared_ptr<ResourcePool<NpuStreamObject>>> pool_;
};
class NpuEventResourcePool {
public:
std::shared_ptr<NpuEventObject> New(int dev_idx);
static NpuEventResourcePool &Instance();
private:
NpuEventResourcePool();
DISABLE_COPY_AND_ASSIGN(NpuEventResourcePool);
private:
std::vector<std::shared_ptr<ResourcePool<NpuEventObject>>> pool_;
};
} // namespace platform
} // namespace paddle
#endif
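These pools mirror the existing CUDA stream/event pools and back the new stream_/events_ members of BufferedReader shown earlier. A minimal sketch of the borrow/wait pattern (an illustrative helper, assuming an Ascend build; aclrtRecordEvent is the standard ACL runtime call, not part of this patch):
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/npu_resource_pool.h"
// Illustrative only: borrow a stream and an event for device `dev_idx`, make
// the borrowed stream wait for work already recorded on `compute_stream`,
// then synchronize. The shared_ptrs release both objects when they go out
// of scope.
void WaitOnBorrowedStream(int dev_idx, aclrtStream compute_stream) {
  namespace platform = paddle::platform;
  auto stream = platform::NpuStreamResourcePool::Instance().New(dev_idx);
  auto event = platform::NpuEventResourcePool::Instance().New(dev_idx);
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtRecordEvent(event.get(), compute_stream));
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtStreamWaitEvent(stream.get(), event.get()));
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream.get()));
}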
......@@ -21,6 +21,7 @@ message Event {
enum EventType {
CPU = 0;
GPUKernel = 1;
NPUKernel = 2;
}
optional EventType type = 8;
optional string name = 1;
......@@ -39,6 +40,8 @@ message MemEvent {
CUDAPlace = 0;
CPUPlace = 1;
CUDAPinnedPlace = 2;
XPUPlace = 3;
NPUPlace = 4;
}
optional uint64 start_ns = 1;
optional uint64 end_ns = 2;
......
......@@ -71,6 +71,8 @@ void StreamCallbackManager<Stream>::AddCallback(
#endif
#if PADDLE_WITH_ASCEND_CL
VLOG(3) << "aclrtLaunchCallback at stream: " << stream_;
// TODO(zhiqiu): failed to call aclrtLaunchCallback
PADDLE_ENFORCE_NPU_SUCCESS(aclrtLaunchCallback(StreamCallbackFunc, func,
ACL_CALLBACK_BLOCK, stream_));
#endif
......
......@@ -109,6 +109,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/npu_info.h"
#include "paddle/fluid/platform/npu_profiler.h"
#endif
#ifdef PADDLE_WITH_XPU
......@@ -581,11 +582,6 @@ PYBIND11_MODULE(core_noavx, m) {
make_ddim(x_dim), make_ddim(y_dim), -1));
});
#ifdef PADDLE_WITH_ASCEND_CL
m.def("_npu_finalize",
[]() { platform::AclInstance::Instance().Finalize(); });
#endif
m.def(
"_append_python_callable_object_and_return_id",
[](py::object py_obj) -> size_t {
......@@ -1744,7 +1740,7 @@ All parameter, weight, gradient are variables in Paddle.
"Cannot use NPU because you have installed CPU/GPU version "
"PaddlePaddle.\n"
"If you want to use NPU, please try to install NPU version "
"PaddlePaddle by: pip install paddlepaddle-xpu\n"
"PaddlePaddle by: pip install paddlepaddle-npu\n"
"If you only have CPU, please change NPUPlace(%d) to be "
"CPUPlace().\n",
dev_id);
......@@ -2180,6 +2176,31 @@ All parameter, weight, gradient are variables in Paddle.
#endif
#endif
#ifdef PADDLE_WITH_ASCEND_CL
m.def("get_npu_device_count", platform::GetNPUDeviceCount);
m.def("_npu_finalize", []() {
platform::AclInstance::Instance().Finalize();
}); // private interface
py::class_<platform::NPUProfConfigWrapper>(m, "NPUProfConfigWrapper");
m.def("npu_prof_init", platform::NPUProfilerInit);
m.def("npu_prof_start", [](platform::NPUProfConfigWrapper c) {
platform::NPUProfilerStart(c.ptr());
});
m.def("npu_prof_stop", [](platform::NPUProfConfigWrapper c) {
platform::NPUProfilerStop(c.ptr());
});
m.def("npu_prof_finalize", platform::NPUProfilerFinalize);
m.def("npu_prof_create_config", []() {
return platform::NPUProfConfigWrapper(platform::NPUProfilerCreateConfig());
});
m.def("npu_prof_destropy_config", [](platform::NPUProfConfigWrapper c) {
platform::NPUProfilerDestroyConfig(c.ptr());
});
#endif
py::enum_<platform::TracerOption>(m, "TracerOption", py::arithmetic())
.value("kDefault", platform::TracerOption::kDefault)
.value("kOpDetail", platform::TracerOption::kOpDetail)
......
......@@ -663,6 +663,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
}
bool is_gpu_tensor = platform::is_gpu_place(tensor.place());
bool is_xpu_tensor = platform::is_xpu_place(tensor.place());
bool is_npu_tensor = platform::is_npu_place(tensor.place());
const auto &tensor_dims = tensor.dims();
auto tensor_dtype = tensor.type();
size_t sizeof_dtype = framework::SizeOfType(tensor_dtype);
......@@ -681,7 +682,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(tensor.type());
if (!is_gpu_tensor && !is_xpu_tensor) {
if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor) {
if (!need_deep_copy) {
auto base = py::cast(std::move(tensor));
return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides,
......@@ -749,6 +750,34 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use CUDAPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support."));
#endif
} else if (is_npu_tensor) {
#ifdef PADDLE_WITH_ASCEND_CL
py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
platform::errors::InvalidArgument(
"PyArray is not writable, in which case memory leak "
"or double free would occur"));
PADDLE_ENFORCE_EQ(
py_arr.owndata(), true,
platform::errors::InvalidArgument(
"PyArray does not own data, in which case memory leak "
"or double free would occur"));
size_t copy_bytes = sizeof_dtype * numel;
auto p = BOOST_GET_CONST(platform::NPUPlace, tensor.place());
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &ctx = *pool.Get(tensor.place());
paddle::memory::Copy(
platform::CPUPlace(), py_arr.mutable_data(), p, tensor_buf_ptr,
copy_bytes,
reinterpret_cast<const platform::NPUDeviceContext &>(ctx).stream());
ctx.Wait();
return py_arr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use NPUPlace in CPU/GPU/XPU version, "
"Please recompile or reinstall Paddle with NPU support."));
#endif
}
PADDLE_THROW(platform::errors::Unimplemented("Place is not supported"));
......
......@@ -2041,6 +2041,10 @@ def load(program, model_path, executor=None, var_list=None):
p = paddle.fluid.core.Place()
p.set_place(t._place())
place = paddle.fluid.XPUPlace(p.xpu_device_id())
elif p.is_npu_place():
p = paddle.fluid.core.Place()
p.set_place(t._place())
place = paddle.fluid.NPUPlace(p.npu_device_id())
else:
p = paddle.fluid.core.Place()
p.set_place(t._place())
......@@ -2335,6 +2339,10 @@ def set_program_state(program, state_dict):
p = paddle.fluid.core.Place()
p.set_place(ten_place)
py_place = paddle.fluid.XPUPlace(p.xpu_device_id())
elif ten_place.is_npu_place():
p = paddle.fluid.core.Place()
p.set_place(ten_place)
py_place = paddle.fluid.NPUPlace(p.npu_device_id())
ten.set(new_para_np, py_place)
......
......@@ -106,6 +106,65 @@ def cuda_profiler(output_file, output_mode=None, config=None):
os.remove(config_file)
@signature_safe_contextmanager
def npu_profiler(output_file, config=None):
"""
The NPU profiler.
This function is used to profile NPU programs through the NPU runtime application
programming interface. The profiling result will be written into
`output_file`. Users can set the NPU profiling config via the `config` argument.
After getting the profiling result file, users can use
`tools provided by Ascend <https://support.huaweicloud.com/tg-Inference-cann/atlasprofiling_16_0006.html>`_
to load this output file to visualize results.
Args:
output_file (str) : The output file name; the result will be
written into this file. It should be an absolute path.
config (list<str>, optional) : NPU profile config. For more details, please
refer to `User Guide <https://support.huaweicloud.com/tg-Inference-cann/atlasprofiling_16_0006.html>`_ .
Examples:
.. code-block:: python
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
import numpy as np
epoc = 8
dshape = [4, 3, 28, 28]
data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32')
conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
place = fluid.NPUPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
output_file = 'npu.txt'
with profiler.npu_profiler(output_file) as npu_prof:
for i in range(epoc):
input = np.random.random(dshape).astype('float32')
exe.run(fluid.default_main_program(), feed={'data': input})
# then use NPU profiler tools to load this output file
# to visualize results.
"""
# TODO: support config in python.
if not config:
config = core.npu_prof_create_config()
core.npu_prof_init(output_file)
# Enables profiler collection by the active NPU profiling tool.
core.npu_prof_start(config)
try:
yield
# Disables profiler collection.
finally:
core.npu_prof_stop(config)
core.npu_prof_finalize()
def reset_profiler():
"""
Clear the previous time record. This interface does not work for
......
......@@ -14,6 +14,8 @@
import unittest
import numpy as np
import sys
sys.path.append("..")
from op_test import OpTest, skip_check_grad_ci
import paddle
import paddle.fluid as fluid
......
......@@ -73,5 +73,27 @@ class TestCast2(OpTest):
self.check_output_with_place(self.place, check_dygraph=False, atol=1e-3)
class TestCast3(OpTest):
def setUp(self):
self.set_npu()
self.op_type = "cast"
self.place = paddle.NPUPlace(0)
ipt = np.random.random(size=[10, 10]) + 1
self.inputs = {'X': ipt.astype('int32')}
self.outputs = {'Out': ipt.astype('int32')}
self.attrs = {
'in_dtype': int(core.VarDesc.VarType.INT32),
'out_dtype': int(core.VarDesc.VarType.INT32)
}
def set_npu(self):
self.__class__.use_npu = True
def test_check_output(self):
self.check_output_with_place(self.place, check_dygraph=False, atol=1e-3)
if __name__ == '__main__':
unittest.main()
......@@ -26,7 +26,7 @@ from paddle.fluid import core
paddle.enable_static()
SEED = 2021
NPUPlace = 5
NPUPlace = 0
@unittest.skipIf(not paddle.is_compiled_with_npu(),
......
......@@ -41,7 +41,7 @@ class TestLookupTableV2(OpTest):
vocab = 10
dim = 20
w = np.ones([vocab, dim]).astype(self.dtype)
x = np.random.randint(0, vocab, size=(bsz, seqlen)).astype(np.int64)
x = np.random.randint(0, vocab, size=(bsz, seqlen)).astype(np.int32)
out = np.ones([bsz, seqlen, dim]).astype(self.dtype)
self.inputs = {
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import unittest
import sys
sys.path.append("..")
from op_test import OpTest
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import compiler, Program, program_guard
paddle.enable_static()
SEED = 2021
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestMemcpy_FillConstant(unittest.TestCase):
def get_prog(self):
paddle.enable_static()
main_program = Program()
with program_guard(main_program):
cpu_var_name = "tensor@Cpu"
npu_var_name = "tensor@Npu"
cpu_var = main_program.global_block().create_var(
name=cpu_var_name,
shape=[10, 10],
dtype='float32',
persistable=False,
stop_gradient=True)
npu_var = main_program.global_block().create_var(
name=npu_var_name,
shape=[10, 10],
dtype='float32',
persistable=False,
stop_gradient=True)
main_program.global_block().append_op(
type="fill_constant",
outputs={"Out": npu_var_name},
attrs={
"shape": [10, 10],
"dtype": npu_var.dtype,
"value": 1.0,
"place_type": 1
})
main_program.global_block().append_op(
type="fill_constant",
outputs={"Out": cpu_var_name},
attrs={
"shape": [10, 10],
"dtype": cpu_var.dtype,
"value": 0.0,
"place_type": 2
})
return main_program, npu_var, cpu_var
def test_npu_copy_to_cpu(self):
main_program, npu_var, cpu_var = self.get_prog()
main_program.global_block().append_op(
type='memcpy',
inputs={'X': npu_var},
outputs={'Out': cpu_var},
attrs={'dst_place_type': 0})
place = fluid.NPUPlace(0)
exe = fluid.Executor(place)
npu_, cpu_ = exe.run(main_program,
feed={},
fetch_list=[npu_var.name, cpu_var.name])
self.assertTrue(np.allclose(npu_, cpu_))
self.assertTrue(np.allclose(cpu_, np.ones((10, 10))))
def test_cpu_copy_to_npu(self):
main_program, npu_var, cpu_var = self.get_prog()
main_program.global_block().append_op(
type='memcpy',
inputs={'X': cpu_var},
outputs={'Out': npu_var},
attrs={'dst_place_type': 4})
place = fluid.NPUPlace(0)
exe = fluid.Executor(place)
npu_, cpu_ = exe.run(main_program,
feed={},
fetch_list=[npu_var.name, cpu_var.name])
self.assertTrue(np.allclose(npu_, cpu_))
self.assertTrue(np.allclose(npu_, np.zeros((10, 10))))
if __name__ == '__main__':
unittest.main()
......@@ -248,8 +248,9 @@ class TestMulNet3_2(unittest.TestCase):
cpu_pred, cpu_loss = self._test(False)
npu_pred, npu_loss = self._test(True)
self.assertTrue(np.allclose(npu_pred, cpu_pred))
self.assertTrue(np.allclose(npu_loss, cpu_loss))
self.assertTrue(np.allclose(
npu_pred, cpu_pred, atol=1e-5)) # atol needed on cann 20.3
self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-5))
@unittest.skipIf(not paddle.is_compiled_with_npu(),
......
......@@ -16,6 +16,8 @@ from __future__ import print_function
import unittest
import numpy as np
import sys
sys.path.append("..")
from op_test import OpTest, skip_check_grad_ci
import paddle
import paddle.fluid.core as core
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import sys
sys.path.append("..")
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.nn import Embedding
import paddle.fluid.framework as framework
from paddle.fluid.optimizer import Adam
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
from paddle.fluid.executor import global_scope
import numpy as np
import six
import pickle
import os
import errno
from test_static_save_load import *
paddle.enable_static()
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNPUSaveLoadBase(TestSaveLoadBase):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_npu(
) else paddle.NPUPlace(0)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNPUSaveLoadPartial(TestSaveLoadPartial):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_npu(
) else paddle.NPUPlace(0)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNPUSaveLoadSetStateDict(TestSaveLoadSetStateDict):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_npu(
) else paddle.NPUPlace(0)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNPUProgramStatePartial(TestProgramStatePartial):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_npu(
) else paddle.NPUPlace(0)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNPULoadFromOldInterface(TestLoadFromOldInterface):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_npu(
) else paddle.NPUPlace(0)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNPULoadFromOldInterfaceSingleFile(TestLoadFromOldInterfaceSingleFile):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_npu(
) else paddle.NPUPlace(0)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNPUProgramStateOldSave(TestProgramStateOldSave):
def setUp(self):
self.test_dygraph = False
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_npu(
) else paddle.NPUPlace(0)
@unittest.skipIf(not paddle.is_compiled_with_npu(),
"core is not compiled with NPU")
class TestNPUProgramStateOldSaveSingleModel(TestProgramStateOldSaveSingleModel):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_npu(
) else paddle.NPUPlace(0)
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
......@@ -36,7 +36,7 @@ class TestAssign(OpTest):
self.op_type = "assign"
self.init_dtype()
x = np.rand.random([3, 3])
x = np.random.random([3, 3]).astype(self.dtype)
self.inputs = {'X': x}
self.attrs = {}
......@@ -46,7 +46,7 @@ class TestAssign(OpTest):
self.__class__.use_npu = True
def init_dtype(self):
self.dtype = np.int64
self.dtype = np.float32
def test_check_output(self):
self.check_output_with_place(self.place, check_dygraph=False)
......
......@@ -101,7 +101,7 @@ def prepare_places(with_data_parallel, with_cpu=False, with_gpu=True):
class TestStaticDataLoader(unittest.TestCase):
def run_main(self, num_workers, places):
def run_main(self, num_workers, places, use_pe=True):
scope = fluid.Scope()
with fluid.scope_guard(scope):
startup_prog, main_prog, image, label, loss = simple_fc_net_static()
......@@ -120,10 +120,13 @@ class TestStaticDataLoader(unittest.TestCase):
exe = fluid.Executor(place=places[0])
exe.run(startup_prog)
prog = fluid.CompiledProgram(main_prog)
if len(places) > 1:
prog = prog.with_data_parallel(
loss_name=loss.name, places=places)
if use_pe:
prog = fluid.CompiledProgram(main_prog)
if len(places) > 1:
prog = prog.with_data_parallel(
loss_name=loss.name, places=places)
else:
prog = main_prog
step_list = []
loss_list = []
......
......@@ -19,7 +19,7 @@ import unittest
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.dygraph.nn import Embedding
from paddle.nn import Embedding
import paddle.fluid.framework as framework
from paddle.fluid.optimizer import Adam
from paddle.fluid.dygraph.base import to_variable
......@@ -31,6 +31,8 @@ import pickle
import os
import errno
paddle.enable_static()
class SimpleLSTMRNN(fluid.Layer):
def __init__(self,
......@@ -159,11 +161,10 @@ class PtbModel(fluid.Layer):
num_layers=num_layers,
init_scale=init_scale,
dropout=dropout)
self.embedding = Embedding(
size=[vocab_size, hidden_size],
dtype='float32',
is_sparse=False,
param_attr=fluid.ParamAttr(
self.embedding = paddle.nn.Embedding(
num_embeddings=vocab_size,
embedding_dim=hidden_size,
weight_attr=fluid.ParamAttr(
name='embedding_para',
initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale)))
......@@ -187,6 +188,8 @@ class PtbModel(fluid.Layer):
init_c = fluid.layers.reshape(
init_cell, shape=[self.num_layers, -1, self.hidden_size])
# The NPU 'top_k' kernel only supports the `int32` dtype, so cast `input` from `int64` to `int32`.
input = fluid.layers.cast(input, "int32")
x_emb = self.embedding(input)
x_emb = fluid.layers.reshape(
x_emb, shape=[-1, self.num_steps, self.hidden_size])
......@@ -214,6 +217,10 @@ class PtbModel(fluid.Layer):
class TestSaveLoadBase(unittest.TestCase):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
def test_ptb_rnn_cpu_float32(self):
seed = 90
hidden_size = 10
......@@ -235,8 +242,7 @@ class TestSaveLoadBase(unittest.TestCase):
num_steps=num_steps,
init_scale=init_scale)
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
place = self.set_place()
exe = fluid.Executor(place)
sgd = Adam(learning_rate=1e-3)
x = fluid.layers.data(
......@@ -315,6 +321,10 @@ class TestSaveLoadBase(unittest.TestCase):
class TestSaveLoadPartial(unittest.TestCase):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
def test_ptb_rnn_cpu_float32(self):
seed = 90
hidden_size = 10
......@@ -336,8 +346,7 @@ class TestSaveLoadPartial(unittest.TestCase):
num_steps=num_steps,
init_scale=init_scale)
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
place = self.set_place()
exe = fluid.Executor(place)
sgd = Adam(learning_rate=1e-3)
x = fluid.layers.data(
......@@ -425,6 +434,10 @@ class TestSaveLoadPartial(unittest.TestCase):
class TestSaveLoadSetStateDict(unittest.TestCase):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
def test_ptb_rnn_cpu_float32(self):
seed = 90
hidden_size = 10
......@@ -446,8 +459,7 @@ class TestSaveLoadSetStateDict(unittest.TestCase):
num_steps=num_steps,
init_scale=init_scale)
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
place = self.set_place()
exe = fluid.Executor(place)
sgd = Adam(learning_rate=1e-3)
x = fluid.layers.data(
......@@ -526,6 +538,10 @@ class TestSaveLoadSetStateDict(unittest.TestCase):
class TestProgramStatePartial(unittest.TestCase):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
def test_ptb_rnn_cpu_float32(self):
seed = 90
hidden_size = 10
......@@ -547,8 +563,7 @@ class TestProgramStatePartial(unittest.TestCase):
num_steps=num_steps,
init_scale=init_scale)
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
place = self.set_place()
exe = fluid.Executor(place)
sgd = Adam(learning_rate=1e-3)
x = fluid.layers.data(
......@@ -708,14 +723,17 @@ class TestProgramStatePartial(unittest.TestCase):
class TestVariableInit(unittest.TestCase):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
def test_variable_init(self):
x = fluid.data(name="x", shape=[10, 10], dtype='float32')
y = fluid.layers.fc(x, 10)
z = fluid.layers.fc(y, 10)
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
place = self.set_place()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
......@@ -738,8 +756,7 @@ class TestVariableInit(unittest.TestCase):
program = fluid.default_main_program()
new_scope = fluid.core.Scope()
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
place = self.set_place()
exe = fluid.Executor(place)
parameter_list = list(
filter(fluid.io.is_parameter, program.list_vars()))
......@@ -798,6 +815,10 @@ class TestLoadFromOldInterface(unittest.TestCase):
if os.path.exists("test_static_load_var_list.pdparams"):
os.remove("test_static_load_var_list.pdparams")
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
def test_load_from_old_interface(self):
seed = 90
hidden_size = 10
......@@ -819,8 +840,7 @@ class TestLoadFromOldInterface(unittest.TestCase):
num_steps=num_steps,
init_scale=init_scale)
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
place = self.set_place()
exe = fluid.Executor(place)
sgd = Adam(learning_rate=1e-3)
x = fluid.layers.data(
......@@ -935,8 +955,7 @@ class TestLoadFromOldInterface(unittest.TestCase):
num_steps=num_steps,
init_scale=init_scale)
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
place = self.set_place()
exe = fluid.Executor(place)
sgd = Adam(learning_rate=1e-3)
x = fluid.layers.data(
......@@ -1027,6 +1046,10 @@ class TestLoadFromOldInterface(unittest.TestCase):
class TestLoadFromOldInterfaceSingleFile(unittest.TestCase):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
def test_load_from_old_interface(self):
seed = 90
hidden_size = 10
......@@ -1048,8 +1071,7 @@ class TestLoadFromOldInterfaceSingleFile(unittest.TestCase):
num_steps=num_steps,
init_scale=init_scale)
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
place = self.set_place()
exe = fluid.Executor(place)
sgd = Adam(learning_rate=1e-3)
x = fluid.layers.data(
......@@ -1171,6 +1193,13 @@ class TestLoadFromOldInterfaceSingleFile(unittest.TestCase):
class TestProgramStateOldSave(unittest.TestCase):
def setUp(self):
self.test_dygraph = True
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
def test_ptb_rnn_cpu_float32(self):
seed = 90
hidden_size = 10
......@@ -1192,8 +1221,7 @@ class TestProgramStateOldSave(unittest.TestCase):
num_steps=num_steps,
init_scale=init_scale)
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
place = self.set_place()
exe = fluid.Executor(place)
sgd = Adam(learning_rate=1e-3)
x = fluid.layers.data(
......@@ -1299,11 +1327,12 @@ class TestProgramStateOldSave(unittest.TestCase):
fluid.set_program_state(main_program, program_state)
self.check_in_static(main_program, base_map)
# make sure `load_program_state` can be used in dynamic graph mode
with fluid.dygraph.guard(place):
load_state = fluid.load_program_state("test_program_1")
for k, v in load_state.items():
self.assertTrue(np.array_equal(base_map[k], v))
if self.test_dygraph:
# make sure `load_program_state` can be used in dynamic graph mode
with fluid.dygraph.guard(place):
load_state = fluid.load_program_state("test_program_1")
for k, v in load_state.items():
self.assertTrue(np.array_equal(base_map[k], v))
def create_symlink(self, target, link_name):
try:
......@@ -1323,6 +1352,10 @@ class TestProgramStateOldSave(unittest.TestCase):
class TestProgramStateOldSaveSingleModel(unittest.TestCase):
def set_place(self):
return fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
def test_ptb_rnn_cpu_float32(self):
seed = 90
hidden_size = 10
......@@ -1344,8 +1377,7 @@ class TestProgramStateOldSaveSingleModel(unittest.TestCase):
num_steps=num_steps,
init_scale=init_scale)
place = fluid.CPUPlace() if not core.is_compiled_with_cuda(
) else fluid.CUDAPlace(0)
place = self.set_place()
exe = fluid.Executor(place)
sgd = Adam(learning_rate=1e-3)
x = fluid.layers.data(
......
......@@ -186,6 +186,13 @@ class Timeline(object):
self._chrome_trace.emit_pid(
"memory usage on %s:cudapinnedplace:%d" %
(k, mevent.device_id), pid)
elif mevent.place == profiler_pb2.MemEvent.NPUPlace:
if (k, mevent.device_id, "NPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, mevent.device_id, "NPU")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:npu:%d" % (k, mevent.device_id),
pid)
if (k, 0, "CPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "CPU")] = pid
......@@ -201,6 +208,11 @@ class Timeline(object):
self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:cudapinnedplace:%d" % (k, 0), pid)
if (k, 0, "NPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "NPU")] = pid
self._chrome_trace.emit_pid("memory usage on %s:npu:%d" %
(k, 0), pid)
def _allocate_events(self):
for k, profile_pb in six.iteritems(self._profile_dict):
......@@ -227,7 +239,8 @@ class Timeline(object):
place_to_str = {
profiler_pb2.MemEvent.CPUPlace: "CPU",
profiler_pb2.MemEvent.CUDAPlace: "GPU",
profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace"
profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace",
profiler_pb2.MemEvent.NPUPlace: "NPU"
}
for k, profile_pb in six.iteritems(self._profile_dict):
mem_list = []
......