Unverified commit 6503ef56, authored by Leo Chen, committed via GitHub

[NPU] support npu profiler (#31684)

* support npu profiler

* add python api

* fix bugs

* add wrapper for incomplete type

* update profile proto

* record npu wait

* add xpu placeholder
上级 44ed8f2d
......@@ -65,11 +65,13 @@ if(WITH_ASCEND_CL)
set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
message(STATUS "ASCEND_CL_INC_DIR ${ASCEND_CL_INC_DIR}")
message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR})
INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR})
INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR})
ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
......
......@@ -58,12 +58,15 @@ class ExpandNPUKernel : public framework::OpKernel<T> {
expand_times.size(), static_cast<size_t>(in_dims.size())));
auto* out0 = context.Output<framework::LoDTensor>("Out");
framework::DDim out_dims(in_dims);
for (size_t i = 0; i < expand_times.size(); ++i) {
out_dims[i] *= expand_times[i];
}
out0->Resize(out_dims);
out0->mutable_data<T>(context.device_context().GetPlace());
auto runner = NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}});
auto runner =
NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}});
auto stream =
context.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
......
......@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "glog/logging.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace memory {
......@@ -253,6 +253,7 @@ NPUDeviceContext::~NPUDeviceContext() {
}
// Blocks the calling thread until all work previously submitted to this
// context's NPU device has completed.
void NPUDeviceContext::Wait() const {
  // Emit a profiler event so device synchronization time shows up in traces.
  platform::RecordEvent record_event("NPUDeviceContext/wait");
  // Make this context's device current for the duration of the sync.
  NPUDeviceGuard guard(place_.device);
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
}
......
......@@ -598,6 +598,8 @@ class DeviceTracerImpl : public DeviceTracer {
BOOST_GET_CONST(platform::CUDAPlace, r.place).GetDeviceId());
} else if (platform::is_cuda_pinned_place(r.place)) {
event->set_place(proto::MemEvent::CUDAPinnedPlace);
} else if (platform::is_npu_place(r.place)) {
event->set_place(proto::MemEvent::NPUPlace);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"The current place is not supported."));
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "acl/acl_prof.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
// For ACL 20.1
// ACL_AICORE_ARITHMATIC_THROUGHPUT = 0, record arithmetic stats
// ACL_AICORE_PIPELINE = 1, record pipeline
// ACL_AICORE_SYNCHRONIZATION = 2, record sync
// ACL_AICORE_MEMORY = 3, record memory
// ACL_AICORE_INTERNAL_MEMORY = 4, record internal memory
// ACL_AICORE_STALL = 5, record pipeline ratio
// Default AI Core metric group collected when the caller does not choose one.
constexpr aclprofAicoreMetrics default_metrics =
    ACL_AICORE_ARITHMATIC_THROUGHPUT;
// ACL_PROF_ACL_API, record ACL API stats
// ACL_PROF_TASK_TIME, record AI core stats
// ACL_PROF_AICORE_METRICS, must include
// ACL_PROF_AICPU_TRACE, record AICPU, not supported yet
// Default bitmask of profiling data types to collect.
constexpr uint64_t default_type =
    ACL_PROF_ACL_API | ACL_PROF_AICORE_METRICS | ACL_PROF_TASK_TIME;
// Creates an ACL profiling config for the given NPU devices.
//
// If `devices` is empty, the current NPU device is used. The returned
// config must eventually be released with NPUProfilerDestroyConfig.
//
// `inline` is required: this function is defined in a header, and without
// it every translation unit including the header would emit its own
// definition, causing multiple-definition link errors.
inline aclprofConfig *NPUProfilerCreateConfig(
    std::vector<uint32_t> devices = {},
    aclprofAicoreMetrics metrics = default_metrics, uint64_t c = default_type,
    aclprofAicoreEvents *events = nullptr) {
  if (devices.size() == 0) {
    // Default to profiling only the device bound to the calling thread.
    int device_id = GetCurrentNPUDeviceId();
    devices.emplace_back(device_id);
  }
  aclprofConfig *config =
      aclprofCreateConfig(devices.data(), devices.size(), metrics, events, c);
  PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::External(
                                      "Failed to create prof config for NPU"));
  return config;
}
// Releases a profiling config created by NPUProfilerCreateConfig.
// `inline` avoids multiple-definition link errors for this header-defined
// function.
inline void NPUProfilerDestroyConfig(const aclprofConfig *config) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclprofDestroyConfig(config));
}
// Initializes the ACL profiler; results are written under `output_path`,
// which should be an absolute path. Must be called before starting any
// profiling and balanced by NPUProfilerFinalize.
// `inline` avoids multiple-definition link errors for this header-defined
// function.
inline void NPUProfilerInit(std::string output_path) {
  PADDLE_ENFORCE_NPU_SUCCESS(
      aclprofInit(output_path.c_str(), output_path.size()));
}
// Starts profiling with `config`. When `config` is nullptr, a default
// config covering only the current device is created on the fly.
// `inline` avoids multiple-definition link errors for this header-defined
// function.
inline void NPUProfilerStart(const aclprofConfig *config) {
  if (config == nullptr) {
    // NOTE(zhiqiu): support single device by default.
    int device_id = GetCurrentNPUDeviceId();
    std::vector<uint32_t> devices = {static_cast<uint32_t>(device_id)};
    // NOTE(review): this internally-created config is never returned to the
    // caller, so it is not destroyed by NPUProfilerStop (the caller cannot
    // pass it back) — looks like a small one-off leak; confirm intended.
    config = NPUProfilerCreateConfig(devices);
  }
  PADDLE_ENFORCE_NPU_SUCCESS(aclprofStart(config));
}
// Stops profiling for `config` and releases it. After this call `config`
// is invalid and must not be reused.
// `inline` avoids multiple-definition link errors for this header-defined
// function.
inline void NPUProfilerStop(const aclprofConfig *config) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclprofStop(config));
  NPUProfilerDestroyConfig(config);
}
// Finalizes the ACL profiler, flushing and releasing all profiling
// resources; the counterpart of NPUProfilerInit.
// `inline` avoids multiple-definition link errors for this header-defined
// function.
inline void NPUProfilerFinalize() {
  PADDLE_ENFORCE_NPU_SUCCESS(aclprofFinalize());
}
// Copyable value wrapper around aclprofConfig*. aclprofConfig is an
// incomplete type, so it cannot be bound to Python directly; pybind11
// instead passes this wrapper around and the C++ side unwraps it via ptr().
// Non-owning: lifetime of the pointee is managed by
// NPUProfilerCreateConfig/NPUProfilerDestroyConfig.
struct NPUProfConfigWrapper {
  aclprofConfig *p_;
  explicit NPUProfConfigWrapper(aclprofConfig *p) : p_(p) {}
  // Returns the raw config pointer (no ownership transfer).
  aclprofConfig *ptr() { return p_; }
};
} // namespace platform
} // namespace paddle
......@@ -21,6 +21,7 @@ message Event {
enum EventType {
CPU = 0;
GPUKernel = 1;
NPUKernel = 2;
}
optional EventType type = 8;
optional string name = 1;
......@@ -39,6 +40,8 @@ message MemEvent {
CUDAPlace = 0;
CPUPlace = 1;
CUDAPinnedPlace = 2;
XPUPlace = 3;
NPUPlace = 4;
}
optional uint64 start_ns = 1;
optional uint64 end_ns = 2;
......
......@@ -104,6 +104,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/npu_info.h"
#include "paddle/fluid/platform/npu_profiler.h"
#endif
#ifdef PADDLE_WITH_XPU
......@@ -499,11 +500,6 @@ PYBIND11_MODULE(core_noavx, m) {
make_ddim(x_dim), make_ddim(y_dim), -1));
});
#ifdef PADDLE_WITH_ASCEND_CL
m.def("_npu_finalize",
[]() { platform::AclInstance::Instance().Finalize(); });
#endif
m.def(
"_append_python_callable_object_and_return_id",
[](py::object py_obj) -> size_t {
......@@ -2082,6 +2078,31 @@ All parameter, weight, gradient are variables in Paddle.
#endif
#endif
#ifdef PADDLE_WITH_ASCEND_CL
  m.def("get_npu_device_count", platform::GetNPUDeviceCount);
  m.def("_npu_finalize", []() {
    platform::AclInstance::Instance().Finalize();
  });  // private interface
  // Opaque holder for aclprofConfig*, which is an incomplete type and
  // therefore cannot be exposed to Python directly.
  py::class_<platform::NPUProfConfigWrapper>(m, "NPUProfConfigWrapper");
  m.def("npu_prof_init", platform::NPUProfilerInit);
  m.def("npu_prof_start", [](platform::NPUProfConfigWrapper c) {
    platform::NPUProfilerStart(c.ptr());
  });
  m.def("npu_prof_stop", [](platform::NPUProfConfigWrapper c) {
    platform::NPUProfilerStop(c.ptr());
  });
  m.def("npu_prof_finalize", platform::NPUProfilerFinalize);
  m.def("npu_prof_create_config", []() {
    return platform::NPUProfConfigWrapper(platform::NPUProfilerCreateConfig());
  });
  // Correctly-spelled binding for destroying a profiler config.
  m.def("npu_prof_destroy_config", [](platform::NPUProfConfigWrapper c) {
    platform::NPUProfilerDestroyConfig(c.ptr());
  });
  // Backward-compatible alias kept for callers using the original
  // misspelled name.
  m.def("npu_prof_destropy_config", [](platform::NPUProfConfigWrapper c) {
    platform::NPUProfilerDestroyConfig(c.ptr());
  });
#endif
py::enum_<platform::TracerOption>(m, "TracerOption", py::arithmetic())
.value("kDefault", platform::TracerOption::kDefault)
.value("kOpDetail", platform::TracerOption::kOpDetail)
......
......@@ -106,6 +106,65 @@ def cuda_profiler(output_file, output_mode=None, config=None):
os.remove(config_file)
@signature_safe_contextmanager
def npu_profiler(output_file, config=None):
    """
    The NPU profiler.

    This function is used to profile NPU program by NPU runtime application
    programming interface. The profiling result will be written into
    `output_file`. The users can set the NPU profiling config by the `config`
    argument.

    After getting the profiling result file, users can use
    `tools provided by Ascend <https://support.huaweicloud.com/tg-Inference-cann/atlasprofiling_16_0006.html>`_
    to load this output file to visualize results.

    Args:
        output_file (str) : The output file name, the result will be
            written into this file. It should be absolute path.
        config (list<str>, optional) : NPU profile config. For more details, please
            refer to `User Guide <https://support.huaweicloud.com/tg-Inference-cann/atlasprofiling_16_0006.html>`_ .

    Examples:

        .. code-block:: python

            import paddle.fluid as fluid
            import paddle.fluid.profiler as profiler
            import numpy as np

            epoc = 8
            dshape = [4, 3, 28, 28]
            data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32')
            conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])

            place = fluid.NPUPlace(0)
            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())

            output_file = 'npu.txt'
            with profiler.npu_profiler(output_file) as npu_prof:
                for i in range(epoc):
                    input = np.random.random(dshape).astype('float32')
                    exe.run(fluid.default_main_program(), feed={'data': input})
            # then use NPU profiler tools to load this output file
            # to visualize results.
    """
    # TODO: support config in python.
    if not config:
        # No user-supplied config: build a default one for the current device.
        config = core.npu_prof_create_config()

    core.npu_prof_init(output_file)
    # Enables profiler collection by the active NPU profiling tool.
    core.npu_prof_start(config)
    try:
        yield
    # Disables profiler collection.
    finally:
        core.npu_prof_stop(config)
        core.npu_prof_finalize()
def reset_profiler():
"""
Clear the previous time record. This interface does not work for
......
......@@ -186,6 +186,13 @@ class Timeline(object):
self._chrome_trace.emit_pid(
"memory usage on %s:cudapinnedplace:%d" %
(k, mevent.device_id), pid)
elif mevent.place == profiler_pb2.MemEvent.NPUPlace:
if (k, mevent.device_id, "NPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, mevent.device_id, "NPU")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:npu:%d" % (k, mevent.device_id),
pid)
if (k, 0, "CPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "CPU")] = pid
......@@ -201,6 +208,11 @@ class Timeline(object):
self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:cudapinnedplace:%d" % (k, 0), pid)
if (k, 0, "NPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "NPU")] = pid
self._chrome_trace.emit_pid("memory usage on %s:npu:%d" %
(k, 0), pid)
def _allocate_events(self):
for k, profile_pb in six.iteritems(self._profile_dict):
......@@ -227,7 +239,8 @@ class Timeline(object):
place_to_str = {
profiler_pb2.MemEvent.CPUPlace: "CPU",
profiler_pb2.MemEvent.CUDAPlace: "GPU",
profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace"
profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace",
profiler_pb2.MemEvent.NPUPlace: "NPU"
}
for k, profile_pb in six.iteritems(self._profile_dict):
mem_list = []
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册