From 6503ef56b132a746ab562985238b47f555024a05 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Thu, 1 Apr 2021 13:44:14 +0800
Subject: [PATCH] [NPU] support npu profiler (#31684)

* support npu profiler
* add python api
* fix bugs
* add wrapper for incomplete type
* update profile proto
* record npu wait
* add xpu placeholder
---
 cmake/external/ascend.cmake             |  8 ++-
 paddle/fluid/operators/expand_op_npu.cc |  5 +-
 paddle/fluid/platform/device_context.cc |  3 +-
 paddle/fluid/platform/device_tracer.cc  |  2 +
 paddle/fluid/platform/npu_profiler.h    | 91 +++++++++++++++++++++++++
 paddle/fluid/platform/profiler.proto    |  3 +
 paddle/fluid/pybind/pybind.cc           | 31 +++++++--
 python/paddle/fluid/profiler.py         | 59 ++++++++++++++++
 tools/timeline.py                       | 15 +++-
 9 files changed, 206 insertions(+), 11 deletions(-)
 create mode 100644 paddle/fluid/platform/npu_profiler.h

diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake
index ecc75332d1f..f46c5bf7ac0 100644
--- a/cmake/external/ascend.cmake
+++ b/cmake/external/ascend.cmake
@@ -65,11 +65,13 @@ if(WITH_ASCEND_CL)
   set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so)
   set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so)
   set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so)
-  set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
+  set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
+  set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include)
 
-  message(STATUS "ASCEND_CL_INC_DIR ${ASCEND_CL_INC_DIR}")
+  message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}")
   message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}")
-  INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR})
+  INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR})
+  INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR})
 
   ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL)
   SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib})
diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc
index 3c06008d00a..bb3a6512d2c 100644
--- a/paddle/fluid/operators/expand_op_npu.cc
+++ b/paddle/fluid/operators/expand_op_npu.cc
@@ -58,12 +58,15 @@ class ExpandNPUKernel : public framework::OpKernel<T> {
                           expand_times.size(), static_cast<size_t>(in_dims.size())));
     auto* out0 = context.Output<Tensor>("Out");
     framework::DDim out_dims(in_dims);
+
     for (size_t i = 0; i < expand_times.size(); ++i) {
       out_dims[i] *= expand_times[i];
     }
+
     out0->Resize(out_dims);
     out0->mutable_data<T>(context.device_context().GetPlace());
-    auto runner = NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}});
+    auto runner =
+        NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}});
     auto stream =
         context.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index e5031acb9b4..2a7519706cc 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -21,8 +21,8 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
-
 #include "glog/logging.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace memory {
@@ -253,6 +253,7 @@ NPUDeviceContext::~NPUDeviceContext() {
 }
 
 void NPUDeviceContext::Wait() const {
+  platform::RecordEvent record_event("NPUDeviceContext/wait");
   NPUDeviceGuard guard(place_.device);
   PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
 }
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index bbf8e4d5ca7..1469e0ecd06 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -598,6 +598,8 @@ class DeviceTracerImpl : public DeviceTracer {
             BOOST_GET_CONST(platform::CUDAPlace, r.place).GetDeviceId());
       } else if (platform::is_cuda_pinned_place(r.place)) {
         event->set_place(proto::MemEvent::CUDAPinnedPlace);
+      } else if (platform::is_npu_place(r.place)) {
+        event->set_place(proto::MemEvent::NPUPlace);
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "The current place is not supported."));
diff --git a/paddle/fluid/platform/npu_profiler.h b/paddle/fluid/platform/npu_profiler.h
new file mode 100644
index 00000000000..05325aaf9ba
--- /dev/null
+++ b/paddle/fluid/platform/npu_profiler.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "acl/acl_prof.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace platform {
+
+// For ACL 20.1
+// ACL_AICORE_ARITHMATIC_THROUGHPUT = 0, record arithmetic stats
+// ACL_AICORE_PIPELINE = 1, record pipeline
+// ACL_AICORE_SYNCHRONIZATION = 2, record sync
+// ACL_AICORE_MEMORY = 3, record memory
+// ACL_AICORE_INTERNAL_MEMORY = 4, record internal memory
+// ACL_AICORE_STALL = 5, record pipeline stall ratio
+constexpr aclprofAicoreMetrics default_metrics =
+    ACL_AICORE_ARITHMATIC_THROUGHPUT;
+
+// ACL_PROF_ACL_API, record ACL API stats
+// ACL_PROF_TASK_TIME, record AI core stats
+// ACL_PROF_AICORE_METRICS, must be included to collect AI core metrics
+// ACL_PROF_AICPU_TRACE, record AICPU, not supported yet
+constexpr uint64_t default_type =
+    ACL_PROF_ACL_API | ACL_PROF_AICORE_METRICS | ACL_PROF_TASK_TIME;
+
+aclprofConfig *NPUProfilerCreateConfig(
+    std::vector<uint32_t> devices = {},
+    aclprofAicoreMetrics metrics = default_metrics, uint64_t c = default_type,
+    aclprofAicoreEvents *events = nullptr) {
+  if (devices.size() == 0) {
+    int device_id = GetCurrentNPUDeviceId();
+    devices.emplace_back(device_id);
+  }
+  aclprofConfig *config =
+      aclprofCreateConfig(devices.data(), devices.size(), metrics, events, c);
+  PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::External(
+                                      "Failed to create prof config for NPU"));
+  return config;
+}
+
+void NPUProfilerDestroyConfig(const aclprofConfig *config) {
+  PADDLE_ENFORCE_NPU_SUCCESS(aclprofDestroyConfig(config));
+}
+
+void NPUProfilerInit(std::string output_path) {
+  PADDLE_ENFORCE_NPU_SUCCESS(
+      aclprofInit(output_path.c_str(), output_path.size()));
+}
+
+void NPUProfilerStart(const aclprofConfig *config) {
+  if (config == nullptr) {
+    // NOTE(zhiqiu): support single device by default.
+    int device_id = GetCurrentNPUDeviceId();
+    std::vector<uint32_t> devices = {static_cast<uint32_t>(device_id)};
+    config = NPUProfilerCreateConfig(devices);
+  }
+  PADDLE_ENFORCE_NPU_SUCCESS(aclprofStart(config));
+}
+
+void NPUProfilerStop(const aclprofConfig *config) {
+  PADDLE_ENFORCE_NPU_SUCCESS(aclprofStop(config));
+  NPUProfilerDestroyConfig(config);
+}
+
+void NPUProfilerFinalize() { PADDLE_ENFORCE_NPU_SUCCESS(aclprofFinalize()); }
+
+struct NPUProfConfigWrapper {
+  aclprofConfig *p_;
+  explicit NPUProfConfigWrapper(aclprofConfig *p) : p_(p) {}
+  aclprofConfig *ptr() { return p_; }
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto
index cfa3c6906f8..31193534a00 100644
--- a/paddle/fluid/platform/profiler.proto
+++ b/paddle/fluid/platform/profiler.proto
@@ -21,6 +21,7 @@ message Event {
   enum EventType {
     CPU = 0;
     GPUKernel = 1;
+    NPUKernel = 2;
   }
   optional EventType type = 8;
   optional string name = 1;
@@ -39,6 +40,8 @@ message MemEvent {
     CUDAPlace = 0;
     CPUPlace = 1;
     CUDAPinnedPlace = 2;
+    XPUPlace = 3;
+    NPUPlace = 4;
   }
   optional uint64 start_ns = 1;
   optional uint64 end_ns = 2;
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index b3d9e22dba8..857498e852f 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -104,6 +104,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/platform/npu_info.h"
+#include "paddle/fluid/platform/npu_profiler.h"
 #endif
 
 #ifdef PADDLE_WITH_XPU
@@ -499,11 +500,6 @@ PYBIND11_MODULE(core_noavx, m) {
                                       make_ddim(x_dim), make_ddim(y_dim), -1));
   });
 
-#ifdef PADDLE_WITH_ASCEND_CL
-  m.def("_npu_finalize",
-        []() { platform::AclInstance::Instance().Finalize(); });
-#endif
-
   m.def(
       "_append_python_callable_object_and_return_id",
       [](py::object py_obj) -> size_t {
@@ -2082,6 +2078,31 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
 #endif
 
+#ifdef PADDLE_WITH_ASCEND_CL
+  m.def("get_npu_device_count", platform::GetNPUDeviceCount);
+  m.def("_npu_finalize", []() {
+    platform::AclInstance::Instance().Finalize();
+  });  // private interface
+
+  py::class_<platform::NPUProfConfigWrapper>(m, "NPUProfConfigWrapper");
+
+  m.def("npu_prof_init", platform::NPUProfilerInit);
+  m.def("npu_prof_start", [](platform::NPUProfConfigWrapper c) {
+    platform::NPUProfilerStart(c.ptr());
+  });
+  m.def("npu_prof_stop", [](platform::NPUProfConfigWrapper c) {
+    platform::NPUProfilerStop(c.ptr());
+  });
+  m.def("npu_prof_finalize", platform::NPUProfilerFinalize);
+  m.def("npu_prof_create_config", []() {
+    return platform::NPUProfConfigWrapper(platform::NPUProfilerCreateConfig());
+  });
+
+  m.def("npu_prof_destropy_config", [](platform::NPUProfConfigWrapper c) {
+    platform::NPUProfilerDestroyConfig(c.ptr());
+  });
+#endif
+
   py::enum_<platform::TracerOption>(m, "TracerOption", py::arithmetic())
       .value("kDefault", platform::TracerOption::kDefault)
       .value("kOpDetail", platform::TracerOption::kOpDetail)
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index bc7a60af946..40b0862be01 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -106,6 +106,65 @@ def cuda_profiler(output_file, output_mode=None, config=None):
         os.remove(config_file)
 
 
+@signature_safe_contextmanager
+def npu_profiler(output_file, config=None):
+    """
+    The NPU profiler.
+
+    This function is used to profile an NPU program through the NPU runtime
+    application programming interface. The profiling result will be written
+    into `output_file`. Users can set the NPU profiling config with the
+    `config` argument.
+
+    After getting the profiling result file, users can use
+    `tools provided by Ascend `_
+    to load this output file to visualize results.
+
+    Args:
+        output_file (str) : The output file name; the result will be
+            written into this file. It should be an absolute path.
+        config (list, optional) : NPU profile config. For more details, please
+            refer to `User Guide `_ .
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            import paddle.fluid.profiler as profiler
+            import numpy as np
+
+            epoc = 8
+            dshape = [4, 3, 28, 28]
+            data = fluid.data(name='data', shape=[None, 3, 28, 28], dtype='float32')
+            conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
+
+            place = fluid.NPUPlace(0)
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+
+            output_file = 'npu.txt'
+            with profiler.npu_profiler(output_file) as npu_prof:
+                for i in range(epoc):
+                    input = np.random.random(dshape).astype('float32')
+                    exe.run(fluid.default_main_program(), feed={'data': input})
+            # then use NPU profiler tools to load this output file
+            # to visualize results.
+    """
+    # TODO: support config in python.
+    if not config:
+        config = core.npu_prof_create_config()
+
+    core.npu_prof_init(output_file)
+    # Enables profiler collection by the active NPU profiling tool.
+    core.npu_prof_start(config)
+    try:
+        yield
+    # Disables profiler collection.
+    finally:
+        core.npu_prof_stop(config)
+        core.npu_prof_finalize()
+
+
 def reset_profiler():
     """
     Clear the previous time record. This interface does not work for
diff --git a/tools/timeline.py b/tools/timeline.py
index 119018380b5..2a399b71b77 100644
--- a/tools/timeline.py
+++ b/tools/timeline.py
@@ -186,6 +186,13 @@ class Timeline(object):
                         self._chrome_trace.emit_pid(
                             "memory usage on %s:cudapinnedplace:%d" %
                             (k, mevent.device_id), pid)
+                elif mevent.place == profiler_pb2.MemEvent.NPUPlace:
+                    if (k, mevent.device_id, "NPU") not in self._mem_devices:
+                        pid = self._allocate_pid()
+                        self._mem_devices[(k, mevent.device_id, "NPU")] = pid
+                        self._chrome_trace.emit_pid(
+                            "memory usage on %s:npu:%d" % (k, mevent.device_id),
+                            pid)
             if (k, 0, "CPU") not in self._mem_devices:
                 pid = self._allocate_pid()
                 self._mem_devices[(k, 0, "CPU")] = pid
@@ -201,6 +208,11 @@ class Timeline(object):
                 self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
                 self._chrome_trace.emit_pid(
                     "memory usage on %s:cudapinnedplace:%d" % (k, 0), pid)
+            if (k, 0, "NPU") not in self._mem_devices:
+                pid = self._allocate_pid()
+                self._mem_devices[(k, 0, "NPU")] = pid
+                self._chrome_trace.emit_pid("memory usage on %s:npu:%d" %
+                                            (k, 0), pid)
 
     def _allocate_events(self):
         for k, profile_pb in six.iteritems(self._profile_dict):
@@ -227,7 +239,8 @@ class Timeline(object):
         place_to_str = {
             profiler_pb2.MemEvent.CPUPlace: "CPU",
             profiler_pb2.MemEvent.CUDAPlace: "GPU",
-            profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace"
+            profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace",
+            profiler_pb2.MemEvent.NPUPlace: "NPU"
         }
        for k, profile_pb in six.iteritems(self._profile_dict):
             mem_list = []
--
GitLab
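Reviewer note (not part of the patch): the sketch below shows one plausible way the helpers added in paddle/fluid/platform/npu_profiler.h are meant to be sequenced on the C++ side, mirroring what the new Python npu_profiler() context manager does through the pybind bindings. The function name ProfileNPURegion and the output directory ./npu_prof_output are illustrative assumptions, not values taken from this change, and the sketch assumes a build with WITH_ASCEND_CL enabled.

// Illustrative only: init -> create config -> start -> stop -> finalize.
#include "paddle/fluid/platform/npu_profiler.h"

void ProfileNPURegion() {
  namespace plat = paddle::platform;
  // Point the ACL profiler at an output directory (hypothetical path).
  plat::NPUProfilerInit("./npu_prof_output");
  // An empty device list makes the config default to the current NPU device.
  aclprofConfig *config = plat::NPUProfilerCreateConfig();
  plat::NPUProfilerStart(config);
  // ... run the NPU workload to be profiled ...
  plat::NPUProfilerStop(config);  // calls aclprofStop and destroys the config
  plat::NPUProfilerFinalize();
}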