diff --git a/CMakeLists.txt b/CMakeLists.txt index 7500e8ed3ca1a93bb7fb4716e98b2660b82ad430..c5552d6ec94d0a334c441f90f5047e2aa19dd1f6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -146,6 +146,7 @@ include(external/cares) include(external/grpc) include(cudnn) # set cudnn libraries, must before configure +include(cupti) include(configure) # add paddle env configuration include(generic) # simplify cmake module include(package) # set paddle packages diff --git a/cmake/configure.cmake b/cmake/configure.cmake index ae3295fe4115f457570203e61a56a637895e4770..7730453fc9292015465713232abda155a18a1aad 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -59,6 +59,7 @@ endif(NOT WITH_GOLANG) if(NOT WITH_GPU) add_definitions(-DHPPL_STUB_FUNC) + add_definitions("-DCUPTI_LIB_PATH=\"\"") list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) else() @@ -73,7 +74,14 @@ else() if(NOT CUDNN_FOUND) message(FATAL_ERROR "Paddle needs cudnn to compile") endif() - + if(CUPTI_FOUND) + include_directories(${CUPTI_INCLUDE_DIR}) + add_definitions(-DPADDLE_WITH_CUPTI) + add_definitions("-DCUPTI_LIB_PATH=\"${CUPTI_LIBRARY_PATH}\"") + else() + add_definitions("-DCUPTI_LIB_PATH=\"\"") + message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.") + endif() set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}") # Include cuda and cudnn diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index de94bd5008effef1bf0fd3a125d4aed56e1b7f81..7edc8637727e300539a46bc3941ace87c87903b8 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -155,7 +155,8 @@ endif() include_directories(${CUDA_INCLUDE_DIRS}) list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) if(NOT WITH_DSO) - list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) + # TODO(panyx0718): CUPTI only allows DSO? 
+  list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
 endif(NOT WITH_DSO)
 
 # setting nvcc arch flags
diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..72ed0f1e5858d6d836743ceb038c7f4ad8f194cf
--- /dev/null
+++ b/cmake/cupti.cmake
@@ -0,0 +1,53 @@
+# Locate CUPTI: the cupti.h header and the libcupti shared library.
+#
+# Sets:
+#   CUPTI_INCLUDE_DIR  - directory that contains cupti.h
+#   CUPTI_LIBRARY      - full path of the libcupti library that was found
+#   CUPTI_LIBRARY_PATH - directory that contains CUPTI_LIBRARY
+#   CUPTI_FOUND        - ON only if both the header and the library exist
+if(NOT WITH_GPU)
+  return()
+endif()
+
+
+set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT")
+find_path(CUPTI_INCLUDE_DIR cupti.h
+  PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include
+  $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include
+  ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include
+  NO_DEFAULT_PATH
+  )
+
+# Directory of CUDA_CUDART_LIBRARY, used below as an extra search location.
+# Quoted so that an empty value cannot drop an argument from the call.
+get_filename_component(__libpath_hist "${CUDA_CUDART_LIBRARY}" PATH)
+
+# Default to x86_64 and override with the detected processor when known.
+# Pass the variable name to if(): expanding it first would make if() test a
+# variable literally named after the processor (e.g. "x86_64").
+set(TARGET_ARCH "x86_64")
+if(CMAKE_SYSTEM_PROCESSOR)
+  set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
+endif()
+
+list(APPEND CUPTI_CHECK_LIBRARY_DIRS
+  ${CUPTI_ROOT}
+  ${CUPTI_ROOT}/lib64
+  ${CUPTI_ROOT}/lib
+  ${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu
+  $ENV{CUPTI_ROOT}
+  $ENV{CUPTI_ROOT}/lib64
+  $ENV{CUPTI_ROOT}/lib
+  /usr/lib
+  ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64)
+find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a
+  PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist}
+  NO_DEFAULT_PATH
+  DOC "Path to cuPTI library.")
+
+get_filename_component(CUPTI_LIBRARY_PATH "${CUPTI_LIBRARY}" DIRECTORY)
+if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
+  set(CUPTI_FOUND ON)
+else()
+  set(CUPTI_FOUND OFF)
+endif()
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 88863ab99eb765124bc825b4e9ec9dff890ba3cc..d3155d33d0b461c9a3889ed8ae2ad9ee400a60fe 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -127,7 +127,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
       auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
       platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(op->Type(), pool.Get(place_)); + // TODO(panyx0718): Need a program id to distinguish programs. + platform::RecordEvent record_event(op->Type(), pool.Get(place_), + op_desc->Block()->ID()); VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); op->Run(*local_scope, place_); diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 53725d3d802c27202a6379cee518991a628cf9a1..38f22b89143c3e23c8368b9281ccc757a892a373 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -167,4 +167,6 @@ message BlockDesc { // Please refer to // https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md // for more details. +// TODO(panyx0718): A model can have multiple programs. Need a +// way to distinguish them. Maybe ID or name? message ProgramDesc { repeated BlockDesc blocks = 1; } diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index b72aad6fb538ac483e9ce6fc9cb866c75190f006..614dd8cd00eb866cb8cbc41c3e03c25f968a7d2b 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -125,6 +125,8 @@ class OpDesc { BlockDesc *Block() { return this->block_; } + const BlockDesc &BlockRef() const { return *this->block_; } + void SetBlock(BlockDesc *block) { this->block_ = block; } private: diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 0d0cee21d14f29c03ebabcb921ecc4f29f352b55..28a668c86aa322803a65b916b4273181f5652e21 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,3 +1,5 @@ +proto_library(profiler_proto SRCS profiler.proto) + if(WITH_GPU) cc_library(enforce SRCS enforce.cc DEPS) else() @@ -37,7 +39,8 @@ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place 
device_context) nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context) -cc_library(profiler SRCS profiler.cc DEPS device_context) +cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS}) +cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) nv_test(float16_gpu_test SRCS float16_test.cu) diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc new file mode 100644 index 0000000000000000000000000000000000000000..87bbdfa5fd5d9781d5f2b310d2142b1b4decbf9b --- /dev/null +++ b/paddle/fluid/platform/device_tracer.cc @@ -0,0 +1,285 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/device_tracer.h" +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace platform { +namespace { + +thread_local const char *cur_annotation = nullptr; +std::once_flag tracer_once_flag; +DeviceTracer *tracer = nullptr; +} // namespace +#ifdef PADDLE_WITH_CUPTI + +namespace { +// TODO(panyx0718): Revisit the buffer size here. +uint64_t kBufSize = 32 * 1024; +uint64_t kAlignSize = 8; + +#define ALIGN_BUFFER(buffer, align) \ + (((uintptr_t)(buffer) & ((align)-1)) \ + ? 
((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) \ + : (buffer)) + +#define CUPTI_CALL(call) \ + do { \ + CUptiResult _status = call; \ + if (_status != CUPTI_SUCCESS) { \ + const char *errstr; \ + dynload::cuptiGetResultString(_status, &errstr); \ + fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \ + __FILE__, __LINE__, #call, errstr); \ + exit(-1); \ + } \ + } while (0) + +void EnableActivity() { + // Device activity record is created when CUDA initializes, so we + // want to enable it before cuInit() or any CUDA runtime call. + CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); + CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL)); + CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); + CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); + CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); + // We don't track these activities for now. + // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT)); + // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER)); + // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); + // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NAME)); + // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER)); +} + +void DisableActivity() { + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE)); + // Disable all other activity record kinds. 
+ CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER)); + CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD)); +} + +void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, + size_t *maxNumRecords) { + uint8_t *buf = (uint8_t *)malloc(kBufSize + kAlignSize); + *size = kBufSize; + *buffer = ALIGN_BUFFER(buf, kAlignSize); + *maxNumRecords = 0; +} + +void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, + size_t size, size_t validSize) { + CUptiResult status; + CUpti_Activity *record = NULL; + if (validSize > 0) { + do { + status = dynload::cuptiActivityGetNextRecord(buffer, validSize, &record); + if (status == CUPTI_SUCCESS) { + switch (record->kind) { + case CUPTI_ACTIVITY_KIND_KERNEL: + case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { + auto *kernel = + reinterpret_cast(record); + tracer->AddKernelRecords(kernel->start, kernel->end, + kernel->deviceId, kernel->streamId, + kernel->correlationId); + break; + } + default: { break; } + } + } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { + // Seems not an error in this case. 
+ break; + } else { + CUPTI_CALL(status); + } + } while (1); + + size_t dropped; + CUPTI_CALL( + dynload::cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped)); + if (dropped != 0) { + fprintf(stderr, "Dropped %u activity records\n", (unsigned int)dropped); + } + } + free(buffer); +} +} // namespace + +class DeviceTracerImpl : public DeviceTracer { + public: + DeviceTracerImpl() : enabled_(false) {} + + void AddAnnotation(uint64_t id, const std::string &anno) { + std::lock_guard l(trace_mu_); + correlations_[id] = anno; + } + + void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, + uint32_t stream_id, uint32_t correlation_id) { + std::lock_guard l(trace_mu_); + kernel_records_.push_back( + KernelRecord{start, end, device_id, stream_id, correlation_id}); + } + + bool IsEnabled() { + std::lock_guard l(trace_mu_); + return enabled_; + } + + void Enable() { + std::lock_guard l(trace_mu_); + if (enabled_) { + fprintf(stderr, "DeviceTracer already enabled\n"); + return; + } + EnableActivity(); + + // Register callbacks for buffer requests and completed by CUPTI. 
+    CUPTI_CALL(dynload::cuptiActivityRegisterCallbacks(bufferRequested,
+                                                       bufferCompleted));
+
+    CUptiResult ret;
+    ret = dynload::cuptiSubscribe(
+        &subscriber_, static_cast<CUpti_CallbackFunc>(ApiCallback), this);
+    if (ret == CUPTI_ERROR_MAX_LIMIT_REACHED) {
+      fprintf(stderr, "CUPTI subcriber limit reached.\n");
+    } else if (ret != CUPTI_SUCCESS) {
+      fprintf(stderr, "Failed to create CUPTI subscriber.\n");
+    }
+    CUPTI_CALL(
+        dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
+                                     CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
+
+    CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
+    enabled_ = true;
+  }
+
+  proto::Profile GenProfile() {
+    std::lock_guard<std::mutex> l(trace_mu_);
+    proto::Profile profile_pb;
+    profile_pb.set_start_ns(start_ns_);
+    profile_pb.set_end_ns(end_ns_);
+    std::map<std::string, std::vector<uint64_t>> event_times;
+    for (const KernelRecord &r : kernel_records_) {
+      if (correlations_.find(r.correlation_id) == correlations_.end()) {
+        fprintf(stderr, "cannot relate a kernel activity\n");
+        continue;
+      }
+      auto *event = profile_pb.add_events();
+      event->set_name(correlations_.at(r.correlation_id));
+      event->set_start_ns(r.start_ns);
+      event->set_end_ns(r.end_ns);
+      event->set_stream_id(r.stream_id);
+      event->set_device_id(r.device_id);
+      event_times[event->name()].push_back(r.end_ns - r.start_ns);
+    }
+    for (const auto &et : event_times) {
+      // Sum in uint64_t: an int seed makes std::accumulate truncate the total.
+      const uint64_t total_ns =
+          std::accumulate(et.second.begin(), et.second.end(), uint64_t{0});
+      fprintf(stderr, "%s: total: %fms invoked cuda kernels: %lu\n",
+              et.first.c_str(), total_ns / 1000000.0, et.second.size());
+    }
+    return profile_pb;
+  }
+
+  void Disable() {
+    // flush might cause additional calls to DeviceTracer.
+ dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED); + std::lock_guard l(trace_mu_); + DisableActivity(); + dynload::cuptiUnsubscribe(subscriber_); + CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_)); + PADDLE_ENFORCE(dynload::cuptiFinalize()); + enabled_ = false; + } + + private: + static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain, + CUpti_CallbackId cbid, const void *cbdata) { + auto *cbInfo = reinterpret_cast(cbdata); + DeviceTracer *tracer = reinterpret_cast(userdata); + + if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) && + (cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) { + if (cbInfo->callbackSite == CUPTI_API_ENTER) { + const std::string anno = + cur_annotation ? cur_annotation : cbInfo->symbolName; + tracer->AddAnnotation(cbInfo->correlationId, anno); + } + } else { + VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid; + } + } + + std::mutex trace_mu_; + bool enabled_; + uint64_t start_ns_; + uint64_t end_ns_; + std::vector kernel_records_; + std::unordered_map correlations_; + CUpti_SubscriberHandle subscriber_; +}; + +#endif // PADDLE_WITH_CUPTI + +class DeviceTracerDummy : public DeviceTracer { + public: + DeviceTracerDummy() {} + + void AddAnnotation(uint64_t id, const std::string &anno) {} + + void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, + uint32_t stream_id, uint32_t correlation_id) {} + + bool IsEnabled() { return false; } + + void Enable() {} + + proto::Profile GenProfile() { return proto::Profile(); } + + void Disable() {} +}; + +void CreateTracer(DeviceTracer **t) { +#ifdef PADDLE_WITH_CUPTI + *t = new DeviceTracerImpl(); +#else + *t = new DeviceTracerDummy(); +#endif // PADDLE_WITH_CUPTI +} + +DeviceTracer *GetDeviceTracer() { + std::call_once(tracer_once_flag, CreateTracer, &tracer); + return tracer; +} + +void SetCurAnnotation(const char *anno) { cur_annotation = anno; } + +void ClearCurAnnotation() { cur_annotation = nullptr; } + +} // namespace platform +} // 
namespace paddle diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h new file mode 100644 index 0000000000000000000000000000000000000000..06cea84cc80ebefe9f5c396673cc9a35673f718f --- /dev/null +++ b/paddle/fluid/platform/device_tracer.h @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/platform/dynload/cupti.h" +#include "paddle/fluid/platform/profiler.pb.h" + +namespace paddle { +namespace platform { + +/////////////////////// +// WARN: Under Development. Don't depend on it yet. +////////////////////// + +// DeviceTracer performs the following tasks: +// 1. Register cuda callbacks for various events: kernel, memcpy, etc. +// 2. Collect cuda statistics: start/end ts, memory, etc. +// 3. Generate a protobuf for further analysis. +class DeviceTracer { + public: + struct KernelRecord { + uint64_t start_ns; + uint64_t end_ns; + uint32_t device_id; + uint32_t stream_id; + uint32_t correlation_id; + }; + + virtual ~DeviceTracer() {} + // Needs to be called once before use. + virtual void Enable() = 0; + // Needs to be called once after use. + virtual void Disable() = 0; + + // Add a pair to correlate internal cuda id with high level + // annotation (string). So cuda statistics can be represented by + // human-readable annotations. + virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0; + + // Add a cuda kernel stats. 
`correlation_id` will be mapped to annotation + // added before for human readability. + virtual void AddKernelRecords(uint64_t start, uint64_t end, + uint32_t device_id, uint32_t stream_id, + uint32_t correlation_id) = 0; + + // Generate a proto after done (Disabled). + virtual proto::Profile GenProfile() = 0; + + virtual bool IsEnabled() = 0; +}; + +// Get a DeviceTracer. +DeviceTracer* GetDeviceTracer(); + +// Set a name for the cuda kernel operation being launched by the thread. +void SetCurAnnotation(const char* anno); +// Clear the name after the operation is done. +void ClearCurAnnotation(); + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 264b4ebf2c06d9e688a32a223dff3ec079333fd9..567c137a55e4e0cb0b5080893be305e847bb61e1 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,4 +1,8 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) -nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc - DEPS dynamic_loader) + +list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc) +if (CUPTI_FOUND) + list(APPEND CUDA_SRCS cupti.cc) +endif(CUPTI_FOUND) +nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) diff --git a/paddle/fluid/platform/dynload/cupti.cc b/paddle/fluid/platform/dynload/cupti.cc new file mode 100644 index 0000000000000000000000000000000000000000..a25660c6ed411bbe444ac8aa10a324cbed9c9d4f --- /dev/null +++ b/paddle/fluid/platform/dynload/cupti.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_CUPTI + +#include "paddle/fluid/platform/dynload/cupti.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag cupti_dso_flag; +void *cupti_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUPTI_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle + +#endif // PADDLE_WITH_CUPTI diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h new file mode 100644 index 0000000000000000000000000000000000000000..a79868c18c14b6bcdf85d60e766c7ec8be993f28 --- /dev/null +++ b/paddle/fluid/platform/dynload/cupti.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef PADDLE_WITH_CUPTI +#include +#include +#include +#include +#include "paddle/fluid/platform/dynload/dynamic_loader.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag cupti_dso_flag; +extern void *cupti_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load cupti routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#ifdef PADDLE_USE_DSO +#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline CUptiResult CUPTIAPI operator()(Args... args) { \ + typedef CUptiResult CUPTIAPI (*cuptiFunc)(Args...); \ + std::call_once(cupti_dso_flag, \ + paddle::platform::dynload::GetCUPTIDsoHandle, \ + &cupti_dso_handle); \ + void *p_##__name = dlsym(cupti_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#else +#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline CUptiResult CUPTIAPI operator()(Args... 
args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +#endif + +#define CUPTI_ROUTINE_EACH(__macro) \ + __macro(cuptiActivityEnable); \ + __macro(cuptiActivityDisable); \ + __macro(cuptiActivityRegisterCallbacks); \ + __macro(cuptiActivityGetAttribute); \ + __macro(cuptiActivitySetAttribute); \ + __macro(cuptiGetTimestamp); \ + __macro(cuptiActivityGetNextRecord); \ + __macro(cuptiGetResultString); \ + __macro(cuptiActivityGetNumDroppedRecords); \ + __macro(cuptiActivityFlushAll); \ + __macro(cuptiFinalize); \ + __macro(cuptiSubscribe); \ + __macro(cuptiUnsubscribe); \ + __macro(cuptiEnableCallback); + +CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP +} // namespace dynload +} // namespace platform +} // namespace paddle + +#endif // PADDLE_WITH_CUPTI diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index db1eb41f28e67ee4ed6b276714db989bd25ece2e..8eb5966e5776004a03fee17b74ae72614331a694 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -40,10 +40,14 @@ DEFINE_string(nccl_dir, "", "libcurand. For instance, /usr/local/cuda/lib64. 
If default, " "dlopen will search cuda from LD_LIBRARY_PATH"); +DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); + namespace paddle { namespace platform { namespace dynload { +static const char* cupti_lib_path = CUPTI_LIB_PATH; + static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -143,6 +147,18 @@ void GetCUDNNDsoHandle(void** dso_handle) { #endif } +void GetCUPTIDsoHandle(void** dso_handle) { + std::string cupti_path = cupti_lib_path; + if (!FLAGS_cupti_dir.empty()) { + cupti_path = FLAGS_cupti_dir; + } +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", dso_handle, false); +#else + GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", dso_handle, false); +#endif +} + void GetCurandDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 4ffc335332698d1aba262edf2800965e72de77cb..b5b9c4af916241c1c7361b506f74563ebcf69b9a 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -34,6 +34,8 @@ void GetCublasDsoHandle(void** dso_handle); */ void GetCUDNNDsoHandle(void** dso_handle); +void GetCUPTIDsoHandle(void** dso_handle); + /** * @brief load the DSO of CURAND * diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 4804df7966dfedf7264eebaad3a42ed92739b096..201fc872946b70e3d7fbc318c8b04781056279b9 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -15,7 +15,13 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler.h" #include #include +#ifdef PADDLE_WITH_CUDA +#include +#endif // PADDLE_WITH_CUDA #include "glog/logging.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/platform/device_tracer.h" +#include "paddle/fluid/string/printf.h" namespace paddle { namespace platform { @@ -126,15 +132,20 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); } -RecordEvent::RecordEvent(const std::string& name, - const DeviceContext* dev_ctx) { +RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx, + int32_t block_id) { if (g_state == ProfilerState::kDisabled) return; dev_ctx_ = dev_ctx; name_ = name; PushEvent(name_, dev_ctx_); + + full_name_ = string::Sprintf("%s_b%d", name, block_id); + // Maybe need the same push/pop behavior. + SetCurAnnotation(full_name_.c_str()); } RecordEvent::~RecordEvent() { + ClearCurAnnotation(); if (g_state == ProfilerState::kDisabled) return; PopEvent(name_, dev_ctx_); } @@ -147,7 +158,14 @@ void EnableProfiler(ProfilerState state) { "The profiling state should be disabled when calling ", "EnableProfiler."); g_state = state; - g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU"; + if (g_state == ProfilerState::kCUDA) { + g_profiler_place = "CUDA"; + } else if (g_state == ProfilerState::kCPU) { + g_profiler_place = "CPU"; + } else { + g_profiler_place = "All"; + GetDeviceTracer()->Enable(); + } #ifdef PADDLE_WITH_CUDA if (g_state == ProfilerState::kCUDA) { // Generate some dummy evenets first to reduce the startup overhead. 
@@ -190,6 +208,12 @@ void DisableProfiler(EventSortingKey sorted_key) { Mark("_stop_profiler_", nullptr); g_state = ProfilerState::kDisabled; + DeviceTracer* tracer = GetDeviceTracer(); // EnableProfiler calls Enable() on this tracer only when the place is "All". + if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) { + tracer->Disable(); + tracer->GenProfile(); // NOTE(review): presumably writes out the collected device profile; confirm in device_tracer.h. + } + std::vector> all_events = GetAllEvents(); ParseEvents(all_events, sorted_key); ResetProfiler(); @@ -254,9 +278,11 @@ void ParseEvents(std::vector>& events, } if (rit != pushed_events.rend()) { - double event_time = (g_profiler_place == "CUDA") - ? rit->CudaElapsedMs(events[i][j]) - : rit->CpuElapsedMs(events[i][j]); + double event_time = // "All" uses the CUDA-elapsed time, same as "CUDA". + (g_profiler_place == "CUDA" || g_profiler_place == "All") + ? rit->CudaElapsedMs(events[i][j]) + : rit->CpuElapsedMs(events[i][j]); + std::string event_name = "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); max_name_width = std::max(max_name_width, event_name.size()); diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index a3d22df70057e7967d9fc349ea0cbd73ceb8e0e9..830b86c88ee11b217114c95348c2d25d0dcdf961 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/profiler.pb.h" namespace paddle { namespace platform { @@ -93,6 +94,7 @@ enum ProfilerState { kDisabled, // disabled state kCPU, // CPU profiling state kCUDA, // GPU profiling state + kAll, // Profile both CPU and GPU. (Currently experimental).
}; void Mark(const std::string& name, const DeviceContext* dev_ctx); @@ -102,7 +104,8 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx); void PopEvent(const std::string& name, const DeviceContext* dev_ctx); struct RecordEvent { - explicit RecordEvent(const std::string& name, const DeviceContext* dev_ctx); + RecordEvent(const std::string& name, const DeviceContext* dev_ctx, // block_id is combined with name to form full_name_ (see profiler.cc). + int32_t block_id); ~RecordEvent(); @@ -110,9 +113,12 @@ struct RecordEvent { const DeviceContext* dev_ctx_; // Event name std::string name_; + // Need to distinguish name by op type, block_id, program_id and perhaps + // different kernel invocations within an op. Currently "<name>_b<block_id>". + std::string full_name_; }; -// Return the event list of all threads. Asummed the returned value calls +// Return the event list of all threads. Assuming the returned value is called // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. std::vector> GetAllEvents(); diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto new file mode 100644 index 0000000000000000000000000000000000000000..bdd86a0440d2b00eaee14195030456d0ad217f9a --- /dev/null +++ b/paddle/fluid/platform/profiler.proto @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +syntax = "proto2"; +package paddle.platform.proto; + +message Event { // A named event with start/end timestamps plus device and stream ids. + optional string name = 1; + optional uint64 start_ns = 2; + optional uint64 end_ns = 3; + optional uint32 device_id = 5; // NOTE: field number 4 is not used. + optional uint32 stream_id = 6; +} + +message Profile { // A list of Events plus its own start_ns/end_ns pair. + repeated Event events = 1; + optional uint64 start_ns = 2; + optional uint64 end_ns = 3; +} \ No newline at end of file diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index dae4d2206e0a1ec6ef99122460a15c064efe58fd..8bc480857a4c3ae2825f08a8d9ed9c152adb80d4 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -95,7 +95,7 @@ TEST(RecordEvent, RecordEvent) { */ for (int i = 1; i < 5; ++i) { std::string name = "evs_op_" + std::to_string(i); - RecordEvent record_event(name, dev_ctx); + RecordEvent record_event(name, dev_ctx, 0); int counter = 1; while (counter != i * 1000) counter++; } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b3e03f33470810a685dc7bfe29f8da50454b2238..ac7d1efb577505b70e10a70cdcfd3ed9c5fe1f5c 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -459,6 +459,7 @@ All parameter, weight, gradient are variables in Paddle. .value("kDisabled", platform::ProfilerState::kDisabled) .value("kCPU", platform::ProfilerState::kCPU) .value("kCUDA", platform::ProfilerState::kCUDA) + .value("kAll", platform::ProfilerState::kAll) .export_values(); py::enum_(m, "EventSortingKey", py::arithmetic()) diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index 4611986c9969f12b71290cf8ee03a50a6ad76f94..59e75209d39dc0f2b72ecf832ff15df192a2898e 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -97,9 +97,14 @@ def profiler(state, sorted_key=None): The `ave` means sorting by the average execution time.
""" - if state not in ['CPU', 'GPU']: - raise ValueError("The state must be 'CPU' or 'GPU'.") - prof_state = core.ProfilerState.kCUDA if state == "GPU" else core.ProfilerState.kCPU + if state not in ['CPU', 'GPU', "All"]: + raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.") + if state == "GPU": + prof_state = core.ProfilerState.kCUDA + elif state == "CPU": + prof_state = core.ProfilerState.kCPU + else: # state == "All", the only value left after the membership check above + prof_state = core.ProfilerState.kAll core.enable_profiler(prof_state) yield diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 9355f51311e33729c0cb8ff321010235aafa4063..f96c2ca4f0593b6c2624d449304f23425c69ab93 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -41,6 +41,7 @@ list(REMOVE_ITEM TEST_OPS test_while_op) list(REMOVE_ITEM TEST_OPS test_lod_array_length_op) list(REMOVE_ITEM TEST_OPS test_reorder_lod_tensor) list(REMOVE_ITEM TEST_OPS test_profiler) +list(REMOVE_ITEM TEST_OPS test_nvprof) list(REMOVE_ITEM TEST_OPS test_normalization_wrapper) list(REMOVE_ITEM TEST_OPS test_executor_and_mul) list(REMOVE_ITEM TEST_OPS test_assign_value_op) @@ -75,6 +76,7 @@ py_test_modules(test_while_op MODULES test_while_op) py_test_modules(test_lod_array_length_op MODULES test_lod_array_length_op) py_test_modules(test_reorder_lod_tensor MODULES test_reorder_lod_tensor) py_test_modules(test_profiler MODULES test_profiler) +py_test_modules(test_nvprof MODULES test_nvprof) py_test_modules(test_normalization_wrapper MODULES test_normalization_wrapper) py_test_modules(test_executor_and_mul MODULES test_executor_and_mul) py_test_modules(test_assign_value_op MODULES test_assign_value_op) diff --git a/python/paddle/fluid/tests/unittests/test_nvprof.py b/python/paddle/fluid/tests/unittests/test_nvprof.py new file mode 100644 index 0000000000000000000000000000000000000000..226e5e5d1131b1f33cfbbfefec536e6974f85b36 --- /dev/null +++
b/python/paddle/fluid/tests/unittests/test_nvprof.py @@ -0,0 +1,46 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import os +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.layers as layers +import paddle.fluid.core as core + + +class TestNVProf(unittest.TestCase): # Same test body as the test_nvprof method this patch removes from TestProfiler in test_profiler.py. + def test_nvprof(self): + if not fluid.core.is_compiled_with_cuda(): + return # nothing is checked without a CUDA build + epoc = 8 # number of exe.run iterations executed under the profiler + dshape = [4, 3, 28, 28] # feed shape: a leading 4 in front of the [3, 28, 28] declared for 'data' + data = layers.data(name='data', shape=[3, 28, 28], dtype='float32') + conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) # the only layer in the program; 'conv' itself is not read again + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + output_file = 'cuda_profiler.txt' + with profiler.cuda_profiler(output_file, 'csv') as nvprof: + for i in range(epoc): + input = np.random.random(dshape).astype('float32') + exe.run(fluid.default_main_program(), feed={'data': input}) + os.remove(output_file) # delete the file that was passed to cuda_profiler + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py index d9444b50a2362d4d122ea880d47d337426fbdc96..f6f581ff7d67260dad50b285aa35276698fd7130 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -22,27 +22,9 @@ import paddle.fluid.core as core class
TestProfiler(unittest.TestCase): - def test_nvprof(self): - if not fluid.core.is_compiled_with_cuda(): - return - epoc = 8 - dshape = [4, 3, 28, 28] - data = layers.data(name='data', shape=[3, 28, 28], dtype='float32') - conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) - - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - output_file = 'cuda_profiler.txt' - with profiler.cuda_profiler(output_file, 'csv') as nvprof: - for i in range(epoc): - input = np.random.random(dshape).astype('float32') - exe.run(fluid.default_main_program(), feed={'data': input}) - os.remove(output_file) - def net_profiler(self, state): - if state == 'GPU' and not core.is_compiled_with_cuda(): + enable_if_gpu = state == 'GPU' or state == "All" # both states are skipped below unless Paddle is compiled with CUDA + if enable_if_gpu and not core.is_compiled_with_cuda(): return startup_program = fluid.Program() main_program = fluid.Program() @@ -85,6 +67,9 @@ class TestProfiler(unittest.TestCase): def test_cuda_profiler(self): self.net_profiler('GPU') + def test_all_profiler(self): # runs net_profiler with the 'All' state string + self.net_profiler('All') + if __name__ == '__main__': unittest.main()