提交 b9ec24c6 编写于 作者: X Xin Pan

Extend current profiler for timeline and more features.

上级 2c89d975
...@@ -146,6 +146,7 @@ include(external/cares) ...@@ -146,6 +146,7 @@ include(external/cares)
include(external/grpc) include(external/grpc)
include(cudnn) # set cudnn libraries, must before configure include(cudnn) # set cudnn libraries, must before configure
include(cupti)
include(configure) # add paddle env configuration include(configure) # add paddle env configuration
include(generic) # simplify cmake module include(generic) # simplify cmake module
include(package) # set paddle packages include(package) # set paddle packages
......
...@@ -59,6 +59,7 @@ endif(NOT WITH_GOLANG) ...@@ -59,6 +59,7 @@ endif(NOT WITH_GOLANG)
if(NOT WITH_GPU) if(NOT WITH_GPU)
add_definitions(-DHPPL_STUB_FUNC) add_definitions(-DHPPL_STUB_FUNC)
add_definitions("-DCUPTI_LIB_PATH=\"\"")
list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
else() else()
...@@ -73,7 +74,14 @@ else() ...@@ -73,7 +74,14 @@ else()
if(NOT CUDNN_FOUND) if(NOT CUDNN_FOUND)
message(FATAL_ERROR "Paddle needs cudnn to compile") message(FATAL_ERROR "Paddle needs cudnn to compile")
endif() endif()
if(CUPTI_FOUND)
include_directories(${CUPTI_INCLUDE_DIR})
add_definitions(-DPADDLE_WITH_CUPTI)
add_definitions("-DCUPTI_LIB_PATH=\"${CUPTI_LIBRARY_PATH}\"")
else()
add_definitions("-DCUPTI_LIB_PATH=\"\"")
message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.")
endif()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}") set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
# Include cuda and cudnn # Include cuda and cudnn
......
...@@ -155,7 +155,8 @@ endif() ...@@ -155,7 +155,8 @@ endif()
include_directories(${CUDA_INCLUDE_DIRS}) include_directories(${CUDA_INCLUDE_DIRS})
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO) if(NOT WITH_DSO)
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY}) # TODO(panyx0718): CUPTI only allows DSO?
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
endif(NOT WITH_DSO) endif(NOT WITH_DSO)
# setting nvcc arch flags # setting nvcc arch flags
......
if(NOT WITH_GPU)
return()
endif()
set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT")
find_path(CUPTI_INCLUDE_DIR cupti.h
PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include
$ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include
${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include
NO_DEFAULT_PATH
)
get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
set(TARGET_ARCH "x86_64")
if(NOT ${CMAKE_SYSTEM_PROCESSOR})
set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
endif()
list(APPEND CUPTI_CHECK_LIBRARY_DIRS
${CUPTI_ROOT}
${CUPTI_ROOT}/lib64
${CUPTI_ROOT}/lib
${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu
$ENV{CUPTI_ROOT}
$ENV{CUPTI_ROOT}/lib64
$ENV{CUPTI_ROOT}/lib
/usr/lib
${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64)
find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a
PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist}
NO_DEFAULT_PATH
DOC "Path to cuPTI library.")
get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY)
if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY)
set(CUPTI_FOUND ON)
else()
set(CUPTI_FOUND OFF)
endif()
...@@ -127,7 +127,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, ...@@ -127,7 +127,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(op->Type(), pool.Get(place_)); // TODO(panyx0718): Need a program id to distinguish programs.
platform::RecordEvent record_event(op->Type(), pool.Get(place_),
op_desc->Block()->ID());
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
op->Run(*local_scope, place_); op->Run(*local_scope, place_);
......
...@@ -167,4 +167,6 @@ message BlockDesc { ...@@ -167,4 +167,6 @@ message BlockDesc {
// Please refer to // Please refer to
// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md // https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md
// for more details. // for more details.
// TODO(panyx0718): A model can have multiple programs. Need a
// way to distinguish them. Maybe ID or name?
message ProgramDesc { repeated BlockDesc blocks = 1; } message ProgramDesc { repeated BlockDesc blocks = 1; }
...@@ -125,6 +125,8 @@ class OpDesc { ...@@ -125,6 +125,8 @@ class OpDesc {
BlockDesc *Block() { return this->block_; } BlockDesc *Block() { return this->block_; }
const BlockDesc &BlockRef() const { return *this->block_; }
void SetBlock(BlockDesc *block) { this->block_ = block; } void SetBlock(BlockDesc *block) { this->block_ = block; }
private: private:
......
proto_library(profiler_proto SRCS profiler.proto)
if(WITH_GPU) if(WITH_GPU)
cc_library(enforce SRCS enforce.cc DEPS) cc_library(enforce SRCS enforce.cc DEPS)
else() else()
...@@ -37,7 +39,8 @@ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) ...@@ -37,7 +39,8 @@ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context) nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)
cc_library(profiler SRCS profiler.cc DEPS device_context) cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
nv_test(float16_gpu_test SRCS float16_test.cu) nv_test(float16_gpu_test SRCS float16_test.cu)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h"
#include <map>
#include <mutex>
#include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace platform {
namespace {
thread_local const char *cur_annotation = nullptr;
std::once_flag tracer_once_flag;
DeviceTracer *tracer = nullptr;
} // namespace
#ifdef PADDLE_WITH_CUPTI
namespace {
// TODO(panyx0718): Revisit the buffer size here.
uint64_t kBufSize = 32 * 1024;
uint64_t kAlignSize = 8;
#define ALIGN_BUFFER(buffer, align) \
(((uintptr_t)(buffer) & ((align)-1)) \
? ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) \
: (buffer))
#define CUPTI_CALL(call) \
do { \
CUptiResult _status = call; \
if (_status != CUPTI_SUCCESS) { \
const char *errstr; \
dynload::cuptiGetResultString(_status, &errstr); \
fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
__FILE__, __LINE__, #call, errstr); \
exit(-1); \
} \
} while (0)
void EnableActivity() {
// Device activity record is created when CUDA initializes, so we
// want to enable it before cuInit() or any CUDA runtime call.
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
// We don't track these activities for now.
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_NAME));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MARKER));
}
void DisableActivity() {
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE));
// Disable all other activity record kinds.
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD));
}
void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
size_t *maxNumRecords) {
uint8_t *buf = (uint8_t *)malloc(kBufSize + kAlignSize);
*size = kBufSize;
*buffer = ALIGN_BUFFER(buf, kAlignSize);
*maxNumRecords = 0;
}
void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
size_t size, size_t validSize) {
CUptiResult status;
CUpti_Activity *record = NULL;
if (validSize > 0) {
do {
status = dynload::cuptiActivityGetNextRecord(buffer, validSize, &record);
if (status == CUPTI_SUCCESS) {
switch (record->kind) {
case CUPTI_ACTIVITY_KIND_KERNEL:
case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
auto *kernel =
reinterpret_cast<const CUpti_ActivityKernel3 *>(record);
tracer->AddKernelRecords(kernel->start, kernel->end,
kernel->deviceId, kernel->streamId,
kernel->correlationId);
break;
}
default: { break; }
}
} else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
// Seems not an error in this case.
break;
} else {
CUPTI_CALL(status);
}
} while (1);
size_t dropped;
CUPTI_CALL(
dynload::cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
if (dropped != 0) {
fprintf(stderr, "Dropped %u activity records\n", (unsigned int)dropped);
}
}
free(buffer);
}
} // namespace
class DeviceTracerImpl : public DeviceTracer {
public:
DeviceTracerImpl() : enabled_(false) {}
void AddAnnotation(uint64_t id, const std::string &anno) {
std::lock_guard<std::mutex> l(trace_mu_);
correlations_[id] = anno;
}
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {
std::lock_guard<std::mutex> l(trace_mu_);
kernel_records_.push_back(
KernelRecord{start, end, device_id, stream_id, correlation_id});
}
bool IsEnabled() {
std::lock_guard<std::mutex> l(trace_mu_);
return enabled_;
}
void Enable() {
std::lock_guard<std::mutex> l(trace_mu_);
if (enabled_) {
fprintf(stderr, "DeviceTracer already enabled\n");
return;
}
EnableActivity();
// Register callbacks for buffer requests and completed by CUPTI.
CUPTI_CALL(dynload::cuptiActivityRegisterCallbacks(bufferRequested,
bufferCompleted));
CUptiResult ret;
ret = dynload::cuptiSubscribe(
&subscriber_, static_cast<CUpti_CallbackFunc>(ApiCallback), this);
if (ret == CUPTI_ERROR_MAX_LIMIT_REACHED) {
fprintf(stderr, "CUPTI subcriber limit reached.\n");
} else if (ret != CUPTI_SUCCESS) {
fprintf(stderr, "Failed to create CUPTI subscriber.\n");
}
CUPTI_CALL(
dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
enabled_ = true;
}
proto::Profile GenProfile() {
std::lock_guard<std::mutex> l(trace_mu_);
proto::Profile profile_pb;
profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_);
std::map<std::string, std::vector<uint64_t>> event_times;
for (const KernelRecord &r : kernel_records_) {
if (correlations_.find(r.correlation_id) == correlations_.end()) {
fprintf(stderr, "cannot relate a kernel activity\n");
continue;
}
auto *event = profile_pb.add_events();
event->set_name(correlations_.at(r.correlation_id));
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_stream_id(r.stream_id);
event->set_device_id(r.device_id);
event_times[event->name()].push_back(r.end_ns - r.start_ns);
}
for (const auto &et : event_times) {
fprintf(
stderr, "%s: total: %fms invoked cuda kernels: %lu\n",
et.first.c_str(),
std::accumulate(et.second.begin(), et.second.end(), 0) / 1000000.0,
et.second.size());
}
return profile_pb;
}
void Disable() {
// flush might cause additional calls to DeviceTracker.
dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED);
std::lock_guard<std::mutex> l(trace_mu_);
DisableActivity();
dynload::cuptiUnsubscribe(subscriber_);
CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
PADDLE_ENFORCE(dynload::cuptiFinalize());
enabled_ = false;
}
private:
static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid, const void *cbdata) {
auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
DeviceTracer *tracer = reinterpret_cast<DeviceTracer *>(userdata);
if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) &&
(cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) {
if (cbInfo->callbackSite == CUPTI_API_ENTER) {
const std::string anno =
cur_annotation ? cur_annotation : cbInfo->symbolName;
tracer->AddAnnotation(cbInfo->correlationId, anno);
}
} else {
VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid;
}
}
std::mutex trace_mu_;
bool enabled_;
uint64_t start_ns_;
uint64_t end_ns_;
std::vector<KernelRecord> kernel_records_;
std::unordered_map<uint32_t, std::string> correlations_;
CUpti_SubscriberHandle subscriber_;
};
#endif // PADDLE_WITH_CUPTI
class DeviceTracerDummy : public DeviceTracer {
public:
DeviceTracerDummy() {}
void AddAnnotation(uint64_t id, const std::string &anno) {}
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {}
bool IsEnabled() { return false; }
void Enable() {}
proto::Profile GenProfile() { return proto::Profile(); }
void Disable() {}
};
void CreateTracer(DeviceTracer **t) {
#ifdef PADDLE_WITH_CUPTI
*t = new DeviceTracerImpl();
#else
*t = new DeviceTracerDummy();
#endif // PADDLE_WITH_CUPTI
}
DeviceTracer *GetDeviceTracer() {
std::call_once(tracer_once_flag, CreateTracer, &tracer);
return tracer;
}
void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
void ClearCurAnnotation() { cur_annotation = nullptr; }
} // namespace platform
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/profiler.pb.h"
namespace paddle {
namespace platform {
///////////////////////
// WARN: Under Development. Don't depend on it yet.
//////////////////////
// DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
// 2. Collect cuda statistics: start/end ts, memory, etc.
// 3. Generate a protobuf for further analysis.
class DeviceTracer {
public:
struct KernelRecord {
uint64_t start_ns;
uint64_t end_ns;
uint32_t device_id;
uint32_t stream_id;
uint32_t correlation_id;
};
virtual ~DeviceTracer() {}
// Needs to be called once before use.
virtual void Enable() = 0;
// Needs to be called once after use.
virtual void Disable() = 0;
// Add a pair to correlate internal cuda id with high level
// annotation (string). So cuda statistics can be represented by
// human-readable annotations.
virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability.
virtual void AddKernelRecords(uint64_t start, uint64_t end,
uint32_t device_id, uint32_t stream_id,
uint32_t correlation_id) = 0;
// Generate a proto after done (Disabled).
virtual proto::Profile GenProfile() = 0;
virtual bool IsEnabled() = 0;
};
// Get a DeviceTracer.
DeviceTracer* GetDeviceTracer();
// Set a name for the cuda kernel operation being launched by the thread.
void SetCurAnnotation(const char* anno);
// Clear the name after the operation is done.
void ClearCurAnnotation();
} // namespace platform
} // namespace paddle
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc
DEPS dynamic_loader) list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc)
if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND)
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_CUPTI
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag cupti_dso_flag;
void *cupti_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
CUPTI_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace platform
} // namespace paddle
#endif // PADDLE_WITH_CUPTI
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUPTI
#include <cuda.h>
#include <cupti.h>
#include <dlfcn.h>
#include <mutex>
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag cupti_dso_flag;
extern void *cupti_dso_handle;
/**
* The following macro definition can generate structs
* (for each function) to dynamic load cupti routine
* via operator overloading.
*
* note: default dynamic linked libs
*/
#ifdef PADDLE_USE_DSO
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline CUptiResult CUPTIAPI operator()(Args... args) { \
typedef CUptiResult CUPTIAPI (*cuptiFunc)(Args...); \
std::call_once(cupti_dso_flag, \
paddle::platform::dynload::GetCUPTIDsoHandle, \
&cupti_dso_handle); \
void *p_##__name = dlsym(cupti_dso_handle, #__name); \
return reinterpret_cast<cuptiFunc>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#else
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
inline CUptiResult CUPTIAPI operator()(Args... args) { \
return __name(args...); \
} \
}; \
extern DynLoad__##__name __name
#endif
#define CUPTI_ROUTINE_EACH(__macro) \
__macro(cuptiActivityEnable); \
__macro(cuptiActivityDisable); \
__macro(cuptiActivityRegisterCallbacks); \
__macro(cuptiActivityGetAttribute); \
__macro(cuptiActivitySetAttribute); \
__macro(cuptiGetTimestamp); \
__macro(cuptiActivityGetNextRecord); \
__macro(cuptiGetResultString); \
__macro(cuptiActivityGetNumDroppedRecords); \
__macro(cuptiActivityFlushAll); \
__macro(cuptiFinalize); \
__macro(cuptiSubscribe); \
__macro(cuptiUnsubscribe); \
__macro(cuptiEnableCallback);
CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
#undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
#endif // PADDLE_WITH_CUPTI
...@@ -40,10 +40,14 @@ DEFINE_string(nccl_dir, "", ...@@ -40,10 +40,14 @@ DEFINE_string(nccl_dir, "",
"libcurand. For instance, /usr/local/cuda/lib64. If default, " "libcurand. For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH"); "dlopen will search cuda from LD_LIBRARY_PATH");
DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
static const char* cupti_lib_path = CUPTI_LIB_PATH;
static inline std::string join(const std::string& part1, static inline std::string join(const std::string& part1,
const std::string& part2) { const std::string& part2) {
// directory separator // directory separator
...@@ -143,6 +147,18 @@ void GetCUDNNDsoHandle(void** dso_handle) { ...@@ -143,6 +147,18 @@ void GetCUDNNDsoHandle(void** dso_handle) {
#endif #endif
} }
void GetCUPTIDsoHandle(void** dso_handle) {
std::string cupti_path = cupti_lib_path;
if (!FLAGS_cupti_dir.empty()) {
cupti_path = FLAGS_cupti_dir;
}
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", dso_handle, false);
#else
GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", dso_handle, false);
#endif
}
void GetCurandDsoHandle(void** dso_handle) { void GetCurandDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
......
...@@ -34,6 +34,8 @@ void GetCublasDsoHandle(void** dso_handle); ...@@ -34,6 +34,8 @@ void GetCublasDsoHandle(void** dso_handle);
*/ */
void GetCUDNNDsoHandle(void** dso_handle); void GetCUDNNDsoHandle(void** dso_handle);
void GetCUPTIDsoHandle(void** dso_handle);
/** /**
* @brief load the DSO of CURAND * @brief load the DSO of CURAND
* *
......
...@@ -15,7 +15,13 @@ limitations under the License. */ ...@@ -15,7 +15,13 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include <iomanip> #include <iomanip>
#include <map> #include <map>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif // PADDLE_WITH_CUDA
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/string/printf.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -126,15 +132,20 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { ...@@ -126,15 +132,20 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
} }
RecordEvent::RecordEvent(const std::string& name, RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx,
const DeviceContext* dev_ctx) { int32_t block_id) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
dev_ctx_ = dev_ctx; dev_ctx_ = dev_ctx;
name_ = name; name_ = name;
PushEvent(name_, dev_ctx_); PushEvent(name_, dev_ctx_);
full_name_ = string::Sprintf("%s_b%d", name, block_id);
// Maybe need the same push/pop behavior.
SetCurAnnotation(full_name_.c_str());
} }
RecordEvent::~RecordEvent() { RecordEvent::~RecordEvent() {
ClearCurAnnotation();
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
PopEvent(name_, dev_ctx_); PopEvent(name_, dev_ctx_);
} }
...@@ -147,7 +158,14 @@ void EnableProfiler(ProfilerState state) { ...@@ -147,7 +158,14 @@ void EnableProfiler(ProfilerState state) {
"The profiling state should be disabled when calling ", "The profiling state should be disabled when calling ",
"EnableProfiler."); "EnableProfiler.");
g_state = state; g_state = state;
g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU"; if (g_state == ProfilerState::kCUDA) {
g_profiler_place = "CUDA";
} else if (g_state == ProfilerState::kCPU) {
g_profiler_place = "CPU";
} else {
g_profiler_place = "All";
GetDeviceTracer()->Enable();
}
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (g_state == ProfilerState::kCUDA) { if (g_state == ProfilerState::kCUDA) {
// Generate some dummy evenets first to reduce the startup overhead. // Generate some dummy evenets first to reduce the startup overhead.
...@@ -190,6 +208,12 @@ void DisableProfiler(EventSortingKey sorted_key) { ...@@ -190,6 +208,12 @@ void DisableProfiler(EventSortingKey sorted_key) {
Mark("_stop_profiler_", nullptr); Mark("_stop_profiler_", nullptr);
g_state = ProfilerState::kDisabled; g_state = ProfilerState::kDisabled;
DeviceTracer* tracer = GetDeviceTracer();
if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) {
tracer->Disable();
tracer->GenProfile();
}
std::vector<std::vector<Event>> all_events = GetAllEvents(); std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, sorted_key); ParseEvents(all_events, sorted_key);
ResetProfiler(); ResetProfiler();
...@@ -254,9 +278,11 @@ void ParseEvents(std::vector<std::vector<Event>>& events, ...@@ -254,9 +278,11 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
} }
if (rit != pushed_events.rend()) { if (rit != pushed_events.rend()) {
double event_time = (g_profiler_place == "CUDA") double event_time =
(g_profiler_place == "CUDA" || g_profiler_place == "All")
? rit->CudaElapsedMs(events[i][j]) ? rit->CudaElapsedMs(events[i][j])
: rit->CpuElapsedMs(events[i][j]); : rit->CpuElapsedMs(events[i][j]);
std::string event_name = std::string event_name =
"thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); "thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
max_name_width = std::max(max_name_width, event_name.size()); max_name_width = std::max(max_name_width, event_name.size());
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <mutex> #include <mutex>
#include <vector> #include <vector>
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.pb.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -93,6 +94,7 @@ enum ProfilerState { ...@@ -93,6 +94,7 @@ enum ProfilerState {
kDisabled, // disabled state kDisabled, // disabled state
kCPU, // CPU profiling state kCPU, // CPU profiling state
kCUDA, // GPU profiling state kCUDA, // GPU profiling state
kAll, // Profile both CPU and GPU. (Currently experimental).
}; };
void Mark(const std::string& name, const DeviceContext* dev_ctx); void Mark(const std::string& name, const DeviceContext* dev_ctx);
...@@ -102,7 +104,8 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx); ...@@ -102,7 +104,8 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
void PopEvent(const std::string& name, const DeviceContext* dev_ctx); void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
struct RecordEvent { struct RecordEvent {
explicit RecordEvent(const std::string& name, const DeviceContext* dev_ctx); RecordEvent(const std::string& name, const DeviceContext* dev_ctx,
int32_t block_id);
~RecordEvent(); ~RecordEvent();
...@@ -110,9 +113,12 @@ struct RecordEvent { ...@@ -110,9 +113,12 @@ struct RecordEvent {
const DeviceContext* dev_ctx_; const DeviceContext* dev_ctx_;
// Event name // Event name
std::string name_; std::string name_;
// Need to distinguish name by op type, block_id, program_id and perhaps
// different kernel invocations within an op.
std::string full_name_;
}; };
// Return the event list of all threads. Asummed the returned value calls // Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std::vector<std::vector<Event>> GetAllEvents(); std::vector<std::vector<Event>> GetAllEvents();
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
package paddle.platform.proto;
message Event {
optional string name = 1;
optional uint64 start_ns = 2;
optional uint64 end_ns = 3;
optional uint32 device_id = 5;
optional uint32 stream_id = 6;
}
message Profile {
repeated Event events = 1;
optional uint64 start_ns = 2;
optional uint64 end_ns = 3;
}
\ No newline at end of file
...@@ -95,7 +95,7 @@ TEST(RecordEvent, RecordEvent) { ...@@ -95,7 +95,7 @@ TEST(RecordEvent, RecordEvent) {
*/ */
for (int i = 1; i < 5; ++i) { for (int i = 1; i < 5; ++i) {
std::string name = "evs_op_" + std::to_string(i); std::string name = "evs_op_" + std::to_string(i);
RecordEvent record_event(name, dev_ctx); RecordEvent record_event(name, dev_ctx, 0);
int counter = 1; int counter = 1;
while (counter != i * 1000) counter++; while (counter != i * 1000) counter++;
} }
......
...@@ -459,6 +459,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -459,6 +459,7 @@ All parameter, weight, gradient are variables in Paddle.
.value("kDisabled", platform::ProfilerState::kDisabled) .value("kDisabled", platform::ProfilerState::kDisabled)
.value("kCPU", platform::ProfilerState::kCPU) .value("kCPU", platform::ProfilerState::kCPU)
.value("kCUDA", platform::ProfilerState::kCUDA) .value("kCUDA", platform::ProfilerState::kCUDA)
.value("kAll", platform::ProfilerState::kAll)
.export_values(); .export_values();
py::enum_<platform::EventSortingKey>(m, "EventSortingKey", py::arithmetic()) py::enum_<platform::EventSortingKey>(m, "EventSortingKey", py::arithmetic())
......
...@@ -97,9 +97,14 @@ def profiler(state, sorted_key=None): ...@@ -97,9 +97,14 @@ def profiler(state, sorted_key=None):
The `ave` means sorting by the average execution time. The `ave` means sorting by the average execution time.
""" """
if state not in ['CPU', 'GPU']: if state not in ['CPU', 'GPU', "All"]:
raise ValueError("The state must be 'CPU' or 'GPU'.") raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
prof_state = core.ProfilerState.kCUDA if state == "GPU" else core.ProfilerState.kCPU if state == "GPU":
prof_state = core.ProfilerState.kCUDA
elif state == "CPU":
prof_state = core.ProfilerState.kCPU
else:
prof_state = core.ProfilerState.kAll
core.enable_profiler(prof_state) core.enable_profiler(prof_state)
yield yield
......
...@@ -41,6 +41,7 @@ list(REMOVE_ITEM TEST_OPS test_while_op) ...@@ -41,6 +41,7 @@ list(REMOVE_ITEM TEST_OPS test_while_op)
list(REMOVE_ITEM TEST_OPS test_lod_array_length_op) list(REMOVE_ITEM TEST_OPS test_lod_array_length_op)
list(REMOVE_ITEM TEST_OPS test_reorder_lod_tensor) list(REMOVE_ITEM TEST_OPS test_reorder_lod_tensor)
list(REMOVE_ITEM TEST_OPS test_profiler) list(REMOVE_ITEM TEST_OPS test_profiler)
list(REMOVE_ITEM TEST_OPS test_nvprof)
list(REMOVE_ITEM TEST_OPS test_normalization_wrapper) list(REMOVE_ITEM TEST_OPS test_normalization_wrapper)
list(REMOVE_ITEM TEST_OPS test_executor_and_mul) list(REMOVE_ITEM TEST_OPS test_executor_and_mul)
list(REMOVE_ITEM TEST_OPS test_assign_value_op) list(REMOVE_ITEM TEST_OPS test_assign_value_op)
...@@ -75,6 +76,7 @@ py_test_modules(test_while_op MODULES test_while_op) ...@@ -75,6 +76,7 @@ py_test_modules(test_while_op MODULES test_while_op)
py_test_modules(test_lod_array_length_op MODULES test_lod_array_length_op) py_test_modules(test_lod_array_length_op MODULES test_lod_array_length_op)
py_test_modules(test_reorder_lod_tensor MODULES test_reorder_lod_tensor) py_test_modules(test_reorder_lod_tensor MODULES test_reorder_lod_tensor)
py_test_modules(test_profiler MODULES test_profiler) py_test_modules(test_profiler MODULES test_profiler)
py_test_modules(test_nvprof MODULES test_nvprof)
py_test_modules(test_normalization_wrapper MODULES test_normalization_wrapper) py_test_modules(test_normalization_wrapper MODULES test_normalization_wrapper)
py_test_modules(test_executor_and_mul MODULES test_executor_and_mul) py_test_modules(test_executor_and_mul MODULES test_executor_and_mul)
py_test_modules(test_assign_value_op MODULES test_assign_value_op) py_test_modules(test_assign_value_op MODULES test_assign_value_op)
......
...@@ -22,27 +22,9 @@ import paddle.fluid.core as core ...@@ -22,27 +22,9 @@ import paddle.fluid.core as core
class TestProfiler(unittest.TestCase): class TestProfiler(unittest.TestCase):
def test_nvprof(self):
if not fluid.core.is_compiled_with_cuda():
return
epoc = 8
dshape = [4, 3, 28, 28]
data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
output_file = 'cuda_profiler.txt'
with profiler.cuda_profiler(output_file, 'csv') as nvprof:
for i in range(epoc):
input = np.random.random(dshape).astype('float32')
exe.run(fluid.default_main_program(), feed={'data': input})
os.remove(output_file)
def net_profiler(self, state): def net_profiler(self, state):
if state == 'GPU' and not core.is_compiled_with_cuda(): enable_if_gpu = state == 'GPU' or state == "All"
if enable_if_gpu and not core.is_compiled_with_cuda():
return return
startup_program = fluid.Program() startup_program = fluid.Program()
main_program = fluid.Program() main_program = fluid.Program()
...@@ -85,6 +67,9 @@ class TestProfiler(unittest.TestCase): ...@@ -85,6 +67,9 @@ class TestProfiler(unittest.TestCase):
def test_cuda_profiler(self): def test_cuda_profiler(self):
self.net_profiler('GPU') self.net_profiler('GPU')
def test_all_profiler(self):
self.net_profiler('All')
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import os
import numpy as np
import paddle.v2.fluid as fluid
import paddle.v2.fluid.profiler as profiler
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.core as core
class TestNVProf(unittest.TestCase):
def test_nvprof(self):
if not fluid.core.is_compiled_with_cuda():
return
epoc = 8
dshape = [4, 3, 28, 28]
data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
output_file = 'cuda_profiler.txt'
with profiler.cuda_profiler(output_file, 'csv') as nvprof:
for i in range(epoc):
input = np.random.random(dshape).astype('float32')
exe.run(fluid.default_main_program(), feed={'data': input})
os.remove(output_file)
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册