未验证 提交 e8fcb71b 编写于 作者: Q Qiao Longfei 提交者: GitHub

Merge pull request #12620 from jacquesqiao/timeline-support-pure-cpu

Timeline support pure cpu
......@@ -189,6 +189,8 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
}
} // namespace
#endif // PADDLE_WITH_CUPTI
class DeviceTracerImpl : public DeviceTracer {
public:
DeviceTracerImpl() : enabled_(false) {}
......@@ -244,6 +246,8 @@ class DeviceTracerImpl : public DeviceTracer {
if (enabled_) {
return;
}
#ifdef PADDLE_WITH_CUPTI
EnableActivity();
// Register callbacks for buffer requests and completed by CUPTI.
......@@ -262,6 +266,7 @@ class DeviceTracerImpl : public DeviceTracer {
dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
#endif // PADDLE_WITH_CUPTI
enabled_ = true;
}
......@@ -313,16 +318,21 @@ class DeviceTracerImpl : public DeviceTracer {
}
void Disable() {
#ifdef PADDLE_WITH_CUPTI
// flush might cause additional calls to DeviceTracker.
dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED);
#endif // PADDLE_WITH_CUPTI
std::lock_guard<std::mutex> l(trace_mu_);
#ifdef PADDLE_WITH_CUPTI
DisableActivity();
dynload::cuptiUnsubscribe(subscriber_);
CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
#endif // PADDLE_WITH_CUPTI
enabled_ = false;
}
private:
#ifdef PADDLE_WITH_CUPTI
static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid, const void *cbdata) {
auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
......@@ -340,7 +350,8 @@ class DeviceTracerImpl : public DeviceTracer {
VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid;
}
}
CUpti_SubscriberHandle subscriber_;
#endif // PADDLE_WITH_CUPTI
std::mutex trace_mu_;
bool enabled_;
uint64_t start_ns_;
......@@ -349,45 +360,9 @@ class DeviceTracerImpl : public DeviceTracer {
std::vector<MemRecord> mem_records_;
std::vector<CPURecord> cpu_records_;
std::unordered_map<uint32_t, std::string> correlations_;
CUpti_SubscriberHandle subscriber_;
};
#endif // PADDLE_WITH_CUPTI
class DeviceTracerDummy : public DeviceTracer {
public:
DeviceTracerDummy() {}
void AddAnnotation(uint64_t id, const std::string &anno) {}
void AddCPURecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, int64_t thread_id) {}
void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, int64_t stream_id,
uint32_t correlation_id, uint64_t bytes) {}
void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
int64_t stream_id, uint32_t correlation_id) {}
bool IsEnabled() { return false; }
void Enable() {}
proto::Profile GenProfile(const std::string &profile_path) {
return proto::Profile();
}
void Disable() {}
};
void CreateTracer(DeviceTracer **t) {
#ifdef PADDLE_WITH_CUPTI
*t = new DeviceTracerImpl();
#else
*t = new DeviceTracerDummy();
#endif // PADDLE_WITH_CUPTI
}
void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); }
DeviceTracer *GetDeviceTracer() {
std::call_once(tracer_once_flag, CreateTracer, &tracer);
......
......@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <sys/time.h>
#include <time.h>
#include <chrono> // NOLINT
#include <string>
#include "paddle/fluid/platform/dynload/cupti.h"
......@@ -25,6 +28,12 @@ namespace platform {
// WARN: Under Development. Don't depend on it yet.
//////////////////////
inline uint64_t PosixInNsec() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
}
// DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
// 2. Collect cuda statistics: start/end ts, memory, etc.
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <sys/time.h>
#include <time.h>
#include <algorithm>
#include <iomanip>
#include <limits>
......@@ -97,12 +96,6 @@ inline uint64_t GetTimeInNsec() {
.count();
}
inline uint64_t PosixInNsec() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
}
Event::Event(EventType type, std::string name, uint32_t thread_id,
const DeviceContext* dev_ctx)
: type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) {
......
......@@ -218,7 +218,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
"""The profiler interface.
Different from cuda_profiler, this profiler can be used to profile both CPU
and GPU program. By defalut, it records the CPU and GPU operator kernels,
and GPU program. By default, it records the CPU and GPU operator kernels,
if you want to profile other program, you can refer the profiling tutorial
to add more records in C++ code.
......@@ -231,7 +231,7 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
state (string) : The profiling state, which should be 'CPU' or 'GPU',
telling the profiler to use CPU timer or GPU timer for profiling.
Although users may have already specified the execution place
(CPUPlace/CUDAPlace) in the begining, for flexibility the profiler
(CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler
would not inherit this place.
sorted_key (string) : If None, the profiling results will be printed
in the order of first end time of events. Otherwise, the profiling
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册