Unverified commit 5c79dbb2, authored by Yuang Liu, committed by GitHub

Marker op for profiling (#33034)

Parent c711e913
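
For orientation, a minimal usage sketch (not part of this commit): a pass could append the new marker op at the begin/end of a stage, so that the kernels defined below record a "marker_<role>_<pos>" profiler event. The attribute names mirror the unit test at the bottom of this diff; the Program/append_op plumbing is illustrative.

# Sketch only (not part of this commit): insert marker ops that tag the begin
# and end of the forward stage; each kernel records a "marker_<role>_<pos>"
# profiler event. The Program/append_op plumbing here is illustrative.
import paddle.fluid as fluid

main_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    block = main_prog.global_block()
    for pos in ('B', 'E'):  # 'B' marks the begin, 'E' the end of the stage
        block.append_op(
            type='marker',
            attrs={'marker_role': 'forward',
                   'marker_pos': pos})
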
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
class MarkerOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    std::string marker_role = ctx->Attrs().Get<std::string>("marker_role");
    std::string marker_pos = ctx->Attrs().Get<std::string>("marker_pos");
    VLOG(3) << "The role is:" << marker_role << ";"
            << "The position is:" << marker_pos << ".";
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(framework::proto::VarType::FP32,
                                   ctx.GetPlace());
  }
};
class MarkerOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() {
    AddAttr<std::string>("marker_role",
                         "(string, default forward) forward or backward, "
                         "marks different stages of the process.")
        .SetDefault("forward");
    AddAttr<std::string>(
        "marker_pos",
        "(string, default B) the position where the marker is placed, "
        "B stands for begin of duration, "
        "E stands for end of duration.")
        .SetDefault("B");
    AddComment(
        R"DOC(Marker Operator - Add a marker at the beginning/end of a forward/backward process.)DOC");
  }
};
template <typename T>
class MarkerOpCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto marker_role = ctx.Attr<std::string>("marker_role");
    auto marker_pos = ctx.Attr<std::string>("marker_pos");
    // The CPU kernel performs no computation; it only opens a scoped
    // profiler event whose attr encodes the marker's role and position.
    platform::RecordEvent record_event(
        "MarkerCPU", platform::EventRole::kInnerOp,
        "marker_" + marker_role + "_" + marker_pos);
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(marker, ops::MarkerOp, ops::MarkerOpMaker);
REGISTER_OP_CPU_KERNEL(marker, ops::MarkerOpCPUKernel<float>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void SimpleMarkerKernel(T* in, T* out, int ndim) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  for (; idx < ndim; idx += blockDim.x * gridDim.x) {
    out[idx] = in[idx];
  }
}

template <typename T>
class MarkerOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();

    auto marker_role = ctx.Attr<std::string>("marker_role");
    auto marker_pos = ctx.Attr<std::string>("marker_pos");
    VLOG(3) << "marker role: " << marker_role
            << " marker position: " << marker_pos;

    // Scratch tensors for a tiny 32-element copy launched on the compute
    // stream; the op itself has no inputs or outputs.
    framework::Tensor A;
    framework::Tensor B;
    auto* in_temp = A.mutable_data<T>({32, 1}, ctx.GetPlace());
    auto* out_temp = B.mutable_data<T>({32, 1}, ctx.GetPlace());

    platform::RecordEvent record_event(
        "MarkerCUDA", platform::EventRole::kInnerOp,
        "marker_" + marker_role + "_" + marker_pos);
    SimpleMarkerKernel<T><<<1, 32, 0, dev_ctx.stream()>>>(in_temp, out_temp,
                                                          32);
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(marker, ops::MarkerOpCUDAKernel<float>);
......@@ -511,7 +511,7 @@ class DeviceTracerImpl : public DeviceTracer {
       auto c = correlations_.find(r.correlation_id);
       if (c != correlations_.end() && c->second != nullptr) {
         event->set_name(c->second->name());
-        event->set_detail_info(r.name);
+        event->set_detail_info(c->second->attr());
         find++;
       } else {
         VLOG(10) << "Missing Kernel Event: " + r.name;
......
......@@ -40,7 +40,7 @@ class Event {
   // The DeviceContext is used to get the cuda stream.
   // If CPU profiling mode, can pass nullptr.
   Event(EventType type, std::string name, uint32_t thread_id,
-        EventRole role = EventRole::kOrdinary);
+        EventRole role = EventRole::kOrdinary, std::string attr = "none");
 
   const EventType& type() const;
   Event* parent() const { return parent_; }
......@@ -50,7 +50,7 @@ class Event {
   uint32_t thread_id() const { return thread_id_; }
   void set_name(std::string name) { name_ = name; }
   void set_role(EventRole role) { role_ = role; }
+  std::string attr() const { return attr_; }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #ifndef PADDLE_WITH_CUPTI
   gpuEvent_t event() const { return event_; }
......@@ -69,6 +69,7 @@ class Event {
   EventRole role_{};
   int64_t cpu_ns_;
   bool visited_status_{false};
+  std::string attr_;
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #ifdef PADDLE_WITH_CUPTI
   int64_t gpu_ns_ = 0;
......
......@@ -32,8 +32,12 @@ namespace platform {
 MemEvenRecorder MemEvenRecorder::recorder;
 
 Event::Event(EventType type, std::string name, uint32_t thread_id,
-             EventRole role)
-    : type_(type), name_(name), thread_id_(thread_id), role_(role) {
+             EventRole role, std::string attr)
+    : type_(type),
+      name_(name),
+      thread_id_(thread_id),
+      role_(role),
+      attr_(attr) {
   cpu_ns_ = GetTimeInNsec();
 }
......@@ -52,7 +56,8 @@ double Event::CudaElapsedMs(const Event &e) const {
 #endif
 }
 
-RecordEvent::RecordEvent(const std::string &name, const EventRole role) {
+RecordEvent::RecordEvent(const std::string &name, const EventRole role,
+                         const std::string attr) {
 #ifndef _WIN32
 #ifdef PADDLE_WITH_CUDA
   if (g_enable_nvprof_hook) {
......@@ -69,7 +74,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) {
   is_enabled_ = true;
   // lock is not needed, the code below is thread-safe
   // Maybe need the same push/pop behavior.
-  Event *e = PushEvent(name, role);
+  Event *e = PushEvent(name, role, attr);
   SetCurAnnotation(e);
   name_ = e->name();
 }
......@@ -186,12 +191,14 @@ void Mark(const std::string &name) {
   GetEventList().Record(EventType::kMark, name, g_thread_id);
 }
 
-Event *PushEvent(const std::string &name, const EventRole role) {
-  return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role);
+Event *PushEvent(const std::string &name, const EventRole role,
+                 std::string attr) {
+  return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role,
+                               attr);
 }
 
-void PopEvent(const std::string &name, const EventRole role) {
-  GetEventList().Record(EventType::kPopRange, name, g_thread_id, role);
+void PopEvent(const std::string &name, const EventRole role, std::string attr) {
+  GetEventList().Record(EventType::kPopRange, name, g_thread_id, role, attr);
 }
 
 void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled,
......
......@@ -126,7 +126,8 @@ struct MemEvenRecorder {
 struct RecordEvent {
   RecordEvent(const std::string& name,
-              const EventRole role = EventRole::kOrdinary);
+              const EventRole role = EventRole::kOrdinary,
+              const std::string attr = "none");
 
   ~RecordEvent();
......@@ -200,8 +201,10 @@ void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
                    const Place& place, const std::string& annotation);
 void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
                  const Place& place, const std::string& annotation);
-Event* PushEvent(const std::string& name, const EventRole role);
-void PopEvent(const std::string& name, const EventRole role);
+Event* PushEvent(const std::string& name, const EventRole role,
+                 const std::string attr = "none");
+void PopEvent(const std::string& name, const EventRole role,
+              const std::string attr = "none");
 
 // Return the event list of all threads. Assumed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
 std::vector<std::vector<Event>> GetAllEvents();
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
from paddle.distributed.fleet.meta_optimizers.common import OpRole
class TestMarkerOp(OpTest):
    def setUp(self):
        self.op_type = "marker"
        self.inputs = {}
        self.attrs = {
            'marker_role': 'forward',
            'marker_pos': 'B',
            'op_role': OpRole.Forward
        }
        self.outputs = {}

    def test_check_output(self):
        self.check_output()


if __name__ == "__main__":
    unittest.main()
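
Beyond the unit test, a hedged sketch of how the markers could be observed: run a program containing marker ops under the fluid profiler, and the "marker_<role>_<pos>" events recorded by the kernels should appear in the report. The profiler/executor calls below are illustrative and not part of this commit.

# Sketch only (not part of this commit): run a marker-carrying program under
# the fluid profiler so the "marker_forward_B" event recorded by the CPU
# kernel shows up in the report. Profiler state and sort key are illustrative.
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler

main_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    main_prog.global_block().append_op(
        type='marker', attrs={'marker_role': 'forward', 'marker_pos': 'B'})

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup_prog)
with profiler.profiler('CPU', sorted_key='total'):
    exe.run(main_prog)
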
......@@ -710,4 +710,5 @@ STATIC_MODE_TESTING_LIST = [
     'test_lamb_op_xpu',
     'test_model_cast_to_bf16',
     'test_sgd_op_bf16',
+    'test_marker_op',
 ]