未验证 提交 cb3f3f37 编写于 作者: J Jinhui Yuan 提交者: GitHub

Tune inception (#1014)

* let NormalMdUpdtTask use independent stream

* let kMix task share the same stream

* remove UseIndependentStream

* refine
上级 7ab4cee4
......@@ -121,13 +121,8 @@ void Actor::InitDeviceCtx(const ThreadCtx& thread_ctx) {
}
case DeviceType::kGPU: {
CudaStreamHandle* cuda_handle = nullptr;
if (GetLocalWorkStreamId() == 0) {
cuda_handle = thread_ctx.g_cuda_stream.get();
} else {
CHECK(Global<IDMgr>::Get()->IsIndependentLocalWorkStreamId(GetLocalWorkStreamId()));
cuda_handle_.reset(new CudaStreamHandle(thread_ctx.cb_event_chan));
cuda_handle = cuda_handle_.get();
}
CHECK_EQ(GetLocalWorkStreamId(), 0);
cuda_handle = thread_ctx.g_cuda_stream.get();
device_ctx_.reset(new CudaDeviceCtx(thread_ctx.buf_ptr, thread_ctx.buf_size, cuda_handle));
break;
}
......
......@@ -16,7 +16,6 @@ class CompTaskNode : public TaskNode {
// Reports the CUDA work type of this task node. This implementation returns
// CudaWorkType::kCompute; it is virtual so that derived nodes can report another type.
virtual CudaWorkType GetCudaWorkType() const { return CudaWorkType::kCompute; }
virtual void ToProto(TaskProto*) override;
// Returns true exactly when GetCudaWorkType() is CudaWorkType::kMix.
bool UseIndependentWorkStream() const override { return GetCudaWorkType() == CudaWorkType::kMix; }
// parallel_ctx_
// Returns the parallel id stored in parallel_ctx_.
int64_t parallel_id() const { return parallel_ctx_.parallel_id(); }
......
......@@ -19,6 +19,7 @@ class NormalMdUpdtCompTaskNode final : public CompTaskNode {
// Stores `val` in random_seed_.
void set_random_seed(uint32_t val) { random_seed_ = val; }
// Identifies this node as TaskType::kNormalMdUpdt.
TaskType GetTaskType() const override { return TaskType::kNormalMdUpdt; }
// Overrides the CUDA work type for this node to CudaWorkType::kMix.
CudaWorkType GetCudaWorkType() const override { return CudaWorkType::kMix; }
void ToProto(TaskProto*) override;
private:
......
......@@ -266,17 +266,7 @@ void TaskNode::FixRegisterNumRange() {
// Returns the local work stream id for this task node.
// Both machine_id_ and thrd_id_ must already be set: each is CHECKed against -1.
// NOTE(review): this text appears to come from a rendered diff without +/- markers, so the
// if/else below and the trailing `return 0;` may be the removed and the added versions shown
// together -- confirm against the repository before relying on either path.
int64_t TaskNode::AllocateLocalWorkStreamId() {
CHECK_NE(machine_id_, -1);
CHECK_NE(thrd_id_, -1);
if (UseIndependentWorkStream()) {
if (device_type() == DeviceType::kCPU) {
// A node that asks for an independent stream but runs on CPU still uses id 0.
return 0;
} else if (device_type() == DeviceType::kGPU) {
// On GPU the id is obtained from the global IDMgr for this (machine, thread) pair.
return Global<IDMgr>::Get()->AllocateLocalWorkStreamId(machine_id_, thrd_id_);
} else {
// Any other device type is not handled here.
UNIMPLEMENTED();
}
} else {
// Nodes that do not ask for an independent stream use id 0.
return 0;
}
// As written, this is reached only if control continues past UNIMPLEMENTED() above.
return 0;
}
void TaskNode::UpdateTaskId() {
......
......@@ -49,7 +49,6 @@ class TaskNode : public Node<TaskNode, TaskEdge> {
int64_t LocalWorkStreamId() const;
int64_t GlobalWorkStreamId() const;
// Looks up the GPU physical id for thrd_id_ through the global IDMgr.
int64_t GpuPhyId() const { return Global<IDMgr>::Get()->GetGpuPhyIdFromThrdId(thrd_id_); }
// Base implementation always returns false; it is virtual so derived nodes can override it.
virtual bool UseIndependentWorkStream() const { return false; }
// Setters
void set_machine_id(int64_t val);
......
......@@ -47,11 +47,9 @@ class IDMgr final {
// 0: the actor thread
// for gpu:
// 0: the global cuda stream
// other: start from 100
int64_t AllocateLocalWorkStreamId(int64_t machine_id, int64_t thrd_id);
int64_t LocalWorkStreamId4TaskId(int64_t task_id) const;
int64_t LocalWorkStreamId4ActorId(int64_t actor_id) const;
// Returns true when `local_wsid` is 100 or greater.
bool IsIndependentLocalWorkStreamId(int64_t local_wsid) const { return local_wsid >= 100; }
// global_work_stream_id
// sign | machine_id | thrd_id | local_work_stream_id | 0
// 1 | 10 | 11 | 21 | 21
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册