// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/distributed/fleet_executor/task_node.h"

#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"

namespace paddle {
namespace distributed {
namespace {
// File-local shorthand for the operator type that TaskNode exposes as
// TaskNode::OperatorBase.
using OperatorBase = TaskNode::OperatorBase;
}  // namespace

// Creates a task node bound to `program`.
//
// Stores the program pointer, rank, task id and max run times as given, then
// calls Init() with its default argument.
TaskNode::TaskNode(paddle::framework::ProgramDesc* program,
                   int64_t rank,
                   int64_t task_id,
                   int64_t max_run_times)
    : program_(program),
      rank_(rank),
      task_id_(task_id),
      max_run_times_(max_run_times) {
  // TODO(liyurui): Will be removed when execute program is supported.
  Init();
}

// Creates the task node used for DistModelInf (per the log message below).
//
// The rank doubles as the task id and max_run_times_ is fixed to 1. Unlike
// the four-argument program constructor, Init() is not called here.
TaskNode::TaskNode(paddle::framework::ProgramDesc* program, int64_t rank)
    : program_(program), rank_(rank), task_id_(rank), max_run_times_(1) {
  LOG(INFO)
      << "Constructing TaskNode for DistModelInf. The TaskNode's id is: "
      << rank
      << ". And the TaskNode's max_run_time and max_slot_num will be set to 1.";
}

// Replaces the program pointer held by this task node. Only the pointer is
// stored; operators that were already created are not rebuilt here.
void TaskNode::SetProgram(paddle::framework::ProgramDesc* program) {
  program_ = program;
}

// Stores a copy of `vars_to_dtype` (a string-to-string map), replacing any
// previously stored mapping.
// NOTE(review): keys are presumably variable names and values dtype names,
// going by the parameter name -- confirm against the callers.
void TaskNode::SetVarsToDtype(
    const std::map<std::string, std::string>& vars_to_dtype) {
  vars_to_dtype_ = vars_to_dtype;
}

// Stores a copy of `vars_to_shape` (a map from string to a vector of int64_t),
// replacing any previously stored mapping.
// NOTE(review): keys are presumably variable names and values their shapes,
// going by the parameter name -- confirm against the callers.
void TaskNode::SetVarsToShape(
    const std::map<std::string, std::vector<int64_t>>& vars_to_shape) {
  vars_to_shape_ = vars_to_shape;
}

void TaskNode::Init(bool use_feed_fetch_ops) {
  if (!use_feed_fetch_ops) {
    VLOG(3) << "TaskNode will be inited without feed and fetch ops";
  }
66 67 68 69
  if (ops_.empty()) {
    // Q (for fleet executor dev): should we need another reset funct?
    VLOG(3) << "Task node will be inited by calling Init().";
    for (const auto& op_desc : program_->Block(0).AllOps()) {
70 71 72 73 74 75
      if (!use_feed_fetch_ops &&
          (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) {
        VLOG(3) << "TaskNode will skip [" << op_desc->Input("X")[0] << "], "
                << op_desc->Type() << " -> " << op_desc->Output("Out")[0];
        continue;
      }
76 77 78 79 80
      ops_vec_.emplace_back(framework::OpRegistry::CreateOp(*op_desc));
    }
    for (const auto& op : ops_vec_) {
      ops_.emplace_back(op.get());
    }
81
  }
L
LiYuRio 已提交
82 83
}

// Creates a task node from rank, task id and max run times only. No program,
// role or operators are set by this constructor.
TaskNode::TaskNode(int64_t rank, int64_t task_id, int64_t max_run_times)
    : rank_(rank), task_id_(task_id), max_run_times_(max_run_times) {}

// Creates a task node from a list of op descs.
//
// For a non-empty `op_descs`, one operator is created per desc via
// OpRegistry::CreateOp and appended to ops_vec_; ops_ then receives the raw
// pointers obtained from ops_vec_. An empty list leaves both containers
// untouched and logs nothing.
TaskNode::TaskNode(int32_t role,
                   const std::vector<framework::OpDesc*>& op_descs,
                   int64_t rank,
                   int64_t task_id,
                   int64_t max_run_times)
    : role_(role),
      rank_(rank),
      task_id_(task_id),
      max_run_times_(max_run_times) {
  if (!op_descs.empty()) {
    VLOG(3) << "Task node will be inited by providing list of ops.";
    for (const auto& op_desc : op_descs) {
      ops_vec_.emplace_back(framework::OpRegistry::CreateOp(*op_desc));
    }
    for (const auto& created_op : ops_vec_) {
      ops_.emplace_back(created_op.get());
    }
  }
}

// Creates a task node around operators that already exist.
//
// The vector of raw operator pointers is copied into ops_; ops_vec_ is not
// touched by this constructor.
// NOTE(review): presumably the caller keeps the operators alive for the
// lifetime of this node -- confirm.
TaskNode::TaskNode(int32_t role,
                   const std::vector<framework::OperatorBase*>& ops,
                   int64_t rank,
                   int64_t task_id,
                   int64_t max_run_times)
    : ops_(ops),
      role_(role),
      rank_(rank),
      task_id_(task_id),
      max_run_times_(max_run_times) {}
// Creates a task node that carries only its role, rank, task id and max run
// times. No program and no operators are set by this constructor.
TaskNode::TaskNode(int32_t role,
                   int64_t rank,
                   int64_t task_id,
                   int64_t max_run_times)
    : role_(role),
      rank_(rank),
      task_id_(task_id),
      max_run_times_(max_run_times) {}
bool TaskNode::AddUpstreamTask(int64_t task_id,
                               int64_t buff_size,
                               DependType type) {
131
  const auto& ret = upstream_.emplace(task_id, buff_size);
132
  id_to_dep_type_.emplace(task_id, type);
133
  return ret.second;
L
LiYuRio 已提交
134
}
bool TaskNode::AddDownstreamTask(int64_t task_id,
                                 int64_t buff_size,
                                 DependType type) {
139
  const auto& ret = downstream_.emplace(task_id, buff_size);
140
  id_to_dep_type_.emplace(task_id, type);
141
  return ret.second;
142
}

std::string TaskNode::DebugString() const {
  std::ostringstream os;
  os << "role: " << role_ << ", task_id: " << task_id_ << "\n";
  for (std::size_t i = 0; i < ops_.size(); ++i) {
    os << ops_[i]->Type() << " ";
  }
  os << "\n";
  return os.str();
}

void TaskNode::SetRunPerSteps(int64_t value) {
155 156
  PADDLE_ENFORCE_GE(value,
                    1,
157 158 159 160 161 162
                    platform::errors::InvalidArgument(
                        "run_per_steps must >= 1, but received %ld", value));
  run_per_steps_ = value;
}

void TaskNode::SetRunAtOffset(int64_t value) {
163 164
  PADDLE_ENFORCE_GE(value,
                    0,
165 166 167 168 169 170 171
                    platform::errors::InvalidArgument(
                        "run_at_offset must >= 0, but received %ld", value));
  run_at_offset_ = value;
}

void TaskNode::SetReplyUpPerSteps(int64_t value) {
  PADDLE_ENFORCE_GE(
172 173
      value,
      1,
174 175
      platform::errors::InvalidArgument(
          "reply_up_per_steps must >= 1, but received %ld", value));
176 177 178 179 180
  reply_up_per_steps_ = value;
}

void TaskNode::SetSendDownPerSteps(int64_t value) {
  PADDLE_ENFORCE_GE(
181 182
      value,
      1,
183 184
      platform::errors::InvalidArgument(
          "send_down_per_steps must >= 1, but received %ld", value));
185 186 187
  send_down_per_steps_ = value;
}

}  // namespace distributed
}  // namespace paddle