task_node.cc 6.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/distributed/fleet_executor/task_node.h"
16

17
#include "paddle/fluid/framework/op_desc.h"
18
#include "paddle/fluid/framework/op_registry.h"
19 20 21 22 23 24 25 26
#include "paddle/fluid/framework/operator.h"

namespace paddle {
namespace distributed {
// File-local shorthand for the operator type exposed by TaskNode.
// NOTE(review): the alias is not referenced by name in the visible part of
// this file (the constructors spell out framework::OperatorBase) - confirm
// whether it is still needed.
namespace {
using OperatorBase = TaskNode::OperatorBase;
}

27 28 29 30
// Builds a task node around `program` and assigns it an automatically
// generated task id.
//
// The operators are NOT created here: the program may still be modified
// after construction (for example when feed/fetch ops are added or by
// RuntimeGraph), so operator creation is deferred to Init().
//
// The id comes from a function-local counter that is read and incremented
// without any synchronisation, so instances must be constructed serially.
TaskNode::TaskNode(paddle::framework::ProgramDesc* program,
                   int64_t rank,
                   int64_t max_run_times,
                   int64_t max_slot_nums)
    : program_(program),
      rank_(rank),
      max_run_times_(max_run_times),
      max_slot_nums_(max_slot_nums) {
  // Not thread-safe: plain static counter shared by every call.
  static int64_t next_task_id = 0;
  task_id_ = next_task_id;
  ++next_task_id;
}

44 45 46 47 48 49 50 51 52 53 54 55 56 57
// Builds a task node around `program` with a caller-supplied `task_id`.
//
// Unlike the overload that generates the id itself, this constructor calls
// Init() right away (with Init's default argument), so `program` is read
// during construction.
TaskNode::TaskNode(paddle::framework::ProgramDesc* program,
                   int64_t rank,
                   int64_t task_id,
                   int64_t max_run_times,
                   int64_t max_slot_nums)
    : program_(program),
      rank_(rank),
      task_id_(task_id),
      max_run_times_(max_run_times),
      max_slot_nums_(max_slot_nums) {
  // TODO(liyurui): Will be removed when execute program is supported.
  Init();
}

58 59 60 61 62 63 64 65 66 67
// Builds the task node used for DistModelInf (per the log message below).
//
// The rank doubles as the task id, and both max_run_times_ and
// max_slot_nums_ are fixed to 1. Init() is not called here.
TaskNode::TaskNode(paddle::framework::ProgramDesc* program, int64_t rank)
    : program_(program),
      rank_(rank),
      task_id_(rank),
      max_run_times_(1),
      max_slot_nums_(1) {
  LOG(INFO)
      << "Constructing TaskNode for DistModelInf. The TaskNode's id is: "
      << rank
      << ". And the TaskNode's max_run_time and max_slot_num will be set to 1.";
}

68 69 70 71
// Replaces the program pointer held by this node. Only the pointer is
// stored; nothing is re-initialised here, and operators that were already
// created are left as they are.
void TaskNode::SetProgram(paddle::framework::ProgramDesc* program) {
  program_ = program;
}

72 73 74 75
void TaskNode::Init(bool use_feed_fetch_ops) {
  if (!use_feed_fetch_ops) {
    VLOG(3) << "TaskNode will be inited without feed and fetch ops";
  }
76 77 78 79
  if (ops_.empty()) {
    // Q (for fleet executor dev): should we need another reset funct?
    VLOG(3) << "Task node will be inited by calling Init().";
    for (const auto& op_desc : program_->Block(0).AllOps()) {
80 81 82 83 84 85
      if (!use_feed_fetch_ops &&
          (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) {
        VLOG(3) << "TaskNode will skip [" << op_desc->Input("X")[0] << "], "
                << op_desc->Type() << " -> " << op_desc->Output("Out")[0];
        continue;
      }
86 87 88 89 90
      ops_vec_.emplace_back(framework::OpRegistry::CreateOp(*op_desc));
    }
    for (const auto& op : ops_vec_) {
      ops_.emplace_back(op.get());
    }
91
  }
L
LiYuRio 已提交
92 93
}

94 95 96
// Builds a task node from a rank, a task id and a run-count limit only.
// No program and no operators are attached.
// NOTE(review): program_, role_ and max_slot_nums_ are not set by this
// constructor - confirm that the header gives them default initializers,
// otherwise a later Init() would read an unset program_.
TaskNode::TaskNode(int64_t rank, int64_t task_id, int64_t max_run_times)
    : rank_(rank), task_id_(task_id), max_run_times_(max_run_times) {}

97 98
// Builds a task node directly from a list of operator descriptions.
//
// Each OpDesc in `op_descs` is instantiated through OpRegistry::CreateOp;
// the resulting operator is stored in `ops_vec_` and its raw pointer is
// appended to `ops_`. An empty list leaves both containers empty.
// This constructor does not assign program_.
TaskNode::TaskNode(int32_t role,
                   const std::vector<framework::OpDesc*>& op_descs,
                   int64_t rank,
                   int64_t task_id,
                   int64_t max_run_times,
                   int64_t max_slot_nums)
    : role_(role),
      rank_(rank),
      task_id_(task_id),
      max_run_times_(max_run_times),
      max_slot_nums_(max_slot_nums) {
  if (!op_descs.empty()) {
    VLOG(3) << "Task node will be inited by providing list of ops.";
    for (const auto& op_desc : op_descs) {
      ops_vec_.emplace_back(framework::OpRegistry::CreateOp(*op_desc));
      // Mirror the operator that was just stored as a plain pointer.
      ops_.emplace_back(ops_vec_.back().get());
    }
  }
}

// Builds a task node from operators that already exist.
//
// The pointer list `ops` is copied into `ops_`. `ops_vec_` is not touched
// here, so nothing in this constructor takes ownership of the operators.
// This constructor does not assign program_.
TaskNode::TaskNode(int32_t role,
                   const std::vector<framework::OperatorBase*>& ops,
                   int64_t rank,
                   int64_t task_id,
                   int64_t max_run_times,
                   int64_t max_slot_nums)
    : ops_(ops),
      role_(role),
      rank_(rank),
      task_id_(task_id),
      max_run_times_(max_run_times),
      max_slot_nums_(max_slot_nums) {
}
132

133 134 135 136 137
// Builds a task node that carries only scheduling metadata (role, rank,
// task id, run-count limit and slot limit). No operators are created and
// program_ is not assigned here.
TaskNode::TaskNode(int32_t role,
                   int64_t rank,
                   int64_t task_id,
                   int64_t max_run_times,
                   int64_t max_slot_nums)
    : role_(role),
      rank_(rank),
      task_id_(task_id),
      max_run_times_(max_run_times),
      max_slot_nums_(max_slot_nums) {
}
143

144 145 146
// Records `task_id` together with `buff_size` in `upstream_`.
// Returns the `second` member of the emplace result, i.e. whether a new
// entry was actually inserted.
bool TaskNode::AddUpstreamTask(int64_t task_id, int64_t buff_size) {
  return upstream_.emplace(task_id, buff_size).second;
}
148

149 150 151
// Records `task_id` together with `buff_size` in `downstream_`.
// Returns the `second` member of the emplace result, i.e. whether a new
// entry was actually inserted.
bool TaskNode::AddDownstreamTask(int64_t task_id, int64_t buff_size) {
  return downstream_.emplace(task_id, buff_size).second;
}
153 154 155 156 157 158 159 160 161 162

std::string TaskNode::DebugString() const {
  std::ostringstream os;
  os << "role: " << role_ << ", task_id: " << task_id_ << "\n";
  for (std::size_t i = 0; i < ops_.size(); ++i) {
    os << ops_[i]->Type() << " ";
  }
  os << "\n";
  return os.str();
}
163 164

void TaskNode::SetRunPerSteps(int64_t value) {
165 166
  PADDLE_ENFORCE_GE(value,
                    1,
167 168 169 170 171 172
                    platform::errors::InvalidArgument(
                        "run_per_steps must >= 1, but received %ld", value));
  run_per_steps_ = value;
}

void TaskNode::SetRunAtOffset(int64_t value) {
173 174
  PADDLE_ENFORCE_GE(value,
                    0,
175 176 177 178 179 180 181
                    platform::errors::InvalidArgument(
                        "run_at_offset must >= 0, but received %ld", value));
  run_at_offset_ = value;
}

void TaskNode::SetReplyUpPerSteps(int64_t value) {
  PADDLE_ENFORCE_GE(
182 183
      value,
      1,
184 185
      platform::errors::InvalidArgument(
          "reply_up_per_steps must >= 1, but received %ld", value));
186 187 188 189 190
  reply_up_per_steps_ = value;
}

void TaskNode::SetSendDownPerSteps(int64_t value) {
  PADDLE_ENFORCE_GE(
191 192
      value,
      1,
193 194
      platform::errors::InvalidArgument(
          "send_down_per_steps must >= 1, but received %ld", value));
195 196 197
  send_down_per_steps_ = value;
}

198 199
}  // namespace distributed
}  // namespace paddle