task_node.cc 5.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/distributed/fleet_executor/task_node.h"
16

17
#include "paddle/fluid/framework/op_desc.h"
18
#include "paddle/fluid/framework/op_registry.h"
19 20 21 22 23 24 25 26
#include "paddle/fluid/framework/operator.h"

namespace paddle {
namespace distributed {
namespace {
// File-local alias for the operator type that TaskNode exposes as
// TaskNode::OperatorBase.
using OperatorBase = TaskNode::OperatorBase;
}

27 28 29 30
// Builds a task node bound to `program` for the given `rank`, recording the
// supplied `max_run_times` and `max_slot_nums`.
//
// The task id comes from a function-local counter that starts at 0 and grows
// by one on every call of this constructor. The counter is a plain static
// without any synchronization, so this constructor should be invoked
// serially; it is not thread-safe.
//
// NOTE: no operators are created here. The program handed in may still be
// updated afterwards (with high probability) by adding_feed_fetch_ops or by
// RuntimeGraph, so that part of the setup is postponed to Init().
TaskNode::TaskNode(paddle::framework::ProgramDesc* program,
                   int64_t rank,
                   int64_t max_run_times,
                   int64_t max_slot_nums)
    : program_(program),
      rank_(rank),
      max_run_times_(max_run_times),
      max_slot_nums_(max_slot_nums) {
  static int64_t next_task_id = 0;
  task_id_ = next_task_id;
  ++next_task_id;
}

44 45 46 47 48 49 50 51 52 53
// Builds a task node for DistModelInf (per the log message below).
//
// The node's task id is set to `rank`, and both max_run_times_ and
// max_slot_nums_ are fixed to 1. An INFO line is logged on every
// construction.
TaskNode::TaskNode(paddle::framework::ProgramDesc* program, int64_t rank)
    : program_(program), rank_(rank), task_id_(rank) {
  max_slot_nums_ = 1;
  max_run_times_ = 1;
  LOG(INFO) << "Constructing TaskNode for DistModelInf. The TaskNode's id is: "
            << rank
            << ". And the TaskNode's max_run_time and max_slot_num will be "
               "set to 1.";
}

54 55 56 57
// Replaces the program pointer held by this node. Only the pointer is stored;
// operators that were already created are not touched by this call.
void TaskNode::SetProgram(paddle::framework::ProgramDesc* program) {
  this->program_ = program;
}

58 59 60 61
void TaskNode::Init(bool use_feed_fetch_ops) {
  if (!use_feed_fetch_ops) {
    VLOG(3) << "TaskNode will be inited without feed and fetch ops";
  }
62 63 64 65
  if (ops_.empty()) {
    // Q (for fleet executor dev): should we need another reset funct?
    VLOG(3) << "Task node will be inited by calling Init().";
    for (const auto& op_desc : program_->Block(0).AllOps()) {
66 67 68 69 70 71
      if (!use_feed_fetch_ops &&
          (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) {
        VLOG(3) << "TaskNode will skip [" << op_desc->Input("X")[0] << "], "
                << op_desc->Type() << " -> " << op_desc->Output("Out")[0];
        continue;
      }
72 73 74 75 76
      ops_vec_.emplace_back(framework::OpRegistry::CreateOp(*op_desc));
    }
    for (const auto& op : ops_vec_) {
      ops_.emplace_back(op.get());
    }
77
  }
L
LiYuRio 已提交
78 79
}

80 81 82
// Builds a task node from an explicit rank, task id and max run times.
// Only these three members are set here; every other member keeps whatever
// the class declaration gives it (not visible in this file).
TaskNode::TaskNode(int64_t rank, int64_t task_id, int64_t max_run_times)
    : rank_(rank),
      task_id_(task_id),
      max_run_times_(max_run_times) {
}

83 84
// Builds a task node from a list of operator descriptions.
//
// One operator is created through OpRegistry::CreateOp for every entry of
// `op_descs` and stored in ops_vec_; ops_ is then filled with the raw
// pointers of everything held in ops_vec_. An empty `op_descs` leaves both
// containers untouched.
TaskNode::TaskNode(int32_t role,
                   const std::vector<framework::OpDesc*>& op_descs,
                   int64_t rank,
                   int64_t task_id,
                   int64_t max_run_times,
                   int64_t max_slot_nums)
    : role_(role),
      rank_(rank),
      task_id_(task_id),
      max_run_times_(max_run_times),
      max_slot_nums_(max_slot_nums) {
  if (!op_descs.empty()) {
    VLOG(3) << "Task node will be inited by providing list of ops.";
    for (auto desc_it = op_descs.begin(); desc_it != op_descs.end();
         ++desc_it) {
      ops_vec_.emplace_back(framework::OpRegistry::CreateOp(**desc_it));
    }
    for (auto op_it = ops_vec_.begin(); op_it != ops_vec_.end(); ++op_it) {
      ops_.emplace_back(op_it->get());
    }
  }
}

// Builds a task node from operators that already exist.
//
// The pointer list `ops` is copied into ops_; ops_vec_ is not touched, so
// this constructor does not create any operator itself.
// NOTE(review): the pointed-to operators are presumably kept alive by the
// caller -- confirm against the call sites.
TaskNode::TaskNode(int32_t role,
                   const std::vector<framework::OperatorBase*>& ops,
                   int64_t rank,
                   int64_t task_id,
                   int64_t max_run_times,
                   int64_t max_slot_nums)
    : ops_(ops),
      role_(role),
      rank_(rank),
      task_id_(task_id),
      max_run_times_(max_run_times),
      max_slot_nums_(max_slot_nums) {
}
118

119 120 121 122 123
// Builds a task node without any operators: only the role, rank, task id,
// max run times and max slot nums are recorded.
TaskNode::TaskNode(int32_t role,
                   int64_t rank,
                   int64_t task_id,
                   int64_t max_run_times,
                   int64_t max_slot_nums)
    : role_(role),
      rank_(rank),
      task_id_(task_id),
      max_run_times_(max_run_times),
      max_slot_nums_(max_slot_nums) {
}
129

130 131 132
// Registers `task_id` with `buff_size` in upstream_ via emplace() and returns
// the `second` member of the emplace result. For the standard associative
// containers that flag is true when a new entry was inserted and false when
// the key was already present (in which case the stored value is unchanged).
bool TaskNode::AddUpstreamTask(int64_t task_id, int64_t buff_size) {
  return upstream_.emplace(task_id, buff_size).second;
}
134

135 136 137
// Registers `task_id` with `buff_size` in downstream_ via emplace() and
// returns the `second` member of the emplace result. For the standard
// associative containers that flag is true when a new entry was inserted and
// false when the key was already present (the stored value is then unchanged).
bool TaskNode::AddDownstreamTask(int64_t task_id, int64_t buff_size) {
  return downstream_.emplace(task_id, buff_size).second;
}
139 140 141 142 143 144 145 146 147 148

std::string TaskNode::DebugString() const {
  std::ostringstream os;
  os << "role: " << role_ << ", task_id: " << task_id_ << "\n";
  for (std::size_t i = 0; i < ops_.size(); ++i) {
    os << ops_[i]->Type() << " ";
  }
  os << "\n";
  return os.str();
}
149 150

void TaskNode::SetRunPerSteps(int64_t value) {
151 152
  PADDLE_ENFORCE_GE(value,
                    1,
153 154 155 156 157 158
                    platform::errors::InvalidArgument(
                        "run_per_steps must >= 1, but received %ld", value));
  run_per_steps_ = value;
}

void TaskNode::SetRunAtOffset(int64_t value) {
159 160
  PADDLE_ENFORCE_GE(value,
                    0,
161 162 163 164 165 166 167
                    platform::errors::InvalidArgument(
                        "run_at_offset must >= 0, but received %ld", value));
  run_at_offset_ = value;
}

void TaskNode::SetReplyUpPerSteps(int64_t value) {
  PADDLE_ENFORCE_GE(
168 169
      value,
      1,
170 171
      platform::errors::InvalidArgument(
          "reply_up_per_steps must >= 1, but received %ld", value));
172 173 174 175 176
  reply_up_per_steps_ = value;
}

void TaskNode::SetSendDownPerSteps(int64_t value) {
  PADDLE_ENFORCE_GE(
177 178
      value,
      1,
179 180
      platform::errors::InvalidArgument(
          "send_down_per_steps must >= 1, but received %ld", value));
181 182 183
  send_down_per_steps_ = value;
}

184 185
}  // namespace distributed
}  // namespace paddle