rpc_server.cc 4.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <fstream>
#include <iostream>
#include <limits>
#include <string>

20
#include "paddle/fluid/operators/distributed/rpc_server.h"
21 22 23 24 25 26
#include "paddle/fluid/platform/profiler.h"

// Flag `rpc_server_profile_period` (int32, default 0): profiling period of
// listen_and_serv, per its help string.
// NOTE(review): presumably forwarded to RPCServerProfiler, whose OneStep()
// treats a non-positive period as "profiling disabled" -- confirm at the
// construction site, which is not in this file.
DEFINE_int32(rpc_server_profile_period, 0,
             "the period of listen_and_serv to do profile");
// Flag `rpc_server_profile_path` (string, default "/dev/null"): path of the
// profile log file, per its help string.
DEFINE_string(rpc_server_profile_path, "/dev/null",
              "the profile log file path");
27 28 29

namespace paddle {
namespace operators {
30
namespace distributed {
31

32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
RPCServerProfiler::RPCServerProfiler(int profile_period,
                                     const std::string& profile_log_path)
    : profile_period_(profile_period), profile_log_path_(profile_log_path) {
  step_ = 0;
}

// Advances the profiler by one step.
//
// When profile_period_ is non-positive this is a no-op. Otherwise the CPU
// profiler is enabled when step_ is 0, and once step_ reaches
// profile_period_ the profiler is disabled (results sorted by total time and
// written to profile_log_path_) and step_ wraps back to 0.
void RPCServerProfiler::OneStep() {
  // Check the "disabled" case first: with a negative period the invariant
  // below can never hold (step_ starts at 0), so enforcing it before this
  // guard would raise instead of simply skipping profiling.
  if (profile_period_ <= 0) {
    return;
  }

  PADDLE_ENFORCE_LE(step_, profile_period_,
                    "step_ should not be larger then "
                    "profile_period_");

  if (step_ == 0) {
    auto pf_state = paddle::platform::ProfilerState::kCPU;
    paddle::platform::EnableProfiler(pf_state);
  }
  if (step_ == profile_period_) {
    paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kTotal,
                                      profile_log_path_);
    step_ = 0;
  } else {
    step_++;
  }
}

59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
void RPCServer::ShutDown() {
  LOG(INFO) << "RPCServer ShutDown ";
  ShutDownImpl();

  exit_flag_ = true;
  barrier_cond_.notify_all();
  rpc_cond_.notify_all();
}

void RPCServer::SavePort() const {
  auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid());
  std::ofstream port_file;
  port_file.open(file_path);
  port_file << selected_port_;
  port_file.close();
  VLOG(4) << "selected port written to " << file_path;
}

void RPCServer::WaitBarrier(const std::string& rpc_name) {
  std::unique_lock<std::mutex> lock(this->mutex_);
W
Wu Yi 已提交
79
  barrier_cond_.wait(lock, [this, &rpc_name] {
Y
Yancey1989 已提交
80 81
    return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) ||
            exit_flag_.load());
82 83
  });

W
Wu Yi 已提交
84 85
  VLOG(3) << "batch_barrier_: " << rpc_name << " "
          << barrier_counter_[rpc_name];
86 87 88
}

void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
W
Wu Yi 已提交
89
  VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
90
  int b = 0;
W
Wu Yi 已提交
91 92
  std::unique_lock<std::mutex> lock(mutex_);
  b = ++barrier_counter_[rpc_name];
93
  if (b >= client_num_) {
W
Wu Yi 已提交
94
    lock.unlock();
95
    barrier_cond_.notify_all();
W
Wu Yi 已提交
96
    lock.lock();
97 98 99
  }
}

Y
Yancey1989 已提交
100
void RPCServer::Complete() {
Y
Yancey1989 已提交
101
  {
Y
Yancey1989 已提交
102
    std::unique_lock<std::mutex> lock(mutex_);
W
Wu Yi 已提交
103
    client_num_--;
Y
Yancey1989 已提交
104 105
    need_reset_all_vars_ = true;

Y
Yancey1989 已提交
106 107 108 109
    VLOG(4) << "decrease client_num to: " << client_num_;
    if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) {
      barrier_counter_[kRequestGet]--;
    }
W
Wu Yi 已提交
110 111 112 113
  }
  barrier_cond_.notify_all();
}

Y
Yancey1989 已提交
114 115 116 117 118
// Returns the current value of need_reset_all_vars_, read under mutex_.
bool RPCServer::NeedResetAllVars() {
  std::unique_lock<std::mutex> guard(mutex_);
  const bool reset_pending = need_reset_all_vars_;
  return reset_pending;
}

Y
Yancey1989 已提交
119 120 121 122 123
// Returns the current value of client_num_, read under mutex_.
int RPCServer::GetClientNum() {
  std::unique_lock<std::mutex> guard(mutex_);
  const int current_clients = client_num_;
  return current_clients;
}

124 125 126 127 128 129
void RPCServer::ResetBarrierCounter() {
  VLOG(3) << "RPCServer ResetBarrierCounter ";
  std::unique_lock<std::mutex> lock(mutex_);
  for (auto& t : barrier_counter_) {
    t.second = 0;
  }
Y
Yancey1989 已提交
130
  need_reset_all_vars_ = false;
131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
}

// Registers `handler` and `thread_num` for `rpc_name` and assigns the RPC the
// next condition id. Ids come from a function-local static counter, so the
// first registration gets 0 and each later call gets the next integer.
void RPCServer::RegisterRPC(const std::string& rpc_name,
                            RequestHandler* handler, int thread_num) {
  static int last_cond_id = -1;

  rpc_call_map_[rpc_name] = handler;
  rpc_thread_num_[rpc_name] = thread_num;

  const int assigned_cond = ++last_cond_id;
  rpc_cond_map_[rpc_name] = assigned_cond;
  VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler
          << ", cond:" << assigned_cond;
}

// Makes the condition id registered for `rpc_name` the current one (updated
// under mutex_) and then wakes all threads waiting on rpc_cond_.
void RPCServer::SetCond(const std::string& rpc_name) {
  VLOG(3) << "RPCServer SetCond " << rpc_name;
  {
    std::unique_lock<std::mutex> guard(mutex_);
    const int next_cond = rpc_cond_map_[rpc_name];
    cur_cond_ = next_cond;
  }

  rpc_cond_.notify_all();
}

void RPCServer::WaitCond(const std::string& rpc_name) {
W
Wu Yi 已提交
155
  VLOG(4) << "RPCServer WaitCond " << rpc_name;
156 157 158 159 160 161 162 163 164 165 166
  int cond = 0;
  {
    std::unique_lock<std::mutex> lock(mutex_);
    cond = rpc_cond_map_[rpc_name];
  }

  std::unique_lock<std::mutex> lock(mutex_);
  rpc_cond_.wait(
      lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); });
}

167
}  // namespace distributed
168 169
}  // namespace operators
}  // namespace paddle