LightNetwork.h 4.9 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Z
zhangjinchao01 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "SocketChannel.h"

Y
Yu Yang 已提交
19
#include <atomic>
Z
zhangjinchao01 已提交
20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
#include <memory>
#include <thread>
#include <vector>

#include "paddle/utils/Thread.h"

struct sxi_socket;

namespace paddle {

class SocketWorker;

/**
 * @brief class for holding all parameters processing for current port
 *
 * @note  each parameter server inherits from one socket server, each
 *        server contains serveral woker threads which are to parallelize
 *        the processing of computation, but share some common datas stored
 *        in child class of socketserver.
 */
class SocketServer : public Thread {
41 42 43
  // rdmaCpu controls the cpu affinity of RDMA server daemon,
  // which could benifit performance. rdmaCpu = -1 means TCP
  // is used instead of RDMA transport.
Z
zhangjinchao01 已提交
44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
public:
  SocketServer(const std::string& addr, int port, int rdmaCpu);
  ~SocketServer();

  virtual void run();

  typedef std::function<void(const std::vector<iovec>& outputIovs)>
      ResponseCallback;

protected:
  //
  // The derived class needs to implement this function
  // to handle the request received by SocketWorker
  // The request is encapsulated by MsgReader, which contains
  // a set of blocks.
  virtual void handleRequest(std::unique_ptr<MsgReader> msgReader,
                             ResponseCallback callback) = 0;

  std::unique_ptr<SocketChannel> createChannel(int sock,
                                               const std::string& peerName) {
    return std::unique_ptr<SocketChannel>(new SocketChannel(sock, peerName));
  }
  std::unique_ptr<SocketChannel> createChannel(struct sxi_sock* sock,
                                               const std::string& peerName) {
    return std::unique_ptr<SocketChannel>(new SocketChannel(sock, peerName));
  }

  friend class SocketWorker;

private:
  void rdmaServer();
  void tcpServer();

  void detach() {}  // detach accept thread is forbidden

protected:
  enum ChannelType tcpRdma_;
  // for rdma
  int rdmaCpu_;
  std::string rdmaUri_;
  sxi_socket* rdmaSocket_;
  // for tcp
  int port_;
  std::string addr_;
  int socket_;
  int maxPendingConnections_;
  bool stopping_;
};

/**
 * @brief class for holding one connection from one trainer
 *
 * @note  all parameter processing will run in the context of this worker
 */
class SocketWorker : public Thread {
public:
  SocketWorker(std::unique_ptr<SocketChannel>&& channel, SocketServer* server)
      : channel_(std::move(channel)), server_(server) {}

  virtual ~SocketWorker() {}

  virtual void run();

protected:
  std::unique_ptr<SocketChannel> channel_;
  SocketServer* server_;
  enum ChannelType tcpRdma_;
};

/**
 * @brief class for providing rdma client deamon thread
 *
 * @note  the deamons are required by sock like rdam library. Here
 *        use singleton model for daemons. Each deamon hosts in
 *        single cpu core for better load balance performance
 */
class RdmaClientDaemons {
private:
  RdmaClientDaemons();

  static std::unique_ptr<RdmaClientDaemons> daemons_;

public:
  static RdmaClientDaemons* get() {
    std::call_once(RdmaClientDaemons::initDataFlag_,
                   &RdmaClientDaemons::getInstance);

    return daemons_.get();
  }

  struct sxi_socket* selectDaemon() {
    int cpu = curCpu_;
    curCpu_ = (curCpu_ + 1) % onlineCpus_;

    LOG(INFO) << "select daemon " << cpu << "onlineCpus_ " << onlineCpus_;
    return rdmaClientSocket_[cpu];
  }

  ~RdmaClientDaemons();

public:
  friend class SocketClient;

private:
  static std::once_flag initDataFlag_;
  static void getInstance() {
    if (!daemons_.get()) daemons_.reset(new RdmaClientDaemons());
  }

  std::vector<struct sxi_socket*> rdmaClientSocket_;
  std::atomic<int> curCpu_;
  int onlineCpus_;
};

/**
 * @brief management for client connection which are from trainers
 *
 * @note  it contains one channel descriptor which used to write and
 *        read data
 */
class SocketClient {
public:
166 167
  SocketClient(const std::string& serverAddr,
               int serverPort,
Z
zhangjinchao01 已提交
168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185
               enum ChannelType channelType);

  SocketChannel* getChannel() { return channel_.get(); }

protected:
  std::unique_ptr<SocketChannel> channel_;
  struct sxi_socket* socketDaemon_;
  enum ChannelType tcpRdma_;

private:
  void RdmaClient(const std::string& serverAddr, int serverPort);
  void TcpClient(const std::string& serverAddr, int serverPort);
};

std::string getIpAddr(std::string& device);
void setOption(int sockfd);

}  // namespace paddle