fix switch client multithread bug (#42600)

* back fl * delete ssl cert * . * make warning * . * unittest paral degree * solve unittest * heter & multi cloud commm ready * . * . * arm_brpc compile * . * . * . * . * . * . * . * . * . * . * . * . * . * . * only output is ok * base is ok * . * . * . * . * . * . * . * . * add switch server bin * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * adapt brpc ssl * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * fix heter_server & heter_client * . * . * int->int64_t * . * safe map in multithread * fix heter unitest * . * fix code_style * . * fix bug * .

fix switch client multithread bug (#42600)
* back fl * delete ssl cert * . * make warning * . * unittest paral degree * solve unittest * heter & multi cloud commm ready * . * . * arm_brpc compile * . * . * . * . * . * . * . * . * . * . * . * . * . * . * only output is ok * base is ok * . * . * . * . * . * . * . * . * add switch server bin * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * adapt brpc ssl * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * . * fix heter_server & heter_client * . * . * int->int64_t * . * safe map in multithread * fix heter unitest * . * fix code_style * . * fix bug * .
e2540c17 · ziyoujiyi · GitHub · d47690b2 · e2540c17 · e2540c17
Showing 14 additions and 14 deletions

paddle/fluid/distributed/ps/service/heter_client.h paddle/fluid/distributed/ps/service/heter_client.h +9 -12

paddle/fluid/distributed/ps/service/heter_server.cc paddle/fluid/distributed/ps/service/heter_server.cc +5 -2

未找到文件。
--- a/paddle/fluid/distributed/ps/service/heter_client.h
+++ b/paddle/fluid/distributed/ps/service/heter_client.h
@@ -171,7 +171,6 @@ class HeterClient {
  // switch client singleton
  static std::shared_ptr<HeterClient> GetSwitchInstance(
      const std::vector<std::string>& peer_endpoints, int32_t peer_role) {
-    if (switch_s_instance_ == nullptr) {
    std::unique_lock<std::mutex> lock(mtx_);
    if (peer_endpoints.empty()) {
      VLOG(4) << "init switch client failed, null peer_endpoints";
@@ -181,9 +180,7 @@ class HeterClient {
    if (switch_s_instance_ == nullptr) {
      switch_s_instance_.reset(new HeterClient());
      switch_s_instance_->SetPeerSwitchList(peer_endpoints);
-        switch_s_instance_->InitClientChannels(false, peer_endpoints,
+      switch_s_instance_->InitClientChannels(false, peer_endpoints, peer_role);
-                                               peer_role);
-      }
    }
    return switch_s_instance_;
  }

--- a/paddle/fluid/distributed/ps/service/heter_server.cc
+++ b/paddle/fluid/distributed/ps/service/heter_server.cc
@@ -125,6 +125,9 @@ int SendAndRecvVariableHandler::SaveInSwitchWithShard(
    brpc::Controller* cntl) {
  VLOG(4) << "entering SaveInSwitchWithShard";
  int32_t group_id = request->group_id();
+  if (group_id >= FLAGS_heter_world_size) {
+    LOG(ERROR) << "group id exceed maxmium";
+  }
  auto& local_shard = _local_shards[group_id];
  auto& request_io_buffer = cntl->request_attachment();
  butil::IOBufBytesIterator io_buffer_itr(request_io_buffer);
@@ -132,11 +135,11 @@ int SendAndRecvVariableHandler::SaveInSwitchWithShard(
    const auto& var_name = request->send_var_names(idx);
    const auto& var_size = request->vars_len(idx);
    WaitForVarsConsumed(group_id, var_name);
+    std::unique_lock<std::mutex> lk(scope_mutex_);
    auto& value = local_shard[var_name];
    value.resize(var_size);
    io_buffer_itr.copy_and_forward(reinterpret_cast<void*>(value.data()),
                                   var_size);
-    std::unique_lock<std::mutex> lk(scope_mutex_);
    vars_ready_flag[group_id][var_name] = 1;
    VLOG(4) << "saved var_name: " << var_name << "is saved ready!";
  }
@@ -162,11 +165,11 @@ int SendAndRecvVariableHandler::QueryInSwitchWithShard(
    VLOG(4) << "req var name: " << req_var_name;
    response->add_send_var_names(req_var_name);
    WaitForVarsProduced(group_id, req_var_name);
+    std::unique_lock<std::mutex> lk(scope_mutex_);
    auto itr = local_shard.find(req_var_name);
    auto& value = itr.value();
    response_io_buffer.append(value.data(), value.size());
    value.resize(0);  // 清空内存
-    std::unique_lock<std::mutex> lk(scope_mutex_);
    vars_ready_flag[group_id][req_var_name] = 0;
    VLOG(4) << "query var_name: " << req_var_name << "is consumed ready!";
  }