未验证 提交 032414ca 编写于 作者: T tangwei12 提交者: GitHub

[Feature] one ps (3/4) (#29604)

* oneps (3/4)
Co-authored-by: NMrChengmo <cmchengmo@163.com>
Co-authored-by: Nmalin10 <malin10@baidu.com>
Co-authored-by: Nchengmo <chengmo@baidu.com>
上级 edc06c6a
......@@ -246,17 +246,6 @@ endif()
include(third_party) # download, build, install third_party, Contains about 20+ dependencies
if(WITH_DISTRIBUTE)
if(WITH_GRPC)
message(STATUS "Use grpc framework.")
include(external/grpc)
else()
message(STATUS "Use brpc framework.")
include(external/leveldb)
include(external/brpc)
endif()
endif()
include(flags) # set paddle compile flags
if(WITH_PROFILER)
......
......@@ -33,15 +33,15 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr
INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
# If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF
ExternalProject_Add(
extern_brpc
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
GIT_REPOSITORY "${GIT_URL}/apache/incubator-brpc.git"
GIT_TAG "ad00fe940b4f05225b214131959293bbed8744a0" #rdma branch's head now.
# TODO(gongwb): change to de newst repo when they changed.
GIT_REPOSITORY "https://github.com/wangjiawei04/brpc"
GIT_TAG "6d79e0b17f25107c35b705ea58d888083f59ff47"
PREFIX ${BRPC_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......@@ -63,9 +63,13 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest)
# ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy)
ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog snappy)
ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
ADD_DEPENDENCIES(brpc extern_brpc)
add_definitions(-DBRPC_WITH_GLOG)
LIST(APPEND external_project_dependencies brpc)
......@@ -23,10 +23,10 @@ INCLUDE_DIRECTORIES(${LEVELDB_INCLUDE_DIR})
ExternalProject_Add(
extern_leveldb
${EXTERNAL_PROJECT_LOG_ARGS}
${SHALLOW_CLONE}
PREFIX ${LEVELDB_SOURCES_DIR}
GIT_REPOSITORY "${GIT_URL}/google/leveldb.git"
GIT_REPOSITORY "https://github.com/google/leveldb"
GIT_TAG v1.18
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a
INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/
......@@ -35,6 +35,11 @@ ExternalProject_Add(
BUILD_IN_SOURCE 1
)
ADD_DEPENDENCIES(extern_leveldb snappy)
ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
ADD_DEPENDENCIES(leveldb extern_leveldb)
LIST(APPEND external_project_dependencies leveldb)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include (ExternalProject)
# NOTE: snappy is needed when linking with recordio
set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
if(WIN32)
SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267")
else()
SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
endif()
ExternalProject_Add(
extern_snappy
GIT_REPOSITORY "https://github.com/google/snappy"
GIT_TAG "1.1.7"
PREFIX ${SNAPPY_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DSNAPPY_BUILD_TESTS:BOOL=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
IF(WIN32)
IF(NOT EXISTS "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib")
add_custom_command(TARGET extern_snappy POST_BUILD
COMMAND cmake -E copy ${SNAPPY_INSTALL_DIR}/lib/snappy.lib ${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib
)
ENDIF()
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib")
else(WIN32)
set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
endif (WIN32)
add_library(snappy STATIC IMPORTED GLOBAL)
set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})
include_directories(${SNAPPY_INCLUDE_DIR})
add_dependencies(snappy extern_snappy)
......@@ -95,7 +95,7 @@ include_directories("${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io")
if(NOT APPLE)
find_package(Threads REQUIRED)
link_libraries(${CMAKE_THREAD_LIBS_INIT})
if(WITH_PSLIB)
if(WITH_PSLIB OR WITH_DISTRIBUTE)
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt -lz -lssl")
else()
set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
......
......@@ -233,7 +233,7 @@ if(WITH_PYTHON)
list(APPEND third_party_deps extern_pybind)
endif()
IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
IF(WITH_TESTING OR WITH_DISTRIBUTE)
include(external/gtest) # download, build, install gtest
list(APPEND third_party_deps extern_gtest)
ENDIF()
......@@ -275,14 +275,18 @@ if(WITH_BOX_PS)
list(APPEND third_party_deps extern_box_ps)
endif(WITH_BOX_PS)
if(WITH_DISTRIBUTE)
if (WITH_DISTRIBUTE)
include(external/snappy)
list(APPEND third_party_deps extern_snappy)
if(WITH_GRPC)
list(APPEND third_party_deps extern_grpc)
else()
include(external/leveldb)
list(APPEND third_party_deps extern_leveldb)
include(external/brpc)
list(APPEND third_party_deps extern_brpc)
endif()
include(external/libmct) # download, build, install libmct
list(APPEND third_party_deps extern_libmct)
endif()
if(WITH_XBYAK)
......
......@@ -14,14 +14,9 @@ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
"${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
endif()
add_subdirectory(table)
add_subdirectory(test)
# open it until CI support brpc
return()
add_subdirectory(service)
add_subdirectory(test)
get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
......
......@@ -35,6 +35,6 @@ cc_library(server SRCS server.cc DEPS downpour_server boost ${RPC_DEPS})
cc_library(communicator SRCS communicator.cc DEPS scope client boost table math_function selected_rows_functor ${RPC_DEPS})
cc_library(ps_service SRCS service.cc DEPS communicator client server boost ${RPC_DEPS})
cc_library(brpc_utils SRCS brpc_utils.cc DEPS ${COMMON_DEPS} ${RPC_DEPS})
cc_library(brpc_utils SRCS brpc_utils.cc DEPS tensor device_context ${COMMON_DEPS} ${RPC_DEPS})
cc_library(heter_server SRCS heter_server.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS})
cc_library(heter_client SRCS heter_client.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS})
......@@ -741,7 +741,7 @@ std::future<int32_t> BrpcPsClient::pull_sparse(float **select_values,
request_call_num, [shard_sorted_kvs, value_size](void *done) {
int ret = 0;
auto *closure = (DownpourBrpcClosure *)done;
for (size_t i = 0; i < ids.size(); ++i) {
for (size_t i = 0; i < shard_sorted_kvs->size(); ++i) {
if (closure->check_response(i, PS_PULL_SPARSE_TABLE) != 0) {
ret = -1;
break;
......
......@@ -839,7 +839,7 @@ void GeoCommunicator::InitParams(const RecvCtxMap &recv_varname_to_ctx) {
for (auto &iter : send_varname_to_ctx_) {
auto &ctx = iter.second;
if (!ctx.is_sparse) return;
if (!ctx.is_sparse) continue;
auto &varname = ctx.origin_varnames[0];
auto &table_id = ctx.table_id;
auto param = varname.substr(0, varname.size() - 5);
......@@ -853,12 +853,12 @@ void GeoCommunicator::InitDense(std::vector<std::string> &varnames,
if (trainer_id_ == 0) {
RpcSendDenseParam(varnames, table_id, *recv_scope_);
BarrierWithTable(1);
VLOG(0) << "push dense param to table " << table_id
VLOG(1) << "push dense param to table " << table_id
<< " from 0' trainer done";
} else {
BarrierWithTable(1);
RpcRecvDense(varnames, table_id, recv_scope_);
VLOG(0) << "push dense param to table " << table_id
VLOG(1) << "pull dense param to table " << table_id
<< " from 0' trainer done";
}
......@@ -952,20 +952,20 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
}
void GeoCommunicator::InitSparse(const std::string &var_name, int table_id) {
VLOG(0) << "Init Sparse " << var_name << " : table " << table_id << " begin.";
VLOG(1) << "Init Sparse " << var_name << " : table " << table_id << " begin.";
if (trainer_id_ == 0) {
RpcSendSparseParam(var_name, table_id, *recv_scope_);
BarrierWithTable(1);
VLOG(0) << "push sparse param to table " << table_id
VLOG(1) << "push sparse param to table " << table_id
<< " from 0' trainer done";
} else {
BarrierWithTable(1);
RpcRecvSparse(var_name, table_id, recv_scope_);
VLOG(0) << "push dense param to table " << table_id
VLOG(1) << "pull sparse param to table " << table_id
<< " from 0' trainer done";
}
VLOG(0) << "Init Sparse " << var_name << " : table " << table_id << " done.";
VLOG(1) << "Init Sparse " << var_name << " : table " << table_id << " done.";
auto *global_var = recv_scope_->FindVar(var_name);
auto *var = old_scope_->Var(var_name);
framework::CopyVariable(*global_var, var);
......
......@@ -24,11 +24,11 @@
#include "paddle/fluid/platform/timer.h"
DECLARE_int32(rpc_deadline);
DECLARE_int32(pserver_timeout_ms);
namespace paddle {
namespace distributed {
DEFINE_int32(pserver_timeout_ms, 10800000, "pserver request server timeout_ms");
std::shared_ptr<HeterClient> HeterClient::s_instance_ = NULL;
bool HeterClient::is_initialized_ = false;
......@@ -53,6 +53,23 @@ void HeterClient::Stop() {
}
}
void HeterClient::FinalizeWorker() {
running_ = false;
if (!is_initialized_) {
VLOG(0) << "HeterClient is not inited, do nothing";
} else {
if (main_thread_) {
main_thread_->join();
main_thread_.reset(nullptr);
}
VLOG(1) << "HeterClient Stop Done";
}
}
std::future<int32_t> HeterClient::StopHeterWorker() {
return SendCmd(-1, PS_STOP_SERVER, {});
}
void HeterClient::RpcProfilerControl() {
if (trainer_id_ == 0) {
if (!do_server_profiler_ && platform::IsProfileEnabled()) {
......@@ -73,7 +90,7 @@ void HeterClient::CreateClient2XpuConnection() {
brpc::ChannelOptions options;
options.protocol = "baidu_std";
options.connection_type = "single";
options.timeout_ms = pserver_timeout_ms;
options.timeout_ms = FLAGS_pserver_timeout_ms;
xpu_channels_.resize(xpu_list_.size());
for (size_t i = 0; i < xpu_list_.size(); ++i) {
......@@ -102,7 +119,7 @@ void HeterClient::SendAndRecvAsync(
int num = trainer_id_ % xpu_channels_.size();
brpc::Controller cntl;
cntl.set_timeout_ms(pserver_timeout_ms);
cntl.set_timeout_ms(FLAGS_pserver_timeout_ms);
distributed::MultiVarMsg request, response;
auto& request_io_buffer = cntl.request_attachment();
::paddle::PsService_Stub stub(xpu_channels_[num].get());
......@@ -149,7 +166,7 @@ std::future<int32_t> HeterClient::SendCmd(
}
::paddle::PsService_Stub rpc_stub(xpu_channels_[i].get());
closure->cntl(i)->set_timeout_ms(
pserver_timeout_ms); // cmd msg don't limit timeout for save/load
FLAGS_pserver_timeout_ms); // cmd msg don't limit timeout for save/load
rpc_stub.service(closure->cntl(i), closure->request(i),
closure->response(i), closure);
}
......
......@@ -42,7 +42,7 @@ typedef std::function<void(void*)> HeterRpcCallbackFunc;
class OnHeterRpcDone : public google::protobuf::Closure {
public:
OnHeterRpcDone(HeterRpcCallbackFunc func) : handler_(func) {}
explicit OnHeterRpcDone(HeterRpcCallbackFunc func) : handler_(func) {}
virtual ~OnHeterRpcDone() {}
void Run() {
std::unique_ptr<OnHeterRpcDone> self_guard(this);
......@@ -79,7 +79,6 @@ class HeterClient {
if (NULL == s_instance_) {
is_initialized_ = true;
s_instance_.reset(new paddle::distributed::HeterClient());
std::vector<std::string> xpu_list = {endpoint};
s_instance_->SetXpuList(endpoint);
s_instance_->SetTrainerID(trainer_id);
s_instance_->CreateClient2XpuConnection();
......@@ -89,6 +88,8 @@ class HeterClient {
void Stop();
void FinalizeWorker();
void MainThread();
void RpcProfilerControl();
......@@ -97,6 +98,7 @@ class HeterClient {
const std::vector<std::string>& params);
std::future<int32_t> StartProfiler();
std::future<int32_t> StopProfiler();
std::future<int32_t> StopHeterWorker();
......@@ -104,17 +106,16 @@ class HeterClient {
void SetXpuList(const std::vector<std::string>& xpu_list) {
xpu_list_ = xpu_list;
};
}
void SetTrainerID(const int& trainer_id) { trainer_id_ = trainer_id; }
private:
static std::shared_ptr<HeterClient> s_instance_;
protected:
static bool is_initialized_;
std::unique_ptr<std::thread> main_thread_{nullptr};
std::vector<std::shared_ptr<brpc::Channel>> xpu_channels_;
DISABLE_COPY_AND_ASSIGN(HeterClient);
std::vector<std::string> xpu_list_;
......
......@@ -45,7 +45,11 @@ void HeterServer::StartHeterService() {
}
condition_ready_.notify_all();
server_.Join();
std::unique_lock<std::mutex> running_lock(mutex_);
cv_.wait(running_lock, [&] {
VLOG(1) << "Heter Server is Stop? " << stoped_;
return stoped_;
});
}
void HeterServer::SetEndPoint(std::string& endpoint) {
......@@ -83,6 +87,7 @@ int32_t HeterService::stop_heter_worker(const PsRequestMessage& request,
stop_cpu_worker_set_.insert(client_id);
if (stop_cpu_worker_set_.size() == fan_in_) {
is_exit_ = true;
VLOG(0) << "Stop heter Service done.";
}
return 0;
}
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include <random>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "brpc/channel.h"
#include "brpc/controller.h"
......@@ -34,6 +35,7 @@ limitations under the License. */
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
#include "paddle/fluid/platform/profiler.h"
DECLARE_double(eager_delete_tensor_gb);
namespace paddle {
namespace distributed {
......@@ -82,7 +84,7 @@ class HeterService : public ::paddle::PsService {
response->set_err_code(service_ret);
response->set_err_msg("server internal error");
}
};
}
void SendAndRecvVariable(::google::protobuf::RpcController* controller,
const MultiVarMsg* request, MultiVarMsg* response,
......@@ -134,6 +136,10 @@ class HeterServer {
virtual ~HeterServer() {}
void Stop() {
VLOG(0) << "HeterServer Stop()";
std::unique_lock<std::mutex> lock(mutex_);
stoped_ = true;
cv_.notify_all();
server_.Stop(1000);
server_.Join();
}
......@@ -162,6 +168,10 @@ class HeterServer {
private:
static std::shared_ptr<HeterServer> s_instance_;
mutable std::mutex mutex_;
std::condition_variable cv_;
std::condition_variable condition_ready_;
bool stoped_ = false;
std::string endpoint_;
protected:
......@@ -169,7 +179,7 @@ class HeterServer {
HeterService service_;
DISABLE_COPY_AND_ASSIGN(HeterServer);
std::mutex mutex_ready_;
std::condition_variable condition_ready_;
int ready_;
};
......@@ -215,6 +225,7 @@ class RequestSendAndRecvHandler final : public HeterRequestHandler {
int Handle(const MultiVarMsg* request, MultiVarMsg* response,
brpc::Controller* cntl) override {
platform::RecordEvent record_event("RequestSendAndRecvHandler->Handle");
FLAGS_eager_delete_tensor_gb = -1;
auto& local_scope = scope_->NewScope();
auto message_name = request->message_name();
auto& request_io_buffer = cntl->request_attachment();
......
......@@ -60,6 +60,8 @@ int32_t PSServer::configure(const PSParameter &config, PSEnvironment &env,
_environment = &env;
_shuffled_ins =
paddle::framework::MakeChannel<std::pair<uint64_t, std::string>>();
size_t shard_num = env.get_ps_servers().size();
const auto &downpour_param = _config.downpour_server_param();
uint32_t barrier_table = UINT32_MAX;
......@@ -72,6 +74,7 @@ int32_t PSServer::configure(const PSParameter &config, PSEnvironment &env,
"BarrierTable") {
barrier_table = downpour_param.downpour_table_param(i).table_id();
}
table->set_shard(_rank, shard_num);
table->initialize(downpour_param.downpour_table_param(i),
config.fs_client_param());
_table_map[downpour_param.downpour_table_param(i).table_id()].reset(table);
......
......@@ -12,8 +12,7 @@ cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc sparse
set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(tensor_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_library(tensor_accessor SRCS tensor_accessor.cc DEPS ${TABLE_DEPS} eigen3 ps_framework_proto device_context)
cc_library(tensor_table SRCS tensor_table.cc DEPS ps_framework_proto proto_desc enforce executor tensor device_context simple_threadpool gflags glog )
set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_library(table SRCS table.cc DEPS common_table tensor_table tensor_accessor ps_framework_proto string_helper device_context gflags glog boost)
cc_library(table SRCS table.cc DEPS common_table tensor_accessor ps_framework_proto string_helper device_context gflags glog boost)
......@@ -251,6 +251,30 @@ int32_t CommonSparseTable::initialize_value() {
auto shard = std::make_shared<ValueBlock>(common, &initializers_);
shard_values_.emplace_back(shard);
}
auto accessor = _config.accessor();
std::vector<uint64_t> feasigns;
for (size_t x = 0; x < accessor.fea_dim(); ++x) {
if (x % _shard_num == _shard_idx) {
feasigns.push_back(x);
}
}
VLOG(0) << "has " << feasigns.size() << " ids need to be pre inited";
auto buckets = bucket(feasigns.size(), 10);
for (int x = 0; x < 10; ++x) {
auto bucket_feasigns = buckets[x + 1] - buckets[x];
std::vector<uint64_t> ids(bucket_feasigns);
std::copy(feasigns.begin() + buckets[x], feasigns.begin() + buckets[x + 1],
ids.begin());
std::vector<float> pulls;
pulls.resize(bucket_feasigns * param_dim_);
pull_sparse(pulls.data(), ids.data(), bucket_feasigns);
}
return 0;
}
......
......@@ -34,6 +34,18 @@ class Initializer {
virtual float GetValue() = 0;
virtual void GetValue(std::vector<float> *values, int numel) {
for (int x = 0; x < numel; ++x) {
values->push_back(GetValue());
}
}
virtual void GetValue(float *value, int numel) {
for (int x = 0; x < numel; ++x) {
value[x] = GetValue();
}
}
virtual ~Initializer() {}
protected:
......@@ -54,6 +66,11 @@ class UniformInitializer : public Initializer {
}
float GetValue() override { return dist_(*random_engine_); }
void GetValue(float *value, int numel) {
for (int x = 0; x < numel; ++x) {
value[x] = dist_(*random_engine_);
}
}
private:
float min_;
......@@ -77,6 +94,11 @@ class GaussianInitializer : public Initializer {
}
float GetValue() override { return dist_(*random_engine_); }
void GetValue(float *value, int numel) {
for (int x = 0; x < numel; ++x) {
value[x] = dist_(*random_engine_);
}
}
private:
float std_;
......@@ -94,6 +116,7 @@ class FillConstantInitializer : public Initializer {
}
float GetValue() override { return value_; }
void GetValue(float *value, int numel) { std::fill_n(value, numel, value_); }
private:
float value_;
......
......@@ -68,7 +68,7 @@ inline bool entry<float>(const int count, const float threshold) {
struct VALUE {
explicit VALUE(const std::vector<std::string> &names)
: names_(names), count_(0), unseen_days_(0) {
: names_(names), count_(1), unseen_days_(0), seen_after_last_save_(true) {
values_.resize(names.size());
for (int i = 0; i < static_cast<int>(names.size()); i++) {
places[names[i]] = i;
......@@ -79,6 +79,14 @@ struct VALUE {
values_ = std::move(*values);
}
void set(const std::vector<Initializer *> &inits, std::vector<int> numels) {
for (int x = 0; x < numels.size(); ++x) {
auto &value = values_[x];
value.resize(numels[x]);
inits[x]->GetValue(value.data(), numels[x]);
}
}
void set(const std::vector<std::string> &names,
const std::vector<std::vector<float>> &values) {
for (int i = 0; i < static_cast<int>(names.size()); i++) {
......@@ -117,8 +125,8 @@ struct VALUE {
std::vector<std::string> names_;
int count_;
bool seen_after_last_save_;
int unseen_days_;
bool seen_after_last_save_;
bool is_entry_;
std::vector<std::vector<float>> values_;
std::unordered_map<std::string, int> places;
......@@ -139,15 +147,20 @@ class ValueBlock {
value_dims_.push_back(dim);
}
for (auto &name : value_names_) {
initializer_list_.emplace_back(initializers_->at(name));
}
// for Entry
{
// entry will add later
std::string entry_attr = "none";
if (entry_attr == "none") {
has_entry = false;
entry_func_ =
std::bind(entry<std::string>, std::placeholders::_1, "none");
} else {
has_entry = true;
auto slices = string::split_string<std::string>(entry_attr, "&");
if (slices[0] == "count_filter") {
int threshold = std::stoi(slices[1]);
......@@ -181,6 +194,22 @@ class ValueBlock {
values_[id] = value;
}
void Init(const uint64_t &id, const std::vector<Initializer *> &inits,
int count) {
if (Has(id)) {
PADDLE_THROW(platform::errors::AlreadyExists("id already exist, error"));
}
if (inits.size() != value_names_.size()) {
PADDLE_THROW(
platform::errors::AlreadyExists("values can not match, error"));
}
auto value = new VALUE(value_names_);
value->set(inits, value_dims_);
values_[id] = value;
}
std::vector<std::vector<float> *> Get(
const uint64_t &id, const std::vector<std::string> &value_names) {
auto ret_values = values_.at(id)->get(value_names);
......@@ -195,27 +224,12 @@ class ValueBlock {
void InitFromInitializer(const uint64_t &id,
const std::vector<std::string> &value_names) {
if (Has(id)) {
if (has_entry) {
Update(id);
return;
}
auto rets = std::vector<std::vector<float>>();
rets.resize(value_names_.size());
for (int i = 0; i < static_cast<int>(value_names_.size()); i++) {
auto name = value_names_[i];
auto *init = initializers_->at(name);
auto dim = value_dims_[i];
rets[i].resize(dim);
for (int j = 0; j < static_cast<int>(dim); j++) {
rets[i][j] = init->GetValue();
}
return;
}
Init(id, &rets, 0);
Update(id);
Init(id, initializer_list_, 1);
}
bool GetEntry(const uint64_t &id) {
......@@ -254,10 +268,12 @@ class ValueBlock {
std::unordered_map<uint64_t, VALUE *> values_;
private:
bool has_entry = false;
std::vector<std::string> value_names_;
std::vector<int> value_dims_;
std::function<bool(uint64_t)> entry_func_;
std::unordered_map<std::string, Initializer *> *initializers_;
std::vector<Initializer *> initializer_list_;
};
} // namespace distributed
......
......@@ -22,14 +22,12 @@
#include "paddle/fluid/distributed/table/common_sparse_table.h"
#include "paddle/fluid/distributed/table/sparse_geo_table.h"
#include "paddle/fluid/distributed/table/tensor_accessor.h"
#include "paddle/fluid/distributed/table/tensor_table.h"
namespace paddle {
namespace distributed {
REGISTER_CLASS(Table, CommonDenseTable);
REGISTER_CLASS(Table, CommonSparseTable);
REGISTER_CLASS(Table, DenseTensorTable);
REGISTER_CLASS(Table, SparseGeoTable);
REGISTER_CLASS(Table, BarrierTable);
......
if(APPLE)
return()
endif()
set_source_files_properties(table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(table_test SRCS table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})
set_source_files_properties(dense_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(dense_table_test SRCS dense_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})
set_source_files_properties(sparse_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(sparse_table_test SRCS sparse_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})
set_source_files_properties(geo_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(geo_table_test SRCS geo_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})
set_source_files_properties(barrier_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(barrier_table_test SRCS barrier_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS})
# open it until CI support brpc
return()
set_source_files_properties(brpc_service_dense_sgd_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(brpc_service_dense_sgd_test SRCS brpc_service_dense_sgd_test.cc DEPS scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS})
......
......@@ -120,7 +120,7 @@ TEST(CommonDenseTable, Adam) {
beta2_pow[0] *= beta2;
}
for (int j = 0; j < fea_dim; j++) {
ASSERT_TRUE(abs(param[j] - pull_values[j]) < 1e-6);
ASSERT_TRUE(abs(param[j] - pull_values[j]) < 1e-5);
}
}
......
......@@ -62,7 +62,7 @@ TEST(SparseGeoTable, SSUM) {
std::vector<float> pull_values(init_values.size());
table->pull_sparse(pull_values.data(), init_keys.data(), init_keys.size());
for (size_t i = 0; i < init_keys.size() * emb_dim; i++) {
ASSERT_TRUE(abs(pull_values[i] - init_values[i]) < 1e-6);
ASSERT_TRUE(abs(pull_values[i] - init_values[i]) < 1e-5);
}
std::vector<std::vector<uint64_t>> trainer_keys;
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <ThreadPool.h>
#include <unistd.h>
#include <string>
#include <thread> // NOLINT
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/distributed/table/common_sparse_table.h"
#include "paddle/fluid/distributed/table/depends/large_scale_kv.h"
#include "paddle/fluid/distributed/table/table.h"
namespace paddle {
namespace distributed {
TEST(BENCHMARK, LargeScaleKV) {
int emb_dim = 10;
int trainers = 2;
float beta1 = 0.9;
float beta2 = 0.999;
float epsilon = 1.0e-8;
TableParameter table_config;
table_config.set_table_class("CommonSparseTable");
FsClientParameter fs_config;
Table *table = new CommonSparseTable();
TableAccessorParameter *accessor_config = table_config.mutable_accessor();
accessor_config->set_accessor_class("CommMergeAccessor");
CommonAccessorParameter *common_config = table_config.mutable_common();
common_config->set_name("adam");
common_config->set_table_name("adam_test_table");
common_config->set_trainer_num(trainers);
common_config->add_params("Param");
common_config->add_dims(emb_dim);
common_config->add_initializers("uniform_random&0&-1.0&1.0");
common_config->add_params("LearningRate");
common_config->add_dims(1);
common_config->add_initializers("fill_constant&1.0");
common_config->add_params("Moment1");
common_config->add_dims(emb_dim);
common_config->add_initializers("fill_constant&0.0");
common_config->add_params("Moment2");
common_config->add_dims(emb_dim);
common_config->add_initializers("fill_constant&0.0");
common_config->add_params("Beta1Pow");
common_config->add_dims(1);
common_config->add_initializers("fill_constant&1.0");
common_config->add_params("Beta2Pow");
common_config->add_dims(1);
common_config->add_initializers("fill_constant&1.0");
auto ret = table->initialize(table_config, fs_config);
ASSERT_EQ(ret, 0);
}
} // namespace distributed
} // namespace paddle
......@@ -218,16 +218,16 @@ if(WITH_DISTRIBUTE)
cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
heterxpu_trainer.cc
data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc
heterbox_worker.cc heterbox_trainer.cc ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc
data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc
heterbox_worker.cc heterbox_trainer.cc downpour_worker.cc downpour_worker_opt.cc
pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
device_context scope framework_proto trainer_desc_proto glog fs shell
fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer
lod_rank_table feed_fetch_method sendrecvop_rpc communicator collective_helper ${GLOB_DISTRIBUTE_DEPS}
graph_to_program_pass variable_helper data_feed_proto timer monitor
heter_service_proto)
device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor heter_service_proto fleet)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(multi_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(hogwild_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
endif()
elseif(WITH_PSLIB)
cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
......@@ -239,11 +239,7 @@ elseif(WITH_PSLIB)
device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor pslib_brpc )
# TODO: Fix these unittest failed on Windows
# This unittest will always failed, now no CI will run this unittest
if(NOT WITH_MUSL AND NOT WIN32)
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
else()
cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
......@@ -254,11 +250,6 @@ else()
device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
graph_to_program_pass variable_helper timer monitor)
# TODO: Fix these unittest failed on Windows
# This unittest will always failed, now no CI will run this unittest
if(NOT WITH_MUSL AND NOT WIN32)
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
endif()
target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper conditional_block_op_helper)
......
......@@ -15,10 +15,10 @@ cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_he
cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
if(WITH_DISTRIBUTE)
if(NOT WITH_GRPC)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
endif()
set_source_files_properties(threaded_ssa_graph_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(async_ssa_graph_executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
endif()
......@@ -36,7 +36,7 @@ if(WITH_GPU)
if(WITH_DISTRIBUTE)
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim dynload_cuda selected_rows_functor sendrecvop_rpc)
ddim dynload_cuda selected_rows_functor)
else()
nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim dynload_cuda selected_rows_functor)
......@@ -52,7 +52,7 @@ else()
variable_visitor place device_memory_aligment)
if(WITH_DISTRIBUTE)
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim selected_rows_functor sendrecvop_rpc)
ddim selected_rows_functor)
else()
cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim selected_rows_functor)
......@@ -85,9 +85,7 @@ cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS
cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor)
set(ASYNC_SSA_GRAPH_EXECUTOR_DEPS threaded_ssa_graph_executor)
if(WITH_DISTRIBUTE)
list(APPEND ASYNC_SSA_GRAPH_EXECUTOR_DEPS communicator)
endif()
cc_library(async_ssa_graph_executor SRCS async_ssa_graph_executor.cc DEPS ${ASYNC_SSA_GRAPH_EXECUTOR_DEPS})
cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
......
......@@ -17,7 +17,7 @@
#include "paddle/fluid/framework/variable_helper.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/communicator.h"
#include "paddle/fluid/distributed/service/communicator.h"
#endif
namespace paddle {
......@@ -43,40 +43,7 @@ inline void InitVarsInScope(const std::vector<VarInfo> &var_infos, Scope *scope,
}
// get CommContext and remote send and recv op
void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
#ifdef PADDLE_WITH_DISTRIBUTE
bool need_communicator = false;
for (auto &node : graphs[0]->Nodes()) {
VLOG(3) << "node name " << node->Name();
if (node && node->IsOp()) {
if (node->Name() == "send") {
auto send_varnames =
BOOST_GET_CONST(std::vector<std::string>,
node->Op()->GetNullableAttr("send_varnames"));
if (send_varnames.size() > 0) {
need_communicator = true;
break;
}
}
}
}
if (need_communicator) {
// init communicator here
auto *instance = operators::distributed::Communicator::GetInstance();
auto initialized = instance ? true : false;
PADDLE_ENFORCE_EQ(initialized, true,
platform::errors::InvalidArgument(
"Communicator is not Initialized, you may use "
"FleetAPI(https://github.com/PaddlePaddle/Fleet/tree/"
"develop/markdown_doc/transpiler)"));
}
#endif
}
void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) { return; }
AsyncSSAGraphExecutor::AsyncSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
......@@ -171,12 +138,12 @@ FetchResultType AsyncSSAGraphExecutor::Run(
"results to be fetched!"));
// init once
if (run_futures_.size() == 0 && places_.size() > 1) {
if (strategy_.thread_barrier_) {
#ifdef PADDLE_WITH_DISTRIBUTE
operators::distributed::Communicator::GetInstance()->BarrierTriggerReset(
if (strategy_.thread_barrier_) {
paddle::distributed::Communicator::GetInstance()->BarrierTriggerReset(
places_.size());
#endif
}
#endif
exception_holder_.Clear();
StartOffPythonTrainLoop(return_merged);
}
......
......@@ -19,11 +19,6 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/collective_client.h"
#include "paddle/fluid/operators/distributed/collective_server.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
#endif
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -51,106 +46,6 @@ void ReduceOpHandle::Wait(
}
}
#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
template <typename DevCtx, typename DataType>
void ReduceOpHandle::GatherSelectedRows(
const std::vector<const SelectedRows *> &src_selected_rows,
const std::vector<platform::Place> &in_places,
const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
VarHandle *out_var_handle, const platform::Place &out_place,
SelectedRows *dst_selected_rows) {
const CollectiveContext &collective_context =
*CollectiveContext::GetInstance();
// 1. gather local selected rows, merge them
std::string gathered_var_name = out_var_handle->name() + "_gathered_tmp";
auto scope = local_scopes_.at(out_var_handle->scope_idx());
auto gathered_var_mid = scope->Var(gathered_var_name);
auto gathered_select_rows =
gathered_var_mid->GetMutable<framework::SelectedRows>();
GatherLocalSelectedRowsFunctor functor(
src_selected_rows, in_places, dev_ctxes, out_place, gathered_select_rows);
WaitInputVarGenerated();
functor();
// FIXME(gongwb): remove this Wait.
Wait(dev_ctxes);
// merge them
auto merged_dev_ctx = dynamic_cast<DevCtx *>(dev_ctxes.at(out_place));
std::string merged_var_name =
GetRemoteVarName(out_var_handle->name(), collective_context.trainer_id_);
auto merged_select_rows =
scope->Var(merged_var_name)->GetMutable<SelectedRows>();
operators::math::scatter::MergeAdd<DevCtx, DataType> merge_func;
merge_func(*merged_dev_ctx, *gathered_select_rows, merged_select_rows);
// 2. start collective server if it doesn't exist
operators::distributed::CollectiveServer *server =
operators::distributed::CollectiveServer::GetInstance(
collective_context.endpoints_[collective_context.trainer_id_],
collective_context.endpoints_.size() - 1);
auto rpc_server = server->GetRPCServer();
rpc_server->RegisterVar(merged_var_name,
operators::distributed::kRequestGetMonomerVariable,
scope, merged_dev_ctx);
// 3. gather them from all remote nodes.
std::vector<const SelectedRows *> remote;
operators::distributed::CollectiveClient *client =
operators::distributed::CollectiveClient::GetInstance();
std::vector<operators::distributed::RemoteVar> vars;
for (unsigned int i = 0; i < collective_context.endpoints_.size(); i++) {
if (i == (unsigned)collective_context.trainer_id_) continue;
operators::distributed::RemoteVar var;
var.trainer_id_ = i;
var.var_name_ = GetRemoteVarName(out_var_handle->name(), i);
var.ep_ = collective_context.endpoints_[i];
vars.push_back(var);
VLOG(4) << "gather from:" << var.String();
}
// erase gathered vars
merged_dev_ctx->Wait();
scope->EraseVars(std::vector<std::string>{gathered_var_name});
PADDLE_ENFORCE_EQ(
client->Gather(vars, &remote, *merged_dev_ctx, scope), true,
platform::errors::PreconditionNotMet("Gather SelectedRows failed."));
PADDLE_ENFORCE_EQ(remote.size(), vars.size(),
platform::errors::PreconditionNotMet(
"The number of remotes should be equal to the number "
"of variables to be gathered, but got the number of "
"remotes is %d and the number of variables is %d.",
remote.size(), vars.size()));
// 4. merged local selected rows.
std::vector<const SelectedRows *> all;
all.resize(collective_context.endpoints_.size());
for (auto v : vars) {
all[v.trainer_id_] =
scope->FindVar(v.var_name_)->GetMutable<SelectedRows>();
}
all[collective_context.trainer_id_] = merged_select_rows;
merge_func(*merged_dev_ctx, all, dst_selected_rows);
rpc_server->WaitVarBarrier(merged_var_name);
rpc_server->ClearVar(merged_var_name);
// 5. clear mid vars
std::vector<std::string> tmp_vars{merged_var_name};
for (auto r : vars) {
tmp_vars.push_back(r.var_name_);
}
scope->EraseVars(tmp_vars);
}
#endif
void ReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name());
......@@ -241,25 +136,6 @@ void ReduceOpHandle::RunImpl() {
functor();
return;
}
#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
if (in_selected_rows[0]->value().type() ==
framework::proto::VarType::FP32) {
GatherSelectedRows<platform::CUDADeviceContext, float>(
in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
out_var->GetMutable<framework::SelectedRows>());
} else if (in_selected_rows[0]->value().type() ==
framework::proto::VarType::FP64) {
GatherSelectedRows<platform::CUDADeviceContext, double>(
in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
out_var->GetMutable<framework::SelectedRows>());
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Only support double or float when gather SelectedRows, but got "
"%s.",
framework::DataTypeToString(in_selected_rows[0]->value().type())));
}
#endif
});
} else {
std::vector<const LoDTensor *> lod_tensors =
......
......@@ -18,7 +18,7 @@
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/communicator.h"
#include "paddle/fluid/distributed/service/communicator.h"
#endif
namespace paddle {
......@@ -362,14 +362,11 @@ void ThreadedSSAGraphExecutor::ExecutionFinal(
std::vector<OpHandleBase *> *fetch_ops) {
#ifdef PADDLE_WITH_DISTRIBUTE
if (strategy_.thread_barrier_) {
operators::distributed::Communicator::GetInstance()
->BarrierTriggerDecrement();
paddle::distributed::Communicator::GetInstance()->BarrierTriggerDecrement();
}
#endif
VLOG(3) << "caught exception " << exception_holder_.Type() << ", rethrow it";
ClearFetchOp(graph_, fetch_ops);
exception_holder_.ReThrow();
}
......
......@@ -34,7 +34,6 @@ limitations under the License. */
#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h"
#include "paddle/fluid/operators/controlflow/while_op_helper.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_MKLDNN
......@@ -91,13 +90,13 @@ Executor::~Executor() {
}
void Executor::Close() {
#ifdef PADDLE_WITH_DISTRIBUTE
// TODO(typhoonzero): complete message will need to use real trainer_id,
// except 0.
auto client =
paddle::operators::distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
client->SendComplete();
#endif
// #ifdef PADDLE_WITH_DISTRIBUTE
// // TODO(typhoonzero): complete message will need to use real trainer_id,
// // except 0.
// auto client =
// paddle::operators::distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
// client->SendComplete();
// #endif
}
void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
......
......@@ -16,10 +16,13 @@ limitations under the License. */
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/lodtensor_printer.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/distributed/service/communicator.h"
#endif
namespace paddle {
namespace framework {
......@@ -185,8 +188,7 @@ void HogwildWorker::TrainFilesWithProfiler() {
#ifdef PADDLE_WITH_DISTRIBUTE
if (thread_barrier_) {
operators::distributed::Communicator::GetInstance()
->BarrierTriggerDecrement();
paddle::distributed::Communicator::GetInstance()->BarrierTriggerDecrement();
}
#endif
}
......@@ -216,8 +218,7 @@ void HogwildWorker::TrainFiles() {
}
#ifdef PADDLE_WITH_DISTRIBUTE
if (thread_barrier_) {
operators::distributed::Communicator::GetInstance()
->BarrierTriggerDecrement();
paddle::distributed::Communicator::GetInstance()->BarrierTriggerDecrement();
}
#endif
}
......
......@@ -17,7 +17,10 @@ limitations under the License. */
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/trainer.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/distributed/service/communicator.h"
#endif
namespace paddle {
namespace framework {
......@@ -48,7 +51,7 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc,
#ifdef PADDLE_WITH_DISTRIBUTE
if (trainer_desc.thread_barrier()) {
operators::distributed::Communicator::GetInstance()->BarrierTriggerReset(
paddle::distributed::Communicator::GetInstance()->BarrierTriggerReset(
thread_num_);
}
#endif
......
......@@ -77,8 +77,13 @@ set(SHARED_INFERENCE_SRCS
${mkldnn_quantizer_src_file})
# Create shared inference library defaultly
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
if(NOT WITH_DISTRIBUTE)
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} analysis_predictor)
else()
cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS}
DEPS ${fluid_modules} analysis_predictor fleet ps_service)
endif()
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(paddle_fluid_shared ${os_dependency_modules})
......
#!/bin/sh
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
lib=$1
if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi
num_paddle_syms=$(nm -D ${lib} | grep paddle | wc -l)
num_google_syms=$(nm -D ${lib} | grep google | grep -v paddle | grep "T " | wc -l)
num_paddle_syms=$(nm -D "${lib}" | grep -c paddle )
num_google_syms=$(nm -D "${lib}" | grep google | grep -v paddle | grep -v brpc | grep -c "T " )
if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi
if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi
......
......@@ -20,9 +20,9 @@ add_subdirectory(reduce_ops)
add_subdirectory(sequence_ops)
add_subdirectory(jit)
if(WITH_DISTRIBUTE)
add_subdirectory(distributed)
add_subdirectory(distributed_ops)
add_subdirectory(pscore)
add_subdirectory(collective)
endif()
......@@ -50,10 +50,6 @@ if (WITH_GPU)
endif()
endif()
SET(OP_PREFETCH_DEPS "")
if (WITH_DISTRIBUTE)
SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch)
endif()
SET(OP_MKL_DEPS "")
if (NOT WITH_MKL OR NOT WITH_AVX)
......@@ -70,9 +66,9 @@ if(WITH_UNITY_BUILD)
endif()
register_operators(EXCLUDES py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op
sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
if (WITH_GPU)
# warpctc_op needs cudnn 7 above
......@@ -86,9 +82,10 @@ if (WITH_GPU)
else()
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
endif()
op_library(lstm_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS} lstm_compute)
op_library(eye_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
op_library(recurrent_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute)
op_library(eye_op DEPS ${OP_HEADER_DEPS})
op_library(recurrent_op DEPS ${OP_HEADER_DEPS})
set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
......@@ -163,5 +160,5 @@ if(WITH_UNITY_BUILD)
# Using Unity Build to compile operators, `register_operator` will cause
# the unity library to lose some symbols.
# The specified link dependency needs to be displayed here.
target_link_libraries(paddle_operators_unity ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS} ${COMMON_OP_DEPS})
target_link_libraries(paddle_operators_unity ${OP_HEADER_DEPS} ${COMMON_OP_DEPS})
endif()
include(operators)
set(COLLECTIVE_DEPS "")
if(WITH_GRPC)
set(COLLECTIVE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder grpc++_unsecure grpc_unsecure gpr zlib protobuf node)
else()
set(COLLECTIVE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder brpc leveldb protobuf ssl crypto zlib node)
if(WITH_BRPC_RDMA)
find_library(IBVERBS_LIBRARY NAMES ibverbs)
ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY})
find_library(RDMACM_LIBRARY NAMES rdmacm)
ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY})
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} ibverbs rdmacm)
endif()
endif()
set(COLLECTIVE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/operators/distributed_ops/allreduce_op.h"
namespace paddle {
namespace operators {
class AllReduceOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
};
class AllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "(Tensor), tensor to be allreduced.");
AddOutput("Out", "(Tensor) the result of allreduced.");
AddAttr<int>("reduce_type", "(int) determin the reduce type.")
.SetDefault(0);
AddAttr<bool>(
"sync_mode",
"(bool) whether to synchronize the CUDA stream after nccl call.")
.SetDefault(false);
AddComment(R"DOC(
***AllReduce Operator***
Call NCCL AllReduce internally. Note that this op must be used when one
thread is managing one GPU device.
For speed reasons, reduce_type should be an integer:
0: sum
1: prod
2: max
3: min
If input and output are the same variable, in-place allreduce will be used.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_WITHOUT_GRADIENT(allreduce, ops::AllReduceOp,
ops::AllReduceOpMaker);
REGISTER_OP_CPU_KERNEL(
allreduce, ops::AllReduceOpKernel<plat::CPUDeviceContext, float>,
ops::AllReduceOpKernel<plat::CPUDeviceContext, double>,
ops::AllReduceOpKernel<plat::CPUDeviceContext, int>,
ops::AllReduceOpKernel<plat::CPUDeviceContext, int64_t>,
ops::AllReduceOpKernel<plat::CPUDeviceContext, plat::float16>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/distributed_ops/allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
allreduce, ops::AllReduceOpKernel<plat::CUDADeviceContext, float>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, double>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, int>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, int64_t>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, plat::float16>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class AllReduceOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
platform::errors::PreconditionNotMet(
"AllReduce op can run on gpu place only for now."));
#if defined(PADDLE_WITH_NCCL)
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
int dtype = platform::ToNCCLDataType(in->type());
int64_t numel = in->numel();
auto* sendbuff = in->data<void>();
out->Resize(in->dims());
void* recvbuff = out->mutable_data<T>(place);
auto* comm = dev_ctx.nccl_comm();
// FIXME(typhoonzero): should use nccl stream here.
auto stream = dev_ctx.stream();
PADDLE_ENFORCE_NOT_NULL(
stream, platform::errors::NotFound("Should initialize NCCL firstly."));
int reduce_type = ctx.Attr<int>("reduce_type");
ncclRedOp_t red_type = ncclSum;
switch (reduce_type) {
case 0:
red_type = ncclSum;
break;
case 1:
red_type = ncclProd;
break;
case 2:
red_type = ncclMax;
break;
case 3:
red_type = ncclMin;
break;
}
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, numel, static_cast<ncclDataType_t>(dtype), red_type,
comm, stream));
if (ctx.Attr<bool>("sync_mode")) {
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
}
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with GPU."));
#endif
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <ostream>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class BroadcastOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
platform::errors::InvalidArgument(
"Input(X) of BroadcastOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
platform::errors::InvalidArgument(
"Output(Output) of ConvOp should not be null."));
}
};
class BroadcastOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "(Tensor), tensor to be broadcast.");
AddOutput("Out", "(Tensor) the result of broadcast.");
AddAttr<bool>(
"sync_mode",
"(bool) whether to synchronize the CUDA stream after nccl call.")
.SetDefault(false);
AddAttr<int>("root", "(int).").SetDefault(0).EqualGreaterThan(0);
AddComment(R"DOC(
***Broadcast Operator***
Call NCCL Broadcast internally. Note that this op must be used when one
thread is managing one GPU device.
)DOC");
}
};
template <typename T>
class BroadcastOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Broadcast op can run on gpu place only for now."));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_WITHOUT_GRADIENT(broadcast, ops::BroadcastOp,
ops::BroadcastOpMaker);
REGISTER_OP_CPU_KERNEL(broadcast, ops::BroadcastOpKernel<float>,
ops::BroadcastOpKernel<double>,
ops::BroadcastOpKernel<int>,
ops::BroadcastOpKernel<int64_t>,
ops::BroadcastOpKernel<plat::float16>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace ops = paddle::operators;
namespace plat = paddle::platform;
namespace paddle {
namespace operators {
template <typename T>
class NCCLBroadcastOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::PreconditionNotMet(
"The place of ExecutionContext should be CUDAPlace."));
#if defined(PADDLE_WITH_NCCL)
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).device;
int root_dev_id = ctx.Attr<int>("root");
auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
PADDLE_ENFORCE_EQ(
out->IsInitialized(), true,
platform::errors::PreconditionNotMet(
"Currently, the output of broadcast op must be initialized,"
"because this op can only be an In-Place operation."));
void* send_recv_buffer = out->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE_EQ(
send_recv_buffer, in->data<void>(),
platform::errors::PreconditionNotMet("Currently, the broadcast op can "
"only be an In-Place operation."));
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto comm = dev_ctx.nccl_comm();
auto stream = dev_ctx.stream();
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
send_recv_buffer, static_cast<size_t>(in->numel()),
platform::ToNCCLDataType(in->type()), root_dev_id, comm, stream));
VLOG(3) << "Bcast " << ctx.InputNames("X")[0] << ", (" << in->numel() << ")"
<< " From " << root_dev_id << " to " << dev_id;
if (ctx.Attr<bool>("sync_mode")) {
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
}
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with GPU."));
#endif
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(broadcast, ops::NCCLBroadcastOpKernel<float>,
ops::NCCLBroadcastOpKernel<double>,
ops::NCCLBroadcastOpKernel<int>,
ops::NCCLBroadcastOpKernel<int64_t>,
ops::NCCLBroadcastOpKernel<plat::float16>);
......@@ -23,8 +23,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
......
if(NOT WITH_DISTRIBUTE)
return()
endif()
return()
if(WITH_GRPC)
set(cc_generic_services "false")
......
......@@ -28,10 +28,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include "paddle/fluid/platform/transform.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#endif
namespace paddle {
namespace operators {
......
......@@ -24,10 +24,6 @@ limitations under the License. */
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/operators/math/blas.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#endif
namespace paddle {
namespace operators {
......
......@@ -23,10 +23,6 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#endif
namespace paddle {
namespace operators {
......
......@@ -24,10 +24,6 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#endif
namespace paddle {
namespace operators {
......
......@@ -26,10 +26,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/sampler.h"
#include "unsupported/Eigen/CXX11/Tensor"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#endif
namespace paddle {
namespace operators {
......@@ -187,72 +183,7 @@ class NCEKernel : public framework::OpKernel<T> {
// forward mul
auto input_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
// for remote prefetch
auto remote_prefetch = context.Attr<bool>("remote_prefetch");
auto epmap = context.Attr<std::vector<std::string>>("epmap");
if (remote_prefetch && !epmap.empty()) {
// if epmap is not empty, then the parameter will be fetched from remote
// parameter
// server
std::vector<int64_t> labels;
for (int64_t i = 0; i < sample_labels->numel(); ++i) {
labels.push_back(sample_labels_data[i]);
}
std::set<T> st(labels.begin(), labels.end());
labels.assign(st.begin(), st.end());
framework::Scope &local_scope = context.scope().NewScope();
auto table_names = context.Attr<std::vector<std::string>>("table_names");
auto *ids = local_scope.Var("Ids@Prefetch");
auto *x_tensor = ids->GetMutable<framework::LoDTensor>();
x_tensor->mutable_data<int64_t>(
framework::make_ddim({static_cast<int64_t>(labels.size()), 1}),
context.GetPlace());
// copy.
std::memcpy(x_tensor->data<int64_t>(), labels.data(),
labels.size() * sizeof(int64_t));
std::vector<int> w_dims = paddle::framework::vectorize<int>(
context.Input<Tensor>("Weight")->dims());
w_dims[0] = static_cast<int>(labels.size());
auto *w_tensor = local_scope.Var("Weight@Prefetch")
->GetMutable<framework::LoDTensor>();
w_tensor->Resize(framework::make_ddim(w_dims));
#ifdef PADDLE_WITH_DISTRIBUTE
auto weight = context.InputNames("Weight").front();
operators::distributed::prefetch("Ids@Prefetch", "Weight@Prefetch",
weight, false, table_names, epmap,
context, local_scope);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"paddle is not compiled with distribute support, can not do "
"parameter prefetch!"));
#endif
auto weight_mat = EigenMatrix<T>::From(
(local_scope.Var("Weight@Prefetch")->Get<framework::LoDTensor>()));
for (int64_t i = 0; i < sample_labels->numel(); ++i) {
std::vector<int64_t>::iterator it =
std::find(labels.begin(), labels.end(), sample_labels_data[i]);
int idx = std::distance(labels.begin(), it);
Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
(input_mat.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
weight_mat.chip(idx, 0))
.sum();
sample_out_data[i] += result(0);
sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i])));
}
context.scope().DeleteScope(&local_scope);
} else {
auto weight_mat =
EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
auto weight_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
for (int64_t i = 0; i < sample_labels->numel(); ++i) {
Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
(input_mat.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
......@@ -261,7 +192,6 @@ class NCEKernel : public framework::OpKernel<T> {
sample_out_data[i] += result(0);
sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i])));
}
}
// forward cost
for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) {
......
include(operators)
set(DISTRIBUTE_DEPS "")
list(APPEND DISTRIBUTE_DEPS fleet ps_service brpc_utils heter_server heter_client ps_framework_proto framework_proto sendrecv_rpc brpc leveldb ssl crypto protobuf gflags glog zlib snappy)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
set(DISTRIBUTE_COMPILE_FLAGS
"${DISTRIBUTE_COMPILE_FLAGS} -faligned-new")
endif()
file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
list(REMOVE_DUPLICATES OPS)
foreach (src ${OPS})
set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
endforeach ()
register_operators()
set(OPERATOR_DEPS ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} PARENT_SCOPE)
set_source_files_properties(heter_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(heter_server_test SRCS heter_server_test.cc DEPS ${RPC_DEPS} ${DISTRIBUTE_DEPS} executor scope proto_desc scale_op)
set_source_files_properties(heter_listen_and_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(heter_listen_and_server_test SRCS heter_listen_and_server_test.cc DEPS executor scope proto_desc scale_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS})
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/pscore/distributed_lookup_table_op.h"
namespace paddle {
namespace operators {
constexpr int64_t kNoPadding = -1;
class DistributedLookupTableOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInputs("Ids"), true,
platform::errors::InvalidArgument(
"Input(Ids) of LookupTableOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true,
platform::errors::InvalidArgument(
"Input(W) of LookupTableOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutputs("Outputs"), true,
platform::errors::InvalidArgument(
"Output(Outs) of LookupTableOp should not be null."));
auto ids_dims = ctx->GetInputsDim("Ids");
auto table_dims = ctx->GetInputDim("W");
PADDLE_ENFORCE_EQ(
table_dims.size(), 2,
platform::errors::InvalidArgument(
"Only 2 dimensions of the 'Embedding' is supported."));
for (auto &ids_dim : ids_dims) {
PADDLE_ENFORCE_EQ(ids_dim.size(), 2,
platform::errors::InvalidArgument(
"The dimension of the 'Ids' tensor must be 2."));
}
// for fluid.embedding
auto lookup_table_version =
ctx->Attrs().Get<std::string>("lookup_table_version");
auto outputs_dims = std::vector<framework::DDim>();
for (auto &ids_dim : ids_dims) {
if (lookup_table_version == "lookup_table") {
outputs_dims.push_back(
framework::make_ddim({ids_dim[0], table_dims[1]}));
} else if (lookup_table_version == "lookup_table_v2") {
outputs_dims.push_back(framework::make_ddim(
{static_cast<int64_t>(ids_dim[0]), static_cast<int64_t>(ids_dim[1]),
static_cast<int64_t>(table_dims[1])}));
}
}
ctx->SetOutputsDim("Outputs", outputs_dims);
ctx->ShareLoD("Ids", /*->*/ "Outputs");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
ctx.GetPlace());
}
};
class DistributedLookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Ids",
"(LoDTensor) Ids's type should be LoDTensor"
"THe ids to be looked up in W.")
.AsDuplicable();
AddInput("W",
"(Tensor) The input represents embedding tensors, "
"which is a learnable parameter.");
AddOutput("Outputs",
"(LoDTensor) The lookup results, which have the same type as W.")
.AsDuplicable();
AddAttr<int>("table_id", "sparse table id").SetDefault(0);
AddAttr<bool>("is_distributed",
"(boolean, default false) distributed lookup table.")
.SetDefault(false);
AddAttr<std::string>(
"lookup_table_version",
"(string, default lookup_table) "
"To distinguish between different versions of embedding OP")
.SetDefault(std::string("lookup_table"));
AddAttr<int64_t>("padding_idx",
"(int64, default -1) "
"If the value is -1, it makes no effect to lookup. "
"Otherwise the given value indicates padding the output "
"with zeros whenever lookup encounters it in Ids.")
.SetDefault(kNoPadding);
AddAttr<int>("dtype",
"(int, default 5 (FP32)) "
"Output data type")
.SetDefault(framework::proto::VarType::FP32);
AddComment(R"DOC(
Lookup Tablel Prefetch Operator.
This operator is used to perform lookup on parameter W,
then concatenated into a sparse tensor.
The type of Ids(Input) is SelectedRows, the rows of Ids contains
the ids to be looked up in W;
if the Id is not in the sparse table, this operator will return a
random value and set the value into the table for the next looking up.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(distributed_lookup_table, ops::DistributedLookupTableOp,
ops::DistributedLookupTableOpMaker);
REGISTER_OP_CPU_KERNEL(distributed_lookup_table,
ops::DistributedLookupTableKernel<
paddle::platform::CPUDeviceContext, float>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pscore/distributed_lookup_table_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
distributed_lookup_table,
ops::DistributedLookupTableKernel<plat::CUDADeviceContext, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/fleet.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class DistributedLookupTableKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto &scope = context.scope();
auto padding_idx = context.Attr<int64_t>("padding_idx");
auto table_id = context.Attr<int>("table_id");
auto embedding_name = context.InputNames("W").front();
int64_t emb_dim = 0;
auto *var = scope.FindVar(embedding_name);
if (var->IsType<framework::LoDTensor>()) {
emb_dim = var->Get<framework::LoDTensor>().dims()[1];
} else if (var->IsType<framework::SelectedRows>()) {
emb_dim = var->Get<framework::SelectedRows>().value().dims()[1];
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of `W` must be Tensor, SelectedRows.But got "
"unsupport type: %s.",
framework::ToTypeName(var->Type())));
}
auto inputs = context.MultiInput<framework::LoDTensor>("Ids");
auto outputs = context.MultiOutput<framework::LoDTensor>("Outputs");
auto fleet = distributed::FleetWrapper::GetInstance();
if (platform::is_cpu_place(context.GetPlace())) {
fleet->PullSparseToTensorSync(static_cast<uint64_t>(table_id), emb_dim,
static_cast<uint64_t>(padding_idx),
context.GetPlace(), &inputs, &outputs);
} else {
auto inputs_variable = context.MultiInputVar("Ids");
auto outputs_variable = context.MultiOutputVar("Outputs");
auto inputs_name = context.InputNames("Ids");
auto outputs_name = context.OutputNames("Outputs");
auto cpu_place = platform::CPUPlace();
framework::Scope *tmp_scope = scope.NewTmpScope().release();
std::vector<const framework::LoDTensor *> tmp_input_vec;
auto input_var_size = inputs_variable.size();
std::vector<framework::LoDTensor *> tmp_output_vec;
auto output_var_size = outputs_variable.size();
// create temp input
for (size_t idx = 0; idx < input_var_size; ++idx) {
framework::Variable *tmp_input_var = tmp_scope->Var(inputs_name[idx]);
framework::LoDTensor *tmp_input_tensor =
tmp_input_var->GetMutable<framework::LoDTensor>();
framework::TensorCopy(inputs_variable[idx]->Get<framework::LoDTensor>(),
cpu_place, context.device_context(),
tmp_input_tensor);
tmp_input_vec.push_back(tmp_input_tensor);
}
// create temp output
for (size_t idx = 0; idx < output_var_size; ++idx) {
framework::Variable *tmp_output_var = tmp_scope->Var(outputs_name[idx]);
framework::LoDTensor *tmp_output_tensor =
tmp_output_var->GetMutable<framework::LoDTensor>();
tmp_output_tensor->Resize(outputs[idx]->dims());
tmp_output_vec.push_back(tmp_output_tensor);
}
// use fleet->PullSparse
fleet->PullSparseToTensorSync(static_cast<uint64_t>(table_id), emb_dim,
static_cast<uint64_t>(padding_idx),
cpu_place, &tmp_input_vec, &tmp_output_vec);
// cp temp to origin
for (size_t idx = 0; idx < output_var_size; ++idx) {
framework::Variable *tmp_output_var = tmp_scope->Var(outputs_name[idx]);
framework::LoDTensor *tmp_output_tensor =
tmp_output_var->GetMutable<framework::LoDTensor>();
framework::TensorCopy(
*tmp_output_tensor, context.GetPlace(), context.device_context(),
outputs_variable[idx]->GetMutable<framework::LoDTensor>());
}
delete tmp_scope;
}
auto id_names = context.InputNames("Ids");
auto out_names = context.OutputNames("Outputs");
auto lookup_table_version =
context.Attr<std::string>("lookup_table_version");
if (lookup_table_version == "lookup_table_v2") {
for (size_t i = 0; i < id_names.size(); ++i) {
auto *id_var = scope.FindVar(id_names[i]);
auto *out_var = scope.FindVar(out_names[i]);
auto *id_tensor = id_var->GetMutable<framework::LoDTensor>();
auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
auto id_dims = id_tensor->dims();
out_tensor->Resize(framework::make_ddim(
{static_cast<int64_t>(id_dims[0]), static_cast<int64_t>(id_dims[1]),
static_cast<int64_t>(emb_dim)}));
}
}
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
class FakeInitInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "FakeInit");
auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
ctx->SetOutputDim("Out", framework::make_ddim(shape));
}
};
class FakeInitOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
framework::Tensor *tensor = nullptr;
auto &out_var = *scope.FindVar(Output("Out"));
if (out_var.IsType<framework::LoDTensor>()) {
tensor = out_var.GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
} else if (out_var.IsType<framework::SelectedRows>()) {
tensor = out_var.GetMutable<framework::SelectedRows>()->mutable_value();
tensor->Resize(framework::make_ddim(Attr<std::vector<int64_t>>("shape")));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"fake init op's output only"
"supports SelectedRows and LoDTensor"));
}
}
};
class FakeInitOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext *ctx) const override {}
};
class FakeInitOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddAttr<std::vector<int64_t>>("shape",
"(vector<int64_t>) The shape of the output");
AddOutput("Out",
"(Tensor) Tensor of specified shape will be filled "
"with the specified value");
AddComment(R"DOC(
FakeInit Operator.
Init an variable but not alloc memory for it, it is used for init the
table parameter at trainer side in distributed lookup table.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
fake_init, ops::FakeInitOp, ops::FakeInitInferShape, ops::FakeInitOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
ops::FakeInitOpVarTypeInference);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/service/communicator.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace framework {
class InferShapeContext;
class OpDesc;
class Scope;
template <typename T>
class EmptyGradOpMaker;
} // namespace framework
namespace imperative {
class OpBase;
} // namespace imperative
namespace distributed {
class Communicator;
} // namespace distributed
} // namespace paddle
namespace paddle {
namespace operators {
class FetchBarrierOp : public framework::OperatorBase {
public:
FetchBarrierOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
VLOG(4) << "FetchBarrier Sync, do not need now";
}
};
class FetchBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "(Any) Dummy inputs, used for control dependency")
.AsDispensable()
.AsDuplicable();
AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
.AsDuplicable();
AddComment(R"DOC(
SendBarrier operator
This operator will send a send barrier signal to list_and_serv op, so that
the Parameter Server would knew all variables have been sent.
)DOC");
AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
AddAttr<std::vector<std::string>>("endpoints",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints to send variables to.")
.SetDefault({"127.0.0.1:6164"});
}
};
class FetchBarrierOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
fetch_barrier, ops::FetchBarrierOp,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
ops::FetchBarrierOpMaker, ops::FetchBarrierOpShapeInference);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdio.h> // for removing the port file
#include <csignal>
#include <cstdlib>
#include <fstream>
#include <thread> // NOLINT
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/pscore/heter_listen_and_serv_op.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send");
namespace paddle {
namespace operators {
static void split(const std::string &str, char sep,
std::vector<std::string> *pieces) {
pieces->clear();
if (str.empty()) {
return;
}
size_t pos = 0;
size_t next = str.find(sep, pos);
while (next != std::string::npos) {
pieces->push_back(str.substr(pos, next - pos));
pos = next + 1;
next = str.find(sep, pos);
}
if (!str.substr(pos).empty()) {
pieces->push_back(str.substr(pos));
}
}
HeterListenAndServOp::HeterListenAndServOp(
const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
HeterListenAndServOp::~HeterListenAndServOp() { Stop(); }
void HeterListenAndServOp::Stop() {}
void HeterListenAndServOp::RunAsyncLoop(framework::Executor *executor,
framework::ProgramDesc *program,
framework::Scope *recv_scope) const {
VLOG(2) << "RunAsyncLoop";
auto message_to_block_id_str =
Attr<std::vector<std::string>>("message_to_block_id");
DoubleFindMap<std::string, int32_t> message_to_block_id;
auto append_block_maps = [](DoubleFindMap<std::string, int32_t> *out_map,
const std::string &grad_and_id) {
std::vector<std::string> pieces;
split(grad_and_id, ':', &pieces);
VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1];
PADDLE_ENFORCE_EQ(pieces.size(), 2,
platform::errors::PreconditionNotMet(
"Invalid format of message_and_id argument. "
"Expected \"message:block_id\". Recieved %s",
grad_and_id.c_str()));
PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0,
platform::errors::AlreadyExists(
"The message name %s has already existed in out_map",
pieces[0].c_str()));
int block_id = std::stoi(pieces[1]);
(*out_map)[pieces[0]] = block_id;
};
for (const auto &message_and_id : message_to_block_id_str) {
append_block_maps(&message_to_block_id, message_and_id);
}
size_t num_blocks = program->Size();
PADDLE_ENFORCE_GE(num_blocks, 1,
platform::errors::PreconditionNotMet(
"Invalid number of blocks in server program. Expected "
"equal or greater than 1. Recieved %zu",
num_blocks));
std::vector<int> block_list;
for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
block_list.push_back(blkid);
}
auto optimize_prepared = executor->Prepare(*program, block_list);
// execute global block if needed, block id 1 in the program is global
// block if it's not bind to a grad var for it's update.
if (block_list[0] == 1 &&
message_to_block_id.find_value(static_cast<int32_t>(1)) ==
message_to_block_id.end()) {
executor->RunPreparedContext(optimize_prepared[0].get(), recv_scope);
}
std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>>
message_to_prepared_ctx;
for (size_t i = 0; i < block_list.size(); ++i) {
auto blkid = block_list[i];
auto it = message_to_block_id.find_value(blkid);
if (it != message_to_block_id.end()) {
message_to_prepared_ctx[it->first] = optimize_prepared[i];
}
}
request_send_and_recv_handler_->SetGradToPreparedCtx(
&message_to_prepared_ctx);
for (size_t i = 0; i < block_list.size(); ++i) {
auto blkid = block_list[i];
auto it = message_to_block_id.find_value(blkid);
rpc_service_->RegisterServiceHandler(
it->first, [&](const MultiVarMsg *request, MultiVarMsg *response,
brpc::Controller *cntl) -> int {
return request_send_and_recv_handler_->Handle(request, response,
cntl);
});
}
while (true) {
if (rpc_service_->IsExit()) {
rpc_service_->Stop();
VLOG(0) << "get exit. rpc_processor stop!";
break;
}
sleep(1);
} // while(true)
}
void RunServer(std::shared_ptr<paddle::distributed::HeterServer> service) {
service->StartHeterService();
}
void HeterListenAndServOp::RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const {
// Mark this as PS that it should decide profiling by listening from trainer.
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
VLOG(1) << "HeterListenAndServOp::RunImpl On gpu? "
<< platform::is_gpu_place(dev_place);
framework::Scope &recv_scope = scope.NewScope();
auto pserver_id = Attr<int>("pserver_id");
auto fan_in = Attr<int>("fanin");
auto inputs = Inputs("X");
PADDLE_ENFORCE_EQ(rpc_service_, nullptr,
platform::errors::PreconditionNotMet(
"RPC service has been created unexpectedly."));
std::string endpoint = Attr<std::string>("endpoint");
VLOG(4) << "pserver_id: " << pserver_id << ", end_point:" << endpoint;
rpc_service_ = distributed::HeterServer::GetInstance();
rpc_service_->SetEndPoint(endpoint);
rpc_service_->SetFanin(fan_in);
auto optimize_blocks =
Attr<std::vector<framework::BlockDesc *>>("optimize_blocks");
PADDLE_ENFORCE_GE(optimize_blocks.size(), 1,
platform::errors::PreconditionNotMet(
"optimize blocks is less than 1. Optimize blocks "
"should be 1 at least on the pserver side."));
auto *program = optimize_blocks[0]->Program();
framework::Executor executor(dev_place);
request_send_and_recv_handler_.reset(
new distributed::RequestSendAndRecvHandler());
request_send_and_recv_handler_->SetScope(&recv_scope);
request_send_and_recv_handler_->SetDevCtx(&dev_ctx);
request_send_and_recv_handler_->SetProgram(program);
request_send_and_recv_handler_->SetExecutor(&executor);
VLOG(2) << "RunAsyncLoop";
auto message_to_block_id_str =
Attr<std::vector<std::string>>("message_to_block_id");
// start the server listening after all member initialized.
server_thread_.reset(new std::thread(RunServer, rpc_service_));
VLOG(3) << "wait server thread to become ready...";
rpc_service_->WaitServerReady();
RunAsyncLoop(&executor, program, &recv_scope);
VLOG(3) << "Wait for Server_thread_ stop";
(server_thread_.get())->join();
VLOG(3) << "Server_thread_ stop";
}
class HeterListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "(Tensor) Variables that server recv.").AsDuplicable();
AddComment(
R"DOC(" + "HeterListenAndServ operator" + "\n" + "This operator" +
" will start a RPC server which can receive variables from send_op and send" +
"back variables to recv_op.)DOC");
AddAttr<std::string>("endpoint",
"(string, default 127.0.0.1:6164)"
"IP address to listen on.")
.SetDefault("127.0.0.1:6164")
.AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
AddAttr<int>("pserver_id",
"(int, default -1), the parameter server index id")
.SetDefault(-1);
AddAttr<std::vector<std::string>>(
"message_to_block_id",
"['param1@GRAD.block0:1', 'param2@GRAD.blockn:2'] "
"a map from message name to it's optimize block id")
.SetDefault({});
AddAttr<int>("distributed_mode",
"indicate distriubte training mode, 0 is sync, 1 is "
"fully-async, 2 is half-async, 3 is geo")
.SetDefault(0);
AddAttr<std::vector<framework::BlockDesc *>>(
"optimize_blocks", "Optimize blocks to run on server side.")
.SetDefault({});
AddAttr<int>("fanin", "How many clients send to this server.")
.SetDefault(1);
AddAttr<int>("rpc_exec_thread_num", "pserver send thread num.")
.SetDefault(1);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(heter_listen_and_serv, ops::HeterListenAndServOp,
ops::HeterListenAndServOpMaker);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <atomic>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/distributed/service/brpc_utils.h"
#include "paddle/fluid/distributed/service/heter_server.h"
#include "paddle/fluid/distributed/service/sendrecv.pb.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
class Executor;
class ProgramDesc;
class Scope;
} // namespace framework
namespace platform {
class DeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
using MultiVarMsg = ::paddle::MultiVariableMessage;
using VarMsg = ::paddle::VariableMessage;
template <class TKey, class TValue>
class DoubleFindMap : public std::unordered_map<TKey, TValue> {
public:
typename std::unordered_map<TKey, TValue>::iterator find_value(TValue v) {
return std::find_if(this->begin(), this->end(),
[&v](const std::pair<const std::string, int> p) {
return p.second == v;
});
}
};
void RunServer(std::shared_ptr<paddle::distributed::HeterServer> service);
class HeterListenAndServOp : public framework::OperatorBase {
public:
HeterListenAndServOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs);
virtual ~HeterListenAndServOp();
void RunAsyncLoop(framework::Executor* executor,
framework::ProgramDesc* program,
framework::Scope* recv_scope) const;
void Stop() override;
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override;
protected:
mutable std::shared_ptr<paddle::distributed::HeterServer> rpc_service_;
mutable std::shared_ptr<std::thread> server_thread_;
mutable std::shared_ptr<paddle::distributed::HeterRequestHandler>
request_send_and_recv_handler_;
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdlib.h>
#include <unistd.h>
#include <chrono> // NOLINT
#include <memory>
#include <string>
#include <thread> // NOLINT
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/distributed/service/brpc_utils.h"
#include "paddle/fluid/distributed/service/heter_client.h"
#include "paddle/fluid/distributed/service/heter_server.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/pscore/heter_listen_and_serv_op.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace distributed = paddle::distributed;
using MultiVarMsg = ::paddle::MultiVariableMessage;
using VarMsg = ::paddle::VariableMessage;
DECLARE_double(eager_delete_tensor_gb);
USE_OP(scale);
USE_NO_KERNEL_OP(heter_listen_and_serv);
framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) {
framework::BlockDesc* block =
program->AppendBlock(*(program->MutableBlock(0)));
framework::OpDesc* op = block->AppendOp();
op->SetType("scale");
op->SetInput("X", {"x"});
op->SetOutput("Out", {"res"});
op->SetAttr("scale", 0.5f);
auto* out = block->Var("res");
out->SetType(framework::proto::VarType::LOD_TENSOR);
out->SetShape({1, 10});
return block;
}
void GetHeterListenAndServProgram(framework::ProgramDesc* program) {
auto root_block = program->MutableBlock(0);
auto* sub_block = AppendSendAndRecvBlock(program);
std::vector<framework::BlockDesc*> optimize_blocks;
optimize_blocks.push_back(sub_block);
std::vector<std::string> message_to_block_id = {"x:1"};
std::string endpoint = "127.0.0.1:19944";
framework::OpDesc* op = root_block->AppendOp();
op->SetType("heter_listen_and_serv");
op->SetInput("X", {});
op->SetAttr("message_to_block_id", message_to_block_id);
op->SetAttr("optimize_blocks", optimize_blocks);
op->SetAttr("endpoint", endpoint);
op->SetAttr("fanin", 1);
op->SetAttr("pserver_id", 0);
}
void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
auto x_var = scope->Var("x");
x_var->GetMutable<framework::LoDTensor>();
auto res_var = scope->Var("res");
res_var->GetMutable<framework::LoDTensor>();
}
void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
int64_t rows_numel) {
CreateVarsOnScope(scope, place);
auto x_var = scope->Var("x")->GetMutable<framework::LoDTensor>();
float* x_ptr =
x_var->mutable_data<float>(framework::DDim({1, rows_numel}), *place);
for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0;
auto res_var = scope->Var("res")->GetMutable<framework::LoDTensor>();
float* res_ptr =
res_var->mutable_data<float>(framework::DDim({1, rows_numel}), *place);
for (int64_t i = 0; i < rows_numel; ++i) res_ptr[i] = 1.0;
}
void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
int64_t rows_numel) {
CreateVarsOnScope(scope, place);
}
void StartHeterServer() {
framework::ProgramDesc program;
framework::Scope scope;
platform::CPUPlace place;
framework::Executor exe(place);
platform::CPUDeviceContext ctx(place);
LOG(INFO) << "before GetHeterListenAndServProgram";
GetHeterListenAndServProgram(&program);
auto prepared = exe.Prepare(program, 0);
LOG(INFO) << "before InitTensorsOnServer";
InitTensorsOnServer(&scope, &place, 10);
LOG(INFO) << "before RunPreparedContext";
exe.RunPreparedContext(prepared.get(), &scope, false);
}
TEST(HETER_LISTEN_AND_SERV, CPU) {
setenv("http_proxy", "", 1);
setenv("https_proxy", "", 1);
std::string endpoint = "127.0.0.1:19944";
LOG(INFO) << "before StartSendAndRecvServer";
FLAGS_eager_delete_tensor_gb = -1;
std::thread server_thread(StartHeterServer);
sleep(1);
LOG(INFO) << "before HeterClient::GetInstance";
distributed::HeterClient* rpc_client =
distributed::HeterClient::GetInstance({endpoint}, 0).get();
PADDLE_ENFORCE_NE(rpc_client, nullptr,
platform::errors::InvalidArgument(
"Client Start Fail, Check Your Code & Env"));
framework::Scope scope;
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
// create var on local scope
int64_t rows_numel = 10;
LOG(INFO) << "before InitTensorsOnClient";
InitTensorsOnClient(&scope, &place, rows_numel);
std::string in_var_name("x");
std::string out_var_name("res");
std::vector<std::string> send_var = {in_var_name};
std::vector<std::string> recv_var = {out_var_name};
LOG(INFO) << "before SendAndRecvAsync";
rpc_client->SendAndRecvAsync({endpoint}, ctx, scope, in_var_name, send_var,
recv_var);
auto var = scope.Var(out_var_name);
auto value = var->GetMutable<framework::LoDTensor>();
auto ptr = value->mutable_data<float>(place);
LOG(INFO) << "before CHECK";
for (int64_t i = 0; i < rows_numel; ++i) {
LOG(INFO) << "ptr " << i << " is " << ptr[i];
EXPECT_EQ(ptr[i], 0.5);
}
LOG(INFO) << "end CHECK";
rpc_client->Stop();
LOG(INFO) << "end server Stop";
server_thread.join();
LOG(INFO) << "end server thread join";
}
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdlib.h>
#include <unistd.h>
#include <chrono> // NOLINT
#include <memory>
#include <string>
#include <thread> // NOLINT
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/distributed/service/brpc_utils.h"
#include "paddle/fluid/distributed/service/heter_client.h"
#include "paddle/fluid/distributed/service/heter_server.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace distributed = paddle::distributed;
using MultiVarMsg = ::paddle::MultiVariableMessage;
using VarMsg = ::paddle::VariableMessage;
USE_OP(scale);
std::shared_ptr<distributed::HeterServer> b_rpc_service;
framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) {
auto root_block = program->MutableBlock(0);
auto* block = program->AppendBlock(*root_block);
framework::OpDesc* op = block->AppendOp();
op->SetType("scale");
op->SetInput("X", {"x"});
op->SetOutput("Out", {"res"});
op->SetAttr("scale", 0.5f);
auto& out = *root_block->Var("res");
out.SetType(framework::proto::VarType::LOD_TENSOR);
out.SetShape({1, 10});
return block;
}
void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
auto w_var = scope->Var("w");
w_var->GetMutable<framework::SelectedRows>();
auto out_var = scope->Var("out");
out_var->GetMutable<framework::LoDTensor>();
auto ids_var = scope->Var("ids");
ids_var->GetMutable<framework::LoDTensor>();
auto x_var = scope->Var("x");
x_var->GetMutable<framework::LoDTensor>();
auto res_var = scope->Var("res");
res_var->GetMutable<framework::LoDTensor>();
}
void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
int64_t rows_numel) {
CreateVarsOnScope(scope, place);
auto ids_var = scope->Var("ids")->GetMutable<framework::LoDTensor>();
int64_t* ids_ptr =
ids_var->mutable_data<int64_t>(framework::DDim({rows_numel, 1}), *place);
for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2;
auto x_var = scope->Var("x")->GetMutable<framework::LoDTensor>();
float* x_ptr =
x_var->mutable_data<float>(framework::DDim({1, rows_numel}), *place);
for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0;
auto res_var = scope->Var("res")->GetMutable<framework::LoDTensor>();
float* res_ptr =
res_var->mutable_data<float>(framework::DDim({1, rows_numel}), *place);
for (int64_t i = 0; i < rows_numel; ++i) res_ptr[i] = 1.0;
}
void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
int64_t rows_numel) {
CreateVarsOnScope(scope, place);
auto w = scope->Var("w")->GetMutable<framework::SelectedRows>();
auto w_value = w->mutable_value();
w_value->Resize({rows_numel, 10});
for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true);
auto ptr = w_value->mutable_data<float>(*place);
for (int64_t i = 0; i < w_value->numel(); ++i) {
ptr[i] = static_cast<float>(i / 10);
}
}
void RunServer(std::shared_ptr<paddle::distributed::HeterServer> service) {
service->StartHeterService();
}
void StartSendAndRecvServer(std::string endpoint) {
framework::ProgramDesc program;
framework::Scope scope;
platform::CPUPlace place;
framework::Executor exe(place);
platform::CPUDeviceContext ctx(place);
LOG(INFO) << "before AppendSendAndRecvBlock";
auto block = AppendSendAndRecvBlock(&program);
std::string in_var_name("x");
std::vector<int> prefetch_block_ids{block->ID()};
auto prepared = exe.Prepare(program, prefetch_block_ids);
LOG(INFO) << "before InitTensorsOnServer";
InitTensorsOnServer(&scope, &place, 10);
LOG(INFO) << "end InitTensorsOnServer";
std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>>
message_to_prepared_ctx;
message_to_prepared_ctx[in_var_name] = prepared[0];
std::shared_ptr<distributed::RequestSendAndRecvHandler> b_req_handler;
b_req_handler.reset(new distributed::RequestSendAndRecvHandler());
LOG(INFO) << "before SetProgram";
b_req_handler->SetProgram(&program);
LOG(INFO) << "before SetGradToPreparedCtx";
b_req_handler->SetGradToPreparedCtx(&message_to_prepared_ctx);
LOG(INFO) << "before SetDevCtx";
b_req_handler->SetDevCtx(&ctx);
LOG(INFO) << "before SetScope";
b_req_handler->SetScope(&scope);
LOG(INFO) << "before SetExecutor";
b_req_handler->SetExecutor(&exe);
LOG(INFO) << "before HeterServer::GetInstance";
b_rpc_service = distributed::HeterServer::GetInstance();
b_rpc_service->SetEndPoint(endpoint);
LOG(INFO) << "before HeterServer::RegisterServiceHandler";
b_rpc_service->RegisterServiceHandler(
in_var_name, [&](const MultiVarMsg* request, MultiVarMsg* response,
brpc::Controller* cntl) -> int {
return b_req_handler->Handle(request, response, cntl);
});
LOG(INFO) << "before HeterServer::RunServer";
std::thread server_thread(std::bind(RunServer, b_rpc_service));
server_thread.join();
}
TEST(SENDANDRECV, CPU) {
setenv("http_proxy", "", 1);
setenv("https_proxy", "", 1);
std::string endpoint = "127.0.0.1:4444";
LOG(INFO) << "before StartSendAndRecvServer";
b_rpc_service = distributed::HeterServer::GetInstance();
std::thread server_thread(StartSendAndRecvServer, endpoint);
b_rpc_service->WaitServerReady();
LOG(INFO) << "before HeterClient::GetInstance";
distributed::HeterClient* rpc_client =
distributed::HeterClient::GetInstance({endpoint}, 0).get();
PADDLE_ENFORCE_NE(rpc_client, nullptr,
platform::errors::InvalidArgument(
"Client Start Fail, Check Your Code & Env"));
framework::Scope scope;
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
// create var on local scope
int64_t rows_numel = 10;
LOG(INFO) << "before InitTensorsOnClient";
InitTensorsOnClient(&scope, &place, rows_numel);
std::string in_var_name("x");
std::string out_var_name("res");
std::vector<std::string> send_var = {in_var_name};
std::vector<std::string> recv_var = {out_var_name};
LOG(INFO) << "before SendAndRecvAsync";
rpc_client->SendAndRecvAsync({endpoint}, ctx, scope, in_var_name, send_var,
recv_var);
auto var = scope.Var(out_var_name);
auto value = var->GetMutable<framework::LoDTensor>();
auto ptr = value->mutable_data<float>(place);
LOG(INFO) << "before CHECK";
for (int64_t i = 0; i < rows_numel; ++i) {
LOG(INFO) << "ptr " << i << " is " << ptr[i];
EXPECT_EQ(ptr[i], 0.5);
}
LOG(INFO) << "end CHECK";
rpc_client->FinalizeWorker();
// b_rpc_service->Stop();
b_rpc_service->Stop();
LOG(INFO) << "end server Stop";
server_thread.join();
LOG(INFO) << "end server thread join";
}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
constexpr char kLRDecayBlockId[] = "lr_decay_block_id";
constexpr char kCheckpointBlockId[] = "checkpint_block_id";
constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
constexpr char kOptimizeBlocks[] = "optimize_blocks";
constexpr char kSparseGradToParam[] = "sparse_grad_to_param";
namespace paddle {
namespace framework {
class InferShapeContext;
class OpDesc;
class Scope;
template <typename T>
class EmptyGradOpMaker;
} // namespace framework
namespace imperative {
class OpBase;
} // namespace imperative
} // namespace paddle
namespace paddle {
namespace operators {
class ListenAndServOp : public framework::OperatorBase {
public:
ListenAndServOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
VLOG(1) << "just for recorder";
}
};
class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "(Tensor) Variables that server recv.").AsDuplicable();
AddComment(R"DOC(" + "ListenAndServ operator" + "\n" + "This operator" +
" will start a RPC server which can receive variables from send_op and send" +
"back variables to recv_op.)DOC");
AddAttr<std::string>("endpoint",
"(string, default 127.0.0.1:6164)"
"IP address to listen on.")
.SetDefault("127.0.0.1:6164")
.AddCustomChecker([](const std::string& ip) { return !ip.empty(); });
AddAttr<int>("pserver_id",
"(int, default -1), the parameter server index id")
.SetDefault(-1);
AddAttr<std::vector<std::string>>(
"grad_to_block_id",
"['param1@GRAD.block0:1', 'param2@GRAD.blockn:2'] "
"a map from grad name to it's optimize block id")
.SetDefault({});
AddAttr<int>("distributed_mode",
"indicate distriubte training mode, 0 is sync, 1 is "
"fully-async, 2 is half-async, 3 is geo")
.SetDefault(0);
AddAttr<bool>("dc_asgd", "set to true will enable DC-ASGD training.")
.SetDefault(false);
AddAttr<std::vector<framework::BlockDesc*>>(
kOptimizeBlocks, "Optimize blocks to run on server side.")
.SetDefault({});
AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId,
"prefetch blocks to run on server side.")
.SetDefault({});
AddAttr<std::vector<std::string>>(
kSparseGradToParam,
"sparse grad name to param name. like: 'emb@Grad:emb'")
.SetDefault({});
AddAttr<int>("Fanin", "How many clients send to this server.")
.SetDefault(1);
AddAttr<int>(kCheckpointBlockId,
"BolckID to run save checkpoint on pserer.")
.SetDefault(-1);
AddAttr<int>(kLRDecayBlockId, "BolckID to run lr decay on pserer.")
.SetDefault(-1);
AddAttr<int>("rpc_get_thread_num", "pserver get thread num.").SetDefault(1);
AddAttr<int>("rpc_send_thread_num", "pserver send thread num.")
.SetDefault(1);
AddAttr<int>("rpc_prefetch_thread_num", "pserver prefetch thread num.")
.SetDefault(1);
}
};
class ListenAndServOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
listen_and_serv, ops::ListenAndServOp,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
ops::ListenAndServOpMaker, ops::ListenAndServOpShapeInference);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <future> // NOLINT
#include <ostream>
#include "paddle/fluid/distributed/service/heter_client.h"
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class SendAndRecvKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& scope = ctx.scope();
const auto& place = ctx.GetPlace();
auto message_name = ctx.Attr<std::string>("message_name");
auto send_var_name = ctx.Attr<std::vector<std::string>>("send_var_name");
auto recv_var_name = ctx.Attr<std::vector<std::string>>("recv_var_name");
auto epmap = ctx.Attr<std::vector<std::string>>("endpoints");
auto trainer_id = ctx.Attr<int>("trainer_id");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& context = *pool.Get(place);
distributed::HeterClient* rpc_client =
distributed::HeterClient::GetInstance(epmap, trainer_id).get();
VLOG(3) << "SendAndRecvOp message_name: " << message_name;
rpc_client->SendAndRecvAsync(epmap, context, scope, message_name,
send_var_name, recv_var_name);
}
};
class SendAndRecvOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
return framework::OpKernelType(data_type, platform::CPUPlace());
}
};
class SendAndRecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "Tensor Input variable to be sent").AsDuplicable();
AddOutput("Out", "Tensor Output varibale to be recv").AsDuplicable();
AddAttr<std::string>("message_name", "");
AddAttr<std::vector<std::string>>("send_var_name", "Send Tensor's name");
AddAttr<std::vector<std::string>>("recv_var_name", "Recv Tensor's name");
AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
AddAttr<std::vector<std::string>>("endpoints", "Server endpoint")
.SetDefault({"127.0.0.1:6164"});
AddComment(R"DOC(
SendAndRecv operator
This operator will send variables to listen_and_serve op at the parameter server.
And recv variable from parameter server of send variable's scope.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(send_and_recv, ops::SendAndRecvOp, ops::SendAndRecvOpMaker);
REGISTER_OP_CPU_KERNEL(
send_and_recv,
ops::SendAndRecvKernel<paddle::platform::CPUDeviceContext, float>)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/service/communicator.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace framework {
class InferShapeContext;
class OpDesc;
class Scope;
template <typename T>
class EmptyGradOpMaker;
} // namespace framework
namespace imperative {
class OpBase;
} // namespace imperative
namespace distributed {
class Communicator;
} // namespace distributed
} // namespace paddle
namespace paddle {
namespace operators {
class SendBarrierOp : public framework::OperatorBase {
public:
SendBarrierOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
paddle::distributed::Communicator::GetInstance()->Barrier();
}
};
class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "(Any) Dummy inputs, used for control dependency")
.AsDuplicable();
AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
.AsDuplicable();
AddComment(R"DOC(
SendBarrier operator
This operator will send a send barrier signal to list_and_serv op, so that
the Parameter Server would knew all variables have been sent.
)DOC");
AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
AddAttr<std::vector<std::string>>("endpoints",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints to send variables to.")
.SetDefault({"127.0.0.1:6164"});
AddAttr<bool>(
"half_async",
"(bool, default false)"
"half_async=True is for half_async mode, this will send signal "
"to HalfAsyncCommunicator Instance")
.SetDefault(false);
}
};
class SendBarrierOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
send_barrier, ops::SendBarrierOp,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
ops::SendBarrierOpMaker, ops::SendBarrierOpShapeInference);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/fleet.h"
#include "paddle/fluid/distributed/service/communicator.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace framework {
class InferShapeContext;
class OpDesc;
class Scope;
template <typename T>
class EmptyGradOpMaker;
} // namespace framework
namespace imperative {
class OpBase;
} // namespace imperative
} // namespace paddle
namespace paddle {
namespace operators {
namespace distributed {
class RPCClient;
} // namespace distributed
class SendOp : public framework::OperatorBase {
public:
SendOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
auto ins = Inputs("X");
// auto is_sparse = Attr<int>("is_sparse");
// auto table_id = Attr<int>("table_id");
auto send_varnames = Attr<std::vector<std::string>>("send_varnames");
auto* communicator = paddle::distributed::Communicator::GetInstance();
communicator->Check(send_varnames);
communicator->Send(ins, scope);
// auto fleet = paddle::distributed::FleetWrapper::GetInstance();
// if (is_sparse == 0) {
// std::vector<::std::future<int32_t>> status;
// fleet->PushDenseVarsAsync(scope, table_id, send_varnames, &status, 0,
// -1);
// } else {
// std::vector<::std::future<int32_t>> status;
// fleet->PushSparseVarsAsync(scope, table_id, send_varnames[0], &status);
// }
}
};
class SendOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
.AsDuplicable();
AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
.AsDuplicable();
AddComment(R"DOC(
Send operator
This operator will send variables to listen_and_serve op at the parameter server.
)DOC");
AddAttr<int>("table_id", "table_id for send").SetDefault(0);
AddAttr<int>("is_sparse",
"(int, default 0->Dense, 1->Sparse, 2->Distributed)")
.SetDefault(0);
AddAttr<std::vector<std::string>>(
"send_varnames",
"(vector<string>) "
"the split output varnames to send to pserver")
.SetDefault(std::vector<std::string>{});
}
};
class SendOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
send, ops::SendOp,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
ops::SendOpMaker, ops::SendOpShapeInference);
......@@ -20,10 +20,6 @@ if(WITH_PYTHON)
list(APPEND PYBIND_DEPS py_func_op)
endif()
if (WITH_DISTRIBUTE)
list(APPEND PYBIND_DEPS communicator)
endif()
set(PYBIND_SRCS
pybind.cc
exception.cc
......@@ -54,7 +50,10 @@ if (WITH_CRYPTO)
endif (WITH_CRYPTO)
if (WITH_DISTRIBUTE)
list(APPEND PYBIND_SRCS communicator_py.cc)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result")
set_source_files_properties(fleet_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
list(APPEND PYBIND_DEPS fleet communicator)
list(APPEND PYBIND_SRCS fleet_py.cc)
endif()
if (WITH_NCCL)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fcntl.h>
#ifdef _POSIX_C_SOURCE
#undef _POSIX_C_SOURCE
#endif
#ifdef _XOPEN_SOURCE
#undef _XOPEN_SOURCE
#endif
#include "paddle/fluid/pybind/fleet_py.h"
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/communicator_common.h"
#include "paddle/fluid/distributed/fleet.h"
#include "paddle/fluid/distributed/service/communicator.h"
#include "paddle/fluid/distributed/service/env.h"
#include "paddle/fluid/distributed/service/heter_client.h"
namespace py = pybind11;
using paddle::distributed::CommContext;
using paddle::distributed::Communicator;
using paddle::distributed::FleetWrapper;
using paddle::distributed::HeterClient;
namespace paddle {
namespace pybind {
void BindDistFleetWrapper(py::module* m) {
py::class_<FleetWrapper, std::shared_ptr<FleetWrapper>>(*m,
"DistFleetWrapper")
.def(py::init([]() { return FleetWrapper::GetInstance(); }))
.def("load_sparse", &FleetWrapper::LoadSparseOnServer)
.def("init_server", &FleetWrapper::InitServer)
.def("run_server",
(uint64_t (FleetWrapper::*)(void)) & FleetWrapper::RunServer)
.def("run_server", (uint64_t (FleetWrapper::*)( // NOLINT
const std::string&, uint32_t)) & // NOLINT
FleetWrapper::RunServer)
.def("init_worker", &FleetWrapper::InitWorker)
.def("push_dense_params", &FleetWrapper::PushDenseParamSync)
.def("pull_dense_params", &FleetWrapper::PullDenseVarsSync)
.def("save_all_model", &FleetWrapper::SaveModel)
.def("save_one_model", &FleetWrapper::SaveModelOneTable)
.def("sparse_table_stat", &FleetWrapper::PrintTableStat)
.def("stop_server", &FleetWrapper::StopServer)
.def("stop_worker", &FleetWrapper::FinalizeWorker)
.def("barrier", &FleetWrapper::BarrierWithTable);
} // end BindDistFleetWrapper
void BindPSHost(py::module* m) {
py::class_<distributed::PSHost>(*m, "PSHost")
.def(py::init<const std::string&, uint32_t, uint32_t>())
.def("serialize_to_string", &distributed::PSHost::serialize_to_string)
.def("parse_from_string", &distributed::PSHost::parse_from_string)
.def("to_uint64", &distributed::PSHost::serialize_to_uint64)
.def("from_uint64", &distributed::PSHost::parse_from_uint64)
.def("to_string", &distributed::PSHost::to_string);
}
void BindCommunicatorContext(py::module* m) {
py::class_<CommContext>(*m, "CommContext")
.def(
py::init<const std::string&, const std::vector<std::string>&,
const std::vector<std::string>&, const std::vector<int64_t>&,
const std::vector<std::string>&, int, bool, bool, bool,
int>())
.def("var_name", [](const CommContext& self) { return self.var_name; })
.def("trainer_id",
[](const CommContext& self) { return self.trainer_id; })
.def("table_id", [](const CommContext& self) { return self.table_id; })
.def("split_varnames",
[](const CommContext& self) { return self.splited_varnames; })
.def("split_endpoints",
[](const CommContext& self) { return self.epmap; })
.def("sections",
[](const CommContext& self) { return self.height_sections; })
.def("aggregate", [](const CommContext& self) { return self.merge_add; })
.def("is_sparse", [](const CommContext& self) { return self.is_sparse; })
.def("is_distributed",
[](const CommContext& self) { return self.is_distributed; })
.def("origin_varnames",
[](const CommContext& self) { return self.origin_varnames; })
.def("__str__", [](const CommContext& self) { return self.print(); });
}
using paddle::distributed::AsyncCommunicator;
using paddle::distributed::GeoCommunicator;
using paddle::distributed::RecvCtxMap;
using paddle::distributed::RpcCtxMap;
using paddle::distributed::SyncCommunicator;
using paddle::framework::Scope;
void BindDistCommunicator(py::module* m) {
// Communicator is already used by nccl, change to DistCommunicator
py::class_<Communicator, std::shared_ptr<Communicator>>(*m,
"DistCommunicator")
.def(py::init([](const std::string& mode, const std::string& dist_desc,
const std::vector<std::string>& host_sign_list,
const RpcCtxMap& send_ctx, const RecvCtxMap& recv_ctx,
Scope* param_scope,
std::map<std::string, std::string>& envs) {
if (mode == "ASYNC") {
Communicator::InitInstance<AsyncCommunicator>(
send_ctx, recv_ctx, dist_desc, host_sign_list, param_scope, envs);
} else if (mode == "SYNC") {
Communicator::InitInstance<SyncCommunicator>(
send_ctx, recv_ctx, dist_desc, host_sign_list, param_scope, envs);
} else if (mode == "GEO") {
Communicator::InitInstance<GeoCommunicator>(
send_ctx, recv_ctx, dist_desc, host_sign_list, param_scope, envs);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"unsuported communicator MODE"));
}
return Communicator::GetInstantcePtr();
}))
.def("stop", &Communicator::Stop)
.def("start", &Communicator::Start)
.def("push_sparse_param", &Communicator::RpcSendSparseParam)
.def("is_running", &Communicator::IsRunning)
.def("init_params", &Communicator::InitParams);
// .def("recv", &Communicator::RecvNoBarrier);
}
void BindHeterClient(py::module* m) {
py::class_<HeterClient, std::shared_ptr<HeterClient>>(*m, "HeterClient")
.def(py::init(
[](const std::vector<std::string>& endpoint, const int& trainer_id) {
return HeterClient::GetInstance(endpoint, trainer_id);
}))
.def("stop", &HeterClient::Stop);
}
} // end namespace pybind
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
void BindDistFleetWrapper(py::module* m);
void BindPSHost(py::module* m);
void BindCommunicatorContext(py::module* m);
void BindDistCommunicator(py::module* m);
void BindHeterClient(py::module* m);
} // namespace pybind
} // namespace paddle
......@@ -103,14 +103,14 @@ limitations under the License. */
#include "paddle/fluid/platform/xpu_info.h"
#endif
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/pybind/communicator_py.h"
#endif
#ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h"
#endif
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/pybind/fleet_py.h"
#endif
#include "pybind11/stl.h"
DECLARE_bool(use_mkldnn);
......@@ -2837,10 +2837,13 @@ All parameter, weight, gradient are variables in Paddle.
#ifdef PADDLE_WITH_CRYPTO
BindCrypto(&m);
#endif
#ifdef PADDLE_WITH_DISTRIBUTE
BindCommunicator(&m);
BindDistFleetWrapper(&m);
BindPSHost(&m);
BindCommunicatorContext(&m);
BindLargeScaleKV(&m);
BindDistCommunicator(&m);
BindHeterClient(&m);
#endif
}
} // namespace pybind
......
......@@ -212,7 +212,7 @@ function cmake_base() {
fi
if [ "$SYSTEM" == "Darwin" ]; then
WITH_DISTRIBUTE=${WITH_DISTRIBUTE:-ON}
WITH_DISTRIBUTE="OFF"
WITH_AVX=${WITH_AVX:-ON}
INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-~/.cache/inference_demo}
else
......@@ -220,13 +220,8 @@ function cmake_base() {
fi
distibuted_flag=${WITH_DISTRIBUTE:-OFF}
grpc_flag=${WITH_GRPC:-${distibuted_flag}}
if [ "$SYSTEM" == "Darwin" ]; then
gloo_flag="OFF"
else
grpc_flag="OFF"
gloo_flag=${distibuted_flag}
fi
cat <<EOF
========================================
......
......@@ -13,6 +13,7 @@
# limitations under the License.
from ..runtime.collective_runtime import CollectiveRuntime
from ..runtime.parameter_server_runtime import ParameterServerRuntime
from ..runtime.the_one_ps import TheOnePSRuntime
class RuntimeFactory(object):
......@@ -26,7 +27,8 @@ class RuntimeFactory(object):
return collective_runtime
k_steps = context["valid_strategy"].a_sync_configs["k_steps"]
if not context["role_maker"]._is_collective and k_steps >= 0:
ps_runtime = ParameterServerRuntime()
ps_runtime = TheOnePSRuntime()
ps_runtime._set_basic_info(context)
return ps_runtime
......@@ -72,7 +72,6 @@ class ParameterServerOptimizer(MetaOptimizerBase):
# for startup program
_startup = worker.fake_init_ops_pass(_startup, compiled_config)
_startup = worker.init_from_server_pass(_startup, compiled_config)
_startup = worker.delet_extra_optimizes_pass(_startup,
compiled_config)
......@@ -106,19 +105,37 @@ class ParameterServerOptimizer(MetaOptimizerBase):
wait_server_ready(self.role_maker._get_pserver_endpoints())
# for ps-heter mode, wait heter worker ready
if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
):
wait_server_ready(self.role_maker._get_heter_worker_endpoints())
# if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker(
# ):
# wait_server_ready(self.role_maker._get_heter_worker_endpoints())
return _main, _startup
def _build_pserver_programs(self, compiled_config):
from paddle.fluid.incubate.fleet.parameter_server.ir import pserver_pass as server
_main = fluid.Program()
_startup = fluid.Program()
from paddle.fluid.incubate.fleet.parameter_server.ir import pserver_pass as server
if not compiled_config.is_geo_mode():
from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_optimize_ops
is_sgd_adam = False
main_program = compiled_config.get_origin_main_program()
ops = _get_optimize_ops(main_program)
if len(ops) == 0:
return _main, _startup
for op in ops:
if op.type in ["sgd", "adam"]:
is_sgd_adam = True
break
if is_sgd_adam:
return _main, _startup
_main = server.add_listen_and_serv_pass(_main, compiled_config)
_main = server.add_rpc_global_flags_pass(_main, compiled_config)
_main = server.add_optimizer_pass(_main, compiled_config)
......@@ -139,12 +156,8 @@ class ParameterServerOptimizer(MetaOptimizerBase):
_main = server.add_listen_and_serv_pass(_main, compiled_config)
_main = server.add_rpc_global_flags_pass(_main, compiled_config)
_main = server.add_geo_optimizer_pass(_main, compiled_config)
_main = server.large_scale_sparse_pass(_main, _main,
compiled_config, False)
_startup = server.build_pserver_startup_program_pass(
_startup, _main, compiled_config)
_startup = server.large_scale_sparse_pass(_startup, _main,
compiled_config, True)
_startup = server.delete_unused_in_startup_pass(_startup, _main,
compiled_config)
......
......@@ -17,10 +17,10 @@ import paddle.fluid as fluid
import math
import numpy as np
from paddle.fluid.framework import Variable
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
import paddle.distributed.fleet as fleet
def sum(input, scope=None):
def sum(input, scope=None, util=None):
"""
distributed sum in fleet
......@@ -45,21 +45,22 @@ def sum(input, scope=None):
res = np.array(scope.find_var(global_cnt.name).get_tensor())
print("sum array: ", paddle.distributed.fleet.sum(res))
"""
fleet._role_maker._barrier_worker()
if scope is None:
scope = fluid.global_scope()
if util is None:
util = fleet.util
if isinstance(input, Variable):
input = np.array(scope.find_var(input.name).get_tensor())
elif isinstance(input, str):
input = np.array(scope.find_var(input).get_tensor())
old_shape = np.array(input.shape)
output = np.copy(input) * 0
fleet._role_maker._all_reduce(input, output, mode="sum")
output = util.all_reduce(input, "sum")
output = output.reshape(old_shape)
return output
def max(input, scope=None):
def max(input, scope=None, util=None):
"""
distributed max in fleet
......@@ -84,21 +85,22 @@ def max(input, scope=None):
res = np.array(scope.find_var(global_cnt.name).get_tensor())
print("max array: ", paddle.distributed.fleet.max(res))
"""
fleet._role_maker._barrier_worker()
if scope is None:
scope = fluid.global_scope()
if util is None:
util = fleet.util
if isinstance(input, Variable):
input = np.array(scope.find_var(input.name).get_tensor())
elif isinstance(input, str):
input = np.array(scope.find_var(input).get_tensor())
old_shape = np.array(input.shape)
output = np.copy(input) * 0
fleet._role_maker._all_reduce(input, output, mode="max")
output = util.all_reduce(input, "max")
output = output.reshape(old_shape)
return output
def min(input, scope=None):
def min(input, scope=None, util=None):
"""
distributed min in fleet
......@@ -123,21 +125,22 @@ def min(input, scope=None):
res = np.array(scope.find_var(global_cnt.name).get_tensor())
print("min array: ", paddle.distributed.fleet.min(res))
"""
fleet._role_maker._barrier_worker()
if scope is None:
scope = fluid.global_scope()
if util is None:
util = fleet.util
if isinstance(input, Variable):
input = np.array(scope.find_var(input.name).get_tensor())
elif isinstance(input, str):
input = np.array(scope.find_var(input).get_tensor())
old_shape = np.array(input.shape)
output = np.copy(input) * 0
fleet._role_maker._all_reduce(input, output, mode="min")
output = util.all_reduce(input, "min")
output = output.reshape(old_shape)
return output
def auc(stat_pos, stat_neg, scope=None):
def auc(stat_pos, stat_neg, scope=None, util=None):
"""
distributed auc in fleet
......@@ -164,9 +167,11 @@ def auc(stat_pos, stat_neg, scope=None):
neg = np.array(scope.find_var(stat_neg.name).get_tensor())
print("auc: ", paddle.distributed.fleet.auc(pos, neg))
"""
fleet._role_maker._barrier_worker()
if scope is None:
scope = fluid.global_scope()
if util is None:
util = fleet.util
if isinstance(stat_pos, Variable):
stat_pos = np.array(scope.find_var(stat_pos.name).get_tensor())
elif isinstance(stat_pos, str):
......@@ -181,15 +186,14 @@ def auc(stat_pos, stat_neg, scope=None):
stat_pos = stat_pos.reshape(-1)
global_pos = np.copy(stat_pos) * 0
# mpi allreduce
fleet._role_maker._all_reduce(stat_pos, global_pos)
# reshape to its original shape
global_pos = util.all_reduce(stat_pos, "sum")
global_pos = global_pos.reshape(old_pos_shape)
# auc neg bucket
old_neg_shape = np.array(stat_neg.shape)
stat_neg = stat_neg.reshape(-1)
global_neg = np.copy(stat_neg) * 0
fleet._role_maker._all_reduce(stat_neg, global_neg)
global_neg = util.all_reduce(stat_neg, "sum")
global_neg = global_neg.reshape(old_neg_shape)
# calculate auc
......@@ -216,11 +220,10 @@ def auc(stat_pos, stat_neg, scope=None):
else:
auc_value = area / (pos * neg)
fleet._role_maker._barrier_worker()
return auc_value
def mae(abserr, total_ins_num, scope=None):
def mae(abserr, total_ins_num, scope=None, util=None):
"""
distributed mae in fleet
......@@ -242,23 +245,28 @@ def mae(abserr, total_ins_num, scope=None):
res = np.array(scope.find_var(abserr.name).get_tensor())
print("mae: ", paddle.distributed.fleet.mae(res, total_ins_num))
"""
fleet._role_maker._barrier_worker()
if scope is None:
scope = fluid.global_scope()
if util is None:
util = fleet.util
if isinstance(abserr, Variable):
abserr = np.array(scope.find_var(abserr.name).get_tensor())
elif isinstance(abserr, str):
abserr = np.array(scope.find_var(abserr).get_tensor())
old_metric_shape = np.array(abserr.shape)
abserr = abserr.reshape(-1)
global_metric = np.copy(abserr) * 0
fleet._role_maker._all_reduce(abserr, global_metric)
global_metric = util.all_reduce(abserr, "sum")
global_metric = global_metric.reshape(old_metric_shape)
mae_value = global_metric[0] / total_ins_num
return mae_value
def rmse(sqrerr, total_ins_num, scope=None):
def rmse(sqrerr, total_ins_num, scope=None, util=None):
"""
distributed rmse in fleet
......@@ -280,9 +288,11 @@ def rmse(sqrerr, total_ins_num, scope=None):
res = np.array(scope.find_var(sqrerr.name).get_tensor())
print("rmse: ", paddle.distributed.fleet.rmse(res, total_ins_num))
"""
fleet._role_maker._barrier_worker()
if scope is None:
scope = fluid.global_scope()
if util is None:
util = fleet.util
if isinstance(sqrerr, Variable):
sqrerr = np.array(scope.find_var(sqrerr.name).get_tensor())
elif isinstance(sqrerr, str):
......@@ -290,13 +300,15 @@ def rmse(sqrerr, total_ins_num, scope=None):
old_metric_shape = np.array(sqrerr.shape)
sqrerr = sqrerr.reshape(-1)
global_metric = np.copy(sqrerr) * 0
fleet._role_maker._all_reduce(sqrerr, global_metric)
global_metric = util.all_reduce(sqrerr, "sum")
global_metric = global_metric.reshape(old_metric_shape)
rmse_value = math.sqrt(global_metric[0] / total_ins_num)
return rmse_value
def mse(sqrerr, total_ins_num, scope=None):
def mse(sqrerr, total_ins_num, scope=None, util=None):
"""
distributed mse in fleet
......@@ -318,9 +330,11 @@ def mse(sqrerr, total_ins_num, scope=None):
metric = np.array(scope.find_var(sqrerr.name).get_tensor())
print("mse: ", paddle.distributed.fleet.mse(metric, total_ins_num))
"""
fleet._role_maker._barrier_worker()
if scope is None:
scope = fluid.global_scope()
if util is None:
util = fleet.util
if isinstance(sqrerr, Variable):
sqrerr = np.array(scope.find_var(sqrerr.name).get_tensor())
elif isinstance(sqrerr, str):
......@@ -328,13 +342,15 @@ def mse(sqrerr, total_ins_num, scope=None):
old_metric_shape = np.array(sqrerr.shape)
sqrerr = sqrerr.reshape(-1)
global_metric = np.copy(sqrerr) * 0
fleet._role_maker._all_reduce(sqrerr, global_metric)
global_metric = util.all_reduce(sqrerr, "sum")
global_metric = global_metric.reshape(old_metric_shape)
mse_value = global_metric[0] / total_ins_num
return mse_value
def acc(correct, total, scope=None):
def acc(correct, total, scope=None, util=None):
"""
distributed accuracy in fleet
......@@ -367,9 +383,11 @@ def acc(correct, total, scope=None):
total_num = np.array(scope.find_var(total.name).get_tensor())
print("accuracy: ", paddle.distributed.fleet.acc(correct_num, total_num))
"""
fleet._role_maker._barrier_worker()
if scope is None:
scope = fluid.global_scope()
if util is None:
util = fleet.util
if isinstance(correct, Variable):
correct = np.array(scope.find_var(correct.name).get_tensor())
elif isinstance(correct, str):
......@@ -378,8 +396,11 @@ def acc(correct, total, scope=None):
total = np.array(scope.find_var(total.name).get_tensor())
elif isinstance(total, str):
total = np.array(scope.find_var(total).get_tensor())
global_correct_num = np.copy(correct) * 0
global_total_num = np.copy(total) * 0
fleet._role_maker._all_reduce(correct, global_correct_num)
fleet._role_maker._all_reduce(total, global_total_num)
global_correct_num = util.all_reduce(correct, "sum")
global_total_num = util.all_reduce(total, "sum")
return float(global_correct_num[0]) / float(global_total_num[0])
......@@ -14,3 +14,4 @@
from .collective_runtime import CollectiveRuntime
from .parameter_server_runtime import ParameterServerRuntime
from .the_one_ps import TheOnePSRuntime
此差异已折叠。
......@@ -13,3 +13,4 @@
# limitations under the License.
from .fs import LocalFS, HDFSClient
from .ps_util import Distributed
此差异已折叠。
......@@ -216,25 +216,6 @@ def __bootstrap__():
read_env_flags.append('tracer_mkldnn_ops_on')
read_env_flags.append('tracer_mkldnn_ops_off')
if core.is_compiled_with_dist():
#env for rpc
read_env_flags.append('rpc_deadline')
read_env_flags.append('rpc_retry_times')
read_env_flags.append('rpc_server_profile_path')
read_env_flags.append('enable_rpc_profiler')
read_env_flags.append('rpc_send_thread_num')
read_env_flags.append('rpc_get_thread_num')
read_env_flags.append('rpc_prefetch_thread_num')
read_env_flags.append('rpc_disable_reuse_port')
read_env_flags.append('rpc_retry_bind_port')
read_env_flags.append('worker_update_interval_secs')
if core.is_compiled_with_brpc():
read_env_flags.append('max_body_size')
#set brpc max body size
os.environ['FLAGS_max_body_size'] = "2147483647"
if core.is_compiled_with_cuda():
read_env_flags += [
'fraction_of_gpu_memory_to_use',
......
此差异已折叠。
此差异已折叠。
......@@ -1365,7 +1365,8 @@ class Variable(object):
if self.type == core.VarDesc.VarType.SELECTED_ROWS or self.type == core.VarDesc.VarType.LOD_TENSOR:
dtype_str = str(self.dtype).split('.')[1]
var_str = "{name} : {type}.shape{shape}.dtype({dtype}).stop_gradient({stop_gradient})".\
format(name=self.name, type=type_str, shape=self.shape, dtype=dtype_str, stop_gradient=self.stop_gradient)
format(name=self.name, type=type_str, shape=self.shape,
dtype=dtype_str, stop_gradient=self.stop_gradient)
else:
var_str = "{name} : {type})".\
format(name=self.name, type=type_str)
......@@ -2013,7 +2014,8 @@ class Operator(object):
'conditional_block', 'while', 'send', 'recv', 'listen_and_serv',
'fl_listen_and_serv', 'ncclInit', 'select', 'checkpoint_notify',
'gen_nccl_id', 'c_gen_nccl_id', 'c_comm_init', 'c_sync_calc_stream',
'c_sync_comm_stream', 'queue_generator', 'dequeue', 'enqueue'
'c_sync_comm_stream', 'queue_generator', 'dequeue', 'enqueue',
'heter_listen_and_serv'
}
def __init__(self,
......@@ -2284,7 +2286,8 @@ class Operator(object):
if outputs_str != "{}":
op_str = "{outputs} = {op_type}(inputs={inputs}, {attrs})".\
format(outputs = outputs_str, op_type=self.type, inputs=inputs_str, attrs=attrs_str)
format(outputs=outputs_str, op_type=self.type,
inputs=inputs_str, attrs=attrs_str)
else:
op_str = "{op_type}(inputs={inputs}, {attrs})".\
format(op_type=self.type, inputs=inputs_str, attrs=attrs_str)
......@@ -3967,8 +3970,9 @@ class IrGraph(object):
def _convert_to_pdf(dot_file_path):
pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf'
exited_code = subprocess.call('dot -Tpdf ' + dot_file_path \
+ ' -o ' + pdf_save_path, shell=True)
exited_code = subprocess.call(
'dot -Tpdf ' + dot_file_path + ' -o ' + pdf_save_path,
shell=True)
if exited_code != 0:
print('The dot command is needed for creating pdf files.')
print('The {} is saved as the dot filetype.'.format(
......@@ -4581,7 +4585,7 @@ class Program(object):
The two code snippets above will generate and print same programs.
"""
#NOTE(zhiqiu): we sync the original program first, since its program may diff with
# NOTE(zhiqiu): we sync the original program first, since its program may diff with
# its desc due to modifying desc in c++ space. E.g. save op will add kLookupTablePath in desc.
self._sync_with_cpp()
......@@ -4611,7 +4615,7 @@ class Program(object):
if hasattr(self, 'lr_sheduler'):
p.lr_sheduler = self.lr_sheduler
#NOTE(zhiqiu): we sync the cloned program, to update its program by
# NOTE(zhiqiu): we sync the cloned program, to update its program by
# its desc.
p._sync_with_cpp()
......@@ -4656,7 +4660,7 @@ class Program(object):
Program: A new, pruned program.
"""
#NOTE(zhiqiu): we sync the original program first, since its program may diff with
# NOTE(zhiqiu): we sync the original program first, since its program may diff with
# its desc due to modifying desc in c++ space. E.g. save op will add kLookupTablePath in desc.
self._sync_with_cpp()
......
......@@ -20,6 +20,9 @@ set_property(TARGET relu_op_shared PROPERTY LINK_LIBRARIES ${TARGET_LIBRARIES}
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# for coverage
LIST(REMOVE_ITEM TEST_OPS test_custom_op)
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册