diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake
index 0eb590c42d0cb73ccb252430bc3e27312b0bddf9..582c06e88c1d45ccfef58dac6a722ee40645e9a3 100644
--- a/cmake/external/brpc.cmake
+++ b/cmake/external/brpc.cmake
@@ -41,7 +41,7 @@ ExternalProject_Add(
     ${EXTERNAL_PROJECT_LOG_ARGS}
     # TODO(gongwb): change to de newst repo when they changed.
     GIT_REPOSITORY "https://github.com/wangjiawei04/brpc"
-    GIT_TAG "6d79e0b17f25107c35b705ea58d888083f59ff47"
+    GIT_TAG "e203afb794caf027da0f1e0776443e7d20c0c28e"
     PREFIX ${BRPC_SOURCES_DIR}
     UPDATE_COMMAND ""
     CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/paddle/fluid/distributed/service/brpc_ps_server.cc b/paddle/fluid/distributed/service/brpc_ps_server.cc
index 8400e669182d670b892dc2eb55492a92ee919ae5..d7ff0ecd95a3bdfd0191e7a6d65822c425e6bb3c 100644
--- a/paddle/fluid/distributed/service/brpc_ps_server.cc
+++ b/paddle/fluid/distributed/service/brpc_ps_server.cc
@@ -60,7 +60,8 @@ uint64_t BrpcPsServer::start(const std::string &ip, uint32_t port) {
   std::unique_lock<std::mutex> lock(mutex_);
 
   std::string ip_port = ip + ":" + std::to_string(port);
-  VLOG(3) << "server of rank " << _rank << " starts at " << ip_port;
+  VLOG(0) << "running server with rank id: " << _rank
+          << ", endpoint: " << ip_port;
   brpc::ServerOptions options;
 
   int num_threads = std::thread::hardware_concurrency();
@@ -538,7 +539,7 @@ int32_t BrpcPsService::stop_server(Table *table,
   auto *p_server = _server;
   std::thread t_stop([p_server]() {
     p_server->stop();
-    LOG(INFO) << "Server Stoped";
+    VLOG(3) << "Server Stoped";
   });
   t_stop.detach();
   return 0;
diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc
index 096718768149c574fd57b91396879d7bec5d37e0..a356b77e73733ed9b657a7603adf57c5228bf3c5 100644
--- a/paddle/fluid/distributed/service/brpc_utils.cc
+++ b/paddle/fluid/distributed/service/brpc_utils.cc
@@ -324,7 +324,7 @@ std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port) {
 
   while (hp->h_addr_list[i] != NULL) {
     int_ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]);
-    VLOG(0) << "Brpc Get host by name, host:" << ip << " -> ip: " << int_ip;
+    VLOG(3) << "Brpc Get host by name, host:" << ip << " -> ip: " << int_ip;
     break;
   }
 
diff --git a/paddle/fluid/distributed/service/env.h b/paddle/fluid/distributed/service/env.h
index 901aba0ad90c49c7403862997830bed7e0950dc0..ca395a776afd4e2ee53e0aeaebb94494d4f4e6a6 100644
--- a/paddle/fluid/distributed/service/env.h
+++ b/paddle/fluid/distributed/service/env.h
@@ -39,7 +39,7 @@ struct PSHost {
 
   // |---ip---|---port---|--rank--|
   // |-32bit--|--20bit---|--12bit-|
-  // for pslib
+
   uint64_t serialize_to_uint64() {
     uint64_t host_label = 0;
     host_label = inet_addr(ip.c_str());
@@ -175,14 +175,12 @@ class PSEnvironment {
 
     host.ip = ip;
     host.port = port;
    host.rank = rank;
-    if (sign_set.count(rank) > 0) {
-      LOG(WARNING) << "ps-host :" << host.ip << ":" << host.port
-                   << ", rank:" << host.rank
-                   << " already register, ignore register";
-    } else {
+
+    if (sign_set.count(rank) == 0) {
       host_list.push_back(host);
       sign_set.insert(rank);
     }
+
     return 0;
   }
diff --git a/paddle/fluid/distributed/service/ps_client.cc b/paddle/fluid/distributed/service/ps_client.cc
index 095b5dee0b28e4f3319927aa2440e906489db7de..d427ecfc538a69e2011a241a3eb3b43d939a0153 100644
--- a/paddle/fluid/distributed/service/ps_client.cc
+++ b/paddle/fluid/distributed/service/ps_client.cc
@@ -78,8 +78,7 @@ PSClient *PSClientFactory::create(const PSParameter &ps_config) {
   }
 
   TableManager::instance().initialize();
-  LOG(INFO) << "Create PSClient[" << service_param.client_class()
-            << "] success";
+  VLOG(3) << "Create PSClient[" << service_param.client_class() << "] success";
   return client;
 }
 }  // namespace distributed
diff --git a/paddle/fluid/distributed/service/service.cc b/paddle/fluid/distributed/service/service.cc
index 3d0f94fac277509be7b17b648767dc835629b2b3..2759e4614e66e1d69c6427e0320ae44292757ffd 100644
--- a/paddle/fluid/distributed/service/service.cc
+++ b/paddle/fluid/distributed/service/service.cc
@@ -47,7 +47,7 @@ paddle::distributed::PSParameter load_from_prototxt(
 }
 
 void PSCore::init_gflag(const std::string& gflags) {
-  LOG(INFO) << "Init With Gflags:" << gflags;
+  VLOG(3) << "Init With Gflags:" << gflags;
   std::vector<std::string> flags = paddle::string::split_string(gflags);
   if (flags.size() < 1) {
     flags.push_back("-max_body_size=314217728");
diff --git a/paddle/fluid/distributed/table/depends/dense.h b/paddle/fluid/distributed/table/depends/dense.h
index a2acdfd20148ac282f6633e55ea450dd3367e5f2..8079003d1bf8f677aeaba91f8860504adcb853e0 100644
--- a/paddle/fluid/distributed/table/depends/dense.h
+++ b/paddle/fluid/distributed/table/depends/dense.h
@@ -89,7 +89,6 @@ class DSGD : public DenseOptimizer {
 
     auto blas = GetBlas();
     float lr = *(global_learning_rate_) * (*learning_rate);
-    VLOG(4) << "DSGD LearningRate: " << lr;
     blas.VCOPY(update_numel, update_values + begin, grads.data());
     blas.SCAL(update_numel, lr, grads.data());
     blas.VSUB(update_numel, param + begin, grads.data(), param + begin);
@@ -157,7 +156,6 @@ class DAdam : public DenseOptimizer {
     beta2_pow[0] = beta2_pow[0] * beta2;
 
     float lr_ = *(global_learning_rate_)*learning_rate[0];
-    VLOG(4) << "DAdam LearningRate: " << lr_;
     lr_ *= sqrt(1 - beta2_pow[0]) / (1 - beta1_pow[0]);
 
     float* tmp_ = tmp.data();
diff --git a/paddle/fluid/distributed/table/depends/sparse.h b/paddle/fluid/distributed/table/depends/sparse.h
index 672d6e7d396874b5cc5a296f15e3842a3233410b..0e1d7ef03c129c2dc6f72d6e56fafb143d879bd4 100644
--- a/paddle/fluid/distributed/table/depends/sparse.h
+++ b/paddle/fluid/distributed/table/depends/sparse.h
@@ -110,7 +110,6 @@ class SSGD : public SparseOptimizer {
       auto* value = block->Get(id);
 
      float learning_rate = *(global_learning_rate_) * (value + lr_offset)[0];
-      VLOG(4) << "SSGD LearningRate: " << learning_rate;
      float* param = value + param_offset;
 
      std::vector<float> grads;
@@ -166,7 +165,6 @@ class SAdam : public SparseOptimizer {
      if (!block->GetEntry(id)) continue;
      auto* values = block->Get(id);
      float lr_ = *(global_learning_rate_) * (values + lr_offset)[0];
-      VLOG(4) << "SAdam LearningRate: " << lr_;
      float* param = values + param_offset;
      float* moment1 = values + m1_offset;
      float* moment2 = values + m2_offset;
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 34c87b8388975aa108bbb2ecdaded1ff4a33d16c..5636e3ed1b63f9e4b9854ebcaac4a84a4c20176b 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -161,9 +161,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
 #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
     !defined(_WIN32) && !defined(__APPLE__)
     AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass");
-#else
-    LOG(WARNING) << "fusion_group is not enabled for Windows/MacOS now, and "
-                    "only effective when running with CUDA GPU.";
 #endif
     AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_,
                         "fuse_elewise_add_act_pass");
@@ -265,12 +262,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     if (FLAGS_use_mkldnn) {
       AppendPass(pass_name);
     } else if (!strategy_.mkldnn_enabled_op_types_.empty()) {
-      LOG(WARNING)
-          << "mkldnn_enabled_op_types specify the operator type list to "
-             "use MKLDNN acceleration. It is null in default, means "
-             "that all the operators supported by MKLDNN will be "
-             "accelerated. And it should not be set when "
-             "FLAGS_use_mkldnn=false.";
+      VLOG(1) << "mkldnn_enabled_op_types specify the operator type list to "
+                 "use MKLDNN acceleration. It is null in default, means "
+                 "that all the operators supported by MKLDNN will be "
+                 "accelerated. And it should not be set when "
+                 "FLAGS_use_mkldnn=false.";
     }
 #else
     PADDLE_ENFORCE_NE(FLAGS_use_mkldnn, true,
@@ -403,26 +399,26 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
               << ", num_trainers:" << num_trainers_;
     } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
       if (use_device != p::kCUDA) {
-        LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on "
-                        "GPU, skipped.";
+        VLOG(1) << "fuse_relu_depthwise_conv_pass is only supported on "
+                   "GPU, skipped.";
         continue;
       }
     } else if (pass->Type() == "fusion_group_pass") {
       pass->Set("use_gpu", new bool((use_device == p::kCUDA)));
       if (use_device != p::kCUDA) {
-        LOG(WARNING) << "fusion_group_pass is only supported on GPU, skipped.";
+        VLOG(1) << "fusion_group_pass is only supported on GPU, skipped.";
         continue;
       }
     } else if (pass->Type() == "fuse_bn_act_pass") {
       if (use_device != p::kCUDA) {
-        LOG(WARNING) << "fuse_bn_act_pass is only supported on "
-                        "GPU, skipped.";
+        VLOG(1) << "fuse_bn_act_pass is only supported on "
+                   "GPU, skipped.";
         continue;
       }
     } else if (pass->Type() == "fuse_bn_add_act_pass") {
       if (use_device != p::kCUDA) {
-        LOG(WARNING) << "fuse_bn_add_act_pass is only supported on "
-                        "GPU, skipped.";
+        VLOG(1) << "fuse_bn_add_act_pass is only supported on "
+                   "GPU, skipped.";
         continue;
       }
     } else if (pass->Type() == "mkldnn_placement_pass") {
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 05c54a90f7eb02eb4897533f58c338ee5bf07de0..9ced4221e1dd6cc154fbd0edbbc57903a983cd68 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -205,7 +205,7 @@ class DeviceWorker {
   Scope* root_scope_ = nullptr;
   Scope* thread_scope_;
   paddle::platform::Place place_;
-  int64_t batch_num_;
+  int64_t batch_num_ = 0;
   FetchConfig fetch_config_;
   bool use_cvm_;
   bool no_cvm_;
diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc
index d8639643f2c8a773246d9cd88637a7f9f478e8a8..89dc5c7d3ea932388fd8ab220478bb438f6b35f8 100644
--- a/paddle/fluid/framework/hogwild_worker.cc
+++ b/paddle/fluid/framework/hogwild_worker.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <ctime>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/device_worker.h"
 #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
@@ -226,14 +227,32 @@ void HogwildWorker::PrintFetchVars() {
   // call count
   batch_num_++;
   int batch_per_print = fetch_config_.print_period();
-  if (thread_id_ == 0) {
-    if (batch_num_ % batch_per_print == 0) {
-      int fetch_var_num = fetch_config_.fetch_var_names_size();
-      for (int i = 0; i < fetch_var_num; ++i) {
-        platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i),
-                           fetch_config_.fetch_var_str_format(i));
+  int fetch_var_num = fetch_config_.fetch_var_names_size();
+
+  if (fetch_var_num == 0) {
+    return;
+  }
+
+  if (thread_id_ == 0 && batch_num_ % batch_per_print == 0) {
+    time_t curtime;
+    time(&curtime);
+    char mbstr[80];
+    std::strftime(mbstr, sizeof(mbstr), "%Y-%m-%d %H:%M:%S",
+                  std::localtime(&curtime));
+
+    std::stringstream ss;
+    ss << "time: [" << mbstr << "], ";
+    ss << "batch: [" << batch_num_ << "], ";
+
+    for (int i = 0; i < fetch_var_num; ++i) {
+      platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i),
+                         fetch_config_.fetch_var_str_format(i), &ss);
+      if (i < fetch_var_num - 1) {
+        ss << ", ";
       }
     }
+
+    std::cout << ss.str() << std::endl;
   }
 }
 
diff --git a/paddle/fluid/platform/lodtensor_printer.cc b/paddle/fluid/platform/lodtensor_printer.cc
index 0be4233269e0f4f732c6b6a2622e7db3cb8e37e3..25ae0ab264f2d8cc044502673cc8b09f589308f9 100644
--- a/paddle/fluid/platform/lodtensor_printer.cc
+++ b/paddle/fluid/platform/lodtensor_printer.cc
@@ -27,24 +27,38 @@
 namespace paddle {
 namespace platform {
 void PrintVar(framework::Scope* scope, const std::string& var_name,
-              const std::string& print_info) {
+              const std::string& print_info, std::stringstream* sstream) {
   framework::Variable* var = scope->FindVar(var_name);
   if (var == nullptr) {
-    VLOG(1) << "Variable Name " << var_name << " does not exist in your scope";
+    VLOG(0) << "Variable Name " << var_name << " does not exist in your scope";
     return;
   }
 
   framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
   if (tensor == nullptr) {
-    VLOG(1) << "tensor of variable " << var_name
+    VLOG(0) << "tensor of variable " << var_name
             << " does not exist in your scope";
     return;
   }
 
-  std::ostringstream sstream;
-  sstream << print_info << "\t";
-  sstream << var_name << "\t";
-  sstream << *tensor << "\t";
-  std::cout << sstream.str() << std::endl;
+  *sstream << print_info << ": ";
+
+#define PrintTensorCallback(cpp_type, proto_type)        \
+  do {                                                   \
+    if (tensor->type() == proto_type) {                  \
+      *sstream << "[";                                   \
+      auto* data = tensor->data<cpp_type>();             \
+      auto element_num = tensor->numel();                \
+      if (element_num > 0) {                             \
+        *sstream << data[0];                             \
+        for (int j = 1; j < element_num; ++j) {          \
+          *sstream << " " << data[j];                    \
+        }                                                \
+      }                                                  \
+      *sstream << "]";                                   \
+    }                                                    \
+  } while (0)
+
+  _ForEachDataType_(PrintTensorCallback);
 }
 }  // end namespace platform
diff --git a/paddle/fluid/platform/lodtensor_printer.h b/paddle/fluid/platform/lodtensor_printer.h
index e0bd1fff197f700303e3e7d9c9afe18937a1b9e6..d30afb62b0b8c6999972e81d396bd89053bf2d16 100644
--- a/paddle/fluid/platform/lodtensor_printer.h
+++ b/paddle/fluid/platform/lodtensor_printer.h
@@ -26,6 +26,6 @@ class Scope;
 namespace paddle {
 namespace platform {
 void PrintVar(framework::Scope* scope, const std::string& var_name,
-              const std::string& print_info);
+              const std::string& print_info, std::stringstream* out);
 }  // end namespace platform
 }  // end namespace paddle
diff --git a/paddle/fluid/platform/lodtensor_printer_test.cc b/paddle/fluid/platform/lodtensor_printer_test.cc
index 5b2af270740766307990c5cd53585a9c62606da3..51bd55ebb7f4889dda89cc3bcda5491514547188 100644
--- a/paddle/fluid/platform/lodtensor_printer_test.cc
+++ b/paddle/fluid/platform/lodtensor_printer_test.cc
@@ -18,5 +18,6 @@
 
 TEST(LodTensorPrinter, PrintVar) {
   paddle::framework::Scope scope;
-  paddle::platform::PrintVar(&scope, "NotAVar", "We don't have var");
+  std::stringstream ss;
+  paddle::platform::PrintVar(&scope, "NotAVar", "We don't have var", &ss);
 }
diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
index 19ba637cc96809be508a074bab4552bded4e523c..cf802034cabd85209f8df66e54e82cf765f2c5a5 100644
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -628,12 +628,13 @@ class Fleet(object):
         self.user_defined_optimizer = optimizer
 
         if strategy is not None:
-            warnings.warn(
-                "It is recommended to use DistributedStrategy "
-                "in fleet.init(). The strategy here is only for compatibility. "
-                "If the strategy in fleet.distributed_optimizer() is "
-                "not None, then it will overwrite the DistributedStrategy in fleet.init(), "
-                "which will take effect in distributed training.")
+            if self._is_collective:
+                warnings.warn(
+                    "It is recommended to use DistributedStrategy "
+                    "in fleet.init(). The strategy here is only for compatibility. "
+                    "If the strategy in fleet.distributed_optimizer() is "
+                    "not None, then it will overwrite the DistributedStrategy in fleet.init(), "
+                    "which will take effect in distributed training.")
             self._user_defined_strategy = copy.deepcopy(strategy)
 
         self._context = {}
diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py
index a56868060055e80cccd934fc1b349ec2f8681e90..aa7df57e3c58bdc0c1cce80f7e03b972bd91f5d6 100644
--- a/python/paddle/distributed/fleet/runtime/the_one_ps.py
+++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py
@@ -768,7 +768,7 @@ class TheOnePSRuntime(RuntimeBase):
            server = self._get_fleet_proto(is_server=True, is_sync=is_sync)
            proto_txt = str(server)
 
-            debug = bool(os.getenv("PSERVER_DEBUG", "0"))
+            debug = bool(int(os.getenv("PSERVER_DEBUG", "0")))
            if debug:
                print("server: \n{}".format(proto_txt))
 
diff --git a/python/paddle/fluid/tests/unittests/test_monitor.py b/python/paddle/fluid/tests/unittests/test_monitor.py
index cf273876b1f2f8a9b4828375ca6e20e591feb306..bea2f6c8b38b23d35a283cf4403303025f328179 100644
--- a/python/paddle/fluid/tests/unittests/test_monitor.py
+++ b/python/paddle/fluid/tests/unittests/test_monitor.py
@@ -17,6 +17,8 @@ TestCases for Monitor
 from __future__ import print_function
 
 import paddle
+paddle.enable_static()
+
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import numpy as np
@@ -52,6 +54,11 @@ class TestDatasetWithStat(unittest.TestCase):
                name=slot, shape=[1], dtype="int64", lod_level=1)
            slots_vars.append(var)
 
+        embs = []
+        for x in slots_vars:
+            emb = fluid.layers.embedding(x, is_sparse=True, size=[100001, 4])
+            embs.append(emb)
+
        dataset = paddle.distributed.InMemoryDataset()
        dataset._set_batch_size(32)
        dataset._set_thread(3)
@@ -74,11 +81,17 @@ class TestDatasetWithStat(unittest.TestCase):
            for i in range(self.epoch_num):
                for data in data_loader():
                    exe.run(fluid.default_main_program(), feed=data)
+
        else:
            for i in range(self.epoch_num):
                try:
-                    exe.train_from_dataset(fluid.default_main_program(),
-                                           dataset)
+                    exe.train_from_dataset(
+                        fluid.default_main_program(),
+                        dataset,
+                        fetch_list=[embs[0], embs[1]],
+                        fetch_info=["emb0", "emb1"],
+                        print_period=1)
+
                except Exception as e:
                    self.assertTrue(False)
 
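A note on the PSERVER_DEBUG change in the_one_ps.py: bool() applied directly to the string returned by os.getenv() is truthy for any non-empty value, so the old check treated both an unset variable (the default string "0") and an explicit "0" as debug-enabled. Converting through int() first gives the intended on/off switch. A minimal standalone sketch of the difference (plain Python, independent of Paddle):

    import os

    # Simulate a user explicitly disabling the debug output.
    os.environ["PSERVER_DEBUG"] = "0"

    # Old check: bool("0") is True, so the server proto was always printed.
    old_debug = bool(os.getenv("PSERVER_DEBUG", "0"))

    # New check: "0" -> 0 -> False, "1" -> 1 -> True.
    new_debug = bool(int(os.getenv("PSERVER_DEBUG", "0")))

    print(old_debug, new_debug)  # True False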