From 5e839e4da584d073c065a60c39db4f81b16df110 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Tue, 12 Jan 2021 16:20:31 +0800 Subject: [PATCH] add sparse embedding & load vars for 2.0 & gloo bug fix (#30306) * add sparse embedding & load vars for 2.0 Change-Id: I36b59ed5f015189dc9d9d2e34a9357722d369f1b * fix hdfs gloo Change-Id: Ia84d579053720ad804183e54c9a04b4f031c79c6 * fix gloo hdfs Change-Id: I5ab982fd483cddc10adcdef0b8aa83aca976cb9e * move loadvar/sparse embedding from incubate to static Change-Id: I57081d3545ad2efab78c72420d2162c0eacaf3a0 --- paddle/fluid/framework/fleet/gloo_wrapper.cc | 15 ++++--- python/paddle/fluid/contrib/layers/nn.py | 2 +- python/paddle/static/__init__.py | 42 ++++++++++++++++---- python/paddle/static/nn/__init__.py | 2 + 4 files changed, 48 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc index 8780db89e85..e18cad10ac2 100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.cc +++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc @@ -229,18 +229,18 @@ void ParallelConnectContext::connectFullMesh( store.wait({key}, getTimeout()); std::vector allAddrs; - auto max_retry_times = 5; + auto max_retry_times = 10; // Connect to other side of this pair while (max_retry_times > 0) { allAddrs = store.get(key); - VLOG(3) << "store get all address size: " << allAddrs.size() << " except: " << total_add_size; if (allAddrs.size() == static_cast(total_add_size)) { break; } + sleep(5); --max_retry_times; } @@ -272,11 +272,13 @@ void GlooWrapper::Init() { attr.iface = iface_; std::shared_ptr file_store = nullptr; std::shared_ptr http_store = nullptr; - auto context = std::make_shared(rank_, size_); - context->setTimeout(run_timeout_); auto dev = gloo::transport::tcp::CreateDevice(attr); + switch (store_type_) { case GlooStoreType::HDFS: { + auto context = std::make_shared( + rank_, size_); + context->setTimeout(run_timeout_); std::string cmd =
std::string("${HADOOP_HOME}/bin/hadoop fs"); cmd += " -D fs.default.name=" + hdfs_name_; cmd += " -D hadoop.job.ugi=" + hdfs_ugi_; @@ -286,22 +288,25 @@ void GlooWrapper::Init() { auto prefix_store = std::make_shared(prefix_, *file_store); context->connectFullMesh(*prefix_store, dev); + context_ = std::move(context); break; } case GlooStoreType::HTTP: { + auto context = std::make_shared(rank_, size_); + context->setTimeout(run_timeout_); http_store = std::make_shared( http_ip_, http_port_, prefix_ + "_" + http_scope_, rank_); http_store->SetTimeoutSeconds(init_timeout_.count()); context->connectFullMesh(*http_store, dev); http_store->Finalize(); VLOG(3) << "after calling http_store->Finalize."; + context_ = std::move(context); break; } default: LOG(ERROR) << "unknown store type " << store_type_; exit(-1); } - context_ = std::move(context); #endif is_initialized_ = true; VLOG(3) << "gloo initialized done."; diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index f3f8c815b00..acb57fc2456 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -976,7 +976,7 @@ def sparse_embedding(input, 'fluid.contrib.layers.sparse_embedding') check_dtype(dtype, 'dtype', ['float32'], - 'fluid.contrib.layers.sparse_embedding') + 'paddle.static.nn.sparse_embedding') w = helper.create_parameter( attr=helper.param_attr, diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 3bd94fb4527..60daae8667d 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -14,13 +14,37 @@ # TODO: import framework api under this directory __all__ = [ - 'append_backward', 'gradients', 'Executor', 'global_scope', 'scope_guard', - 'BuildStrategy', 'CompiledProgram', 'Print', 'py_func', 'ExecutionStrategy', - 'name_scope', 'ParallelExecutor', 'program_guard', 'WeightNormParamAttr', - 'default_main_program', 'default_startup_program', 'Program', 'data', - 
'InputSpec', 'save', 'load', 'save_inference_model', 'load_inference_model', - 'load_program_state', 'set_program_state', 'cpu_places', 'cuda_places', - 'xpu_places', 'Variable' + 'append_backward', + 'gradients', + 'Executor', + 'global_scope', + 'scope_guard', + 'BuildStrategy', + 'CompiledProgram', + 'Print', + 'py_func', + 'ExecutionStrategy', + 'name_scope', + 'ParallelExecutor', + 'program_guard', + 'WeightNormParamAttr', + 'default_main_program', + 'default_startup_program', + 'Program', + 'data', + 'InputSpec', + 'save', + 'load', + 'save_inference_model', + 'load_inference_model', + 'load_program_state', + 'set_program_state', + 'cpu_places', + 'cuda_places', + 'xpu_places', + 'Variable', + 'load_vars', + 'save_vars', ] from . import nn @@ -61,6 +85,10 @@ from ..fluid.io import save #DEFINE_ALIAS from ..fluid.io import load #DEFINE_ALIAS from ..fluid.io import load_program_state #DEFINE_ALIAS from ..fluid.io import set_program_state #DEFINE_ALIAS + +from ..fluid.io import load_vars #DEFINE_ALIAS +from ..fluid.io import save_vars #DEFINE_ALIAS + from ..fluid.layers import create_parameter #DEFINE_ALIAS from ..fluid.layers import create_global_var #DEFINE_ALIAS from ..fluid.layers.metric_op import auc #DEFINE_ALIAS diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py index 9161bb7af41..fd84a0a9284 100644 --- a/python/paddle/static/nn/__init__.py +++ b/python/paddle/static/nn/__init__.py @@ -38,6 +38,7 @@ __all__ = [ 'spectral_norm', 'switch_case', 'while_loop', + 'sparse_embedding', ] from .common import fc #DEFINE_ALIAS @@ -67,3 +68,4 @@ from ...fluid.layers import switch_case #DEFINE_ALIAS from ...fluid.layers import while_loop #DEFINE_ALIAS from ...fluid.input import embedding #DEFINE_ALIAS +from ...fluid.contrib.layers import sparse_embedding #DEFINE_ALIAS -- GitLab