Unverified commit abaf87be authored by gongweibao, committed by GitHub

Change backward_guard to optimize_guard to maximize the allreduce overlap. (#19506)

Change backward_guard to optimize_guard to maximize the allreduce overlap
Parent 578cccd4
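In outline, the change replaces a single backward-stage guard around the whole master-parameter loop with a per-parameter optimize-stage guard, so the ops built for each (param, grad) pair are tagged with that pair and the gradient's allreduce can be overlapped with work on the other parameters. A minimal before/after sketch of the pattern; main_prog, params_grads, and build_master_param_ops are illustrative placeholders, not names introduced by this commit:

# Before: one backward-stage guard wraps every parameter's ops.
with main_prog._backward_role_guard():
    for p, g in params_grads:
        build_master_param_ops(p, g)  # hypothetical helper that inserts ops

# After: each (p, g) pair gets its own optimize-stage guard, which gives the
# runtime a chance to schedule g's allreduce while other parameters' ops run.
for p, g in params_grads:
    with main_prog._optimized_guard([p, g]):
        build_master_param_ops(p, g)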
@@ -81,6 +81,8 @@ TAlgorithm framework::AlgorithmsCache<TAlgorithm>::GetAlgorithm(
   seed ^= hashFn(static_cast<int64_t>(algorithmFlags)) + 0x9e3779b9 +
           (seed << 6) + (seed >> 2) + 5;
+  VLOG(10) << "seed:" << seed << ", hash_.size:" << hash_.size();
   if (seed == 0) return gen_func();
   if (hash_.find(seed) == hash_.end()) {
......
@@ -22,6 +22,14 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
+template <typename T>
+std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
+  out << "[";
+  for (auto const& tmp : v) out << tmp << ",";
+  out << "]";
+  return out;
+}
 using framework::AlgorithmsCache;
 struct ConvArgs {
@@ -119,6 +127,11 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
     auto x_dims = framework::vectorize(args.x->dims());
     auto w_dims = framework::vectorize(args.w->dims());
+    VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t algo_cache_id:"
+             << algo_cache_id << ", x_dims:" << x_dims
+             << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p"
+             << args.p << ", args.d" << args.d;
     algo = algo_cache.GetAlgorithm(
         x_dims, w_dims, args.s, args.p, args.d, 0, [&]() {
           int returned_algo_count;
@@ -247,6 +260,11 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
     auto x_dims = framework::vectorize(args.x->dims());
     auto w_dims = framework::vectorize(args.w->dims());
+    VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t algo_cache_id:"
+             << algo_cache_id << ", x_dims:" << x_dims
+             << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p"
+             << args.p << ", args.d" << args.d;
     algo = algo_cache.GetAlgorithm(
         x_dims, w_dims, args.s, args.p, args.d, 0, [&]() {
           int returned_algo_count;
@@ -368,6 +386,11 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
     auto x_dims = framework::vectorize(args.x->dims());
     auto w_dims = framework::vectorize(args.w->dims());
+    VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t algo_cache_id:"
+             << algo_cache_id << ", x_dims:" << x_dims
+             << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p"
+             << args.p << ", args.d" << args.d;
     algo = algo_cache.GetAlgorithm(
         x_dims, w_dims, args.s, args.p, args.d, 0, [&]() {
           int returned_algo_count;
......
@@ -80,8 +80,9 @@ def create_master_params_grads(params_grads, main_prog, startup_prog,
         A list of master parameters and gradients.
     """
     master_params_grads = []
-    with main_prog._backward_role_guard():
-        for p, g in params_grads:
+    for p, g in params_grads:
+        # create master parameters
+        with main_prog._optimized_guard([p, g]):
             # create master parameters
             master_param = copy_to_master_param(p, main_prog.global_block())
             startup_master_param = startup_prog.global_block()._clone_variable(
......
@@ -278,8 +278,8 @@ class DataFeeder(object):
            for each_sample in iterable:
                assert len(each_sample) == len(converter), (
-                   "The number of fields in data (%s) does not match " +
-                   "len(feed_list) (%s)") % (len(each_sample), len(converter))
+                   "The number of fields in data (%d) does not match " +
+                   "len(feed_list) (%d)") % (len(each_sample), len(converter))
                for each_converter, each_slot in six.moves.zip(converter,
                                                               each_sample):
                    each_converter.feed(each_slot)
......
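As a side note on the last hunk, %d makes the integral intent of the two counts explicit, while %s would silently stringify any value. A tiny illustration with made-up counts:

# Made-up counts purely to illustrate the assertion message format.
n_fields, n_feeds = 3, 2
msg = ("The number of fields in data (%d) does not match " +
       "len(feed_list) (%d)") % (n_fields, n_feeds)
print(msg)  # -> The number of fields in data (3) does not match len(feed_list) (2)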