From 97a77512b4b41b4adb5b8a48336731ede16d9c01 Mon Sep 17 00:00:00 2001 From: chengduo Date: Tue, 7 Aug 2018 16:48:31 +0800 Subject: [PATCH] Fix the order of sum (#12562) * fix the order of sum * add doc * check whether need to copy * follow comments --- cmake/generic.cmake | 5 +++ .../fluid/framework/details/build_strategy.h | 20 ++++++++++++ .../framework/details/reduce_op_handle.cc | 32 +++++++++++++++++-- python/paddle/fluid/__init__.py | 3 +- .../test_parallel_executor_seresnext.py | 8 ++--- 5 files changed, 60 insertions(+), 8 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 9059ae206..82c958073 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -264,6 +264,8 @@ function(cc_test TARGET_NAME) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if (${cc_test_SERIAL}) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) + + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) endif() @@ -330,6 +332,8 @@ function(nv_test TARGET_NAME) add_test(${TARGET_NAME} ${TARGET_NAME}) if (nv_test_SERIAL) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) + + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) endif() @@ -580,6 +584,7 @@ function(py_test TARGET_NAME) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true + FLAGS_cpu_deterministic=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index b2e5399e2..8714a4216 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -21,6 +21,26 @@ namespace framework { namespace details { struct BuildStrategy { + // ParallelExecutor supports two modes of ReduceStrategy, kAllReduce and + // kReduce, for CPU and GPU. If you use kAllReduce, different threads + // optimize their parameters separately. If you use kReduce, the optimizations + // of parameters are distributed to different threads. + // For example, a model has 100 parameters and is running with four threads, + // if you choose kAllReduce, every thread is to optimize 100 parameters + // separately, if you choose kReduce, every thread is to optimize 25 + // parameters. + // Of particular note is, if you use kReduce when using CPU training, + // all the parameters are shared between different threads. This feature will + // save memory. + // FIXME(zcd): The result of the two modes(kAllReduce and kReduce) maybe not + // equal for GPU. Because, the result of the different order of summing maybe + // different, for example, the result of `a+b+c+d` may be different with the + // result of `c+a+b+d`. + // For GPU, the implementation of kAllReduce and kReduce is adopted NCCL, + // so the result of kAllReduce and kReduce maybe not equal. + // For CPU, if you want to fix the order of summing to make the result + // of kAllReduce and kReduce no diff, you can add + // `FLAGS_cpu_deterministic=true` to env. enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 }; enum class GradientScaleStrategy { diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 68bdfbaf5..6c7e5c1fb 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -18,6 +18,10 @@ #include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/platform/profiler.h" +DEFINE_bool( + cpu_deterministic, false, + "Whether to make the result of computation deterministic in CPU side."); + namespace paddle { namespace framework { namespace details { @@ -91,11 +95,33 @@ void ReduceOpHandle::RunImpl() { } else { std::vector lod_tensors = GetInputValues(in_var_handles, var_scopes); + if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) { this->RunAndRecordEvent([&] { - ReduceLoDTensor func(lod_tensors, - out_var->GetMutable()); - VisitDataType(ToDataType(lod_tensors[0]->type()), func); + // FIXME(zcd): The order of summing is important, + // especially when the type of data is float or double. + // For example, the result of `a+b+c+d` may be different + // with the result of `c+a+b+d`, so the summing order should be fixed. + if (!FLAGS_cpu_deterministic) { + ReduceLoDTensor func(lod_tensors, + out_var->GetMutable()); + VisitDataType(ToDataType(lod_tensors[0]->type()), func); + } else { + // We sum lod_tensors to reduce_sum_trg which is in local_scopes_0 + // here, but it doesn't mean reduce_sum_trg must be in local_scopes_0. + auto &reduce_sum_trg = *this->local_scopes_[0] + ->FindVar(kLocalExecScopeName) + ->Get() + ->FindVar(out_var_handle->name_) + ->GetMutable(); + ReduceLoDTensor func(lod_tensors, &reduce_sum_trg); + VisitDataType(ToDataType(lod_tensors[0]->type()), func); + + auto trg = out_var->GetMutable(); + if (reduce_sum_trg.data() != trg->data()) { + TensorCopy(reduce_sum_trg, platform::CPUPlace(), trg); + } + } }); } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) { #ifdef PADDLE_WITH_CUDA diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 956e3c434..3b38c4280 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -123,7 +123,8 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', - 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads' + 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', + 'cpu_deterministic' ] if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index b56129a43..a28428d8d 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -198,7 +198,7 @@ class TestResnet(TestParallelExecutorBase): model, use_cuda, iter=20, - delta2=1e-4): + delta2=1e-6): if use_cuda and not core.is_compiled_with_cuda(): return @@ -276,10 +276,10 @@ class TestResnet(TestParallelExecutorBase): model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3) def test_seresnext_with_new_strategy(self): - # self._compare_reduce_and_allreduce( - # model=SE_ResNeXt50Small, use_cuda=True) self._compare_reduce_and_allreduce( - model=SE_ResNeXt50Small, use_cuda=False, iter=5, delta2=1e-2) + model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-2) + self._compare_reduce_and_allreduce( + model=SE_ResNeXt50Small, use_cuda=False, iter=5) if __name__ == '__main__': -- GitLab