diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 450fe1508f2a505a233b3d300cb7c500894231e7..5c8776b62fe3a2640016412259721f812ff0579b 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -193,8 +193,7 @@ ParallelExecutor::ParallelExecutor(
     const std::unordered_set<std::string> &bcast_vars,
     const ProgramDesc &main_program, const std::string &loss_var_name,
     Scope *scope, const std::vector<Scope *> &local_scopes,
-    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
-    size_t num_trainers, size_t trainer_id)
+    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
   member_->use_cuda_ = exec_strategy.use_cuda_;
@@ -253,7 +252,8 @@ ParallelExecutor::ParallelExecutor(
     }

     member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
-        member_->places_, nccl_id, num_trainers, trainer_id));
+        member_->places_, nccl_id, build_strategy.num_trainers_,
+        build_strategy.trainer_id_));
 #else
     PADDLE_THROW("Not compiled with CUDA");
 #endif
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 49d3f0d3f6f2a8965d39b656071d86bde42bfd93..121bbd55ad575477424a2fb12baab82585eae517 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -50,8 +50,7 @@ class ParallelExecutor {
                    const std::string &loss_var_name, Scope *scope,
                    const std::vector<Scope *> &local_scopes,
                    const ExecutionStrategy &exec_strategy,
-                   const BuildStrategy &build_strategy,
-                   size_t num_trainers = 1, size_t trainer_id = 0);
+                   const BuildStrategy &build_strategy);

   ~ParallelExecutor();

diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 3b81d59ad965b7532ca729682e7aeb8eb96194a8..2d817bcb0ddd6e91b6641075fd33ff36fdc3af94 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1022,8 +1022,7 @@ All parameter, weight, gradient are variables in Paddle.
   pe.def(py::init<const std::vector<platform::Place> &,
                   const std::unordered_set<std::string> &, const ProgramDesc &,
                   const std::string &, Scope *, std::vector<Scope *> &,
-                  const ExecutionStrategy &, const BuildStrategy &, size_t,
-                  size_t>())
+                  const ExecutionStrategy &, const BuildStrategy &>())
     // NOTE: even we return a vec<Scope*>* to Python use reference policy.
     // We still cannot get local_scope from this vector, since the element
     // of vec<Scope*> will be freed by Python GC. We can only return Scope*
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..63331f5708a5f3b8ab72533ece38e9a75584fb2d
--- /dev/null
+++ b/python/paddle/fluid/compiler.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import multiprocessing
+import os
+import six
+from .. import compat as cpt
+
+from . import core
+
+ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
+BuildStrategy = core.ParallelExecutor.BuildStrategy
+
+
+def _place_obj(place):
+    p = core.Place()
+    p.set_place(place)
+    return p
+
+
+class _ProgramCompiler(object):
+    def __init__(self, program):
+        self._program = program
+        self._compiled = False
+        self._is_data_parallel = False
+
+    def _with_data_parallel(self,
+                            loss_name=None,
+                            build_strategy=None,
+                            exec_strategy=None):
+        assert not self._is_data_parallel, "Already compiled with parallel."
+        self._is_data_parallel = True
+        self._build_strategy = build_strategy
+        self._exec_strategy = exec_strategy
+        self._loss_name = loss_name
+        return self
+
+    def _compile_data_parallel(self):
+        self._places = []
+        self._local_scopes = []
+
+        if self._exec_strategy is None:
+            self._exec_strategy = ExecutionStrategy()
+        if self._build_strategy is None:
+            self._build_strategy = BuildStrategy()
+
+        self._exec_strategy.use_cuda = isinstance(self._place, core.CUDAPlace)
+        if self._exec_strategy.use_cuda:
+            gpus_env = os.getenv("FLAGS_selected_gpus")
+            if gpus_env:
+                gpus = [int(s) for s in gpus_env.split(",")]
+            else:
+                gpus = [
+                    i for i in six.moves.range(core.get_cuda_device_count())
+                ]
+            self._places = [core.CUDAPlace(i) for i in gpus]
+        else:
+            cpu_num = int(
+                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+            self._places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
+        assert self._places, "no place for execution"
+
+        if self._exec_strategy.num_threads == 0:
+            if self._exec_strategy.use_cuda:
+                # Experiments on se-resnext shows that too many threads hurt
+                # performance. Worth tunning for other models in the future.
+                self._exec_strategy.num_threads = len(self._places) * 4
+            else:
+                cpu_num = int(
+                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+                self._exec_strategy.num_threads = cpu_num * 2
+
+        trainers_endpoints = self._program._trainers_endpoints
+        if self._build_strategy.num_trainers > 1 and trainers_endpoints:
+            assert self._build_strategy.num_trainers == len(
+                trainers_endpoints), "num_trainers == len(end_points)"
+            self._build_strategy.trainers_endpoints = trainers_endpoints
+
+        self._persistable_vars = set([
+            cpt.to_text(v.name)
+            for v in [
+                var for var in self._program.list_vars()
+                if var.persistable and var.type != core.VarDesc.VarType.RAW
+            ]
+        ])
+
+        places = list(map(_place_obj, self._places))
+        return core.ParallelExecutor(
+            places, self._persistable_vars, self._program.desc,
+            cpt.to_text(self._loss_name)
+            if self._loss_name else six.u(''), self._scope, self._local_scopes,
+            self._exec_strategy, self._build_strategy)
+
+    def _compile(self, scope, place):
+        if self._compiled:
+            return self
+        self._compiled = True
+
+        self._scope = scope
+        self._place = place
+
+        if self._is_data_parallel:
+            self._executor = self._compile_data_parallel()
+        else:
+            p = _place_obj(self._place)
+            self._executor = core.Executor(p)
+        return self
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 5a9e908b61eeeea3fdfdfcc54d1f150f59a3973b..ee7df740075dbebab6721d5c0849005c7e315572 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -14,11 +14,15 @@

 from __future__ import print_function

+import os
+import multiprocessing
 import numpy as np
 import contextlib
 import six
 from .framework import Program, default_main_program, Variable
 from . import core
+from . import compiler
+from .. import compat as cpt

 __all__ = ['Executor', 'global_scope', 'scope_guard']

@@ -275,11 +279,8 @@ class Executor(object):

     def __init__(self, place):
         self.place = place
-        p = core.Place()
-        p.set_place(place)
-        self.executor = core.Executor(p)
-
         self.program_caches = dict()
+        self.executor = None
         self._closed = False

     def _get_program_cache(self, program_cache_key):
@@ -361,6 +362,7 @@
         You can no long use this executor after calling this method.
         For the distributed training, this method would free the resource on PServers related to
         the current Trainer.
+        TODO(panyx0718): Why ParallelExecutor doesn't have close?

         Example:
             >>> cpu = core.CPUPlace()
@@ -368,10 +370,58 @@
             >>> ...
             >>> exe.close()
         """
-        if not self._closed:
+        if not self._closed and self.executor:
             self.executor.close()
         self._closed = True

+    def _run_parallel(self,
+                      exe,
+                      scope,
+                      feed=None,
+                      fetch_list=None,
+                      return_numpy=True):
+        if isinstance(feed, dict):
+            feed_tensor_dict = dict()
+            for feed_name in feed:
+                feed_tensor = feed[feed_name]
+                if not isinstance(feed_tensor, core.LoDTensor):
+                    feed_tensor = core.LoDTensor()
+                    # always set to CPU place, since the tensor need to be splitted
+                    # it is fast in CPU
+                    feed_tensor.set(feed[feed_name], core.CPUPlace())
+                feed_tensor_dict[feed_name] = feed_tensor
+
+            exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict)
+        elif isinstance(feed, list) or isinstance(feed, tuple):
+            if len(feed) != len(self._places):
+                raise ValueError(
+                    "Feed a list of tensor, the list should be the same size as places"
+                )
+
+            res = list()
+            for i, each in enumerate(feed):
+                if not isinstance(each, dict):
+                    raise TypeError(
+                        "Each element of feed list should be a dict")
+                res_dict = dict()
+                for feed_name in each:
+                    tensor = each[feed_name]
+                    if not isinstance(tensor, core.LoDTensor):
+                        tmp = core.LoDTensor()
+                        tmp.set(tensor, self._places[i])
+                        tensor = tmp
+                    res_dict[feed_name] = tensor
+                res.append(res_dict)
+            exe.feed_tensors_into_local_scopes(res)
+
+        fetch_var_name = '@FETCHED_VAR_NAME@'
+        exe.run(fetch_list, fetch_var_name)
+        arr = scope.find_var(fetch_var_name).get_lod_tensor_array()
+
+        if return_numpy:
+            return as_numpy(arr)
+        return [arr[i] for i in range(len(arr))]
+
     def run(self,
             program=None,
             feed=None,
@@ -428,6 +478,47 @@
         if self._closed:
             raise RuntimeError("Attempted to use a closed Executor")

+        if scope is None:
+            scope = global_scope()
+
+        compiled = isinstance(program, compiler._ProgramCompiler)
+        if not compiled:
+            p = core.Place()
+            p.set_place(self.place)
+            self.executor = core.Executor(p)
+            return self._run(
+                program,
+                feed=feed,
+                fetch_list=fetch_list,
+                feed_var_name=feed_var_name,
+                fetch_var_name=fetch_var_name,
+                scope=scope,
+                return_numpy=return_numpy,
+                use_program_cache=use_program_cache)
+
+        program._compile(scope, self.place)
+        self.executor = program._executor
+        if program._is_data_parallel:
+            return self._run_parallel(
+                exe=program._executor,
+                scope=scope,
+                feed=feed,
+                fetch_list=fetch_list,
+                return_numpy=return_numpy)
+        else:
+            return self._run(
+                program._program,
+                feed=feed,
+                fetch_list=fetch_list,
+                feed_var_name=feed_var_name,
+                fetch_var_name=fetch_var_name,
+                scope=scope,
+                return_numpy=return_numpy,
+                use_program_cache=use_program_cache)
+
+    def _run(self, program, feed, fetch_list, feed_var_name, fetch_var_name,
+             scope, return_numpy, use_program_cache):
+
         if feed is None:
             feed = {}
         if not isinstance(feed, dict):
@@ -444,9 +535,6 @@
                 "Executor requires Program as its Parameter. But you passed in %s"
                 % (type(program)))

-        if scope is None:
-            scope = global_scope()
-
         cache_key = _get_program_cache_key(feed, fetch_list)
         if use_program_cache:
             cached_program = self._get_program_cache(cache_key)
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index c97a93ec36d4f4a7ff6a9f097551e2d21022d5b1..917db02bb852df5c540279e5a5bbeaa7b701b09d 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -167,9 +167,8 @@
         # step7: init ParallelExecutor
         self.executor = core.ParallelExecutor(
             places, persistable_vars, main.desc,
-            cpt.to_text(loss_name)
-            if loss_name else six.u(''), scope, local_scopes, exec_strategy,
-            build_strategy, num_trainers, trainer_id)
+            cpt.to_text(loss_name) if loss_name else six.u(''), scope,
+            local_scopes, exec_strategy, build_strategy)

         self.scope = scope

@@ -292,3 +291,6 @@
     @property
     def device_count(self):
         return len(self._places)
+
+    def close(self):
+        pass
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 2b0ab0cc3bc23bab140d2b7e8cb765e537ff3f5c..2038b57a6c20433a2d7c02eb21676cb6dce00eab 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -19,6 +19,7 @@ import os
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
+from paddle.fluid import compiler
 import time
 import numpy as np
 import math
@@ -44,15 +45,8 @@
                    optimizer=fluid.optimizer.Adam,
                    use_fast_executor=False,
                    enable_sequential_execution=False):
-        def run_executor(exe, feed, fetch_list, program=None):
-            if isinstance(exe, fluid.ParallelExecutor):
-                res = exe.run(fetch_list=fetch_list, feed=feed)
-            elif isinstance(exe, fluid.Executor):
-                if program is None:
-                    program = fluid.default_main_program()
-                res = exe.run(program=program, feed=feed, fetch_list=fetch_list)
-            else:
-                raise ValueError('Unkown type exe')
+        def run_executor(exe, binary, feed, fetch_list):
+            res = exe.run(binary, feed=feed, fetch_list=fetch_list)
             return res

         main = fluid.Program()
@@ -72,8 +66,8 @@
             fluid.memory_optimize(main)

         place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        startup_exe = fluid.Executor(place)
-        startup_exe.run(startup)
+        exe = fluid.Executor(place)
+        exe.run(startup)
         exec_strategy = fluid.ExecutionStrategy()
         exec_strategy.allow_op_delay = allow_op_delay
         if use_fast_executor:
@@ -86,15 +80,13 @@
         build_strategy.enable_sequential_execution = enable_sequential_execution
         if use_cuda and core.is_compiled_with_cuda():
             build_strategy.remove_unnecessary_lock = True
-
         if use_parallel_executor:
-            exe = fluid.ParallelExecutor(
-                use_cuda,
+            binary = compiler._ProgramCompiler(main)._with_data_parallel(
                 loss_name=loss.name,
-                exec_strategy=exec_strategy,
-                build_strategy=build_strategy)
+                build_strategy=build_strategy,
+                exec_strategy=exec_strategy)
         else:
-            exe = fluid.Executor(place=place)
+            binary = compiler._ProgramCompiler(main)

         if batch_size is not None:
             batch_size *= fluid.core.get_cuda_device_count(
@@ -102,13 +94,14 @@
                 os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
         begin = time.time()
         first_loss, = run_executor(
-            exe=exe, feed=feed_dict, fetch_list=[loss.name])
+            exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])

         for i in range(iter):
-            run_executor(exe=exe, feed=feed_dict, fetch_list=[])
+            run_executor(
+                exe=exe, binary=binary, feed=feed_dict, fetch_list=[])

         last_loss, = run_executor(
-            exe=exe, feed=feed_dict, fetch_list=[loss.name])
+            exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
         end = time.time()

         if batch_size is not None:
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 0caab08f0dc19efceadd723b474c10e1a2deb449..5cc5d9f3d3e98b9ab6117a563e70bff27e150c30 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -26,6 +26,7 @@ import pickle
 import numpy as np

 import paddle.fluid as fluid
+from paddle.fluid import compiler

 RUN_STEP = 10
 DEFAULT_BATCH_SIZE = 2
@@ -104,8 +105,8 @@
         else:
             place = fluid.CPUPlace()

-        startup_exe = fluid.Executor(place)
-        startup_exe.run(fluid.default_startup_program())
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())

         strategy = fluid.ExecutionStrategy()
         strategy.num_threads = 1
@@ -125,19 +126,16 @@
             mypass.set_int("num_repeats", args.batch_merge_repeat)

         if args.update_method == "nccl2":
-            num_trainers = len(args.endpoints.split(","))
-            trainer_id = args.trainer_id
+            build_stra.num_trainers = len(args.endpoints.split(","))
+            build_stra.trainer_id = args.trainer_id
         else:
-            num_trainers = 1
-            trainer_id = 0
+            build_stra.num_trainers = 1
+            build_stra.trainer_id = 0

-        exe = fluid.ParallelExecutor(
-            args.use_cuda,
+        binary = compiler._ProgramCompiler(trainer_prog)._with_data_parallel(
             loss_name=avg_cost.name,
-            exec_strategy=strategy,
             build_strategy=build_stra,
-            num_trainers=num_trainers,
-            trainer_id=trainer_id)
+            exec_strategy=strategy)

         feed_var_list = [
             var for var in trainer_prog.global_block().vars.values()
@@ -160,7 +158,8 @@

         out_losses = []
         for _ in six.moves.xrange(RUN_STEP):
-            loss, = exe.run(fetch_list=[avg_cost.name],
+            loss, = exe.run(binary,
+                            fetch_list=[avg_cost.name],
                             feed=feeder.feed(get_data()))
             out_losses.append(loss[0])
         if six.PY2:
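
A minimal usage sketch of the new compile-then-run flow, pieced together only from the call sites shown above in parallel_executor_test_base.py and executor.py; `main`, `startup`, `loss`, and `feed_dict` are hypothetical placeholders for a user-built Program, its startup Program, a loss Variable, and feed data:

```python
import paddle.fluid as fluid
from paddle.fluid import compiler

# Run the startup program with a plain Executor, as the updated test base does.
place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda() \
    else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)  # `startup` is a placeholder startup Program

# Compile `main` for data-parallel execution instead of constructing
# fluid.ParallelExecutor directly; both strategies are optional and default to
# ExecutionStrategy()/BuildStrategy() inside _compile_data_parallel().
binary = compiler._ProgramCompiler(main)._with_data_parallel(
    loss_name=loss.name,
    build_strategy=fluid.BuildStrategy(),
    exec_strategy=fluid.ExecutionStrategy())

# Executor.run() now detects a _ProgramCompiler argument, compiles it against
# the current scope/place, and dispatches to _run_parallel() under the hood.
loss_value, = exe.run(binary, feed=feed_dict, fetch_list=[loss.name])
```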