diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 42af482f852a1f32fa0fb5595b7c533d612f7d02..43e1bc6b2efecd24d2d5bade5b6a7727bfb0a607 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -31,9 +31,7 @@ function(windows_symbolic TARGET)
 endfunction()
 
 add_subdirectory(ir)
-if (NOT WIN32)
 add_subdirectory(details)
-endif (NOT WIN32)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)
 
@@ -118,13 +116,8 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
 
-if (NOT WIN32)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
     shape_inference data_transform lod_tensor profiler)
-else()
-cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
-    shape_inference data_transform lod_tensor)
-endif(NOT WIN32)
 
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
 
@@ -179,12 +172,10 @@ else()
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
 
-if (NOT WIN32)
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
         threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
         graph build_strategy
         fast_threaded_ssa_graph_executor)
-endif() # NOT WIN32
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
index 949616f02d5168e6abab932d608e4b20ee64304a..c3a8b85423403992e3a12ceb0a1acbae82d25dfa 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #pragma once
+#include <ThreadPool.h>
 #include <string>
 #include <vector>
-#include "ThreadPool.h"
 #include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h
index 165f11cd3b0015db92951bdc5de34f53dfdc36f6..26d3643f4edff1f2d71b1c761e915a6dacb485ad 100644
--- a/paddle/fluid/memory/allocation/cpu_allocator.h
+++ b/paddle/fluid/memory/allocation/cpu_allocator.h
@@ -17,7 +17,8 @@
 
 #ifdef _WIN32
 #define posix_memalign_free _aligned_free
-#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
+#define posix_memalign(p, a, s) \
+  (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
 #endif
 
 namespace paddle {
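Note (editorial illustration, not part of the patch): the cpu_allocator.h shim above maps posix_memalign onto _aligned_malloc, whose argument order is (size, alignment) rather than (ptr, alignment, size), and whose memory must be released with _aligned_free — hence the posix_memalign_free alias. A minimal standalone sketch of how such a shim behaves:

```cpp
// Standalone sketch mirroring the macro introduced above (illustrative only).
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#ifdef _WIN32
#include <malloc.h>  // _aligned_malloc / _aligned_free
#define posix_memalign_free _aligned_free
#define posix_memalign(p, a, s) \
  (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
#else
#define posix_memalign_free free  // POSIX memory can go back through free()
#endif

int main() {
  void *p = nullptr;
  if (posix_memalign(&p, 64, 1024) == 0) {  // 1 KiB block, 64-byte aligned
    std::printf("aligned block at %p\n", p);
    posix_memalign_free(p);  // _aligned_free on Windows, free elsewhere
  }
  return 0;
}
```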
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 0d0613e1a4364e300640b62687c8a045e40b9ca9..93cb5eb2dc0b3480ebd05dcc6b36d8915d057bab 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -1,4 +1,3 @@
-if (NOT WIN32)
 proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto)
 py_proto_compile(profiler_py_proto SRCS profiler.proto)
 
@@ -6,11 +5,19 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _
 add_dependencies(profiler_py_proto profiler_py_proto_init)
 
+if (NOT WIN32)
 add_custom_command(TARGET profiler_py_proto POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
     COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
     COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+else(NOT WIN32)
+string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/")
+add_custom_command(TARGET profiler_py_proto POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
+    COMMAND copy /Y *.py ${proto_dstpath}
+    COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 endif(NOT WIN32)
 
 if(WITH_GPU)
@@ -60,12 +67,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
 
-
-if (NOT WIN32)
 cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
-endif(NOT WIN32)
 
 nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
 cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
index f59fc40b71699a790978e22fd7e26da8d4d94c5f..eaf047d4744762f69d50bff8d467da8e3b8317cc 100644
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -13,17 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#if !defined(_WIN32)
-#include <sys/time.h>
-#else
-#include <windows.h>
-#endif  // !_WIN32
-
-#include 
 #include   // NOLINT
 #include 
 
 #include "paddle/fluid/platform/dynload/cupti.h"
+#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.pb.h"
 
 namespace paddle {
@@ -32,15 +26,11 @@ namespace platform {
 ///////////////////////
 // WARN: Under Development. Don't depend on it yet.
 //////////////////////
 
-#if !defined(_WIN32)
 inline uint64_t PosixInNsec() {
   struct timeval tv;
   gettimeofday(&tv, nullptr);
   return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
 }
-#else
-inline uint64_t PosixInNsec() { return static_cast<uint64_t>(0); }
-#endif  // !_WIN32
 
 // DeviceTracer performs the following tasks:
 // 1. Register cuda callbacks for various events: kernel, memcpy, etc.
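Note (editorial illustration, not part of the patch): PosixInNsec() now compiles unchanged on Windows because port.h (patched below) supplies a gettimeofday replacement. A small POSIX-only sketch of how such a wall-clock helper is typically used to time a region; the helper body simply mirrors the one shown in the hunk above:

```cpp
// POSIX-only sketch; on Windows the patch routes gettimeofday() through
// paddle/fluid/platform/port.h instead of <sys/time.h>.
#include <sys/time.h>
#include <cstdint>
#include <cstdio>

static inline uint64_t PosixInNsec() {
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
}

int main() {
  uint64_t start = PosixInNsec();
  volatile double acc = 0;
  for (int i = 0; i < 1000000; ++i) acc += i * 0.5;  // placeholder workload
  uint64_t end = PosixInNsec();
  std::printf("elapsed: %llu ns\n",
              static_cast<unsigned long long>(end - start));
  return 0;
}
```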
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 3643d2ad15b9e99e579f062c78cef81edb2a818f..31309738a52f07096eca891e745ae31ea6d56924 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -134,7 +134,7 @@ struct EOFException : public std::exception {
 #define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
 #else
 // there is no equivalent intrinsics in msvc.
-#define LIKELY(condition) !(condition)
+#define LIKELY(condition) (condition)
 #endif
 
 template
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index a07b993c8a84e26fb079804b6ec3ff9df3efd09e..8be77fe46457511220a26b9733df2c4b8d013823 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -17,6 +17,7 @@
 
 #include 
 #include 
+#include <time.h>
 #include 
 #include 
 
@@ -27,6 +28,7 @@
 #include   // dladdr
 #include   // backtrace
 #include 
+#include <sys/time.h>
 #include   // std::accumulate
 #else
 #include   // _popen, _pclose
@@ -57,6 +59,25 @@ static void *dlopen(const char *filename, int flag) {
   return reinterpret_cast<void *>(hModule);
 }
 
+static int gettimeofday(struct timeval *tp, void *tzp) {
+  time_t clock;
+  struct tm tm;
+  SYSTEMTIME wtm;
+
+  GetLocalTime(&wtm);
+  tm.tm_year = wtm.wYear - 1900;
+  tm.tm_mon = wtm.wMonth - 1;
+  tm.tm_mday = wtm.wDay;
+  tm.tm_hour = wtm.wHour;
+  tm.tm_min = wtm.wMinute;
+  tm.tm_sec = wtm.wSecond;
+  tm.tm_isdst = -1;
+  clock = mktime(&tm);
+  tp->tv_sec = clock;
+  tp->tv_usec = wtm.wMilliseconds * 1000;
+
+  return (0);
+}
 #endif  // !_WIN32
 
 static void ExecShellCommand(const std::string &cmd, std::string *message) {
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 56bf9e31a35fdec5b7f04849068ff96ac9776c0e..03c102e24a189ea9e1614ffa6773e0c6e742c107 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/fluid/platform/port.h"
 
-#include 
 #include 
 #include 
 #include 
@@ -438,10 +438,10 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
         event_items[index].total_time += event_time;
         // min time
         event_items[index].min_time =
-            std::min(event_time, event_items[index].min_time);
+            (std::min)(event_time, event_items[index].min_time);
         // max time
         event_items[index].max_time =
-            std::max(event_time, event_items[index].max_time);
+            (std::max)(event_time, event_items[index].max_time);
       }
 
       // remove the push marker from the list
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index e8eae874afa3d17f0d3374eef457cdbacb3f8424..f5d3490634f3199a23986ec3ae13d9fe3577ac35 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -69,7 +69,6 @@
 void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
 void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
 
-#if !defined(_WIN32)
 struct RecordEvent {
   // dev_ctx can be set to nullptr if device is cpu.
   RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
@@ -106,15 +105,6 @@ struct RecordBlock {
   std::string name_;
   uint64_t start_ns_;
 };
-#else
-// windows do not support profiler temporarily.
-struct RecordEvent {
-  RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {}
-};
-struct RecordBlock {
-  explicit RecordBlock(int block_id) {}
-};
-#endif
 
 // Return the event list of all threads. Assumed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
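Note (editorial illustration, not part of the patch): the profiler.cc hunk writes (std::min) and (std::max) because <windows.h> defines min/max as function-like macros unless NOMINMAX is set; the extra pair of parentheses keeps the preprocessor from treating std::min/std::max as macro invocations. A minimal sketch:

```cpp
// Why the patch parenthesizes std::min / std::max on Windows.
#include <algorithm>
#include <cstdio>
#ifdef _WIN32
#include <windows.h>  // defines min/max macros unless NOMINMAX is set
#endif

int main() {
  double a = 1.5, b = 2.5;
  double lo = (std::min)(a, b);  // always calls std::min, never the macro
  double hi = (std::max)(a, b);
  std::printf("%f %f\n", lo, hi);
  return 0;
}
```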
diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h
index 0e88a439cf6ca83e3d98725f58875adeeea86be0..11c68f3449ee26b64c121acd081479b37c94fac4 100644
--- a/paddle/fluid/platform/stream_callback_manager.h
+++ b/paddle/fluid/platform/stream_callback_manager.h
@@ -45,16 +45,15 @@ class StreamCallbackManager {
   inline void AddCallback(Callback &&callback) const {
     auto *stream_callback_context =
         new StreamCallbackContext(this, std::forward<Callback>(callback));
-    PADDLE_ENFORCE(
 #if CUDA_VERSION >= 10000
-        cudaLaunchHostFunc(stream_, StreamCallbackManager::StreamCallbackFunc,
-                           stream_callback_context)
+    PADDLE_ENFORCE(cudaLaunchHostFunc(stream_,
+                                      StreamCallbackManager::StreamCallbackFunc,
+                                      stream_callback_context));  // NOLINT
 #else
-        cudaStreamAddCallback(stream_,
-                              StreamCallbackManager::StreamCallbackFunc,
-                              stream_callback_context, 0)
+    PADDLE_ENFORCE(cudaStreamAddCallback(
+        stream_, StreamCallbackManager::StreamCallbackFunc,
+        stream_callback_context, 0));  // NOLINT
 #endif
-        );  // NOLINT
   }
 
   void Wait() const { thread_pool_.reset(new ThreadPool(1)); }
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 25e919105cba48ea28b6d429542c33ba48515ef5..fb6ee2f4a53925f64b61b9fe03f5a4bd7203ed53 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,9 +1,6 @@
-set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder)
+set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler)
 set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc)
-if(NOT WIN32)
-  list(APPEND PYBIND_DEPS parallel_executor profiler)
-endif(NOT WIN32)
 
 if(WITH_PYTHON)
   if(WITH_AMD_GPU)
     hip_library(paddle_pybind SHARED
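Note (editorial illustration, not part of the patch; MY_CHECK is a made-up stand-in, not a Paddle macro): the stream_callback_manager.h change avoids putting #if/#else inside a macro's argument list — a construct the C++ standard leaves undefined and MSVC rejects — by giving each preprocessor branch its own complete macro call, as sketched here:

```cpp
// Portable pattern: one complete macro invocation per preprocessor branch.
#include <cstdio>
#include <cstdlib>
#define MY_CHECK(cond)                        \
  do {                                        \
    if (!(cond)) {                            \
      std::fprintf(stderr, "check failed\n"); \
      std::abort();                           \
    }                                         \
  } while (0)

int ApiV1() { return 0; }
int ApiV2() { return 0; }

void Portable() {
#if defined(USE_V2)
  MY_CHECK(ApiV2() == 0);  // each branch carries the whole MY_CHECK(...) call
#else
  MY_CHECK(ApiV1() == 0);
#endif
}

int main() {
  Portable();
  std::puts("ok");
  return 0;
}
```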
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 2f040e1c34c7956959dc9fe55b4aaf0b3b6f880c..102fa02adf3fb05e36183f87e95d8178be4e8742 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -36,9 +36,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
-#ifndef _WIN32
 #include "paddle/fluid/framework/parallel_executor.h"
-#endif
 #include "paddle/fluid/framework/prune.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
@@ -637,7 +635,6 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
 #endif
 
-#ifndef _WIN32
   py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
       .value("kDisabled", platform::ProfilerState::kDisabled)
       .value("kCPU", platform::ProfilerState::kCPU)
@@ -658,7 +655,6 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("disable_profiler", platform::DisableProfiler);
   m.def("is_profiler_enabled", platform::IsProfileEnabled);
   m.def("reset_profiler", platform::ResetProfiler);
-#endif
 
   py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
   pass.def(py::init())
@@ -687,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle.
       .def("remove_pass",
           [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });
 
-#ifndef _WIN32
   // -- python binds for parallel executor.
   py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
   py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
@@ -913,7 +908,6 @@ All parameter, weight, gradient are variables in Paddle.
             pybind11::gil_scoped_release release;
             self.Run(fetch_tensors, fetched_var_name);
           });
-#endif
 
   BindRecordIOWriter(&m);
   return m.ptr();
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 6a4a5e098fc27e02104baa837d939a62cc2a22a8..543acf2d349c7d02480204699f497536c7a4ca60 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -47,8 +47,7 @@ from . import profiler
 from . import unique_name
 from . import recordio_writer
 from . import parallel_executor
-if os.name != 'nt':
-    from .parallel_executor import *
+from .parallel_executor import *
 from paddle.fluid.layers.math_op_patch import monkey_patch_variable
 
 Tensor = LoDTensor
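Note (editorial illustration, not part of the patch; assumes the fluid.profiler context-manager API of this release): with the pybind guards and the os.name check in __init__.py removed, the profiler bindings and the ParallelExecutor exports are imported the same way on every platform, e.g.:

```python
# Illustrative only: these imports no longer depend on os.name, so the same
# script runs on Linux and Windows.
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler

# ExecutionStrategy / BuildStrategy are now exported unconditionally.
exec_strategy = fluid.ExecutionStrategy()
build_strategy = fluid.BuildStrategy()

# The profiler bindings (enable_profiler / disable_profiler / reset_profiler)
# are likewise always registered; state may be 'CPU', 'GPU' or 'All'.
with profiler.profiler('CPU', 'total'):
    pass  # run some fluid program here
```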
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 0d53f53a9ef6d5f90d188e1ad10785b5558c3e80..3f4dd5eb712e738bbee8f93c062375033b8ab2f6 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -25,264 +25,263 @@ import os
 
 __all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy']
 
-if os.name != 'nt':
-    ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
-    BuildStrategy = core.ParallelExecutor.BuildStrategy
-
-    class ParallelExecutor(object):
+ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
+BuildStrategy = core.ParallelExecutor.BuildStrategy
+
+
+class ParallelExecutor(object):
+    """
+    ParallelExecutor is designed for data parallelism, which focuses on distributing
+    the data across different nodes and every node operates on the data in parallel.
+    If you use ParallelExecutor to run the current program on GPU, the node means GPU
+    device, and ParallelExecutor will get the available GPU device automatically on
+    the current machine. If you use ParallelExecutor to run the current program on CPU,
+    the node means the CPU device, and you can specify the CPU device number by adding
+    'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable
+    is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number
+    of CPUs in the system.
+
+    Args:
+        use_cuda (bool): Whether to use CUDA or not.
+        loss_name (str): The loss name must set in training. Default None.
+        main_program (Program): The program that need to run, if not provided,
+            then default_main_program will be used. Default None.
+        share_vars_from(ParallelExecutor): If provide, it will share variables
+            from the specified ParallelExecutor. Default None.
+        exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run
+            the program in ParallelExecutor, for example how many threads are used to
+            execute the program, how many iterations to clean up the temp variables
+            which is generated during execution. For more information, please refer
+            to fluid.ExecutionStrategy. Default None.
+        build_strategy(BuildStrategy): build_strategy is used to control how to
+            build the SSA Graph in ParallelExecutor by setting the property,
+            for example reduce_strategy, gradient_scale_strategy. For more information,
+            please refer to fluid.BuildStrategy. Default None.
+        num_trainers(int): If greater than 1, NCCL will be initialized with
+            multiple rank of nodes, each node should have same number of GPUs.
+            Distributed training will be enabled then. Default 1.
+        trainer_id(int): Must use together with num_trainers. trainer_id is the
+            "rank" of current node starts from 0. Default 0.
+        scope(Scope): scope to run with, default use fluid.global_scope().
+
+    Returns:
+        ParallelExecutor: The initialized ParallelExecutor object.
+
+    Raises:
+        TypeError: If share_vars_from is provided, but not ParallelExecutor object.
+
+    Examples:
+        .. code-block:: python
+
+          train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
+          test_exe = fluid.ParallelExecutor(use_cuda=True,
+                                            main_program=test_program,
+                                            share_vars_from=train_exe)
+
+          train_loss, = train_exe.run([loss.name], feed=feed_dict)
+          test_loss, = test_exe.run([loss.name], feed=feed_dict)
+    """
+
+    def __init__(self,
+                 use_cuda,
+                 loss_name=None,
+                 main_program=None,
+                 share_vars_from=None,
+                 exec_strategy=None,
+                 build_strategy=None,
+                 num_trainers=1,
+                 trainer_id=0,
+                 scope=None):
+        self._places = []
+        self._act_places = []
+        if use_cuda:
+            for i in six.moves.range(core.get_cuda_device_count()):
+                p = core.Place()
+                self._act_places.append(core.CUDAPlace(i))
+                p.set_place(self._act_places[-1])
+                self._places.append(p)
+        else:
+            cpu_num = int(
+                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+            for i in six.moves.range(cpu_num):
+                p = core.Place()
+                self._act_places.append(core.CPUPlace())
+                p.set_place(self._act_places[-1])
+                self._places.append(p)
+        assert self._places, "no place for execution"
+
+        if exec_strategy is None:
+            exec_strategy = ExecutionStrategy()
+        exec_strategy.use_cuda = use_cuda
+
+        if exec_strategy.num_threads == 0:
+            if use_cuda:
+                # Experiments on se-resnext shows that too many threads hurt
+                # performance. Worth tunning for other models in the future.
+                exec_strategy.num_threads = len(self._places) * 4
+            else:
+                cpu_num = int(
+                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+                exec_strategy.num_threads = cpu_num * 2
+
+        # Set 1 thread num under nccl2 distribute
+        # env to make sure all gpus run ops in same order.
+        if num_trainers > 1:
+            assert (use_cuda)
+            # FIXME(gongwb): avoid this set.
+            exec_strategy.num_threads = 1
+
+        if build_strategy is None:
+            build_strategy = BuildStrategy()
+
+        main = main_program
+        main = main if main else framework.default_main_program()
+        if scope == None:
+            scope = executor.global_scope()
+
+        if share_vars_from and not isinstance(share_vars_from,
+                                              ParallelExecutor):
+            raise TypeError("share_vars_from must be ParallelExecutor.")
+
+        local_scopes = share_vars_from.executor.local_scopes(
+        ) if share_vars_from else []
+
+        self.persistable_vars = [
+            v.name for v in [
+                var for var in main.list_vars()
+                if var.persistable and var.type != core.VarDesc.VarType.RAW
+            ]
+        ]
+
+        self.executor = core.ParallelExecutor(
+            self._places,
+            set([
+                cpt.to_text(p.name)
+                for p in main.global_block().iter_parameters()
+                if not p.stop_gradient
+            ]),
+            set(cpt.to_text(var) for var in self.persistable_vars), main.desc,
+            cpt.to_text(loss_name)
+            if loss_name else six.u(''), scope, local_scopes, exec_strategy,
+            build_strategy, num_trainers, trainer_id)
+        self.scope = scope
+
+    def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
         """
-        ParallelExecutor is designed for data parallelism, which focuses on distributing
-        the data across different nodes and every node operates on the data in parallel.
-        If you use ParallelExecutor to run the current program on GPU, the node means GPU
-        device, and ParallelExecutor will get the available GPU device automatically on
-        the current machine. If you use ParallelExecutor to run the current program on CPU,
-        the node means the CPU device, and you can specify the CPU device number by adding
-        'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable
-        is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number
-        of CPUs in the system.
+        Run a parallel executor with fetch_list.
+
+        The feed parameter can be a dict or a list. If feed is a dict, the
+        feed data will be split into multiple devices. If feed is a list, we
+        assume the data has been splitted into multiple devices, the each
+        element in the list will be copied to each device directly.
+
+        For example, if the feed is a dict:
+
+        >>> exe = ParallelExecutor()
+        >>> # the image will be splitted into devices. If there is two devices
+        >>> # each device will process an image with shape (24, 1, 28, 28)
+        >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
+
+        For example, if the feed is a list:
+
+        >>> exe = ParallelExecutor()
+        >>> # each device will process each element in the list.
+        >>> # the 1st device will process an image with shape (48, 1, 28, 28)
+        >>> # the 2nd device will process an image with shape (32, 1, 28, 28)
+        >>> #
+        >>> # you can use exe.device_count to get the device number.
+        >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
+        >>>               {"image": numpy.random.random(size=(32, 1, 28, 28))},
+        >>>              ])
 
         Args:
-            use_cuda (bool): Whether to use CUDA or not.
-            loss_name (str): The loss name must set in training. Default None.
-            main_program (Program): The program that need to run, if not provided,
-                then default_main_program will be used. Default None.
-            share_vars_from(ParallelExecutor): If provide, it will share variables
-                from the specified ParallelExecutor. Default None.
-            exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run
-                the program in ParallelExecutor, for example how many threads are used to
-                execute the program, how many iterations to clean up the temp variables
-                which is generated during execution. For more information, please refer
-                to fluid.ExecutionStrategy. Default None.
-            build_strategy(BuildStrategy): build_strategy is used to control how to
-                build the SSA Graph in ParallelExecutor by setting the property,
-                for example reduce_strategy, gradient_scale_strategy. For more information,
-                please refer to fluid.BuildStrategy. Default None.
-            num_trainers(int): If greater than 1, NCCL will be initialized with
-                multiple rank of nodes, each node should have same number of GPUs.
-                Distributed training will be enabled then. Default 1.
-            trainer_id(int): Must use together with num_trainers. trainer_id is the
-                "rank" of current node starts from 0. Default 0.
-            scope(Scope): scope to run with, default use fluid.global_scope().
+            fetch_list(list): The fetched variable names
+            feed(list|dict|None): The feed variables. If the feed is a dict,
+                tensors in that dict will be splitted into each devices. If
+                the feed is a list, each element of the list will be copied
+                to each device. Default None.
+            feed_dict: Alias for feed parameter, for backward compatibility.
+                This parameter has been deprecated. Default None.
+            return_numpy(bool): Whether converts the fetched tensor to numpy.
+                Default: True.
 
         Returns:
-            ParallelExecutor: The initialized ParallelExecutor object.
+            List: The fetched result list.
 
         Raises:
-            TypeError: If share_vars_from is provided, but not ParallelExecutor object.
+            ValueError: If the feed is a list, but its length is not equal the
+                length of active places, or its element's is not dict.
+
+        NOTES:
+            1. If the feed's type is dict, the number of data that feeds to
+               ParallelExecutor must be bigger than active places. Otherwise,
+               it will throw exception from C++ side. Special attention should be
+               paid to check whether the last batch of the dataset is bigger
+               than active places.
+            2. If active places are more than one, the fetch results for each
+               variable is a list, and each element of this list is the variable of
+               respective active place.
 
         Examples:
             .. code-block:: python
 
-                train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
-                test_exe = fluid.ParallelExecutor(use_cuda=True,
-                                                  main_program=test_program,
-                                                  share_vars_from=train_exe)
-
-                train_loss, = train_exe.run([loss.name], feed=feed_dict)
-                test_loss, = test_exe.run([loss.name], feed=feed_dict)
+                pe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                            loss_name=avg_cost.name,
+                                            main_program=fluid.default_main_program())
+                loss = pe.run(feed=feeder.feed(cur_batch),
+                              fetch_list=[avg_cost.name]))
         """
-
-        def __init__(self,
-                     use_cuda,
-                     loss_name=None,
-                     main_program=None,
-                     share_vars_from=None,
-                     exec_strategy=None,
-                     build_strategy=None,
-                     num_trainers=1,
-                     trainer_id=0,
-                     scope=None):
-            self._places = []
-            self._act_places = []
-            if use_cuda:
-                for i in six.moves.range(core.get_cuda_device_count()):
-                    p = core.Place()
-                    self._act_places.append(core.CUDAPlace(i))
-                    p.set_place(self._act_places[-1])
-                    self._places.append(p)
-            else:
-                cpu_num = int(
-                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                for i in six.moves.range(cpu_num):
-                    p = core.Place()
-                    self._act_places.append(core.CPUPlace())
-                    p.set_place(self._act_places[-1])
-                    self._places.append(p)
-            assert self._places, "no place for execution"
-
-            if exec_strategy is None:
-                exec_strategy = ExecutionStrategy()
-            exec_strategy.use_cuda = use_cuda
-
-            if exec_strategy.num_threads == 0:
-                if use_cuda:
-                    # Experiments on se-resnext shows that too many threads hurt
-                    # performance. Worth tunning for other models in the future.
-                    exec_strategy.num_threads = len(self._places) * 4
-                else:
-                    cpu_num = int(
-                        os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                    exec_strategy.num_threads = cpu_num * 2
-
-            # Set 1 thread num under nccl2 distribute
-            # env to make sure all gpus run ops in same order.
-            if num_trainers > 1:
-                assert (use_cuda)
-                # FIXME(gongwb): avoid this set.
-                exec_strategy.num_threads = 1
-
-            if build_strategy is None:
-                build_strategy = BuildStrategy()
-
-            main = main_program
-            main = main if main else framework.default_main_program()
-            if scope == None:
-                scope = executor.global_scope()
-
-            if share_vars_from and not isinstance(share_vars_from,
-                                                  ParallelExecutor):
-                raise TypeError("share_vars_from must be ParallelExecutor.")
-
-            local_scopes = share_vars_from.executor.local_scopes(
-            ) if share_vars_from else []
-
-            self.persistable_vars = [
-                v.name for v in [
-                    var for var in main.list_vars()
-                    if var.persistable and var.type != core.VarDesc.VarType.RAW
-                ]
-            ]
-
-            self.executor = core.ParallelExecutor(
-                self._places,
-                set([
-                    cpt.to_text(p.name)
-                    for p in main.global_block().iter_parameters()
-                    if not p.stop_gradient
-                ]),
-                set(cpt.to_text(var)
-                    for var in self.persistable_vars), main.desc,
-                cpt.to_text(loss_name)
-                if loss_name else six.u(''), scope, local_scopes, exec_strategy,
-                build_strategy, num_trainers, trainer_id)
-            self.scope = scope
-
-        def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
-            """
-            Run a parallel executor with fetch_list.
-
-            The feed parameter can be a dict or a list. If feed is a dict, the
-            feed data will be split into multiple devices. If feed is a list, we
-            assume the data has been splitted into multiple devices, the each
-            element in the list will be copied to each device directly.
-
-            For example, if the feed is a dict:
-
-            >>> exe = ParallelExecutor()
-            >>> # the image will be splitted into devices. If there is two devices
-            >>> # each device will process an image with shape (24, 1, 28, 28)
-            >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
-
-            For example, if the feed is a list:
-
-            >>> exe = ParallelExecutor()
-            >>> # each device will process each element in the list.
-            >>> # the 1st device will process an image with shape (48, 1, 28, 28)
-            >>> # the 2nd device will process an image with shape (32, 1, 28, 28)
-            >>> #
-            >>> # you can use exe.device_count to get the device number.
-            >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
-            >>>               {"image": numpy.random.random(size=(32, 1, 28, 28))},
-            >>>              ])
-
-            Args:
-                fetch_list(list): The fetched variable names
-                feed(list|dict|None): The feed variables. If the feed is a dict,
-                    tensors in that dict will be splitted into each devices. If
-                    the feed is a list, each element of the list will be copied
-                    to each device. Default None.
-                feed_dict: Alias for feed parameter, for backward compatibility.
-                    This parameter has been deprecated. Default None.
-                return_numpy(bool): Whether converts the fetched tensor to numpy.
-                    Default: True.
-
-            Returns:
-                List: The fetched result list.
-
-            Raises:
-                ValueError: If the feed is a list, but its length is not equal the
-                    length of active places, or its element's is not dict.
-
-            NOTES:
-                1. If the feed's type is dict, the number of data that feeds to
-                   ParallelExecutor must be bigger than active places. Otherwise,
-                   it will throw exception from C++ side. Special attention should be
-                   paid to check whether the last batch of the dataset is bigger
-                   than active places.
-                2. If active places are more than one, the fetch results for each
-                   variable is a list, and each element of this list is the variable of
-                   respective active place.
-
-            Examples:
-                .. code-block:: python
-
-                    pe = fluid.ParallelExecutor(use_cuda=use_cuda,
-                                                loss_name=avg_cost.name,
-                                                main_program=fluid.default_main_program())
-                    loss = pe.run(feed=feeder.feed(cur_batch),
-                                  fetch_list=[avg_cost.name]))
-            """
-            if feed is None and feed_dict is not None:
-                feed = feed_dict
-                print(
-                    "`feed_dict` is deprecated. Please use `feed=`",
-                    file=sys.stderr)
-
-            if isinstance(feed, dict):
-                feed_tensor_dict = dict()
-                for feed_name in feed:
-                    feed_tensor = feed[feed_name]
-                    if not isinstance(feed_tensor, core.LoDTensor):
-                        feed_tensor = core.LoDTensor()
-                        # always set to CPU place, since the tensor need to be splitted
-                        # it is fast in CPU
-                        feed_tensor.set(feed[feed_name], core.CPUPlace())
-                    feed_tensor_dict[feed_name] = feed_tensor
-
-                self.executor.feed_and_split_tensor_into_local_scopes(
-                    feed_tensor_dict)
-            elif isinstance(feed, list) or isinstance(feed, tuple):
-                if len(feed) != len(self._act_places):
-                    raise ValueError(
-                        "Feed a list of tensor, the list should be the same size as places"
-                    )
-
-                res = list()
-
-                for i, each in enumerate(feed):
-                    if not isinstance(each, dict):
-                        raise TypeError(
-                            "Each element of feed list should be a dict")
-                    res_dict = dict()
-                    for feed_name in each:
-                        tensor = each[feed_name]
-                        if not isinstance(tensor, core.LoDTensor):
-                            tmp = core.LoDTensor()
-                            tmp.set(tensor, self._act_places[i])
-                            tensor = tmp
-                        res_dict[feed_name] = tensor
-                    res.append(res_dict)
-                self.executor.feed_tensors_into_local_scopes(res)
-
-            fetch_var_name = '@FETCHED_VAR_NAME@'
-            self.executor.run(fetch_list, fetch_var_name)
-            arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
-
-            if return_numpy:
-                return executor.as_numpy(arr)
-
-            return [arr[i] for i in range(len(arr))]
-
-        @property
-        def device_count(self):
-            return len(self._act_places)
+        if feed is None and feed_dict is not None:
+            feed = feed_dict
+            print(
+                "`feed_dict` is deprecated. Please use `feed=`",
+                file=sys.stderr)
+
+        if isinstance(feed, dict):
+            feed_tensor_dict = dict()
+            for feed_name in feed:
+                feed_tensor = feed[feed_name]
+                if not isinstance(feed_tensor, core.LoDTensor):
+                    feed_tensor = core.LoDTensor()
+                    # always set to CPU place, since the tensor need to be splitted
+                    # it is fast in CPU
+                    feed_tensor.set(feed[feed_name], core.CPUPlace())
+                feed_tensor_dict[feed_name] = feed_tensor
+
+            self.executor.feed_and_split_tensor_into_local_scopes(
+                feed_tensor_dict)
+        elif isinstance(feed, list) or isinstance(feed, tuple):
+            if len(feed) != len(self._act_places):
+                raise ValueError(
+                    "Feed a list of tensor, the list should be the same size as places"
+                )
+
+            res = list()
+
+            for i, each in enumerate(feed):
+                if not isinstance(each, dict):
+                    raise TypeError(
+                        "Each element of feed list should be a dict")
+                res_dict = dict()
+                for feed_name in each:
+                    tensor = each[feed_name]
+                    if not isinstance(tensor, core.LoDTensor):
+                        tmp = core.LoDTensor()
+                        tmp.set(tensor, self._act_places[i])
+                        tensor = tmp
+                    res_dict[feed_name] = tensor
+                res.append(res_dict)
+            self.executor.feed_tensors_into_local_scopes(res)
+
+        fetch_var_name = '@FETCHED_VAR_NAME@'
+        self.executor.run(fetch_list, fetch_var_name)
+        arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
+
+        if return_numpy:
+            return executor.as_numpy(arr)
+
+        return [arr[i] for i in range(len(arr))]
+
+    @property
+    def device_count(self):
+        return len(self._act_places)
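Note (editorial illustration, not part of the patch; the tiny network and variable names are made up, and the layer calls assume the fluid 1.x API): an end-to-end sketch of the CPU path described in the docstring above, where CPU_NUM determines how many places ParallelExecutor creates when use_cuda=False:

```python
# Hypothetical minimal example; batch size (8) is kept larger than CPU_NUM (2)
# per the docstring's note about dict feeds.
import os
import numpy
import paddle.fluid as fluid

os.environ['CPU_NUM'] = '2'  # two CPU "devices"

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

fluid.Executor(fluid.CPUPlace()).run(fluid.default_startup_program())
pe = fluid.ParallelExecutor(use_cuda=False, loss_name=loss.name)
print("devices:", pe.device_count)

feed = {'x': numpy.random.random((8, 13)).astype('float32'),
        'y': numpy.random.random((8, 1)).astype('float32')}
loss_v, = pe.run(fetch_list=[loss.name], feed=feed)  # dict feed is split across places
```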