parallel_executor.py 9.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import core
import multiprocessing
import framework
import executor
J
JiayiFeng 已提交
19
import warnings
Y
Yu Yang 已提交
20
import sys
21

Y
yuyang18 已提交
22
__all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy']
Y
yuyang18 已提交
23 24

ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
Y
yuyang18 已提交
25
BuildStrategy = core.ParallelExecutor.BuildStrategy
26 27 28


class ParallelExecutor(object):
X
Xin Pan 已提交
29 30
    def __init__(self,
                 use_cuda,
31 32
                 loss_name=None,
                 main_program=None,
Y
Yu Yang 已提交
33
                 share_vars_from=None,
Y
yuyang18 已提交
34
                 exec_strategy=None,
Y
yuyang18 已提交
35
                 build_strategy=None,
T
typhoonzero 已提交
36
                 num_trainers=1,
37
                 trainer_id=0,
Y
yuyang18 已提交
38
                 **kwargs):
39 40 41 42 43 44 45 46 47 48
        """
        ParallelExecutor can run program in parallel.

        Args:
            use_cuda(bool): Whether to use CUDA or not.
            loss_name(str, default None): The loss name must set in training.
            main_program(Program, default None): The program that need to run,
                if not provided, then default_main_program will be used.
            share_vars_from(ParallelExecutor, default None): If provied,
                it will share variables from the specified ParallelExecutor.
T
typhoonzero 已提交
49
            num_trainers(int, default 1): If greater than 1, NCCL will be
T
typhoonzero 已提交
50 51
                initialized with multpile rank of nodes, each node should have
                same number of GPUs. Distributed training will be enabled then.
T
typhoonzero 已提交
52
            trainer_id(int, default 0): Must use together with num_trainers.
T
typhoonzero 已提交
53
                trainer_id is the "rank" of current node starts from 0.
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71

        Returns:
            A ParallelExecutor object.

        Raises:
            TypeError: If share_vars_from is provided, but not ParallelExecutor
                object.

        Examples:
            .. code-block:: python

              train_exe = fluid.ParallelExecutor(
                  use_cuda=True, loss_name=loss.name)
              test_exe = fluid.ParallelExecutor(
                  use_cuda=True,
                  main_program=test_program,
                  share_vars_from=train_exe)

72 73
              train_loss, = train_exe.run([loss.name], feed=feed_dict)
              test_loss, = test_exe.run([loss.name], feed=feed_dict)
74
        """
Y
yuyang18 已提交
75 76 77 78 79 80 81 82
        if len(kwargs) != 0:
            err_msg = ""
            for key in kwargs:
                if key in dir(ExecutionStrategy):
                    err_msg += \
                        "Setting {0} by constructor is deprecated. Use " \
                        "strategy=ExecutionStrategy(); strategy.{0}=xxx; " \
                        "pe=ParallelExecutor(exec_strategy=strategy) " \
Y
yuyang18 已提交
83 84 85 86 87 88 89 90 91 92
                        "instead.\n ".format(key)
                elif key in dir(BuildStrategy):
                    err_msg += \
                        "Setting {0} by constructor is deprecated. Use " \
                        "strategy=BuildStrategy(); See help(" \
                        "paddle.fluid.ParallelExecutor.BuildStrategy) \n".format(
                            key)
                else:
                    err_msg += "Setting {0} by constructor is deprecated. Use strategy.\n".format(
                        key)
Y
yuyang18 已提交
93
            raise ValueError(err_msg)
94

X
Xin Pan 已提交
95 96
        self._places = []
        self._act_places = []
97 98 99
        if use_cuda:
            for i in xrange(core.get_cuda_device_count()):
                p = core.Place()
X
Xin Pan 已提交
100 101 102
                self._act_places.append(core.CUDAPlace(i))
                p.set_place(self._act_places[-1])
                self._places.append(p)
103
        else:
104
            for i in xrange(min(4, multiprocessing.cpu_count())):
105
                p = core.Place()
L
Luo Tao 已提交
106
                self._act_places.append(core.CPUPlace())
X
Xin Pan 已提交
107 108 109
                p.set_place(self._act_places[-1])
                self._places.append(p)
        assert self._places, "no place for execution"
110

Y
yuyang18 已提交
111 112
        if exec_strategy is None:
            exec_strategy = ExecutionStrategy()
113
        exec_strategy.use_event = use_cuda
Y
yuyang18 已提交
114 115

        if exec_strategy.num_threads == 0:
X
Xin Pan 已提交
116 117 118
            if use_cuda:
                # Experiments on se-resnext shows that too many threads hurt
                # performance. Worth tunning for other models in the future.
Y
yuyang18 已提交
119
                exec_strategy.num_threads = len(self._places) * 2
X
Xin Pan 已提交
120
            else:
Y
yuyang18 已提交
121
                exec_strategy.num_threads = min(
122
                    len(self._places) * 2, multiprocessing.cpu_count())
123

Y
yuyang18 已提交
124 125 126
        if build_strategy is None:
            build_strategy = BuildStrategy()

127 128
        main = main_program
        main = main if main else framework.default_main_program()
129 130
        scope = executor.global_scope()

131 132 133 134 135 136
        if share_vars_from and not isinstance(share_vars_from,
                                              ParallelExecutor):
            raise TypeError("share_vars_from must be ParallelExecutor.")
        local_scopes = share_vars_from.executor.local_scopes(
        ) if share_vars_from else []

T
typhoonzero 已提交
137
        self.persistable_vars = [
138
            v.name
139 140
            for v in filter(
                lambda var: var.persistable and var.type != core.VarDesc.VarType.RAW,
T
typhoonzero 已提交
141
                main.list_vars())
142 143
        ]

144
        self.executor = core.ParallelExecutor(
X
Xin Pan 已提交
145
            self._places,
146 147 148 149
            set([
                p.name for p in main.global_block().iter_parameters()
                if not p.stop_gradient
            ]),
150 151 152
            set(self.persistable_vars), main.desc, loss_name
            if loss_name else '', scope, local_scopes, exec_strategy,
            build_strategy, num_trainers, trainer_id)
153 154
        self.scope = scope

Y
Yu Yang 已提交
155
    def run(self, fetch_list, feed=None, feed_dict=None):
X
Xin Pan 已提交
156
        """
Y
Yu Yang 已提交
157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
        Run a parallel executor with fetch_list.

        The feed parameter can be a dict or a list. If feed is a dict, the
        feed data will be split into multiple devices. If feed is a list, we
        assume the data has been splitted into multiple devices, the each
        element in the list will be copied to each device directly.

        For example, if the feed is a dict:
        >>> exe = ParallelExecutor()
        >>> # the image will be splitted into devices. If there is two devices
        >>> # each device will process an image with shape (24, 1, 28, 28)
        >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})

        For example, if the feed is a list:
        >>> exe = ParallelExecutor()
        >>> # each device will process each element in the list.
        >>> # the 1st device will process an image with shape (48, 1, 28, 28)
        >>> # the 2nd device will process an image with shape (32, 1, 28, 28)
        >>> #
        >>> # you can use exe.device_count to get the device number.
        >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
        >>>               {"image": numpy.random.random(size=(32, 1, 28, 28))},
        >>>              ])

X
Xin Pan 已提交
181

Y
Yu Yang 已提交
182 183
        Args:
            fetch_list(list): The fetched variable names
Y
Yu Yang 已提交
184 185 186 187
            feed(list|dict|None): The feed variables. If the feed is a dict,
                tensors in that dict will be splitted into each devices. If
                the feed is a list, each element of the list will be copied
                to each device.
Y
Yu Yang 已提交
188
            feed_dict: Alias for feed parameter, for backward compatibility.
Y
Yu Yang 已提交
189
                This parameter is deprecated.
Y
Yu Yang 已提交
190 191 192

        Returns: fetched result list.

X
Xin Pan 已提交
193
        """
194
        if feed is None and feed_dict is not None:
J
JiayiFeng 已提交
195
            feed = feed_dict
Y
Yu Yang 已提交
196
            print >> sys.stderr, "`feed_dict` is deprecated. Please use `feed=`"
Y
Yu Yang 已提交
197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232

        if isinstance(feed, dict):
            feed_tensor_dict = dict()
            for feed_name in feed:
                feed_tensor = feed[feed_name]
                if not isinstance(feed_tensor, core.LoDTensor):
                    feed_tensor = core.LoDTensor()
                    # always set to CPU place, since the tensor need to be splitted
                    # it is fast in CPU
                    feed_tensor.set(feed[feed_name], core.CPUPlace())
                feed_tensor_dict[feed_name] = feed_tensor

            self.executor.feed_and_split_tensor_into_local_scopes(
                feed_tensor_dict)
        elif isinstance(feed, list) or isinstance(feed, tuple):
            if len(feed) != len(self._act_places):
                raise ValueError(
                    "Feed a list of tensor, the list should be the same size as places"
                )

            res = list()

            for i, each in enumerate(feed):
                if not isinstance(each, dict):
                    raise TypeError(
                        "Each element of feed list should be a dict")
                res_dict = dict()
                for feed_name in each:
                    tensor = each[feed_name]
                    if not isinstance(tensor, core.LoDTensor):
                        tmp = core.LoDTensor()
                        tmp.set(tensor, self._act_places[i])
                        tensor = tmp
                    res_dict[feed_name] = tensor
                res.append(res_dict)
            self.executor.feed_tensors_into_local_scopes(res)
X
Xin Pan 已提交
233

234
        fetch_var_name = '@FETCHED_VAR_NAME@'
Y
Yu Yang 已提交
235
        self.executor.run(fetch_list, fetch_var_name)
236 237
        arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
        return [arr[i] for i in range(len(arr))]
T
typhoonzero 已提交
238 239 240

    def bcast_params(self):
        self.executor.bcast_params(set(self.persistable_vars))
Y
Yu Yang 已提交
241 242 243 244

    @property
    def device_count(self):
        return len(self._act_places)