test_dist_base.py 12.0 KB
Newer Older
X
Xin Pan 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14 15

from __future__ import print_function
X
Xin Pan 已提交
16 17 18 19 20
import time

import unittest
import os
import sys
M
minqiyang 已提交
21
import six
X
Xin Pan 已提交
22 23
import signal
import subprocess
W
Wu Yi 已提交
24
import argparse
T
typhoonzero 已提交
25 26 27 28 29 30 31 32


class TestDistRunnerBase(object):
    """Per-process runner for a distributed training test.

    A concrete runner overrides ``get_model``; ``run_pserver`` and
    ``run_trainer`` are then driven from the parsed command-line ``args``
    (see ``runtime_main``).
    """

    def get_model(self, batch_size=2):
        """Build the model under test; must be overridden by subclasses."""
        raise NotImplementedError(
            "get_model should be implemented by child classes.")

    def get_transpiler(self, trainer_id, main_program, pserver_endpoints,
                       trainers, sync_mode):
        """Transpile ``main_program`` and return the DistributeTranspiler."""
        # NOTE: import fluid until runtime, or else forking processes will cause error.
        import paddle
        import paddle.fluid as fluid
        transpiler = fluid.DistributeTranspiler()
        transpiler.transpile(
            trainer_id=trainer_id,
            program=main_program,
            pservers=pserver_endpoints,
            trainers=trainers,
            sync_mode=sync_mode)
        return transpiler

    def run_pserver(self, args):
        """Build the model, transpile it and run the pserver program on CPU."""
        import paddle
        import paddle.fluid as fluid
        # Called for its effect on the default programs; the result is unused.
        self.get_model(batch_size=2)
        if args.mem_opt:
            fluid.memory_optimize(fluid.default_main_program())
        transpiler = self.get_transpiler(
            args.trainer_id,
            fluid.default_main_program(), args.endpoints, args.trainers,
            args.sync_mode)
        endpoint = args.current_endpoint
        server_program = transpiler.get_pserver_program(endpoint)
        server_startup = transpiler.get_startup_program(endpoint,
                                                        server_program)
        executor = fluid.Executor(fluid.CPUPlace())
        executor.run(server_startup)
        executor.run(server_program)

    def run_trainer(self, place, args):
        """Run seven steps on ``place`` and print the first and last loss.

        The two printed values are the only stdout output of this method;
        the parent test reads them back from the child's stdout.
        """
        import paddle
        import paddle.fluid as fluid
        model_parts = self.get_model(batch_size=2)
        (test_program, avg_cost, train_reader, test_reader, batch_acc,
         predict) = model_parts
        if args.mem_opt:
            fluid.memory_optimize(fluid.default_main_program())
        if not args.is_dist:
            trainer_prog = fluid.default_main_program()
        else:
            transpiler = self.get_transpiler(
                args.trainer_id,
                fluid.default_main_program(), args.endpoints, args.trainers,
                args.sync_mode)
            trainer_prog = transpiler.get_trainer_program()

        fluid.Executor(place).run(fluid.default_startup_program())

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = 1
        exec_strategy.allow_op_delay = False

        build_strategy = fluid.BuildStrategy()
        if args.use_reduce:
            build_strategy.reduce_strategy = \
                fluid.BuildStrategy.ReduceStrategy.Reduce
        else:
            build_strategy.reduce_strategy = \
                fluid.BuildStrategy.ReduceStrategy.AllReduce

        parallel_exe = fluid.ParallelExecutor(
            True,
            loss_name=avg_cost.name,
            exec_strategy=exec_strategy,
            build_strategy=build_strategy)

        # Feed every data variable declared in the trainer program.
        data_vars = [
            candidate
            for candidate in trainer_prog.global_block().vars.values()
            if candidate.is_data
        ]
        feeder = fluid.DataFeeder(data_vars, place)
        batches = test_reader()

        def run_step():
            # Pull the next batch and return the single fetched loss value.
            batch = next(batches)
            [step_loss] = parallel_exe.run(fetch_list=[avg_cost.name],
                                           feed=feeder.feed(batch))
            return step_loss

        print(run_step())

        for _ in six.moves.xrange(5):
            run_step()

        print(run_step())


def runtime_main(test_class):
    """Command-line entry point for a model file.

    Parses the role and cluster options from ``sys.argv``, instantiates
    ``test_class`` and runs it as a pserver (only when ``--role pserver``
    and ``--is_dist`` are both given) or otherwise as a trainer.
    """
    import paddle
    import paddle.fluid as fluid
    import paddle.fluid.core as core

    cli = argparse.ArgumentParser(description='Run dist test.')
    cli.add_argument(
        '--role', type=str, required=True, choices=['pserver', 'trainer'])
    cli.add_argument('--endpoints', type=str, required=False, default="")
    cli.add_argument('--is_dist', action='store_true')
    cli.add_argument('--trainer_id', type=int, required=False, default=0)
    cli.add_argument('--trainers', type=int, required=False, default=1)
    cli.add_argument(
        '--current_endpoint', type=str, required=False, default="")
    # Remaining options are plain boolean switches.
    for switch in ('--sync_mode', '--mem_opt', '--use_reduce'):
        cli.add_argument(switch, action='store_true')

    args = cli.parse_args()

    model = test_class()
    if args.is_dist and args.role == "pserver":
        model.run_pserver(args)
        return

    # Trainers use GPU 0 when paddle was built with CUDA, else the CPU.
    if core.is_compiled_with_cuda():
        device = fluid.CUDAPlace(0)
    else:
        device = fluid.CPUPlace()
    model.run_trainer(device, args)
X
Xin Pan 已提交
147

M
minqiyang 已提交
148

M
minqiyang 已提交
149
import paddle.compat as cpt
M
minqiyang 已提交
150

X
Xin Pan 已提交
151 152

class TestDistBase(unittest.TestCase):
    """Compare the loss of a local run with a 2-pserver / 2-trainer run.

    Subclasses implement ``_setup_config`` to override the defaults set in
    ``setUp`` (``_sync_mode``, ``_mem_opt``, ``_use_reduce``) and call
    ``check_with_place`` with the path of a model file that can be run as
    a script.
    """

    def _setup_config(self):
        """Hook for subclasses to adjust the test configuration."""
        raise NotImplementedError("tests should have _setup_config implemented")

    def setUp(self):
        """Install the default cluster layout, then apply ``_setup_config``."""
        self._trainers = 2
        self._pservers = 2
        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
        self._python_interp = "python"
        self._sync_mode = True
        self._mem_opt = False
        self._use_reduce = False
        self._setup_config()

    def _required_envs(self, check_error_log):
        """Return the environment shared by every spawned process.

        Variables that are unset in the current process are omitted: a
        ``None`` value in the ``env`` mapping makes ``subprocess.Popen``
        raise instead of starting the child.
        """
        envs = {
            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
            "FLAGS_cudnn_deterministic": "1"
        }
        for name in ("PATH", "PYTHONPATH", "LD_LIBRARY_PATH"):
            value = os.getenv(name)
            if value is not None:
                envs[name] = value

        if check_error_log:
            envs["GLOG_v"] = "7"
            envs["GLOG_logtostderr"] = "1"
        return envs

    def _stop_pserver(self, proc):
        """Kill one pserver process, reap it and close its unread pipes."""
        proc.kill()
        proc.wait()
        for stream in (proc.stdout, proc.stderr):
            if stream is not None:
                stream.close()

    def start_pserver(self, model_file, check_error_log):
        """Spawn the two pserver processes.

        Returns:
            ``(ps0_proc, ps1_proc, ps0_log, ps1_log)``. The two log entries
            are open files (``/tmp/ps0_err.log``, ``/tmp/ps1_err.log``) that
            the caller must close when ``check_error_log`` is true, and
            ``None`` otherwise.
        """
        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
        ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
        ps0_cmd = ps_cmd % \
            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
             self._trainers)
        ps1_cmd = ps_cmd % \
            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
             self._trainers)

        if self._sync_mode:
            ps0_cmd += " --sync_mode"
            ps1_cmd += " --sync_mode"
        if self._mem_opt:
            ps0_cmd += " --mem_opt"
            ps1_cmd += " --mem_opt"

        ps0_pipe = subprocess.PIPE
        ps1_pipe = subprocess.PIPE
        if check_error_log:
            print(ps0_cmd)
            print(ps1_cmd)
            ps0_pipe = open("/tmp/ps0_err.log", "wb")
            ps1_pipe = open("/tmp/ps1_err.log", "wb")

        ps0_proc = subprocess.Popen(
            ps0_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe)
        ps1_proc = subprocess.Popen(
            ps1_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe)

        if not check_error_log:
            return ps0_proc, ps1_proc, None, None
        else:
            return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe

    def _wait_ps_ready(self, pid):
        """Block until the pserver with ``pid`` has published its port file.

        Polls every 3 seconds and raises ``AssertionError`` once the retry
        budget is exhausted.
        """
        retry_times = 50
        while True:
            # Raised explicitly so the limit still applies under ``python -O``.
            if retry_times < 0:
                raise AssertionError("wait ps ready failed")
            time.sleep(3)
            try:
                # the listen_and_serv_op would touch a file which contains the listen port
                # on the /tmp directory until it was ready to process all the RPC call.
                os.stat("/tmp/paddle.%d.port" % pid)
                return
            except os.error as e:
                sys.stderr.write('waiting for pserver: %s, left retry %d\n' %
                                 (e, retry_times))
                retry_times -= 1

    def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
        """Assert that local and distributed training produce close losses.

        Runs ``model_file`` once as a local trainer, then again as two
        trainers against two pservers, and compares the first and last
        loss printed by the local run and by trainer 0.

        Args:
            model_file: path of the model script to execute.
            delta: tolerance passed to ``assertAlmostEqual``.
            check_error_log: when true, enable verbose GLOG output and send
                each child's stderr to a log file under ``/tmp``.
        """
        # TODO(typhoonzero): should auto adapt GPU count on the machine.
        required_envs = self._required_envs(check_error_log)

        # Run local to get a base line
        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
        env_local.update(required_envs)
        local_cmd = "%s %s --role trainer" % (self._python_interp, model_file)
        err_log = None
        local_stderr = subprocess.PIPE
        if check_error_log:
            err_log = open("/tmp/trainer.err.log", "wb")
            local_stderr = err_log
        try:
            local_proc = subprocess.Popen(
                local_cmd.split(" "),
                stdout=subprocess.PIPE,
                stderr=local_stderr,
                env=env_local)
            # communicate() drains the pipes while waiting; calling wait()
            # first can deadlock once the child fills a pipe buffer.
            out, err = local_proc.communicate()
        finally:
            if err_log is not None:
                err_log.close()
        local_ret = cpt.to_text(out)
        sys.stderr.write('local_loss: %s\n' % local_ret)
        sys.stderr.write('local_stderr: %s\n' % err)

        # Run dist train to compare with local results
        ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file,
                                                          check_error_log)
        trainer_logs = []
        try:
            self._wait_ps_ready(ps0.pid)
            self._wait_ps_ready(ps1.pid)

            ps0_ep, ps1_ep = self._ps_endpoints.split(",")
            tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
            tr0_cmd = tr_cmd % \
                (self._python_interp, model_file, self._ps_endpoints,
                 0, ps0_ep, self._trainers)
            tr1_cmd = tr_cmd % \
                (self._python_interp, model_file, self._ps_endpoints,
                 1, ps1_ep, self._trainers)

            if self._sync_mode:
                tr0_cmd += " --sync_mode"
                tr1_cmd += " --sync_mode"
            if self._mem_opt:
                tr0_cmd += " --mem_opt"
                tr1_cmd += " --mem_opt"
            if self._use_reduce:
                tr0_cmd += " --use_reduce"
                tr1_cmd += " --use_reduce"

            env0 = {"CUDA_VISIBLE_DEVICES": "0"}
            env1 = {"CUDA_VISIBLE_DEVICES": "1"}
            env0.update(required_envs)
            env1.update(required_envs)

            tr0_pipe = subprocess.PIPE
            tr1_pipe = subprocess.PIPE
            if check_error_log:
                print("tr0_cmd:", tr0_cmd)
                print("tr1_cmd:", tr1_cmd)
                tr0_pipe = open("/tmp/tr0_err.log", "wb")
                trainer_logs.append(tr0_pipe)
                tr1_pipe = open("/tmp/tr1_err.log", "wb")
                trainer_logs.append(tr1_pipe)

            tr0_proc = subprocess.Popen(
                tr0_cmd.strip().split(" "),
                stdout=subprocess.PIPE,
                stderr=tr0_pipe,
                env=env0)
            tr1_proc = subprocess.Popen(
                tr1_cmd.strip().split(" "),
                stdout=subprocess.PIPE,
                stderr=tr1_pipe,
                env=env1)

            # Only trainer 0's output is compared; trainer 1 is drained
            # afterwards so it is reaped and its pipes are closed.
            out, err = tr0_proc.communicate()
            tr1_proc.communicate()
            sys.stderr.write('dist_stderr: %s\n' % err)
            loss_data0 = cpt.to_text(out)
            sys.stderr.write('dist_loss: %s\n' % loss_data0)

            # NOTE(review): eval() runs whatever the child processes printed.
            # That is output of the test's own model script, but it must
            # never be pointed at output from an untrusted program.
            lines = loss_data0.split("\n")
            dist_first_loss = eval(lines[0].replace(" ", ","))[0]
            dist_last_loss = eval(lines[1].replace(" ", ","))[0]

            local_lines = local_ret.split("\n")
            local_first_loss = eval(local_lines[0])[0]
            local_last_loss = eval(local_lines[1])[0]
        finally:
            # Always release the log files and stop the pservers, so a
            # failure above cannot leave them holding the fixed ports.
            for log_file in trainer_logs:
                log_file.close()
            for log_file in (ps0_pipe, ps1_pipe):
                if log_file is not None:
                    log_file.close()
            # FIXME: use terminate() instead of sigkill.
            for ps_proc in (ps0, ps1):
                self._stop_pserver(ps_proc)

        self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta)
        self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta)