# test_dist_base.py
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
import time

import unittest
import os
import sys
import six
import signal
import subprocess
import argparse


class TestDistRunnerBase(object):
    """Base class for model scripts that are run as pserver/trainer processes.

    Child classes implement ``get_model``. ``run_pserver`` and ``run_trainer``
    build on it; ``run_trainer`` prints the first and the last loss on stdout,
    one per line.
    """

    def get_model(self, batch_size=2):
        """Build the model; must be overridden by child classes.

        ``run_trainer`` unpacks the return value as the 6-tuple
        ``(test_program, avg_cost, train_reader, test_reader, batch_acc,
        predict)``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError(
            "get_model should be implemented by child classes.")

    def get_transpiler(self, trainer_id, main_program, pserver_endpoints,
                       trainers, sync_mode):
        """Transpile ``main_program`` and return the ``DistributeTranspiler``.

        All arguments are forwarded unchanged to
        ``DistributeTranspiler.transpile``.
        """
        # NOTE: import fluid until runtime, or else forking processes will cause error.
        import paddle.fluid as fluid
        t = fluid.DistributeTranspiler()
        t.transpile(
            trainer_id=trainer_id,
            program=main_program,
            pservers=pserver_endpoints,
            trainers=trainers,
            sync_mode=sync_mode)
        return t

    def run_pserver(self, args):
        """Build the model and run the pserver program for ``args.current_endpoint``.

        Args:
            args: parsed command line; reads ``mem_opt``, ``trainer_id``,
                ``endpoints``, ``trainers``, ``sync_mode`` and
                ``current_endpoint``.
        """
        import paddle.fluid as fluid
        # Called for its effect on the default programs; the result is unused.
        self.get_model(batch_size=2)
        if args.mem_opt:
            fluid.memory_optimize(fluid.default_main_program())
        t = self.get_transpiler(args.trainer_id,
                                fluid.default_main_program(), args.endpoints,
                                args.trainers, args.sync_mode)
        pserver_prog = t.get_pserver_program(args.current_endpoint)
        startup_prog = t.get_startup_program(args.current_endpoint,
                                             pserver_prog)

        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_prog)
        exe.run(pserver_prog)

    def run_trainer(self, use_cuda, args):
        """Run seven batches and print the first and the last loss.

        Args:
            use_cuda: run on ``CUDAPlace(0)`` when true, else on ``CPUPlace``.
            args: parsed command line; reads ``mem_opt``, ``is_dist``,
                ``trainer_id``, ``endpoints``, ``trainers``, ``sync_mode``
                and ``use_reduce``.
        """
        import paddle.fluid as fluid
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        # Only the loss variable and the test reader are used below.
        _, avg_cost, _, test_reader, _, _ = self.get_model(batch_size=2)
        if args.mem_opt:
            fluid.memory_optimize(fluid.default_main_program())
        if args.is_dist:
            t = self.get_transpiler(args.trainer_id,
                                    fluid.default_main_program(),
                                    args.endpoints, args.trainers,
                                    args.sync_mode)
            trainer_prog = t.get_trainer_program()
        else:
            trainer_prog = fluid.default_main_program()

        startup_exe = fluid.Executor(place)
        startup_exe.run(fluid.default_startup_program())

        strategy = fluid.ExecutionStrategy()
        strategy.num_threads = 1
        strategy.allow_op_delay = False
        build_stra = fluid.BuildStrategy()

        if args.use_reduce:
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
        else:
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

        exe = fluid.ParallelExecutor(
            use_cuda,
            loss_name=avg_cost.name,
            exec_strategy=strategy,
            build_strategy=build_stra)

        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]

        feeder = fluid.DataFeeder(feed_var_list, place)
        reader_generator = test_reader()

        def run_one_batch():
            # Feed the next batch and return the single fetched loss value.
            loss, = exe.run(fetch_list=[avg_cost.name],
                            feed=feeder.feed(next(reader_generator)))
            return loss

        # The launching test reads these two printed lines from stdout.
        print(run_one_batch())
        for _ in range(5):
            run_one_batch()
        print(run_one_batch())


def runtime_main(test_class):
    """Entry point of a model script: parse ``sys.argv`` and run one role.

    Args:
        test_class: class instantiated without arguments; it must provide
            ``run_pserver(args)`` and ``run_trainer(use_cuda, args)``.

    The pserver is only run for ``--role pserver`` combined with
    ``--is_dist``; every other combination runs the trainer.
    """
    parser = argparse.ArgumentParser(description='Run dist test.')
    parser.add_argument(
        '--role', type=str, required=True, choices=['pserver', 'trainer'])
    parser.add_argument('--endpoints', type=str, required=False, default="")
    parser.add_argument('--is_dist', action='store_true')
    parser.add_argument('--trainer_id', type=int, required=False, default=0)
    parser.add_argument('--trainers', type=int, required=False, default=1)
    parser.add_argument(
        '--current_endpoint', type=str, required=False, default="")
    parser.add_argument('--sync_mode', action='store_true')
    parser.add_argument('--mem_opt', action='store_true')
    parser.add_argument('--use_reduce', action='store_true')

    args = parser.parse_args()

    # Imported only after the arguments are known to be valid, so that usage
    # errors and --help are reported without loading paddle.
    import paddle.fluid.core as core

    model = test_class()
    if args.role == "pserver" and args.is_dist:
        model.run_pserver(args)
    else:
        use_cuda = bool(core.is_compiled_with_cuda())
        model.run_trainer(use_cuda, args)

import paddle.compat as cpt
import socket
from contextlib import closing


class TestDistBase(unittest.TestCase):
    """Compare the losses of a local run with those of a distributed run.

    ``check_with_place`` runs a model script once as a single local trainer
    and once as two parameter servers plus two trainers, then asserts that
    the first and the last loss of both runs agree within ``delta``.
    Child classes implement ``_setup_config`` to adjust the settings
    initialised in ``setUp``.
    """

    def _setup_config(self):
        """Hook called at the end of ``setUp``; must be overridden."""
        raise NotImplementedError("tests should have _setup_config implemented")

    def setUp(self):
        """Initialise the default settings, then call ``_setup_config``."""
        self._trainers = 2
        self._pservers = 2
        port0 = self._find_free_port()
        port1 = self._find_free_port()
        # The first socket is already closed when the second port is picked,
        # so the OS may hand out the same port again; both pservers need
        # distinct endpoints.
        while port1 == port0:
            port1 = self._find_free_port()
        self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (port0, port1)
        self._python_interp = "python"
        self._sync_mode = True
        self._mem_opt = False
        self._use_reduce = False
        self._setup_config()

    def _find_free_port(self):
        """Return a port number chosen by the OS; the probe socket is closed."""
        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
            s.bind(('', 0))
            return s.getsockname()[1]

    def start_pserver(self, model_file, check_error_log):
        """Start the two parameter server processes.

        Args:
            model_file: script passed to the interpreter.
            check_error_log: when true, print both commands and send each
                pserver's stderr to ``/tmp/ps0_err.log`` / ``/tmp/ps1_err.log``.

        Returns:
            ``(ps0_proc, ps1_proc, ps0_log, ps1_log)``; the two log entries
            are the opened files, or ``None`` when ``check_error_log`` is
            false. The caller closes the files and stops the processes.
        """
        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
        ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
        ps0_cmd = ps_cmd % \
            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
             self._trainers)
        ps1_cmd = ps_cmd % \
            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
             self._trainers)

        if self._sync_mode:
            ps0_cmd += " --sync_mode"
            ps1_cmd += " --sync_mode"
        if self._mem_opt:
            ps0_cmd += " --mem_opt"
            ps1_cmd += " --mem_opt"

        ps0_pipe = subprocess.PIPE
        ps1_pipe = subprocess.PIPE
        if check_error_log:
            print(ps0_cmd)
            print(ps1_cmd)
            ps0_pipe = open("/tmp/ps0_err.log", "wb")
            ps1_pipe = open("/tmp/ps1_err.log", "wb")

        # NOTE(review): stdout is piped but nothing in this class reads it.
        ps0_proc = subprocess.Popen(
            ps0_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe)
        ps1_proc = subprocess.Popen(
            ps1_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe)

        if not check_error_log:
            return ps0_proc, ps1_proc, None, None
        return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe

    def _wait_ps_ready(self, pid):
        """Block until ``/tmp/paddle.<pid>.port`` exists.

        The file is checked immediately and then every 3 seconds, 51 times
        in total.

        Raises:
            AssertionError: if the file did not appear in time.
        """
        retry_times = 50
        while True:
            try:
                # the listen_and_serv_op would touch a file which contains the listen port
                # on the /tmp directory until it was ready to process all the RPC call.
                os.stat("/tmp/paddle.%d.port" % pid)
                return
            except OSError as e:
                sys.stderr.write('waiting for pserver: %s, left retry %d\n' %
                                 (e, retry_times))
                retry_times -= 1
            if retry_times < 0:
                # Raised explicitly: an assert statement is stripped under -O,
                # which would turn this loop into an endless wait.
                raise AssertionError("wait ps ready failed")
            time.sleep(3)

    def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
        """Run ``model_file`` locally and distributed and compare the losses.

        Args:
            model_file: script passed to the interpreter.
            delta: tolerance passed to ``assertAlmostEqual``.
            check_error_log: when true, raise the GLOG verbosity and write
                the stderr of every child process to a log file in ``/tmp``.
        """
        # TODO(typhoonzero): should auto adapt GPU count on the machine.
        required_envs = {
            "PATH": os.getenv("PATH", ""),
            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
            "FLAGS_cudnn_deterministic": "1",
            "CPU_NUM": "1"
        }

        if check_error_log:
            required_envs["GLOG_v"] = "7"
            required_envs["GLOG_logtostderr"] = "1"

        # Run local to get a base line
        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
        env_local.update(required_envs)
        local_cmd = "%s %s --role trainer" % (self._python_interp, model_file)
        local_err = subprocess.PIPE
        if check_error_log:
            local_err = open("/tmp/trainer.err.log", "wb")
        try:
            local_proc = subprocess.Popen(
                local_cmd.split(" "),
                stdout=subprocess.PIPE,
                stderr=local_err,
                env=env_local)
            # communicate() drains the pipes while waiting; calling wait()
            # first can deadlock once the child fills a pipe buffer.
            out, err = local_proc.communicate()
        finally:
            if check_error_log:
                local_err.close()
        local_ret = cpt.to_text(out)
        sys.stderr.write('local_loss: %s\n' % local_ret)
        sys.stderr.write('local_stderr: %s\n' % err)

        # Run dist train to compare with local results
        ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file,
                                                          check_error_log)
        # Every log file opened for the distributed run; closed in finally.
        log_files = [f for f in (ps0_pipe, ps1_pipe) if f is not None]
        try:
            self._wait_ps_ready(ps0.pid)
            self._wait_ps_ready(ps1.pid)

            ps0_ep, ps1_ep = self._ps_endpoints.split(",")
            tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
            tr0_cmd = tr_cmd % \
                (self._python_interp, model_file, self._ps_endpoints,
                 0, ps0_ep, self._trainers)
            tr1_cmd = tr_cmd % \
                (self._python_interp, model_file, self._ps_endpoints,
                 1, ps1_ep, self._trainers)

            if self._sync_mode:
                tr0_cmd += " --sync_mode"
                tr1_cmd += " --sync_mode"
            if self._mem_opt:
                tr0_cmd += " --mem_opt"
                tr1_cmd += " --mem_opt"
            if self._use_reduce:
                tr0_cmd += " --use_reduce"
                tr1_cmd += " --use_reduce"

            env0 = {"CUDA_VISIBLE_DEVICES": "0"}
            env1 = {"CUDA_VISIBLE_DEVICES": "1"}
            env0.update(required_envs)
            env1.update(required_envs)

            tr0_pipe = subprocess.PIPE
            tr1_pipe = subprocess.PIPE
            if check_error_log:
                print("tr0_cmd:", tr0_cmd)
                print("tr1_cmd:", tr1_cmd)
                tr0_pipe = open("/tmp/tr0_err.log", "wb")
                log_files.append(tr0_pipe)
                tr1_pipe = open("/tmp/tr1_err.log", "wb")
                log_files.append(tr1_pipe)

            tr0_proc = subprocess.Popen(
                tr0_cmd.strip().split(" "),
                stdout=subprocess.PIPE,
                stderr=tr0_pipe,
                env=env0)
            tr1_proc = subprocess.Popen(
                tr1_cmd.strip().split(" "),
                stdout=subprocess.PIPE,
                stderr=tr1_pipe,
                env=env1)

            # Drain both trainers instead of wait()-ing on full pipes. Only
            # trainer 0's output is compared; trainer 1's is discarded.
            out, err = tr0_proc.communicate()
            tr1_proc.communicate()
            sys.stderr.write('dist_stderr: %s\n' % err)
            loss_data0 = cpt.to_text(out)
            sys.stderr.write('dist_loss: %s\n' % loss_data0)

            # NOTE(review): eval() runs on text printed by the child
            # processes. The model script is trusted test code here; do not
            # reuse this parsing for untrusted output.
            lines = loss_data0.split("\n")
            dist_first_loss = eval(lines[0].replace(" ", ","))[0]
            dist_last_loss = eval(lines[1].replace(" ", ","))[0]

            local_lines = local_ret.split("\n")
            local_first_loss = eval(local_lines[0])[0]
            local_last_loss = eval(local_lines[1])[0]
        finally:
            # Always release the log files and stop the pservers, also when
            # a trainer failed or its output could not be parsed.
            for log_file in log_files:
                log_file.close()
            # FIXME: use terminate() instead of sigkill.
            for ps_proc in (ps0, ps1):
                ps_proc.kill()
            for ps_proc in (ps0, ps1):
                ps_proc.wait()
                for stream in (ps_proc.stdout, ps_proc.stderr):
                    if stream is not None:
                        stream.close()

        self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta)
        self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta)