#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
import time

import unittest
import os
import sys
import six
import signal
import subprocess
import argparse
import paddle.compat as cpt


class TestDistRunnerBase(object):
    def get_model(self, batch_size=2):
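        # Subclasses must return the tuple (test_program, avg_cost,
        # train_reader, test_reader, batch_acc, predict), as unpacked in
        # run_trainer below.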
        raise NotImplementedError(
            "get_model should be implemented by child classes.")

    def get_transpiler(self, trainer_id, main_program, pserver_endpoints,
                       trainers, sync_mode):
        # NOTE: import paddle/fluid only at runtime; importing them at module
        # load time breaks the forked subprocesses.
        import paddle
        import paddle.fluid as fluid
        t = fluid.DistributeTranspiler()
        t.transpile(
            trainer_id=trainer_id,
            program=main_program,
            pservers=pserver_endpoints,
            trainers=trainers,
            sync_mode=sync_mode)
        return t

    def run_pserver(self, args):
        import paddle
        import paddle.fluid as fluid
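        # Building the model populates fluid.default_main_program(), which the
        # transpiler below splits into the pserver-side program.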
        self.get_model(batch_size=2)
        t = self.get_transpiler(args.trainer_id,
                                fluid.default_main_program(), args.endpoints,
                                args.trainers, args.sync_mode)
        pserver_prog = t.get_pserver_program(args.current_endpoint)
        startup_prog = t.get_startup_program(args.current_endpoint,
                                             pserver_prog)
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_prog)
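        # The pserver program blocks in listen_and_serv, serving RPC requests
        # until this process is killed by the parent test.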
        exe.run(pserver_prog)

    def run_trainer(self, place, args):
        import paddle
        import paddle.fluid as fluid
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=2)
        if args.is_dist:
            t = self.get_transpiler(args.trainer_id,
                                    fluid.default_main_program(),
                                    args.endpoints, args.trainers,
                                    args.sync_mode)
            trainer_prog = t.get_trainer_program()
        else:
            trainer_prog = fluid.default_main_program()

        startup_exe = fluid.Executor(place)
        startup_exe.run(fluid.default_startup_program())

        strategy = fluid.ExecutionStrategy()
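        # Single-threaded, no op delay: keeps execution deterministic so the
        # local and distributed losses are comparable.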
        strategy.num_threads = 1
        strategy.allow_op_delay = False
        build_stra = fluid.BuildStrategy()

        if args.use_reduce:
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
        else:
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

        exe = fluid.ParallelExecutor(
            True,  # use_cuda, the first positional argument
            loss_name=avg_cost.name,
            exec_strategy=strategy,
            build_strategy=build_stra)

        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]

        feeder = fluid.DataFeeder(feed_var_list, place)
        reader_generator = test_reader()

        # check_with_place in the parent process parses the two loss arrays
        # printed to stdout (first and last), so print only the losses here.
        data = next(reader_generator)
        first_loss, = exe.run(fetch_list=[avg_cost.name],
                              feed=feeder.feed(data))
        print(first_loss)

        for i in six.moves.xrange(5):
            data = next(reader_generator)
            loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))

        data = next(reader_generator)
        last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))
        print(last_loss)


def runtime_main(test_class):
    import paddle
    import paddle.fluid as fluid
    import paddle.fluid.core as core

    parser = argparse.ArgumentParser(description='Run dist test.')
    parser.add_argument(
        '--role', type=str, required=True, choices=['pserver', 'trainer'])
    parser.add_argument('--endpoints', type=str, required=False, default="")
    parser.add_argument('--is_dist', action='store_true')
    parser.add_argument('--trainer_id', type=int, required=False, default=0)
    parser.add_argument('--trainers', type=int, required=False, default=1)
    parser.add_argument(
        '--current_endpoint', type=str, required=False, default="")
    parser.add_argument('--sync_mode', action='store_true')
    parser.add_argument('--mem_opt', action='store_true')
    parser.add_argument('--use_reduce', action='store_true')
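    # Example invocation (illustrative only; start_pserver and
    # check_with_place below build the real command lines):
    #   python your_dist_model.py --role pserver --is_dist --sync_mode \
    #       --endpoints 127.0.0.1:9123,127.0.0.1:9124 \
    #       --current_endpoint 127.0.0.1:9123 --trainer_id 0 --trainers 2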

    args = parser.parse_args()

    model = test_class()
    if args.role == "pserver":
        model.run_pserver(args)
    else:
        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda() \
            else fluid.CPUPlace()
        model.run_trainer(p, args)

class TestDistBase(unittest.TestCase):
    def _setup_config(self):
        raise NotImplementedError("tests should have _setup_config implemented")

    def setUp(self):
        self._trainers = 2
        self._pservers = 2
        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
        self._python_interp = "python"
        self._sync_mode = True
        self._mem_opt = False
        self._use_reduce = False
        self._setup_config()

    def start_pserver(self, model_file, check_error_log):
        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
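        # Build the pserver command lines; the flags match the argparse
        # options defined in runtime_main().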
        ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
        ps0_cmd = ps_cmd % \
            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
             self._trainers)
        ps1_cmd = ps_cmd % \
            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
             self._trainers)

        if self._sync_mode:
            ps0_cmd += " --sync_mode"
            ps1_cmd += " --sync_mode"
        if self._mem_opt:
            ps0_cmd += " --mem_opt"
            ps1_cmd += " --mem_opt"

        # stderr goes to a pipe by default; with check_error_log it is
        # redirected into /tmp log files instead.
        ps0_pipe = subprocess.PIPE
        ps1_pipe = subprocess.PIPE
        if check_error_log:
            print("ps0_cmd:", ps0_cmd)
            print("ps1_cmd:", ps1_cmd)
            ps0_pipe = open("/tmp/ps0_err.log", "wb")
            ps1_pipe = open("/tmp/ps1_err.log", "wb")

        ps0_proc = subprocess.Popen(
            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe)
        ps1_proc = subprocess.Popen(
            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe)

        if not check_error_log:
            return ps0_proc, ps1_proc, None, None
        else:
            return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe

    def _wait_ps_ready(self, pid):
        retry_times = 50
        while True:
            assert retry_times >= 0, "wait ps ready failed"
            time.sleep(3)
            try:
                # listen_and_serv_op writes a file under /tmp containing its
                # listen port once it is ready to process RPC calls.
                os.stat("/tmp/paddle.%d.port" % pid)
                return
            except os.error as e:
                sys.stderr.write('waiting for pserver: %s, left retry %d\n' %
                                 (e, retry_times))
                retry_times -= 1

    def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
        # *ATTENTION* THIS TEST NEEDS AT LEAST 2 GPUs TO RUN
        required_envs = {
            "PATH": os.getenv("PATH"),
            "PYTHONPATH": os.getenv("PYTHONPATH"),
            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
            "FLAGS_cudnn_deterministic": "1"
        }

        if check_error_log:
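            # High glog verbosity, sent to stderr so that the /tmp log files
            # opened below capture the sub-process logs.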
            required_envs["GLOG_v"] = "7"
            required_envs["GLOG_logtostderr"] = "1"

        # Run locally first to get a baseline.
        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
        env_local.update(required_envs)
        local_cmd = "%s %s --role trainer" % (self._python_interp, model_file)
        if not check_error_log:
            local_proc = subprocess.Popen(
                local_cmd.split(" "),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                env=env_local)
        else:
            print("trainer cmd:", local_cmd)
            err_log = open("/tmp/trainer.err.log", "wb")
            local_proc = subprocess.Popen(
                local_cmd.split(" "),
                stdout=subprocess.PIPE,
                stderr=err_log,
                env=env_local)

        # communicate() waits for the process itself; calling wait() first can
        # deadlock once the stdout pipe buffer fills up.
        out, err = local_proc.communicate()
        local_ret = cpt.to_text(out)
        sys.stderr.write('local_loss: %s\n' % local_ret)
        sys.stderr.write('local_stderr: %s\n' % err)

        # Run dist train to compare with local results
        ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file,
                                                          check_error_log)
        self._wait_ps_ready(ps0.pid)
        self._wait_ps_ready(ps1.pid)

        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
        tr0_cmd = tr_cmd % \
            (self._python_interp, model_file, self._ps_endpoints,
             0, ps0_ep, self._trainers)
        tr1_cmd = tr_cmd % \
            (self._python_interp, model_file, self._ps_endpoints,
             1, ps1_ep, self._trainers)

        if self._sync_mode:
            tr0_cmd += " --sync_mode"
            tr1_cmd += " --sync_mode"
        if self._mem_opt:
            tr0_cmd += " --mem_opt"
            tr1_cmd += " --mem_opt"
        if self._use_reduce:
            tr0_cmd += " --use_reduce"
            tr1_cmd += " --use_reduce"

        # Pin each trainer to its own GPU so the two processes do not contend.
        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
        env1 = {"CUDA_VISIBLE_DEVICES": "1"}
        env0.update(required_envs)
        env1.update(required_envs)
        FNULL = open(os.devnull, 'w')

        tr0_pipe = subprocess.PIPE
        tr1_pipe = subprocess.PIPE
        if check_error_log:
            print("tr0_cmd:", tr0_cmd)
            print("tr1_cmd:", tr1_cmd)
            tr0_pipe = open("/tmp/tr0_err.log", "wb")
            tr1_pipe = open("/tmp/tr1_err.log", "wb")

        tr0_proc = subprocess.Popen(
            tr0_cmd.split(" "),
            stdout=subprocess.PIPE,
            stderr=tr0_pipe,
            env=env0)
        tr1_proc = subprocess.Popen(
            tr1_cmd.split(" "),
            stdout=subprocess.PIPE,
            stderr=tr1_pipe,
            env=env1)

        # As with the local run, communicate() both waits and drains stdout,
        # avoiding pipe deadlock; only trainer 0's losses are compared.
        out, err = tr0_proc.communicate()
        tr1_proc.communicate()
        sys.stderr.write('dist_stderr: %s\n' % err)
        loss_data0 = cpt.to_text(out)
        sys.stderr.write('dist_loss: %s\n' % loss_data0)
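        # run_trainer printed two numpy arrays (first and last loss); replacing
        # spaces with commas lets eval() turn each line into a Python list.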
        lines = loss_data0.split("\n")
        dist_first_loss = eval(lines[0].replace(" ", ","))[0]
        dist_last_loss = eval(lines[1].replace(" ", ","))[0]

        local_lines = local_ret.split("\n")
        local_first_loss = eval(local_lines[0])[0]
        local_last_loss = eval(local_lines[1])[0]

        # close trainer file
        if check_error_log:
            tr0_pipe.close()
            tr1_pipe.close()

            ps0_pipe.close()
            ps1_pipe.close()
        # FIXME: use terminate() instead of sigkill.
        os.kill(ps0.pid, signal.SIGKILL)
        os.kill(ps1.pid, signal.SIGKILL)
        ps0.terminate()
        ps1.terminate()
        ps0.wait()
        ps1.wait()
        FNULL.close()

        self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta)
        self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta)
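

# A minimal usage sketch (illustrative only -- the file and class names below
# are hypothetical, modeled on the tests that import this module):
#
#   # dist_example.py: defines the model and hands control to runtime_main.
#   class TestDistExample(TestDistRunnerBase):
#       def get_model(self, batch_size=2):
#           # build the network, then return:
#           # (test_program, avg_cost, train_reader, test_reader,
#           #  batch_acc, predict)
#           ...
#
#   if __name__ == "__main__":
#       runtime_main(TestDistExample)
#
#   # test_dist_example.py: configures the run and compares losses.
#   class TestDistExampleSync(TestDistBase):
#       def _setup_config(self):
#           self._sync_mode = True
#
#       def test_dist_train(self):
#           self.check_with_place("dist_example.py", delta=1e-7)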