#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
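
"""Shared base utilities for the PaddlePaddle distributed-training unit tests.

TestDistRunnerBase provides the pserver/trainer entry points that concrete test
models run inside subprocesses via runtime_main(); TestDistBase launches those
subprocesses and checks that the distributed losses match a local baseline.
"""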

from __future__ import print_function
import time

import unittest
import os
import sys
import six
import signal
import subprocess
import argparse


class TestDistRunnerBase(object):
    def get_model(self, batch_size=2):
        raise NotImplementedError(
            "get_model should be implemented by child classes.")

    def get_transpiler(self, trainer_id, main_program, pserver_endpoints,
                       trainers):
        # NOTE: delay importing fluid until runtime, or else forking processes will fail.
        import paddle
        import paddle.fluid as fluid
        t = fluid.DistributeTranspiler()
        t.transpile(
            trainer_id=trainer_id,
            program=main_program,
            pservers=pserver_endpoints,
            trainers=trainers)
        return t

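    # run_pserver transpiles the model for `current_endpoint` and then blocks
    # executing the listen_and_serv pserver program.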
    def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
                    trainer_id):
        import paddle
        import paddle.fluid as fluid
        self.get_model(batch_size=2)
        t = self.get_transpiler(trainer_id,
                                fluid.default_main_program(), pserver_endpoints,
                                trainers)
        pserver_prog = t.get_pserver_program(current_endpoint)
        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_prog)
        exe.run(pserver_prog)

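    # run_trainer builds the trainer program (transpiled when is_dist=True),
    # runs several minibatches with ParallelExecutor, and prints the first and
    # last loss values for the parent process to parse.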
    def run_trainer(self, place, endpoints, trainer_id, trainers,
                    is_dist=True, use_reduce=False):
        import paddle
        import paddle.fluid as fluid
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=2)
        if is_dist:
            t = self.get_transpiler(trainer_id,
                                    fluid.default_main_program(), endpoints,
                                    trainers)
            trainer_prog = t.get_trainer_program()
        else:
            trainer_prog = fluid.default_main_program()

        startup_exe = fluid.Executor(place)
        startup_exe.run(fluid.default_startup_program())

        strategy = fluid.ExecutionStrategy()
        strategy.num_threads = 1
        strategy.allow_op_delay = False
        build_stra = fluid.BuildStrategy()

        # Gradient aggregation strategy: Reduce shards the aggregation across
        # devices; AllReduce (the default) all-reduces every gradient.
        if use_reduce:
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
        else:
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

        exe = fluid.ParallelExecutor(
            True,
            loss_name=avg_cost.name,
            exec_strategy=strategy,
            build_strategy=build_stra)

        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]

        feeder = fluid.DataFeeder(feed_var_list, place)
        reader_generator = test_reader()

        data = next(reader_generator)
        first_loss, = exe.run(fetch_list=[avg_cost.name],
                              feed=feeder.feed(data))
        print(first_loss)

        for i in six.moves.xrange(5):
            data = next(reader_generator)
            loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))

        data = next(reader_generator)
        last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))
        print(last_loss)


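# runtime_main is invoked by each concrete test-model script: it parses the
# command-line flags that TestDistBase passes to the subprocess and runs the
# requested pserver or trainer role in-process.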
def runtime_main(test_class):
    import paddle
    import paddle.fluid as fluid
    import paddle.fluid.core as core

    parser = argparse.ArgumentParser(description='Run dist test.')
    parser.add_argument(
        '--role', type=str, required=True, choices=['pserver', 'trainer'])
    parser.add_argument('--endpoints', type=str, required=False, default="")
    parser.add_argument('--is_dist', action='store_true')
    parser.add_argument('--trainer_id', type=int, required=False, default=0)
    parser.add_argument('--trainers', type=int, required=False, default=1)
    parser.add_argument(
        '--current_endpoint', type=str, required=False, default="")
    parser.add_argument('--sync_mode', action='store_true')
    parser.add_argument('--mem_opt', action='store_true')
    parser.add_argument('--use_reduce', action='store_true')

    args = parser.parse_args()

    model = test_class()
    if args.role == "pserver":
        model.run_pserver(args.endpoints, args.trainers, args.current_endpoint,
                          args.trainer_id)
    else:
        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
        ) else fluid.CPUPlace()
        model.run_trainer(p, args.endpoints, args.trainer_id, args.trainers,
                          args.is_dist, args.use_reduce)


import paddle.compat as cpt


class TestDistBase(unittest.TestCase):
    def setUp(self):
        self._trainers = 2
        self._pservers = 2
        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
        self._python_interp = "python"
        self._sync_mode = True
        self._mem_opt = False
        self._use_reduce = False
        self._setup_config()
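    # start_pserver launches the two parameter-server subprocesses and returns
    # their Popen handles (plus their stderr log files when check_error_log is set).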
    def start_pserver(self, model_file, check_error_log):
        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
        ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist"
        ps0_cmd = ps_cmd % \
            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
             self._trainers)
        ps1_cmd = ps_cmd % \
            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
             self._trainers)

        if self._sync_mode:
            ps0_cmd += " --sync_mode"
            ps1_cmd += " --sync_mode"
        if self._mem_opt:
            ps0_cmd += " --mem_opt"
            ps1_cmd += " --mem_opt"

        ps0_pipe = subprocess.PIPE
        ps1_pipe = subprocess.PIPE
        if check_error_log:
            print("ps0_cmd:", ps0_cmd)
            print("ps1_cmd:", ps1_cmd)
            ps0_pipe = open("/tmp/ps0_err.log", "wb")
            ps1_pipe = open("/tmp/ps1_err.log", "wb")

        ps0_proc = subprocess.Popen(
            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe)
        ps1_proc = subprocess.Popen(
            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe)

        if not check_error_log:
            return ps0_proc, ps1_proc, None, None
        else:
            return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe

    def _wait_ps_ready(self, pid):
        retry_times = 50
        while True:
            assert retry_times >= 0, "wait ps ready failed"
            time.sleep(3)
            try:
                # listen_and_serv_op writes a file under /tmp containing its
                # listen port once it is ready to handle RPC calls.
                os.stat("/tmp/paddle.%d.port" % pid)
                return
            except OSError as e:
                sys.stderr.write('waiting for pserver: %s, left retry %d\n' %
                                 (e, retry_times))
                retry_times -= 1

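    # check_with_place runs the model once locally and once distributed
    # (2 pservers + 2 trainers), then asserts that the first and last losses
    # agree within `delta`.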
    def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
        # *ATTENTION* THIS TEST NEEDS AT LEAST 2 GPUs TO RUN
        required_envs = {
            "PATH": os.getenv("PATH"),
            "PYTHONPATH": os.getenv("PYTHONPATH"),
            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
            "FLAGS_cudnn_deterministic": "1"
        }

        if check_error_log:
            required_envs["GLOG_v"] = "7"
            required_envs["GLOG_logtostderr"] = "1"

        # Run locally to get a baseline.
        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
        env_local.update(required_envs)
        local_cmd = "%s %s --role trainer --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d" % \
            (self._python_interp, model_file,
             "127.0.0.1:1234", "127.0.0.1:1234", 1)
        if not check_error_log:
            local_proc = subprocess.Popen(
                local_cmd.split(" "),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                env=env_local)
        else:
            print("trainer cmd:", local_cmd)
            err_log = open("/tmp/trainer.err.log", "wb")
            local_proc = subprocess.Popen(
                local_cmd.split(" "),
                stdout=subprocess.PIPE,
                stderr=err_log,
                env=env_local)

        local_proc.wait()
        out, err = local_proc.communicate()
        local_ret = cpt.to_text(out)
        sys.stderr.write('local_loss: %s\n' % local_ret)
        sys.stderr.write('local_stderr: %s\n' % err)

        # Run dist train to compare with local results
        ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file,
                                                          check_error_log)
        self._wait_ps_ready(ps0.pid)
        self._wait_ps_ready(ps1.pid)

        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
        tr0_cmd = tr_cmd % \
            (self._python_interp, model_file, self._ps_endpoints,
             0, ps0_ep, self._trainers)
        tr1_cmd = tr_cmd % \
            (self._python_interp, model_file, self._ps_endpoints,
             1, ps1_ep, self._trainers)

        if self._sync_mode:
            tr0_cmd += " --sync_mode"
            tr1_cmd += " --sync_mode"
        if self._mem_opt:
            tr0_cmd += " --mem_opt"
            tr1_cmd += " --mem_opt"
        if self._use_reduce:
            tr0_cmd += " --use_reduce"
            tr1_cmd += " --use_reduce"

        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
        env1 = {"CUDA_VISIBLE_DEVICES": "1"}
        env0.update(required_envs)
        env1.update(required_envs)
        FNULL = open(os.devnull, 'w')

        tr0_pipe = subprocess.PIPE
        tr1_pipe = subprocess.PIPE
        if check_error_log:
            print("tr0_cmd:", tr0_cmd)
            print("tr1_cmd:", tr1_cmd)
            tr0_pipe = open("/tmp/tr0_err.log", "wb")
            tr1_pipe = open("/tmp/tr1_err.log", "wb")

        tr0_proc = subprocess.Popen(
            tr0_cmd.split(" "),
            stdout=subprocess.PIPE,
            stderr=tr0_pipe,
            env=env0)
        tr1_proc = subprocess.Popen(
            tr1_cmd.split(" "),
            stdout=subprocess.PIPE,
            stderr=tr1_pipe,
            env=env1)

        tr0_proc.wait()
        tr1_proc.wait()
        out, err = tr0_proc.communicate()
        sys.stderr.write('dist_stderr: %s\n' % err)
        loss_data0 = cpt.to_text(out)
        sys.stderr.write('dist_loss: %s\n' % loss_data0)
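        # run_trainer prints the first and last loss on separate lines; parse
        # them back into scalars for comparison.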
        lines = loss_data0.split("\n")
        dist_first_loss = eval(lines[0].replace(" ", ","))[0]
        dist_last_loss = eval(lines[1].replace(" ", ","))[0]

        local_lines = local_ret.split("\n")
        local_first_loss = eval(local_lines[0])[0]
        local_last_loss = eval(local_lines[1])[0]

        # close trainer file
        if check_error_log:
            tr0_pipe.close()
            tr1_pipe.close()

            ps0_pipe.close()
            ps1_pipe.close()
        # FIXME: use terminate() instead of sigkill.
        os.kill(ps0.pid, signal.SIGKILL)
        os.kill(ps1.pid, signal.SIGKILL)
        ps0.terminate()
        ps1.terminate()
        ps0.wait()
        ps1.wait()
        FNULL.close()

        self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta)
        self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta)