#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import time
import unittest
import os
import sys
import signal
import subprocess

import six

import paddle.compat as cpt


class TestDistRunnerBase(object):
    def get_model(self, batch_size=2):
        raise NotImplementedError(
            "get_model should be implemented by child classes.")

    def get_transpiler(self, trainer_id, main_program, pserver_endpoints,
                       trainers, sync_mode):
        # NOTE: import fluid only at runtime, or else forking processes will cause errors.
        import paddle
        import paddle.fluid as fluid
        t = fluid.DistributeTranspiler()
        t.transpile(
            trainer_id=trainer_id,
            program=main_program,
            pservers=pserver_endpoints,
            trainers=trainers,
            sync_mode=sync_mode)
        return t
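
    # A minimal usage sketch (hypothetical endpoints): after transpile(), the
    # same transpiler object yields the per-role program for each process, as
    # run_pserver()/run_trainer() below do:
    #
    #   t = self.get_transpiler(0, fluid.default_main_program(),
    #                           "127.0.0.1:9123,127.0.0.1:9124", 2, True)
    #   trainer_prog = t.get_trainer_program()
    #   pserver_prog = t.get_pserver_program("127.0.0.1:9123")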

    def run_pserver(self,
                    pserver_endpoints,
                    trainers,
                    current_endpoint,
                    trainer_id,
                    sync_mode=True):
        import paddle
        import paddle.fluid as fluid
        self.get_model(batch_size=2)
        t = self.get_transpiler(trainer_id,
                                fluid.default_main_program(), pserver_endpoints,
                                trainers, sync_mode)
        pserver_prog = t.get_pserver_program(current_endpoint)
        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_prog)
        exe.run(pserver_prog)
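
    # A pserver process is normally spawned through the command line parsed by
    # runtime_main() below; a hypothetical manual launch of this role would be:
    #
    #   python dist_se_resnext.py pserver 127.0.0.1:9123,127.0.0.1:9124 \
    #       0 127.0.0.1:9123 2 TRUE TRUE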

    def run_trainer(self,
                    place,
                    endpoints,
                    trainer_id,
                    trainers,
                    is_dist=True,
                    sync_mode=True):
        import paddle
        import paddle.fluid as fluid
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=2)
        if is_dist:
            t = self.get_transpiler(trainer_id,
                                    fluid.default_main_program(), endpoints,
                                    trainers, sync_mode)
            trainer_prog = t.get_trainer_program()
        else:
            trainer_prog = fluid.default_main_program()

        startup_exe = fluid.Executor(place)
        startup_exe.run(fluid.default_startup_program())

        strategy = fluid.ExecutionStrategy()
        strategy.num_threads = 1
        strategy.allow_op_delay = False
        # use CUDA only when the trainer was handed a CUDAPlace
        use_cuda = isinstance(place, fluid.CUDAPlace)
        exe = fluid.ParallelExecutor(
            use_cuda, loss_name=avg_cost.name, exec_strategy=strategy)

        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]

        feeder = fluid.DataFeeder(feed_var_list, place)
        reader_generator = test_reader()

        data = next(reader_generator)
        first_loss, = exe.run(fetch_list=[avg_cost.name],
                              feed=feeder.feed(data))
        print(first_loss)

        for i in six.moves.xrange(5):
            data = next(reader_generator)
            loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))

        data = next(reader_generator)
        last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))
        print(last_loss)


def runtime_main(test_class):
    import paddle
    import paddle.fluid as fluid
    import paddle.fluid.core as core

    if len(sys.argv) != 8:
        print(
            "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]"
        )
        sys.exit(1)
    role = sys.argv[1]
    endpoints = sys.argv[2]
    trainer_id = int(sys.argv[3])
    current_endpoint = sys.argv[4]
    trainers = int(sys.argv[5])
    is_dist = sys.argv[6] == "TRUE"
    sync_mode = sys.argv[7] == "TRUE"

    model = test_class()
    if role == "pserver":
        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id,
                          sync_mode)
    else:
        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda() \
            else fluid.CPUPlace()
        model.run_trainer(p, endpoints, trainer_id, trainers, is_dist,
                          sync_mode)
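

# A hypothetical model script (name assumed, in the style of the
# dist_se_resnext.py mentioned in the usage string above) subclasses
# TestDistRunnerBase, builds its network in get_model(), and hands itself
# to runtime_main() so it can be launched as a pserver or trainer process:
#
#   class DistSeResNeXt2x2(TestDistRunnerBase):
#       def get_model(self, batch_size=2):
#           ...  # build the network; return test_program, avg_cost,
#                # train_reader, test_reader, batch_acc, predict
#
#   if __name__ == "__main__":
#       runtime_main(DistSeResNeXt2x2)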


class TestDistBase(unittest.TestCase):
    def _setup_config(self):
        raise NotImplementedError("tests should have _setup_config implemented")

    def setUp(self):
        self._trainers = 2
        self._pservers = 2
        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
        self._python_interp = "python"
        self._sync_mode = True
        self._setup_config()

    def start_pserver(self, model_file, check_error_log, required_envs):
        sync_mode_str = "TRUE" if self._sync_mode else "FALSE"
        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
        ps0_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \
            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
             self._trainers, sync_mode_str)
        ps1_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \
            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
             self._trainers, sync_mode_str)

        ps0_pipe = subprocess.PIPE
        ps1_pipe = subprocess.PIPE
        if check_error_log:
            required_envs["GLOG_v"] = "7"
            required_envs["GLOG_logtostderr"] = "1"
            print("ps0_cmd:", ps0_cmd)
            print("ps1_cmd:", ps1_cmd)
            ps0_pipe = open("/tmp/ps0_err.log", "wb")
            ps1_pipe = open("/tmp/ps1_err.log", "wb")

        ps0_proc = subprocess.Popen(
            ps0_cmd.split(" "),
            stdout=subprocess.PIPE,
            stderr=ps0_pipe,
            env=required_envs)
        ps1_proc = subprocess.Popen(
            ps1_cmd.split(" "),
            stdout=subprocess.PIPE,
            stderr=ps1_pipe,
            env=required_envs)

        if not check_error_log:
            return ps0_proc, ps1_proc, None, None
        else:
            return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe

    def _wait_ps_ready(self, pid):
        retry_times = 50
        while True:
            assert retry_times >= 0, "timed out waiting for pserver to be ready"
            time.sleep(3)
            try:
                # listen_and_serv_op writes a file under /tmp containing its
                # listen port once it is ready to process RPC calls.
                os.stat("/tmp/paddle.%d.port" % pid)
                return
            except OSError as e:
                sys.stderr.write('waiting for pserver: %s, retries left: %d\n' %
                                 (e, retry_times))
                retry_times -= 1

    def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
        # *ATTENTION* THIS TEST NEEDS AT LEAST 2 GPUS TO RUN
        required_envs = {
            "PATH": os.getenv("PATH"),
            "PYTHONPATH": os.getenv("PYTHONPATH"),
            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
            "FLAGS_cudnn_deterministic": "1"
        }

        if check_error_log:
            required_envs["GLOG_v"] = "7"
            required_envs["GLOG_logtostderr"] = "1"

        # Run local training to get a baseline
        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
        env_local.update(required_envs)
        sync_mode_str = "TRUE" if self._sync_mode else "FALSE"
        local_cmd = "%s %s trainer %s 0 %s %d FALSE %s" % \
            (self._python_interp, model_file,
             "127.0.0.1:1234", "127.0.0.1:1234", 1, sync_mode_str)
        if not check_error_log:
            local_proc = subprocess.Popen(
                local_cmd.split(" "),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                env=env_local)
        else:
            print("trainer cmd:", local_cmd)
            err_log = open("/tmp/trainer.err.log", "wb")
            local_proc = subprocess.Popen(
                local_cmd.split(" "),
                stdout=subprocess.PIPE,
                stderr=err_log,
                env=env_local)

        local_proc.wait()
        out, err = local_proc.communicate()
        local_ret = cpt.to_text(out)
        sys.stderr.write('local_loss: %s\n' % local_ret)
        sys.stderr.write('local_stderr: %s\n' % err)

        # Run dist train to compare with local results
        ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(
            model_file, check_error_log, required_envs)
        self._wait_ps_ready(ps0.pid)
        self._wait_ps_ready(ps1.pid)

        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
        tr0_cmd = "%s %s trainer %s 0 %s %d TRUE %s" % \
            (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
             self._trainers, sync_mode_str)
        tr1_cmd = "%s %s trainer %s 1 %s %d TRUE %s" % \
            (self._python_interp, model_file, self._ps_endpoints, ps1_ep,
             self._trainers, sync_mode_str)

        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
        env1 = {"CUDA_VISIBLE_DEVICES": "1"}
        env0.update(required_envs)
        env1.update(required_envs)

        tr0_pipe = subprocess.PIPE
        tr1_pipe = subprocess.PIPE
        if check_error_log:
            print("tr0_cmd:", tr0_cmd)
            print("tr1_cmd:", tr1_cmd)
            tr0_pipe = open("/tmp/tr0_err.log", "wb")
            tr1_pipe = open("/tmp/tr1_err.log", "wb")

        tr0_proc = subprocess.Popen(
            tr0_cmd.split(" "),
            stdout=subprocess.PIPE,
            stderr=tr0_pipe,
            env=env0)
        tr1_proc = subprocess.Popen(
            tr1_cmd.split(" "),
            stdout=subprocess.PIPE,
            stderr=tr1_pipe,
            env=env1)

        tr0_proc.wait()
        tr1_proc.wait()
        out, err = tr0_proc.communicate()
        sys.stderr.write('dist_stderr: %s\n' % err)
        loss_data0 = cpt.to_text(out)
        sys.stderr.write('dist_loss: %s\n' % loss_data0)
        lines = loss_data0.split("\n")
        dist_first_loss = eval(lines[0].replace(" ", ","))[0]
        dist_last_loss = eval(lines[1].replace(" ", ","))[0]

        local_lines = local_ret.split("\n")
        local_first_loss = eval(local_lines[0])[0]
        local_last_loss = eval(local_lines[1])[0]

        # close the redirected stderr log files
        if check_error_log:
            tr0_pipe.close()
            tr1_pipe.close()

            ps0_pipe.close()
            ps1_pipe.close()
        # FIXME: use terminate() instead of sigkill.
        os.kill(ps0.pid, signal.SIGKILL)
        os.kill(ps1.pid, signal.SIGKILL)

        self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta)
        self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta)
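

# A minimal test-case sketch (hypothetical, assuming a companion
# dist_se_resnext.py model script that calls runtime_main as above):
#
#   class TestDistSeResNeXt2x2(TestDistBase):
#       def _setup_config(self):
#           self._sync_mode = True
#
#       def test_dist_train(self):
#           self.check_with_place("dist_se_resnext.py")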