test_dist_train.py 5.7 KB
Newer Older
T
typhoonzero 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15 16
from __future__ import print_function

17 18
import os
import time
T
typhoonzero 已提交
19
import unittest
20
from multiprocessing import Process
Y
yi.wu 已提交
21
import signal
22 23

import numpy
T
typhoonzero 已提交
24 25 26

import paddle.fluid as fluid
import paddle.fluid.layers as layers
X
Xin Pan 已提交
27 28 29
from paddle.fluid.layers.io import ListenAndServ
from paddle.fluid.layers.io import Recv
from paddle.fluid.layers.io import Send
S
sneaxiy 已提交
30
import paddle.fluid.layers.ops as ops
31
from dist_test_utils import *
T
typhoonzero 已提交
32

G
gongweibao 已提交
33 34 35 36 37 38
from paddle.fluid import core

# Attribute name under which an op stores its role, and the role value that
# marks an op as an RPC op. Both are attached to the hand-built
# `fetch_barrier` op in `TestSendOp.init_client`.
op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
RPC_OP_ROLE_ATTR_NAME = op_role_attr_name
RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC

T
typhoonzero 已提交
39 40

class TestSendOp(unittest.TestCase):
41

T
typhoonzero 已提交
42
    def test_send(self):
43
        remove_ps_flag(os.getpid())
T
typhoonzero 已提交
44 45 46 47 48 49 50
        # Run init_serv in a thread
        place = fluid.CPUPlace()
        # NOTE: python thread will not work here due to GIL.
        p = Process(target=self.init_serv, args=(place, ))
        p.daemon = True
        p.start()

Y
yi.wu 已提交
51 52 53
        self.ps_timeout = 5
        self._wait_ps_ready(p.pid)

Y
yi.wu 已提交
54
        with open("/tmp/paddle.%d.port" % p.pid, "r") as fn:
T
typhoonzero 已提交
55 56 57 58 59 60
            selected_port = int(fn.readlines()[0])
        self.init_client(place, selected_port)

        self.run_local(place)
        self.assertTrue(numpy.allclose(self.local_out, self.dist_out))

61
        os.kill(p.pid, signal.SIGINT)
T
update  
typhoonzero 已提交
62 63
        p.join()

Y
yi.wu 已提交
64 65 66 67 68 69 70 71 72 73 74 75 76 77
    def _wait_ps_ready(self, pid):
        start_left_time = self.ps_timeout
        sleep_time = 0.5
        while True:
            assert start_left_time >= 0, "wait ps ready failed"
            time.sleep(sleep_time)
            try:
                # the listen_and_serv_op would touch a file which contains the listen port
                # on the /tmp directory until it was ready to process all the RPC call.
                os.stat("/tmp/paddle.%d.port" % pid)
                return
            except os.error:
                start_left_time -= sleep_time

T
typhoonzero 已提交
78 79 80 81
    def init_serv(self, place):
        main = fluid.Program()

        with fluid.program_guard(main):
X
Xin Pan 已提交
82
            serv = ListenAndServ("127.0.0.1:0", ["X"], optimizer_mode=False)
T
typhoonzero 已提交
83
            with serv.do():
84 85 86 87 88 89 90 91
                out_var = main.global_block().create_var(name="scale_0.tmp_0",
                                                         psersistable=True,
                                                         dtype="float32",
                                                         shape=[32, 32])
                x = layers.data(shape=[32, 32],
                                dtype='float32',
                                name="X",
                                append_batch_size=False)
T
typhoonzero 已提交
92
                fluid.initializer.Constant(value=1.0)(x, main.global_block())
S
sneaxiy 已提交
93
                ops._scale(x=x, scale=10.0, out=out_var)
T
typhoonzero 已提交
94 95 96 97 98 99 100

        self.server_exe = fluid.Executor(place)
        self.server_exe.run(main)

    def init_client(self, place, port):
        main = fluid.Program()
        with fluid.program_guard(main):
101 102 103 104 105 106 107 108 109 110 111 112 113 114
            main.global_block().append_op(type="fetch_barrier",
                                          inputs={},
                                          outputs={"Out": []},
                                          attrs={
                                              "endpoints":
                                              ["127.0.0.1:{0}".format(port)],
                                              RPC_OP_ROLE_ATTR_NAME:
                                              RPC_OP_ROLE_ATTR_VALUE
                                          })

            x = layers.data(shape=[32, 32],
                            dtype='float32',
                            name='X',
                            append_batch_size=False)
Z
Zeng Jinle 已提交
115
            x.persistable = True
T
typhoonzero 已提交
116
            fluid.initializer.Constant(value=2.3)(x, main.global_block())
G
gongweibao 已提交
117

T
typhoonzero 已提交
118 119 120 121 122
            get_var = main.global_block().create_var(
                name="scale_0.tmp_0",  # server side var
                dtype="float32",
                persistable=False,
                shape=[32, 32])
Y
yi.wu 已提交
123
            fluid.initializer.Constant(value=2.3)(get_var, main.global_block())
G
gongweibao 已提交
124

125 126
            # NOTE(zjl): `Send` is async send, which means that the sent
            # variable would be needed even though `Send` op runs.
Z
Zeng Jinle 已提交
127
            # Is it a right design? If I do not set `x.persistable = True`,
128
            # this unittest would hang in rpc client after x is deleted.
Z
Zeng Jinle 已提交
129
            #
130 131
            # BTW, `Send` is not a public API to users. So I set
            # `x.persistable = True` to be a hot fix of this unittest.
X
Xin Pan 已提交
132 133
            Send("127.0.0.1:%d" % port, [x])
            o = Recv("127.0.0.1:%d" % port, [get_var])
Y
yi.wu 已提交
134

T
typhoonzero 已提交
135 136 137 138 139 140
        exe = fluid.Executor(place)
        self.dist_out = exe.run(main, fetch_list=o)  # o is a list

    def run_local(self, place):
        main = fluid.Program()
        with fluid.program_guard(main):
141 142 143 144
            x = layers.data(shape=[32, 32],
                            dtype='float32',
                            name='X',
                            append_batch_size=False)
T
typhoonzero 已提交
145 146 147 148 149 150 151 152
            fluid.initializer.Constant(value=2.3)(x, main.global_block())
            o = layers.scale(x=x, scale=10.0)
        exe = fluid.Executor(place)
        self.local_out = exe.run(main, fetch_list=[o])


# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()