#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import contextlib
import math
import os
import struct
import sys
import tempfile
import unittest

import numpy

import paddle
import paddle.fluid as fluid
import paddle.static.amp as amp

paddle.enable_static()


def convert_uint16_to_float(in_list):
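    """Widen bfloat16 bit patterns (stored as uint16) back to float32.

    Each value is shifted into the high 16 bits of an IEEE-754 float32,
    e.g. 0x3F80 -> 1.0.
    """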
    in_list = numpy.asarray(in_list)
    out = numpy.vectorize(
        lambda x: struct.unpack('<f', struct.pack('<I', x << 16))[0],
        otypes=[numpy.float32],
    )(in_list.flat)
    return numpy.reshape(out, in_list.shape)


def convert_float_to_uint16(in_list):
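    """Truncate float32 values to bfloat16, returned as raw uint16 bits.

    Keeping only the high 16 bits of each float32 drops the lower mantissa
    bits, e.g. 1.0 -> 0x3F80.
    """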
    out = []
    for x in numpy.nditer(in_list):
        out.append(
            numpy.uint16(struct.unpack('<I', struct.pack('<f', x))[0] >> 16)
        )
    out = numpy.reshape(out, in_list.shape).view(numpy.uint16)
    return out


def train(use_cuda, save_dirname, is_local, use_bf16, pure_bf16):
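    """Train a single-layer linear regression on the UCI housing dataset.

    Optionally uses bf16 mixed precision; saves an inference model to
    save_dirname once the average loss drops below 10.0.
    """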
    x = paddle.static.data(name='x', shape=[-1, 13], dtype='float32')
    x.desc.set_need_check_feed(False)
    y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32')
    y.desc.set_need_check_feed(False)

    if use_bf16:
        if not pure_bf16:
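            # Mixed bf16: only the fc layer is built under the bf16 guard;
            # the loss is computed outside it.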
            with amp.bf16.bf16_guard():
                y_predict = paddle.static.nn.fc(x=x, size=1, activation=None)
            cost = paddle.nn.functional.square_error_cost(
                input=y_predict, label=y
            )
            avg_cost = paddle.mean(cost)
        else:
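            # Pure bf16: the fc layer is built outside the guard and the
            # loss computation inside it.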
            y_predict = paddle.static.nn.fc(x=x, size=1, activation=None)
            with amp.bf16.bf16_guard():
                cost = paddle.nn.functional.square_error_cost(
                    input=y_predict, label=y
                )
                avg_cost = paddle.mean(cost)
    else:
        y_predict = paddle.static.nn.fc(x=x, size=1, activation=None)
        cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y)
        avg_cost = paddle.mean(cost)

    lr = 5e-3 if use_bf16 else 1e-3
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=lr)

    if use_bf16:
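        # Decorate SGD so ops on the bf16 op list run in bfloat16; pure
        # bf16 additionally needs amp_init() before training (see below).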
        sgd_optimizer = amp.bf16.decorate_bf16(
            sgd_optimizer,
            amp_lists=amp.bf16.AutoMixedPrecisionListsBF16(),
            use_bf16_guard=False,
            use_pure_bf16=pure_bf16,
        )
    sgd_optimizer.minimize(
        avg_cost, startup_program=fluid.default_startup_program()
    )

    BATCH_SIZE = 20

    train_reader = paddle.batch(
        paddle.reader.shuffle(paddle.dataset.uci_housing.train(), buf_size=500),
        batch_size=BATCH_SIZE,
    )

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)

    def train_loop(main_program):
        feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
        exe.run(fluid.default_startup_program())
        test_prog = main_program.clone(for_test=True)
        if pure_bf16:
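            # Pure bf16 requires an explicit amp_init() before the first run.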
            sgd_optimizer.amp_init(
                exe.place, test_program=test_prog, use_bf16_test=True
            )

        PASS_NUM = 100
        for pass_id in range(PASS_NUM):
            for data in train_reader():
                (avg_loss_value,) = exe.run(
                    main_program, feed=feeder.feed(data), fetch_list=[avg_cost]
                )
                if avg_loss_value.dtype == numpy.uint16:
                    avg_loss_value = convert_uint16_to_float(avg_loss_value)
                if avg_loss_value[0] < 10.0:
                    if save_dirname is not None:
                        paddle.static.save_inference_model(
                            save_dirname,
                            [x],
                            [y_predict],
                            exe,
                            clip_extra=False,
                        )
                    return
                if math.isnan(float(avg_loss_value)):
                    sys.exit("got NaN loss, training failed.")
        raise AssertionError(
            "Fit a line cost is too large, {0:2.2}".format(avg_loss_value[0])
        )

    if is_local:
        train_loop(fluid.default_main_program())
    else:
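        # Distributed mode: read the pserver/trainer topology from the
        # environment and transpile the program for this node's role.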
        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(
                current_endpoint, pserver_prog
            )
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            train_loop(t.get_trainer_program())


def infer(use_cuda, save_dirname=None, use_bf16=False):
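    """Load the saved inference model and run it on one batch of UCI
    housing test data, printing predictions next to the ground truth."""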
    if save_dirname is None:
        return

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)

    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        # Use paddle.static.load_inference_model to obtain the inference program desc,
        # the feed_target_names (the names of variables that will be fed
        # data using feed operators), and the fetch_targets (variables that
        # we want to obtain data from using fetch operators).
        [
            inference_program,
            feed_target_names,
            fetch_targets,
        ] = paddle.static.load_inference_model(save_dirname, exe)

        # The input must be 2-D with 13 features in the second dimension,
        # and every input value should be >= 0.
        batch_size = 10

        test_reader = paddle.batch(
            paddle.dataset.uci_housing.test(), batch_size=batch_size
        )

        test_data = next(test_reader())
        test_feat = numpy.array([data[0] for data in test_data]).astype(
            "float32"
        )

        if use_bf16:
            test_feat = convert_float_to_uint16(test_feat)

        test_label = numpy.array([data[1] for data in test_data]).astype(
            "float32"
        )

        assert feed_target_names[0] == 'x'
        results = exe.run(
            inference_program,
            feed={feed_target_names[0]: numpy.array(test_feat)},
            fetch_list=fetch_targets,
        )
        if results[0].dtype == numpy.uint16:
            results[0] = convert_uint16_to_float(results[0])
        print("infer shape: ", results[0].shape)
        print("infer results: ", results[0])
        print("ground truth: ", test_label)


def main(use_cuda, is_local=True, use_bf16=False, pure_bf16=False):
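    # Skip silently when the build lacks the required device support.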
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return

    if use_bf16 and not fluid.core.is_compiled_with_mkldnn():
        return

    temp_dir = tempfile.TemporaryDirectory()
    # Directory for saving the trained model
    save_dirname = os.path.join(temp_dir.name, "fit_a_line.inference.model")

    train(use_cuda, save_dirname, is_local, use_bf16, pure_bf16)
    infer(use_cuda, save_dirname, use_bf16)
    temp_dir.cleanup()


class TestFitALineBase(unittest.TestCase):
    @contextlib.contextmanager
    def program_scope_guard(self):
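        # Give each test fresh programs and a fresh scope so no state
        # leaks between test cases.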
        prog = fluid.Program()
        startup_prog = fluid.Program()
        scope = fluid.core.Scope()
        with fluid.scope_guard(scope):
            with fluid.program_guard(prog, startup_prog):
                yield


class TestFitALine(TestFitALineBase):
    def test_cpu(self):
        with self.program_scope_guard():
            main(use_cuda=False)

    def test_cuda(self):
        with self.program_scope_guard():
            main(use_cuda=True)


@unittest.skipIf(
    not fluid.core.supports_bfloat16(), "place does not support BF16 evaluation"
)
class TestFitALineBF16(TestFitALineBase):
    def test_bf16(self):
        with self.program_scope_guard():
            main(use_cuda=False, use_bf16=True)

    def test_pure_bf16(self):
        with self.program_scope_guard():
            main(use_cuda=False, use_bf16=True, pure_bf16=True)


if __name__ == '__main__':
    unittest.main()