#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

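# This test exercises the MNIST digit-recognition example: it builds either an
# MLP or a convolutional network with the static-graph API, trains it briefly,
# saves an inference model, and then reloads the saved model to run inference.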
import math
import os
import sys
import unittest

import numpy

import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core

paddle.enable_static()

BATCH_SIZE = 64


def loss_net(hidden, label):
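    """Append a 10-way softmax classifier; return (prediction, avg_loss, acc)."""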
    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
    loss = paddle.nn.functional.cross_entropy(
        input=prediction, label=label, reduction='none', use_softmax=False
    )
    avg_loss = paddle.mean(loss)
    acc = paddle.static.accuracy(input=prediction, label=label)
    return prediction, avg_loss, acc


def mlp(img, label):
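    """Two tanh fully-connected layers of size 200, followed by loss_net."""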
    hidden = fluid.layers.fc(input=img, size=200, act='tanh')
    hidden = fluid.layers.fc(input=hidden, size=200, act='tanh')
    return loss_net(hidden, label)


def conv_net(img, label):
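    """Two conv-pool blocks (batch norm after the first), followed by loss_net."""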
    conv_pool_1 = fluid.nets.simple_img_conv_pool(
        input=img,
        filter_size=5,
        num_filters=20,
        pool_size=2,
        pool_stride=2,
        act="relu",
    )
    conv_pool_1 = paddle.static.nn.batch_norm(conv_pool_1)
    conv_pool_2 = fluid.nets.simple_img_conv_pool(
        input=conv_pool_1,
        filter_size=5,
        num_filters=50,
        pool_size=2,
        pool_stride=2,
        act="relu",
    )
    return loss_net(conv_pool_2, label)


def train(
    nn_type,
    use_cuda,
    parallel,
    save_dirname=None,
    save_full_dirname=None,
    model_filename=None,
    params_filename=None,
    is_local=True,
):
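    """Build the selected network and train it on MNIST, saving the inference
    model (and optionally the full training model) once accuracy passes the
    threshold. With is_local=False the distributed transpiler path is used."""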
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    if nn_type == 'mlp':
        net_conf = mlp
    else:
        net_conf = conv_net

    if parallel:
        raise NotImplementedError()
    else:
        prediction, avg_loss, acc = net_conf(img, label)

    test_program = fluid.default_main_program().clone(for_test=True)

    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimizer.minimize(avg_loss)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    exe = fluid.Executor(place)

    train_reader = paddle.batch(
        paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=500),
        batch_size=BATCH_SIZE,
    )
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=BATCH_SIZE
    )
    feeder = fluid.DataFeeder(feed_list=[img, label], place=place)

    def train_loop(main_program):
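        """Train, evaluating on the test set every 10 batches; save and return
        once accuracy passes the threshold, otherwise fail after PASS_NUM passes."""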
        exe.run(fluid.default_startup_program())

        PASS_NUM = 100
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                # train a mini-batch, fetch nothing
                exe.run(main_program, feed=feeder.feed(data))
                if (batch_id + 1) % 10 == 0:
                    acc_set = []
                    avg_loss_set = []
                    for test_data in test_reader():
                        acc_np, avg_loss_np = exe.run(
                            program=test_program,
                            feed=feeder.feed(test_data),
                            fetch_list=[acc, avg_loss],
                        )
                        acc_set.append(float(acc_np))
                        avg_loss_set.append(float(avg_loss_np))
                    # get test acc and loss
                    acc_val = numpy.array(acc_set).mean()
                    avg_loss_val = numpy.array(avg_loss_set).mean()
                    if float(acc_val) > 0.2 or pass_id == (PASS_NUM - 1):
                        # The accuracy threshold is kept small so CI finishes quickly
                        if save_dirname is not None:
                            fluid.io.save_inference_model(
                                save_dirname,
                                ["img"],
                                [prediction],
                                exe,
                                model_filename=model_filename,
                                params_filename=params_filename,
                            )
                        if save_full_dirname is not None:
                            fluid.io.save_inference_model(
                                save_full_dirname,
                                [],
                                [],
                                exe,
                                model_filename=model_filename,
                                params_filename=params_filename,
                                export_for_deployment=False,
                            )
                        return
                    else:
                        print(
                            'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.format(
                                pass_id,
                                batch_id + 1,
                                float(avg_loss_val),
                                float(acc_val),
                            )
                        )
                        if math.isnan(float(avg_loss_val)):
                            sys.exit("got NaN loss, training failed.")
        raise AssertionError("Loss of recognize digits is too large")

    if is_local:
        train_loop(fluid.default_main_program())
    else:
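        # Distributed training: read the cluster layout from environment
        # variables and split the program into pserver and trainer programs
        # with DistributeTranspiler.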
        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(
                current_endpoint, pserver_prog
            )
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            train_loop(t.get_trainer_program())


def infer(
    use_cuda, save_dirname=None, model_filename=None, params_filename=None
):
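    """Load the inference model saved by train() and run it on a random
    MNIST-shaped input."""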
    if save_dirname is None:
        return

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)

    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        # Use fluid.io.load_inference_model to obtain the inference program desc,
        # the feed_target_names (the names of variables that will be fed
        # data using feed operators), and the fetch_targets (variables that
        # we want to obtain data from using fetch operators).
        [
            inference_program,
            feed_target_names,
            fetch_targets,
        ] = fluid.io.load_inference_model(
            save_dirname, exe, model_filename, params_filename
        )

        # The input to the conv layer must be 4-D or 5-D.
        # Use normalized image pixels as input data, which should be in the range [-1.0, 1.0].
        batch_size = 1
        tensor_img = numpy.random.uniform(
            -1.0, 1.0, [batch_size, 1, 28, 28]
        ).astype("float32")

        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
        # and results will contain a list of data corresponding to fetch_targets.
        results = exe.run(
            inference_program,
            feed={feed_target_names[0]: tensor_img},
            fetch_list=fetch_targets,
        )
        print("infer results: ", results[0])


def main(use_cuda, parallel, nn_type, combine):
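    """Train the requested network, save it (only for CPU, non-parallel runs),
    and then run inference on the saved model."""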
    save_dirname = None
    save_full_dirname = None
    model_filename = None
    params_filename = None
    if not use_cuda and not parallel:
        save_dirname = "recognize_digits_" + nn_type + ".inference.model"
        save_full_dirname = "recognize_digits_" + nn_type + ".train.model"
        if combine:
            model_filename = "__model_combined__"
            params_filename = "__params_combined__"

    # Call train() with is_local=False to run distributed training.
    train(
        nn_type=nn_type,
        use_cuda=use_cuda,
        parallel=parallel,
        save_dirname=save_dirname,
        save_full_dirname=save_full_dirname,
        model_filename=model_filename,
        params_filename=params_filename,
    )
    infer(
        use_cuda=use_cuda,
        save_dirname=save_dirname,
        model_filename=model_filename,
        params_filename=params_filename,
    )


class TestRecognizeDigits(unittest.TestCase):
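    """Test methods are injected dynamically by inject_test_method()."""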
    pass


def inject_test_method(use_cuda, parallel, nn_type, combine):
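    """Build a test method for the given configuration and attach it to
    TestRecognizeDigits."""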
    def __impl__(self):
        prog = fluid.Program()
        startup_prog = fluid.Program()
        scope = fluid.core.Scope()
        with fluid.scope_guard(scope):
            with fluid.program_guard(prog, startup_prog):
                main(use_cuda, parallel, nn_type, combine)

    fn = 'test_{0}_{1}_{2}_{3}'.format(
        nn_type,
        'cuda' if use_cuda else 'cpu',
        'parallel' if parallel else 'normal',
        'combine' if combine else 'separate',
    )

    setattr(TestRecognizeDigits, fn, __impl__)


def inject_all_tests():
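    """Register a combined-save test for each available device and network
    type, plus two separate-file variants."""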
    for use_cuda in (False, True):
        if use_cuda and not core.is_compiled_with_cuda():
            continue
        for parallel in (False,):
            for nn_type in ('mlp', 'conv'):
                inject_test_method(use_cuda, parallel, nn_type, True)

    # Two unit tests that save parameters as separate files
    inject_test_method(False, False, 'mlp', False)
    inject_test_method(False, False, 'conv', False)


inject_all_tests()

if __name__ == '__main__':
    unittest.main()