diff --git a/dygraph/dataloader_test.sh b/dygraph/dataloader_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..8f9c5ada9d7786ba6b8caf126e0803c7de0adf75 --- /dev/null +++ b/dygraph/dataloader_test.sh @@ -0,0 +1,34 @@ +#!/bin/bash +mkdir -p dataloader_test_log + +# single card +python ./mnist/train.py > dataloader_test_log/mnist 2>&1 +python ./resnet/train.py > dataloader_test_log/resnet 2>&1 +python ./se_resnet/train.py > dataloader_test_log/se_resnet 2>&1 +python ./transformer/train.py > dataloader_test_log/transformer 2>&1 + +python ./mnist/train_sp.py > dataloader_test_log/mnist_sp 2>&1 +python ./resnet/train_sp.py > dataloader_test_log/resnet_sp 2>&1 +python ./se_resnet/train_sp.py > dataloader_test_log/se_resnet_sp 2>&1 +python ./transformer/train_sp.py > dataloader_test_log/transformer_sp 2>&1 + +python ./mnist/train_mp.py > dataloader_test_log/mnist_mp 2>&1 +python ./resnet/train_mp.py > dataloader_test_log/resnet_mp 2>&1 +python ./se_resnet/train_mp.py > dataloader_test_log/se_resnet_mp 2>&1 +python ./transformer/train_mp.py > dataloader_test_log/transformer_mp 2>&1 + +# multiple cards +python -m paddle.distributed.launch --log_dir ./dataloader_test_log/mnist_8 ./mnist/train.py --use_data_parallel 1 +python -m paddle.distributed.launch --log_dir ./dataloader_test_log/resnet_8 ./resnet/train.py --use_data_parallel 1 +python -m paddle.distributed.launch --log_dir ./dataloader_test_log/se_resnet_8 ./se_resnet/train.py --use_data_parallel 1 +python -m paddle.distributed.launch --log_dir ./dataloader_test_log/transformer_8 ./transformer/train.py --use_data_parallel 1 + +python -m paddle.distributed.launch --log_dir ./dataloader_test_log/mnist_8_sp ./mnist/train_sp.py --use_data_parallel 1 +python -m paddle.distributed.launch --log_dir ./dataloader_test_log/resnet_8_sp ./resnet/train_sp.py --use_data_parallel 1 +python -m paddle.distributed.launch --log_dir ./dataloader_test_log/se_resnet_8_sp ./se_resnet/train_sp.py --use_data_parallel 1 +python -m paddle.distributed.launch --log_dir ./dataloader_test_log/transformer_8_sp ./transformer/train_sp.py --use_data_parallel 1 + +python -m paddle.distributed.launch --log_dir ./dataloader_test_log/mnist_8_mp ./mnist/train_mp.py --use_data_parallel 1 +python -m paddle.distributed.launch --log_dir ./dataloader_test_log/resnet_8_mp ./resnet/train_mp.py --use_data_parallel 1 +python -m paddle.distributed.launch --log_dir ./dataloader_test_log/se_resnet_8_mp ./se_resnet/train_mp.py --use_data_parallel 1 +python -m paddle.distributed.launch --log_dir ./dataloader_test_log/transformer_8_mp ./transformer/train_mp.py --use_data_parallel 1 \ No newline at end of file diff --git a/dygraph/mnist/train.py b/dygraph/mnist/train.py index f81df8f26458c93c1f658a9bc783d14a3c5b8256..39cc6283bda4b805ee20963113de7a8422c05a23 100644 --- a/dygraph/mnist/train.py +++ b/dygraph/mnist/train.py @@ -15,6 +15,7 @@ from __future__ import print_function import argparse import ast +import time import numpy as np from PIL import Image import os @@ -33,7 +34,7 @@ def parse_args(): default=False, help="The flag indicating whether to use data parallel mode to train the model."
) - parser.add_argument("-e", "--epoch", default=5, type=int, help="set epoch") + parser.add_argument("-e", "--epoch", default=1, type=int, help="set epoch") parser.add_argument("--ce", action="store_true", help="run ce") args = parser.parse_args() return args @@ -200,7 +201,9 @@ def train_mnist(args): test_reader = paddle.batch( paddle.dataset.mnist.test(), batch_size=BATCH_SIZE, drop_last=True) + total_train_time = 0 for epoch in range(epoch_num): + stime = time.time() for batch_id, data in enumerate(train_reader()): dy_x_data = np.array([x[0].reshape(1, 28, 28) for x in data]).astype('float32') @@ -229,6 +232,7 @@ def train_mnist(args): if batch_id % 100 == 0: print("Loss at epoch {} step {}: {:}".format( epoch, batch_id, avg_loss.numpy())) + total_train_time += (time.time() - stime) mnist.eval() test_cost, test_acc = test_mnist(test_reader, mnist, BATCH_SIZE) @@ -249,6 +253,7 @@ def train_mnist(args): inference_mnist() + print("total train time: {} s".format(total_train_time)) if __name__ == '__main__': args = parse_args() diff --git a/dygraph/mnist/train_mp.py b/dygraph/mnist/train_mp.py new file mode 100644 index 0000000000000000000000000000000000000000..0f6bfc14fe4712d43a4d4fef26f9e97b650bab39 --- /dev/null +++ b/dygraph/mnist/train_mp.py @@ -0,0 +1,268 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import argparse +import ast +import time +import numpy as np +from PIL import Image +import os +import paddle +import paddle.fluid as fluid +from paddle.fluid.optimizer import AdamOptimizer +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear +from paddle.fluid.dygraph.base import to_variable + + +def parse_args(): + parser = argparse.ArgumentParser("Training for Mnist.") + parser.add_argument( + "--use_data_parallel", + type=ast.literal_eval, + default=False, + help="The flag indicating whether to use data parallel mode to train the model." 
+ ) + parser.add_argument("-e", "--epoch", default=1, type=int, help="set epoch") + parser.add_argument("--ce", action="store_true", help="run ce") + args = parser.parse_args() + return args + + +class SimpleImgConvPool(fluid.dygraph.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + pool_size, + pool_stride, + pool_padding=0, + pool_type='max', + global_pooling=False, + conv_stride=1, + conv_padding=0, + conv_dilation=1, + conv_groups=1, + act=None, + use_cudnn=False, + param_attr=None, + bias_attr=None): + super(SimpleImgConvPool, self).__init__() + + self._conv2d = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=conv_stride, + padding=conv_padding, + dilation=conv_dilation, + groups=conv_groups, + param_attr=None, + bias_attr=None, + act=act, + use_cudnn=use_cudnn) + + self._pool2d = Pool2D( + pool_size=pool_size, + pool_type=pool_type, + pool_stride=pool_stride, + pool_padding=pool_padding, + global_pooling=global_pooling, + use_cudnn=use_cudnn) + + def forward(self, inputs): + x = self._conv2d(inputs) + x = self._pool2d(x) + return x + + +class MNIST(fluid.dygraph.Layer): + def __init__(self): + super(MNIST, self).__init__() + + self._simple_img_conv_pool_1 = SimpleImgConvPool( + 1, 20, 5, 2, 2, act="relu") + + self._simple_img_conv_pool_2 = SimpleImgConvPool( + 20, 50, 5, 2, 2, act="relu") + + self.pool_2_shape = 50 * 4 * 4 + SIZE = 10 + scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5 + self._fc = Linear(self.pool_2_shape, 10, + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale)), + act="softmax") + + def forward(self, inputs, label=None): + x = self._simple_img_conv_pool_1(inputs) + x = self._simple_img_conv_pool_2(x) + x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape]) + x = self._fc(x) + if label is not None: + acc = fluid.layers.accuracy(input=x, label=label) + return x, acc + else: + return x + +def reader_decorator(reader): + def __reader__(): + for item in reader(): + img = np.array(item[0]).astype('float32').reshape(1, 28, 28) + label = np.array(item[1]).astype('int64').reshape(1) + yield img, label + return __reader__ + +def test_mnist(reader, model, batch_size): + acc_set = [] + avg_loss_set = [] + for batch_id, data in enumerate(reader()): + img, label = data + label.stop_gradient = True + prediction, acc = model(img, label) + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + acc_set.append(float(acc.numpy())) + avg_loss_set.append(float(avg_loss.numpy())) + + # get test acc and loss + acc_val_mean = np.array(acc_set).mean() + avg_loss_val_mean = np.array(avg_loss_set).mean() + + return avg_loss_val_mean, acc_val_mean + + +def inference_mnist(): + place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \ + if args.use_data_parallel else fluid.CUDAPlace(0) + with fluid.dygraph.guard(place): + mnist_infer = MNIST() + # load checkpoint + model_dict, _ = fluid.load_dygraph("save_temp") + mnist_infer.set_dict(model_dict) + print("checkpoint loaded") + + # start evaluate mode + mnist_infer.eval() + + def load_image(file): + im = Image.open(file).convert('L') + im = im.resize((28, 28), Image.ANTIALIAS) + im = np.array(im).reshape(1, 1, 28, 28).astype(np.float32) + im = im / 255.0 * 2.0 - 1.0 + return im + + cur_dir = os.path.dirname(os.path.realpath(__file__)) + tensor_img = load_image(cur_dir + '/image/infer_3.png') + + results = mnist_infer(to_variable(tensor_img)) + lab 
= np.argsort(results.numpy()) + print("Inference result of image/infer_3.png is: %d" % lab[0][-1]) + + +def train_mnist(args): + epoch_num = args.epoch + BATCH_SIZE = 64 + + place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \ + if args.use_data_parallel else fluid.CUDAPlace(0) + with fluid.dygraph.guard(place): + if args.ce: + print("ce mode") + seed = 33 + np.random.seed(seed) + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + if args.use_data_parallel: + strategy = fluid.dygraph.parallel.prepare_context() + mnist = MNIST() + adam = AdamOptimizer(learning_rate=0.001, parameter_list=mnist.parameters()) + if args.use_data_parallel: + mnist = fluid.dygraph.parallel.DataParallel(mnist, strategy) + + train_reader = paddle.batch( + reader_decorator( + paddle.dataset.mnist.train()), + batch_size=BATCH_SIZE, + drop_last=True) + if args.use_data_parallel: + train_reader = fluid.contrib.reader.distributed_batch_reader( + train_reader) + + test_reader = paddle.batch( + reader_decorator( + paddle.dataset.mnist.test()), + batch_size=BATCH_SIZE, + drop_last=True) + + train_loader = fluid.io.DataLoader.from_generator(capacity=10, use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader, places=place) + + test_loader = fluid.io.DataLoader.from_generator(capacity=10, use_multiprocess=True) + test_loader.set_sample_list_generator(test_reader, places=place) + + total_train_time = 0 + for epoch in range(epoch_num): + stime = time.time() + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + + cost, acc = mnist(img, label) + + loss = fluid.layers.cross_entropy(cost, label) + avg_loss = fluid.layers.mean(loss) + + if args.use_data_parallel: + avg_loss = mnist.scale_loss(avg_loss) + avg_loss.backward() + mnist.apply_collective_grads() + else: + avg_loss.backward() + + adam.minimize(avg_loss) + # save checkpoint + mnist.clear_gradients() + if batch_id % 100 == 0: + print("Loss at epoch {} step {}: {:}".format( + epoch, batch_id, avg_loss.numpy())) + total_train_time += (time.time() - stime) + + mnist.eval() + test_cost, test_acc = test_mnist(test_loader, mnist, BATCH_SIZE) + mnist.train() + if args.ce: + print("kpis\ttest_acc\t%s" % test_acc) + print("kpis\ttest_cost\t%s" % test_cost) + print("Loss at epoch {} , Test avg_loss is: {}, acc is: {}".format( + epoch, test_cost, test_acc)) + + save_parameters = (not args.use_data_parallel) or ( + args.use_data_parallel and + fluid.dygraph.parallel.Env().local_rank == 0) + if save_parameters: + fluid.save_dygraph(mnist.state_dict(), "save_temp") + + print("checkpoint saved") + + inference_mnist() + + print("total train time: {} s".format(total_train_time)) + + +if __name__ == '__main__': + args = parse_args() + train_mnist(args) diff --git a/dygraph/mnist/train_sp.py b/dygraph/mnist/train_sp.py new file mode 100644 index 0000000000000000000000000000000000000000..fbdf1c691ec1a564225c55c1a8d2f108b8b03641 --- /dev/null +++ b/dygraph/mnist/train_sp.py @@ -0,0 +1,268 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import argparse +import ast +import time +import numpy as np +from PIL import Image +import os +import paddle +import paddle.fluid as fluid +from paddle.fluid.optimizer import AdamOptimizer +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear +from paddle.fluid.dygraph.base import to_variable + + +def parse_args(): + parser = argparse.ArgumentParser("Training for Mnist.") + parser.add_argument( + "--use_data_parallel", + type=ast.literal_eval, + default=False, + help="The flag indicating whether to use data parallel mode to train the model." + ) + parser.add_argument("-e", "--epoch", default=1, type=int, help="set epoch") + parser.add_argument("--ce", action="store_true", help="run ce") + args = parser.parse_args() + return args + + +class SimpleImgConvPool(fluid.dygraph.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + pool_size, + pool_stride, + pool_padding=0, + pool_type='max', + global_pooling=False, + conv_stride=1, + conv_padding=0, + conv_dilation=1, + conv_groups=1, + act=None, + use_cudnn=False, + param_attr=None, + bias_attr=None): + super(SimpleImgConvPool, self).__init__() + + self._conv2d = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=conv_stride, + padding=conv_padding, + dilation=conv_dilation, + groups=conv_groups, + param_attr=None, + bias_attr=None, + act=act, + use_cudnn=use_cudnn) + + self._pool2d = Pool2D( + pool_size=pool_size, + pool_type=pool_type, + pool_stride=pool_stride, + pool_padding=pool_padding, + global_pooling=global_pooling, + use_cudnn=use_cudnn) + + def forward(self, inputs): + x = self._conv2d(inputs) + x = self._pool2d(x) + return x + + +class MNIST(fluid.dygraph.Layer): + def __init__(self): + super(MNIST, self).__init__() + + self._simple_img_conv_pool_1 = SimpleImgConvPool( + 1, 20, 5, 2, 2, act="relu") + + self._simple_img_conv_pool_2 = SimpleImgConvPool( + 20, 50, 5, 2, 2, act="relu") + + self.pool_2_shape = 50 * 4 * 4 + SIZE = 10 + scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5 + self._fc = Linear(self.pool_2_shape, 10, + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale)), + act="softmax") + + def forward(self, inputs, label=None): + x = self._simple_img_conv_pool_1(inputs) + x = self._simple_img_conv_pool_2(x) + x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape]) + x = self._fc(x) + if label is not None: + acc = fluid.layers.accuracy(input=x, label=label) + return x, acc + else: + return x + +def reader_decorator(reader): + def __reader__(): + for item in reader(): + img = np.array(item[0]).astype('float32').reshape(1, 28, 28) + label = np.array(item[1]).astype('int64').reshape(1) + yield img, label + return __reader__ + +def test_mnist(reader, model, batch_size): + acc_set = [] + avg_loss_set = [] + for batch_id, data in enumerate(reader()): + img, label = data + label.stop_gradient = True + prediction, acc = model(img, label) + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + 
acc_set.append(float(acc.numpy())) + avg_loss_set.append(float(avg_loss.numpy())) + + # get test acc and loss + acc_val_mean = np.array(acc_set).mean() + avg_loss_val_mean = np.array(avg_loss_set).mean() + + return avg_loss_val_mean, acc_val_mean + + +def inference_mnist(): + place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \ + if args.use_data_parallel else fluid.CUDAPlace(0) + with fluid.dygraph.guard(place): + mnist_infer = MNIST() + # load checkpoint + model_dict, _ = fluid.load_dygraph("save_temp") + mnist_infer.set_dict(model_dict) + print("checkpoint loaded") + + # start evaluate mode + mnist_infer.eval() + + def load_image(file): + im = Image.open(file).convert('L') + im = im.resize((28, 28), Image.ANTIALIAS) + im = np.array(im).reshape(1, 1, 28, 28).astype(np.float32) + im = im / 255.0 * 2.0 - 1.0 + return im + + cur_dir = os.path.dirname(os.path.realpath(__file__)) + tensor_img = load_image(cur_dir + '/image/infer_3.png') + + results = mnist_infer(to_variable(tensor_img)) + lab = np.argsort(results.numpy()) + print("Inference result of image/infer_3.png is: %d" % lab[0][-1]) + + +def train_mnist(args): + epoch_num = args.epoch + BATCH_SIZE = 64 + + place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \ + if args.use_data_parallel else fluid.CUDAPlace(0) + with fluid.dygraph.guard(place): + if args.ce: + print("ce mode") + seed = 33 + np.random.seed(seed) + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + if args.use_data_parallel: + strategy = fluid.dygraph.parallel.prepare_context() + mnist = MNIST() + adam = AdamOptimizer(learning_rate=0.001, parameter_list=mnist.parameters()) + if args.use_data_parallel: + mnist = fluid.dygraph.parallel.DataParallel(mnist, strategy) + + train_reader = paddle.batch( + reader_decorator( + paddle.dataset.mnist.train()), + batch_size=BATCH_SIZE, + drop_last=True) + if args.use_data_parallel: + train_reader = fluid.contrib.reader.distributed_batch_reader( + train_reader) + + test_reader = paddle.batch( + reader_decorator( + paddle.dataset.mnist.test()), + batch_size=BATCH_SIZE, + drop_last=True) + + train_loader = fluid.io.DataLoader.from_generator(capacity=10) + train_loader.set_sample_list_generator(train_reader, places=place) + + test_loader = fluid.io.DataLoader.from_generator(capacity=10) + test_loader.set_sample_list_generator(test_reader, places=place) + + total_train_time = 0 + for epoch in range(epoch_num): + stime = time.time() + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + + cost, acc = mnist(img, label) + + loss = fluid.layers.cross_entropy(cost, label) + avg_loss = fluid.layers.mean(loss) + + if args.use_data_parallel: + avg_loss = mnist.scale_loss(avg_loss) + avg_loss.backward() + mnist.apply_collective_grads() + else: + avg_loss.backward() + + adam.minimize(avg_loss) + # save checkpoint + mnist.clear_gradients() + if batch_id % 100 == 0: + print("Loss at epoch {} step {}: {:}".format( + epoch, batch_id, avg_loss.numpy())) + total_train_time += (time.time() - stime) + + mnist.eval() + test_cost, test_acc = test_mnist(test_loader, mnist, BATCH_SIZE) + mnist.train() + if args.ce: + print("kpis\ttest_acc\t%s" % test_acc) + print("kpis\ttest_cost\t%s" % test_cost) + print("Loss at epoch {} , Test avg_loss is: {}, acc is: {}".format( + epoch, test_cost, test_acc)) + + save_parameters = (not args.use_data_parallel) or ( + args.use_data_parallel and + fluid.dygraph.parallel.Env().local_rank == 0) + if 
save_parameters: + fluid.save_dygraph(mnist.state_dict(), "save_temp") + + print("checkpoint saved") + + inference_mnist() + + print("total train time: {} s".format(total_train_time)) + + +if __name__ == '__main__': + args = parse_args() + train_mnist(args) diff --git a/dygraph/parse_dataloader_test_result.py b/dygraph/parse_dataloader_test_result.py new file mode 100644 index 0000000000000000000000000000000000000000..3c11c9a09c2f12343293675826bab2262f0782ca --- /dev/null +++ b/dygraph/parse_dataloader_test_result.py @@ -0,0 +1,40 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import sys +import logging +import paddle.compat as cpt + +TEST_LOG_DIR = './dataloader_test_log' + +def parse_test_result(dirname): + if not os.path.exists(dirname): + logging.warning("log dir does not exist.") + return + + for parent, _, filenames in os.walk(dirname): + for filename in filenames: + filepath = os.path.join(parent, filename) + if not os.path.isfile(filepath): + continue + with open(filepath, 'rb') as f: + for line in f.readlines(): + if line.startswith(b'total train time:'): + print("%s - %s" % (filepath, cpt.to_text(line)[:-1])) + +if __name__ == "__main__": + parse_test_result(TEST_LOG_DIR) \ No newline at end of file diff --git a/dygraph/resnet/train.py b/dygraph/resnet/train.py index d21f650b710c2cbe31415de7b434ccce80f9baf4..cea74c4dd053237ad2bb10aff1480a7fa949f77b 100644 --- a/dygraph/resnet/train.py +++ b/dygraph/resnet/train.py @@ -25,6 +25,7 @@ from paddle.fluid import framework import math import sys +import time IMAGENET1000 = 1281167 base_lr = 0.1 @@ -41,7 +42,7 @@ def parse_args(): help="The flag indicating whether to use data parallel mode to train the model."
) parser.add_argument( - "-e", "--epoch", default=120, type=int, help="set epoch") + "-e", "--epoch", default=1, type=int, help="set epoch") parser.add_argument( "-b", "--batch_size", default=32, type=int, help="set epoch") parser.add_argument("--ce", action="store_true", help="run ce") @@ -309,7 +310,8 @@ def train_resnet(): #file_name = './model/epoch_0.npz' #model_data = np.load( file_name ) - + + total_train_time = 0 for eop in range(epoch): resnet.train() @@ -324,6 +326,7 @@ def train_resnet(): print("load finished") + stime = time.time() for batch_id, data in enumerate(train_reader()): dy_x_data = np.array( [x[0].reshape(3, 224, 224) for x in data]).astype('float32') @@ -365,6 +368,7 @@ def train_resnet(): print( "epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \ ( eop, batch_id, total_loss / total_sample, \ total_acc1 / total_sample, total_acc5 / total_sample)) + total_train_time += (time.time() - stime) if args.ce: print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample)) @@ -383,7 +387,8 @@ def train_resnet(): fluid.save_dygraph(resnet.state_dict(), 'resnet_params') + print("total train time: {} s".format(total_train_time)) -if __name__ == '__main__': +if __name__ == '__main__': train_resnet() diff --git a/dygraph/resnet/train_mp.py b/dygraph/resnet/train_mp.py new file mode 100644 index 0000000000000000000000000000000000000000..a7bf1fe1241f2b27f8cb123730215bbf65033e4e --- /dev/null +++ b/dygraph/resnet/train_mp.py @@ -0,0 +1,397 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import argparse +import ast +import paddle +import paddle.fluid as fluid +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear +from paddle.fluid.dygraph.base import to_variable + +from paddle.fluid import framework + +import math +import sys +import time + +IMAGENET1000 = 1281167 +base_lr = 0.1 +momentum_rate = 0.9 +l2_decay = 1e-4 + + +def parse_args(): + parser = argparse.ArgumentParser("Training for Resnet.") + parser.add_argument( + "--use_data_parallel", + type=ast.literal_eval, + default=False, + help="The flag indicating whether to use data parallel mode to train the model." 
+ ) + parser.add_argument( + "-e", "--epoch", default=1, type=int, help="set epoch") + parser.add_argument( + "-b", "--batch_size", default=32, type=int, help="set epoch") + parser.add_argument("--ce", action="store_true", help="run ce") + args = parser.parse_args() + return args + + +args = parse_args() +batch_size = args.batch_size + + +def optimizer_setting(parameter_list=None): + + total_images = IMAGENET1000 + + step = int(math.ceil(float(total_images) / batch_size)) + + epochs = [30, 60, 90] + bd = [step * e for e in epochs] + + lr = [] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + if fluid.in_dygraph_mode(): + optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=momentum_rate, + regularization=fluid.regularizer.L2Decay(l2_decay), + parameter_list=parameter_list) + else: + optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=momentum_rate, + regularization=fluid.regularizer.L2Decay(l2_decay)) + + + return optimizer + + +class ConvBNLayer(fluid.dygraph.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=False) + + self._batch_norm = BatchNorm(num_filters, act=act) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + + return y + + +class BottleneckBlock(fluid.dygraph.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu') + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu') + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride) + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 4 + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = fluid.layers.elementwise_add(x=short, y=conv2) + + layer_helper = LayerHelper(self.full_name(), act='relu') + return layer_helper.append_activation(y) + + +class ResNet(fluid.dygraph.Layer): + def __init__(self, layers=50, class_dim=102): + super(ResNet, self).__init__() + + self.layers = layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, 1024] + num_filters = [64, 128, 256, 512] + + self.conv = ConvBNLayer( + num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + self.pool2d_max = Pool2D( + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + + self.bottleneck_block_list = [] + for block in range(len(depth)): + shortcut = False + for i in 
range(depth[block]): + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut)) + self.bottleneck_block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = Pool2D( + pool_size=7, pool_type='avg', global_pooling=True) + + self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 4 * 1 * 1 + + import math + stdv = 1.0 / math.sqrt(2048 * 1.0) + + self.out = Linear(self.pool2d_avg_output, + class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + + def forward(self, inputs): + y = self.conv(inputs) + y = self.pool2d_max(y) + for bottleneck_block in self.bottleneck_block_list: + y = bottleneck_block(y) + y = self.pool2d_avg(y) + y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output]) + y = self.out(y) + return y + +def reader_decorator(reader): + def __reader__(): + for item in reader(): + img = np.array(item[0]).astype('float32').reshape(3, 224, 224) + label = np.array(item[1]).astype('int64').reshape(1) + yield img, label + return __reader__ + +def eval(model, data): + + model.eval() + total_loss = 0.0 + total_acc1 = 0.0 + total_acc5 = 0.0 + total_sample = 0 + for batch_id, data in enumerate(data()): + img = data[0] + label = data[1] + label.stop_gradient = True + + out = model(img) + #loss = fluid.layers.cross_entropy(input=out, label=label) + #avg_loss = fluid.layers.mean(x=loss) + + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + + #dy_out = avg_loss.numpy() + + #total_loss += dy_out + total_acc1 += acc_top1.numpy() + total_acc5 += acc_top5.numpy() + total_sample += 1 + + # print("epoch id: %d, batch step: %d, loss: %f" % (eop, batch_id, dy_out)) + if batch_id % 10 == 0: + print("test | batch step %d, acc1 %0.3f acc5 %0.3f" % \ + ( batch_id, total_acc1 / total_sample, total_acc5 / total_sample)) + if args.ce: + print("kpis\ttest_acc1\t%0.3f" % (total_acc1 / total_sample)) + print("kpis\ttest_acc5\t%0.3f" % (total_acc5 / total_sample)) + print("final eval acc1 %0.3f acc5 %0.3f" % \ + (total_acc1 / total_sample, total_acc5 / total_sample)) + + +def train_resnet(): + epoch = args.epoch + place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \ + if args.use_data_parallel else fluid.CUDAPlace(0) + with fluid.dygraph.guard(place): + if args.ce: + print("ce mode") + seed = 33 + np.random.seed(seed) + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + if args.use_data_parallel: + strategy = fluid.dygraph.parallel.prepare_context() + + resnet = ResNet() + optimizer = optimizer_setting(parameter_list=resnet.parameters()) + + if args.use_data_parallel: + resnet = fluid.dygraph.parallel.DataParallel(resnet, strategy) + + train_reader = paddle.batch( + reader_decorator( + paddle.dataset.flowers.train(use_xmap=False)), + batch_size=batch_size, + drop_last=True) + + if args.use_data_parallel: + train_reader = fluid.contrib.reader.distributed_batch_reader( + train_reader) + + test_reader = paddle.batch( + reader_decorator( + paddle.dataset.flowers.test(use_xmap=False)), + batch_size=batch_size, + drop_last=True) + + train_loader = fluid.io.DataLoader.from_generator(capacity=10, use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader, places=place) + + 
test_loader = fluid.io.DataLoader.from_generator(capacity=10, use_multiprocess=True) + test_loader.set_sample_list_generator(test_reader, places=place) + + #file_name = './model/epoch_0.npz' + #model_data = np.load( file_name ) + total_train_time = 0 + for eop in range(epoch): + + resnet.train() + total_loss = 0.0 + total_acc1 = 0.0 + total_acc5 = 0.0 + total_sample = 0 + + #dict_state = resnet.state_dict() + + #resnet.load_dict( model_data ) + + print("load finished") + + stime = time.time() + for batch_id, data in enumerate(train_loader()): + img = data[0] + label = data[1] + label.stop_gradient = True + + out = resnet(img) + loss = fluid.layers.cross_entropy(input=out, label=label) + avg_loss = fluid.layers.mean(x=loss) + + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + + dy_out = avg_loss.numpy() + + if args.use_data_parallel: + avg_loss = resnet.scale_loss(avg_loss) + avg_loss.backward() + resnet.apply_collective_grads() + else: + avg_loss.backward() + + optimizer.minimize(avg_loss) + resnet.clear_gradients() + + total_loss += dy_out + total_acc1 += acc_top1.numpy() + total_acc5 += acc_top5.numpy() + total_sample += 1 + #print("epoch id: %d, batch step: %d, loss: %f" % (eop, batch_id, dy_out)) + if batch_id % 10 == 0: + print( "epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \ + ( eop, batch_id, total_loss / total_sample, \ + total_acc1 / total_sample, total_acc5 / total_sample)) + total_train_time += (time.time() - stime) + + if args.ce: + print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample)) + print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample)) + print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample)) + print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \ + (eop, batch_id, total_loss / total_sample, \ + total_acc1 / total_sample, total_acc5 / total_sample)) + resnet.eval() + eval(resnet, test_loader) + + save_parameters = (not args.use_data_parallel) or ( + args.use_data_parallel and + fluid.dygraph.parallel.Env().local_rank == 0) + if save_parameters: + fluid.save_dygraph(resnet.state_dict(), + 'resnet_params') + + print("total train time: {} s".format(total_train_time)) + +if __name__ == '__main__': + train_resnet() diff --git a/dygraph/resnet/train_sp.py b/dygraph/resnet/train_sp.py new file mode 100644 index 0000000000000000000000000000000000000000..b1cb1aafdfcabfedad498818ec922c26196120a1 --- /dev/null +++ b/dygraph/resnet/train_sp.py @@ -0,0 +1,397 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import argparse +import ast +import paddle +import paddle.fluid as fluid +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear +from paddle.fluid.dygraph.base import to_variable + +from paddle.fluid import framework + +import math +import sys +import time + +IMAGENET1000 = 1281167 +base_lr = 0.1 +momentum_rate = 0.9 +l2_decay = 1e-4 + + +def parse_args(): + parser = argparse.ArgumentParser("Training for Resnet.") + parser.add_argument( + "--use_data_parallel", + type=ast.literal_eval, + default=False, + help="The flag indicating whether to use data parallel mode to train the model." + ) + parser.add_argument( + "-e", "--epoch", default=1, type=int, help="set epoch") + parser.add_argument( + "-b", "--batch_size", default=32, type=int, help="set epoch") + parser.add_argument("--ce", action="store_true", help="run ce") + args = parser.parse_args() + return args + + +args = parse_args() +batch_size = args.batch_size + + +def optimizer_setting(parameter_list=None): + + total_images = IMAGENET1000 + + step = int(math.ceil(float(total_images) / batch_size)) + + epochs = [30, 60, 90] + bd = [step * e for e in epochs] + + lr = [] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + if fluid.in_dygraph_mode(): + optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=momentum_rate, + regularization=fluid.regularizer.L2Decay(l2_decay), + parameter_list=parameter_list) + else: + optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=momentum_rate, + regularization=fluid.regularizer.L2Decay(l2_decay)) + + + return optimizer + + +class ConvBNLayer(fluid.dygraph.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=False) + + self._batch_norm = BatchNorm(num_filters, act=act) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + + return y + + +class BottleneckBlock(fluid.dygraph.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu') + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu') + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride) + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 4 + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = fluid.layers.elementwise_add(x=short, y=conv2) + + layer_helper = LayerHelper(self.full_name(), act='relu') + return layer_helper.append_activation(y) + + +class ResNet(fluid.dygraph.Layer): + def __init__(self, layers=50, class_dim=102): + super(ResNet, self).__init__() + + self.layers = layers 
+ supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, 1024] + num_filters = [64, 128, 256, 512] + + self.conv = ConvBNLayer( + num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + self.pool2d_max = Pool2D( + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + + self.bottleneck_block_list = [] + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut)) + self.bottleneck_block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = Pool2D( + pool_size=7, pool_type='avg', global_pooling=True) + + self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 4 * 1 * 1 + + import math + stdv = 1.0 / math.sqrt(2048 * 1.0) + + self.out = Linear(self.pool2d_avg_output, + class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + + def forward(self, inputs): + y = self.conv(inputs) + y = self.pool2d_max(y) + for bottleneck_block in self.bottleneck_block_list: + y = bottleneck_block(y) + y = self.pool2d_avg(y) + y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output]) + y = self.out(y) + return y + +def reader_decorator(reader): + def __reader__(): + for item in reader(): + img = np.array(item[0]).astype('float32').reshape(3, 224, 224) + label = np.array(item[1]).astype('int64').reshape(1) + yield img, label + return __reader__ + +def eval(model, data): + + model.eval() + total_loss = 0.0 + total_acc1 = 0.0 + total_acc5 = 0.0 + total_sample = 0 + for batch_id, data in enumerate(data()): + img = data[0] + label = data[1] + label.stop_gradient = True + + out = model(img) + #loss = fluid.layers.cross_entropy(input=out, label=label) + #avg_loss = fluid.layers.mean(x=loss) + + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + + #dy_out = avg_loss.numpy() + + #total_loss += dy_out + total_acc1 += acc_top1.numpy() + total_acc5 += acc_top5.numpy() + total_sample += 1 + + # print("epoch id: %d, batch step: %d, loss: %f" % (eop, batch_id, dy_out)) + if batch_id % 10 == 0: + print("test | batch step %d, acc1 %0.3f acc5 %0.3f" % \ + ( batch_id, total_acc1 / total_sample, total_acc5 / total_sample)) + if args.ce: + print("kpis\ttest_acc1\t%0.3f" % (total_acc1 / total_sample)) + print("kpis\ttest_acc5\t%0.3f" % (total_acc5 / total_sample)) + print("final eval acc1 %0.3f acc5 %0.3f" % \ + (total_acc1 / total_sample, total_acc5 / total_sample)) + + +def train_resnet(): + epoch = args.epoch + place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \ + if args.use_data_parallel else fluid.CUDAPlace(0) + with fluid.dygraph.guard(place): + if args.ce: + print("ce mode") + seed = 33 + np.random.seed(seed) + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + if args.use_data_parallel: + strategy = fluid.dygraph.parallel.prepare_context() + + resnet = ResNet() + optimizer = 
optimizer_setting(parameter_list=resnet.parameters()) + + if args.use_data_parallel: + resnet = fluid.dygraph.parallel.DataParallel(resnet, strategy) + + train_reader = paddle.batch( + reader_decorator( + paddle.dataset.flowers.train(use_xmap=False)), + batch_size=batch_size, + drop_last=True) + + if args.use_data_parallel: + train_reader = fluid.contrib.reader.distributed_batch_reader( + train_reader) + + test_reader = paddle.batch( + reader_decorator( + paddle.dataset.flowers.test(use_xmap=False)), + batch_size=batch_size, + drop_last=True) + + train_loader = fluid.io.DataLoader.from_generator(capacity=10) + train_loader.set_sample_list_generator(train_reader, places=place) + + test_loader = fluid.io.DataLoader.from_generator(capacity=10) + test_loader.set_sample_list_generator(test_reader, places=place) + + #file_name = './model/epoch_0.npz' + #model_data = np.load( file_name ) + total_train_time = 0 + for eop in range(epoch): + + resnet.train() + total_loss = 0.0 + total_acc1 = 0.0 + total_acc5 = 0.0 + total_sample = 0 + + #dict_state = resnet.state_dict() + + #resnet.load_dict( model_data ) + + print("load finished") + + stime = time.time() + for batch_id, data in enumerate(train_loader()): + img = data[0] + label = data[1] + label.stop_gradient = True + + out = resnet(img) + loss = fluid.layers.cross_entropy(input=out, label=label) + avg_loss = fluid.layers.mean(x=loss) + + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + + dy_out = avg_loss.numpy() + + if args.use_data_parallel: + avg_loss = resnet.scale_loss(avg_loss) + avg_loss.backward() + resnet.apply_collective_grads() + else: + avg_loss.backward() + + optimizer.minimize(avg_loss) + resnet.clear_gradients() + + total_loss += dy_out + total_acc1 += acc_top1.numpy() + total_acc5 += acc_top5.numpy() + total_sample += 1 + #print("epoch id: %d, batch step: %d, loss: %f" % (eop, batch_id, dy_out)) + if batch_id % 10 == 0: + print( "epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \ + ( eop, batch_id, total_loss / total_sample, \ + total_acc1 / total_sample, total_acc5 / total_sample)) + total_train_time += (time.time() - stime) + + if args.ce: + print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample)) + print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample)) + print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample)) + print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \ + (eop, batch_id, total_loss / total_sample, \ + total_acc1 / total_sample, total_acc5 / total_sample)) + resnet.eval() + eval(resnet, test_loader) + + save_parameters = (not args.use_data_parallel) or ( + args.use_data_parallel and + fluid.dygraph.parallel.Env().local_rank == 0) + if save_parameters: + fluid.save_dygraph(resnet.state_dict(), + 'resnet_params') + + print("total train time: {} s".format(total_train_time)) + +if __name__ == '__main__': + train_resnet() diff --git a/dygraph/se_resnet/train.py b/dygraph/se_resnet/train.py index a0ff8cd643a85b39f69aca9ec7d295a129310669..3957a2aa49b2aa9134854bcd3ef0fc02a448aac5 100644 --- a/dygraph/se_resnet/train.py +++ b/dygraph/se_resnet/train.py @@ -16,6 +16,7 @@ import contextlib import unittest import numpy as np import six +import time import paddle import paddle.fluid as fluid @@ -29,7 +30,7 @@ import argparse import ast parser = argparse.ArgumentParser("Training for Se-ResNeXt.") -parser.add_argument("-e", "--epoch", default=200, type=int, help="set epoch") +parser.add_argument("-e", 
"--epoch", default=1, type=int, help="set epoch") parser.add_argument("--ce", action="store_true", help="run ce") parser.add_argument( "--use_data_parallel", @@ -52,7 +53,7 @@ train_parameters = { "batch_size": batch_size, "lr": 0.0125, "total_images": 6149, - "num_epochs": 200 + "num_epochs": 1 } momentum_rate = 0.9 @@ -99,8 +100,7 @@ class ConvBNLayer(fluid.dygraph.Layer): padding=(filter_size - 1) // 2, groups=groups, act=None, - bias_attr=False, - param_attr=fluid.ParamAttr(name="weights")) + bias_attr=False) self._batch_norm = BatchNorm(num_filters, act=act) @@ -399,11 +399,13 @@ def train(): test_reader = paddle.batch( paddle.dataset.flowers.test(use_xmap=False), batch_size=32) + total_train_time = 0 for epoch_id in range(epoch_num): total_loss = 0.0 total_acc1 = 0.0 total_acc5 = 0.0 total_sample = 0 + stime = time.time() for batch_id, data in enumerate(train_reader()): dy_x_data = np.array([x[0].reshape(3, 224, 224) @@ -446,6 +448,7 @@ def train(): print( "epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f lr %0.5f" % \ ( epoch_id, batch_id, total_loss / total_sample, \ total_acc1 / total_sample, total_acc5 / total_sample, lr)) + total_train_time += (time.time() - stime) if args.ce: print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample)) @@ -458,6 +461,8 @@ def train(): eval(se_resnext, test_reader) se_resnext.train() + print("total train time: {} s".format(total_train_time)) + if __name__ == '__main__': train() diff --git a/dygraph/se_resnet/train_mp.py b/dygraph/se_resnet/train_mp.py new file mode 100644 index 0000000000000000000000000000000000000000..c016f50315742b44dc63a0de37d3a03e292319f7 --- /dev/null +++ b/dygraph/se_resnet/train_mp.py @@ -0,0 +1,469 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import unittest +import numpy as np +import six +import time + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear +from paddle.fluid.dygraph.base import to_variable +import sys +import math +import argparse +import ast + +parser = argparse.ArgumentParser("Training for Se-ResNeXt.") +parser.add_argument("-e", "--epoch", default=1, type=int, help="set epoch") +parser.add_argument("--ce", action="store_true", help="run ce") +parser.add_argument( + "--use_data_parallel", + type=ast.literal_eval, + default=False, + help="The flag indicating whether to use data parallel mode to train the model." 
+) +args = parser.parse_args() +batch_size = 64 +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "cosine_decay", + "batch_size": batch_size, + "epochs": [40, 80, 100], + "steps": [0.1, 0.01, 0.001, 0.0001] + }, + "batch_size": batch_size, + "lr": 0.0125, + "total_images": 6149, + "num_epochs": 1 +} + +momentum_rate = 0.9 +l2_decay = 1.2e-4 + + +def optimizer_setting(params, parameter_list): + ls = params["learning_strategy"] + if "total_images" not in params: + total_images = 6149 + else: + total_images = params["total_images"] + + batch_size = ls["batch_size"] + step = int(math.ceil(float(total_images) / batch_size)) + bd = [step * e for e in ls["epochs"]] + lr = params["lr"] + num_epochs = params["num_epochs"] + optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.cosine_decay( + learning_rate=lr, step_each_epoch=step, epochs=num_epochs), + momentum=momentum_rate, + regularization=fluid.regularizer.L2Decay(l2_decay), + parameter_list=parameter_list) + + return optimizer + + +class ConvBNLayer(fluid.dygraph.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=False) + + self._batch_norm = BatchNorm(num_filters, act=act) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + + return y + + +class SqueezeExcitation(fluid.dygraph.Layer): + def __init__(self, num_channels, reduction_ratio): + + super(SqueezeExcitation, self).__init__() + self._num_channels = num_channels + self._pool = Pool2D(pool_size=0, pool_type='avg', global_pooling=True) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self._fc = Linear( + num_channels, + num_channels // reduction_ratio, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv)), + act='relu') + stdv = 1.0 / math.sqrt(num_channels / 16.0 * 1.0) + self._excitation = Linear( + num_channels // reduction_ratio, + num_channels, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv)), + act='sigmoid') + + def forward(self, input): + y = self._pool(input) + y = fluid.layers.reshape(y, shape=[-1, self._num_channels]) + y = self._fc(y) + y = self._excitation(y) + y = fluid.layers.elementwise_mul(x=input, y=y, axis=0) + return y + + +class BottleneckBlock(fluid.dygraph.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + cardinality, + reduction_ratio, + shortcut=True): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act="relu") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + groups=cardinality, + act="relu") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 2, + filter_size=1, + act=None) + + self.scale = SqueezeExcitation( + num_channels=num_filters * 2, + reduction_ratio=reduction_ratio) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 2, + filter_size=1, + stride=stride) + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 2 + + def forward(self, inputs): + y = 
self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + scale = self.scale(conv2) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = fluid.layers.elementwise_add(x=short, y=scale, act='relu') + return y + + +class SeResNeXt(fluid.dygraph.Layer): + def __init__(self, layers=50, class_dim=102): + super(SeResNeXt, self).__init__() + + self.layers = layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 50: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 6, 3] + num_filters = [128, 256, 512, 1024] + self.conv0 = ConvBNLayer( + num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + self.pool = Pool2D( + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 101: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 23, 3] + num_filters = [128, 256, 512, 1024] + self.conv0 = ConvBNLayer( + num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + self.pool = Pool2D( + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 152: + cardinality = 64 + reduction_ratio = 16 + depth = [3, 8, 36, 3] + num_filters = [128, 256, 512, 1024] + self.conv0 = ConvBNLayer( + num_channels=3, + num_filters=64, + filter_size=3, + stride=2, + act='relu') + self.conv1 = ConvBNLayer( + num_channels=64, + num_filters=64, + filter_size=3, + stride=1, + act='relu') + self.conv2 = ConvBNLayer( + num_channels=64, + num_filters=128, + filter_size=3, + stride=1, + act='relu') + self.pool = Pool2D( + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + + self.bottleneck_block_list = [] + num_channels = 64 + if layers == 152: + num_channels = 128 + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=cardinality, + reduction_ratio=reduction_ratio, + shortcut=shortcut)) + num_channels = bottleneck_block._num_channels_out + self.bottleneck_block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = Pool2D( + pool_size=7, pool_type='avg', global_pooling=True) + stdv = 1.0 / math.sqrt(2048 * 1.0) + + self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 2 * 1 * 1 + + self.out = Linear(self.pool2d_avg_output, + class_dim, + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + + def forward(self, inputs): + if self.layers == 50 or self.layers == 101: + y = self.conv0(inputs) + y = self.pool(y) + elif self.layers == 152: + y = self.conv0(inputs) + y = self.conv1(y) + y = self.conv2(y) + y = self.pool(y) + + for bottleneck_block in self.bottleneck_block_list: + y = bottleneck_block(y) + y = self.pool2d_avg(y) + y = fluid.layers.dropout(y, dropout_prob=0.5, seed=100) + y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output]) + y = self.out(y) + return y + +def reader_decorator(reader): + def __reader__(): + for item in reader(): + img = np.array(item[0]).astype('float32').reshape(3, 224, 224) + label = np.array(item[1]).astype('int64').reshape(1) + yield img, label + return __reader__ + +def eval(model, data): + + model.eval() + batch_size = 32 + total_loss = 0.0 + total_acc1 = 0.0 + total_acc5 = 0.0 + total_sample = 0 + for 
batch_id, data in enumerate(data()): + img, label = data + label.stop_gradient = True + out = model(img) + + softmax_out = fluid.layers.softmax(out, use_cudnn=False) + loss = fluid.layers.cross_entropy(input=softmax_out, label=label) + avg_loss = fluid.layers.mean(x=loss) + acc_top1 = fluid.layers.accuracy(input=softmax_out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=softmax_out, label=label, k=5) + dy_out = avg_loss.numpy() + + total_loss += dy_out + total_acc1 += acc_top1.numpy() + total_acc5 += acc_top5.numpy() + total_sample += 1 + if batch_id % 10 == 0: + print("test | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \ + ( batch_id, total_loss / total_sample, \ + total_acc1 / total_sample, total_acc5 / total_sample)) + + if args.ce: + print("kpis\ttest_acc1\t%0.3f" % (total_acc1 / total_sample)) + print("kpis\ttest_acc5\t%0.3f" % (total_acc5 / total_sample)) + print("kpis\ttest_loss\t%0.3f" % (total_loss / total_sample)) + print("final eval loss %0.3f acc1 %0.3f acc5 %0.3f" % \ + (total_loss / total_sample, \ + total_acc1 / total_sample, total_acc5 / total_sample)) + + +def train(): + + epoch_num = train_parameters["num_epochs"] + if args.ce: + epoch_num = args.epoch + batch_size = train_parameters["batch_size"] + + trainer_count = fluid.dygraph.parallel.Env().nranks + place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \ + if args.use_data_parallel else fluid.CUDAPlace(0) + with fluid.dygraph.guard(place): + if args.ce: + print("ce mode") + seed = 90 + np.random.seed(seed) + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + if args.use_data_parallel: + strategy = fluid.dygraph.parallel.prepare_context() + se_resnext = SeResNeXt() + optimizer = optimizer_setting(train_parameters, se_resnext.parameters()) + if args.use_data_parallel: + se_resnext = fluid.dygraph.parallel.DataParallel(se_resnext, + strategy) + train_reader = paddle.batch( + reader_decorator( + paddle.dataset.flowers.train(use_xmap=False)), + batch_size=batch_size, + drop_last=True) + if args.use_data_parallel: + train_reader = fluid.contrib.reader.distributed_batch_reader( + train_reader) + test_reader = paddle.batch( + reader_decorator( + paddle.dataset.flowers.test(use_xmap=False)), + batch_size=32) + + train_loader = fluid.io.DataLoader.from_generator(capacity=10, use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader, places=place) + + test_loader = fluid.io.DataLoader.from_generator(capacity=10, use_multiprocess=True) + test_loader.set_sample_list_generator(test_reader, places=place) + + total_train_time = 0 + for epoch_id in range(epoch_num): + total_loss = 0.0 + total_acc1 = 0.0 + total_acc5 = 0.0 + total_sample = 0 + stime = time.time() + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + + out = se_resnext(img) + softmax_out = fluid.layers.softmax(out, use_cudnn=False) + loss = fluid.layers.cross_entropy( + input=softmax_out, label=label) + avg_loss = fluid.layers.mean(x=loss) + + acc_top1 = fluid.layers.accuracy( + input=softmax_out, label=label, k=1) + acc_top5 = fluid.layers.accuracy( + input=softmax_out, label=label, k=5) + + dy_out = avg_loss.numpy() + if args.use_data_parallel: + avg_loss = se_resnext.scale_loss(avg_loss) + avg_loss.backward() + se_resnext.apply_collective_grads() + else: + avg_loss.backward() + + optimizer.minimize(avg_loss) + se_resnext.clear_gradients() + + lr = optimizer._global_learning_rate().numpy() + total_loss += dy_out + 
total_acc1 += acc_top1.numpy() + total_acc5 += acc_top5.numpy() + total_sample += 1 + if batch_id % 10 == 0: + print( "epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f lr %0.5f" % \ + ( epoch_id, batch_id, total_loss / total_sample, \ + total_acc1 / total_sample, total_acc5 / total_sample, lr)) + total_train_time += (time.time() - stime) + + if args.ce: + print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample)) + print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample)) + print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample)) + print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \ + (epoch_id, batch_id, total_loss / total_sample, \ + total_acc1 / total_sample, total_acc5 / total_sample)) + se_resnext.eval() + eval(se_resnext, test_loader) + se_resnext.train() + + print("total train time: {} s".format(total_train_time)) + + +if __name__ == '__main__': + train() diff --git a/dygraph/se_resnet/train_sp.py b/dygraph/se_resnet/train_sp.py new file mode 100644 index 0000000000000000000000000000000000000000..f813941611840eefa93801cdc1812e266ac55cdf --- /dev/null +++ b/dygraph/se_resnet/train_sp.py @@ -0,0 +1,469 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import unittest +import numpy as np +import six +import time + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear +from paddle.fluid.dygraph.base import to_variable +import sys +import math +import argparse +import ast + +parser = argparse.ArgumentParser("Training for Se-ResNeXt.") +parser.add_argument("-e", "--epoch", default=1, type=int, help="set epoch") +parser.add_argument("--ce", action="store_true", help="run ce") +parser.add_argument( + "--use_data_parallel", + type=ast.literal_eval, + default=False, + help="The flag indicating whether to use data parallel mode to train the model." 
+) +args = parser.parse_args() +batch_size = 64 +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "cosine_decay", + "batch_size": batch_size, + "epochs": [40, 80, 100], + "steps": [0.1, 0.01, 0.001, 0.0001] + }, + "batch_size": batch_size, + "lr": 0.0125, + "total_images": 6149, + "num_epochs": 1 +} + +momentum_rate = 0.9 +l2_decay = 1.2e-4 + + +def optimizer_setting(params, parameter_list): + ls = params["learning_strategy"] + if "total_images" not in params: + total_images = 6149 + else: + total_images = params["total_images"] + + batch_size = ls["batch_size"] + step = int(math.ceil(float(total_images) / batch_size)) + bd = [step * e for e in ls["epochs"]] + lr = params["lr"] + num_epochs = params["num_epochs"] + optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.cosine_decay( + learning_rate=lr, step_each_epoch=step, epochs=num_epochs), + momentum=momentum_rate, + regularization=fluid.regularizer.L2Decay(l2_decay), + parameter_list=parameter_list) + + return optimizer + + +class ConvBNLayer(fluid.dygraph.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=False) + + self._batch_norm = BatchNorm(num_filters, act=act) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + + return y + + +class SqueezeExcitation(fluid.dygraph.Layer): + def __init__(self, num_channels, reduction_ratio): + + super(SqueezeExcitation, self).__init__() + self._num_channels = num_channels + self._pool = Pool2D(pool_size=0, pool_type='avg', global_pooling=True) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self._fc = Linear( + num_channels, + num_channels // reduction_ratio, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv)), + act='relu') + stdv = 1.0 / math.sqrt(num_channels / 16.0 * 1.0) + self._excitation = Linear( + num_channels // reduction_ratio, + num_channels, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv)), + act='sigmoid') + + def forward(self, input): + y = self._pool(input) + y = fluid.layers.reshape(y, shape=[-1, self._num_channels]) + y = self._fc(y) + y = self._excitation(y) + y = fluid.layers.elementwise_mul(x=input, y=y, axis=0) + return y + + +class BottleneckBlock(fluid.dygraph.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + cardinality, + reduction_ratio, + shortcut=True): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act="relu") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + groups=cardinality, + act="relu") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 2, + filter_size=1, + act=None) + + self.scale = SqueezeExcitation( + num_channels=num_filters * 2, + reduction_ratio=reduction_ratio) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 2, + filter_size=1, + stride=stride) + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 2 + + def forward(self, inputs): + y = 
self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + scale = self.scale(conv2) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = fluid.layers.elementwise_add(x=short, y=scale, act='relu') + return y + + +class SeResNeXt(fluid.dygraph.Layer): + def __init__(self, layers=50, class_dim=102): + super(SeResNeXt, self).__init__() + + self.layers = layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 50: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 6, 3] + num_filters = [128, 256, 512, 1024] + self.conv0 = ConvBNLayer( + num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + self.pool = Pool2D( + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 101: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 23, 3] + num_filters = [128, 256, 512, 1024] + self.conv0 = ConvBNLayer( + num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + self.pool = Pool2D( + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 152: + cardinality = 64 + reduction_ratio = 16 + depth = [3, 8, 36, 3] + num_filters = [128, 256, 512, 1024] + self.conv0 = ConvBNLayer( + num_channels=3, + num_filters=64, + filter_size=3, + stride=2, + act='relu') + self.conv1 = ConvBNLayer( + num_channels=64, + num_filters=64, + filter_size=3, + stride=1, + act='relu') + self.conv2 = ConvBNLayer( + num_channels=64, + num_filters=128, + filter_size=3, + stride=1, + act='relu') + self.pool = Pool2D( + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + + self.bottleneck_block_list = [] + num_channels = 64 + if layers == 152: + num_channels = 128 + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=cardinality, + reduction_ratio=reduction_ratio, + shortcut=shortcut)) + num_channels = bottleneck_block._num_channels_out + self.bottleneck_block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = Pool2D( + pool_size=7, pool_type='avg', global_pooling=True) + stdv = 1.0 / math.sqrt(2048 * 1.0) + + self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 2 * 1 * 1 + + self.out = Linear(self.pool2d_avg_output, + class_dim, + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + + def forward(self, inputs): + if self.layers == 50 or self.layers == 101: + y = self.conv0(inputs) + y = self.pool(y) + elif self.layers == 152: + y = self.conv0(inputs) + y = self.conv1(y) + y = self.conv2(y) + y = self.pool(y) + + for bottleneck_block in self.bottleneck_block_list: + y = bottleneck_block(y) + y = self.pool2d_avg(y) + y = fluid.layers.dropout(y, dropout_prob=0.5, seed=100) + y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output]) + y = self.out(y) + return y + +def reader_decorator(reader): + def __reader__(): + for item in reader(): + img = np.array(item[0]).astype('float32').reshape(3, 224, 224) + label = np.array(item[1]).astype('int64').reshape(1) + yield img, label + return __reader__ + +def eval(model, data): + + model.eval() + batch_size = 32 + total_loss = 0.0 + total_acc1 = 0.0 + total_acc5 = 0.0 + total_sample = 0 + for 
batch_id, data in enumerate(data()): + img, label = data + label.stop_gradient = True + out = model(img) + + softmax_out = fluid.layers.softmax(out, use_cudnn=False) + loss = fluid.layers.cross_entropy(input=softmax_out, label=label) + avg_loss = fluid.layers.mean(x=loss) + acc_top1 = fluid.layers.accuracy(input=softmax_out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=softmax_out, label=label, k=5) + dy_out = avg_loss.numpy() + + total_loss += dy_out + total_acc1 += acc_top1.numpy() + total_acc5 += acc_top5.numpy() + total_sample += 1 + if batch_id % 10 == 0: + print("test | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \ + ( batch_id, total_loss / total_sample, \ + total_acc1 / total_sample, total_acc5 / total_sample)) + + if args.ce: + print("kpis\ttest_acc1\t%0.3f" % (total_acc1 / total_sample)) + print("kpis\ttest_acc5\t%0.3f" % (total_acc5 / total_sample)) + print("kpis\ttest_loss\t%0.3f" % (total_loss / total_sample)) + print("final eval loss %0.3f acc1 %0.3f acc5 %0.3f" % \ + (total_loss / total_sample, \ + total_acc1 / total_sample, total_acc5 / total_sample)) + + +def train(): + + epoch_num = train_parameters["num_epochs"] + if args.ce: + epoch_num = args.epoch + batch_size = train_parameters["batch_size"] + + trainer_count = fluid.dygraph.parallel.Env().nranks + place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \ + if args.use_data_parallel else fluid.CUDAPlace(0) + with fluid.dygraph.guard(place): + if args.ce: + print("ce mode") + seed = 90 + np.random.seed(seed) + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + if args.use_data_parallel: + strategy = fluid.dygraph.parallel.prepare_context() + se_resnext = SeResNeXt() + optimizer = optimizer_setting(train_parameters, se_resnext.parameters()) + if args.use_data_parallel: + se_resnext = fluid.dygraph.parallel.DataParallel(se_resnext, + strategy) + train_reader = paddle.batch( + reader_decorator( + paddle.dataset.flowers.train(use_xmap=False)), + batch_size=batch_size, + drop_last=True) + if args.use_data_parallel: + train_reader = fluid.contrib.reader.distributed_batch_reader( + train_reader) + test_reader = paddle.batch( + reader_decorator( + paddle.dataset.flowers.test(use_xmap=False)), + batch_size=32) + + train_loader = fluid.io.DataLoader.from_generator(capacity=10) + train_loader.set_sample_list_generator(train_reader, places=place) + + test_loader = fluid.io.DataLoader.from_generator(capacity=10) + test_loader.set_sample_list_generator(test_reader, places=place) + + total_train_time = 0 + for epoch_id in range(epoch_num): + total_loss = 0.0 + total_acc1 = 0.0 + total_acc5 = 0.0 + total_sample = 0 + stime = time.time() + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + + out = se_resnext(img) + softmax_out = fluid.layers.softmax(out, use_cudnn=False) + loss = fluid.layers.cross_entropy( + input=softmax_out, label=label) + avg_loss = fluid.layers.mean(x=loss) + + acc_top1 = fluid.layers.accuracy( + input=softmax_out, label=label, k=1) + acc_top5 = fluid.layers.accuracy( + input=softmax_out, label=label, k=5) + + dy_out = avg_loss.numpy() + if args.use_data_parallel: + avg_loss = se_resnext.scale_loss(avg_loss) + avg_loss.backward() + se_resnext.apply_collective_grads() + else: + avg_loss.backward() + + optimizer.minimize(avg_loss) + se_resnext.clear_gradients() + + lr = optimizer._global_learning_rate().numpy() + total_loss += dy_out + total_acc1 += acc_top1.numpy() + total_acc5 += 
acc_top5.numpy() + total_sample += 1 + if batch_id % 10 == 0: + print( "epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f lr %0.5f" % \ + ( epoch_id, batch_id, total_loss / total_sample, \ + total_acc1 / total_sample, total_acc5 / total_sample, lr)) + total_train_time += (time.time() - stime) + + if args.ce: + print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample)) + print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample)) + print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample)) + print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \ + (epoch_id, batch_id, total_loss / total_sample, \ + total_acc1 / total_sample, total_acc5 / total_sample)) + se_resnext.eval() + eval(se_resnext, test_loader) + se_resnext.train() + + print("total train time: {} s".format(total_train_time)) + + +if __name__ == '__main__': + train() diff --git a/dygraph/transformer/train.py b/dygraph/transformer/train.py index e043bca19edcee16f1f6bdf162bc5eaf569f9968..081a4cf174dc4da5238e65a1828d06e0090e26a2 100644 --- a/dygraph/transformer/train.py +++ b/dygraph/transformer/train.py @@ -15,6 +15,7 @@ from __future__ import print_function import argparse import ast +import time import numpy as np import paddle @@ -144,10 +145,12 @@ def train(args): batch_size=TrainTaskConfig.batch_size) # loop for training iterations + total_train_time = 0 for i in range(TrainTaskConfig.pass_num): dy_step = 0 sum_cost = 0 transformer.train() + stime = time.time() for batch in train_reader(): enc_inputs, dec_inputs, label, weights = prepare_train_input( batch, ModelHyperParams.eos_idx, ModelHyperParams.eos_idx, @@ -170,6 +173,7 @@ def train(args): print("pass num : {}, batch_id: {}, dy_graph avg loss: {}". format(i, dy_step, dy_avg_cost.numpy() * trainer_count)) + total_train_time += (time.time() - stime) # switch to evaluation mode transformer.eval() @@ -190,6 +194,8 @@ def train(args): if fluid.dygraph.parallel.Env().dev_id == 0: fluid.save_dygraph(transformer.state_dict(), args.model_file) + print("total train time: {} s".format(total_train_time)) + if __name__ == '__main__': args = parse_args() diff --git a/dygraph/transformer/train_mp.py b/dygraph/transformer/train_mp.py new file mode 100644 index 0000000000000000000000000000000000000000..a6257b714a5d8a73039e058a7056ba700e36d54c --- /dev/null +++ b/dygraph/transformer/train_mp.py @@ -0,0 +1,231 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
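# Illustrative sketch (not part of train_mp.py itself): the file that follows
# mirrors dygraph/transformer/train.py, but routes batches through a dygraph
# DataLoader built with use_multiprocess=True, so the numpy batch assembly done
# in prepare_train_input_array can overlap with GPU execution. Reduced to its
# core, with make_batches as a placeholder for the real batch generator, the
# wiring looks roughly like this:
import numpy as np
import paddle.fluid as fluid

def make_batches():
    # Placeholder generator: yields one full batch of numpy arrays per step,
    # the same contract that input_data_array_reader satisfies below.
    for _ in range(4):
        yield (np.zeros([8, 16], dtype='int64'),   # e.g. token ids
               np.zeros([8, 1], dtype='int64'))    # e.g. labels

place = fluid.CUDAPlace(0)
with fluid.dygraph.guard(place):
    loader = fluid.io.DataLoader.from_generator(
        capacity=200, use_multiprocess=True)
    loader.set_batch_generator(make_batches, places=place)
    for batch in loader():
        pass  # each element of `batch` is already a tensor on `place`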
+ +from __future__ import print_function +import argparse +import ast +import time + +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.dataset.wmt16 as wmt16 + +from model import TransFormer, NoamDecay +from config import * +from data_util import * + + +def parse_args(): + parser = argparse.ArgumentParser("Arguments for Training") + parser.add_argument( + "--use_data_parallel", + type=ast.literal_eval, + default=False, + help="The flag indicating whether to use multi-GPU.") + parser.add_argument( + "--model_file", + type=str, + default="transformer_params", + help="Save the model as a file named `model_file.pdparams`.") + parser.add_argument( + 'opts', + help='See config.py for all options', + default=None, + nargs=argparse.REMAINDER) + args = parser.parse_args() + merge_cfg_from_list(args.opts, [TrainTaskConfig, ModelHyperParams]) + return args + + +def prepare_train_input_array(insts, src_pad_idx, trg_pad_idx, n_head): + """ + inputs for training + """ + src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data( + [inst[0] for inst in insts], src_pad_idx, n_head, is_target=False) + src_word = src_word.reshape(-1, src_max_len) + src_pos = src_pos.reshape(-1, src_max_len) + trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = pad_batch_data( + [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True) + trg_word = trg_word.reshape(-1, trg_max_len) + trg_pos = trg_pos.reshape(-1, trg_max_len) + + trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], + [1, 1, trg_max_len, 1]).astype("float32") + + lbl_word, lbl_weight, num_token = pad_batch_data( + [inst[2] for inst in insts], + trg_pad_idx, + n_head, + is_target=False, + is_label=True, + return_attn_bias=False, + return_max_len=False, + return_num_token=True) + + return src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, \ + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight + +def input_data_array_reader(reader, src_pad_idx, trg_pad_idx, n_head): + def __reader__(): + r = reader() + for data in r: + src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, \ + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight = \ + prepare_train_input_array(data, src_pad_idx, trg_pad_idx, n_head) + yield src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, \ + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight + return __reader__ + +def group_inputs(var_inputs): + enc_inputs = var_inputs[0:len(encoder_data_input_fields)] + dec_inputs = var_inputs[len(encoder_data_input_fields + ):len(encoder_data_input_fields) + + len(decoder_data_input_fields[:-1])] + label = var_inputs[-2] + weights = var_inputs[-1] + + return enc_inputs, dec_inputs, label, weights + +def train(args): + """ + train models + :return: + """ + + trainer_count = fluid.dygraph.parallel.Env().nranks + place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \ + if args.use_data_parallel else fluid.CUDAPlace(0) + with fluid.dygraph.guard(place): + if args.use_data_parallel: + strategy = fluid.dygraph.parallel.prepare_context() + + # define model + transformer = TransFormer( + ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1, + ModelHyperParams.n_layer, ModelHyperParams.n_head, + ModelHyperParams.d_key, ModelHyperParams.d_value, + ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, + ModelHyperParams.prepostprocess_dropout, + ModelHyperParams.attention_dropout, ModelHyperParams.relu_dropout, + ModelHyperParams.preprocess_cmd, 
ModelHyperParams.postprocess_cmd, + ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps) + # define optimizer + optimizer = fluid.optimizer.Adam(learning_rate=NoamDecay( + ModelHyperParams.d_model, TrainTaskConfig.warmup_steps, + TrainTaskConfig.learning_rate), + parameter_list = transformer.parameters(), + beta1=TrainTaskConfig.beta1, + beta2=TrainTaskConfig.beta2, + epsilon=TrainTaskConfig.eps) + # + if args.use_data_parallel: + transformer = fluid.dygraph.parallel.DataParallel( + transformer, strategy) + + # define data generator for training and validation + train_reader = input_data_array_reader( + paddle.batch( + wmt16.train( + ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size), + batch_size=TrainTaskConfig.batch_size), + ModelHyperParams.eos_idx, + ModelHyperParams.eos_idx, + ModelHyperParams.n_head) + + if args.use_data_parallel: + train_reader = fluid.contrib.reader.distributed_batch_reader( + train_reader) + + val_reader = input_data_array_reader( + paddle.batch( + wmt16.test( + ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size), + batch_size=TrainTaskConfig.batch_size), + ModelHyperParams.eos_idx, + ModelHyperParams.eos_idx, + ModelHyperParams.n_head) + + train_loader = fluid.io.DataLoader.from_generator(capacity=200, use_multiprocess=True) + train_loader.set_batch_generator(train_reader, places=place) + + val_loader = fluid.io.DataLoader.from_generator(capacity=200, use_multiprocess=True) + val_loader.set_batch_generator(val_reader, places=place) + + # loop for training iterations + total_train_time = 0 + for i in range(TrainTaskConfig.pass_num): + dy_step = 0 + sum_cost = 0 + transformer.train() + stime = time.time() + for batch in train_loader(): + src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, \ + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight = batch + + enc_inputs, dec_inputs, label, weights = \ + group_inputs([src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight]) + + dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = transformer( + enc_inputs, dec_inputs, label, weights) + + if args.use_data_parallel: + dy_avg_cost = transformer.scale_loss(dy_avg_cost) + dy_avg_cost.backward() + transformer.apply_collective_grads() + else: + dy_avg_cost.backward() + optimizer.minimize(dy_avg_cost) + transformer.clear_gradients() + + dy_step = dy_step + 1 + if dy_step % 10 == 0: + print("pass num : {}, batch_id: {}, dy_graph avg loss: {}". 
+ format(i, dy_step, + dy_avg_cost.numpy() * trainer_count)) + total_train_time += (time.time() - stime) + + # switch to evaluation mode + transformer.eval() + sum_cost = 0 + token_num = 0 + for batch in val_loader(): + src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, \ + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight = batch + + enc_inputs, dec_inputs, label, weights = \ + group_inputs([src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight]) + + dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = transformer( + enc_inputs, dec_inputs, label, weights) + sum_cost += dy_sum_cost.numpy() + token_num += dy_token_num.numpy() + print("pass : {} finished, validation avg loss: {}".format( + i, sum_cost / token_num)) + + if fluid.dygraph.parallel.Env().dev_id == 0: + fluid.save_dygraph(transformer.state_dict(), args.model_file) + + print("total train time: {} s".format(total_train_time)) + + +if __name__ == '__main__': + args = parse_args() + train(args) diff --git a/dygraph/transformer/train_sp.py b/dygraph/transformer/train_sp.py new file mode 100644 index 0000000000000000000000000000000000000000..a4ff872f8036cae1871985d4c9594a255fe50a38 --- /dev/null +++ b/dygraph/transformer/train_sp.py @@ -0,0 +1,231 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
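# Illustrative sketch (not part of train_sp.py itself): the file that follows
# is the single-process counterpart of train_mp.py -- its DataLoader is created
# without use_multiprocess, so batch preparation stays in the trainer process.
# Everything else, including the dygraph data-parallel update, is shared. That
# update sequence, factored into a standalone helper for clarity (the helper
# itself is hypothetical, not code from this patch), is:
import paddle.fluid as fluid

def data_parallel_step(model, optimizer, avg_loss, use_data_parallel):
    # Same sequence used in every training loop of this patch: scale the loss
    # for the number of trainers, backprop, all-reduce the gradients, then
    # apply the optimizer update and clear gradients.
    if use_data_parallel:
        avg_loss = model.scale_loss(avg_loss)
        avg_loss.backward()
        model.apply_collective_grads()
    else:
        avg_loss.backward()
    optimizer.minimize(avg_loss)
    model.clear_gradients()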
+ +from __future__ import print_function +import argparse +import ast +import time + +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.dataset.wmt16 as wmt16 + +from model import TransFormer, NoamDecay +from config import * +from data_util import * + + +def parse_args(): + parser = argparse.ArgumentParser("Arguments for Training") + parser.add_argument( + "--use_data_parallel", + type=ast.literal_eval, + default=False, + help="The flag indicating whether to use multi-GPU.") + parser.add_argument( + "--model_file", + type=str, + default="transformer_params", + help="Save the model as a file named `model_file.pdparams`.") + parser.add_argument( + 'opts', + help='See config.py for all options', + default=None, + nargs=argparse.REMAINDER) + args = parser.parse_args() + merge_cfg_from_list(args.opts, [TrainTaskConfig, ModelHyperParams]) + return args + + +def prepare_train_input_array(insts, src_pad_idx, trg_pad_idx, n_head): + """ + inputs for training + """ + src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data( + [inst[0] for inst in insts], src_pad_idx, n_head, is_target=False) + src_word = src_word.reshape(-1, src_max_len) + src_pos = src_pos.reshape(-1, src_max_len) + trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = pad_batch_data( + [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True) + trg_word = trg_word.reshape(-1, trg_max_len) + trg_pos = trg_pos.reshape(-1, trg_max_len) + + trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], + [1, 1, trg_max_len, 1]).astype("float32") + + lbl_word, lbl_weight, num_token = pad_batch_data( + [inst[2] for inst in insts], + trg_pad_idx, + n_head, + is_target=False, + is_label=True, + return_attn_bias=False, + return_max_len=False, + return_num_token=True) + + return src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, \ + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight + +def input_data_array_reader(reader, src_pad_idx, trg_pad_idx, n_head): + def __reader__(): + r = reader() + for data in r: + src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, \ + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight = \ + prepare_train_input_array(data, src_pad_idx, trg_pad_idx, n_head) + yield src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, \ + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight + return __reader__ + +def group_inputs(var_inputs): + enc_inputs = var_inputs[0:len(encoder_data_input_fields)] + dec_inputs = var_inputs[len(encoder_data_input_fields + ):len(encoder_data_input_fields) + + len(decoder_data_input_fields[:-1])] + label = var_inputs[-2] + weights = var_inputs[-1] + + return enc_inputs, dec_inputs, label, weights + +def train(args): + """ + train models + :return: + """ + + trainer_count = fluid.dygraph.parallel.Env().nranks + place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \ + if args.use_data_parallel else fluid.CUDAPlace(0) + with fluid.dygraph.guard(place): + if args.use_data_parallel: + strategy = fluid.dygraph.parallel.prepare_context() + + # define model + transformer = TransFormer( + ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1, + ModelHyperParams.n_layer, ModelHyperParams.n_head, + ModelHyperParams.d_key, ModelHyperParams.d_value, + ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, + ModelHyperParams.prepostprocess_dropout, + ModelHyperParams.attention_dropout, ModelHyperParams.relu_dropout, + ModelHyperParams.preprocess_cmd, 
ModelHyperParams.postprocess_cmd, + ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps) + # define optimizer + optimizer = fluid.optimizer.Adam(learning_rate=NoamDecay( + ModelHyperParams.d_model, TrainTaskConfig.warmup_steps, + TrainTaskConfig.learning_rate), + parameter_list = transformer.parameters(), + beta1=TrainTaskConfig.beta1, + beta2=TrainTaskConfig.beta2, + epsilon=TrainTaskConfig.eps) + # + if args.use_data_parallel: + transformer = fluid.dygraph.parallel.DataParallel( + transformer, strategy) + + # define data generator for training and validation + train_reader = input_data_array_reader( + paddle.batch( + wmt16.train( + ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size), + batch_size=TrainTaskConfig.batch_size), + ModelHyperParams.eos_idx, + ModelHyperParams.eos_idx, + ModelHyperParams.n_head) + + if args.use_data_parallel: + train_reader = fluid.contrib.reader.distributed_batch_reader( + train_reader) + + val_reader = input_data_array_reader( + paddle.batch( + wmt16.test( + ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size), + batch_size=TrainTaskConfig.batch_size), + ModelHyperParams.eos_idx, + ModelHyperParams.eos_idx, + ModelHyperParams.n_head) + + train_loader = fluid.io.DataLoader.from_generator(capacity=200) + train_loader.set_batch_generator(train_reader, places=place) + + val_loader = fluid.io.DataLoader.from_generator(capacity=200) + val_loader.set_batch_generator(val_reader, places=place) + + # loop for training iterations + total_train_time = 0 + for i in range(TrainTaskConfig.pass_num): + dy_step = 0 + sum_cost = 0 + transformer.train() + stime = time.time() + for batch in train_loader(): + src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, \ + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight = batch + + enc_inputs, dec_inputs, label, weights = \ + group_inputs([src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight]) + + dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = transformer( + enc_inputs, dec_inputs, label, weights) + + if args.use_data_parallel: + dy_avg_cost = transformer.scale_loss(dy_avg_cost) + dy_avg_cost.backward() + transformer.apply_collective_grads() + else: + dy_avg_cost.backward() + optimizer.minimize(dy_avg_cost) + transformer.clear_gradients() + + dy_step = dy_step + 1 + if dy_step % 10 == 0: + print("pass num : {}, batch_id: {}, dy_graph avg loss: {}". + format(i, dy_step, + dy_avg_cost.numpy() * trainer_count)) + total_train_time += (time.time() - stime) + + # switch to evaluation mode + transformer.eval() + sum_cost = 0 + token_num = 0 + for batch in val_loader(): + src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, \ + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight = batch + + enc_inputs, dec_inputs, label, weights = \ + group_inputs([src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos, + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight]) + + dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = transformer( + enc_inputs, dec_inputs, label, weights) + sum_cost += dy_sum_cost.numpy() + token_num += dy_token_num.numpy() + print("pass : {} finished, validation avg loss: {}".format( + i, sum_cost / token_num)) + + if fluid.dygraph.parallel.Env().dev_id == 0: + fluid.save_dygraph(transformer.state_dict(), args.model_file) + + print("total train time: {} s".format(total_train_time)) + + +if __name__ == '__main__': + args = parse_args() + train(args)
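Note on the data wiring used above: the se_resnet scripts in this patch hand the DataLoader a sample-level reader through set_sample_list_generator and let paddle.batch do the batching, whereas the transformer scripts pass pre-batched numpy arrays through set_batch_generator. A minimal, self-contained sketch of the sample-list wiring, with fake_sample_reader standing in for the decorated flowers reader (a placeholder, not code from this patch), looks like this:

import numpy as np
import paddle
import paddle.fluid as fluid

def fake_sample_reader():
    # Placeholder for reader_decorator(paddle.dataset.flowers.train(...)):
    # yields one (image, label) sample per iteration.
    for _ in range(64):
        yield np.random.rand(3, 224, 224).astype('float32'), \
              np.array([1]).astype('int64')

place = fluid.CUDAPlace(0)
with fluid.dygraph.guard(place):
    # The _mp variants pass use_multiprocess=True here; the _sp variants omit it.
    loader = fluid.io.DataLoader.from_generator(capacity=10, use_multiprocess=True)
    loader.set_sample_list_generator(
        paddle.batch(fake_sample_reader, batch_size=8, drop_last=True),
        places=place)
    for img, label in loader():
        pass  # img: [8, 3, 224, 224] float32 tensor, label: [8, 1] int64 tensor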