diff --git a/dygraph/mnist/train.py b/dygraph/mnist/train.py index f81df8f26458c93c1f658a9bc783d14a3c5b8256..58db6f1d728090cc63b0b802e7f765c37c5036aa 100644 --- a/dygraph/mnist/train.py +++ b/dygraph/mnist/train.py @@ -99,11 +99,13 @@ class MNIST(fluid.dygraph.Layer): self.pool_2_shape = 50 * 4 * 4 SIZE = 10 scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5 - self._fc = Linear(self.pool_2_shape, 10, - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=scale)), - act="softmax") + self._fc = Linear( + self.pool_2_shape, + 10, + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale)), + act="softmax") def forward(self, inputs, label=None): x = self._simple_img_conv_pool_1(inputs) @@ -117,17 +119,21 @@ class MNIST(fluid.dygraph.Layer): return x +def reader_decorator(reader): + def __reader__(): + for item in reader(): + img = np.array(item[0]).astype('float32').reshape(1, 28, 28) + label = np.array(item[1]).astype('int64').reshape(1) + yield img, label + + return __reader__ + + def test_mnist(reader, model, batch_size): acc_set = [] avg_loss_set = [] for batch_id, data in enumerate(reader()): - dy_x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(batch_size, 1) - - img = to_variable(dy_x_data) - label = to_variable(y_data) + img, label = data label.stop_gradient = True prediction, acc = model(img, label) loss = fluid.layers.cross_entropy(input=prediction, label=label) @@ -187,28 +193,33 @@ def train_mnist(args): if args.use_data_parallel: strategy = fluid.dygraph.parallel.prepare_context() mnist = MNIST() - adam = AdamOptimizer(learning_rate=0.001, parameter_list=mnist.parameters()) + adam = AdamOptimizer( + learning_rate=0.001, parameter_list=mnist.parameters()) if args.use_data_parallel: mnist = fluid.dygraph.parallel.DataParallel(mnist, strategy) train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=BATCH_SIZE, drop_last=True) + reader_decorator(paddle.dataset.mnist.train()), + batch_size=BATCH_SIZE, + drop_last=True) if args.use_data_parallel: train_reader = fluid.contrib.reader.distributed_batch_reader( train_reader) test_reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=BATCH_SIZE, drop_last=True) + reader_decorator(paddle.dataset.mnist.test()), + batch_size=BATCH_SIZE, + drop_last=True) + + train_loader = fluid.io.DataLoader.from_generator(capacity=10) + train_loader.set_sample_list_generator(train_reader, places=place) + + test_loader = fluid.io.DataLoader.from_generator(capacity=10) + test_loader.set_sample_list_generator(test_reader, places=place) for epoch in range(epoch_num): - for batch_id, data in enumerate(train_reader()): - dy_x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(-1, 1) - - img = to_variable(dy_x_data) - label = to_variable(y_data) + for batch_id, data in enumerate(train_loader()): + img, label = data label.stop_gradient = True cost, acc = mnist(img, label) @@ -231,7 +242,7 @@ def train_mnist(args): epoch, batch_id, avg_loss.numpy())) mnist.eval() - test_cost, test_acc = test_mnist(test_reader, mnist, BATCH_SIZE) + test_cost, test_acc = test_mnist(test_loader, mnist, BATCH_SIZE) mnist.train() if args.ce: print("kpis\ttest_acc\t%s" % test_acc) @@ -244,7 +255,7 @@ def train_mnist(args): fluid.dygraph.parallel.Env().local_rank == 
0) if save_parameters: fluid.save_dygraph(mnist.state_dict(), "save_temp") - + print("checkpoint saved") inference_mnist() diff --git a/dygraph/mobilenet/reader.py b/dygraph/mobilenet/reader.py index bba33c355ba02983c5d9d54b3bc5f2535d53cfb1..e598d19a3b44fdfcea31abd3f909c5639ba22d45 100644 --- a/dygraph/mobilenet/reader.py +++ b/dygraph/mobilenet/reader.py @@ -239,7 +239,7 @@ def process_image(sample, settings, mode, color_jitter, rotate): img /= img_std if mode == 'train' or mode == 'val': - return (img, sample[1]) + return (img, [sample[1]]) elif mode == 'test': return (img, ) diff --git a/dygraph/mobilenet/train.py b/dygraph/mobilenet/train.py index 16e27dc4fbc22675e2446dbc5ff146e1b6b5b909..547e9d45506b7cec9f84e7543d8a60fea2fadc9c 100644 --- a/dygraph/mobilenet/train.py +++ b/dygraph/mobilenet/train.py @@ -116,10 +116,8 @@ def train_mobilenet(): optimizer.set_dict(opti_dict) # 3. reader - train_data_loader, train_data = utility.create_data_loader( - is_train=True, args=args) - test_data_loader, test_data = utility.create_data_loader( - is_train=False, args=args) + train_data_loader = utility.create_data_loader(is_train=True, args=args) + test_data_loader = utility.create_data_loader(is_train=False, args=args) num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) imagenet_reader = reader.ImageNetReader(seed=0, place_num=place_num) train_reader = imagenet_reader.train(settings=args) @@ -145,8 +143,6 @@ def train_mobilenet(): t1 = time.time() if args.max_iter and total_batch_num == args.max_iter: return - label = to_variable(label.numpy().astype('int64').reshape( - int(args.batch_size // place_num), 1)) t_start = time.time() # 4.1.1 call net() diff --git a/dygraph/mobilenet/utils/utility.py b/dygraph/mobilenet/utils/utility.py index a7bc9c883edba2e6115d3fe96a61e569b5d7407a..22314941adb4f5ee2399147562310054a3392448 100644 --- a/dygraph/mobilenet/utils/utility.py +++ b/dygraph/mobilenet/utils/utility.py @@ -309,32 +309,14 @@ def create_data_loader(is_train, args): Returns: data_loader and the input data of net, """ - image_shape = [int(m) for m in args.image_shape.split(",")] - - feed_image = fluid.data( - name="feed_image", - shape=[None] + image_shape, - dtype="float32", - lod_level=0) - - feed_label = fluid.data( - name="feed_label", shape=[None, 1], dtype="int64", lod_level=0) - feed_y_a = fluid.data( - name="feed_y_a", shape=[None, 1], dtype="int64", lod_level=0) - if is_train and args.use_mixup: - feed_y_b = fluid.data( - name="feed_y_b", shape=[None, 1], dtype="int64", lod_level=0) - feed_lam = fluid.data( - name="feed_lam", shape=[None, 1], dtype="float32", lod_level=0) - data_loader = fluid.io.DataLoader.from_generator( capacity=64, use_double_buffer=True, iterable=True, return_list=True) - return data_loader, [feed_image, feed_y_a, feed_y_b, feed_lam] + return data_loader else: data_loader = fluid.io.DataLoader.from_generator( capacity=64, @@ -342,7 +324,7 @@ def create_data_loader(is_train, args): iterable=True, return_list=True) - return data_loader, [feed_image, feed_label] + return data_loader def print_info(pass_id, batch_id, print_step, metrics, time_info, info_mode): diff --git a/dygraph/ptb_lm/ptb_dy.py b/dygraph/ptb_lm/ptb_dy.py index d33e64194c33c5a4c7ddedbda405daa58fe330ae..0a8ed9494e16937ac1fac4068e37b2a6415212bb 100644 --- a/dygraph/ptb_lm/ptb_dy.py +++ b/dygraph/ptb_lm/ptb_dy.py @@ -1,461 +1,474 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import os -import unittest -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid.dygraph.nn import Embedding -import paddle.fluid.framework as framework -from paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid.dygraph.base import to_variable -import numpy as np -import six -import multiprocessing - -import reader -import model_check -import time - -from args import * - -#import fluid.clip as clip -#from fluid.clip import * - -import sys -if sys.version[0] == '2': - reload(sys) - sys.setdefaultencoding("utf-8") - - -class SimpleLSTMRNN(fluid.Layer): - def __init__(self, - hidden_size, - num_steps, - num_layers=2, - init_scale=0.1, - dropout=None): - super(SimpleLSTMRNN, self).__init__() - self._hidden_size = hidden_size - self._num_layers = num_layers - self._init_scale = init_scale - self._dropout = dropout - self._num_steps = num_steps - self.cell_array = [] - self.hidden_array = [] - - self.weight_1_arr = [] - self.weight_2_arr = [] - self.bias_arr = [] - self.mask_array = [] - - for i in range(self._num_layers): - weight_1 = self.create_parameter( - attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( - low=-self._init_scale, high=self._init_scale)), - shape=[self._hidden_size * 2, self._hidden_size * 4], - dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( - low=-self._init_scale, high=self._init_scale)) - self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1)) - bias_1 = self.create_parameter( - attr=fluid.ParamAttr( - initializer=fluid.initializer.UniformInitializer( - low=-self._init_scale, high=self._init_scale)), - shape=[self._hidden_size * 4], - dtype="float32", - default_initializer=fluid.initializer.Constant(0.0)) - self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1)) - - def forward(self, input_embedding, init_hidden=None, init_cell=None): - cell_array = [] - hidden_array = [] - - for i in range(self._num_layers): - hidden_array.append(init_hidden[i]) - cell_array.append(init_cell[i]) - - res = [] - for index in range(self._num_steps): - step_input = input_embedding[:,index,:] - for k in range(self._num_layers): - pre_hidden = hidden_array[k] - pre_cell = cell_array[k] - weight_1 = self.weight_1_arr[k] - bias = self.bias_arr[k] - - nn = fluid.layers.concat([step_input, pre_hidden], 1) - gate_input = fluid.layers.matmul(x=nn, y=weight_1) - - gate_input = fluid.layers.elementwise_add(gate_input, bias) - i, j, f, o = fluid.layers.split( - gate_input, num_or_sections=4, dim=-1) - c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid( - i) * fluid.layers.tanh(j) - m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o) - hidden_array[k] = m - cell_array[k] = c - step_input = m - - if self._dropout is not None and self._dropout > 0.0: - step_input = fluid.layers.dropout( - step_input, - dropout_prob=self._dropout, - dropout_implementation='upscale_in_train') - res.append(step_input) - 
real_res = fluid.layers.concat(res, 1) - real_res = fluid.layers.reshape(real_res, [ -1, self._num_steps, self._hidden_size]) - last_hidden = fluid.layers.concat(hidden_array, 1) - last_hidden = fluid.layers.reshape( - last_hidden, shape=[-1, self._num_layers, self._hidden_size]) - last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2]) - last_cell = fluid.layers.concat(cell_array, 1) - last_cell = fluid.layers.reshape( - last_cell, shape=[-1, self._num_layers, self._hidden_size]) - last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2]) - return real_res, last_hidden, last_cell - - -class PtbModel(fluid.Layer): - def __init__(self, - hidden_size, - vocab_size, - num_layers=2, - num_steps=20, - init_scale=0.1, - dropout=None): - super(PtbModel, self).__init__() - self.hidden_size = hidden_size - self.vocab_size = vocab_size - self.init_scale = init_scale - self.num_layers = num_layers - self.num_steps = num_steps - self.dropout = dropout - self.simple_lstm_rnn = SimpleLSTMRNN( - hidden_size, - num_steps, - num_layers=num_layers, - init_scale=init_scale, - dropout=dropout) - self.embedding = Embedding( - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=False, - param_attr=fluid.ParamAttr( - name='embedding_para', - initializer=fluid.initializer.UniformInitializer( - low=-init_scale, high=init_scale))) - self.softmax_weight = self.create_parameter( - attr=fluid.ParamAttr(), - shape=[self.hidden_size, self.vocab_size], - dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( - low=-self.init_scale, high=self.init_scale)) - self.softmax_bias = self.create_parameter( - attr=fluid.ParamAttr(), - shape=[self.vocab_size], - dtype="float32", - default_initializer=fluid.initializer.UniformInitializer( - low=-self.init_scale, high=self.init_scale)) - - def build_once(self, input, label, init_hidden, init_cell): - pass - - def forward(self, input, label, init_hidden, init_cell): - - init_h = fluid.layers.reshape( - init_hidden, shape=[self.num_layers, -1, self.hidden_size]) - - init_c = fluid.layers.reshape( - init_cell, shape=[self.num_layers, -1, self.hidden_size]) - - x_emb = self.embedding(input) - - x_emb = fluid.layers.reshape( - x_emb, shape=[-1, self.num_steps, self.hidden_size]) - if self.dropout is not None and self.dropout > 0.0: - x_emb = fluid.layers.dropout( - x_emb, - dropout_prob=self.dropout, - dropout_implementation='upscale_in_train') - rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h, - init_c) - - projection = fluid.layers.matmul(rnn_out, self.softmax_weight) - projection = fluid.layers.elementwise_add(projection, self.softmax_bias) - - loss = fluid.layers.softmax_with_cross_entropy( - logits=projection, label=label, soft_label=False) - loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps]) - loss = fluid.layers.reduce_mean(loss, dim=[0]) - loss = fluid.layers.reduce_sum(loss) - - return loss, last_hidden, last_cell - - def debug_emb(self): - - np.save("emb_grad", self.x_emb.gradient()) - - -def train_ptb_lm(): - args = parse_args() - - # check if set use_gpu=True in paddlepaddle cpu version - model_check.check_cuda(args.use_gpu) - - place = core.CPUPlace() - if args.use_gpu: - place = fluid.CUDAPlace(0) - dev_count = fluid.core.get_cuda_device_count() - else: - place = fluid.CPUPlace() - dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - - # check if paddlepaddle version is satisfied - model_check.check_version() - - model_type = args.model_type - - vocab_size = 10000 - if 
model_type == "test": - num_layers = 1 - batch_size = 2 - hidden_size = 10 - num_steps = 3 - init_scale = 0.1 - max_grad_norm = 5.0 - epoch_start_decay = 1 - max_epoch = 1 - dropout = 0.0 - lr_decay = 0.5 - base_learning_rate = 1.0 - elif model_type == "small": - num_layers = 2 - batch_size = 20 - hidden_size = 200 - num_steps = 20 - init_scale = 0.1 - max_grad_norm = 5.0 - epoch_start_decay = 4 - max_epoch = 13 - dropout = 0.0 - lr_decay = 0.5 - base_learning_rate = 1.0 - elif model_type == "medium": - num_layers = 2 - batch_size = 20 - hidden_size = 650 - num_steps = 35 - init_scale = 0.05 - max_grad_norm = 5.0 - epoch_start_decay = 6 - max_epoch = 39 - dropout = 0.5 - lr_decay = 0.8 - base_learning_rate = 1.0 - elif model_type == "large": - num_layers = 2 - batch_size = 20 - hidden_size = 1500 - num_steps = 35 - init_scale = 0.04 - max_grad_norm = 10.0 - epoch_start_decay = 14 - max_epoch = 55 - dropout = 0.65 - lr_decay = 1.0 / 1.15 - base_learning_rate = 1.0 - else: - print("model type not support") - return - - with fluid.dygraph.guard(place): - if args.ce: - print("ce mode") - seed = 33 - np.random.seed(seed) - fluid.default_startup_program().random_seed = seed - fluid.default_main_program().random_seed = seed - max_epoch = 1 - ptb_model = PtbModel( - hidden_size=hidden_size, - vocab_size=vocab_size, - num_layers=num_layers, - num_steps=num_steps, - init_scale=init_scale, - dropout=dropout) - - if args.init_from_pretrain_model: - if not os.path.exists(args.init_from_pretrain_model + '.pdparams'): - print(args.init_from_pretrain_model) - raise Warning("The pretrained params do not exist.") - return - fluid.load_dygraph(args.init_from_pretrain_model) - print("finish initing model from pretrained params from %s" % - (args.init_from_pretrain_model)) - - dy_param_updated = dict() - dy_param_init = dict() - dy_loss = None - last_hidden = None - last_cell = None - - data_path = args.data_path - print("begin to load data") - ptb_data = reader.get_ptb_data(data_path) - print("finished load data") - train_data, valid_data, test_data = ptb_data - - batch_len = len(train_data) // batch_size - total_batch_size = (batch_len - 1) // num_steps - log_interval = 200 - - bd = [] - lr_arr = [1.0] - for i in range(1, max_epoch): - bd.append(total_batch_size * i) - new_lr = base_learning_rate * (lr_decay** - max(i + 1 - epoch_start_decay, 0.0)) - lr_arr.append(new_lr) - - grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm) - sgd = SGDOptimizer( - learning_rate=fluid.layers.piecewise_decay(boundaries=bd, values=lr_arr), - parameter_list=ptb_model.parameters(), - grad_clip=grad_clip) - - def eval(model, data): - print("begin to eval") - total_loss = 0.0 - iters = 0.0 - init_hidden_data = np.zeros( - (num_layers, batch_size, hidden_size), dtype='float32') - init_cell_data = np.zeros( - (num_layers, batch_size, hidden_size), dtype='float32') - - model.eval() - train_data_iter = reader.get_data_iter(data, batch_size, num_steps) - for batch_id, batch in enumerate(train_data_iter): - x_data, y_data = batch - x_data = x_data.reshape((-1, num_steps, 1)) - y_data = y_data.reshape((-1, num_steps, 1)) - x = to_variable(x_data) - y = to_variable(y_data) - init_hidden = to_variable(init_hidden_data) - init_cell = to_variable(init_cell_data) - dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, - init_cell) - - out_loss = dy_loss.numpy() - - init_hidden_data = last_hidden.numpy() - init_cell_data = last_cell.numpy() - - total_loss += out_loss - iters += num_steps - - print("eval finished") - ppl = 
np.exp(total_loss / iters) - print("ppl ", batch_id, ppl[0]) - - ce_time = [] - ce_ppl = [] - - total_batch_num = 0 #this is for benchmark - for epoch_id in range(max_epoch): - ptb_model.train() - total_loss = 0.0 - iters = 0.0 - init_hidden_data = np.zeros( - (num_layers, batch_size, hidden_size), dtype='float32') - init_cell_data = np.zeros( - (num_layers, batch_size, hidden_size), dtype='float32') - - train_data_iter = reader.get_data_iter(train_data, batch_size, - num_steps) - init_hidden = to_variable(init_hidden_data) - init_cell = to_variable(init_cell_data) - start_time = time.time() - for batch_id, batch in enumerate(train_data_iter): - if args.max_iter and total_batch_num == args.max_iter: - return - batch_start = time.time() - x_data, y_data = batch - - x_data = x_data.reshape((-1, num_steps, 1)) - y_data = y_data.reshape((-1, num_steps, 1)) - - x = to_variable(x_data) - y = to_variable(y_data) - - dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, - init_cell) - init_hidden = last_hidden.detach() - init_cell = last_cell.detach() - out_loss = dy_loss.numpy() - - dy_loss.backward() - sgd.minimize(dy_loss) - - ptb_model.clear_gradients() - total_loss += out_loss - batch_end = time.time() - train_batch_cost = batch_end - batch_start - iters += num_steps - total_batch_num = total_batch_num + 1 #this is for benchmark - - if batch_id > 0 and batch_id % log_interval == 0: - ppl = np.exp(total_loss / iters) - print("-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch cost: %.5f" % - (epoch_id, batch_id, ppl[0], - sgd._global_learning_rate().numpy(), out_loss, train_batch_cost)) - - print("one epoch finished", epoch_id) - print("time cost ", time.time() - start_time) - ppl = np.exp(total_loss / iters) - ce_time.append(time.time() - start_time) - ce_ppl.append(ppl[0]) - print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0])) - - if batch_size <= 20 and epoch_id == 0 and ppl[0] > 1000: - # for bad init, after first epoch, the loss is over 1000 - # no more need to continue - print("Parameters are randomly initialized and not good this time because the loss is over 1000 after the first epoch.") - print("Abort this training process and please start again.") - return - - save_model_dir = os.path.join(args.save_model_dir, - str(epoch_id), 'params') - fluid.save_dygraph(ptb_model.state_dict(), save_model_dir) - print("Saved model to: %s.\n" % save_model_dir) - - eval(ptb_model, valid_data) - - if args.ce: - _ppl = 0 - _time = 0 - try: - _time = ce_time[-1] - _ppl = ce_ppl[-1] - except: - print("ce info error") - print("kpis\ttrain_duration_card%s\t%s" % (dev_count, _time)) - print("kpis\ttrain_ppl_card%s\t%f" % (dev_count, _ppl)) - - eval(ptb_model, test_data) - -train_ptb_lm() +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +import unittest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.dygraph.nn import Embedding +import paddle.fluid.framework as framework +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.dygraph.base import to_variable +import numpy as np +import six +import multiprocessing + +import reader +import model_check +import time + +from args import * + +#import fluid.clip as clip +#from fluid.clip import * + +import sys +if sys.version[0] == '2': + reload(sys) + sys.setdefaultencoding("utf-8") + + +class SimpleLSTMRNN(fluid.Layer): + def __init__(self, + hidden_size, + num_steps, + num_layers=2, + init_scale=0.1, + dropout=None): + super(SimpleLSTMRNN, self).__init__() + self._hidden_size = hidden_size + self._num_layers = num_layers + self._init_scale = init_scale + self._dropout = dropout + self._num_steps = num_steps + self.cell_array = [] + self.hidden_array = [] + + self.weight_1_arr = [] + self.weight_2_arr = [] + self.bias_arr = [] + self.mask_array = [] + + for i in range(self._num_layers): + weight_1 = self.create_parameter( + attr=fluid.ParamAttr( + initializer=fluid.initializer.UniformInitializer( + low=-self._init_scale, high=self._init_scale)), + shape=[self._hidden_size * 2, self._hidden_size * 4], + dtype="float32", + default_initializer=fluid.initializer.UniformInitializer( + low=-self._init_scale, high=self._init_scale)) + self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1)) + bias_1 = self.create_parameter( + attr=fluid.ParamAttr( + initializer=fluid.initializer.UniformInitializer( + low=-self._init_scale, high=self._init_scale)), + shape=[self._hidden_size * 4], + dtype="float32", + default_initializer=fluid.initializer.Constant(0.0)) + self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1)) + + def forward(self, input_embedding, init_hidden=None, init_cell=None): + cell_array = [] + hidden_array = [] + + for i in range(self._num_layers): + hidden_array.append(init_hidden[i]) + cell_array.append(init_cell[i]) + + res = [] + for index in range(self._num_steps): + step_input = input_embedding[:, index, :] + for k in range(self._num_layers): + pre_hidden = hidden_array[k] + pre_cell = cell_array[k] + weight_1 = self.weight_1_arr[k] + bias = self.bias_arr[k] + + nn = fluid.layers.concat([step_input, pre_hidden], 1) + gate_input = fluid.layers.matmul(x=nn, y=weight_1) + + gate_input = fluid.layers.elementwise_add(gate_input, bias) + i, j, f, o = fluid.layers.split( + gate_input, num_or_sections=4, dim=-1) + c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid( + i) * fluid.layers.tanh(j) + m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o) + hidden_array[k] = m + cell_array[k] = c + step_input = m + + if self._dropout is not None and self._dropout > 0.0: + step_input = fluid.layers.dropout( + step_input, + dropout_prob=self._dropout, + dropout_implementation='upscale_in_train') + res.append(step_input) + real_res = fluid.layers.concat(res, 1) + real_res = fluid.layers.reshape( + real_res, [-1, self._num_steps, self._hidden_size]) + last_hidden = fluid.layers.concat(hidden_array, 1) + last_hidden = fluid.layers.reshape( + last_hidden, shape=[-1, self._num_layers, self._hidden_size]) + last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2]) + last_cell = fluid.layers.concat(cell_array, 1) + last_cell = fluid.layers.reshape( + last_cell, shape=[-1, self._num_layers, self._hidden_size]) + last_cell = fluid.layers.transpose(x=last_cell, 
perm=[1, 0, 2]) + return real_res, last_hidden, last_cell + + +class PtbModel(fluid.Layer): + def __init__(self, + hidden_size, + vocab_size, + num_layers=2, + num_steps=20, + init_scale=0.1, + dropout=None): + super(PtbModel, self).__init__() + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.init_scale = init_scale + self.num_layers = num_layers + self.num_steps = num_steps + self.dropout = dropout + self.simple_lstm_rnn = SimpleLSTMRNN( + hidden_size, + num_steps, + num_layers=num_layers, + init_scale=init_scale, + dropout=dropout) + self.embedding = Embedding( + size=[vocab_size, hidden_size], + dtype='float32', + is_sparse=False, + param_attr=fluid.ParamAttr( + name='embedding_para', + initializer=fluid.initializer.UniformInitializer( + low=-init_scale, high=init_scale))) + self.softmax_weight = self.create_parameter( + attr=fluid.ParamAttr(), + shape=[self.hidden_size, self.vocab_size], + dtype="float32", + default_initializer=fluid.initializer.UniformInitializer( + low=-self.init_scale, high=self.init_scale)) + self.softmax_bias = self.create_parameter( + attr=fluid.ParamAttr(), + shape=[self.vocab_size], + dtype="float32", + default_initializer=fluid.initializer.UniformInitializer( + low=-self.init_scale, high=self.init_scale)) + + def build_once(self, input, label, init_hidden, init_cell): + pass + + def forward(self, input, label, init_hidden, init_cell): + + init_h = fluid.layers.reshape( + init_hidden, shape=[self.num_layers, -1, self.hidden_size]) + + init_c = fluid.layers.reshape( + init_cell, shape=[self.num_layers, -1, self.hidden_size]) + + x_emb = self.embedding(input) + + x_emb = fluid.layers.reshape( + x_emb, shape=[-1, self.num_steps, self.hidden_size]) + if self.dropout is not None and self.dropout > 0.0: + x_emb = fluid.layers.dropout( + x_emb, + dropout_prob=self.dropout, + dropout_implementation='upscale_in_train') + rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h, + init_c) + + projection = fluid.layers.matmul(rnn_out, self.softmax_weight) + projection = fluid.layers.elementwise_add(projection, self.softmax_bias) + + loss = fluid.layers.softmax_with_cross_entropy( + logits=projection, label=label, soft_label=False) + loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps]) + loss = fluid.layers.reduce_mean(loss, dim=[0]) + loss = fluid.layers.reduce_sum(loss) + + return loss, last_hidden, last_cell + + def debug_emb(self): + + np.save("emb_grad", self.x_emb.gradient()) + + +def train_ptb_lm(): + args = parse_args() + + # check if set use_gpu=True in paddlepaddle cpu version + model_check.check_cuda(args.use_gpu) + + place = core.CPUPlace() + if args.use_gpu: + place = fluid.CUDAPlace(0) + dev_count = fluid.core.get_cuda_device_count() + else: + place = fluid.CPUPlace() + dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + + # check if paddlepaddle version is satisfied + model_check.check_version() + + model_type = args.model_type + + vocab_size = 10000 + if model_type == "test": + num_layers = 1 + batch_size = 2 + hidden_size = 10 + num_steps = 3 + init_scale = 0.1 + max_grad_norm = 5.0 + epoch_start_decay = 1 + max_epoch = 1 + dropout = 0.0 + lr_decay = 0.5 + base_learning_rate = 1.0 + elif model_type == "small": + num_layers = 2 + batch_size = 20 + hidden_size = 200 + num_steps = 20 + init_scale = 0.1 + max_grad_norm = 5.0 + epoch_start_decay = 4 + max_epoch = 13 + dropout = 0.0 + lr_decay = 0.5 + base_learning_rate = 1.0 + elif model_type == "medium": + num_layers = 2 + batch_size = 20 + 
hidden_size = 650 + num_steps = 35 + init_scale = 0.05 + max_grad_norm = 5.0 + epoch_start_decay = 6 + max_epoch = 39 + dropout = 0.5 + lr_decay = 0.8 + base_learning_rate = 1.0 + elif model_type == "large": + num_layers = 2 + batch_size = 20 + hidden_size = 1500 + num_steps = 35 + init_scale = 0.04 + max_grad_norm = 10.0 + epoch_start_decay = 14 + max_epoch = 55 + dropout = 0.65 + lr_decay = 1.0 / 1.15 + base_learning_rate = 1.0 + else: + print("model type not support") + return + + with fluid.dygraph.guard(place): + if args.ce: + print("ce mode") + seed = 33 + np.random.seed(seed) + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + max_epoch = 1 + ptb_model = PtbModel( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + num_steps=num_steps, + init_scale=init_scale, + dropout=dropout) + + if args.init_from_pretrain_model: + if not os.path.exists(args.init_from_pretrain_model + '.pdparams'): + print(args.init_from_pretrain_model) + raise Warning("The pretrained params do not exist.") + return + fluid.load_dygraph(args.init_from_pretrain_model) + print("finish initing model from pretrained params from %s" % + (args.init_from_pretrain_model)) + + dy_param_updated = dict() + dy_param_init = dict() + dy_loss = None + last_hidden = None + last_cell = None + + data_path = args.data_path + print("begin to load data") + ptb_data = reader.get_ptb_data(data_path) + print("finished load data") + train_data, valid_data, test_data = ptb_data + + batch_len = len(train_data) // batch_size + total_batch_size = (batch_len - 1) // num_steps + log_interval = 200 + + bd = [] + lr_arr = [1.0] + for i in range(1, max_epoch): + bd.append(total_batch_size * i) + new_lr = base_learning_rate * (lr_decay** + max(i + 1 - epoch_start_decay, 0.0)) + lr_arr.append(new_lr) + + grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm) + sgd = SGDOptimizer( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr_arr), + parameter_list=ptb_model.parameters(), + grad_clip=grad_clip) + + def reader_decorator(reader): + def __reader__(): + for item in reader: + x_data = item[0].reshape((-1, num_steps, 1)) + y_data = item[1].reshape((-1, num_steps, 1)) + yield x_data, y_data + + return __reader__ + + def eval(model, data): + print("begin to eval") + total_loss = 0.0 + iters = 0.0 + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + + model.eval() + train_data_iter = reader_decorator( + reader.get_data_iter(data, batch_size, num_steps)) + + eval_data_loader = fluid.io.DataLoader.from_generator(capacity=200) + eval_data_loader.set_batch_generator(train_data_iter, places=place) + + for batch_id, batch in enumerate(eval_data_loader): + x, y = batch + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + + out_loss = dy_loss.numpy() + + init_hidden_data = last_hidden.numpy() + init_cell_data = last_cell.numpy() + + total_loss += out_loss + iters += num_steps + + print("eval finished") + ppl = np.exp(total_loss / iters) + print("ppl ", batch_id, ppl[0]) + + ce_time = [] + ce_ppl = [] + + total_batch_num = 0 #this is for benchmark + for epoch_id in range(max_epoch): + ptb_model.train() + total_loss = 0.0 + iters = 0.0 + init_hidden_data = np.zeros( + (num_layers, batch_size, hidden_size), 
dtype='float32') + init_cell_data = np.zeros( + (num_layers, batch_size, hidden_size), dtype='float32') + + train_data_iter = reader_decorator( + reader.get_data_iter(train_data, batch_size, num_steps)) + + train_data_loader = fluid.io.DataLoader.from_generator(capacity=200) + train_data_loader.set_batch_generator(train_data_iter, places=place) + + init_hidden = to_variable(init_hidden_data) + init_cell = to_variable(init_cell_data) + start_time = time.time() + for batch_id, batch in enumerate(train_data_loader): + if args.max_iter and total_batch_num == args.max_iter: + return + batch_start = time.time() + x, y = batch + + dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, + init_cell) + init_hidden = last_hidden.detach() + init_cell = last_cell.detach() + out_loss = dy_loss.numpy() + + dy_loss.backward() + sgd.minimize(dy_loss) + + ptb_model.clear_gradients() + total_loss += out_loss + batch_end = time.time() + train_batch_cost = batch_end - batch_start + iters += num_steps + total_batch_num = total_batch_num + 1 #this is for benchmark + + if batch_id > 0 and batch_id % log_interval == 0: + ppl = np.exp(total_loss / iters) + print("-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch cost: %.5f" % + (epoch_id, batch_id, ppl[0], + sgd._global_learning_rate().numpy(), out_loss, train_batch_cost)) + + print("one epoch finished", epoch_id) + print("time cost ", time.time() - start_time) + ppl = np.exp(total_loss / iters) + ce_time.append(time.time() - start_time) + ce_ppl.append(ppl[0]) + print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0])) + + if batch_size <= 20 and epoch_id == 0 and ppl[0] > 1000: + # for bad init, after first epoch, the loss is over 1000 + # no more need to continue + print( + "Parameters are randomly initialized and not good this time because the loss is over 1000 after the first epoch." 
+ ) + print("Abort this training process and please start again.") + return + + save_model_dir = os.path.join(args.save_model_dir, + str(epoch_id), 'params') + fluid.save_dygraph(ptb_model.state_dict(), save_model_dir) + print("Saved model to: %s.\n" % save_model_dir) + + eval(ptb_model, valid_data) + + if args.ce: + _ppl = 0 + _time = 0 + try: + _time = ce_time[-1] + _ppl = ce_ppl[-1] + except: + print("ce info error") + print("kpis\ttrain_duration_card%s\t%s" % (dev_count, _time)) + print("kpis\ttrain_ppl_card%s\t%f" % (dev_count, _ppl)) + + eval(ptb_model, test_data) + + +train_ptb_lm() diff --git a/dygraph/resnet/train.py b/dygraph/resnet/train.py index e92a39bde5bce633dda9452d5c0dad3399092248..5339cadcc88954f63d482f535ad72e5305f30490 100644 --- a/dygraph/resnet/train.py +++ b/dygraph/resnet/train.py @@ -81,7 +81,6 @@ def optimizer_setting(parameter_list=None): boundaries=bd, values=lr), momentum=momentum_rate, regularization=fluid.regularizer.L2Decay(l2_decay)) - return optimizer @@ -116,11 +115,7 @@ class ConvBNLayer(fluid.dygraph.Layer): class BottleneckBlock(fluid.dygraph.Layer): - def __init__(self, - num_channels, - num_filters, - stride, - shortcut=True): + def __init__(self, num_channels, num_filters, stride, shortcut=True): super(BottleneckBlock, self).__init__() self.conv0 = ConvBNLayer( @@ -186,16 +181,9 @@ class ResNet(fluid.dygraph.Layer): num_filters = [64, 128, 256, 512] self.conv = ConvBNLayer( - num_channels=3, - num_filters=64, - filter_size=7, - stride=2, - act='relu') + num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu') self.pool2d_max = Pool2D( - pool_size=3, - pool_stride=2, - pool_padding=1, - pool_type='max') + pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') self.bottleneck_block_list = [] for block in range(len(depth)): @@ -220,11 +208,12 @@ class ResNet(fluid.dygraph.Layer): import math stdv = 1.0 / math.sqrt(2048 * 1.0) - self.out = Linear(self.pool2d_avg_output, - class_dim, - act='softmax', - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv))) + self.out = Linear( + self.pool2d_avg_output, + class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) def forward(self, inputs): y = self.conv(inputs) @@ -237,6 +226,16 @@ class ResNet(fluid.dygraph.Layer): return y +def reader_decorator(reader): + def __reader__(): + for item in reader(): + img = np.array(item[0]).astype('float32').reshape(3, 224, 224) + label = np.array(item[1]).astype('int64').reshape(1) + yield img, label + + return __reader__ + + def eval(model, data): model.eval() @@ -245,15 +244,8 @@ def eval(model, data): total_acc5 = 0.0 total_sample = 0 for batch_id, data in enumerate(data()): - dy_x_data = np.array( - [x[0].reshape(3, 224, 224) for x in data]).astype('float32') - if len(np.array([x[1] for x in data]).astype('int64')) != batch_size: - continue - y_data = np.array([x[1] for x in data]).astype('int64').reshape( - batch_size, 1) - - img = to_variable(dy_x_data) - label = to_variable(y_data) + img = data[0] + label = data[1] label.stop_gradient = True out = model(img) @@ -303,13 +295,24 @@ def train_resnet(): resnet = fluid.dygraph.parallel.DataParallel(resnet, strategy) train_reader = paddle.batch( - paddle.dataset.flowers.train(use_xmap=False), batch_size=batch_size) + reader_decorator(paddle.dataset.flowers.train(use_xmap=True)), + batch_size=batch_size, + drop_last=True) + if args.use_data_parallel: train_reader = 
fluid.contrib.reader.distributed_batch_reader( train_reader) test_reader = paddle.batch( - paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size) + reader_decorator(paddle.dataset.flowers.test(use_xmap=True)), + batch_size=batch_size, + drop_last=True) + + train_loader = fluid.io.DataLoader.from_generator(capacity=10) + train_loader.set_sample_list_generator(train_reader, places=place) + + test_loader = fluid.io.DataLoader.from_generator(capacity=10) + test_loader.set_sample_list_generator(test_reader, places=place) #file_name = './model/epoch_0.npz' #model_data = np.load( file_name ) @@ -331,23 +334,13 @@ def train_resnet(): print("load finished") - for batch_id, data in enumerate(train_reader()): - + for batch_id, data in enumerate(train_loader()): #NOTE: used in benchmark if args.max_iter and total_batch_num == args.max_iter: return batch_start = time.time() - dy_x_data = np.array( - [x[0].reshape(3, 224, 224) for x in data]).astype('float32') - if len(np.array([x[1] - for x in data]).astype('int64')) != batch_size: - continue - y_data = np.array([x[1] for x in data]).astype('int64').reshape( - -1, 1) - - img = to_variable(dy_x_data) - label = to_variable(y_data) + img, label = data label.stop_gradient = True out = resnet(img) @@ -390,16 +383,14 @@ def train_resnet(): (eop, batch_id, total_loss / total_sample, \ total_acc1 / total_sample, total_acc5 / total_sample)) resnet.eval() - eval(resnet, test_reader) + eval(resnet, test_loader) save_parameters = (not args.use_data_parallel) or ( args.use_data_parallel and fluid.dygraph.parallel.Env().local_rank == 0) if save_parameters: - fluid.save_dygraph(resnet.state_dict(), - 'resnet_params') + fluid.save_dygraph(resnet.state_dict(), 'resnet_params') if __name__ == '__main__': - train_resnet() diff --git a/dygraph/se_resnet/train.py b/dygraph/se_resnet/train.py index 67b9dacf2e07e19e07d466683769641830a6fd36..0ba5de46f83dfcd3f3e820ce11aedc9da88925ff 100644 --- a/dygraph/se_resnet/train.py +++ b/dygraph/se_resnet/train.py @@ -169,8 +169,7 @@ class BottleneckBlock(fluid.dygraph.Layer): act=None) self.scale = SqueezeExcitation( - num_channels=num_filters * 2, - reduction_ratio=reduction_ratio) + num_channels=num_filters * 2, reduction_ratio=reduction_ratio) if not shortcut: self.short = ConvBNLayer( @@ -219,10 +218,7 @@ class SeResNeXt(fluid.dygraph.Layer): stride=2, act='relu') self.pool = Pool2D( - pool_size=3, - pool_stride=2, - pool_padding=1, - pool_type='max') + pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') elif layers == 101: cardinality = 32 reduction_ratio = 16 @@ -235,10 +231,7 @@ class SeResNeXt(fluid.dygraph.Layer): stride=2, act='relu') self.pool = Pool2D( - pool_size=3, - pool_stride=2, - pool_padding=1, - pool_type='max') + pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') elif layers == 152: cardinality = 64 reduction_ratio = 16 @@ -263,10 +256,7 @@ class SeResNeXt(fluid.dygraph.Layer): stride=1, act='relu') self.pool = Pool2D( - pool_size=3, - pool_stride=2, - pool_padding=1, - pool_type='max') + pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') self.bottleneck_block_list = [] num_channels = 64 @@ -294,10 +284,11 @@ class SeResNeXt(fluid.dygraph.Layer): self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 2 * 1 * 1 - self.out = Linear(self.pool2d_avg_output, - class_dim, - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv))) + self.out = Linear( + self.pool2d_avg_output, + class_dim, + param_attr=fluid.param_attr.ParamAttr( + 
initializer=fluid.initializer.Uniform(-stdv, stdv))) def forward(self, inputs): if self.layers == 50 or self.layers == 101: @@ -318,6 +309,16 @@ class SeResNeXt(fluid.dygraph.Layer): return y +def reader_decorator(reader): + def __reader__(): + for item in reader(): + img = np.array(item[0]).astype('float32').reshape(3, 224, 224) + label = np.array(item[1]).astype('int64').reshape(1) + yield img, label + + return __reader__ + + def eval(model, data): model.eval() @@ -327,15 +328,7 @@ def eval(model, data): total_acc5 = 0.0 total_sample = 0 for batch_id, data in enumerate(data()): - dy_x_data = np.array( - [x[0].reshape(3, 224, 224) for x in data]).astype('float32') - if len(np.array([x[1] for x in data]).astype('int64')) != batch_size: - continue - y_data = np.array([x[1] for x in data]).astype('int64').reshape( - batch_size, 1) - - img = to_variable(dy_x_data) - label = to_variable(y_data) + img, label = data label.stop_gradient = True out = model(img) @@ -389,29 +382,29 @@ def train(): se_resnext = fluid.dygraph.parallel.DataParallel(se_resnext, strategy) train_reader = paddle.batch( - paddle.dataset.flowers.train(use_xmap=False), + reader_decorator(paddle.dataset.flowers.train(use_xmap=False)), batch_size=batch_size, drop_last=True) if args.use_data_parallel: train_reader = fluid.contrib.reader.distributed_batch_reader( train_reader) test_reader = paddle.batch( - paddle.dataset.flowers.test(use_xmap=False), batch_size=32) + reader_decorator(paddle.dataset.flowers.test(use_xmap=False)), + batch_size=32) + + train_loader = fluid.io.DataLoader.from_generator(capacity=10) + train_loader.set_sample_list_generator(train_reader, places=place) + + test_loader = fluid.io.DataLoader.from_generator(capacity=10) + test_loader.set_sample_list_generator(test_reader, places=place) for epoch_id in range(epoch_num): total_loss = 0.0 total_acc1 = 0.0 total_acc5 = 0.0 total_sample = 0 - for batch_id, data in enumerate(train_reader()): - - dy_x_data = np.array([x[0].reshape(3, 224, 224) - for x in data]).astype('float32') - y_data = np.array([x[1] for x in data]).astype('int64').reshape( - batch_size, 1) - - img = to_variable(dy_x_data) - label = to_variable(y_data) + for batch_id, data in enumerate(train_loader()): + img, label = data label.stop_gradient = True out = se_resnext(img) @@ -454,7 +447,7 @@ def train(): (epoch_id, batch_id, total_loss / total_sample, \ total_acc1 / total_sample, total_acc5 / total_sample)) se_resnext.eval() - eval(se_resnext, test_reader) + eval(se_resnext, test_loader) se_resnext.train()
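
The recurring change above (dygraph/mnist, dygraph/resnet, dygraph/se_resnet) replaces per-batch numpy stacking plus to_variable() with a sample-level reader wrapped by paddle.batch and fed into fluid.io.DataLoader. A minimal sketch of that pattern, assuming the fluid 1.x dygraph API used in this patch (batch size and loop body below are illustrative; shapes follow the mnist variant):

import numpy as np
import paddle
import paddle.fluid as fluid

def reader_decorator(reader):
    # Wrap a paddle.dataset-style reader so every sample comes out as a
    # pair of fixed-shape ndarrays that DataLoader can batch and place.
    def __reader__():
        for item in reader():
            img = np.array(item[0]).astype('float32').reshape(1, 28, 28)
            label = np.array(item[1]).astype('int64').reshape(1)
            yield img, label

    return __reader__

place = fluid.CPUPlace()
with fluid.dygraph.guard(place):
    train_reader = paddle.batch(
        reader_decorator(paddle.dataset.mnist.train()),
        batch_size=64,
        drop_last=True)
    train_loader = fluid.io.DataLoader.from_generator(capacity=10)
    train_loader.set_sample_list_generator(train_reader, places=place)
    for batch_id, data in enumerate(train_loader()):
        img, label = data  # already dygraph variables on `place`
        label.stop_gradient = True
        break  # one batch is enough for the sketch

Note that drop_last=True plays the role of the old manual `!= batch_size` checks in the deleted eval loops: the trailing short batch is dropped so tensor shapes stay fixed.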
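
ptb_dy.py uses the batch-level variant of the same idea: reader.get_data_iter() already yields whole (x, y) batches, so the rewrite wires it in with set_batch_generator instead of set_sample_list_generator. A sketch under the same fluid 1.x assumptions; make_batches below is a hypothetical stand-in for reader.get_data_iter(), not code from the patch:

import numpy as np
import paddle.fluid as fluid

num_steps, batch_size, vocab_size = 20, 4, 100

def make_batches():
    # Hypothetical stand-in for reader.get_data_iter(data, batch_size,
    # num_steps): yields one whole batch per step, reshaped to
    # [batch_size, num_steps, 1] the way the patched reader_decorator does.
    def __reader__():
        for _ in range(5):
            x = np.random.randint(
                0, vocab_size, (batch_size, num_steps, 1)).astype('int64')
            y = np.random.randint(
                0, vocab_size, (batch_size, num_steps, 1)).astype('int64')
            yield x, y

    return __reader__

place = fluid.CPUPlace()
with fluid.dygraph.guard(place):
    loader = fluid.io.DataLoader.from_generator(capacity=200)
    loader.set_batch_generator(make_batches(), places=place)
    for x, y in loader():
        pass  # x and y arrive as dygraph variables, as in the training loop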
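
The mobilenet edits approach the same migration from the data side: create_data_loader() now returns only the DataLoader (the fluid.data feed variables are gone), and process_image() returns the label as a one-element list. After batching, those lists stack into an int64 array of shape [batch_size, 1], which is the hard-label shape fluid.layers.cross_entropy expects and what the deleted reshape in train.py used to produce by hand. A small illustration of that shape effect (hypothetical sample data, not code from the patch):

import numpy as np

# (img, [label]) samples, as process_image now returns them in train/val mode.
samples = [(np.zeros((3, 224, 224), dtype='float32'), [7]),
           (np.zeros((3, 224, 224), dtype='float32'), [3])]
labels = np.array([s[1] for s in samples]).astype('int64')
print(labels.shape)  # (2, 1); with bare int labels this would be (2,)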