Unverified commit b0239e3a authored by Chen Weihang, committed by GitHub

change some model using data loader (#4595)

Parent edf1a872
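The diff below converts several dygraph example models (MNIST, MobileNet, PTB LM, ResNet, SE-ResNeXt) from hand-rolled numpy batching to fluid.io.DataLoader. A minimal sketch of the recurring pattern, using the MNIST shapes from this commit (the surrounding loop is illustrative, not taken verbatim from any one file):

    import numpy as np
    import paddle
    import paddle.fluid as fluid

    def reader_decorator(reader):
        # Wrap a raw sample reader so every item is an (img, label) pair of
        # numpy arrays with the dtype/shape the model expects.
        def __reader__():
            for item in reader():
                img = np.array(item[0]).astype('float32').reshape(1, 28, 28)
                label = np.array(item[1]).astype('int64').reshape(1)
                yield img, label
        return __reader__

    place = fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        train_reader = paddle.batch(
            reader_decorator(paddle.dataset.mnist.train()),
            batch_size=64, drop_last=True)
        train_loader = fluid.io.DataLoader.from_generator(capacity=10)
        train_loader.set_sample_list_generator(train_reader, places=place)
        for batch_id, data in enumerate(train_loader()):
            img, label = data  # already variables on `place`; no to_variable()
            break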
......@@ -99,11 +99,13 @@ class MNIST(fluid.dygraph.Layer):
self.pool_2_shape = 50 * 4 * 4
SIZE = 10
scale = (2.0 / (self.pool_2_shape**2 * SIZE))**0.5
self._fc = Linear(self.pool_2_shape, 10,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
act="softmax")
self._fc = Linear(
self.pool_2_shape,
10,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
act="softmax")
def forward(self, inputs, label=None):
x = self._simple_img_conv_pool_1(inputs)
......@@ -117,17 +119,21 @@ class MNIST(fluid.dygraph.Layer):
return x
def reader_decorator(reader):
def __reader__():
for item in reader():
img = np.array(item[0]).astype('float32').reshape(1, 28, 28)
label = np.array(item[1]).astype('int64').reshape(1)
yield img, label
return __reader__
def test_mnist(reader, model, batch_size):
acc_set = []
avg_loss_set = []
for batch_id, data in enumerate(reader()):
dy_x_data = np.array([x[0].reshape(1, 28, 28)
for x in data]).astype('float32')
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape(batch_size, 1)
img = to_variable(dy_x_data)
label = to_variable(y_data)
img, label = data
label.stop_gradient = True
prediction, acc = model(img, label)
loss = fluid.layers.cross_entropy(input=prediction, label=label)
......@@ -187,28 +193,33 @@ def train_mnist(args):
if args.use_data_parallel:
strategy = fluid.dygraph.parallel.prepare_context()
mnist = MNIST()
adam = AdamOptimizer(learning_rate=0.001, parameter_list=mnist.parameters())
adam = AdamOptimizer(
learning_rate=0.001, parameter_list=mnist.parameters())
if args.use_data_parallel:
mnist = fluid.dygraph.parallel.DataParallel(mnist, strategy)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=BATCH_SIZE, drop_last=True)
reader_decorator(paddle.dataset.mnist.train()),
batch_size=BATCH_SIZE,
drop_last=True)
if args.use_data_parallel:
train_reader = fluid.contrib.reader.distributed_batch_reader(
train_reader)
test_reader = paddle.batch(
paddle.dataset.mnist.test(), batch_size=BATCH_SIZE, drop_last=True)
reader_decorator(paddle.dataset.mnist.test()),
batch_size=BATCH_SIZE,
drop_last=True)
train_loader = fluid.io.DataLoader.from_generator(capacity=10)
train_loader.set_sample_list_generator(train_reader, places=place)
test_loader = fluid.io.DataLoader.from_generator(capacity=10)
test_loader.set_sample_list_generator(test_reader, places=place)
for epoch in range(epoch_num):
for batch_id, data in enumerate(train_reader()):
dy_x_data = np.array([x[0].reshape(1, 28, 28)
for x in data]).astype('float32')
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape(-1, 1)
img = to_variable(dy_x_data)
label = to_variable(y_data)
for batch_id, data in enumerate(train_loader()):
img, label = data
label.stop_gradient = True
cost, acc = mnist(img, label)
......@@ -231,7 +242,7 @@ def train_mnist(args):
epoch, batch_id, avg_loss.numpy()))
mnist.eval()
test_cost, test_acc = test_mnist(test_reader, mnist, BATCH_SIZE)
test_cost, test_acc = test_mnist(test_loader, mnist, BATCH_SIZE)
mnist.train()
if args.ce:
print("kpis\ttest_acc\t%s" % test_acc)
......@@ -244,7 +255,7 @@ def train_mnist(args):
fluid.dygraph.parallel.Env().local_rank == 0)
if save_parameters:
fluid.save_dygraph(mnist.state_dict(), "save_temp")
print("checkpoint saved")
inference_mnist()
......
......@@ -239,7 +239,7 @@ def process_image(sample, settings, mode, color_jitter, rotate):
img /= img_std
if mode == 'train' or mode == 'val':
return (img, sample[1])
return (img, [sample[1]])
elif mode == 'test':
return (img, )
......
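The process_image change above wraps the label in a one-element list, so each sample's label has shape (1,) and a batch stacks to [N, 1] rather than [N]; presumably this matches the [N, 1] label shape the downstream loss and accuracy ops expect once the reader output goes through DataLoader. A small illustration of the shape difference:

    import numpy as np

    label = 7
    scalar_batch = np.array([label] * 4).astype('int64')    # old: shape (4,)
    listed_batch = np.array([[label]] * 4).astype('int64')  # new: shape (4, 1)
    print(scalar_batch.shape, listed_batch.shape)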
......@@ -116,10 +116,8 @@ def train_mobilenet():
optimizer.set_dict(opti_dict)
# 3. reader
train_data_loader, train_data = utility.create_data_loader(
is_train=True, args=args)
test_data_loader, test_data = utility.create_data_loader(
is_train=False, args=args)
train_data_loader = utility.create_data_loader(is_train=True, args=args)
test_data_loader = utility.create_data_loader(is_train=False, args=args)
num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
imagenet_reader = reader.ImageNetReader(seed=0, place_num=place_num)
train_reader = imagenet_reader.train(settings=args)
......@@ -145,8 +143,6 @@ def train_mobilenet():
t1 = time.time()
if args.max_iter and total_batch_num == args.max_iter:
return
label = to_variable(label.numpy().astype('int64').reshape(
int(args.batch_size // place_num), 1))
t_start = time.time()
# 4.1.1 call net()
......
......@@ -309,32 +309,14 @@ def create_data_loader(is_train, args):
Returns:
data_loader and the input data of net,
"""
image_shape = [int(m) for m in args.image_shape.split(",")]
feed_image = fluid.data(
name="feed_image",
shape=[None] + image_shape,
dtype="float32",
lod_level=0)
feed_label = fluid.data(
name="feed_label", shape=[None, 1], dtype="int64", lod_level=0)
feed_y_a = fluid.data(
name="feed_y_a", shape=[None, 1], dtype="int64", lod_level=0)
if is_train and args.use_mixup:
feed_y_b = fluid.data(
name="feed_y_b", shape=[None, 1], dtype="int64", lod_level=0)
feed_lam = fluid.data(
name="feed_lam", shape=[None, 1], dtype="float32", lod_level=0)
data_loader = fluid.io.DataLoader.from_generator(
capacity=64,
use_double_buffer=True,
iterable=True,
return_list=True)
return data_loader, [feed_image, feed_y_a, feed_y_b, feed_lam]
return data_loader
else:
data_loader = fluid.io.DataLoader.from_generator(
capacity=64,
......@@ -342,7 +324,7 @@ def create_data_loader(is_train, args):
iterable=True,
return_list=True)
return data_loader, [feed_image, feed_label]
return data_loader
def print_info(pass_id, batch_id, print_step, metrics, time_info, info_mode):
......
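With return_list=True, the dygraph DataLoader yields each batch directly as a list of variables, so create_data_loader no longer needs to build or return fluid.data feed variables; it now returns only the loader. A self-contained sketch of the simplified helper and a call site (the dummy reader and shapes are illustrative assumptions, not from the commit):

    import numpy as np
    import paddle.fluid as fluid

    def create_data_loader(is_train):
        # Simplified helper: only the loader is returned; no feed variables.
        return fluid.io.DataLoader.from_generator(
            capacity=64, use_double_buffer=True, iterable=True,
            return_list=True)

    place = fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        loader = create_data_loader(is_train=True)

        def dummy_batches():
            for _ in range(2):
                img = np.zeros((3, 224, 224), dtype='float32')
                label = np.array([0], dtype='int64')
                yield [(img, label)]  # one batch: a list of (img, label) samples

        loader.set_sample_list_generator(dummy_batches, places=place)
        for img, label in loader():
            print(img.shape, label.shape)  # [1, 3, 224, 224], [1, 1]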
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.dygraph.nn import Embedding
import paddle.fluid.framework as framework
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.base import to_variable
import numpy as np
import six
import multiprocessing
import reader
import model_check
import time
from args import *
#import fluid.clip as clip
#from fluid.clip import *
import sys
if sys.version[0] == '2':
reload(sys)
sys.setdefaultencoding("utf-8")
class SimpleLSTMRNN(fluid.Layer):
def __init__(self,
hidden_size,
num_steps,
num_layers=2,
init_scale=0.1,
dropout=None):
super(SimpleLSTMRNN, self).__init__()
self._hidden_size = hidden_size
self._num_layers = num_layers
self._init_scale = init_scale
self._dropout = dropout
self._num_steps = num_steps
self.cell_array = []
self.hidden_array = []
self.weight_1_arr = []
self.weight_2_arr = []
self.bias_arr = []
self.mask_array = []
for i in range(self._num_layers):
weight_1 = self.create_parameter(
attr=fluid.ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size * 2, self._hidden_size * 4],
dtype="float32",
default_initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale))
self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1))
bias_1 = self.create_parameter(
attr=fluid.ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size * 4],
dtype="float32",
default_initializer=fluid.initializer.Constant(0.0))
self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1))
def forward(self, input_embedding, init_hidden=None, init_cell=None):
cell_array = []
hidden_array = []
for i in range(self._num_layers):
hidden_array.append(init_hidden[i])
cell_array.append(init_cell[i])
res = []
for index in range(self._num_steps):
step_input = input_embedding[:, index, :]
for k in range(self._num_layers):
pre_hidden = hidden_array[k]
pre_cell = cell_array[k]
weight_1 = self.weight_1_arr[k]
bias = self.bias_arr[k]
nn = fluid.layers.concat([step_input, pre_hidden], 1)
gate_input = fluid.layers.matmul(x=nn, y=weight_1)
gate_input = fluid.layers.elementwise_add(gate_input, bias)
i, j, f, o = fluid.layers.split(
gate_input, num_or_sections=4, dim=-1)
c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid(
i) * fluid.layers.tanh(j)
m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o)
hidden_array[k] = m
cell_array[k] = c
step_input = m
if self._dropout is not None and self._dropout > 0.0:
step_input = fluid.layers.dropout(
step_input,
dropout_prob=self._dropout,
dropout_implementation='upscale_in_train')
res.append(step_input)
real_res = fluid.layers.concat(res, 1)
real_res = fluid.layers.reshape(real_res, [-1, self._num_steps, self._hidden_size])
last_hidden = fluid.layers.concat(hidden_array, 1)
last_hidden = fluid.layers.reshape(
last_hidden, shape=[-1, self._num_layers, self._hidden_size])
last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2])
last_cell = fluid.layers.concat(cell_array, 1)
last_cell = fluid.layers.reshape(
last_cell, shape=[-1, self._num_layers, self._hidden_size])
last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2])
return real_res, last_hidden, last_cell
class PtbModel(fluid.Layer):
def __init__(self,
hidden_size,
vocab_size,
num_layers=2,
num_steps=20,
init_scale=0.1,
dropout=None):
super(PtbModel, self).__init__()
self.hidden_size = hidden_size
self.vocab_size = vocab_size
self.init_scale = init_scale
self.num_layers = num_layers
self.num_steps = num_steps
self.dropout = dropout
self.simple_lstm_rnn = SimpleLSTMRNN(
hidden_size,
num_steps,
num_layers=num_layers,
init_scale=init_scale,
dropout=dropout)
self.embedding = Embedding(
size=[vocab_size, hidden_size],
dtype='float32',
is_sparse=False,
param_attr=fluid.ParamAttr(
name='embedding_para',
initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale)))
self.softmax_weight = self.create_parameter(
attr=fluid.ParamAttr(),
shape=[self.hidden_size, self.vocab_size],
dtype="float32",
default_initializer=fluid.initializer.UniformInitializer(
low=-self.init_scale, high=self.init_scale))
self.softmax_bias = self.create_parameter(
attr=fluid.ParamAttr(),
shape=[self.vocab_size],
dtype="float32",
default_initializer=fluid.initializer.UniformInitializer(
low=-self.init_scale, high=self.init_scale))
def build_once(self, input, label, init_hidden, init_cell):
pass
def forward(self, input, label, init_hidden, init_cell):
init_h = fluid.layers.reshape(
init_hidden, shape=[self.num_layers, -1, self.hidden_size])
init_c = fluid.layers.reshape(
init_cell, shape=[self.num_layers, -1, self.hidden_size])
x_emb = self.embedding(input)
x_emb = fluid.layers.reshape(
x_emb, shape=[-1, self.num_steps, self.hidden_size])
if self.dropout is not None and self.dropout > 0.0:
x_emb = fluid.layers.dropout(
x_emb,
dropout_prob=self.dropout,
dropout_implementation='upscale_in_train')
rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h,
init_c)
projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
loss = fluid.layers.softmax_with_cross_entropy(
logits=projection, label=label, soft_label=False)
loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = fluid.layers.reduce_sum(loss)
return loss, last_hidden, last_cell
def debug_emb(self):
np.save("emb_grad", self.x_emb.gradient())
def train_ptb_lm():
args = parse_args()
# check if set use_gpu=True in paddlepaddle cpu version
model_check.check_cuda(args.use_gpu)
place = core.CPUPlace()
if args.use_gpu:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
# check if paddlepaddle version is satisfied
model_check.check_version()
model_type = args.model_type
vocab_size = 10000
if model_type == "test":
num_layers = 1
batch_size = 2
hidden_size = 10
num_steps = 3
init_scale = 0.1
max_grad_norm = 5.0
epoch_start_decay = 1
max_epoch = 1
dropout = 0.0
lr_decay = 0.5
base_learning_rate = 1.0
elif model_type == "small":
num_layers = 2
batch_size = 20
hidden_size = 200
num_steps = 20
init_scale = 0.1
max_grad_norm = 5.0
epoch_start_decay = 4
max_epoch = 13
dropout = 0.0
lr_decay = 0.5
base_learning_rate = 1.0
elif model_type == "medium":
num_layers = 2
batch_size = 20
hidden_size = 650
num_steps = 35
init_scale = 0.05
max_grad_norm = 5.0
epoch_start_decay = 6
max_epoch = 39
dropout = 0.5
lr_decay = 0.8
base_learning_rate = 1.0
elif model_type == "large":
num_layers = 2
batch_size = 20
hidden_size = 1500
num_steps = 35
init_scale = 0.04
max_grad_norm = 10.0
epoch_start_decay = 14
max_epoch = 55
dropout = 0.65
lr_decay = 1.0 / 1.15
base_learning_rate = 1.0
else:
print("model type not support")
return
with fluid.dygraph.guard(place):
if args.ce:
print("ce mode")
seed = 33
np.random.seed(seed)
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
max_epoch = 1
ptb_model = PtbModel(
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
num_steps=num_steps,
init_scale=init_scale,
dropout=dropout)
if args.init_from_pretrain_model:
if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
print(args.init_from_pretrain_model)
raise Warning("The pretrained params do not exist.")
return
fluid.load_dygraph(args.init_from_pretrain_model)
print("finish initing model from pretrained params from %s" %
(args.init_from_pretrain_model))
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
last_hidden = None
last_cell = None
data_path = args.data_path
print("begin to load data")
ptb_data = reader.get_ptb_data(data_path)
print("finished load data")
train_data, valid_data, test_data = ptb_data
batch_len = len(train_data) // batch_size
total_batch_size = (batch_len - 1) // num_steps
log_interval = 200
bd = []
lr_arr = [1.0]
for i in range(1, max_epoch):
bd.append(total_batch_size * i)
new_lr = base_learning_rate * (lr_decay**
max(i + 1 - epoch_start_decay, 0.0))
lr_arr.append(new_lr)
grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
sgd = SGDOptimizer(
learning_rate=fluid.layers.piecewise_decay(boundaries=bd, values=lr_arr),
parameter_list=ptb_model.parameters(),
grad_clip=grad_clip)
def eval(model, data):
print("begin to eval")
total_loss = 0.0
iters = 0.0
init_hidden_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
init_cell_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
model.eval()
train_data_iter = reader.get_data_iter(data, batch_size, num_steps)
for batch_id, batch in enumerate(train_data_iter):
x_data, y_data = batch
x_data = x_data.reshape((-1, num_steps, 1))
y_data = y_data.reshape((-1, num_steps, 1))
x = to_variable(x_data)
y = to_variable(y_data)
init_hidden = to_variable(init_hidden_data)
init_cell = to_variable(init_cell_data)
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
init_cell)
out_loss = dy_loss.numpy()
init_hidden_data = last_hidden.numpy()
init_cell_data = last_cell.numpy()
total_loss += out_loss
iters += num_steps
print("eval finished")
ppl = np.exp(total_loss / iters)
print("ppl ", batch_id, ppl[0])
ce_time = []
ce_ppl = []
total_batch_num = 0 #this is for benchmark
for epoch_id in range(max_epoch):
ptb_model.train()
total_loss = 0.0
iters = 0.0
init_hidden_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
init_cell_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
train_data_iter = reader.get_data_iter(train_data, batch_size,
num_steps)
init_hidden = to_variable(init_hidden_data)
init_cell = to_variable(init_cell_data)
start_time = time.time()
for batch_id, batch in enumerate(train_data_iter):
if args.max_iter and total_batch_num == args.max_iter:
return
batch_start = time.time()
x_data, y_data = batch
x_data = x_data.reshape((-1, num_steps, 1))
y_data = y_data.reshape((-1, num_steps, 1))
x = to_variable(x_data)
y = to_variable(y_data)
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
init_cell)
init_hidden = last_hidden.detach()
init_cell = last_cell.detach()
out_loss = dy_loss.numpy()
dy_loss.backward()
sgd.minimize(dy_loss)
ptb_model.clear_gradients()
total_loss += out_loss
batch_end = time.time()
train_batch_cost = batch_end - batch_start
iters += num_steps
total_batch_num = total_batch_num + 1 #this is for benchmark
if batch_id > 0 and batch_id % log_interval == 0:
ppl = np.exp(total_loss / iters)
print("-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch cost: %.5f" %
(epoch_id, batch_id, ppl[0],
sgd._global_learning_rate().numpy(), out_loss, train_batch_cost))
print("one epoch finished", epoch_id)
print("time cost ", time.time() - start_time)
ppl = np.exp(total_loss / iters)
ce_time.append(time.time() - start_time)
ce_ppl.append(ppl[0])
print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0]))
if batch_size <= 20 and epoch_id == 0 and ppl[0] > 1000:
# for bad init, after first epoch, the loss is over 1000
# no more need to continue
print("Parameters are randomly initialized and not good this time because the loss is over 1000 after the first epoch.")
print("Abort this training process and please start again.")
return
save_model_dir = os.path.join(args.save_model_dir,
str(epoch_id), 'params')
fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
print("Saved model to: %s.\n" % save_model_dir)
eval(ptb_model, valid_data)
if args.ce:
_ppl = 0
_time = 0
try:
_time = ce_time[-1]
_ppl = ce_ppl[-1]
except:
print("ce info error")
print("kpis\ttrain_duration_card%s\t%s" % (dev_count, _time))
print("kpis\ttrain_ppl_card%s\t%f" % (dev_count, _ppl))
eval(ptb_model, test_data)
train_ptb_lm()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.dygraph.nn import Embedding
import paddle.fluid.framework as framework
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.base import to_variable
import numpy as np
import six
import multiprocessing
import reader
import model_check
import time
from args import *
#import fluid.clip as clip
#from fluid.clip import *
import sys
if sys.version[0] == '2':
reload(sys)
sys.setdefaultencoding("utf-8")
class SimpleLSTMRNN(fluid.Layer):
def __init__(self,
hidden_size,
num_steps,
num_layers=2,
init_scale=0.1,
dropout=None):
super(SimpleLSTMRNN, self).__init__()
self._hidden_size = hidden_size
self._num_layers = num_layers
self._init_scale = init_scale
self._dropout = dropout
self._num_steps = num_steps
self.cell_array = []
self.hidden_array = []
self.weight_1_arr = []
self.weight_2_arr = []
self.bias_arr = []
self.mask_array = []
for i in range(self._num_layers):
weight_1 = self.create_parameter(
attr=fluid.ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size * 2, self._hidden_size * 4],
dtype="float32",
default_initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale))
self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1))
bias_1 = self.create_parameter(
attr=fluid.ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size * 4],
dtype="float32",
default_initializer=fluid.initializer.Constant(0.0))
self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1))
def forward(self, input_embedding, init_hidden=None, init_cell=None):
cell_array = []
hidden_array = []
for i in range(self._num_layers):
hidden_array.append(init_hidden[i])
cell_array.append(init_cell[i])
res = []
for index in range(self._num_steps):
step_input = input_embedding[:, index, :]
for k in range(self._num_layers):
pre_hidden = hidden_array[k]
pre_cell = cell_array[k]
weight_1 = self.weight_1_arr[k]
bias = self.bias_arr[k]
nn = fluid.layers.concat([step_input, pre_hidden], 1)
gate_input = fluid.layers.matmul(x=nn, y=weight_1)
gate_input = fluid.layers.elementwise_add(gate_input, bias)
i, j, f, o = fluid.layers.split(
gate_input, num_or_sections=4, dim=-1)
c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid(
i) * fluid.layers.tanh(j)
m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o)
hidden_array[k] = m
cell_array[k] = c
step_input = m
if self._dropout is not None and self._dropout > 0.0:
step_input = fluid.layers.dropout(
step_input,
dropout_prob=self._dropout,
dropout_implementation='upscale_in_train')
res.append(step_input)
real_res = fluid.layers.concat(res, 1)
real_res = fluid.layers.reshape(
real_res, [-1, self._num_steps, self._hidden_size])
last_hidden = fluid.layers.concat(hidden_array, 1)
last_hidden = fluid.layers.reshape(
last_hidden, shape=[-1, self._num_layers, self._hidden_size])
last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2])
last_cell = fluid.layers.concat(cell_array, 1)
last_cell = fluid.layers.reshape(
last_cell, shape=[-1, self._num_layers, self._hidden_size])
last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2])
return real_res, last_hidden, last_cell
class PtbModel(fluid.Layer):
def __init__(self,
hidden_size,
vocab_size,
num_layers=2,
num_steps=20,
init_scale=0.1,
dropout=None):
super(PtbModel, self).__init__()
self.hidden_size = hidden_size
self.vocab_size = vocab_size
self.init_scale = init_scale
self.num_layers = num_layers
self.num_steps = num_steps
self.dropout = dropout
self.simple_lstm_rnn = SimpleLSTMRNN(
hidden_size,
num_steps,
num_layers=num_layers,
init_scale=init_scale,
dropout=dropout)
self.embedding = Embedding(
size=[vocab_size, hidden_size],
dtype='float32',
is_sparse=False,
param_attr=fluid.ParamAttr(
name='embedding_para',
initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale)))
self.softmax_weight = self.create_parameter(
attr=fluid.ParamAttr(),
shape=[self.hidden_size, self.vocab_size],
dtype="float32",
default_initializer=fluid.initializer.UniformInitializer(
low=-self.init_scale, high=self.init_scale))
self.softmax_bias = self.create_parameter(
attr=fluid.ParamAttr(),
shape=[self.vocab_size],
dtype="float32",
default_initializer=fluid.initializer.UniformInitializer(
low=-self.init_scale, high=self.init_scale))
def build_once(self, input, label, init_hidden, init_cell):
pass
def forward(self, input, label, init_hidden, init_cell):
init_h = fluid.layers.reshape(
init_hidden, shape=[self.num_layers, -1, self.hidden_size])
init_c = fluid.layers.reshape(
init_cell, shape=[self.num_layers, -1, self.hidden_size])
x_emb = self.embedding(input)
x_emb = fluid.layers.reshape(
x_emb, shape=[-1, self.num_steps, self.hidden_size])
if self.dropout is not None and self.dropout > 0.0:
x_emb = fluid.layers.dropout(
x_emb,
dropout_prob=self.dropout,
dropout_implementation='upscale_in_train')
rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h,
init_c)
projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
loss = fluid.layers.softmax_with_cross_entropy(
logits=projection, label=label, soft_label=False)
loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
loss = fluid.layers.reduce_mean(loss, dim=[0])
loss = fluid.layers.reduce_sum(loss)
return loss, last_hidden, last_cell
def debug_emb(self):
np.save("emb_grad", self.x_emb.gradient())
def train_ptb_lm():
args = parse_args()
# check if set use_gpu=True in paddlepaddle cpu version
model_check.check_cuda(args.use_gpu)
place = core.CPUPlace()
if args.use_gpu:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
# check if paddlepaddle version is satisfied
model_check.check_version()
model_type = args.model_type
vocab_size = 10000
if model_type == "test":
num_layers = 1
batch_size = 2
hidden_size = 10
num_steps = 3
init_scale = 0.1
max_grad_norm = 5.0
epoch_start_decay = 1
max_epoch = 1
dropout = 0.0
lr_decay = 0.5
base_learning_rate = 1.0
elif model_type == "small":
num_layers = 2
batch_size = 20
hidden_size = 200
num_steps = 20
init_scale = 0.1
max_grad_norm = 5.0
epoch_start_decay = 4
max_epoch = 13
dropout = 0.0
lr_decay = 0.5
base_learning_rate = 1.0
elif model_type == "medium":
num_layers = 2
batch_size = 20
hidden_size = 650
num_steps = 35
init_scale = 0.05
max_grad_norm = 5.0
epoch_start_decay = 6
max_epoch = 39
dropout = 0.5
lr_decay = 0.8
base_learning_rate = 1.0
elif model_type == "large":
num_layers = 2
batch_size = 20
hidden_size = 1500
num_steps = 35
init_scale = 0.04
max_grad_norm = 10.0
epoch_start_decay = 14
max_epoch = 55
dropout = 0.65
lr_decay = 1.0 / 1.15
base_learning_rate = 1.0
else:
print("model type not support")
return
with fluid.dygraph.guard(place):
if args.ce:
print("ce mode")
seed = 33
np.random.seed(seed)
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
max_epoch = 1
ptb_model = PtbModel(
hidden_size=hidden_size,
vocab_size=vocab_size,
num_layers=num_layers,
num_steps=num_steps,
init_scale=init_scale,
dropout=dropout)
if args.init_from_pretrain_model:
if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
print(args.init_from_pretrain_model)
raise Warning("The pretrained params do not exist.")
return
fluid.load_dygraph(args.init_from_pretrain_model)
print("finish initing model from pretrained params from %s" %
(args.init_from_pretrain_model))
dy_param_updated = dict()
dy_param_init = dict()
dy_loss = None
last_hidden = None
last_cell = None
data_path = args.data_path
print("begin to load data")
ptb_data = reader.get_ptb_data(data_path)
print("finished load data")
train_data, valid_data, test_data = ptb_data
batch_len = len(train_data) // batch_size
total_batch_size = (batch_len - 1) // num_steps
log_interval = 200
bd = []
lr_arr = [1.0]
for i in range(1, max_epoch):
bd.append(total_batch_size * i)
new_lr = base_learning_rate * (lr_decay**
max(i + 1 - epoch_start_decay, 0.0))
lr_arr.append(new_lr)
grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
sgd = SGDOptimizer(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr),
parameter_list=ptb_model.parameters(),
grad_clip=grad_clip)
def reader_decorator(reader):
def __reader__():
for item in reader:
x_data = item[0].reshape((-1, num_steps, 1))
y_data = item[1].reshape((-1, num_steps, 1))
yield x_data, y_data
return __reader__
def eval(model, data):
print("begin to eval")
total_loss = 0.0
iters = 0.0
init_hidden_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
init_cell_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
model.eval()
train_data_iter = reader_decorator(
reader.get_data_iter(data, batch_size, num_steps))
eval_data_loader = fluid.io.DataLoader.from_generator(capacity=200)
eval_data_loader.set_batch_generator(train_data_iter, places=place)
for batch_id, batch in enumerate(eval_data_loader):
x, y = batch
init_hidden = to_variable(init_hidden_data)
init_cell = to_variable(init_cell_data)
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
init_cell)
out_loss = dy_loss.numpy()
init_hidden_data = last_hidden.numpy()
init_cell_data = last_cell.numpy()
total_loss += out_loss
iters += num_steps
print("eval finished")
ppl = np.exp(total_loss / iters)
print("ppl ", batch_id, ppl[0])
ce_time = []
ce_ppl = []
total_batch_num = 0 #this is for benchmark
for epoch_id in range(max_epoch):
ptb_model.train()
total_loss = 0.0
iters = 0.0
init_hidden_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
init_cell_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32')
train_data_iter = reader_decorator(
reader.get_data_iter(train_data, batch_size, num_steps))
train_data_loader = fluid.io.DataLoader.from_generator(capacity=200)
train_data_loader.set_batch_generator(train_data_iter, places=place)
init_hidden = to_variable(init_hidden_data)
init_cell = to_variable(init_cell_data)
start_time = time.time()
for batch_id, batch in enumerate(train_data_loader):
if args.max_iter and total_batch_num == args.max_iter:
return
batch_start = time.time()
x, y = batch
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
init_cell)
init_hidden = last_hidden.detach()
init_cell = last_cell.detach()
out_loss = dy_loss.numpy()
dy_loss.backward()
sgd.minimize(dy_loss)
ptb_model.clear_gradients()
total_loss += out_loss
batch_end = time.time()
train_batch_cost = batch_end - batch_start
iters += num_steps
total_batch_num = total_batch_num + 1 #this is for benchmark
if batch_id > 0 and batch_id % log_interval == 0:
ppl = np.exp(total_loss / iters)
print("-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch cost: %.5f" %
(epoch_id, batch_id, ppl[0],
sgd._global_learning_rate().numpy(), out_loss, train_batch_cost))
print("one epoch finished", epoch_id)
print("time cost ", time.time() - start_time)
ppl = np.exp(total_loss / iters)
ce_time.append(time.time() - start_time)
ce_ppl.append(ppl[0])
print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0]))
if batch_size <= 20 and epoch_id == 0 and ppl[0] > 1000:
# for bad init, after first epoch, the loss is over 1000
# no more need to continue
print(
"Parameters are randomly initialized and not good this time because the loss is over 1000 after the first epoch."
)
print("Abort this training process and please start again.")
return
save_model_dir = os.path.join(args.save_model_dir,
str(epoch_id), 'params')
fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
print("Saved model to: %s.\n" % save_model_dir)
eval(ptb_model, valid_data)
if args.ce:
_ppl = 0
_time = 0
try:
_time = ce_time[-1]
_ppl = ce_ppl[-1]
except:
print("ce info error")
print("kpis\ttrain_duration_card%s\t%s" % (dev_count, _time))
print("kpis\ttrain_ppl_card%s\t%f" % (dev_count, _ppl))
eval(ptb_model, test_data)
train_ptb_lm()
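Note that the PTB script above feeds its loader with set_batch_generator rather than set_sample_list_generator: reader.get_data_iter already yields fully batched arrays, which its reader_decorator merely reshapes, whereas the image models yield single samples and rely on paddle.batch to assemble sample lists. A hedged sketch of the two feeding modes (shapes illustrative):

    import numpy as np
    import paddle
    import paddle.fluid as fluid

    place = fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        # Mode 1: the generator yields whole batches (as in the PTB script).
        def batch_gen():
            for _ in range(2):
                x = np.zeros((4, 20, 1), dtype='int64')  # [batch, num_steps, 1]
                y = np.zeros((4, 20, 1), dtype='int64')
                yield x, y

        ptb_loader = fluid.io.DataLoader.from_generator(capacity=200)
        ptb_loader.set_batch_generator(batch_gen, places=place)

        # Mode 2: the generator yields single samples; paddle.batch builds
        # the sample lists (as in the MNIST/ResNet/SE-ResNeXt scripts).
        def sample_gen():
            for _ in range(8):
                yield (np.zeros((20, 1), dtype='int64'),
                       np.zeros((20, 1), dtype='int64'))

        batched = paddle.batch(sample_gen, batch_size=4, drop_last=True)
        img_loader = fluid.io.DataLoader.from_generator(capacity=200)
        img_loader.set_sample_list_generator(batched, places=place)
        # Iteration is then identical in both modes, as in the scripts above.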
......@@ -81,7 +81,6 @@ def optimizer_setting(parameter_list=None):
boundaries=bd, values=lr),
momentum=momentum_rate,
regularization=fluid.regularizer.L2Decay(l2_decay))
return optimizer
......@@ -116,11 +115,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
class BottleneckBlock(fluid.dygraph.Layer):
def __init__(self,
num_channels,
num_filters,
stride,
shortcut=True):
def __init__(self, num_channels, num_filters, stride, shortcut=True):
super(BottleneckBlock, self).__init__()
self.conv0 = ConvBNLayer(
......@@ -186,16 +181,9 @@ class ResNet(fluid.dygraph.Layer):
num_filters = [64, 128, 256, 512]
self.conv = ConvBNLayer(
num_channels=3,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu')
self.pool2d_max = Pool2D(
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
self.bottleneck_block_list = []
for block in range(len(depth)):
......@@ -220,11 +208,12 @@ class ResNet(fluid.dygraph.Layer):
import math
stdv = 1.0 / math.sqrt(2048 * 1.0)
self.out = Linear(self.pool2d_avg_output,
class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
self.out = Linear(
self.pool2d_avg_output,
class_dim,
act='softmax',
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
def forward(self, inputs):
y = self.conv(inputs)
......@@ -237,6 +226,16 @@ class ResNet(fluid.dygraph.Layer):
return y
def reader_decorator(reader):
def __reader__():
for item in reader():
img = np.array(item[0]).astype('float32').reshape(3, 224, 224)
label = np.array(item[1]).astype('int64').reshape(1)
yield img, label
return __reader__
def eval(model, data):
model.eval()
......@@ -245,15 +244,8 @@ def eval(model, data):
total_acc5 = 0.0
total_sample = 0
for batch_id, data in enumerate(data()):
dy_x_data = np.array(
[x[0].reshape(3, 224, 224) for x in data]).astype('float32')
if len(np.array([x[1] for x in data]).astype('int64')) != batch_size:
continue
y_data = np.array([x[1] for x in data]).astype('int64').reshape(
batch_size, 1)
img = to_variable(dy_x_data)
label = to_variable(y_data)
img = data[0]
label = data[1]
label.stop_gradient = True
out = model(img)
......@@ -303,13 +295,24 @@ def train_resnet():
resnet = fluid.dygraph.parallel.DataParallel(resnet, strategy)
train_reader = paddle.batch(
paddle.dataset.flowers.train(use_xmap=False), batch_size=batch_size)
reader_decorator(paddle.dataset.flowers.train(use_xmap=True)),
batch_size=batch_size,
drop_last=True)
if args.use_data_parallel:
train_reader = fluid.contrib.reader.distributed_batch_reader(
train_reader)
test_reader = paddle.batch(
paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)
reader_decorator(paddle.dataset.flowers.test(use_xmap=True)),
batch_size=batch_size,
drop_last=True)
train_loader = fluid.io.DataLoader.from_generator(capacity=10)
train_loader.set_sample_list_generator(train_reader, places=place)
test_loader = fluid.io.DataLoader.from_generator(capacity=10)
test_loader.set_sample_list_generator(test_reader, places=place)
#file_name = './model/epoch_0.npz'
#model_data = np.load( file_name )
......@@ -331,23 +334,13 @@ def train_resnet():
print("load finished")
for batch_id, data in enumerate(train_reader()):
for batch_id, data in enumerate(train_loader()):
#NOTE: used in benchmark
if args.max_iter and total_batch_num == args.max_iter:
return
batch_start = time.time()
dy_x_data = np.array(
[x[0].reshape(3, 224, 224) for x in data]).astype('float32')
if len(np.array([x[1]
for x in data]).astype('int64')) != batch_size:
continue
y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-1, 1)
img = to_variable(dy_x_data)
label = to_variable(y_data)
img, label = data
label.stop_gradient = True
out = resnet(img)
......@@ -390,16 +383,14 @@ def train_resnet():
(eop, batch_id, total_loss / total_sample, \
total_acc1 / total_sample, total_acc5 / total_sample))
resnet.eval()
eval(resnet, test_reader)
eval(resnet, test_loader)
save_parameters = (not args.use_data_parallel) or (
args.use_data_parallel and
fluid.dygraph.parallel.Env().local_rank == 0)
if save_parameters:
fluid.save_dygraph(resnet.state_dict(),
'resnet_params')
fluid.save_dygraph(resnet.state_dict(), 'resnet_params')
if __name__ == '__main__':
train_resnet()
......@@ -169,8 +169,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
act=None)
self.scale = SqueezeExcitation(
num_channels=num_filters * 2,
reduction_ratio=reduction_ratio)
num_channels=num_filters * 2, reduction_ratio=reduction_ratio)
if not shortcut:
self.short = ConvBNLayer(
......@@ -219,10 +218,7 @@ class SeResNeXt(fluid.dygraph.Layer):
stride=2,
act='relu')
self.pool = Pool2D(
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
elif layers == 101:
cardinality = 32
reduction_ratio = 16
......@@ -235,10 +231,7 @@ class SeResNeXt(fluid.dygraph.Layer):
stride=2,
act='relu')
self.pool = Pool2D(
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
elif layers == 152:
cardinality = 64
reduction_ratio = 16
......@@ -263,10 +256,7 @@ class SeResNeXt(fluid.dygraph.Layer):
stride=1,
act='relu')
self.pool = Pool2D(
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
self.bottleneck_block_list = []
num_channels = 64
......@@ -294,10 +284,11 @@ class SeResNeXt(fluid.dygraph.Layer):
self.pool2d_avg_output = num_filters[len(num_filters) - 1] * 2 * 1 * 1
self.out = Linear(self.pool2d_avg_output,
class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
self.out = Linear(
self.pool2d_avg_output,
class_dim,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv)))
def forward(self, inputs):
if self.layers == 50 or self.layers == 101:
......@@ -318,6 +309,16 @@ class SeResNeXt(fluid.dygraph.Layer):
return y
def reader_decorator(reader):
def __reader__():
for item in reader():
img = np.array(item[0]).astype('float32').reshape(3, 224, 224)
label = np.array(item[1]).astype('int64').reshape(1)
yield img, label
return __reader__
def eval(model, data):
model.eval()
......@@ -327,15 +328,7 @@ def eval(model, data):
total_acc5 = 0.0
total_sample = 0
for batch_id, data in enumerate(data()):
dy_x_data = np.array(
[x[0].reshape(3, 224, 224) for x in data]).astype('float32')
if len(np.array([x[1] for x in data]).astype('int64')) != batch_size:
continue
y_data = np.array([x[1] for x in data]).astype('int64').reshape(
batch_size, 1)
img = to_variable(dy_x_data)
label = to_variable(y_data)
img, label = data
label.stop_gradient = True
out = model(img)
......@@ -389,29 +382,29 @@ def train():
se_resnext = fluid.dygraph.parallel.DataParallel(se_resnext,
strategy)
train_reader = paddle.batch(
paddle.dataset.flowers.train(use_xmap=False),
reader_decorator(paddle.dataset.flowers.train(use_xmap=False)),
batch_size=batch_size,
drop_last=True)
if args.use_data_parallel:
train_reader = fluid.contrib.reader.distributed_batch_reader(
train_reader)
test_reader = paddle.batch(
paddle.dataset.flowers.test(use_xmap=False), batch_size=32)
reader_decorator(paddle.dataset.flowers.test(use_xmap=False)),
batch_size=32)
train_loader = fluid.io.DataLoader.from_generator(capacity=10)
train_loader.set_sample_list_generator(train_reader, places=place)
test_loader = fluid.io.DataLoader.from_generator(capacity=10)
test_loader.set_sample_list_generator(test_reader, places=place)
for epoch_id in range(epoch_num):
total_loss = 0.0
total_acc1 = 0.0
total_acc5 = 0.0
total_sample = 0
for batch_id, data in enumerate(train_reader()):
dy_x_data = np.array([x[0].reshape(3, 224, 224)
for x in data]).astype('float32')
y_data = np.array([x[1] for x in data]).astype('int64').reshape(
batch_size, 1)
img = to_variable(dy_x_data)
label = to_variable(y_data)
for batch_id, data in enumerate(train_loader()):
img, label = data
label.stop_gradient = True
out = se_resnext(img)
......@@ -454,7 +447,7 @@ def train():
(epoch_id, batch_id, total_loss / total_sample, \
total_acc1 / total_sample, total_acc5 / total_sample))
se_resnext.eval()
eval(se_resnext, test_reader)
eval(se_resnext, test_loader)
se_resnext.train()
......