提交 f258a876 编写于 作者: B buaawht

Merge branch 'develop' of https://github.com/PaddlePaddle/models into new_method

.DS_Store .DS_Store
*.pyc *.pyc
.*~
...@@ -17,7 +17,7 @@ addons: ...@@ -17,7 +17,7 @@ addons:
- python-pip - python-pip
- python2.7-dev - python2.7-dev
- clang-format-3.8 - clang-format-3.8
ssh_known_hosts: 52.76.173.135 ssh_known_hosts: 13.229.163.131
before_install: before_install:
- if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
- sudo pip install -U virtualenv pre-commit pip - sudo pip install -U virtualenv pre-commit pip
......
...@@ -168,7 +168,7 @@ def profile(args): ...@@ -168,7 +168,7 @@ def profile(args):
start_time = time.time() start_time = time.time()
frames_seen = 0 frames_seen = 0
# load_data # load_data
(features, labels, lod) = batch_data (features, labels, lod, _) = batch_data
feature_t.set(features, place) feature_t.set(features, place)
feature_t.set_lod([lod]) feature_t.set_lod([lod])
label_t.set(labels, place) label_t.set(labels, place)
......
...@@ -192,7 +192,7 @@ def train(args): ...@@ -192,7 +192,7 @@ def train(args):
test_data_reader.batch_iterator(args.batch_size, test_data_reader.batch_iterator(args.batch_size,
args.minimum_batch_size)): args.minimum_batch_size)):
# load_data # load_data
(features, labels, lod) = batch_data (features, labels, lod, _) = batch_data
feature_t.set(features, place) feature_t.set(features, place)
feature_t.set_lod([lod]) feature_t.set_lod([lod])
label_t.set(labels, place) label_t.set(labels, place)
......
...@@ -18,19 +18,19 @@ This tool is used to convert a Caffe model to Fluid model ...@@ -18,19 +18,19 @@ This tool is used to convert a Caffe model to Fluid model
### Tested models ### Tested models
- Lenet on mnist dataset - Lenet
- ResNets:(ResNet-50, ResNet-101, ResNet-152) - ResNets:(ResNet-50, ResNet-101, ResNet-152)
model addr: `https://onedrive.live.com/?authkey=%21AAFW2-FVoxeVRck&id=4006CBB8476FF777%2117887&cid=4006CBB8476FF777`_ [model addr](https://onedrive.live.com/?authkey=%21AAFW2-FVoxeVRck&id=4006CBB8476FF777%2117887&cid=4006CBB8476FF777)
- GoogleNet: - GoogleNet:
model addr: `https://gist.github.com/jimmie33/7ea9f8ac0da259866b854460f4526034`_ [model addr](https://gist.github.com/jimmie33/7ea9f8ac0da259866b854460f4526034)
- VGG: - VGG:
model addr: `https://gist.github.com/ksimonyan/211839e770f7b538e2d8`_ [model addr](https://gist.github.com/ksimonyan/211839e770f7b538e2d8)
- AlexNet: - AlexNet:
model addr: `https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet`_ [model addr](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet)
### Notes ### Notes
Some of this code come from here: https://github.com/ethereon/caffe-tensorflow Some of this code come from here: https://github.com/ethereon/caffe-tensorflow
#!/usr/bin/python
#
#a tool to compare tensors in two files or two directories
#
import sys
import os
def walk_dir(rootdir):
for subdir, dirs, files in os.walk(rootdir):
for file in files:
yield file
def calc_diff(f1, f2):
import numpy as np
d1 = np.load(f1).flatten()
d2 = np.load(f2).flatten()
d1_num = reduce(lambda x, y: x * y, d1.shape)
d2_num = reduce(lambda x, y: x * y, d2.shape)
if d1_num != d2_num:
print d1.shape
print d2.shape
assert (d1_num == d2_num), "their shape is not consistent"
try:
df = np.abs(d1 - d2)
max_df = np.max(df)
sq_df = np.mean(df * df)
return max_df, sq_df
except Exception as e:
return -1.0, -1.0
def compare(path1, path2):
def diff(f1, f2):
max_df, sq_df = calc_diff(f1, f2)
print('compare %s <=> %s with result[max_df:%.4e, sq_df:%.4e]' %
(f1, f2, max_df, sq_df))
assert (max_df < 1e-5), \
'max_df is too large with value[%.6e]' % (max_df)
assert (sq_df < 1e-10), \
'sq_df is too large with value[%.6e]' % (sq_df)
if os.path.exists(path1) is False:
print('not found %s' % (path1))
return 1
elif os.path.exists(path2) is False:
print('not found %s' % (path2))
return 1
if path1.find('.npy') > 0 and path2.find('.npy') > 0:
diff(path1, path2)
return
for f in walk_dir(path2):
if f.find('.npy') < 0:
continue
f1 = os.path.join(path1, f)
f2 = os.path.join(path2, f)
diff(f1, f2)
print('all checking succeed to pass')
return 0
if __name__ == "__main__":
if len(sys.argv) == 1:
path1 = 'lenet.tf/results'
path2 = 'lenet.paddle/results'
elif len(sys.argv) == 3:
path1 = sys.argv[1]
path2 = sys.argv[2]
else:
print('usage:')
print(' %s [path1] [path2]' % (sys.argv[0]))
exit(1)
print('compare inner result in %s %s' % (path1, path2))
exit(compare(path1, path2))
#!/bin/bash
#
#function:
# a tool used to check the difference of models' results generated by caffe model and paddle model
#
#howto:
# bash diff.sh resnet50 #when this has been finished, you can get the difference in precision
#
#notes:
# 0, in order to infer using caffe, we need pycaffe installed
# 1, prepare your caffe model in 'models.caffe/', eg: 'model.caffe/resnet101/resnet101.[prototxt|caffemodel]'
# 2, converted paddle model will be in 'models'
# 3, results of layers will be stored in 'results/${model_name}.[paddle|caffe]'
# 4, only the last layer will be checked by default
model_name="resnet50"
results_root="results/"
if [[ -n $1 ]];then
if [ $1 = "-h" ];then
echo "usage:"
echo " bash $0 [model_name]"
echo " eg:bash $0 resnet50"
exit 0
fi
model_name=$1
fi
mkdir -p $results_root
model_prototxt="models.caffe/$model_name/${model_name}.prototxt"
model_caffemodel="models.caffe/${model_name}/${model_name}.caffemodel"
#1, dump layers' results from paddle
paddle_results="$results_root/${model_name}.paddle"
rm -rf $paddle_results
rm -rf "results.paddle"
bash run.sh $model_name ./models.caffe/$model_name ./models/$model_name
if [[ $? -ne 0 ]] || [[ ! -e "results.paddle" ]];then
echo "not found paddle's results, maybe failed to convert"
exit 1
fi
mv results.paddle $paddle_results
#2, dump layers' results from caffe
caffe_results="$results_root/${model_name}.caffe"
rm -rf $caffe_results
rm -rf "results.caffe"
cfpython ./infer.py caffe $model_prototxt $model_caffemodel $paddle_results/data.npy
if [[ $? -ne 0 ]] || [[ ! -e "results.caffe" ]];then
echo "not found caffe's results, maybe failed to do inference with caffe"
exit 1
fi
mv results.caffe $caffe_results
#3, extract layer names
cat $model_prototxt | grep name | perl -ne 'if(/^\s*name:\s+\"([^\"]+)/){ print $1."\n";}' >.layer_names
#4, compare one by one
for i in $(cat ".layer_names" | tail -n1);do
echo "process $i"
python compare.py $caffe_results/${i}.npy $paddle_results/${i}.npy
done
...@@ -10,8 +10,11 @@ import os ...@@ -10,8 +10,11 @@ import os
import sys import sys
import inspect import inspect
import numpy as np import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
def import_fluid():
import paddle.fluid as fluid
return fluid
def load_data(imgfile, shape): def load_data(imgfile, shape):
...@@ -52,8 +55,10 @@ def build_model(net_file, net_name): ...@@ -52,8 +55,10 @@ def build_model(net_file, net_name):
print(e) print(e)
return None return None
input_name = 'data' fluid = import_fluid()
input_shape = MyNet.input_shapes()[input_name] inputs_dict = MyNet.input_shapes()
input_name = inputs_dict.keys()[0]
input_shape = inputs_dict[input_name]
images = fluid.layers.data(name='image', shape=input_shape, dtype='float32') images = fluid.layers.data(name='image', shape=input_shape, dtype='float32')
#label = fluid.layers.data(name='label', shape=[1], dtype='int64') #label = fluid.layers.data(name='label', shape=[1], dtype='int64')
...@@ -64,7 +69,7 @@ def build_model(net_file, net_name): ...@@ -64,7 +69,7 @@ def build_model(net_file, net_name):
def dump_results(results, names, root): def dump_results(results, names, root):
if os.path.exists(root) is False: if os.path.exists(root) is False:
os.path.mkdir(root) os.mkdir(root)
for i in range(len(names)): for i in range(len(names)):
n = names[i] n = names[i]
...@@ -73,9 +78,12 @@ def dump_results(results, names, root): ...@@ -73,9 +78,12 @@ def dump_results(results, names, root):
np.save(filename + '.npy', res) np.save(filename + '.npy', res)
def infer(net_file, net_name, model_file, imgfile, debug=False): def infer(net_file, net_name, model_file, imgfile, debug=True):
""" do inference using a model which consist 'xxx.py' and 'xxx.npy' """ do inference using a model which consist 'xxx.py' and 'xxx.npy'
""" """
fluid = import_fluid()
#1, build model #1, build model
net, input_shape = build_model(net_file, net_name) net, input_shape = build_model(net_file, net_name)
prediction = net.get_output() prediction = net.get_output()
...@@ -109,34 +117,79 @@ def infer(net_file, net_name, model_file, imgfile, debug=False): ...@@ -109,34 +117,79 @@ def infer(net_file, net_name, model_file, imgfile, debug=False):
fetch_list=fetch_list_var) fetch_list=fetch_list_var)
if debug is True: if debug is True:
dump_path = 'results.layers' dump_path = 'results.paddle'
dump_results(results, fetch_list_name, dump_path) dump_results(results, fetch_list_name, dump_path)
print('all results dumped to [%s]' % (dump_path)) print('all result of layers dumped to [%s]' % (dump_path))
else: else:
result = results[0] result = results[0]
print('predicted class:', np.argmax(result)) print('predicted class:', np.argmax(result))
return 0
def caffe_infer(prototxt, caffemodel, datafile):
""" do inference using pycaffe for debug,
all intermediate results will be dumpped to 'results.caffe'
"""
import caffe
net = caffe.Net(prototxt, caffemodel, caffe.TEST)
input_layer = net.blobs.keys()[0]
print('got name of input layer is:%s' % (input_layer))
input_shape = list(net.blobs[input_layer].data.shape[1:])
if '.npy' in datafile:
np_images = np.load(datafile)
else:
np_images = load_data(datafile, input_shape)
inputs = {input_layer: np_images}
net.forward_all(**inputs)
results = []
names = []
for k, v in net.blobs.items():
k = k.rstrip('_output')
k = k.replace('/', '_')
names.append(k)
results.append(v.data.copy())
dump_path = 'results.caffe'
dump_results(results, names, dump_path)
print('all result of layers dumped to [%s]' % (dump_path))
return 0
if __name__ == "__main__": if __name__ == "__main__":
""" maybe more convenient to use 'run.sh' to call this tool """ maybe more convenient to use 'run.sh' to call this tool
""" """
net_file = 'models/resnet50/resnet50.py' net_file = 'models/resnet50/resnet50.py'
weight_file = 'models/resnet50/resnet50.npy' weight_file = 'models/resnet50/resnet50.npy'
imgfile = 'data/65.jpeg' datafile = 'data/65.jpeg'
net_name = 'ResNet50' net_name = 'ResNet50'
argc = len(sys.argv) argc = len(sys.argv)
if argc == 5: if sys.argv[1] == 'caffe':
if len(sys.argv) != 5:
print('usage:')
print('\tpython %s caffe [prototxt] [caffemodel] [datafile]' %
(sys.argv[0]))
sys.exit(1)
prototxt = sys.argv[2]
caffemodel = sys.argv[3]
datafile = sys.argv[4]
sys.exit(caffe_infer(prototxt, caffemodel, datafile))
elif argc == 5:
net_file = sys.argv[1] net_file = sys.argv[1]
weight_file = sys.argv[2] weight_file = sys.argv[2]
imgfile = sys.argv[3] datafile = sys.argv[3]
net_name = sys.argv[4] net_name = sys.argv[4]
elif argc > 1: elif argc > 1:
print('usage:') print('usage:')
print('\tpython %s [net_file] [weight_file] [imgfile] [net_name]' % print('\tpython %s [net_file] [weight_file] [datafile] [net_name]' %
(sys.argv[0])) (sys.argv[0]))
print('\teg:python %s %s %s %s %s' % (sys.argv[0], net_file, print('\teg:python %s %s %s %s %s' % (sys.argv[0], net_file,
weight_file, imgfile, net_name)) weight_file, datafile, net_name))
sys.exit(1) sys.exit(1)
infer(net_file, net_name, weight_file, imgfile) infer(net_file, net_name, weight_file, datafile)
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#function: #function:
# a tool used to: # a tool used to:
# 1, convert a caffe model # 1, convert a caffe model
# 2, do inference using this model # 2, do inference(only in fluid) using this model
# #
#usage: #usage:
# bash run.sh resnet50 ./models.caffe/resnet50 ./models/resnet50 # bash run.sh resnet50 ./models.caffe/resnet50 ./models/resnet50
...@@ -65,7 +65,12 @@ if [[ -z $only_convert ]];then ...@@ -65,7 +65,12 @@ if [[ -z $only_convert ]];then
PYTHON=`which python` PYTHON=`which python`
fi fi
imgfile="data/65.jpeg" imgfile="data/65.jpeg"
net_name=`grep "name" $proto_file | head -n1 | perl -ne 'if(/\"([^\"]+)\"/){ print $1."\n";}'` #FIX ME:
# only look the first line in prototxt file for the name of this network, maybe not correct
net_name=`grep "name" $proto_file | head -n1 | perl -ne 'if(/^\s*name\s*:\s*\"([^\"]+)\"/){ print $1."\n";}'`
if [[ -z $net_name ]];then
net_name="MyNet"
fi
$PYTHON ./infer.py $net_file $weight_file $imgfile $net_name $PYTHON ./infer.py $net_file $weight_file $imgfile $net_name
ret=$? ret=$?
fi fi
......
...@@ -52,7 +52,10 @@ class Graph(object): ...@@ -52,7 +52,10 @@ class Graph(object):
def __init__(self, nodes=None, name=None): def __init__(self, nodes=None, name=None):
self.nodes = nodes or [] self.nodes = nodes or []
self.node_lut = {node.name: node for node in self.nodes} self.node_lut = {node.name: node for node in self.nodes}
self.name = name if name is None or name == '':
self.name = 'MyNet'
else:
self.name = name
def add_node(self, node): def add_node(self, node):
self.nodes.append(node) self.nodes.append(node)
......
...@@ -4,7 +4,7 @@ import numpy as np ...@@ -4,7 +4,7 @@ import numpy as np
def import_fluid(): def import_fluid():
import paddle.v2.fluid as fluid import paddle.fluid as fluid
return fluid return fluid
...@@ -64,7 +64,7 @@ class Network(object): ...@@ -64,7 +64,7 @@ class Network(object):
if os.path.isdir(data_path): if os.path.isdir(data_path):
assert (exe is not None), \ assert (exe is not None), \
'must provide a executor to load fluid model' 'must provide a executor to load fluid model'
fluid.io.load_persistables_if_exist(executor=exe, dirname=data_path) fluid.io.load_persistables(executor=exe, dirname=data_path)
return True return True
#load model from a npy file #load model from a npy file
...@@ -161,56 +161,28 @@ class Network(object): ...@@ -161,56 +161,28 @@ class Network(object):
output = fluid.layers.relu(x=input) output = fluid.layers.relu(x=input)
return output return output
def _adjust_pad_if_needed(self, i_hw, k_hw, s_hw, p_hw):
#adjust the padding if needed
i_h, i_w = i_hw
k_h, k_w = k_hw
s_h, s_w = s_hw
p_h, p_w = p_hw
def is_consistent(i, k, s, p):
o = i + 2 * p - k
if o % s == 0:
return True
else:
return False
real_p_h = 0
real_p_w = 0
if is_consistent(i_h, k_h, s_h, p_h) is False:
real_p_h = int(k_h / 2)
if is_consistent(i_w, k_w, s_w, p_w) is False:
real_p_w = int(k_w / 2)
return [real_p_h, real_p_w]
def pool(self, pool_type, input, k_h, k_w, s_h, s_w, name, padding): def pool(self, pool_type, input, k_h, k_w, s_h, s_w, name, padding):
# Get the number of channels in the input # Get the number of channels in the input
in_hw = input.shape[2:] in_hw = input.shape[2:]
k_hw = [k_h, k_w] k_hw = [k_h, k_w]
s_hw = [s_h, s_w] s_hw = [s_h, s_w]
if padding is None:
#fix bug about the difference between conv and pool
#more info: https://github.com/BVLC/caffe/issues/1318
padding = self._adjust_pad_if_needed(in_hw, k_hw, s_hw, [0, 0])
fluid = import_fluid() fluid = import_fluid()
output = fluid.layers.pool2d( output = fluid.layers.pool2d(
input=input, input=input,
pool_size=k_hw, pool_size=k_hw,
pool_stride=s_hw, pool_stride=s_hw,
pool_padding=padding, pool_padding=padding,
ceil_mode=True,
pool_type=pool_type) pool_type=pool_type)
return output return output
@layer @layer
def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None): def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=[0, 0]):
return self.pool('max', input, k_h, k_w, s_h, s_w, name, padding) return self.pool('max', input, k_h, k_w, s_h, s_w, name, padding)
@layer @layer
def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None): def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=[0, 0]):
return self.pool('avg', input, k_h, k_w, s_h, s_w, name, padding) return self.pool('avg', input, k_h, k_w, s_h, s_w, name, padding)
@layer @layer
...@@ -258,7 +230,12 @@ class Network(object): ...@@ -258,7 +230,12 @@ class Network(object):
return output return output
@layer @layer
def batch_normalization(self, input, name, scale_offset=True, relu=False): def batch_normalization(self,
input,
name,
scale_offset=True,
eps=1e-5,
relu=False):
# NOTE: Currently, only inference is supported # NOTE: Currently, only inference is supported
fluid = import_fluid() fluid = import_fluid()
prefix = name + '_' prefix = name + '_'
...@@ -276,7 +253,7 @@ class Network(object): ...@@ -276,7 +253,7 @@ class Network(object):
bias_attr=bias_attr, bias_attr=bias_attr,
moving_mean_name=mean_name, moving_mean_name=mean_name,
moving_variance_name=variance_name, moving_variance_name=variance_name,
epsilon=1e-5, epsilon=eps,
act='relu' if relu is True else None) act='relu' if relu is True else None)
return output return output
......
...@@ -142,7 +142,13 @@ class TensorFlowMapper(NodeMapper): ...@@ -142,7 +142,13 @@ class TensorFlowMapper(NodeMapper):
def map_batch_norm(self, node): def map_batch_norm(self, node):
scale_offset = len(node.data) == 4 scale_offset = len(node.data) == 4
kwargs = {} if scale_offset else {'scale_offset': False}
#this default value comes from caffe's param in batch_norm
default_eps = 1e-5
kwargs = {'scale_offset': scale_offset}
if node.parameters.eps != default_eps:
kwargs['eps'] = node.parameters.eps
return MaybeActivated( return MaybeActivated(
node, default=False)('batch_normalization', **kwargs) node, default=False)('batch_normalization', **kwargs)
...@@ -236,7 +242,7 @@ class TensorFlowEmitter(object): ...@@ -236,7 +242,7 @@ class TensorFlowEmitter(object):
func_def = self.statement('@classmethod') func_def = self.statement('@classmethod')
func_def += self.statement('def convert(cls, npy_model, fluid_path):') func_def += self.statement('def convert(cls, npy_model, fluid_path):')
self.indent() self.indent()
func_def += self.statement('import paddle.v2.fluid as fluid') func_def += self.statement('fluid = import_fluid()')
for l in codes: for l in codes:
func_def += self.statement(l) func_def += self.statement(l)
return '\n' + func_def return '\n' + func_def
......
import os
import numpy as np
import time
import sys
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import reader
def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1, def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
...@@ -124,164 +119,3 @@ def SE_ResNeXt(input, class_dim, infer=False, layers=50): ...@@ -124,164 +119,3 @@ def SE_ResNeXt(input, class_dim, infer=False, layers=50):
drop = pool drop = pool
out = fluid.layers.fc(input=drop, size=class_dim, act='softmax') out = fluid.layers.fc(input=drop, size=class_dim, act='softmax')
return out return out
def train(learning_rate,
batch_size,
num_passes,
init_model=None,
model_save_dir='model',
parallel=True,
use_nccl=True,
lr_strategy=None,
layers=50):
class_dim = 1000
image_shape = [3, 224, 224]
image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if parallel:
places = fluid.layers.get_places()
pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
with pd.do():
image_ = pd.read_input(image)
label_ = pd.read_input(label)
out = SE_ResNeXt(input=image_, class_dim=class_dim, layers=layers)
cost = fluid.layers.cross_entropy(input=out, label=label_)
avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=out, label=label_, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label_, k=5)
pd.write_output(avg_cost)
pd.write_output(acc_top1)
pd.write_output(acc_top5)
avg_cost, acc_top1, acc_top5 = pd()
avg_cost = fluid.layers.mean(x=avg_cost)
acc_top1 = fluid.layers.mean(x=acc_top1)
acc_top5 = fluid.layers.mean(x=acc_top5)
else:
out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
cost = fluid.layers.cross_entropy(input=out, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
if lr_strategy is None:
optimizer = fluid.optimizer.Momentum(
learning_rate=learning_rate,
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
else:
bd = lr_strategy["bd"]
lr = lr_strategy["lr"]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
opts = optimizer.minimize(avg_cost)
fluid.memory_optimize(fluid.default_main_program())
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
inference_program = fluid.io.get_inference_program(
[avg_cost, acc_top1, acc_top5])
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
if init_model is not None:
fluid.io.load_persistables(exe, init_model)
train_reader = paddle.batch(reader.train(), batch_size=batch_size)
test_reader = paddle.batch(reader.test(), batch_size=batch_size)
feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
for pass_id in range(num_passes):
train_info = [[], [], []]
test_info = [[], [], []]
for batch_id, data in enumerate(train_reader()):
t1 = time.time()
loss, acc1, acc5 = exe.run(
fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost, acc_top1, acc_top5])
t2 = time.time()
period = t2 - t1
train_info[0].append(loss[0])
train_info[1].append(acc1[0])
train_info[2].append(acc5[0])
if batch_id % 10 == 0:
print("Pass {0}, trainbatch {1}, loss {2}, \
acc1 {3}, acc5 {4} time {5}"
.format(pass_id, \
batch_id, loss[0], acc1[0], acc5[0], \
"%2.2f sec" % period))
sys.stdout.flush()
train_loss = np.array(train_info[0]).mean()
train_acc1 = np.array(train_info[1]).mean()
train_acc5 = np.array(train_info[2]).mean()
for data in test_reader():
t1 = time.time()
loss, acc1, acc5 = exe.run(
inference_program,
feed=feeder.feed(data),
fetch_list=[avg_cost, acc_top1, acc_top5])
t2 = time.time()
period = t2 - t1
test_info[0].append(loss[0])
test_info[1].append(acc1[0])
test_info[2].append(acc5[0])
if batch_id % 10 == 0:
print("Pass {0},testbatch {1},loss {2}, \
acc1 {3},acc5 {4},time {5}"
.format(pass_id, \
batch_id, loss[0], acc1[0], acc5[0], \
"%2.2f sec" % period))
sys.stdout.flush()
test_loss = np.array(test_info[0]).mean()
test_acc1 = np.array(test_info[1]).mean()
test_acc5 = np.array(test_info[2]).mean()
print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \
test_loss {4}, test_acc1 {5}, test_acc5 {6}"
.format(pass_id, \
train_loss, train_acc1, train_acc5, test_loss, test_acc1, \
test_acc5))
sys.stdout.flush()
model_path = os.path.join(model_save_dir, str(pass_id))
if not os.path.isdir(model_path):
os.makedirs(model_path)
fluid.io.save_persistables(exe, model_path)
if __name__ == '__main__':
epoch_points = [30, 60, 90]
total_images = 1281167
batch_size = 256
step = int(total_images / batch_size + 1)
bd = [e * step for e in epoch_points]
lr = [0.1, 0.01, 0.001, 0.0001]
lr_strategy = {"bd": bd, "lr": lr}
use_nccl = True
# layers: 50, 152
layers = 50
train(
learning_rate=0.1,
batch_size=batch_size,
num_passes=120,
init_model=None,
parallel=True,
use_nccl=True,
lr_strategy=lr_strategy,
layers=layers)
import os
import numpy as np
import time
import sys
import paddle.v2 as paddle
import paddle.fluid as fluid
from se_resnext import SE_ResNeXt
import reader
import argparse
import functools
from utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 256, "Minibatch size.")
add_arg('num_layers', int, 50, "How many layers for SE-ResNeXt model.")
add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.")
add_arg('parallel_exe', bool, True, "Whether to use ParallelExecutor to train or not.")
def train_paralle_do(args,
learning_rate,
batch_size,
num_passes,
init_model=None,
model_save_dir='model',
parallel=True,
use_nccl=True,
lr_strategy=None,
layers=50):
class_dim = 1000
image_shape = [3, 224, 224]
image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
if parallel:
places = fluid.layers.get_places()
pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
with pd.do():
image_ = pd.read_input(image)
label_ = pd.read_input(label)
out = SE_ResNeXt(input=image_, class_dim=class_dim, layers=layers)
cost = fluid.layers.cross_entropy(input=out, label=label_)
avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=out, label=label_, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label_, k=5)
pd.write_output(avg_cost)
pd.write_output(acc_top1)
pd.write_output(acc_top5)
avg_cost, acc_top1, acc_top5 = pd()
avg_cost = fluid.layers.mean(x=avg_cost)
acc_top1 = fluid.layers.mean(x=acc_top1)
acc_top5 = fluid.layers.mean(x=acc_top5)
else:
out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
cost = fluid.layers.cross_entropy(input=out, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
if lr_strategy is None:
optimizer = fluid.optimizer.Momentum(
learning_rate=learning_rate,
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
else:
bd = lr_strategy["bd"]
lr = lr_strategy["lr"]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
inference_program = fluid.default_main_program().clone(for_test=True)
opts = optimizer.minimize(avg_cost)
if args.with_mem_opt:
fluid.memory_optimize(fluid.default_main_program())
fluid.memory_optimize(inference_program)
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
if init_model is not None:
fluid.io.load_persistables(exe, init_model)
train_reader = paddle.batch(reader.train(), batch_size=batch_size)
test_reader = paddle.batch(reader.test(), batch_size=batch_size)
feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
for pass_id in range(num_passes):
train_info = [[], [], []]
test_info = [[], [], []]
for batch_id, data in enumerate(train_reader()):
t1 = time.time()
loss, acc1, acc5 = exe.run(
fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost, acc_top1, acc_top5])
t2 = time.time()
period = t2 - t1
train_info[0].append(loss[0])
train_info[1].append(acc1[0])
train_info[2].append(acc5[0])
if batch_id % 10 == 0:
print("Pass {0}, trainbatch {1}, loss {2}, \
acc1 {3}, acc5 {4} time {5}"
.format(pass_id, \
batch_id, loss[0], acc1[0], acc5[0], \
"%2.2f sec" % period))
sys.stdout.flush()
train_loss = np.array(train_info[0]).mean()
train_acc1 = np.array(train_info[1]).mean()
train_acc5 = np.array(train_info[2]).mean()
for data in test_reader():
t1 = time.time()
loss, acc1, acc5 = exe.run(
inference_program,
feed=feeder.feed(data),
fetch_list=[avg_cost, acc_top1, acc_top5])
t2 = time.time()
period = t2 - t1
test_info[0].append(loss[0])
test_info[1].append(acc1[0])
test_info[2].append(acc5[0])
if batch_id % 10 == 0:
print("Pass {0},testbatch {1},loss {2}, \
acc1 {3},acc5 {4},time {5}"
.format(pass_id, \
batch_id, loss[0], acc1[0], acc5[0], \
"%2.2f sec" % period))
sys.stdout.flush()
test_loss = np.array(test_info[0]).mean()
test_acc1 = np.array(test_info[1]).mean()
test_acc5 = np.array(test_info[2]).mean()
print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \
test_loss {4}, test_acc1 {5}, test_acc5 {6}"
.format(pass_id, \
train_loss, train_acc1, train_acc5, test_loss, test_acc1, \
test_acc5))
sys.stdout.flush()
model_path = os.path.join(model_save_dir, str(pass_id))
if not os.path.isdir(model_path):
os.makedirs(model_path)
fluid.io.save_persistables(exe, model_path)
def train_parallel_exe(args,
learning_rate,
batch_size,
num_passes,
init_model=None,
model_save_dir='model',
parallel=True,
use_nccl=True,
lr_strategy=None,
layers=50):
class_dim = 1000
image_shape = [3, 224, 224]
image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
cost = fluid.layers.cross_entropy(input=out, label=label)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
avg_cost = fluid.layers.mean(x=cost)
test_program = fluid.default_main_program().clone(for_test=True)
if lr_strategy is None:
optimizer = fluid.optimizer.Momentum(
learning_rate=learning_rate,
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
else:
bd = lr_strategy["bd"]
lr = lr_strategy["lr"]
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4))
opts = optimizer.minimize(avg_cost)
if args.with_mem_opt:
fluid.memory_optimize(fluid.default_main_program())
fluid.memory_optimize(test_program)
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
if init_model is not None:
fluid.io.load_persistables(exe, init_model)
train_reader = paddle.batch(reader.train(), batch_size=batch_size)
test_reader = paddle.batch(reader.test(), batch_size=batch_size)
feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
test_exe = fluid.ParallelExecutor(
use_cuda=True,
main_program=test_program,
share_vars_from=train_exe)
fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]
for pass_id in range(num_passes):
train_info = [[], [], []]
test_info = [[], [], []]
for batch_id, data in enumerate(train_reader()):
t1 = time.time()
loss, acc1, acc5 = train_exe.run(
fetch_list,
feed_dict=feeder.feed(data))
t2 = time.time()
period = t2 - t1
loss = np.mean(np.array(loss))
acc1 = np.mean(np.array(acc1))
acc5 = np.mean(np.array(acc5))
train_info[0].append(loss)
train_info[1].append(acc1)
train_info[2].append(acc5)
if batch_id % 10 == 0:
print("Pass {0}, trainbatch {1}, loss {2}, \
acc1 {3}, acc5 {4} time {5}"
.format(pass_id, \
batch_id, loss, acc1, acc5, \
"%2.2f sec" % period))
sys.stdout.flush()
train_loss = np.array(train_info[0]).mean()
train_acc1 = np.array(train_info[1]).mean()
train_acc5 = np.array(train_info[2]).mean()
for data in test_reader():
t1 = time.time()
loss, acc1, acc5 = test_exe.run(
fetch_list,
feed_dict=feeder.feed(data))
t2 = time.time()
period = t2 - t1
loss = np.mean(np.array(loss))
acc1 = np.mean(np.array(acc1))
acc5 = np.mean(np.array(acc5))
test_info[0].append(loss)
test_info[1].append(acc1)
test_info[2].append(acc5)
if batch_id % 10 == 0:
print("Pass {0},testbatch {1},loss {2}, \
acc1 {3},acc5 {4},time {5}"
.format(pass_id, \
batch_id, loss, acc1, acc5, \
"%2.2f sec" % period))
sys.stdout.flush()
test_loss = np.array(test_info[0]).mean()
test_acc1 = np.array(test_info[1]).mean()
test_acc5 = np.array(test_info[2]).mean()
print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \
test_loss {4}, test_acc1 {5}, test_acc5 {6}"
.format(pass_id, \
train_loss, train_acc1, train_acc5, test_loss, test_acc1, \
test_acc5))
sys.stdout.flush()
model_path = os.path.join(model_save_dir, str(pass_id))
if not os.path.isdir(model_path):
os.makedirs(model_path)
fluid.io.save_persistables(exe, model_path)
if __name__ == '__main__':
args = parser.parse_args()
print_arguments(args)
epoch_points = [30, 60, 90]
total_images = 1281167
batch_size = args.batch_size
step = int(total_images / batch_size + 1)
bd = [e * step for e in epoch_points]
lr = [0.1, 0.01, 0.001, 0.0001]
lr_strategy = {"bd": bd, "lr": lr}
use_nccl = True
# layers: 50, 152
layers = args.num_layers
method = train_parallel_exe if args.parallel_exe else train_parallel_do
method(args,
learning_rate=0.1,
batch_size=batch_size,
num_passes=120,
init_model=None,
parallel=True,
use_nccl=True,
lr_strategy=lr_strategy,
layers=layers)
"""Contains common utility functions."""
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import distutils.util
import numpy as np
from paddle.fluid import core
def print_arguments(args):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
print("----------- Configuration Arguments -----------")
for arg, value in sorted(vars(args).iteritems()):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
...@@ -15,6 +15,9 @@ class TrainTaskConfig(object): ...@@ -15,6 +15,9 @@ class TrainTaskConfig(object):
# the parameters for learning rate scheduling. # the parameters for learning rate scheduling.
warmup_steps = 4000 warmup_steps = 4000
# the flag indicating to use average loss or sum loss when training.
use_avg_cost = False
# the directory for saving trained models. # the directory for saving trained models.
model_dir = "trained_models" model_dir = "trained_models"
...@@ -22,8 +25,7 @@ class TrainTaskConfig(object): ...@@ -22,8 +25,7 @@ class TrainTaskConfig(object):
class InferTaskConfig(object): class InferTaskConfig(object):
use_gpu = False use_gpu = False
# the number of examples in one run for sequence generation. # the number of examples in one run for sequence generation.
# currently the batch size can only be set to 1. batch_size = 10
batch_size = 1
# the parameters for beam search. # the parameters for beam search.
beam_size = 5 beam_size = 5
...@@ -31,37 +33,38 @@ class InferTaskConfig(object): ...@@ -31,37 +33,38 @@ class InferTaskConfig(object):
# the number of decoded sentences to output. # the number of decoded sentences to output.
n_best = 1 n_best = 1
# the flags indicating whether to output the special tokens.
output_bos = False
output_eos = False
output_unk = False
# the directory for loading the trained model. # the directory for loading the trained model.
model_path = "trained_models/pass_1.infer.model" model_path = "trained_models/pass_1.infer.model"
class ModelHyperParams(object): class ModelHyperParams(object):
# Dictionary size for source and target language. This model directly uses # This model directly uses paddle.dataset.wmt16 in which <bos>, <eos> and
# paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has # <unk> token has alreay been added. As for the <pad> token, any token
# alreay been added, but the <pad> token is not added. Transformer requires # included in dict can be used to pad, since the paddings' loss will be
# sequences in a mini-batch are padded to have the same length. A <pad> token is # masked out and make no effect on parameter gradients.
# added into the original dictionary in paddle.dateset.wmt16.
# size of source word dictionary. # size of source word dictionary.
src_vocab_size = 10000 src_vocab_size = 10000
# index for <pad> token in source language.
src_pad_idx = src_vocab_size
# size of target word dictionay # size of target word dictionay
trg_vocab_size = 10000 trg_vocab_size = 10000
# index for <pad> token in target language.
trg_pad_idx = trg_vocab_size
# index for <bos> token # index for <bos> token
bos_idx = 0 bos_idx = 0
# index for <eos> token # index for <eos> token
eos_idx = 1 eos_idx = 1
# index for <unk> token
unk_idx = 2
# position value corresponding to the <pad> token. # max length of sequences.
pos_pad_idx = 0 # The size of position encoding table should at least plus 1, since the
# sinusoid position encoding starts from 1 and 0 can be used as the padding
# max length of sequences. It should plus 1 to include position # token for position encoding.
# padding token for position encoding.
max_length = 50 max_length = 50
# the dimension for word embeddings, which is also the last dimension of # the dimension for word embeddings, which is also the last dimension of
...@@ -93,6 +96,7 @@ encoder_input_data_names = ( ...@@ -93,6 +96,7 @@ encoder_input_data_names = (
"src_word", "src_word",
"src_pos", "src_pos",
"src_slf_attn_bias", "src_slf_attn_bias",
"src_data_shape",
"src_slf_attn_pre_softmax_shape", "src_slf_attn_pre_softmax_shape",
"src_slf_attn_post_softmax_shape", ) "src_slf_attn_post_softmax_shape", )
...@@ -102,6 +106,7 @@ decoder_input_data_names = ( ...@@ -102,6 +106,7 @@ decoder_input_data_names = (
"trg_pos", "trg_pos",
"trg_slf_attn_bias", "trg_slf_attn_bias",
"trg_src_attn_bias", "trg_src_attn_bias",
"trg_data_shape",
"trg_slf_attn_pre_softmax_shape", "trg_slf_attn_pre_softmax_shape",
"trg_slf_attn_post_softmax_shape", "trg_slf_attn_post_softmax_shape",
"trg_src_attn_pre_softmax_shape", "trg_src_attn_pre_softmax_shape",
......
...@@ -11,10 +11,26 @@ from config import InferTaskConfig, ModelHyperParams, \ ...@@ -11,10 +11,26 @@ from config import InferTaskConfig, ModelHyperParams, \
from train import pad_batch_data from train import pad_batch_data
def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, def translate_batch(exe,
decoder, dec_in_names, dec_out_names, beam_size, max_length, src_words,
n_best, batch_size, n_head, src_pad_idx, trg_pad_idx, encoder,
bos_idx, eos_idx): enc_in_names,
enc_out_names,
decoder,
dec_in_names,
dec_out_names,
beam_size,
max_length,
n_best,
batch_size,
n_head,
d_model,
src_pad_idx,
trg_pad_idx,
bos_idx,
eos_idx,
unk_idx,
output_unk=True):
""" """
Run the encoder program once and run the decoder program multiple times to Run the encoder program once and run the decoder program multiple times to
implement beam search externally. implement beam search externally.
...@@ -25,9 +41,14 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, ...@@ -25,9 +41,14 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
src_pad_idx, src_pad_idx,
n_head, n_head,
is_target=False, is_target=False,
return_pos=True, is_label=False,
return_attn_bias=True, return_attn_bias=True,
return_max_len=False) return_max_len=False)
# Append the data shape input to reshape the output of embedding layer.
enc_in_data = enc_in_data + [
np.array(
[-1, enc_in_data[2].shape[-1], d_model], dtype="int32")
]
# Append the shape inputs to reshape before and after softmax in encoder # Append the shape inputs to reshape before and after softmax in encoder
# self attention. # self attention.
enc_in_data = enc_in_data + [ enc_in_data = enc_in_data + [
...@@ -44,11 +65,16 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, ...@@ -44,11 +65,16 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
scores = np.zeros((batch_size, beam_size), dtype="float32") scores = np.zeros((batch_size, beam_size), dtype="float32")
prev_branchs = [[] for i in range(batch_size)] prev_branchs = [[] for i in range(batch_size)]
next_ids = [[] for i in range(batch_size)] next_ids = [[] for i in range(batch_size)]
# Use beam_map to map the instance idx in batch to beam idx, since the # Use beam_inst_map to map beam idx to the instance idx in batch, since the
# size of feeded batch is changing. # size of feeded batch is changing.
beam_map = range(batch_size) beam_inst_map = {
beam_idx: inst_idx
for inst_idx, beam_idx in enumerate(range(batch_size))
}
# Use active_beams to recode the alive.
active_beams = range(batch_size)
def beam_backtrace(prev_branchs, next_ids, n_best=beam_size, add_bos=True): def beam_backtrace(prev_branchs, next_ids, n_best=beam_size):
""" """
Decode and select n_best sequences for one instance by backtrace. Decode and select n_best sequences for one instance by backtrace.
""" """
...@@ -60,7 +86,8 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, ...@@ -60,7 +86,8 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
seq.append(next_ids[j][k]) seq.append(next_ids[j][k])
k = prev_branchs[j][k] k = prev_branchs[j][k]
seq = seq[::-1] seq = seq[::-1]
seq = [bos_idx] + seq if add_bos else seq # Add the <bos>, since next_ids don't include the <bos>.
seq = [bos_idx] + seq
seqs.append(seq) seqs.append(seq)
return seqs return seqs
...@@ -82,8 +109,14 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, ...@@ -82,8 +109,14 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
[-1e9]).astype("float32") [-1e9]).astype("float32")
# This is used to remove attention on the paddings of source sequences. # This is used to remove attention on the paddings of source sequences.
trg_src_attn_bias = np.tile( trg_src_attn_bias = np.tile(
src_slf_attn_bias[:, :, ::src_max_length, :], src_slf_attn_bias[:, :, ::src_max_length, :][:, np.newaxis],
[beam_size, 1, trg_max_len, 1]) [1, beam_size, 1, trg_max_len, 1]).reshape([
-1, src_slf_attn_bias.shape[1], trg_max_len,
src_slf_attn_bias.shape[-1]
])
# Append the shape input to reshape the output of embedding layer.
trg_data_shape = np.array(
[batch_size * beam_size, trg_max_len, d_model], dtype="int32")
# Append the shape inputs to reshape before and after softmax in # Append the shape inputs to reshape before and after softmax in
# decoder self attention. # decoder self attention.
trg_slf_attn_pre_softmax_shape = np.array( trg_slf_attn_pre_softmax_shape = np.array(
...@@ -96,26 +129,27 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, ...@@ -96,26 +129,27 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
[-1, trg_src_attn_bias.shape[-1]], dtype="int32") [-1, trg_src_attn_bias.shape[-1]], dtype="int32")
trg_src_attn_post_softmax_shape = np.array( trg_src_attn_post_softmax_shape = np.array(
trg_src_attn_bias.shape, dtype="int32") trg_src_attn_bias.shape, dtype="int32")
enc_output = np.tile(enc_output, [beam_size, 1, 1]) enc_output = np.tile(
enc_output[:, np.newaxis], [1, beam_size, 1, 1]).reshape(
[-1, enc_output.shape[-2], enc_output.shape[-1]])
return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape, \ trg_data_shape, trg_slf_attn_pre_softmax_shape, \
trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape, \ trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \
enc_output trg_src_attn_post_softmax_shape, enc_output
def update_dec_in_data(dec_in_data, next_ids, active_beams): def update_dec_in_data(dec_in_data, next_ids, active_beams, beam_inst_map):
""" """
Update the input data of decoder mainly by slicing from the previous Update the input data of decoder mainly by slicing from the previous
input data and dropping the finished instance beams. input data and dropping the finished instance beams.
""" """
trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape, \ trg_data_shape, trg_slf_attn_pre_softmax_shape, \
trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape, \ trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \
enc_output = dec_in_data trg_src_attn_post_softmax_shape, enc_output = dec_in_data
trg_cur_len = len(next_ids[0]) + 1 # include the <bos> trg_cur_len = trg_slf_attn_bias.shape[-1] + 1
trg_words = np.array( trg_words = np.array(
[ [
beam_backtrace( beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx])
prev_branchs[beam_idx], next_ids[beam_idx], add_bos=True)
for beam_idx in active_beams for beam_idx in active_beams
], ],
dtype="int64") dtype="int64")
...@@ -123,6 +157,7 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, ...@@ -123,6 +157,7 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
trg_pos = np.array( trg_pos = np.array(
[range(1, trg_cur_len + 1)] * len(active_beams) * beam_size, [range(1, trg_cur_len + 1)] * len(active_beams) * beam_size,
dtype="int64").reshape([-1, 1]) dtype="int64").reshape([-1, 1])
active_beams = [beam_inst_map[beam_idx] for beam_idx in active_beams]
active_beams_indice = ( active_beams_indice = (
(np.array(active_beams) * beam_size)[:, np.newaxis] + (np.array(active_beams) * beam_size)[:, np.newaxis] +
np.array(range(beam_size))[np.newaxis, :]).flatten() np.array(range(beam_size))[np.newaxis, :]).flatten()
...@@ -137,6 +172,10 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, ...@@ -137,6 +172,10 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
trg_src_attn_bias = np.tile(trg_src_attn_bias[ trg_src_attn_bias = np.tile(trg_src_attn_bias[
active_beams_indice, :, ::trg_src_attn_bias.shape[2], :], active_beams_indice, :, ::trg_src_attn_bias.shape[2], :],
[1, 1, trg_cur_len, 1]) [1, 1, trg_cur_len, 1])
# Append the shape input to reshape the output of embedding layer.
trg_data_shape = np.array(
[len(active_beams) * beam_size, trg_cur_len, d_model],
dtype="int32")
# Append the shape inputs to reshape before and after softmax in # Append the shape inputs to reshape before and after softmax in
# decoder self attention. # decoder self attention.
trg_slf_attn_pre_softmax_shape = np.array( trg_slf_attn_pre_softmax_shape = np.array(
...@@ -151,9 +190,9 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, ...@@ -151,9 +190,9 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
trg_src_attn_bias.shape, dtype="int32") trg_src_attn_bias.shape, dtype="int32")
enc_output = enc_output[active_beams_indice, :, :] enc_output = enc_output[active_beams_indice, :, :]
return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape, \ trg_data_shape, trg_slf_attn_pre_softmax_shape, \
trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape, \ trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \
enc_output trg_src_attn_post_softmax_shape, enc_output
dec_in_data = init_dec_in_data(batch_size, beam_size, enc_in_data, dec_in_data = init_dec_in_data(batch_size, beam_size, enc_in_data,
enc_output) enc_output)
...@@ -162,13 +201,18 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, ...@@ -162,13 +201,18 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
feed=dict(zip(dec_in_names, dec_in_data)), feed=dict(zip(dec_in_names, dec_in_data)),
fetch_list=dec_out_names)[0] fetch_list=dec_out_names)[0]
predict_all = np.log( predict_all = np.log(
predict_all.reshape([len(beam_map) * beam_size, i + 1, -1])[:, predict_all.reshape([len(beam_inst_map) * beam_size, i + 1, -1])
-1, :]) [:, -1, :])
predict_all = (predict_all + scores[beam_map].reshape( predict_all = (predict_all + scores[active_beams].reshape(
[len(beam_map) * beam_size, -1])).reshape( [len(beam_inst_map) * beam_size, -1])).reshape(
[len(beam_map), beam_size, -1]) [len(beam_inst_map), beam_size, -1])
if not output_unk: # To exclude the <unk> token.
predict_all[:, :, unk_idx] = -1e9
active_beams = [] active_beams = []
for inst_idx, beam_idx in enumerate(beam_map): for beam_idx in range(batch_size):
if not beam_inst_map.has_key(beam_idx):
continue
inst_idx = beam_inst_map[beam_idx]
predict = (predict_all[inst_idx, :, :] predict = (predict_all[inst_idx, :, :]
if i != 0 else predict_all[inst_idx, 0, :]).flatten() if i != 0 else predict_all[inst_idx, 0, :]).flatten()
top_k_indice = np.argpartition(predict, -beam_size)[-beam_size:] top_k_indice = np.argpartition(predict, -beam_size)[-beam_size:]
...@@ -181,13 +225,20 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, ...@@ -181,13 +225,20 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
next_ids[beam_idx].append(top_scores_ids % predict_all.shape[-1]) next_ids[beam_idx].append(top_scores_ids % predict_all.shape[-1])
if next_ids[beam_idx][-1][0] != eos_idx: if next_ids[beam_idx][-1][0] != eos_idx:
active_beams.append(beam_idx) active_beams.append(beam_idx)
beam_map = active_beams if len(active_beams) == 0:
if len(beam_map) == 0:
break break
dec_in_data = update_dec_in_data(dec_in_data, next_ids, active_beams) dec_in_data = update_dec_in_data(dec_in_data, next_ids, active_beams,
beam_inst_map)
beam_inst_map = {
beam_idx: inst_idx
for inst_idx, beam_idx in enumerate(active_beams)
}
# Decode beams and select n_best sequences for each instance by backtrace. # Decode beams and select n_best sequences for each instance by backtrace.
seqs = [beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx], n_best)] seqs = [
beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx], n_best)
for beam_idx in range(batch_size)
]
return seqs, scores[:, :n_best].tolist() return seqs, scores[:, :n_best].tolist()
...@@ -195,29 +246,24 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, ...@@ -195,29 +246,24 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
def main(): def main():
place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace() place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
# The current program desc is coupled with batch_size and the only
# supported batch size is 1 currently.
encoder_program = fluid.Program() encoder_program = fluid.Program()
model.batch_size = InferTaskConfig.batch_size
with fluid.program_guard(main_program=encoder_program): with fluid.program_guard(main_program=encoder_program):
enc_output = encoder( enc_output = encoder(
ModelHyperParams.src_vocab_size + 1, ModelHyperParams.src_vocab_size, ModelHyperParams.max_length + 1,
ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, ModelHyperParams.n_layer, ModelHyperParams.n_head,
ModelHyperParams.n_head, ModelHyperParams.d_key, ModelHyperParams.d_key, ModelHyperParams.d_value,
ModelHyperParams.d_value, ModelHyperParams.d_model, ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
ModelHyperParams.d_inner_hid, ModelHyperParams.dropout, ModelHyperParams.dropout)
ModelHyperParams.src_pad_idx, ModelHyperParams.pos_pad_idx)
model.batch_size = InferTaskConfig.batch_size * InferTaskConfig.beam_size
decoder_program = fluid.Program() decoder_program = fluid.Program()
with fluid.program_guard(main_program=decoder_program): with fluid.program_guard(main_program=decoder_program):
predict = decoder( predict = decoder(
ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1,
ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, ModelHyperParams.n_layer, ModelHyperParams.n_head,
ModelHyperParams.n_head, ModelHyperParams.d_key, ModelHyperParams.d_key, ModelHyperParams.d_value,
ModelHyperParams.d_value, ModelHyperParams.d_model, ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
ModelHyperParams.d_inner_hid, ModelHyperParams.dropout, ModelHyperParams.dropout)
ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
# Load model parameters of encoder and decoder separately from the saved # Load model parameters of encoder and decoder separately from the saved
# transformer model. # transformer model.
...@@ -254,17 +300,51 @@ def main(): ...@@ -254,17 +300,51 @@ def main():
trg_idx2word = paddle.dataset.wmt16.get_dict( trg_idx2word = paddle.dataset.wmt16.get_dict(
"de", dict_size=ModelHyperParams.trg_vocab_size, reverse=True) "de", dict_size=ModelHyperParams.trg_vocab_size, reverse=True)
def post_process_seq(seq,
bos_idx=ModelHyperParams.bos_idx,
eos_idx=ModelHyperParams.eos_idx,
output_bos=InferTaskConfig.output_bos,
output_eos=InferTaskConfig.output_eos):
"""
Post-process the beam-search decoded sequence. Truncate from the first
<eos> and remove the <bos> and <eos> tokens currently.
"""
eos_pos = len(seq) - 1
for i, idx in enumerate(seq):
if idx == eos_idx:
eos_pos = i
break
seq = seq[:eos_pos + 1]
return filter(
lambda idx: (output_bos or idx != bos_idx) and \
(output_eos or idx != eos_idx),
seq)
for batch_id, data in enumerate(test_data()): for batch_id, data in enumerate(test_data()):
batch_seqs, batch_scores = translate_batch( batch_seqs, batch_scores = translate_batch(
exe, [item[0] for item in data], encoder_program, exe,
encoder_input_data_names, [enc_output.name], decoder_program, [item[0] for item in data],
decoder_input_data_names, [predict.name], InferTaskConfig.beam_size, encoder_program,
InferTaskConfig.max_length, InferTaskConfig.n_best, encoder_input_data_names,
len(data), ModelHyperParams.n_head, ModelHyperParams.src_pad_idx, [enc_output.name],
ModelHyperParams.trg_pad_idx, ModelHyperParams.bos_idx, decoder_program,
ModelHyperParams.eos_idx) decoder_input_data_names,
[predict.name],
InferTaskConfig.beam_size,
InferTaskConfig.max_length,
InferTaskConfig.n_best,
len(data),
ModelHyperParams.n_head,
ModelHyperParams.d_model,
ModelHyperParams.eos_idx, # Use eos_idx to pad.
ModelHyperParams.eos_idx, # Use eos_idx to pad.
ModelHyperParams.bos_idx,
ModelHyperParams.eos_idx,
ModelHyperParams.unk_idx,
output_unk=InferTaskConfig.output_unk)
for i in range(len(batch_seqs)): for i in range(len(batch_seqs)):
seqs = batch_seqs[i] # Post-process the beam-search decoded sequences.
seqs = map(post_process_seq, batch_seqs[i])
scores = batch_scores[i] scores = batch_scores[i]
for seq in seqs: for seq in seqs:
print(" ".join([trg_idx2word[idx] for idx in seq])) print(" ".join([trg_idx2word[idx] for idx in seq]))
......
...@@ -7,9 +7,6 @@ import paddle.fluid.layers as layers ...@@ -7,9 +7,6 @@ import paddle.fluid.layers as layers
from config import TrainTaskConfig, pos_enc_param_names, \ from config import TrainTaskConfig, pos_enc_param_names, \
encoder_input_data_names, decoder_input_data_names, label_data_names encoder_input_data_names, decoder_input_data_names, label_data_names
# FIXME(guosheng): Remove out the batch_size from the model.
batch_size = TrainTaskConfig.batch_size
def position_encoding_init(n_position, d_pos_vec): def position_encoding_init(n_position, d_pos_vec):
""" """
...@@ -85,9 +82,10 @@ def multi_head_attention(queries, ...@@ -85,9 +82,10 @@ def multi_head_attention(queries,
return x return x
hidden_size = x.shape[-1] hidden_size = x.shape[-1]
# FIXME(guosheng): Decouple the program desc with batch_size. # The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
reshaped = layers.reshape( reshaped = layers.reshape(
x=x, shape=[batch_size, -1, n_head, hidden_size // n_head]) x=x, shape=[0, -1, n_head, hidden_size // n_head])
# permuate the dimensions into: # permuate the dimensions into:
# [batch_size, n_head, max_sequence_len, hidden_size_per_head] # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
...@@ -103,11 +101,11 @@ def multi_head_attention(queries, ...@@ -103,11 +101,11 @@ def multi_head_attention(queries,
raise ValueError("Input(x) should be a 4-D Tensor.") raise ValueError("Input(x) should be a 4-D Tensor.")
trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
# FIXME(guosheng): Decouple the program desc with batch_size. # The value 0 in shape attr means copying the corresponding dimension
# size of the input as the output dimension size.
return layers.reshape( return layers.reshape(
x=trans_x, x=trans_x,
shape=map(int, shape=map(int, [0, -1, trans_x.shape[2] * trans_x.shape[3]]))
[batch_size, -1, trans_x.shape[2] * trans_x.shape[3]]))
def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate): def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
""" """
...@@ -201,10 +199,9 @@ def prepare_encoder(src_word, ...@@ -201,10 +199,9 @@ def prepare_encoder(src_word,
src_pos, src_pos,
src_vocab_size, src_vocab_size,
src_emb_dim, src_emb_dim,
src_pad_idx,
src_max_len, src_max_len,
dropout_rate=0., dropout_rate=0.,
pos_pad_idx=0, src_data_shape=None,
pos_enc_param_name=None): pos_enc_param_name=None):
"""Add word embeddings and position encodings. """Add word embeddings and position encodings.
The output tensor has a shape of: The output tensor has a shape of:
...@@ -215,18 +212,17 @@ def prepare_encoder(src_word, ...@@ -215,18 +212,17 @@ def prepare_encoder(src_word,
src_word_emb = layers.embedding( src_word_emb = layers.embedding(
src_word, src_word,
size=[src_vocab_size, src_emb_dim], size=[src_vocab_size, src_emb_dim],
padding_idx=src_pad_idx,
param_attr=fluid.initializer.Normal(0., 1.)) param_attr=fluid.initializer.Normal(0., 1.))
src_pos_enc = layers.embedding( src_pos_enc = layers.embedding(
src_pos, src_pos,
size=[src_max_len, src_emb_dim], size=[src_max_len, src_emb_dim],
padding_idx=pos_pad_idx,
param_attr=fluid.ParamAttr( param_attr=fluid.ParamAttr(
name=pos_enc_param_name, trainable=False)) name=pos_enc_param_name, trainable=False))
enc_input = src_word_emb + src_pos_enc enc_input = src_word_emb + src_pos_enc
enc_input = layers.reshape(
# FIXME(guosheng): Decouple the program desc with batch_size. x=enc_input,
enc_input = layers.reshape(x=enc_input, shape=[batch_size, -1, src_emb_dim]) shape=[-1, src_max_len, src_emb_dim],
actual_shape=src_data_shape)
return layers.dropout( return layers.dropout(
enc_input, dropout_prob=dropout_rate, enc_input, dropout_prob=dropout_rate,
is_test=False) if dropout_rate else enc_input is_test=False) if dropout_rate else enc_input
...@@ -401,20 +397,23 @@ def decoder(dec_input, ...@@ -401,20 +397,23 @@ def decoder(dec_input,
def make_inputs(input_data_names, def make_inputs(input_data_names,
n_head, n_head,
d_model, d_model,
batch_size,
max_length, max_length,
is_pos, is_pos,
slf_attn_bias_flag, slf_attn_bias_flag,
src_attn_bias_flag, src_attn_bias_flag,
enc_output_flag=False, enc_output_flag=False,
data_shape_flag=True,
slf_attn_shape_flag=True, slf_attn_shape_flag=True,
src_attn_shape_flag=True): src_attn_shape_flag=True):
""" """
Define the input data layers for the transformer model. Define the input data layers for the transformer model.
""" """
input_layers = [] input_layers = []
# The shapes here act as placeholder. batch_size = 1 # Only for the infer-shape in compile time.
# The shapes set here is to pass the infer-shape in compile time. # The shapes here act as placeholder and are set to pass the infer-shape in
# compile time.
# The actual data shape of word is:
# [batch_size * max_len_in_batch, 1]
word = layers.data( word = layers.data(
name=input_data_names[len(input_layers)], name=input_data_names[len(input_layers)],
shape=[batch_size * max_length, 1], shape=[batch_size * max_length, 1],
...@@ -422,6 +421,8 @@ def make_inputs(input_data_names, ...@@ -422,6 +421,8 @@ def make_inputs(input_data_names,
append_batch_size=False) append_batch_size=False)
input_layers += [word] input_layers += [word]
# This is used for position data or label weight. # This is used for position data or label weight.
# The actual data shape of pos is:
# [batch_size * max_len_in_batch, 1]
pos = layers.data( pos = layers.data(
name=input_data_names[len(input_layers)], name=input_data_names[len(input_layers)],
shape=[batch_size * max_length, 1], shape=[batch_size * max_length, 1],
...@@ -432,6 +433,8 @@ def make_inputs(input_data_names, ...@@ -432,6 +433,8 @@ def make_inputs(input_data_names,
# This input is used to remove attention weights on paddings for the # This input is used to remove attention weights on paddings for the
# encoder and to remove attention weights on subsequent words for the # encoder and to remove attention weights on subsequent words for the
# decoder. # decoder.
# The actual data shape of slf_attn_bias_flag is:
# [batch_size, n_head, max_len_in_batch, max_len_in_batch]
slf_attn_bias = layers.data( slf_attn_bias = layers.data(
name=input_data_names[len(input_layers)], name=input_data_names[len(input_layers)],
shape=[batch_size, n_head, max_length, max_length], shape=[batch_size, n_head, max_length, max_length],
...@@ -439,40 +442,60 @@ def make_inputs(input_data_names, ...@@ -439,40 +442,60 @@ def make_inputs(input_data_names,
append_batch_size=False) append_batch_size=False)
input_layers += [slf_attn_bias] input_layers += [slf_attn_bias]
if src_attn_bias_flag: if src_attn_bias_flag:
# This input is used to remove attention weights on paddings. # This input is used to remove attention weights on paddings. It's used
# in encoder-decoder attention.
# The actual data shape of slf_attn_bias_flag is:
# [batch_size, n_head, trg_max_len_in_batch, src_max_len_in_batch]
src_attn_bias = layers.data( src_attn_bias = layers.data(
name=input_data_names[len(input_layers)], name=input_data_names[len(input_layers)],
shape=[batch_size, n_head, max_length, max_length], shape=[batch_size, n_head, max_length, max_length],
dtype="float32", dtype="float32",
append_batch_size=False) append_batch_size=False)
input_layers += [src_attn_bias] input_layers += [src_attn_bias]
if data_shape_flag:
# This input is used to reshape the output of embedding layer.
data_shape = layers.data(
name=input_data_names[len(input_layers)],
shape=[3],
dtype="int32",
append_batch_size=False)
input_layers += [data_shape]
if slf_attn_shape_flag: if slf_attn_shape_flag:
# This shape input is used to reshape before softmax in self attention.
slf_attn_pre_softmax_shape = layers.data( slf_attn_pre_softmax_shape = layers.data(
name=input_data_names[len(input_layers)], name=input_data_names[len(input_layers)],
shape=[3], shape=[2],
dtype="int32", dtype="int32",
append_batch_size=False) append_batch_size=False)
input_layers += [slf_attn_pre_softmax_shape] input_layers += [slf_attn_pre_softmax_shape]
# This shape input is used to reshape after softmax in self attention.
slf_attn_post_softmax_shape = layers.data( slf_attn_post_softmax_shape = layers.data(
name=input_data_names[len(input_layers)], name=input_data_names[len(input_layers)],
shape=[3], shape=[4],
dtype="int32", dtype="int32",
append_batch_size=False) append_batch_size=False)
input_layers += [slf_attn_post_softmax_shape] input_layers += [slf_attn_post_softmax_shape]
if src_attn_shape_flag: if src_attn_shape_flag:
# This shape input is used to reshape before softmax in encoder-decoder
# attention.
src_attn_pre_softmax_shape = layers.data( src_attn_pre_softmax_shape = layers.data(
name=input_data_names[len(input_layers)], name=input_data_names[len(input_layers)],
shape=[3], shape=[2],
dtype="int32", dtype="int32",
append_batch_size=False) append_batch_size=False)
input_layers += [src_attn_pre_softmax_shape] input_layers += [src_attn_pre_softmax_shape]
# This shape input is used to reshape after softmax in encoder-decoder
# attention.
src_attn_post_softmax_shape = layers.data( src_attn_post_softmax_shape = layers.data(
name=input_data_names[len(input_layers)], name=input_data_names[len(input_layers)],
shape=[3], shape=[4],
dtype="int32", dtype="int32",
append_batch_size=False) append_batch_size=False)
input_layers += [src_attn_post_softmax_shape] input_layers += [src_attn_post_softmax_shape]
if enc_output_flag: if enc_output_flag:
# This input is used in independent decoder program for inference.
# The actual data shape of slf_attn_bias_flag is:
# [batch_size, max_len_in_batch, d_model]
enc_output = layers.data( enc_output = layers.data(
name=input_data_names[len(input_layers)], name=input_data_names[len(input_layers)],
shape=[batch_size, max_length, d_model], shape=[batch_size, max_length, d_model],
...@@ -493,20 +516,17 @@ def transformer( ...@@ -493,20 +516,17 @@ def transformer(
d_value, d_value,
d_model, d_model,
d_inner_hid, d_inner_hid,
dropout_rate, dropout_rate, ):
src_pad_idx, enc_inputs = make_inputs(
trg_pad_idx,
pos_pad_idx, ):
enc_input_layers = make_inputs(
encoder_input_data_names, encoder_input_data_names,
n_head, n_head,
d_model, d_model,
batch_size,
max_length, max_length,
is_pos=True, is_pos=True,
slf_attn_bias_flag=True, slf_attn_bias_flag=True,
src_attn_bias_flag=False, src_attn_bias_flag=False,
enc_output_flag=False, enc_output_flag=False,
data_shape_flag=True,
slf_attn_shape_flag=True, slf_attn_shape_flag=True,
src_attn_shape_flag=False) src_attn_shape_flag=False)
...@@ -520,20 +540,18 @@ def transformer( ...@@ -520,20 +540,18 @@ def transformer(
d_model, d_model,
d_inner_hid, d_inner_hid,
dropout_rate, dropout_rate,
src_pad_idx, enc_inputs, )
pos_pad_idx,
enc_input_layers, )
dec_input_layers = make_inputs( dec_inputs = make_inputs(
decoder_input_data_names, decoder_input_data_names,
n_head, n_head,
d_model, d_model,
batch_size,
max_length, max_length,
is_pos=True, is_pos=True,
slf_attn_bias_flag=True, slf_attn_bias_flag=True,
src_attn_bias_flag=True, src_attn_bias_flag=True,
enc_output_flag=False, enc_output_flag=False,
data_shape_flag=True,
slf_attn_shape_flag=True, slf_attn_shape_flag=True,
src_attn_shape_flag=True) src_attn_shape_flag=True)
...@@ -547,9 +565,7 @@ def transformer( ...@@ -547,9 +565,7 @@ def transformer(
d_model, d_model,
d_inner_hid, d_inner_hid,
dropout_rate, dropout_rate,
trg_pad_idx, dec_inputs,
pos_pad_idx,
dec_input_layers,
enc_output, ) enc_output, )
# Padding index do not contribute to the total loss. The weights is used to # Padding index do not contribute to the total loss. The weights is used to
...@@ -558,17 +574,20 @@ def transformer( ...@@ -558,17 +574,20 @@ def transformer(
label_data_names, label_data_names,
n_head, n_head,
d_model, d_model,
batch_size,
max_length, max_length,
is_pos=False, is_pos=False,
slf_attn_bias_flag=False, slf_attn_bias_flag=False,
src_attn_bias_flag=False, src_attn_bias_flag=False,
enc_output_flag=False, enc_output_flag=False,
data_shape_flag=False,
slf_attn_shape_flag=False, slf_attn_shape_flag=False,
src_attn_shape_flag=False) src_attn_shape_flag=False)
cost = layers.softmax_with_cross_entropy(logits=predict, label=gold) cost = layers.softmax_with_cross_entropy(logits=predict, label=gold)
weighted_cost = cost * weights weighted_cost = cost * weights
return layers.reduce_sum(weighted_cost), predict sum_cost = layers.reduce_sum(weighted_cost)
token_num = layers.reduce_sum(weights)
avg_cost = sum_cost / token_num
return sum_cost, avg_cost, predict, token_num
def wrap_encoder(src_vocab_size, def wrap_encoder(src_vocab_size,
...@@ -580,38 +599,38 @@ def wrap_encoder(src_vocab_size, ...@@ -580,38 +599,38 @@ def wrap_encoder(src_vocab_size,
d_model, d_model,
d_inner_hid, d_inner_hid,
dropout_rate, dropout_rate,
src_pad_idx, enc_inputs=None):
pos_pad_idx,
enc_input_layers=None):
""" """
The wrapper assembles together all needed layers for the encoder. The wrapper assembles together all needed layers for the encoder.
""" """
if enc_input_layers is None: if enc_inputs is None:
# This is used to implement independent encoder program in inference. # This is used to implement independent encoder program in inference.
src_word, src_pos, src_slf_attn_bias, slf_attn_pre_softmax_shape, \ src_word, src_pos, src_slf_attn_bias, src_data_shape, \
slf_attn_post_softmax_shape = make_inputs( slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \
make_inputs(
encoder_input_data_names, encoder_input_data_names,
n_head, n_head,
d_model, d_model,
batch_size,
max_length, max_length,
is_pos=True, is_pos=True,
slf_attn_bias_flag=True, slf_attn_bias_flag=True,
src_attn_bias_flag=False, src_attn_bias_flag=False,
enc_output_flag=False, enc_output_flag=False,
data_shape_flag=True,
slf_attn_shape_flag=True, slf_attn_shape_flag=True,
src_attn_shape_flag=False) src_attn_shape_flag=False)
else: else:
src_word, src_pos, src_slf_attn_bias, slf_attn_pre_softmax_shape, \ src_word, src_pos, src_slf_attn_bias, src_data_shape, \
slf_attn_post_softmax_shape = enc_input_layers slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \
enc_inputs
enc_input = prepare_encoder( enc_input = prepare_encoder(
src_word, src_word,
src_pos, src_pos,
src_vocab_size, src_vocab_size,
d_model, d_model,
src_pad_idx,
max_length, max_length,
dropout_rate, ) dropout_rate,
src_data_shape, )
enc_output = encoder( enc_output = encoder(
enc_input, enc_input,
src_slf_attn_bias, src_slf_attn_bias,
...@@ -636,44 +655,42 @@ def wrap_decoder(trg_vocab_size, ...@@ -636,44 +655,42 @@ def wrap_decoder(trg_vocab_size,
d_model, d_model,
d_inner_hid, d_inner_hid,
dropout_rate, dropout_rate,
trg_pad_idx, dec_inputs=None,
pos_pad_idx,
dec_input_layers=None,
enc_output=None): enc_output=None):
""" """
The wrapper assembles together all needed layers for the decoder. The wrapper assembles together all needed layers for the decoder.
""" """
if dec_input_layers is None: if dec_inputs is None:
# This is used to implement independent decoder program in inference. # This is used to implement independent decoder program in inference.
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape, \ trg_data_shape, slf_attn_pre_softmax_shape, \
src_attn_pre_softmax_shape, src_attn_post_softmax_shape, \ slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \
enc_output = make_inputs( src_attn_post_softmax_shape, enc_output = make_inputs(
decoder_input_data_names, decoder_input_data_names,
n_head, n_head,
d_model, d_model,
batch_size,
max_length, max_length,
is_pos=True, is_pos=True,
slf_attn_bias_flag=True, slf_attn_bias_flag=True,
src_attn_bias_flag=True, src_attn_bias_flag=True,
enc_output_flag=True, enc_output_flag=True,
data_shape_flag=True,
slf_attn_shape_flag=True, slf_attn_shape_flag=True,
src_attn_shape_flag=True) src_attn_shape_flag=True)
else: else:
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape, \ trg_data_shape, slf_attn_pre_softmax_shape, \
src_attn_pre_softmax_shape, src_attn_post_softmax_shape = \ slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \
dec_input_layers src_attn_post_softmax_shape = dec_inputs
dec_input = prepare_decoder( dec_input = prepare_decoder(
trg_word, trg_word,
trg_pos, trg_pos,
trg_vocab_size, trg_vocab_size,
d_model, d_model,
trg_pad_idx,
max_length, max_length,
dropout_rate, ) dropout_rate,
trg_data_shape, )
dec_output = decoder( dec_output = decoder(
dec_input, dec_input,
enc_output, enc_output,
...@@ -697,5 +714,5 @@ def wrap_decoder(trg_vocab_size, ...@@ -697,5 +714,5 @@ def wrap_decoder(trg_vocab_size,
bias_attr=False, bias_attr=False,
num_flatten_dims=2), num_flatten_dims=2),
shape=[-1, trg_vocab_size], shape=[-1, trg_vocab_size],
act="softmax" if dec_input_layers is None else None) act="softmax" if dec_inputs is None else None)
return predict return predict
import os import os
import time
import numpy as np import numpy as np
import paddle import paddle
...@@ -14,7 +15,7 @@ def pad_batch_data(insts, ...@@ -14,7 +15,7 @@ def pad_batch_data(insts,
pad_idx, pad_idx,
n_head, n_head,
is_target=False, is_target=False,
return_pos=True, is_label=False,
return_attn_bias=True, return_attn_bias=True,
return_max_len=True): return_max_len=True):
""" """
...@@ -23,14 +24,20 @@ def pad_batch_data(insts, ...@@ -23,14 +24,20 @@ def pad_batch_data(insts,
""" """
return_list = [] return_list = []
max_len = max(len(inst) for inst in insts) max_len = max(len(inst) for inst in insts)
# Any token included in dict can be used to pad, since the paddings' loss
# will be masked out by weights and make no effect on parameter gradients.
inst_data = np.array( inst_data = np.array(
[inst + [pad_idx] * (max_len - len(inst)) for inst in insts]) [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
return_list += [inst_data.astype("int64").reshape([-1, 1])] return_list += [inst_data.astype("int64").reshape([-1, 1])]
if return_pos: if is_label: # label weight
inst_pos = np.array([[ inst_weight = np.array(
pos_i + 1 if w_i != pad_idx else 0 for pos_i, w_i in enumerate(inst) [[1.] * len(inst) + [0.] * (max_len - len(inst)) for inst in insts])
] for inst in inst_data]) return_list += [inst_weight.astype("float32").reshape([-1, 1])]
else: # position data
inst_pos = np.array([
range(1, len(inst) + 1) + [0] * (max_len - len(inst))
for inst in insts
])
return_list += [inst_pos.astype("int64").reshape([-1, 1])] return_list += [inst_pos.astype("int64").reshape([-1, 1])]
if return_attn_bias: if return_attn_bias:
if is_target: if is_target:
...@@ -56,7 +63,7 @@ def pad_batch_data(insts, ...@@ -56,7 +63,7 @@ def pad_batch_data(insts,
def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx,
max_length, n_head): n_head, d_model):
""" """
Put all padded data needed by training into a dict. Put all padded data needed by training into a dict.
""" """
...@@ -66,6 +73,10 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, ...@@ -66,6 +73,10 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx,
[inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True) [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True)
trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
[1, 1, trg_max_len, 1]).astype("float32") [1, 1, trg_max_len, 1]).astype("float32")
# These shape tensors are used in reshape_op.
src_data_shape = np.array([len(insts), src_max_len, d_model], dtype="int32")
trg_data_shape = np.array([len(insts), trg_max_len, d_model], dtype="int32")
src_slf_attn_pre_softmax_shape = np.array( src_slf_attn_pre_softmax_shape = np.array(
[-1, src_slf_attn_bias.shape[-1]], dtype="int32") [-1, src_slf_attn_bias.shape[-1]], dtype="int32")
src_slf_attn_post_softmax_shape = np.array( src_slf_attn_post_softmax_shape = np.array(
...@@ -78,17 +89,24 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, ...@@ -78,17 +89,24 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx,
[-1, trg_src_attn_bias.shape[-1]], dtype="int32") [-1, trg_src_attn_bias.shape[-1]], dtype="int32")
trg_src_attn_post_softmax_shape = np.array( trg_src_attn_post_softmax_shape = np.array(
trg_src_attn_bias.shape, dtype="int32") trg_src_attn_bias.shape, dtype="int32")
lbl_word = pad_batch_data([inst[2] for inst in insts], trg_pad_idx, n_head,
False, False, False, False) lbl_word, lbl_weight = pad_batch_data(
lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1]) [inst[2] for inst in insts],
trg_pad_idx,
n_head,
is_target=False,
is_label=True,
return_attn_bias=False,
return_max_len=False)
input_dict = dict( input_dict = dict(
zip(input_data_names, [ zip(input_data_names, [
src_word, src_pos, src_slf_attn_bias, src_word, src_pos, src_slf_attn_bias, src_data_shape,
src_slf_attn_pre_softmax_shape, src_slf_attn_post_softmax_shape, src_slf_attn_pre_softmax_shape, src_slf_attn_post_softmax_shape,
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias,
trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape, trg_data_shape, trg_slf_attn_pre_softmax_shape,
trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape, trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape,
lbl_word, lbl_weight trg_src_attn_post_softmax_shape, lbl_word, lbl_weight
])) ]))
return input_dict return input_dict
...@@ -97,14 +115,12 @@ def main(): ...@@ -97,14 +115,12 @@ def main():
place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace() place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
cost, predict = transformer( sum_cost, avg_cost, predict, token_num = transformer(
ModelHyperParams.src_vocab_size + 1, ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1, ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
ModelHyperParams.n_layer, ModelHyperParams.n_head, ModelHyperParams.n_head, ModelHyperParams.d_key,
ModelHyperParams.d_key, ModelHyperParams.d_value, ModelHyperParams.d_value, ModelHyperParams.d_model,
ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, ModelHyperParams.d_inner_hid, ModelHyperParams.dropout)
ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model, lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
TrainTaskConfig.warmup_steps, place, TrainTaskConfig.warmup_steps, place,
...@@ -114,7 +130,7 @@ def main(): ...@@ -114,7 +130,7 @@ def main():
beta1=TrainTaskConfig.beta1, beta1=TrainTaskConfig.beta1,
beta2=TrainTaskConfig.beta2, beta2=TrainTaskConfig.beta2,
epsilon=TrainTaskConfig.eps) epsilon=TrainTaskConfig.eps)
optimizer.minimize(cost) optimizer.minimize(avg_cost if TrainTaskConfig.use_avg_cost else sum_cost)
train_data = paddle.batch( train_data = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
...@@ -126,27 +142,31 @@ def main(): ...@@ -126,27 +142,31 @@ def main():
# Program to do validation. # Program to do validation.
test_program = fluid.default_main_program().clone() test_program = fluid.default_main_program().clone()
with fluid.program_guard(test_program): with fluid.program_guard(test_program):
test_program = fluid.io.get_inference_program([cost]) test_program = fluid.io.get_inference_program([avg_cost])
val_data = paddle.batch( val_data = paddle.batch(
paddle.dataset.wmt16.validation(ModelHyperParams.src_vocab_size, paddle.dataset.wmt16.validation(ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size), ModelHyperParams.trg_vocab_size),
batch_size=TrainTaskConfig.batch_size) batch_size=TrainTaskConfig.batch_size)
def test(exe): def test(exe):
test_costs = [] test_total_cost = 0
test_total_token = 0
for batch_id, data in enumerate(val_data()): for batch_id, data in enumerate(val_data()):
if len(data) != TrainTaskConfig.batch_size:
continue
data_input = prepare_batch_input( data_input = prepare_batch_input(
data, encoder_input_data_names + decoder_input_data_names[:-1] + data, encoder_input_data_names + decoder_input_data_names[:-1] +
label_data_names, ModelHyperParams.src_pad_idx, label_data_names, ModelHyperParams.eos_idx,
ModelHyperParams.trg_pad_idx, ModelHyperParams.max_length, ModelHyperParams.eos_idx, ModelHyperParams.n_head,
ModelHyperParams.n_head) ModelHyperParams.d_model)
test_cost = exe.run(test_program, test_sum_cost, test_token_num = exe.run(
feed=data_input, test_program,
fetch_list=[cost])[0] feed=data_input,
test_costs.append(test_cost) fetch_list=[sum_cost, token_num],
return np.mean(test_costs) use_program_cache=True)
test_total_cost += test_sum_cost
test_total_token += test_token_num
test_avg_cost = test_total_cost / test_total_token
test_ppl = np.exp([min(test_avg_cost, 100)])
return test_avg_cost, test_ppl
# Initialize the parameters. # Initialize the parameters.
exe.run(fluid.framework.default_startup_program()) exe.run(fluid.framework.default_startup_program())
...@@ -158,27 +178,30 @@ def main(): ...@@ -158,27 +178,30 @@ def main():
ModelHyperParams.d_model), place) ModelHyperParams.d_model), place)
for pass_id in xrange(TrainTaskConfig.pass_num): for pass_id in xrange(TrainTaskConfig.pass_num):
pass_start_time = time.time()
for batch_id, data in enumerate(train_data()): for batch_id, data in enumerate(train_data()):
# The current program desc is coupled with batch_size, thus all
# mini-batches must have the same number of instances currently.
if len(data) != TrainTaskConfig.batch_size: if len(data) != TrainTaskConfig.batch_size:
continue continue
data_input = prepare_batch_input( data_input = prepare_batch_input(
data, encoder_input_data_names + decoder_input_data_names[:-1] + data, encoder_input_data_names + decoder_input_data_names[:-1] +
label_data_names, ModelHyperParams.src_pad_idx, label_data_names, ModelHyperParams.eos_idx,
ModelHyperParams.trg_pad_idx, ModelHyperParams.max_length, ModelHyperParams.eos_idx, ModelHyperParams.n_head,
ModelHyperParams.n_head) ModelHyperParams.d_model)
lr_scheduler.update_learning_rate(data_input) lr_scheduler.update_learning_rate(data_input)
outs = exe.run(fluid.framework.default_main_program(), outs = exe.run(fluid.framework.default_main_program(),
feed=data_input, feed=data_input,
fetch_list=[cost], fetch_list=[sum_cost, avg_cost],
use_program_cache=True) use_program_cache=True)
cost_val = np.array(outs[0]) sum_cost_val, avg_cost_val = np.array(outs[0]), np.array(outs[1])
print("pass_id = " + str(pass_id) + " batch = " + str(batch_id) + print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
" cost = " + str(cost_val)) (pass_id, batch_id, sum_cost_val, avg_cost_val,
np.exp([min(avg_cost_val[0], 100)])))
# Validate and save the model for inference. # Validate and save the model for inference.
val_cost = test(exe) val_avg_cost, val_ppl = test(exe)
print("pass_id = " + str(pass_id) + " val_cost = " + str(val_cost)) pass_end_time = time.time()
time_consumed = pass_end_time - pass_start_time
print("epoch: %d, val avg loss: %f, val ppl: %f, "
"consumed %fs" % (pass_id, val_avg_cost, val_ppl, time_consumed))
fluid.io.save_inference_model( fluid.io.save_inference_model(
os.path.join(TrainTaskConfig.model_dir, os.path.join(TrainTaskConfig.model_dir,
"pass_" + str(pass_id) + ".infer.model"), "pass_" + str(pass_id) + ".infer.model"),
......
./data/pascalvoc/VOCdevkit/
data/pascalvoc/test.txt
data/pascalvoc/trainval.txt
pretrained/ssd_mobilenet_v1_coco.tar.gz
pretrained/ssd_mobilenet_v1_coco
pretrained/mobilenet_v1_imagenet.tar.gz
pretrained/mobilenet_v1_imagenet
log*
...@@ -60,4 +60,5 @@ def prepare_filelist(devkit_dir, years, output_dir): ...@@ -60,4 +60,5 @@ def prepare_filelist(devkit_dir, years, output_dir):
ftest.write(item[0] + ' ' + item[1] + '\n') ftest.write(item[0] + ' ' + item[1] + '\n')
prepare_filelist(devkit_dir, years, '.') if __name__ == '__main__':
prepare_filelist(devkit_dir, years, '.')
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd "$DIR"
# Download the data.
echo "Downloading..."
wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
# Extract the data.
echo "Extractint..."
tar -xf VOCtrainval_11-May-2012.tar
tar -xf VOCtrainval_06-Nov-2007.tar
tar -xf VOCtest_06-Nov-2007.tar
echo "Creating data lists..."
python create_list.py
...@@ -85,8 +85,7 @@ def satisfy_sample_constraint(sampler, sample_bbox, bbox_labels): ...@@ -85,8 +85,7 @@ def satisfy_sample_constraint(sampler, sample_bbox, bbox_labels):
return False return False
def generate_batch_samples(batch_sampler, bbox_labels, image_width, def generate_batch_samples(batch_sampler, bbox_labels):
image_height):
sampled_bbox = [] sampled_bbox = []
index = [] index = []
c = 0 c = 0
...@@ -217,8 +216,8 @@ def distort_image(img, settings): ...@@ -217,8 +216,8 @@ def distort_image(img, settings):
def expand_image(img, bbox_labels, img_width, img_height, settings): def expand_image(img, bbox_labels, img_width, img_height, settings):
prob = random.uniform(0, 1) prob = random.uniform(0, 1)
if prob < settings._expand_prob: if prob < settings._expand_prob:
expand_ratio = random.uniform(1, settings._expand_max_ratio) if _expand_max_ratio - 1 >= 0.01:
if expand_ratio - 1 >= 0.01: expand_ratio = random.uniform(1, settings._expand_max_ratio)
height = int(img_height * expand_ratio) height = int(img_height * expand_ratio)
width = int(img_width * expand_ratio) width = int(img_width * expand_ratio)
h_off = math.floor(random.uniform(0, height - img_height)) h_off = math.floor(random.uniform(0, height - img_height))
...@@ -231,5 +230,5 @@ def expand_image(img, bbox_labels, img_width, img_height, settings): ...@@ -231,5 +230,5 @@ def expand_image(img, bbox_labels, img_width, img_height, settings):
expand_img = Image.fromarray(expand_img) expand_img = Image.fromarray(expand_img)
expand_img.paste(img, (int(w_off), int(h_off))) expand_img.paste(img, (int(w_off), int(h_off)))
bbox_labels = transform_labels(bbox_labels, expand_bbox) bbox_labels = transform_labels(bbox_labels, expand_bbox)
return expand_img, bbox_labels return expand_img, bbox_labels, width, height
return img, bbox_labels return img, bbox_labels, img_width, img_height
import paddle.v2 as paddle
import paddle.fluid as fluid
import numpy as np
# From npy
def load_vars():
vars = {}
name_map = {}
with open('./ssd_mobilenet_v1_coco/names.map', 'r') as map_file:
for param in map_file:
fd_name, tf_name = param.strip().split('\t')
name_map[fd_name] = tf_name
tf_vars = np.load(
'./ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco_2017_11_17.npy').item()
for fd_name in name_map:
tf_name = name_map[fd_name]
tf_var = tf_vars[tf_name]
if len(tf_var.shape) == 4 and 'depthwise' in tf_name:
vars[fd_name] = np.transpose(tf_var, (2, 3, 0, 1))
elif len(tf_var.shape) == 4:
vars[fd_name] = np.transpose(tf_var, (3, 2, 0, 1))
else:
vars[fd_name] = tf_var
return vars
def load_and_set_vars(place):
vars = load_vars()
for k, v in vars.items():
t = fluid.global_scope().find_var(k).get_tensor()
#print(np.array(t).shape, v.shape, k)
assert np.array(t).shape == v.shape
t.set(v, place)
# From Paddle V1
def load_paddlev1_vars(place):
vars = {}
name_map = {}
with open('./caffe2paddle/names.map', 'r') as map_file:
for param in map_file:
fd_name, tf_name = param.strip().split('\t')
name_map[fd_name] = tf_name
from operator import mul
def load(file_name, shape):
with open(file_name, 'rb') as f:
f.read(16)
arr = np.fromfile(f, dtype=np.float32)
#print(arr.size, reduce(mul, shape), file_name)
assert arr.size == reduce(mul, shape)
return arr.reshape(shape)
for fd_name in name_map:
v1_name = name_map[fd_name]
t = fluid.global_scope().find_var(fd_name).get_tensor()
shape = np.array(t).shape
v1_var = load('./caffe2paddle/' + v1_name, shape)
t.set(v1_var, place)
if __name__ == "__main__":
load_vars()
...@@ -27,12 +27,7 @@ def conv_bn(input, ...@@ -27,12 +27,7 @@ def conv_bn(input,
bias_attr=False) bias_attr=False)
parameter_attr = ParamAttr(learning_rate=0.1, initializer=MSRA()) parameter_attr = ParamAttr(learning_rate=0.1, initializer=MSRA())
bias_attr = ParamAttr(learning_rate=0.2) bias_attr = ParamAttr(learning_rate=0.2)
return fluid.layers.batch_norm( return fluid.layers.batch_norm(input=conv, act=act)
input=conv,
act=act,
epsilon=0.00001,
param_attr=parameter_attr,
bias_attr=bias_attr)
def depthwise_separable(input, num_filters1, num_filters2, num_groups, stride, def depthwise_separable(input, num_filters1, num_filters2, num_groups, stride,
...@@ -76,7 +71,7 @@ def extra_block(input, num_filters1, num_filters2, num_groups, stride, scale): ...@@ -76,7 +71,7 @@ def extra_block(input, num_filters1, num_filters2, num_groups, stride, scale):
return normal_conv return normal_conv
def mobile_net(img, img_shape, scale=1.0): def mobile_net(num_classes, img, img_shape, scale=1.0):
# 300x300 # 300x300
tmp = conv_bn(img, 3, int(32 * scale), 2, 1, 3) tmp = conv_bn(img, 3, int(32 * scale), 2, 1, 3)
# 150x150 # 150x150
...@@ -104,10 +99,11 @@ def mobile_net(img, img_shape, scale=1.0): ...@@ -104,10 +99,11 @@ def mobile_net(img, img_shape, scale=1.0):
module16 = extra_block(module15, 128, 256, 1, 2, scale) module16 = extra_block(module15, 128, 256, 1, 2, scale)
# 2x2 # 2x2
module17 = extra_block(module16, 64, 128, 1, 2, scale) module17 = extra_block(module16, 64, 128, 1, 2, scale)
mbox_locs, mbox_confs, box, box_var = fluid.layers.multi_box_head( mbox_locs, mbox_confs, box, box_var = fluid.layers.multi_box_head(
inputs=[module11, module13, module14, module15, module16, module17], inputs=[module11, module13, module14, module15, module16, module17],
image=img, image=img,
num_classes=21, num_classes=num_classes,
min_ratio=20, min_ratio=20,
max_ratio=90, max_ratio=90,
min_sizes=[60.0, 105.0, 150.0, 195.0, 240.0, 285.0], min_sizes=[60.0, 105.0, 150.0, 195.0, 240.0, 285.0],
......
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd "$DIR"
# Download the data.
echo "Downloading..."
wget http://paddlemodels.bj.bcebos.com/ssd_mobilenet_v1_coco.tar.gz
echo "Extractint..."
tar -xf ssd_mobilenet_v1_coco.tar.gz
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd "$DIR"
# Download the data.
echo "Downloading..."
wget http://paddlemodels.bj.bcebos.com/mobilenet_v1_imagenet.tar.gz
echo "Extractint..."
tar -xf mobilenet_v1_imagenet.tar.gz
...@@ -16,19 +16,25 @@ import image_util ...@@ -16,19 +16,25 @@ import image_util
from paddle.utils.image_util import * from paddle.utils.image_util import *
import random import random
from PIL import Image from PIL import Image
from PIL import ImageDraw
import numpy as np import numpy as np
import xml.etree.ElementTree import xml.etree.ElementTree
import os import os
import time
import copy
class Settings(object): class Settings(object):
def __init__(self, data_dir, label_file, resize_h, resize_w, mean_value, def __init__(self, dataset, toy, data_dir, label_file, resize_h, resize_w,
apply_distort, apply_expand): mean_value, apply_distort, apply_expand):
self._dataset = dataset
self._toy = toy
self._data_dir = data_dir self._data_dir = data_dir
self._label_list = [] if dataset == "pascalvoc":
label_fpath = os.path.join(data_dir, label_file) self._label_list = []
for line in open(label_fpath): label_fpath = os.path.join(data_dir, label_file)
self._label_list.append(line.strip()) for line in open(label_fpath):
self._label_list.append(line.strip())
self._apply_distort = apply_distort self._apply_distort = apply_distort
self._apply_expand = apply_expand self._apply_expand = apply_expand
...@@ -47,6 +53,14 @@ class Settings(object): ...@@ -47,6 +53,14 @@ class Settings(object):
self._brightness_prob = 0.5 self._brightness_prob = 0.5
self._brightness_delta = 0.125 self._brightness_delta = 0.125
@property
def dataset(self):
return self._dataset
@property
def toy(self):
return self._toy
@property @property
def apply_distort(self): def apply_distort(self):
return self._apply_expand return self._apply_expand
...@@ -59,6 +73,10 @@ class Settings(object): ...@@ -59,6 +73,10 @@ class Settings(object):
def data_dir(self): def data_dir(self):
return self._data_dir return self._data_dir
@data_dir.setter
def data_dir(self, data_dir):
self._data_dir = data_dir
@property @property
def label_list(self): def label_list(self):
return self._label_list return self._label_list
...@@ -78,23 +96,76 @@ class Settings(object): ...@@ -78,23 +96,76 @@ class Settings(object):
def _reader_creator(settings, file_list, mode, shuffle): def _reader_creator(settings, file_list, mode, shuffle):
def reader(): def reader():
with open(file_list) as flist: if settings.dataset == 'coco':
lines = [line.strip() for line in flist] # cocoapi
if shuffle: from pycocotools.coco import COCO
random.shuffle(lines) from pycocotools.cocoeval import COCOeval
for line in lines:
coco = COCO(file_list)
image_ids = coco.getImgIds()
images = coco.loadImgs(image_ids)
category_ids = coco.getCatIds()
category_names = [
item['name'] for item in coco.loadCats(category_ids)
]
elif settings.dataset == 'pascalvoc':
flist = open(file_list)
images = [line.strip() for line in flist]
if not settings.toy == 0:
images = images[:settings.toy] if len(
images) > settings.toy else images
print("{} on {} with {} images".format(mode, settings.dataset,
len(images)))
if shuffle:
random.shuffle(images)
for image in images:
if settings.dataset == 'coco':
image_name = image['file_name']
image_path = os.path.join(settings.data_dir, image_name)
elif settings.dataset == 'pascalvoc':
if mode == 'train' or mode == 'test': if mode == 'train' or mode == 'test':
img_path, label_path = line.split() image_path, label_path = image.split()
img_path = os.path.join(settings.data_dir, img_path) image_path = os.path.join(settings.data_dir, image_path)
label_path = os.path.join(settings.data_dir, label_path) label_path = os.path.join(settings.data_dir, label_path)
elif mode == 'infer': elif mode == 'infer':
img_path = os.path.join(settings.data_dir, line) image_path = os.path.join(settings.data_dir, image)
img = Image.open(img_path) img = Image.open(image_path)
img_width, img_height = img.size if img.mode == 'L':
img = img.convert('RGB')
img_width, img_height = img.size
# layout: label | xmin | ymin | xmax | ymax | difficult if mode == 'train' or mode == 'test':
if mode == 'train' or mode == 'test': if settings.dataset == 'coco':
# layout: category_id | xmin | ymin | xmax | ymax | iscrowd | origin_coco_bbox | segmentation | area | image_id | annotation_id
bbox_labels = []
annIds = coco.getAnnIds(imgIds=image['id'])
anns = coco.loadAnns(annIds)
for ann in anns:
bbox_sample = []
# start from 1, leave 0 to background
bbox_sample.append(
float(category_ids.index(ann['category_id'])) + 1)
bbox = ann['bbox']
xmin, ymin, w, h = bbox
xmax = xmin + w
ymax = ymin + h
bbox_sample.append(float(xmin) / img_width)
bbox_sample.append(float(ymin) / img_height)
bbox_sample.append(float(xmax) / img_width)
bbox_sample.append(float(ymax) / img_height)
bbox_sample.append(float(ann['iscrowd']))
#bbox_sample.append(ann['bbox'])
#bbox_sample.append(ann['segmentation'])
#bbox_sample.append(ann['area'])
#bbox_sample.append(ann['image_id'])
#bbox_sample.append(ann['id'])
bbox_labels.append(bbox_sample)
elif settings.dataset == 'pascalvoc':
# layout: label | xmin | ymin | xmax | ymax | difficult
bbox_labels = [] bbox_labels = []
root = xml.etree.ElementTree.parse(label_path).getroot() root = xml.etree.ElementTree.parse(label_path).getroot()
for object in root.findall('object'): for object in root.findall('object'):
...@@ -117,91 +188,136 @@ def _reader_creator(settings, file_list, mode, shuffle): ...@@ -117,91 +188,136 @@ def _reader_creator(settings, file_list, mode, shuffle):
bbox_sample.append(difficult) bbox_sample.append(difficult)
bbox_labels.append(bbox_sample) bbox_labels.append(bbox_sample)
sample_labels = bbox_labels sample_labels = bbox_labels
if mode == 'train':
if settings._apply_distort:
img = image_util.distort_image(img, settings)
if settings._apply_expand:
img, bbox_labels = image_util.expand_image(
img, bbox_labels, img_width, img_height,
settings)
batch_sampler = []
# hard-code here
batch_sampler.append(
image_util.sampler(1, 1, 1.0, 1.0, 1.0, 1.0, 0.0,
0.0))
batch_sampler.append(
image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.1,
0.0))
batch_sampler.append(
image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.3,
0.0))
batch_sampler.append(
image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.5,
0.0))
batch_sampler.append(
image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.7,
0.0))
batch_sampler.append(
image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.9,
0.0))
batch_sampler.append(
image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.0,
1.0))
""" random crop """
sampled_bbox = image_util.generate_batch_samples(
batch_sampler, bbox_labels, img_width, img_height)
img = np.array(img)
if len(sampled_bbox) > 0:
idx = int(random.uniform(0, len(sampled_bbox)))
img, sample_labels = image_util.crop_image(
img, bbox_labels, sampled_bbox[idx], img_width,
img_height)
img = Image.fromarray(img)
img = img.resize((settings.resize_w, settings.resize_h),
Image.ANTIALIAS)
img = np.array(img)
if mode == 'train': if mode == 'train':
mirror = int(random.uniform(0, 2)) if settings._apply_distort:
if mirror == 1: img = image_util.distort_image(img, settings)
img = img[:, ::-1, :] if settings._apply_expand:
for i in xrange(len(sample_labels)): img, bbox_labels, img_width, img_height = image_util.expand_image(
tmp = sample_labels[i][1] img, bbox_labels, img_width, img_height, settings)
sample_labels[i][1] = 1 - sample_labels[i][3] batch_sampler = []
sample_labels[i][3] = 1 - tmp # hard-code here
batch_sampler.append(
if len(img.shape) == 3: image_util.sampler(1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0))
img = np.swapaxes(img, 1, 2) batch_sampler.append(
img = np.swapaxes(img, 1, 0) image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0))
batch_sampler.append(
img = img[[2, 1, 0], :, :] image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0))
img = img.astype('float32') batch_sampler.append(
img -= settings.img_mean image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0))
img = img.flatten() batch_sampler.append(
img = img * 0.007843 image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0))
batch_sampler.append(
sample_labels = np.array(sample_labels) image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0))
if mode == 'train' or mode == 'test': batch_sampler.append(
if mode == 'train' and len(sample_labels) == 0: continue image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0))
yield img.astype( """ random crop """
'float32' sampled_bbox = image_util.generate_batch_samples(
), sample_labels[:, 1:5], sample_labels[:, 0].astype( batch_sampler, bbox_labels, img_width, img_height)
'int32'), sample_labels[:, -1].astype('int32')
elif mode == 'infer': img = np.array(img)
yield img.astype('float32') if len(sampled_bbox) > 0:
idx = int(random.uniform(0, len(sampled_bbox)))
img, sample_labels = image_util.crop_image(
img, bbox_labels, sampled_bbox[idx], img_width,
img_height)
img = Image.fromarray(img)
img = img.resize((settings.resize_w, settings.resize_h),
Image.ANTIALIAS)
img = np.array(img)
if mode == 'train':
mirror = int(random.uniform(0, 2))
if mirror == 1:
img = img[:, ::-1, :]
for i in xrange(len(sample_labels)):
tmp = sample_labels[i][1]
sample_labels[i][1] = 1 - sample_labels[i][3]
sample_labels[i][3] = 1 - tmp
# HWC to CHW
if len(img.shape) == 3:
img = np.swapaxes(img, 1, 2)
img = np.swapaxes(img, 1, 0)
# RBG to BGR
img = img[[2, 1, 0], :, :]
img = img.astype('float32')
img -= settings.img_mean
img = img.flatten()
img = img * 0.007843
sample_labels = np.array(sample_labels)
if mode == 'train' or mode == 'test':
if mode == 'train' and len(sample_labels) == 0: continue
if mode == 'test' and len(sample_labels) == 0: continue
yield img.astype(
'float32'
), sample_labels[:, 1:5], sample_labels[:, 0].astype(
'int32'), sample_labels[:, -1].astype('int32')
elif mode == 'infer':
yield img.astype('float32')
return reader return reader
def draw_bounding_box_on_image(image,
sample_labels,
image_name,
category_names,
color='red',
thickness=4,
with_text=True,
normalized=True):
image = Image.fromarray(image)
draw = ImageDraw.Draw(image)
im_width, im_height = image.size
if not normalized:
im_width, im_height = 1, 1
for item in sample_labels:
label = item[0]
category_name = category_names[int(label)]
bbox = item[1:5]
xmin, ymin, xmax, ymax = bbox
(left, right, top, bottom) = (xmin * im_width, xmax * im_width,
ymin * im_height, ymax * im_height)
draw.line(
[(left, top), (left, bottom), (right, bottom), (right, top),
(left, top)],
width=thickness,
fill=color)
if with_text:
if image.mode == 'RGB':
draw.text((left, top), category_name, (255, 255, 0))
image.save(image_name)
def train(settings, file_list, shuffle=True): def train(settings, file_list, shuffle=True):
return _reader_creator(settings, file_list, 'train', shuffle) file_list = os.path.join(settings.data_dir, file_list)
if settings.dataset == 'coco':
train_settings = copy.copy(settings)
if '2014' in file_list:
sub_dir = "train2014"
elif '2017' in file_list:
sub_dir = "train2017"
train_settings.data_dir = os.path.join(settings.data_dir, sub_dir)
return _reader_creator(train_settings, file_list, 'train', shuffle)
elif settings.dataset == 'pascalvoc':
return _reader_creator(settings, file_list, 'train', shuffle)
def test(settings, file_list): def test(settings, file_list):
return _reader_creator(settings, file_list, 'test', False) file_list = os.path.join(settings.data_dir, file_list)
if settings.dataset == 'coco':
test_settings = copy.copy(settings)
if '2014' in file_list:
sub_dir = "val2014"
elif '2017' in file_list:
sub_dir = "val2017"
test_settings.data_dir = os.path.join(settings.data_dir, sub_dir)
return _reader_creator(test_settings, file_list, 'test', False)
elif settings.dataset == 'pascalvoc':
return _reader_creator(settings, file_list, 'test', False)
def infer(settings, file_list): def infer(settings, file_list):
......
import paddle.v2 as paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import reader import reader
import load_model as load_model import load_model as load_model
from mobilenet_ssd import mobile_net from mobilenet_ssd import mobile_net
from utility import add_arguments, print_arguments from utility import add_arguments, print_arguments
import os import os
import time
import numpy as np import numpy as np
import argparse import argparse
import functools import functools
...@@ -12,22 +13,40 @@ import functools ...@@ -12,22 +13,40 @@ import functools
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser) add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable # yapf: disable
add_arg('batch_size', int, 32, "Minibatch size.") add_arg('learning_rate', float, 0.001, "Learning rate.")
add_arg('parallel', bool, True, "Whether use parallel training.") add_arg('batch_size', int, 32, "Minibatch size.")
add_arg('use_gpu', bool, True, "Whether use GPU.") add_arg('num_passes', int, 25, "Epoch number.")
add_arg('parallel', bool, True, "Whether use parallel training.")
add_arg('use_gpu', bool, True, "Whether use GPU.")
add_arg('use_nccl', bool, False, "Whether use NCCL.")
add_arg('dataset', str, 'pascalvoc', "coco or pascalvoc.")
add_arg('model_save_dir', str, 'model', "The path to save model.")
add_arg('pretrained_model', str, 'pretrained/ssd_mobilenet_v1_coco/', "The init model path.")
add_arg('apply_distort', bool, True, "Whether apply distort")
add_arg('apply_expand', bool, False, "Whether appley expand")
add_arg('resize_h', int, 300, "resize image size")
add_arg('resize_w', int, 300, "resize image size")
add_arg('mean_value_B', float, 127.5, "mean value which will be subtracted") #123.68
add_arg('mean_value_G', float, 127.5, "mean value which will be subtracted") #116.78
add_arg('mean_value_R', float, 127.5, "mean value which will be subtracted") #103.94
add_arg('is_toy', int, 0, "Toy for quick debug, 0 means using all data, while n means using only n sample")
# yapf: disable # yapf: disable
def train(args, def parallel_do(args,
train_file_list, train_file_list,
val_file_list, val_file_list,
data_args, data_args,
learning_rate, learning_rate,
batch_size, batch_size,
num_passes, num_passes,
model_save_dir='model', model_save_dir,
init_model_path=None): pretrained_model=None):
image_shape = [3, data_args.resize_h, data_args.resize_w] image_shape = [3, data_args.resize_h, data_args.resize_w]
if data_args.dataset == 'coco':
num_classes = 81
elif data_args.dataset == 'pascalvoc':
num_classes = 21
image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
gt_box = fluid.layers.data( gt_box = fluid.layers.data(
...@@ -39,15 +58,16 @@ def train(args, ...@@ -39,15 +58,16 @@ def train(args,
if args.parallel: if args.parallel:
places = fluid.layers.get_places() places = fluid.layers.get_places()
pd = fluid.layers.ParallelDo(places) pd = fluid.layers.ParallelDo(places, use_nccl=args.use_nccl)
with pd.do(): with pd.do():
image_ = pd.read_input(image) image_ = pd.read_input(image)
gt_box_ = pd.read_input(gt_box) gt_box_ = pd.read_input(gt_box)
gt_label_ = pd.read_input(gt_label) gt_label_ = pd.read_input(gt_label)
difficult_ = pd.read_input(difficult) difficult_ = pd.read_input(difficult)
locs, confs, box, box_var = mobile_net(image_, image_shape) locs, confs, box, box_var = mobile_net(num_classes, image_,
loss = fluid.layers.ssd_loss(locs, confs, gt_box_, gt_label_, image_shape)
box, box_var) loss = fluid.layers.ssd_loss(locs, confs, gt_box_, gt_label_, box,
box_var)
nmsed_out = fluid.layers.detection_output( nmsed_out = fluid.layers.detection_output(
locs, confs, box, box_var, nms_threshold=0.45) locs, confs, box, box_var, nms_threshold=0.45)
loss = fluid.layers.reduce_sum(loss) loss = fluid.layers.reduce_sum(loss)
...@@ -57,11 +77,11 @@ def train(args, ...@@ -57,11 +77,11 @@ def train(args,
loss, nmsed_out = pd() loss, nmsed_out = pd()
loss = fluid.layers.mean(loss) loss = fluid.layers.mean(loss)
else: else:
locs, confs, box, box_var = mobile_net(image, image_shape) locs, confs, box, box_var = mobile_net(num_classes, image, image_shape)
nmsed_out = fluid.layers.detection_output( nmsed_out = fluid.layers.detection_output(
locs, confs, box, box_var, nms_threshold=0.45) locs, confs, box, box_var, nms_threshold=0.45)
loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box,
box, box_var) box_var)
loss = fluid.layers.reduce_sum(loss) loss = fluid.layers.reduce_sum(loss)
test_program = fluid.default_main_program().clone(for_test=True) test_program = fluid.default_main_program().clone(for_test=True)
...@@ -71,13 +91,20 @@ def train(args, ...@@ -71,13 +91,20 @@ def train(args,
gt_label, gt_label,
gt_box, gt_box,
difficult, difficult,
21, num_classes,
overlap_threshold=0.5, overlap_threshold=0.5,
evaluate_difficult=False, evaluate_difficult=False,
ap_version='11point') ap_version='integral')
boundaries = [40000, 60000] if data_args.dataset == 'coco':
values = [0.001, 0.0005, 0.00025] # learning rate decay in 12, 19 pass, respectively
if '2014' in train_file_list:
boundaries = [82783 / batch_size * 12, 82783 / batch_size * 19]
elif '2017' in train_file_list:
boundaries = [118287 / batch_size * 12, 118287 / batch_size * 19]
elif data_args.dataset == 'pascalvoc':
boundaries = [40000, 60000]
values = [learning_rate, learning_rate * 0.5, learning_rate * 0.25]
optimizer = fluid.optimizer.RMSProp( optimizer = fluid.optimizer.RMSProp(
learning_rate=fluid.layers.piecewise_decay(boundaries, values), learning_rate=fluid.layers.piecewise_decay(boundaries, values),
regularization=fluid.regularizer.L2Decay(0.00005), ) regularization=fluid.regularizer.L2Decay(0.00005), )
...@@ -88,8 +115,11 @@ def train(args, ...@@ -88,8 +115,11 @@ def train(args,
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
load_model.load_and_set_vars(place) if pretrained_model:
#load_model.load_paddlev1_vars(place) def if_exist(var):
return os.path.exists(os.path.join(pretrained_model, var.name))
fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
train_reader = paddle.batch( train_reader = paddle.batch(
reader.train(data_args, train_file_list), batch_size=batch_size) reader.train(data_args, train_file_list), batch_size=batch_size)
test_reader = paddle.batch( test_reader = paddle.batch(
...@@ -108,37 +138,167 @@ def train(args, ...@@ -108,37 +138,167 @@ def train(args,
print("Test {0}, map {1}".format(pass_id, test_map[0])) print("Test {0}, map {1}".format(pass_id, test_map[0]))
for pass_id in range(num_passes): for pass_id in range(num_passes):
start_time = time.time()
prev_start_time = start_time
end_time = 0
for batch_id, data in enumerate(train_reader()): for batch_id, data in enumerate(train_reader()):
prev_start_time = start_time
start_time = time.time()
loss_v = exe.run(fluid.default_main_program(), loss_v = exe.run(fluid.default_main_program(),
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[loss]) fetch_list=[loss])
end_time = time.time()
if batch_id % 20 == 0: if batch_id % 20 == 0:
print("Pass {0}, batch {1}, loss {2}" print("Pass {0}, batch {1}, loss {2}, time {3}".format(
.format(pass_id, batch_id, loss_v[0])) pass_id, batch_id, loss_v[0], start_time - prev_start_time))
test(pass_id) test(pass_id)
if pass_id % 10 == 0: if pass_id % 10 == 0 or pass_id == num_passes - 1:
model_path = os.path.join(model_save_dir, str(pass_id)) model_path = os.path.join(model_save_dir, str(pass_id))
print 'save models to %s' % (model_path) print 'save models to %s' % (model_path)
fluid.io.save_inference_model(model_path, ['image'], [nmsed_out], fluid.io.save_persistables(exe, model_path)
exe)
def parallel_exe(args,
train_file_list,
val_file_list,
data_args,
learning_rate,
batch_size,
num_passes,
model_save_dir='model',
pretrained_model=None):
image_shape = [3, data_args.resize_h, data_args.resize_w]
if data_args.dataset == 'coco':
num_classes = 81
elif data_args.dataset == 'pascalvoc':
num_classes = 21
image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
gt_box = fluid.layers.data(
name='gt_box', shape=[4], dtype='float32', lod_level=1)
gt_label = fluid.layers.data(
name='gt_label', shape=[1], dtype='int32', lod_level=1)
difficult = fluid.layers.data(
name='gt_difficult', shape=[1], dtype='int32', lod_level=1)
locs, confs, box, box_var = mobile_net(num_classes, image, image_shape)
nmsed_out = fluid.layers.detection_output(
locs, confs, box, box_var, nms_threshold=0.45)
loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box,
box_var)
loss = fluid.layers.reduce_sum(loss)
test_program = fluid.default_main_program().clone(for_test=True)
with fluid.program_guard(test_program):
map_eval = fluid.evaluator.DetectionMAP(
nmsed_out,
gt_label,
gt_box,
difficult,
num_classes,
overlap_threshold=0.5,
evaluate_difficult=False,
ap_version='integral')
if data_args.dataset == 'coco':
# learning rate decay in 12, 19 pass, respectively
if '2014' in train_file_list:
boundaries = [82783 / batch_size * 12, 82783 / batch_size * 19]
elif '2017' in train_file_list:
boundaries = [118287 / batch_size * 12, 118287 / batch_size * 19]
elif data_args.dataset == 'pascalvoc':
boundaries = [40000, 60000]
values = [learning_rate, learning_rate * 0.5, learning_rate * 0.25]
optimizer = fluid.optimizer.RMSProp(
learning_rate=fluid.layers.piecewise_decay(boundaries, values),
regularization=fluid.regularizer.L2Decay(0.00005), )
optimizer.minimize(loss)
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
if pretrained_model:
def if_exist(var):
return os.path.exists(os.path.join(pretrained_model, var.name))
fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
train_exe = fluid.ParallelExecutor(use_cuda=args.use_gpu,
loss_name=loss.name)
train_reader = paddle.batch(
reader.train(data_args, train_file_list), batch_size=batch_size)
test_reader = paddle.batch(
reader.test(data_args, val_file_list), batch_size=batch_size)
feeder = fluid.DataFeeder(
place=place, feed_list=[image, gt_box, gt_label, difficult])
def test(pass_id):
_, accum_map = map_eval.get_map_var()
map_eval.reset(exe)
test_map = None
for _, data in enumerate(test_reader()):
test_map = exe.run(test_program,
feed=feeder.feed(data),
fetch_list=[accum_map])
print("Test {0}, map {1}".format(pass_id, test_map[0]))
for pass_id in range(num_passes):
start_time = time.time()
prev_start_time = start_time
end_time = 0
test(pass_id)
for batch_id, data in enumerate(train_reader()):
prev_start_time = start_time
start_time = time.time()
loss_v, = train_exe.run(fetch_list=[loss.name],
feed_dict=feeder.feed(data))
end_time = time.time()
loss_v = np.mean(np.array(loss_v))
if batch_id % 20 == 0:
print("Pass {0}, batch {1}, loss {2}, time {3}".format(
pass_id, batch_id, loss_v, start_time - prev_start_time))
if pass_id % 10 == 0 or pass_id == num_passes - 1:
model_path = os.path.join(model_save_dir, str(pass_id))
print 'save models to %s' % (model_path)
fluid.io.save_persistables(exe, model_path)
if __name__ == '__main__': if __name__ == '__main__':
args = parser.parse_args() args = parser.parse_args()
print_arguments(args) print_arguments(args)
data_dir = 'data/pascalvoc'
train_file_list = 'trainval.txt'
val_file_list = 'test.txt'
label_file = 'label_list'
model_save_dir = args.model_save_dir
if args.dataset == 'coco':
data_dir = './data/COCO17'
train_file_list = 'annotations/instances_train2017.json'
val_file_list = 'annotations/instances_val2017.json'
label_file = 'label_list'
data_args = reader.Settings( data_args = reader.Settings(
data_dir='./data', dataset=args.dataset,
label_file='label_list', toy=args.is_toy,
apply_distort=True, data_dir=data_dir,
apply_expand=True, label_file=label_file,
resize_h=300, apply_distort=args.apply_distort,
resize_w=300, apply_expand=args.apply_expand,
mean_value=[127.5, 127.5, 127.5]) resize_h=args.resize_h,
train(args, resize_w=args.resize_w,
train_file_list='./data/trainval.txt', mean_value=[args.mean_value_B, args.mean_value_G, args.mean_value_R])
val_file_list='./data/test.txt', #method = parallel_do
data_args=data_args, method = parallel_exe
learning_rate=0.001, method(args,
batch_size=args.batch_size, train_file_list=train_file_list,
num_passes=300) val_file_list=val_file_list,
data_args=data_args,
learning_rate=args.learning_rate,
batch_size=args.batch_size,
num_passes=args.num_passes,
model_save_dir=model_save_dir,
pretrained_model=args.pretrained_model)
...@@ -30,32 +30,28 @@ class PolicyGradient: ...@@ -30,32 +30,28 @@ class PolicyGradient:
acts = fluid.layers.data(name='acts', shape=[1], dtype='int64') acts = fluid.layers.data(name='acts', shape=[1], dtype='int64')
vt = fluid.layers.data(name='vt', shape=[1], dtype='float32') vt = fluid.layers.data(name='vt', shape=[1], dtype='float32')
# fc1 # fc1
fc1 = fluid.layers.fc( fc1 = fluid.layers.fc(input=obs, size=10, act="tanh") # tanh activation
input=obs,
size=10,
act="tanh" # tanh activation
)
# fc2 # fc2
self.all_act_prob = fluid.layers.fc(input=fc1, all_act_prob = fluid.layers.fc(input=fc1,
size=self.n_actions, size=self.n_actions,
act="softmax") act="softmax")
self.inferece_program = fluid.defaul_main_program().clone()
# to maximize total reward (log_p * R) is to minimize -(log_p * R) # to maximize total reward (log_p * R) is to minimize -(log_p * R)
neg_log_prob = fluid.layers.cross_entropy( neg_log_prob = fluid.layers.cross_entropy(
input=self.all_act_prob, input=self.all_act_prob,
label=acts) # this is negative log of chosen action label=acts) # this is negative log of chosen action
neg_log_prob_weight = fluid.layers.elementwise_mul(x=neg_log_prob, y=vt) neg_log_prob_weight = fluid.layers.elementwise_mul(x=neg_log_prob, y=vt)
loss = fluid.layers.reduce_mean( loss = fluid.layers.reduce_mean(
x=neg_log_prob_weight) # reward guided loss neg_log_prob_weight) # reward guided loss
sgd_optimizer = fluid.optimizer.SGD(self.lr) sgd_optimizer = fluid.optimizer.SGD(self.lr)
sgd_optimizer.minimize(loss) sgd_optimizer.minimize(loss)
self.exe.run(fluid.default_startup_program()) self.exe.run(fluid.default_startup_program())
def choose_action(self, observation): def choose_action(self, observation):
prob_weights = self.exe.run( prob_weights = self.exe.run(self.inferece_program,
fluid.default_main_program().prune(self.all_act_prob), feed={"obs": observation[np.newaxis, :]},
feed={"obs": observation[np.newaxis, :]}, fetch_list=[self.all_act_prob])
fetch_list=[self.all_act_prob])
prob_weights = np.array(prob_weights[0]) prob_weights = np.array(prob_weights[0])
action = np.random.choice( action = np.random.choice(
range(prob_weights.shape[1]), range(prob_weights.shape[1]),
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册