diff --git a/.gitignore b/.gitignore index dde3895fc112ad34a839b2fed9210ac2288a959b..9492cff0cb9500079955856eedac883e39b522a8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .DS_Store *.pyc +.*~ diff --git a/.travis.yml b/.travis.yml index 7fa098a7c5d8ecdc5d5ea38fe38c3ddae959498a..ecc348e1482fac430f9d98990b8940ab57b2b75b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,7 +17,7 @@ addons: - python-pip - python2.7-dev - clang-format-3.8 - ssh_known_hosts: 52.76.173.135 + ssh_known_hosts: 13.229.163.131 before_install: - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi - sudo pip install -U virtualenv pre-commit pip diff --git a/fluid/DeepASR/tools/profile.py b/fluid/DeepASR/tools/profile.py index 69aee88e22d33ed80212692bf61e41e1666bf5e5..8d720c16cd0ec6a9d4bb533a878b07973ced7176 100644 --- a/fluid/DeepASR/tools/profile.py +++ b/fluid/DeepASR/tools/profile.py @@ -168,7 +168,7 @@ def profile(args): start_time = time.time() frames_seen = 0 # load_data - (features, labels, lod) = batch_data + (features, labels, lod, _) = batch_data feature_t.set(features, place) feature_t.set_lod([lod]) label_t.set(labels, place) diff --git a/fluid/DeepASR/train.py b/fluid/DeepASR/train.py index 3908a550cdcf095057ea6ab0b89e07dcecda51f9..be99998c8aa7f88d49dab711e94dcd7cfef042d6 100644 --- a/fluid/DeepASR/train.py +++ b/fluid/DeepASR/train.py @@ -192,7 +192,7 @@ def train(args): test_data_reader.batch_iterator(args.batch_size, args.minimum_batch_size)): # load_data - (features, labels, lod) = batch_data + (features, labels, lod, _) = batch_data feature_t.set(features, place) feature_t.set_lod([lod]) label_t.set(labels, place) diff --git a/fluid/image_classification/caffe2fluid/README.md b/fluid/image_classification/caffe2fluid/README.md index 5f565afe0c33db291092faeac632da3d51f95613..6aba34b9cafbd87b3474575fcbcee65819769c2f 100644 --- a/fluid/image_classification/caffe2fluid/README.md +++ b/fluid/image_classification/caffe2fluid/README.md @@ -18,19 +18,19 @@ This tool is used to convert a Caffe model to Fluid model ### Tested models -- Lenet on mnist dataset +- Lenet - ResNets:(ResNet-50, ResNet-101, ResNet-152) - model addr: `https://onedrive.live.com/?authkey=%21AAFW2-FVoxeVRck&id=4006CBB8476FF777%2117887&cid=4006CBB8476FF777`_ +[model addr](https://onedrive.live.com/?authkey=%21AAFW2-FVoxeVRck&id=4006CBB8476FF777%2117887&cid=4006CBB8476FF777) - GoogleNet: - model addr: `https://gist.github.com/jimmie33/7ea9f8ac0da259866b854460f4526034`_ +[model addr](https://gist.github.com/jimmie33/7ea9f8ac0da259866b854460f4526034) - VGG: - model addr: `https://gist.github.com/ksimonyan/211839e770f7b538e2d8`_ +[model addr](https://gist.github.com/ksimonyan/211839e770f7b538e2d8) - AlexNet: - model addr: `https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet`_ +[model addr](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet) ### Notes Some of this code come from here: https://github.com/ethereon/caffe-tensorflow diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py b/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py new file mode 100644 index 0000000000000000000000000000000000000000..07d4ed1af50a803aee206da6c7582d079a1a1dca --- /dev/null +++ b/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py @@ -0,0 +1,85 @@ +#!/usr/bin/python + +# +#a tool to compare tensors in two files or two directories +# + +import sys +import os + + +def walk_dir(rootdir): + for subdir, dirs, files in os.walk(rootdir): + for file in files: + yield file + + +def calc_diff(f1, f2): + import numpy as np + + d1 = np.load(f1).flatten() + d2 = np.load(f2).flatten() + + d1_num = reduce(lambda x, y: x * y, d1.shape) + d2_num = reduce(lambda x, y: x * y, d2.shape) + if d1_num != d2_num: + print d1.shape + print d2.shape + assert (d1_num == d2_num), "their shape is not consistent" + + try: + df = np.abs(d1 - d2) + max_df = np.max(df) + sq_df = np.mean(df * df) + return max_df, sq_df + except Exception as e: + return -1.0, -1.0 + + +def compare(path1, path2): + def diff(f1, f2): + max_df, sq_df = calc_diff(f1, f2) + print('compare %s <=> %s with result[max_df:%.4e, sq_df:%.4e]' % + (f1, f2, max_df, sq_df)) + assert (max_df < 1e-5), \ + 'max_df is too large with value[%.6e]' % (max_df) + assert (sq_df < 1e-10), \ + 'sq_df is too large with value[%.6e]' % (sq_df) + + if os.path.exists(path1) is False: + print('not found %s' % (path1)) + return 1 + elif os.path.exists(path2) is False: + print('not found %s' % (path2)) + return 1 + + if path1.find('.npy') > 0 and path2.find('.npy') > 0: + diff(path1, path2) + return + + for f in walk_dir(path2): + if f.find('.npy') < 0: + continue + + f1 = os.path.join(path1, f) + f2 = os.path.join(path2, f) + diff(f1, f2) + + print('all checking succeed to pass') + return 0 + + +if __name__ == "__main__": + if len(sys.argv) == 1: + path1 = 'lenet.tf/results' + path2 = 'lenet.paddle/results' + elif len(sys.argv) == 3: + path1 = sys.argv[1] + path2 = sys.argv[2] + else: + print('usage:') + print(' %s [path1] [path2]' % (sys.argv[0])) + exit(1) + + print('compare inner result in %s %s' % (path1, path2)) + exit(compare(path1, path2)) diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/diff.sh b/fluid/image_classification/caffe2fluid/examples/imagenet/diff.sh new file mode 100644 index 0000000000000000000000000000000000000000..af72caea536d6b6c3d1027e7d1327af52a6ceda6 --- /dev/null +++ b/fluid/image_classification/caffe2fluid/examples/imagenet/diff.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# +#function: +# a tool used to check the difference of models' results generated by caffe model and paddle model +# +#howto: +# bash diff.sh resnet50 #when this has been finished, you can get the difference in precision +# +#notes: +# 0, in order to infer using caffe, we need pycaffe installed +# 1, prepare your caffe model in 'models.caffe/', eg: 'model.caffe/resnet101/resnet101.[prototxt|caffemodel]' +# 2, converted paddle model will be in 'models' +# 3, results of layers will be stored in 'results/${model_name}.[paddle|caffe]' +# 4, only the last layer will be checked by default + +model_name="resnet50" +results_root="results/" + +if [[ -n $1 ]];then + if [ $1 = "-h" ];then + echo "usage:" + echo " bash $0 [model_name]" + echo " eg:bash $0 resnet50" + exit 0 + fi + model_name=$1 +fi + +mkdir -p $results_root + +model_prototxt="models.caffe/$model_name/${model_name}.prototxt" +model_caffemodel="models.caffe/${model_name}/${model_name}.caffemodel" + +#1, dump layers' results from paddle +paddle_results="$results_root/${model_name}.paddle" +rm -rf $paddle_results +rm -rf "results.paddle" +bash run.sh $model_name ./models.caffe/$model_name ./models/$model_name +if [[ $? -ne 0 ]] || [[ ! -e "results.paddle" ]];then + echo "not found paddle's results, maybe failed to convert" + exit 1 +fi +mv results.paddle $paddle_results + +#2, dump layers' results from caffe +caffe_results="$results_root/${model_name}.caffe" +rm -rf $caffe_results +rm -rf "results.caffe" +cfpython ./infer.py caffe $model_prototxt $model_caffemodel $paddle_results/data.npy +if [[ $? -ne 0 ]] || [[ ! -e "results.caffe" ]];then + echo "not found caffe's results, maybe failed to do inference with caffe" + exit 1 +fi +mv results.caffe $caffe_results + +#3, extract layer names +cat $model_prototxt | grep name | perl -ne 'if(/^\s*name:\s+\"([^\"]+)/){ print $1."\n";}' >.layer_names + +#4, compare one by one +for i in $(cat ".layer_names" | tail -n1);do + echo "process $i" + python compare.py $caffe_results/${i}.npy $paddle_results/${i}.npy +done diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py b/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py index ec594199be5a3e7a33c9673b1d5497c95f20d946..bb75caa9e7364465042c5c88f471e8f6f5137237 100644 --- a/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py +++ b/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py @@ -10,8 +10,11 @@ import os import sys import inspect import numpy as np -import paddle.v2 as paddle -import paddle.v2.fluid as fluid + + +def import_fluid(): + import paddle.fluid as fluid + return fluid def load_data(imgfile, shape): @@ -52,8 +55,10 @@ def build_model(net_file, net_name): print(e) return None - input_name = 'data' - input_shape = MyNet.input_shapes()[input_name] + fluid = import_fluid() + inputs_dict = MyNet.input_shapes() + input_name = inputs_dict.keys()[0] + input_shape = inputs_dict[input_name] images = fluid.layers.data(name='image', shape=input_shape, dtype='float32') #label = fluid.layers.data(name='label', shape=[1], dtype='int64') @@ -64,7 +69,7 @@ def build_model(net_file, net_name): def dump_results(results, names, root): if os.path.exists(root) is False: - os.path.mkdir(root) + os.mkdir(root) for i in range(len(names)): n = names[i] @@ -73,9 +78,12 @@ def dump_results(results, names, root): np.save(filename + '.npy', res) -def infer(net_file, net_name, model_file, imgfile, debug=False): +def infer(net_file, net_name, model_file, imgfile, debug=True): """ do inference using a model which consist 'xxx.py' and 'xxx.npy' """ + + fluid = import_fluid() + #1, build model net, input_shape = build_model(net_file, net_name) prediction = net.get_output() @@ -109,34 +117,79 @@ def infer(net_file, net_name, model_file, imgfile, debug=False): fetch_list=fetch_list_var) if debug is True: - dump_path = 'results.layers' + dump_path = 'results.paddle' dump_results(results, fetch_list_name, dump_path) - print('all results dumped to [%s]' % (dump_path)) + print('all result of layers dumped to [%s]' % (dump_path)) else: result = results[0] print('predicted class:', np.argmax(result)) + return 0 + + +def caffe_infer(prototxt, caffemodel, datafile): + """ do inference using pycaffe for debug, + all intermediate results will be dumpped to 'results.caffe' + """ + import caffe + + net = caffe.Net(prototxt, caffemodel, caffe.TEST) + input_layer = net.blobs.keys()[0] + print('got name of input layer is:%s' % (input_layer)) + input_shape = list(net.blobs[input_layer].data.shape[1:]) + + if '.npy' in datafile: + np_images = np.load(datafile) + else: + np_images = load_data(datafile, input_shape) + + inputs = {input_layer: np_images} + net.forward_all(**inputs) + + results = [] + names = [] + for k, v in net.blobs.items(): + k = k.rstrip('_output') + k = k.replace('/', '_') + names.append(k) + results.append(v.data.copy()) + + dump_path = 'results.caffe' + dump_results(results, names, dump_path) + print('all result of layers dumped to [%s]' % (dump_path)) + return 0 + if __name__ == "__main__": """ maybe more convenient to use 'run.sh' to call this tool """ net_file = 'models/resnet50/resnet50.py' weight_file = 'models/resnet50/resnet50.npy' - imgfile = 'data/65.jpeg' + datafile = 'data/65.jpeg' net_name = 'ResNet50' argc = len(sys.argv) - if argc == 5: + if sys.argv[1] == 'caffe': + if len(sys.argv) != 5: + print('usage:') + print('\tpython %s caffe [prototxt] [caffemodel] [datafile]' % + (sys.argv[0])) + sys.exit(1) + prototxt = sys.argv[2] + caffemodel = sys.argv[3] + datafile = sys.argv[4] + sys.exit(caffe_infer(prototxt, caffemodel, datafile)) + elif argc == 5: net_file = sys.argv[1] weight_file = sys.argv[2] - imgfile = sys.argv[3] + datafile = sys.argv[3] net_name = sys.argv[4] elif argc > 1: print('usage:') - print('\tpython %s [net_file] [weight_file] [imgfile] [net_name]' % + print('\tpython %s [net_file] [weight_file] [datafile] [net_name]' % (sys.argv[0])) print('\teg:python %s %s %s %s %s' % (sys.argv[0], net_file, - weight_file, imgfile, net_name)) + weight_file, datafile, net_name)) sys.exit(1) - infer(net_file, net_name, weight_file, imgfile) + infer(net_file, net_name, weight_file, datafile) diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh b/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh index 7a1a5ebd7c0a5090c00a0c8ca6b0e11b110967dc..ff3cc4ac44a8ccaeb0b33f1bcdbc46886fb7d7e9 100644 --- a/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh +++ b/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh @@ -3,7 +3,7 @@ #function: # a tool used to: # 1, convert a caffe model -# 2, do inference using this model +# 2, do inference(only in fluid) using this model # #usage: # bash run.sh resnet50 ./models.caffe/resnet50 ./models/resnet50 @@ -65,7 +65,12 @@ if [[ -z $only_convert ]];then PYTHON=`which python` fi imgfile="data/65.jpeg" - net_name=`grep "name" $proto_file | head -n1 | perl -ne 'if(/\"([^\"]+)\"/){ print $1."\n";}'` + #FIX ME: + # only look the first line in prototxt file for the name of this network, maybe not correct + net_name=`grep "name" $proto_file | head -n1 | perl -ne 'if(/^\s*name\s*:\s*\"([^\"]+)\"/){ print $1."\n";}'` + if [[ -z $net_name ]];then + net_name="MyNet" + fi $PYTHON ./infer.py $net_file $weight_file $imgfile $net_name ret=$? fi diff --git a/fluid/image_classification/caffe2fluid/kaffe/graph.py b/fluid/image_classification/caffe2fluid/kaffe/graph.py index 5387f441852b8a318a41898ee0b62b4903ccdabb..c6fdada6e78c8fbeb98604033e4cb77995555ce9 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/graph.py +++ b/fluid/image_classification/caffe2fluid/kaffe/graph.py @@ -52,7 +52,10 @@ class Graph(object): def __init__(self, nodes=None, name=None): self.nodes = nodes or [] self.node_lut = {node.name: node for node in self.nodes} - self.name = name + if name is None or name == '': + self.name = 'MyNet' + else: + self.name = name def add_node(self, node): self.nodes.append(node) diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py index fd6a71cb6acbfffe2aed1d3680fb91c8c85dc3d3..ac5ecf1d4491efb5043502824514498f79ab4db0 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py +++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py @@ -4,7 +4,7 @@ import numpy as np def import_fluid(): - import paddle.v2.fluid as fluid + import paddle.fluid as fluid return fluid @@ -64,7 +64,7 @@ class Network(object): if os.path.isdir(data_path): assert (exe is not None), \ 'must provide a executor to load fluid model' - fluid.io.load_persistables_if_exist(executor=exe, dirname=data_path) + fluid.io.load_persistables(executor=exe, dirname=data_path) return True #load model from a npy file @@ -161,56 +161,28 @@ class Network(object): output = fluid.layers.relu(x=input) return output - def _adjust_pad_if_needed(self, i_hw, k_hw, s_hw, p_hw): - #adjust the padding if needed - i_h, i_w = i_hw - k_h, k_w = k_hw - s_h, s_w = s_hw - p_h, p_w = p_hw - - def is_consistent(i, k, s, p): - o = i + 2 * p - k - if o % s == 0: - return True - else: - return False - - real_p_h = 0 - real_p_w = 0 - if is_consistent(i_h, k_h, s_h, p_h) is False: - real_p_h = int(k_h / 2) - - if is_consistent(i_w, k_w, s_w, p_w) is False: - real_p_w = int(k_w / 2) - - return [real_p_h, real_p_w] - def pool(self, pool_type, input, k_h, k_w, s_h, s_w, name, padding): # Get the number of channels in the input in_hw = input.shape[2:] k_hw = [k_h, k_w] s_hw = [s_h, s_w] - if padding is None: - #fix bug about the difference between conv and pool - #more info: https://github.com/BVLC/caffe/issues/1318 - padding = self._adjust_pad_if_needed(in_hw, k_hw, s_hw, [0, 0]) - fluid = import_fluid() output = fluid.layers.pool2d( input=input, pool_size=k_hw, pool_stride=s_hw, pool_padding=padding, + ceil_mode=True, pool_type=pool_type) return output @layer - def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None): + def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=[0, 0]): return self.pool('max', input, k_h, k_w, s_h, s_w, name, padding) @layer - def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None): + def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=[0, 0]): return self.pool('avg', input, k_h, k_w, s_h, s_w, name, padding) @layer @@ -258,7 +230,12 @@ class Network(object): return output @layer - def batch_normalization(self, input, name, scale_offset=True, relu=False): + def batch_normalization(self, + input, + name, + scale_offset=True, + eps=1e-5, + relu=False): # NOTE: Currently, only inference is supported fluid = import_fluid() prefix = name + '_' @@ -276,7 +253,7 @@ class Network(object): bias_attr=bias_attr, moving_mean_name=mean_name, moving_variance_name=variance_name, - epsilon=1e-5, + epsilon=eps, act='relu' if relu is True else None) return output diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py index 4d7ec49a39199bb1415f830d88f89e93a4b95266..3697529971fa6ca01d1703375243d16f0a0c1edd 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py +++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py @@ -142,7 +142,13 @@ class TensorFlowMapper(NodeMapper): def map_batch_norm(self, node): scale_offset = len(node.data) == 4 - kwargs = {} if scale_offset else {'scale_offset': False} + + #this default value comes from caffe's param in batch_norm + default_eps = 1e-5 + kwargs = {'scale_offset': scale_offset} + if node.parameters.eps != default_eps: + kwargs['eps'] = node.parameters.eps + return MaybeActivated( node, default=False)('batch_normalization', **kwargs) @@ -236,7 +242,7 @@ class TensorFlowEmitter(object): func_def = self.statement('@classmethod') func_def += self.statement('def convert(cls, npy_model, fluid_path):') self.indent() - func_def += self.statement('import paddle.v2.fluid as fluid') + func_def += self.statement('fluid = import_fluid()') for l in codes: func_def += self.statement(l) return '\n' + func_def diff --git a/fluid/image_classification/se_resnext.py b/fluid/image_classification/se_resnext.py index b1adf0baba8a987ae1a971e148375c6a0730d860..573c6bec5bdc3c08e9503e46f6e09fad2cb09707 100644 --- a/fluid/image_classification/se_resnext.py +++ b/fluid/image_classification/se_resnext.py @@ -1,10 +1,5 @@ -import os -import numpy as np -import time -import sys import paddle.v2 as paddle import paddle.fluid as fluid -import reader def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1, @@ -124,164 +119,3 @@ def SE_ResNeXt(input, class_dim, infer=False, layers=50): drop = pool out = fluid.layers.fc(input=drop, size=class_dim, act='softmax') return out - - -def train(learning_rate, - batch_size, - num_passes, - init_model=None, - model_save_dir='model', - parallel=True, - use_nccl=True, - lr_strategy=None, - layers=50): - class_dim = 1000 - image_shape = [3, 224, 224] - - image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - if parallel: - places = fluid.layers.get_places() - pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl) - - with pd.do(): - image_ = pd.read_input(image) - label_ = pd.read_input(label) - out = SE_ResNeXt(input=image_, class_dim=class_dim, layers=layers) - cost = fluid.layers.cross_entropy(input=out, label=label_) - avg_cost = fluid.layers.mean(x=cost) - acc_top1 = fluid.layers.accuracy(input=out, label=label_, k=1) - acc_top5 = fluid.layers.accuracy(input=out, label=label_, k=5) - pd.write_output(avg_cost) - pd.write_output(acc_top1) - pd.write_output(acc_top5) - - avg_cost, acc_top1, acc_top5 = pd() - avg_cost = fluid.layers.mean(x=avg_cost) - acc_top1 = fluid.layers.mean(x=acc_top1) - acc_top5 = fluid.layers.mean(x=acc_top5) - else: - out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers) - cost = fluid.layers.cross_entropy(input=out, label=label) - avg_cost = fluid.layers.mean(x=cost) - acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) - acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) - - if lr_strategy is None: - optimizer = fluid.optimizer.Momentum( - learning_rate=learning_rate, - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - else: - bd = lr_strategy["bd"] - lr = lr_strategy["lr"] - optimizer = fluid.optimizer.Momentum( - learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, values=lr), - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - - opts = optimizer.minimize(avg_cost) - fluid.memory_optimize(fluid.default_main_program()) - - inference_program = fluid.default_main_program().clone() - with fluid.program_guard(inference_program): - inference_program = fluid.io.get_inference_program( - [avg_cost, acc_top1, acc_top5]) - - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - if init_model is not None: - fluid.io.load_persistables(exe, init_model) - - train_reader = paddle.batch(reader.train(), batch_size=batch_size) - test_reader = paddle.batch(reader.test(), batch_size=batch_size) - feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) - - for pass_id in range(num_passes): - train_info = [[], [], []] - test_info = [[], [], []] - for batch_id, data in enumerate(train_reader()): - t1 = time.time() - loss, acc1, acc5 = exe.run( - fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost, acc_top1, acc_top5]) - t2 = time.time() - period = t2 - t1 - train_info[0].append(loss[0]) - train_info[1].append(acc1[0]) - train_info[2].append(acc5[0]) - if batch_id % 10 == 0: - print("Pass {0}, trainbatch {1}, loss {2}, \ - acc1 {3}, acc5 {4} time {5}" - .format(pass_id, \ - batch_id, loss[0], acc1[0], acc5[0], \ - "%2.2f sec" % period)) - sys.stdout.flush() - - train_loss = np.array(train_info[0]).mean() - train_acc1 = np.array(train_info[1]).mean() - train_acc5 = np.array(train_info[2]).mean() - for data in test_reader(): - t1 = time.time() - loss, acc1, acc5 = exe.run( - inference_program, - feed=feeder.feed(data), - fetch_list=[avg_cost, acc_top1, acc_top5]) - t2 = time.time() - period = t2 - t1 - test_info[0].append(loss[0]) - test_info[1].append(acc1[0]) - test_info[2].append(acc5[0]) - if batch_id % 10 == 0: - print("Pass {0},testbatch {1},loss {2}, \ - acc1 {3},acc5 {4},time {5}" - .format(pass_id, \ - batch_id, loss[0], acc1[0], acc5[0], \ - "%2.2f sec" % period)) - sys.stdout.flush() - - test_loss = np.array(test_info[0]).mean() - test_acc1 = np.array(test_info[1]).mean() - test_acc5 = np.array(test_info[2]).mean() - - print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \ - test_loss {4}, test_acc1 {5}, test_acc5 {6}" - .format(pass_id, \ - train_loss, train_acc1, train_acc5, test_loss, test_acc1, \ - test_acc5)) - sys.stdout.flush() - - model_path = os.path.join(model_save_dir, str(pass_id)) - if not os.path.isdir(model_path): - os.makedirs(model_path) - fluid.io.save_persistables(exe, model_path) - - -if __name__ == '__main__': - epoch_points = [30, 60, 90] - total_images = 1281167 - batch_size = 256 - step = int(total_images / batch_size + 1) - bd = [e * step for e in epoch_points] - lr = [0.1, 0.01, 0.001, 0.0001] - - lr_strategy = {"bd": bd, "lr": lr} - - use_nccl = True - # layers: 50, 152 - layers = 50 - - train( - learning_rate=0.1, - batch_size=batch_size, - num_passes=120, - init_model=None, - parallel=True, - use_nccl=True, - lr_strategy=lr_strategy, - layers=layers) diff --git a/fluid/image_classification/train.py b/fluid/image_classification/train.py new file mode 100644 index 0000000000000000000000000000000000000000..f402c87d49862fd844d8cf36c6eb52f3e21895b3 --- /dev/null +++ b/fluid/image_classification/train.py @@ -0,0 +1,311 @@ +import os +import numpy as np +import time +import sys +import paddle.v2 as paddle +import paddle.fluid as fluid +from se_resnext import SE_ResNeXt +import reader + +import argparse +import functools +from utility import add_arguments, print_arguments + +parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable +add_arg('batch_size', int, 256, "Minibatch size.") +add_arg('num_layers', int, 50, "How many layers for SE-ResNeXt model.") +add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.") +add_arg('parallel_exe', bool, True, "Whether to use ParallelExecutor to train or not.") + +def train_paralle_do(args, + learning_rate, + batch_size, + num_passes, + init_model=None, + model_save_dir='model', + parallel=True, + use_nccl=True, + lr_strategy=None, + layers=50): + class_dim = 1000 + image_shape = [3, 224, 224] + + image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + if parallel: + places = fluid.layers.get_places() + pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl) + + with pd.do(): + image_ = pd.read_input(image) + label_ = pd.read_input(label) + out = SE_ResNeXt(input=image_, class_dim=class_dim, layers=layers) + cost = fluid.layers.cross_entropy(input=out, label=label_) + avg_cost = fluid.layers.mean(x=cost) + acc_top1 = fluid.layers.accuracy(input=out, label=label_, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label_, k=5) + pd.write_output(avg_cost) + pd.write_output(acc_top1) + pd.write_output(acc_top5) + + avg_cost, acc_top1, acc_top5 = pd() + avg_cost = fluid.layers.mean(x=avg_cost) + acc_top1 = fluid.layers.mean(x=acc_top1) + acc_top5 = fluid.layers.mean(x=acc_top5) + else: + out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers) + cost = fluid.layers.cross_entropy(input=out, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + + if lr_strategy is None: + optimizer = fluid.optimizer.Momentum( + learning_rate=learning_rate, + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + else: + bd = lr_strategy["bd"] + lr = lr_strategy["lr"] + optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + + inference_program = fluid.default_main_program().clone(for_test=True) + + opts = optimizer.minimize(avg_cost) + if args.with_mem_opt: + fluid.memory_optimize(fluid.default_main_program()) + fluid.memory_optimize(inference_program) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + if init_model is not None: + fluid.io.load_persistables(exe, init_model) + + train_reader = paddle.batch(reader.train(), batch_size=batch_size) + test_reader = paddle.batch(reader.test(), batch_size=batch_size) + feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) + + for pass_id in range(num_passes): + train_info = [[], [], []] + test_info = [[], [], []] + for batch_id, data in enumerate(train_reader()): + t1 = time.time() + loss, acc1, acc5 = exe.run( + fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost, acc_top1, acc_top5]) + t2 = time.time() + period = t2 - t1 + train_info[0].append(loss[0]) + train_info[1].append(acc1[0]) + train_info[2].append(acc5[0]) + if batch_id % 10 == 0: + print("Pass {0}, trainbatch {1}, loss {2}, \ + acc1 {3}, acc5 {4} time {5}" + .format(pass_id, \ + batch_id, loss[0], acc1[0], acc5[0], \ + "%2.2f sec" % period)) + sys.stdout.flush() + + train_loss = np.array(train_info[0]).mean() + train_acc1 = np.array(train_info[1]).mean() + train_acc5 = np.array(train_info[2]).mean() + for data in test_reader(): + t1 = time.time() + loss, acc1, acc5 = exe.run( + inference_program, + feed=feeder.feed(data), + fetch_list=[avg_cost, acc_top1, acc_top5]) + t2 = time.time() + period = t2 - t1 + test_info[0].append(loss[0]) + test_info[1].append(acc1[0]) + test_info[2].append(acc5[0]) + if batch_id % 10 == 0: + print("Pass {0},testbatch {1},loss {2}, \ + acc1 {3},acc5 {4},time {5}" + .format(pass_id, \ + batch_id, loss[0], acc1[0], acc5[0], \ + "%2.2f sec" % period)) + sys.stdout.flush() + + test_loss = np.array(test_info[0]).mean() + test_acc1 = np.array(test_info[1]).mean() + test_acc5 = np.array(test_info[2]).mean() + + print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \ + test_loss {4}, test_acc1 {5}, test_acc5 {6}" + .format(pass_id, \ + train_loss, train_acc1, train_acc5, test_loss, test_acc1, \ + test_acc5)) + sys.stdout.flush() + + model_path = os.path.join(model_save_dir, str(pass_id)) + if not os.path.isdir(model_path): + os.makedirs(model_path) + fluid.io.save_persistables(exe, model_path) + +def train_parallel_exe(args, + learning_rate, + batch_size, + num_passes, + init_model=None, + model_save_dir='model', + parallel=True, + use_nccl=True, + lr_strategy=None, + layers=50): + class_dim = 1000 + image_shape = [3, 224, 224] + + image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers) + cost = fluid.layers.cross_entropy(input=out, label=label) + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + avg_cost = fluid.layers.mean(x=cost) + + test_program = fluid.default_main_program().clone(for_test=True) + + if lr_strategy is None: + optimizer = fluid.optimizer.Momentum( + learning_rate=learning_rate, + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + else: + bd = lr_strategy["bd"] + lr = lr_strategy["lr"] + optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + + opts = optimizer.minimize(avg_cost) + + if args.with_mem_opt: + fluid.memory_optimize(fluid.default_main_program()) + fluid.memory_optimize(test_program) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + if init_model is not None: + fluid.io.load_persistables(exe, init_model) + + train_reader = paddle.batch(reader.train(), batch_size=batch_size) + test_reader = paddle.batch(reader.test(), batch_size=batch_size) + feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) + + train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) + test_exe = fluid.ParallelExecutor( + use_cuda=True, + main_program=test_program, + share_vars_from=train_exe) + + fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name] + + for pass_id in range(num_passes): + train_info = [[], [], []] + test_info = [[], [], []] + for batch_id, data in enumerate(train_reader()): + t1 = time.time() + loss, acc1, acc5 = train_exe.run( + fetch_list, + feed_dict=feeder.feed(data)) + t2 = time.time() + period = t2 - t1 + loss = np.mean(np.array(loss)) + acc1 = np.mean(np.array(acc1)) + acc5 = np.mean(np.array(acc5)) + train_info[0].append(loss) + train_info[1].append(acc1) + train_info[2].append(acc5) + if batch_id % 10 == 0: + print("Pass {0}, trainbatch {1}, loss {2}, \ + acc1 {3}, acc5 {4} time {5}" + .format(pass_id, \ + batch_id, loss, acc1, acc5, \ + "%2.2f sec" % period)) + sys.stdout.flush() + + train_loss = np.array(train_info[0]).mean() + train_acc1 = np.array(train_info[1]).mean() + train_acc5 = np.array(train_info[2]).mean() + for data in test_reader(): + t1 = time.time() + loss, acc1, acc5 = test_exe.run( + fetch_list, + feed_dict=feeder.feed(data)) + t2 = time.time() + period = t2 - t1 + loss = np.mean(np.array(loss)) + acc1 = np.mean(np.array(acc1)) + acc5 = np.mean(np.array(acc5)) + test_info[0].append(loss) + test_info[1].append(acc1) + test_info[2].append(acc5) + if batch_id % 10 == 0: + print("Pass {0},testbatch {1},loss {2}, \ + acc1 {3},acc5 {4},time {5}" + .format(pass_id, \ + batch_id, loss, acc1, acc5, \ + "%2.2f sec" % period)) + sys.stdout.flush() + + test_loss = np.array(test_info[0]).mean() + test_acc1 = np.array(test_info[1]).mean() + test_acc5 = np.array(test_info[2]).mean() + + print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \ + test_loss {4}, test_acc1 {5}, test_acc5 {6}" + .format(pass_id, \ + train_loss, train_acc1, train_acc5, test_loss, test_acc1, \ + test_acc5)) + sys.stdout.flush() + + model_path = os.path.join(model_save_dir, str(pass_id)) + if not os.path.isdir(model_path): + os.makedirs(model_path) + fluid.io.save_persistables(exe, model_path) + + + + +if __name__ == '__main__': + args = parser.parse_args() + print_arguments(args) + + epoch_points = [30, 60, 90] + total_images = 1281167 + batch_size = args.batch_size + step = int(total_images / batch_size + 1) + bd = [e * step for e in epoch_points] + lr = [0.1, 0.01, 0.001, 0.0001] + + lr_strategy = {"bd": bd, "lr": lr} + + use_nccl = True + # layers: 50, 152 + layers = args.num_layers + method = train_parallel_exe if args.parallel_exe else train_parallel_do + method(args, + learning_rate=0.1, + batch_size=batch_size, + num_passes=120, + init_model=None, + parallel=True, + use_nccl=True, + lr_strategy=lr_strategy, + layers=layers) diff --git a/fluid/image_classification/utility.py b/fluid/image_classification/utility.py new file mode 100644 index 0000000000000000000000000000000000000000..506e6007ceb9059caf1163befb6ff594d67b547a --- /dev/null +++ b/fluid/image_classification/utility.py @@ -0,0 +1,62 @@ +"""Contains common utility functions.""" +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import distutils.util +import numpy as np +from paddle.fluid import core + + +def print_arguments(args): + """Print argparse's arguments. + + Usage: + + .. code-block:: python + + parser = argparse.ArgumentParser() + parser.add_argument("name", default="Jonh", type=str, help="User name.") + args = parser.parse_args() + print_arguments(args) + + :param args: Input argparse.Namespace for printing. + :type args: argparse.Namespace + """ + print("----------- Configuration Arguments -----------") + for arg, value in sorted(vars(args).iteritems()): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") + + +def add_arguments(argname, type, default, help, argparser, **kwargs): + """Add argparse's argument. + + Usage: + + .. code-block:: python + + parser = argparse.ArgumentParser() + add_argument("name", str, "Jonh", "User name.", parser) + args = parser.parse_args() + """ + type = distutils.util.strtobool if type == bool else type + argparser.add_argument( + "--" + argname, + default=default, + type=type, + help=help + ' Default: %(default)s.', + **kwargs) diff --git a/fluid/neural_machine_translation/transformer/config.py b/fluid/neural_machine_translation/transformer/config.py index 8bfdf6461bdbfae92afe36520b3b056dddb4836c..c83b655f93ed80b8a599cb7fc789cf309d7d608b 100644 --- a/fluid/neural_machine_translation/transformer/config.py +++ b/fluid/neural_machine_translation/transformer/config.py @@ -15,6 +15,9 @@ class TrainTaskConfig(object): # the parameters for learning rate scheduling. warmup_steps = 4000 + # the flag indicating to use average loss or sum loss when training. + use_avg_cost = False + # the directory for saving trained models. model_dir = "trained_models" @@ -22,8 +25,7 @@ class TrainTaskConfig(object): class InferTaskConfig(object): use_gpu = False # the number of examples in one run for sequence generation. - # currently the batch size can only be set to 1. - batch_size = 1 + batch_size = 10 # the parameters for beam search. beam_size = 5 @@ -31,37 +33,38 @@ class InferTaskConfig(object): # the number of decoded sentences to output. n_best = 1 + # the flags indicating whether to output the special tokens. + output_bos = False + output_eos = False + output_unk = False + # the directory for loading the trained model. model_path = "trained_models/pass_1.infer.model" class ModelHyperParams(object): - # Dictionary size for source and target language. This model directly uses - # paddle.dataset.wmt16 in which , and token has - # alreay been added, but the token is not added. Transformer requires - # sequences in a mini-batch are padded to have the same length. A token is - # added into the original dictionary in paddle.dateset.wmt16. + # This model directly uses paddle.dataset.wmt16 in which , and + # token has alreay been added. As for the token, any token + # included in dict can be used to pad, since the paddings' loss will be + # masked out and make no effect on parameter gradients. # size of source word dictionary. src_vocab_size = 10000 - # index for token in source language. - src_pad_idx = src_vocab_size # size of target word dictionay trg_vocab_size = 10000 - # index for token in target language. - trg_pad_idx = trg_vocab_size # index for token bos_idx = 0 # index for token eos_idx = 1 + # index for token + unk_idx = 2 - # position value corresponding to the token. - pos_pad_idx = 0 - - # max length of sequences. It should plus 1 to include position - # padding token for position encoding. + # max length of sequences. + # The size of position encoding table should at least plus 1, since the + # sinusoid position encoding starts from 1 and 0 can be used as the padding + # token for position encoding. max_length = 50 # the dimension for word embeddings, which is also the last dimension of @@ -93,6 +96,7 @@ encoder_input_data_names = ( "src_word", "src_pos", "src_slf_attn_bias", + "src_data_shape", "src_slf_attn_pre_softmax_shape", "src_slf_attn_post_softmax_shape", ) @@ -102,6 +106,7 @@ decoder_input_data_names = ( "trg_pos", "trg_slf_attn_bias", "trg_src_attn_bias", + "trg_data_shape", "trg_slf_attn_pre_softmax_shape", "trg_slf_attn_post_softmax_shape", "trg_src_attn_pre_softmax_shape", diff --git a/fluid/neural_machine_translation/transformer/infer.py b/fluid/neural_machine_translation/transformer/infer.py index 02674df125c208dd1a4d5f7ea2b8735980048b05..ad7fc2fa39db15698842aae26c80d86f7592775b 100644 --- a/fluid/neural_machine_translation/transformer/infer.py +++ b/fluid/neural_machine_translation/transformer/infer.py @@ -11,10 +11,26 @@ from config import InferTaskConfig, ModelHyperParams, \ from train import pad_batch_data -def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, - decoder, dec_in_names, dec_out_names, beam_size, max_length, - n_best, batch_size, n_head, src_pad_idx, trg_pad_idx, - bos_idx, eos_idx): +def translate_batch(exe, + src_words, + encoder, + enc_in_names, + enc_out_names, + decoder, + dec_in_names, + dec_out_names, + beam_size, + max_length, + n_best, + batch_size, + n_head, + d_model, + src_pad_idx, + trg_pad_idx, + bos_idx, + eos_idx, + unk_idx, + output_unk=True): """ Run the encoder program once and run the decoder program multiple times to implement beam search externally. @@ -25,9 +41,14 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, src_pad_idx, n_head, is_target=False, - return_pos=True, + is_label=False, return_attn_bias=True, return_max_len=False) + # Append the data shape input to reshape the output of embedding layer. + enc_in_data = enc_in_data + [ + np.array( + [-1, enc_in_data[2].shape[-1], d_model], dtype="int32") + ] # Append the shape inputs to reshape before and after softmax in encoder # self attention. enc_in_data = enc_in_data + [ @@ -44,11 +65,16 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, scores = np.zeros((batch_size, beam_size), dtype="float32") prev_branchs = [[] for i in range(batch_size)] next_ids = [[] for i in range(batch_size)] - # Use beam_map to map the instance idx in batch to beam idx, since the + # Use beam_inst_map to map beam idx to the instance idx in batch, since the # size of feeded batch is changing. - beam_map = range(batch_size) + beam_inst_map = { + beam_idx: inst_idx + for inst_idx, beam_idx in enumerate(range(batch_size)) + } + # Use active_beams to recode the alive. + active_beams = range(batch_size) - def beam_backtrace(prev_branchs, next_ids, n_best=beam_size, add_bos=True): + def beam_backtrace(prev_branchs, next_ids, n_best=beam_size): """ Decode and select n_best sequences for one instance by backtrace. """ @@ -60,7 +86,8 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, seq.append(next_ids[j][k]) k = prev_branchs[j][k] seq = seq[::-1] - seq = [bos_idx] + seq if add_bos else seq + # Add the , since next_ids don't include the . + seq = [bos_idx] + seq seqs.append(seq) return seqs @@ -82,8 +109,14 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, [-1e9]).astype("float32") # This is used to remove attention on the paddings of source sequences. trg_src_attn_bias = np.tile( - src_slf_attn_bias[:, :, ::src_max_length, :], - [beam_size, 1, trg_max_len, 1]) + src_slf_attn_bias[:, :, ::src_max_length, :][:, np.newaxis], + [1, beam_size, 1, trg_max_len, 1]).reshape([ + -1, src_slf_attn_bias.shape[1], trg_max_len, + src_slf_attn_bias.shape[-1] + ]) + # Append the shape input to reshape the output of embedding layer. + trg_data_shape = np.array( + [batch_size * beam_size, trg_max_len, d_model], dtype="int32") # Append the shape inputs to reshape before and after softmax in # decoder self attention. trg_slf_attn_pre_softmax_shape = np.array( @@ -96,26 +129,27 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, [-1, trg_src_attn_bias.shape[-1]], dtype="int32") trg_src_attn_post_softmax_shape = np.array( trg_src_attn_bias.shape, dtype="int32") - enc_output = np.tile(enc_output, [beam_size, 1, 1]) + enc_output = np.tile( + enc_output[:, np.newaxis], [1, beam_size, 1, 1]).reshape( + [-1, enc_output.shape[-2], enc_output.shape[-1]]) return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ - trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape, \ - trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape, \ - enc_output + trg_data_shape, trg_slf_attn_pre_softmax_shape, \ + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \ + trg_src_attn_post_softmax_shape, enc_output - def update_dec_in_data(dec_in_data, next_ids, active_beams): + def update_dec_in_data(dec_in_data, next_ids, active_beams, beam_inst_map): """ Update the input data of decoder mainly by slicing from the previous input data and dropping the finished instance beams. """ trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ - trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape, \ - trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape, \ - enc_output = dec_in_data - trg_cur_len = len(next_ids[0]) + 1 # include the + trg_data_shape, trg_slf_attn_pre_softmax_shape, \ + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \ + trg_src_attn_post_softmax_shape, enc_output = dec_in_data + trg_cur_len = trg_slf_attn_bias.shape[-1] + 1 trg_words = np.array( [ - beam_backtrace( - prev_branchs[beam_idx], next_ids[beam_idx], add_bos=True) + beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx]) for beam_idx in active_beams ], dtype="int64") @@ -123,6 +157,7 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, trg_pos = np.array( [range(1, trg_cur_len + 1)] * len(active_beams) * beam_size, dtype="int64").reshape([-1, 1]) + active_beams = [beam_inst_map[beam_idx] for beam_idx in active_beams] active_beams_indice = ( (np.array(active_beams) * beam_size)[:, np.newaxis] + np.array(range(beam_size))[np.newaxis, :]).flatten() @@ -137,6 +172,10 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, trg_src_attn_bias = np.tile(trg_src_attn_bias[ active_beams_indice, :, ::trg_src_attn_bias.shape[2], :], [1, 1, trg_cur_len, 1]) + # Append the shape input to reshape the output of embedding layer. + trg_data_shape = np.array( + [len(active_beams) * beam_size, trg_cur_len, d_model], + dtype="int32") # Append the shape inputs to reshape before and after softmax in # decoder self attention. trg_slf_attn_pre_softmax_shape = np.array( @@ -151,9 +190,9 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, trg_src_attn_bias.shape, dtype="int32") enc_output = enc_output[active_beams_indice, :, :] return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ - trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape, \ - trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape, \ - enc_output + trg_data_shape, trg_slf_attn_pre_softmax_shape, \ + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \ + trg_src_attn_post_softmax_shape, enc_output dec_in_data = init_dec_in_data(batch_size, beam_size, enc_in_data, enc_output) @@ -162,13 +201,18 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, feed=dict(zip(dec_in_names, dec_in_data)), fetch_list=dec_out_names)[0] predict_all = np.log( - predict_all.reshape([len(beam_map) * beam_size, i + 1, -1])[:, - -1, :]) - predict_all = (predict_all + scores[beam_map].reshape( - [len(beam_map) * beam_size, -1])).reshape( - [len(beam_map), beam_size, -1]) + predict_all.reshape([len(beam_inst_map) * beam_size, i + 1, -1]) + [:, -1, :]) + predict_all = (predict_all + scores[active_beams].reshape( + [len(beam_inst_map) * beam_size, -1])).reshape( + [len(beam_inst_map), beam_size, -1]) + if not output_unk: # To exclude the token. + predict_all[:, :, unk_idx] = -1e9 active_beams = [] - for inst_idx, beam_idx in enumerate(beam_map): + for beam_idx in range(batch_size): + if not beam_inst_map.has_key(beam_idx): + continue + inst_idx = beam_inst_map[beam_idx] predict = (predict_all[inst_idx, :, :] if i != 0 else predict_all[inst_idx, 0, :]).flatten() top_k_indice = np.argpartition(predict, -beam_size)[-beam_size:] @@ -181,13 +225,20 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, next_ids[beam_idx].append(top_scores_ids % predict_all.shape[-1]) if next_ids[beam_idx][-1][0] != eos_idx: active_beams.append(beam_idx) - beam_map = active_beams - if len(beam_map) == 0: + if len(active_beams) == 0: break - dec_in_data = update_dec_in_data(dec_in_data, next_ids, active_beams) + dec_in_data = update_dec_in_data(dec_in_data, next_ids, active_beams, + beam_inst_map) + beam_inst_map = { + beam_idx: inst_idx + for inst_idx, beam_idx in enumerate(active_beams) + } # Decode beams and select n_best sequences for each instance by backtrace. - seqs = [beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx], n_best)] + seqs = [ + beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx], n_best) + for beam_idx in range(batch_size) + ] return seqs, scores[:, :n_best].tolist() @@ -195,29 +246,24 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names, def main(): place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) - # The current program desc is coupled with batch_size and the only - # supported batch size is 1 currently. + encoder_program = fluid.Program() - model.batch_size = InferTaskConfig.batch_size with fluid.program_guard(main_program=encoder_program): enc_output = encoder( - ModelHyperParams.src_vocab_size + 1, - ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, - ModelHyperParams.n_head, ModelHyperParams.d_key, - ModelHyperParams.d_value, ModelHyperParams.d_model, - ModelHyperParams.d_inner_hid, ModelHyperParams.dropout, - ModelHyperParams.src_pad_idx, ModelHyperParams.pos_pad_idx) + ModelHyperParams.src_vocab_size, ModelHyperParams.max_length + 1, + ModelHyperParams.n_layer, ModelHyperParams.n_head, + ModelHyperParams.d_key, ModelHyperParams.d_value, + ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, + ModelHyperParams.dropout) - model.batch_size = InferTaskConfig.batch_size * InferTaskConfig.beam_size decoder_program = fluid.Program() with fluid.program_guard(main_program=decoder_program): predict = decoder( - ModelHyperParams.trg_vocab_size + 1, - ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, - ModelHyperParams.n_head, ModelHyperParams.d_key, - ModelHyperParams.d_value, ModelHyperParams.d_model, - ModelHyperParams.d_inner_hid, ModelHyperParams.dropout, - ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx) + ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1, + ModelHyperParams.n_layer, ModelHyperParams.n_head, + ModelHyperParams.d_key, ModelHyperParams.d_value, + ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, + ModelHyperParams.dropout) # Load model parameters of encoder and decoder separately from the saved # transformer model. @@ -254,17 +300,51 @@ def main(): trg_idx2word = paddle.dataset.wmt16.get_dict( "de", dict_size=ModelHyperParams.trg_vocab_size, reverse=True) + def post_process_seq(seq, + bos_idx=ModelHyperParams.bos_idx, + eos_idx=ModelHyperParams.eos_idx, + output_bos=InferTaskConfig.output_bos, + output_eos=InferTaskConfig.output_eos): + """ + Post-process the beam-search decoded sequence. Truncate from the first + and remove the and tokens currently. + """ + eos_pos = len(seq) - 1 + for i, idx in enumerate(seq): + if idx == eos_idx: + eos_pos = i + break + seq = seq[:eos_pos + 1] + return filter( + lambda idx: (output_bos or idx != bos_idx) and \ + (output_eos or idx != eos_idx), + seq) + for batch_id, data in enumerate(test_data()): batch_seqs, batch_scores = translate_batch( - exe, [item[0] for item in data], encoder_program, - encoder_input_data_names, [enc_output.name], decoder_program, - decoder_input_data_names, [predict.name], InferTaskConfig.beam_size, - InferTaskConfig.max_length, InferTaskConfig.n_best, - len(data), ModelHyperParams.n_head, ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, ModelHyperParams.bos_idx, - ModelHyperParams.eos_idx) + exe, + [item[0] for item in data], + encoder_program, + encoder_input_data_names, + [enc_output.name], + decoder_program, + decoder_input_data_names, + [predict.name], + InferTaskConfig.beam_size, + InferTaskConfig.max_length, + InferTaskConfig.n_best, + len(data), + ModelHyperParams.n_head, + ModelHyperParams.d_model, + ModelHyperParams.eos_idx, # Use eos_idx to pad. + ModelHyperParams.eos_idx, # Use eos_idx to pad. + ModelHyperParams.bos_idx, + ModelHyperParams.eos_idx, + ModelHyperParams.unk_idx, + output_unk=InferTaskConfig.output_unk) for i in range(len(batch_seqs)): - seqs = batch_seqs[i] + # Post-process the beam-search decoded sequences. + seqs = map(post_process_seq, batch_seqs[i]) scores = batch_scores[i] for seq in seqs: print(" ".join([trg_idx2word[idx] for idx in seq])) diff --git a/fluid/neural_machine_translation/transformer/model.py b/fluid/neural_machine_translation/transformer/model.py index ffc07e91421dbaf3ed6e370f04ec6f1d7439fcf8..2b5519674282edb4d927f48b0a32eb82b459514d 100644 --- a/fluid/neural_machine_translation/transformer/model.py +++ b/fluid/neural_machine_translation/transformer/model.py @@ -7,9 +7,6 @@ import paddle.fluid.layers as layers from config import TrainTaskConfig, pos_enc_param_names, \ encoder_input_data_names, decoder_input_data_names, label_data_names -# FIXME(guosheng): Remove out the batch_size from the model. -batch_size = TrainTaskConfig.batch_size - def position_encoding_init(n_position, d_pos_vec): """ @@ -85,9 +82,10 @@ def multi_head_attention(queries, return x hidden_size = x.shape[-1] - # FIXME(guosheng): Decouple the program desc with batch_size. + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. reshaped = layers.reshape( - x=x, shape=[batch_size, -1, n_head, hidden_size // n_head]) + x=x, shape=[0, -1, n_head, hidden_size // n_head]) # permuate the dimensions into: # [batch_size, n_head, max_sequence_len, hidden_size_per_head] @@ -103,11 +101,11 @@ def multi_head_attention(queries, raise ValueError("Input(x) should be a 4-D Tensor.") trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) - # FIXME(guosheng): Decouple the program desc with batch_size. + # The value 0 in shape attr means copying the corresponding dimension + # size of the input as the output dimension size. return layers.reshape( x=trans_x, - shape=map(int, - [batch_size, -1, trans_x.shape[2] * trans_x.shape[3]])) + shape=map(int, [0, -1, trans_x.shape[2] * trans_x.shape[3]])) def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate): """ @@ -201,10 +199,9 @@ def prepare_encoder(src_word, src_pos, src_vocab_size, src_emb_dim, - src_pad_idx, src_max_len, dropout_rate=0., - pos_pad_idx=0, + src_data_shape=None, pos_enc_param_name=None): """Add word embeddings and position encodings. The output tensor has a shape of: @@ -215,18 +212,17 @@ def prepare_encoder(src_word, src_word_emb = layers.embedding( src_word, size=[src_vocab_size, src_emb_dim], - padding_idx=src_pad_idx, param_attr=fluid.initializer.Normal(0., 1.)) src_pos_enc = layers.embedding( src_pos, size=[src_max_len, src_emb_dim], - padding_idx=pos_pad_idx, param_attr=fluid.ParamAttr( name=pos_enc_param_name, trainable=False)) enc_input = src_word_emb + src_pos_enc - - # FIXME(guosheng): Decouple the program desc with batch_size. - enc_input = layers.reshape(x=enc_input, shape=[batch_size, -1, src_emb_dim]) + enc_input = layers.reshape( + x=enc_input, + shape=[-1, src_max_len, src_emb_dim], + actual_shape=src_data_shape) return layers.dropout( enc_input, dropout_prob=dropout_rate, is_test=False) if dropout_rate else enc_input @@ -401,20 +397,23 @@ def decoder(dec_input, def make_inputs(input_data_names, n_head, d_model, - batch_size, max_length, is_pos, slf_attn_bias_flag, src_attn_bias_flag, enc_output_flag=False, + data_shape_flag=True, slf_attn_shape_flag=True, src_attn_shape_flag=True): """ Define the input data layers for the transformer model. """ input_layers = [] - # The shapes here act as placeholder. - # The shapes set here is to pass the infer-shape in compile time. + batch_size = 1 # Only for the infer-shape in compile time. + # The shapes here act as placeholder and are set to pass the infer-shape in + # compile time. + # The actual data shape of word is: + # [batch_size * max_len_in_batch, 1] word = layers.data( name=input_data_names[len(input_layers)], shape=[batch_size * max_length, 1], @@ -422,6 +421,8 @@ def make_inputs(input_data_names, append_batch_size=False) input_layers += [word] # This is used for position data or label weight. + # The actual data shape of pos is: + # [batch_size * max_len_in_batch, 1] pos = layers.data( name=input_data_names[len(input_layers)], shape=[batch_size * max_length, 1], @@ -432,6 +433,8 @@ def make_inputs(input_data_names, # This input is used to remove attention weights on paddings for the # encoder and to remove attention weights on subsequent words for the # decoder. + # The actual data shape of slf_attn_bias_flag is: + # [batch_size, n_head, max_len_in_batch, max_len_in_batch] slf_attn_bias = layers.data( name=input_data_names[len(input_layers)], shape=[batch_size, n_head, max_length, max_length], @@ -439,40 +442,60 @@ def make_inputs(input_data_names, append_batch_size=False) input_layers += [slf_attn_bias] if src_attn_bias_flag: - # This input is used to remove attention weights on paddings. + # This input is used to remove attention weights on paddings. It's used + # in encoder-decoder attention. + # The actual data shape of slf_attn_bias_flag is: + # [batch_size, n_head, trg_max_len_in_batch, src_max_len_in_batch] src_attn_bias = layers.data( name=input_data_names[len(input_layers)], shape=[batch_size, n_head, max_length, max_length], dtype="float32", append_batch_size=False) input_layers += [src_attn_bias] + if data_shape_flag: + # This input is used to reshape the output of embedding layer. + data_shape = layers.data( + name=input_data_names[len(input_layers)], + shape=[3], + dtype="int32", + append_batch_size=False) + input_layers += [data_shape] if slf_attn_shape_flag: + # This shape input is used to reshape before softmax in self attention. slf_attn_pre_softmax_shape = layers.data( name=input_data_names[len(input_layers)], - shape=[3], + shape=[2], dtype="int32", append_batch_size=False) input_layers += [slf_attn_pre_softmax_shape] + # This shape input is used to reshape after softmax in self attention. slf_attn_post_softmax_shape = layers.data( name=input_data_names[len(input_layers)], - shape=[3], + shape=[4], dtype="int32", append_batch_size=False) input_layers += [slf_attn_post_softmax_shape] if src_attn_shape_flag: + # This shape input is used to reshape before softmax in encoder-decoder + # attention. src_attn_pre_softmax_shape = layers.data( name=input_data_names[len(input_layers)], - shape=[3], + shape=[2], dtype="int32", append_batch_size=False) input_layers += [src_attn_pre_softmax_shape] + # This shape input is used to reshape after softmax in encoder-decoder + # attention. src_attn_post_softmax_shape = layers.data( name=input_data_names[len(input_layers)], - shape=[3], + shape=[4], dtype="int32", append_batch_size=False) input_layers += [src_attn_post_softmax_shape] if enc_output_flag: + # This input is used in independent decoder program for inference. + # The actual data shape of slf_attn_bias_flag is: + # [batch_size, max_len_in_batch, d_model] enc_output = layers.data( name=input_data_names[len(input_layers)], shape=[batch_size, max_length, d_model], @@ -493,20 +516,17 @@ def transformer( d_value, d_model, d_inner_hid, - dropout_rate, - src_pad_idx, - trg_pad_idx, - pos_pad_idx, ): - enc_input_layers = make_inputs( + dropout_rate, ): + enc_inputs = make_inputs( encoder_input_data_names, n_head, d_model, - batch_size, max_length, is_pos=True, slf_attn_bias_flag=True, src_attn_bias_flag=False, enc_output_flag=False, + data_shape_flag=True, slf_attn_shape_flag=True, src_attn_shape_flag=False) @@ -520,20 +540,18 @@ def transformer( d_model, d_inner_hid, dropout_rate, - src_pad_idx, - pos_pad_idx, - enc_input_layers, ) + enc_inputs, ) - dec_input_layers = make_inputs( + dec_inputs = make_inputs( decoder_input_data_names, n_head, d_model, - batch_size, max_length, is_pos=True, slf_attn_bias_flag=True, src_attn_bias_flag=True, enc_output_flag=False, + data_shape_flag=True, slf_attn_shape_flag=True, src_attn_shape_flag=True) @@ -547,9 +565,7 @@ def transformer( d_model, d_inner_hid, dropout_rate, - trg_pad_idx, - pos_pad_idx, - dec_input_layers, + dec_inputs, enc_output, ) # Padding index do not contribute to the total loss. The weights is used to @@ -558,17 +574,20 @@ def transformer( label_data_names, n_head, d_model, - batch_size, max_length, is_pos=False, slf_attn_bias_flag=False, src_attn_bias_flag=False, enc_output_flag=False, + data_shape_flag=False, slf_attn_shape_flag=False, src_attn_shape_flag=False) cost = layers.softmax_with_cross_entropy(logits=predict, label=gold) weighted_cost = cost * weights - return layers.reduce_sum(weighted_cost), predict + sum_cost = layers.reduce_sum(weighted_cost) + token_num = layers.reduce_sum(weights) + avg_cost = sum_cost / token_num + return sum_cost, avg_cost, predict, token_num def wrap_encoder(src_vocab_size, @@ -580,38 +599,38 @@ def wrap_encoder(src_vocab_size, d_model, d_inner_hid, dropout_rate, - src_pad_idx, - pos_pad_idx, - enc_input_layers=None): + enc_inputs=None): """ The wrapper assembles together all needed layers for the encoder. """ - if enc_input_layers is None: + if enc_inputs is None: # This is used to implement independent encoder program in inference. - src_word, src_pos, src_slf_attn_bias, slf_attn_pre_softmax_shape, \ - slf_attn_post_softmax_shape = make_inputs( + src_word, src_pos, src_slf_attn_bias, src_data_shape, \ + slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \ + make_inputs( encoder_input_data_names, n_head, d_model, - batch_size, max_length, is_pos=True, slf_attn_bias_flag=True, src_attn_bias_flag=False, enc_output_flag=False, + data_shape_flag=True, slf_attn_shape_flag=True, src_attn_shape_flag=False) else: - src_word, src_pos, src_slf_attn_bias, slf_attn_pre_softmax_shape, \ - slf_attn_post_softmax_shape = enc_input_layers + src_word, src_pos, src_slf_attn_bias, src_data_shape, \ + slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \ + enc_inputs enc_input = prepare_encoder( src_word, src_pos, src_vocab_size, d_model, - src_pad_idx, max_length, - dropout_rate, ) + dropout_rate, + src_data_shape, ) enc_output = encoder( enc_input, src_slf_attn_bias, @@ -636,44 +655,42 @@ def wrap_decoder(trg_vocab_size, d_model, d_inner_hid, dropout_rate, - trg_pad_idx, - pos_pad_idx, - dec_input_layers=None, + dec_inputs=None, enc_output=None): """ The wrapper assembles together all needed layers for the decoder. """ - if dec_input_layers is None: + if dec_inputs is None: # This is used to implement independent decoder program in inference. trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ - slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape, \ - src_attn_pre_softmax_shape, src_attn_post_softmax_shape, \ - enc_output = make_inputs( + trg_data_shape, slf_attn_pre_softmax_shape, \ + slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \ + src_attn_post_softmax_shape, enc_output = make_inputs( decoder_input_data_names, n_head, d_model, - batch_size, max_length, is_pos=True, slf_attn_bias_flag=True, src_attn_bias_flag=True, enc_output_flag=True, + data_shape_flag=True, slf_attn_shape_flag=True, src_attn_shape_flag=True) else: trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ - slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape, \ - src_attn_pre_softmax_shape, src_attn_post_softmax_shape = \ - dec_input_layers + trg_data_shape, slf_attn_pre_softmax_shape, \ + slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \ + src_attn_post_softmax_shape = dec_inputs dec_input = prepare_decoder( trg_word, trg_pos, trg_vocab_size, d_model, - trg_pad_idx, max_length, - dropout_rate, ) + dropout_rate, + trg_data_shape, ) dec_output = decoder( dec_input, enc_output, @@ -697,5 +714,5 @@ def wrap_decoder(trg_vocab_size, bias_attr=False, num_flatten_dims=2), shape=[-1, trg_vocab_size], - act="softmax" if dec_input_layers is None else None) + act="softmax" if dec_inputs is None else None) return predict diff --git a/fluid/neural_machine_translation/transformer/train.py b/fluid/neural_machine_translation/transformer/train.py index 79876a5dd352c96d337d38552f117b8cc3b2c664..ffbc4bd57a3a06531e36e3eddc142febf2c57d02 100644 --- a/fluid/neural_machine_translation/transformer/train.py +++ b/fluid/neural_machine_translation/transformer/train.py @@ -1,4 +1,5 @@ import os +import time import numpy as np import paddle @@ -14,7 +15,7 @@ def pad_batch_data(insts, pad_idx, n_head, is_target=False, - return_pos=True, + is_label=False, return_attn_bias=True, return_max_len=True): """ @@ -23,14 +24,20 @@ def pad_batch_data(insts, """ return_list = [] max_len = max(len(inst) for inst in insts) + # Any token included in dict can be used to pad, since the paddings' loss + # will be masked out by weights and make no effect on parameter gradients. inst_data = np.array( [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]) return_list += [inst_data.astype("int64").reshape([-1, 1])] - if return_pos: - inst_pos = np.array([[ - pos_i + 1 if w_i != pad_idx else 0 for pos_i, w_i in enumerate(inst) - ] for inst in inst_data]) - + if is_label: # label weight + inst_weight = np.array( + [[1.] * len(inst) + [0.] * (max_len - len(inst)) for inst in insts]) + return_list += [inst_weight.astype("float32").reshape([-1, 1])] + else: # position data + inst_pos = np.array([ + range(1, len(inst) + 1) + [0] * (max_len - len(inst)) + for inst in insts + ]) return_list += [inst_pos.astype("int64").reshape([-1, 1])] if return_attn_bias: if is_target: @@ -56,7 +63,7 @@ def pad_batch_data(insts, def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, - max_length, n_head): + n_head, d_model): """ Put all padded data needed by training into a dict. """ @@ -66,6 +73,10 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True) trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], [1, 1, trg_max_len, 1]).astype("float32") + + # These shape tensors are used in reshape_op. + src_data_shape = np.array([len(insts), src_max_len, d_model], dtype="int32") + trg_data_shape = np.array([len(insts), trg_max_len, d_model], dtype="int32") src_slf_attn_pre_softmax_shape = np.array( [-1, src_slf_attn_bias.shape[-1]], dtype="int32") src_slf_attn_post_softmax_shape = np.array( @@ -78,17 +89,24 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, [-1, trg_src_attn_bias.shape[-1]], dtype="int32") trg_src_attn_post_softmax_shape = np.array( trg_src_attn_bias.shape, dtype="int32") - lbl_word = pad_batch_data([inst[2] for inst in insts], trg_pad_idx, n_head, - False, False, False, False) - lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1]) + + lbl_word, lbl_weight = pad_batch_data( + [inst[2] for inst in insts], + trg_pad_idx, + n_head, + is_target=False, + is_label=True, + return_attn_bias=False, + return_max_len=False) + input_dict = dict( zip(input_data_names, [ - src_word, src_pos, src_slf_attn_bias, + src_word, src_pos, src_slf_attn_bias, src_data_shape, src_slf_attn_pre_softmax_shape, src_slf_attn_post_softmax_shape, trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, - trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape, - trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape, - lbl_word, lbl_weight + trg_data_shape, trg_slf_attn_pre_softmax_shape, + trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, + trg_src_attn_post_softmax_shape, lbl_word, lbl_weight ])) return input_dict @@ -97,14 +115,12 @@ def main(): place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) - cost, predict = transformer( - ModelHyperParams.src_vocab_size + 1, - ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1, - ModelHyperParams.n_layer, ModelHyperParams.n_head, - ModelHyperParams.d_key, ModelHyperParams.d_value, - ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, - ModelHyperParams.dropout, ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx) + sum_cost, avg_cost, predict, token_num = transformer( + ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, + ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, + ModelHyperParams.n_head, ModelHyperParams.d_key, + ModelHyperParams.d_value, ModelHyperParams.d_model, + ModelHyperParams.d_inner_hid, ModelHyperParams.dropout) lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model, TrainTaskConfig.warmup_steps, place, @@ -114,7 +130,7 @@ def main(): beta1=TrainTaskConfig.beta1, beta2=TrainTaskConfig.beta2, epsilon=TrainTaskConfig.eps) - optimizer.minimize(cost) + optimizer.minimize(avg_cost if TrainTaskConfig.use_avg_cost else sum_cost) train_data = paddle.batch( paddle.reader.shuffle( @@ -126,27 +142,31 @@ def main(): # Program to do validation. test_program = fluid.default_main_program().clone() with fluid.program_guard(test_program): - test_program = fluid.io.get_inference_program([cost]) + test_program = fluid.io.get_inference_program([avg_cost]) val_data = paddle.batch( paddle.dataset.wmt16.validation(ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size), batch_size=TrainTaskConfig.batch_size) def test(exe): - test_costs = [] + test_total_cost = 0 + test_total_token = 0 for batch_id, data in enumerate(val_data()): - if len(data) != TrainTaskConfig.batch_size: - continue data_input = prepare_batch_input( data, encoder_input_data_names + decoder_input_data_names[:-1] + - label_data_names, ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, ModelHyperParams.max_length, - ModelHyperParams.n_head) - test_cost = exe.run(test_program, - feed=data_input, - fetch_list=[cost])[0] - test_costs.append(test_cost) - return np.mean(test_costs) + label_data_names, ModelHyperParams.eos_idx, + ModelHyperParams.eos_idx, ModelHyperParams.n_head, + ModelHyperParams.d_model) + test_sum_cost, test_token_num = exe.run( + test_program, + feed=data_input, + fetch_list=[sum_cost, token_num], + use_program_cache=True) + test_total_cost += test_sum_cost + test_total_token += test_token_num + test_avg_cost = test_total_cost / test_total_token + test_ppl = np.exp([min(test_avg_cost, 100)]) + return test_avg_cost, test_ppl # Initialize the parameters. exe.run(fluid.framework.default_startup_program()) @@ -158,27 +178,30 @@ def main(): ModelHyperParams.d_model), place) for pass_id in xrange(TrainTaskConfig.pass_num): + pass_start_time = time.time() for batch_id, data in enumerate(train_data()): - # The current program desc is coupled with batch_size, thus all - # mini-batches must have the same number of instances currently. if len(data) != TrainTaskConfig.batch_size: continue data_input = prepare_batch_input( data, encoder_input_data_names + decoder_input_data_names[:-1] + - label_data_names, ModelHyperParams.src_pad_idx, - ModelHyperParams.trg_pad_idx, ModelHyperParams.max_length, - ModelHyperParams.n_head) + label_data_names, ModelHyperParams.eos_idx, + ModelHyperParams.eos_idx, ModelHyperParams.n_head, + ModelHyperParams.d_model) lr_scheduler.update_learning_rate(data_input) outs = exe.run(fluid.framework.default_main_program(), feed=data_input, - fetch_list=[cost], + fetch_list=[sum_cost, avg_cost], use_program_cache=True) - cost_val = np.array(outs[0]) - print("pass_id = " + str(pass_id) + " batch = " + str(batch_id) + - " cost = " + str(cost_val)) + sum_cost_val, avg_cost_val = np.array(outs[0]), np.array(outs[1]) + print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" % + (pass_id, batch_id, sum_cost_val, avg_cost_val, + np.exp([min(avg_cost_val[0], 100)]))) # Validate and save the model for inference. - val_cost = test(exe) - print("pass_id = " + str(pass_id) + " val_cost = " + str(val_cost)) + val_avg_cost, val_ppl = test(exe) + pass_end_time = time.time() + time_consumed = pass_end_time - pass_start_time + print("epoch: %d, val avg loss: %f, val ppl: %f, " + "consumed %fs" % (pass_id, val_avg_cost, val_ppl, time_consumed)) fluid.io.save_inference_model( os.path.join(TrainTaskConfig.model_dir, "pass_" + str(pass_id) + ".infer.model"), diff --git a/fluid/object_detection/.gitignore b/fluid/object_detection/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..3321aa105e8c63b5ba915782fd69bc90debbf56c --- /dev/null +++ b/fluid/object_detection/.gitignore @@ -0,0 +1,8 @@ +./data/pascalvoc/VOCdevkit/ +data/pascalvoc/test.txt +data/pascalvoc/trainval.txt +pretrained/ssd_mobilenet_v1_coco.tar.gz +pretrained/ssd_mobilenet_v1_coco +pretrained/mobilenet_v1_imagenet.tar.gz +pretrained/mobilenet_v1_imagenet +log* diff --git a/fluid/object_detection/data/prepare_voc_data.py b/fluid/object_detection/data/pascalvoc/create_list.py similarity index 96% rename from fluid/object_detection/data/prepare_voc_data.py rename to fluid/object_detection/data/pascalvoc/create_list.py index a652956e91ab8277bc6670d4dc85905fc52a3203..1f53b182fdab937c250945fdb8ee1da8cd85f46e 100644 --- a/fluid/object_detection/data/prepare_voc_data.py +++ b/fluid/object_detection/data/pascalvoc/create_list.py @@ -60,4 +60,5 @@ def prepare_filelist(devkit_dir, years, output_dir): ftest.write(item[0] + ' ' + item[1] + '\n') -prepare_filelist(devkit_dir, years, '.') +if __name__ == '__main__': + prepare_filelist(devkit_dir, years, '.') diff --git a/fluid/object_detection/data/pascalvoc/download.sh b/fluid/object_detection/data/pascalvoc/download.sh new file mode 100644 index 0000000000000000000000000000000000000000..55bbb0e5a43f937ee478c9502444b22c493890ae --- /dev/null +++ b/fluid/object_detection/data/pascalvoc/download.sh @@ -0,0 +1,16 @@ +DIR="$( cd "$(dirname "$0")" ; pwd -P )" +cd "$DIR" + +# Download the data. +echo "Downloading..." +wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar +wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar +wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar +# Extract the data. +echo "Extractint..." +tar -xf VOCtrainval_11-May-2012.tar +tar -xf VOCtrainval_06-Nov-2007.tar +tar -xf VOCtest_06-Nov-2007.tar + +echo "Creating data lists..." +python create_list.py diff --git a/fluid/object_detection/data/label_list b/fluid/object_detection/data/pascalvoc/label_list similarity index 100% rename from fluid/object_detection/data/label_list rename to fluid/object_detection/data/pascalvoc/label_list diff --git a/fluid/object_detection/image_util.py b/fluid/object_detection/image_util.py index e538449aa9f9ce15e7730de293a98fe753403a87..b8464cfe8745b33249a8da3427689aec6904cd99 100644 --- a/fluid/object_detection/image_util.py +++ b/fluid/object_detection/image_util.py @@ -85,8 +85,7 @@ def satisfy_sample_constraint(sampler, sample_bbox, bbox_labels): return False -def generate_batch_samples(batch_sampler, bbox_labels, image_width, - image_height): +def generate_batch_samples(batch_sampler, bbox_labels): sampled_bbox = [] index = [] c = 0 @@ -217,8 +216,8 @@ def distort_image(img, settings): def expand_image(img, bbox_labels, img_width, img_height, settings): prob = random.uniform(0, 1) if prob < settings._expand_prob: - expand_ratio = random.uniform(1, settings._expand_max_ratio) - if expand_ratio - 1 >= 0.01: + if _expand_max_ratio - 1 >= 0.01: + expand_ratio = random.uniform(1, settings._expand_max_ratio) height = int(img_height * expand_ratio) width = int(img_width * expand_ratio) h_off = math.floor(random.uniform(0, height - img_height)) @@ -231,5 +230,5 @@ def expand_image(img, bbox_labels, img_width, img_height, settings): expand_img = Image.fromarray(expand_img) expand_img.paste(img, (int(w_off), int(h_off))) bbox_labels = transform_labels(bbox_labels, expand_bbox) - return expand_img, bbox_labels - return img, bbox_labels + return expand_img, bbox_labels, width, height + return img, bbox_labels, img_width, img_height diff --git a/fluid/object_detection/load_model.py b/fluid/object_detection/load_model.py deleted file mode 100644 index 8c7389efea33699b2f90243311ff89747f831d06..0000000000000000000000000000000000000000 --- a/fluid/object_detection/load_model.py +++ /dev/null @@ -1,67 +0,0 @@ -import paddle.v2 as paddle -import paddle.fluid as fluid -import numpy as np - - -# From npy -def load_vars(): - vars = {} - name_map = {} - with open('./ssd_mobilenet_v1_coco/names.map', 'r') as map_file: - for param in map_file: - fd_name, tf_name = param.strip().split('\t') - name_map[fd_name] = tf_name - - tf_vars = np.load( - './ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco_2017_11_17.npy').item() - for fd_name in name_map: - tf_name = name_map[fd_name] - tf_var = tf_vars[tf_name] - if len(tf_var.shape) == 4 and 'depthwise' in tf_name: - vars[fd_name] = np.transpose(tf_var, (2, 3, 0, 1)) - elif len(tf_var.shape) == 4: - vars[fd_name] = np.transpose(tf_var, (3, 2, 0, 1)) - else: - vars[fd_name] = tf_var - - return vars - - -def load_and_set_vars(place): - vars = load_vars() - for k, v in vars.items(): - t = fluid.global_scope().find_var(k).get_tensor() - #print(np.array(t).shape, v.shape, k) - assert np.array(t).shape == v.shape - t.set(v, place) - - -# From Paddle V1 -def load_paddlev1_vars(place): - vars = {} - name_map = {} - with open('./caffe2paddle/names.map', 'r') as map_file: - for param in map_file: - fd_name, tf_name = param.strip().split('\t') - name_map[fd_name] = tf_name - - from operator import mul - - def load(file_name, shape): - with open(file_name, 'rb') as f: - f.read(16) - arr = np.fromfile(f, dtype=np.float32) - #print(arr.size, reduce(mul, shape), file_name) - assert arr.size == reduce(mul, shape) - return arr.reshape(shape) - - for fd_name in name_map: - v1_name = name_map[fd_name] - t = fluid.global_scope().find_var(fd_name).get_tensor() - shape = np.array(t).shape - v1_var = load('./caffe2paddle/' + v1_name, shape) - t.set(v1_var, place) - - -if __name__ == "__main__": - load_vars() diff --git a/fluid/object_detection/mobilenet_ssd.py b/fluid/object_detection/mobilenet_ssd.py index 21869647aa261a1baacbe934453c8af4416b75b7..c39883196056aede5d410554e14a0198e540d754 100644 --- a/fluid/object_detection/mobilenet_ssd.py +++ b/fluid/object_detection/mobilenet_ssd.py @@ -27,12 +27,7 @@ def conv_bn(input, bias_attr=False) parameter_attr = ParamAttr(learning_rate=0.1, initializer=MSRA()) bias_attr = ParamAttr(learning_rate=0.2) - return fluid.layers.batch_norm( - input=conv, - act=act, - epsilon=0.00001, - param_attr=parameter_attr, - bias_attr=bias_attr) + return fluid.layers.batch_norm(input=conv, act=act) def depthwise_separable(input, num_filters1, num_filters2, num_groups, stride, @@ -76,7 +71,7 @@ def extra_block(input, num_filters1, num_filters2, num_groups, stride, scale): return normal_conv -def mobile_net(img, img_shape, scale=1.0): +def mobile_net(num_classes, img, img_shape, scale=1.0): # 300x300 tmp = conv_bn(img, 3, int(32 * scale), 2, 1, 3) # 150x150 @@ -104,10 +99,11 @@ def mobile_net(img, img_shape, scale=1.0): module16 = extra_block(module15, 128, 256, 1, 2, scale) # 2x2 module17 = extra_block(module16, 64, 128, 1, 2, scale) + mbox_locs, mbox_confs, box, box_var = fluid.layers.multi_box_head( inputs=[module11, module13, module14, module15, module16, module17], image=img, - num_classes=21, + num_classes=num_classes, min_ratio=20, max_ratio=90, min_sizes=[60.0, 105.0, 150.0, 195.0, 240.0, 285.0], diff --git a/fluid/object_detection/pretrained/download_coco.sh b/fluid/object_detection/pretrained/download_coco.sh new file mode 100644 index 0000000000000000000000000000000000000000..1cd1836f7c6e32f9f308a0c9a29d10efbc6f183f --- /dev/null +++ b/fluid/object_detection/pretrained/download_coco.sh @@ -0,0 +1,8 @@ +DIR="$( cd "$(dirname "$0")" ; pwd -P )" +cd "$DIR" + +# Download the data. +echo "Downloading..." +wget http://paddlemodels.bj.bcebos.com/ssd_mobilenet_v1_coco.tar.gz +echo "Extractint..." +tar -xf ssd_mobilenet_v1_coco.tar.gz diff --git a/fluid/object_detection/pretrained/download_imagenet.sh b/fluid/object_detection/pretrained/download_imagenet.sh new file mode 100644 index 0000000000000000000000000000000000000000..eb7c6767d9f9585342c2ba89a2f28f070d1351c2 --- /dev/null +++ b/fluid/object_detection/pretrained/download_imagenet.sh @@ -0,0 +1,8 @@ +DIR="$( cd "$(dirname "$0")" ; pwd -P )" +cd "$DIR" + +# Download the data. +echo "Downloading..." +wget http://paddlemodels.bj.bcebos.com/mobilenet_v1_imagenet.tar.gz +echo "Extractint..." +tar -xf mobilenet_v1_imagenet.tar.gz diff --git a/fluid/object_detection/reader.py b/fluid/object_detection/reader.py index 6a6beb6e50f5b0a7f6b969ca53868178db2527a6..43c54b4c4f0ed84c35ba98f84a76cf390fd47afd 100644 --- a/fluid/object_detection/reader.py +++ b/fluid/object_detection/reader.py @@ -16,19 +16,25 @@ import image_util from paddle.utils.image_util import * import random from PIL import Image +from PIL import ImageDraw import numpy as np import xml.etree.ElementTree import os +import time +import copy class Settings(object): - def __init__(self, data_dir, label_file, resize_h, resize_w, mean_value, - apply_distort, apply_expand): + def __init__(self, dataset, toy, data_dir, label_file, resize_h, resize_w, + mean_value, apply_distort, apply_expand): + self._dataset = dataset + self._toy = toy self._data_dir = data_dir - self._label_list = [] - label_fpath = os.path.join(data_dir, label_file) - for line in open(label_fpath): - self._label_list.append(line.strip()) + if dataset == "pascalvoc": + self._label_list = [] + label_fpath = os.path.join(data_dir, label_file) + for line in open(label_fpath): + self._label_list.append(line.strip()) self._apply_distort = apply_distort self._apply_expand = apply_expand @@ -47,6 +53,14 @@ class Settings(object): self._brightness_prob = 0.5 self._brightness_delta = 0.125 + @property + def dataset(self): + return self._dataset + + @property + def toy(self): + return self._toy + @property def apply_distort(self): return self._apply_expand @@ -59,6 +73,10 @@ class Settings(object): def data_dir(self): return self._data_dir + @data_dir.setter + def data_dir(self, data_dir): + self._data_dir = data_dir + @property def label_list(self): return self._label_list @@ -78,23 +96,76 @@ class Settings(object): def _reader_creator(settings, file_list, mode, shuffle): def reader(): - with open(file_list) as flist: - lines = [line.strip() for line in flist] - if shuffle: - random.shuffle(lines) - for line in lines: + if settings.dataset == 'coco': + # cocoapi + from pycocotools.coco import COCO + from pycocotools.cocoeval import COCOeval + + coco = COCO(file_list) + image_ids = coco.getImgIds() + images = coco.loadImgs(image_ids) + category_ids = coco.getCatIds() + category_names = [ + item['name'] for item in coco.loadCats(category_ids) + ] + elif settings.dataset == 'pascalvoc': + flist = open(file_list) + images = [line.strip() for line in flist] + + if not settings.toy == 0: + images = images[:settings.toy] if len( + images) > settings.toy else images + print("{} on {} with {} images".format(mode, settings.dataset, + len(images))) + + if shuffle: + random.shuffle(images) + + for image in images: + if settings.dataset == 'coco': + image_name = image['file_name'] + image_path = os.path.join(settings.data_dir, image_name) + elif settings.dataset == 'pascalvoc': if mode == 'train' or mode == 'test': - img_path, label_path = line.split() - img_path = os.path.join(settings.data_dir, img_path) + image_path, label_path = image.split() + image_path = os.path.join(settings.data_dir, image_path) label_path = os.path.join(settings.data_dir, label_path) elif mode == 'infer': - img_path = os.path.join(settings.data_dir, line) + image_path = os.path.join(settings.data_dir, image) - img = Image.open(img_path) - img_width, img_height = img.size + img = Image.open(image_path) + if img.mode == 'L': + img = img.convert('RGB') + img_width, img_height = img.size - # layout: label | xmin | ymin | xmax | ymax | difficult - if mode == 'train' or mode == 'test': + if mode == 'train' or mode == 'test': + if settings.dataset == 'coco': + # layout: category_id | xmin | ymin | xmax | ymax | iscrowd | origin_coco_bbox | segmentation | area | image_id | annotation_id + bbox_labels = [] + annIds = coco.getAnnIds(imgIds=image['id']) + anns = coco.loadAnns(annIds) + for ann in anns: + bbox_sample = [] + # start from 1, leave 0 to background + bbox_sample.append( + float(category_ids.index(ann['category_id'])) + 1) + bbox = ann['bbox'] + xmin, ymin, w, h = bbox + xmax = xmin + w + ymax = ymin + h + bbox_sample.append(float(xmin) / img_width) + bbox_sample.append(float(ymin) / img_height) + bbox_sample.append(float(xmax) / img_width) + bbox_sample.append(float(ymax) / img_height) + bbox_sample.append(float(ann['iscrowd'])) + #bbox_sample.append(ann['bbox']) + #bbox_sample.append(ann['segmentation']) + #bbox_sample.append(ann['area']) + #bbox_sample.append(ann['image_id']) + #bbox_sample.append(ann['id']) + bbox_labels.append(bbox_sample) + elif settings.dataset == 'pascalvoc': + # layout: label | xmin | ymin | xmax | ymax | difficult bbox_labels = [] root = xml.etree.ElementTree.parse(label_path).getroot() for object in root.findall('object'): @@ -117,91 +188,136 @@ def _reader_creator(settings, file_list, mode, shuffle): bbox_sample.append(difficult) bbox_labels.append(bbox_sample) - sample_labels = bbox_labels - if mode == 'train': - if settings._apply_distort: - img = image_util.distort_image(img, settings) - if settings._apply_expand: - img, bbox_labels = image_util.expand_image( - img, bbox_labels, img_width, img_height, - settings) - batch_sampler = [] - # hard-code here - batch_sampler.append( - image_util.sampler(1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, - 0.0)) - batch_sampler.append( - image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, - 0.0)) - batch_sampler.append( - image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, - 0.0)) - batch_sampler.append( - image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, - 0.0)) - batch_sampler.append( - image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, - 0.0)) - batch_sampler.append( - image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, - 0.0)) - batch_sampler.append( - image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, - 1.0)) - """ random crop """ - sampled_bbox = image_util.generate_batch_samples( - batch_sampler, bbox_labels, img_width, img_height) - - img = np.array(img) - if len(sampled_bbox) > 0: - idx = int(random.uniform(0, len(sampled_bbox))) - img, sample_labels = image_util.crop_image( - img, bbox_labels, sampled_bbox[idx], img_width, - img_height) - - img = Image.fromarray(img) - img = img.resize((settings.resize_w, settings.resize_h), - Image.ANTIALIAS) - img = np.array(img) - + sample_labels = bbox_labels if mode == 'train': - mirror = int(random.uniform(0, 2)) - if mirror == 1: - img = img[:, ::-1, :] - for i in xrange(len(sample_labels)): - tmp = sample_labels[i][1] - sample_labels[i][1] = 1 - sample_labels[i][3] - sample_labels[i][3] = 1 - tmp - - if len(img.shape) == 3: - img = np.swapaxes(img, 1, 2) - img = np.swapaxes(img, 1, 0) - - img = img[[2, 1, 0], :, :] - img = img.astype('float32') - img -= settings.img_mean - img = img.flatten() - img = img * 0.007843 - - sample_labels = np.array(sample_labels) - if mode == 'train' or mode == 'test': - if mode == 'train' and len(sample_labels) == 0: continue - yield img.astype( - 'float32' - ), sample_labels[:, 1:5], sample_labels[:, 0].astype( - 'int32'), sample_labels[:, -1].astype('int32') - elif mode == 'infer': - yield img.astype('float32') + if settings._apply_distort: + img = image_util.distort_image(img, settings) + if settings._apply_expand: + img, bbox_labels, img_width, img_height = image_util.expand_image( + img, bbox_labels, img_width, img_height, settings) + batch_sampler = [] + # hard-code here + batch_sampler.append( + image_util.sampler(1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0)) + batch_sampler.append( + image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0)) + batch_sampler.append( + image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0)) + batch_sampler.append( + image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0)) + batch_sampler.append( + image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0)) + batch_sampler.append( + image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0)) + batch_sampler.append( + image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0)) + """ random crop """ + sampled_bbox = image_util.generate_batch_samples( + batch_sampler, bbox_labels, img_width, img_height) + + img = np.array(img) + if len(sampled_bbox) > 0: + idx = int(random.uniform(0, len(sampled_bbox))) + img, sample_labels = image_util.crop_image( + img, bbox_labels, sampled_bbox[idx], img_width, + img_height) + + img = Image.fromarray(img) + img = img.resize((settings.resize_w, settings.resize_h), + Image.ANTIALIAS) + img = np.array(img) + + if mode == 'train': + mirror = int(random.uniform(0, 2)) + if mirror == 1: + img = img[:, ::-1, :] + for i in xrange(len(sample_labels)): + tmp = sample_labels[i][1] + sample_labels[i][1] = 1 - sample_labels[i][3] + sample_labels[i][3] = 1 - tmp + + # HWC to CHW + if len(img.shape) == 3: + img = np.swapaxes(img, 1, 2) + img = np.swapaxes(img, 1, 0) + # RBG to BGR + img = img[[2, 1, 0], :, :] + img = img.astype('float32') + img -= settings.img_mean + img = img.flatten() + img = img * 0.007843 + + sample_labels = np.array(sample_labels) + if mode == 'train' or mode == 'test': + if mode == 'train' and len(sample_labels) == 0: continue + if mode == 'test' and len(sample_labels) == 0: continue + yield img.astype( + 'float32' + ), sample_labels[:, 1:5], sample_labels[:, 0].astype( + 'int32'), sample_labels[:, -1].astype('int32') + elif mode == 'infer': + yield img.astype('float32') return reader +def draw_bounding_box_on_image(image, + sample_labels, + image_name, + category_names, + color='red', + thickness=4, + with_text=True, + normalized=True): + image = Image.fromarray(image) + draw = ImageDraw.Draw(image) + im_width, im_height = image.size + if not normalized: + im_width, im_height = 1, 1 + for item in sample_labels: + label = item[0] + category_name = category_names[int(label)] + bbox = item[1:5] + xmin, ymin, xmax, ymax = bbox + (left, right, top, bottom) = (xmin * im_width, xmax * im_width, + ymin * im_height, ymax * im_height) + draw.line( + [(left, top), (left, bottom), (right, bottom), (right, top), + (left, top)], + width=thickness, + fill=color) + if with_text: + if image.mode == 'RGB': + draw.text((left, top), category_name, (255, 255, 0)) + image.save(image_name) + + def train(settings, file_list, shuffle=True): - return _reader_creator(settings, file_list, 'train', shuffle) + file_list = os.path.join(settings.data_dir, file_list) + if settings.dataset == 'coco': + train_settings = copy.copy(settings) + if '2014' in file_list: + sub_dir = "train2014" + elif '2017' in file_list: + sub_dir = "train2017" + train_settings.data_dir = os.path.join(settings.data_dir, sub_dir) + return _reader_creator(train_settings, file_list, 'train', shuffle) + elif settings.dataset == 'pascalvoc': + return _reader_creator(settings, file_list, 'train', shuffle) def test(settings, file_list): - return _reader_creator(settings, file_list, 'test', False) + file_list = os.path.join(settings.data_dir, file_list) + if settings.dataset == 'coco': + test_settings = copy.copy(settings) + if '2014' in file_list: + sub_dir = "val2014" + elif '2017' in file_list: + sub_dir = "val2017" + test_settings.data_dir = os.path.join(settings.data_dir, sub_dir) + return _reader_creator(test_settings, file_list, 'test', False) + elif settings.dataset == 'pascalvoc': + return _reader_creator(settings, file_list, 'test', False) def infer(settings, file_list): diff --git a/fluid/object_detection/train.py b/fluid/object_detection/train.py index a6c8e9e273cb2e08fc789acfdf9f92cb4e70f341..0f2856ca14cd155f600f4cf23a3403262b5bb110 100644 --- a/fluid/object_detection/train.py +++ b/fluid/object_detection/train.py @@ -1,10 +1,11 @@ -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid import reader import load_model as load_model from mobilenet_ssd import mobile_net from utility import add_arguments, print_arguments import os +import time import numpy as np import argparse import functools @@ -12,22 +13,40 @@ import functools parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable -add_arg('batch_size', int, 32, "Minibatch size.") -add_arg('parallel', bool, True, "Whether use parallel training.") -add_arg('use_gpu', bool, True, "Whether use GPU.") +add_arg('learning_rate', float, 0.001, "Learning rate.") +add_arg('batch_size', int, 32, "Minibatch size.") +add_arg('num_passes', int, 25, "Epoch number.") +add_arg('parallel', bool, True, "Whether use parallel training.") +add_arg('use_gpu', bool, True, "Whether use GPU.") +add_arg('use_nccl', bool, False, "Whether use NCCL.") +add_arg('dataset', str, 'pascalvoc', "coco or pascalvoc.") +add_arg('model_save_dir', str, 'model', "The path to save model.") +add_arg('pretrained_model', str, 'pretrained/ssd_mobilenet_v1_coco/', "The init model path.") +add_arg('apply_distort', bool, True, "Whether apply distort") +add_arg('apply_expand', bool, False, "Whether appley expand") +add_arg('resize_h', int, 300, "resize image size") +add_arg('resize_w', int, 300, "resize image size") +add_arg('mean_value_B', float, 127.5, "mean value which will be subtracted") #123.68 +add_arg('mean_value_G', float, 127.5, "mean value which will be subtracted") #116.78 +add_arg('mean_value_R', float, 127.5, "mean value which will be subtracted") #103.94 +add_arg('is_toy', int, 0, "Toy for quick debug, 0 means using all data, while n means using only n sample") # yapf: disable -def train(args, - train_file_list, - val_file_list, - data_args, - learning_rate, - batch_size, - num_passes, - model_save_dir='model', - init_model_path=None): +def parallel_do(args, + train_file_list, + val_file_list, + data_args, + learning_rate, + batch_size, + num_passes, + model_save_dir, + pretrained_model=None): image_shape = [3, data_args.resize_h, data_args.resize_w] + if data_args.dataset == 'coco': + num_classes = 81 + elif data_args.dataset == 'pascalvoc': + num_classes = 21 image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') gt_box = fluid.layers.data( @@ -39,15 +58,16 @@ def train(args, if args.parallel: places = fluid.layers.get_places() - pd = fluid.layers.ParallelDo(places) + pd = fluid.layers.ParallelDo(places, use_nccl=args.use_nccl) with pd.do(): image_ = pd.read_input(image) gt_box_ = pd.read_input(gt_box) gt_label_ = pd.read_input(gt_label) difficult_ = pd.read_input(difficult) - locs, confs, box, box_var = mobile_net(image_, image_shape) - loss = fluid.layers.ssd_loss(locs, confs, gt_box_, gt_label_, - box, box_var) + locs, confs, box, box_var = mobile_net(num_classes, image_, + image_shape) + loss = fluid.layers.ssd_loss(locs, confs, gt_box_, gt_label_, box, + box_var) nmsed_out = fluid.layers.detection_output( locs, confs, box, box_var, nms_threshold=0.45) loss = fluid.layers.reduce_sum(loss) @@ -57,11 +77,11 @@ def train(args, loss, nmsed_out = pd() loss = fluid.layers.mean(loss) else: - locs, confs, box, box_var = mobile_net(image, image_shape) + locs, confs, box, box_var = mobile_net(num_classes, image, image_shape) nmsed_out = fluid.layers.detection_output( locs, confs, box, box_var, nms_threshold=0.45) - loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, - box, box_var) + loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box, + box_var) loss = fluid.layers.reduce_sum(loss) test_program = fluid.default_main_program().clone(for_test=True) @@ -71,13 +91,20 @@ def train(args, gt_label, gt_box, difficult, - 21, + num_classes, overlap_threshold=0.5, evaluate_difficult=False, - ap_version='11point') + ap_version='integral') - boundaries = [40000, 60000] - values = [0.001, 0.0005, 0.00025] + if data_args.dataset == 'coco': + # learning rate decay in 12, 19 pass, respectively + if '2014' in train_file_list: + boundaries = [82783 / batch_size * 12, 82783 / batch_size * 19] + elif '2017' in train_file_list: + boundaries = [118287 / batch_size * 12, 118287 / batch_size * 19] + elif data_args.dataset == 'pascalvoc': + boundaries = [40000, 60000] + values = [learning_rate, learning_rate * 0.5, learning_rate * 0.25] optimizer = fluid.optimizer.RMSProp( learning_rate=fluid.layers.piecewise_decay(boundaries, values), regularization=fluid.regularizer.L2Decay(0.00005), ) @@ -88,8 +115,11 @@ def train(args, exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - load_model.load_and_set_vars(place) - #load_model.load_paddlev1_vars(place) + if pretrained_model: + def if_exist(var): + return os.path.exists(os.path.join(pretrained_model, var.name)) + fluid.io.load_vars(exe, pretrained_model, predicate=if_exist) + train_reader = paddle.batch( reader.train(data_args, train_file_list), batch_size=batch_size) test_reader = paddle.batch( @@ -108,37 +138,167 @@ def train(args, print("Test {0}, map {1}".format(pass_id, test_map[0])) for pass_id in range(num_passes): + start_time = time.time() + prev_start_time = start_time + end_time = 0 for batch_id, data in enumerate(train_reader()): + prev_start_time = start_time + start_time = time.time() loss_v = exe.run(fluid.default_main_program(), feed=feeder.feed(data), fetch_list=[loss]) + end_time = time.time() if batch_id % 20 == 0: - print("Pass {0}, batch {1}, loss {2}" - .format(pass_id, batch_id, loss_v[0])) + print("Pass {0}, batch {1}, loss {2}, time {3}".format( + pass_id, batch_id, loss_v[0], start_time - prev_start_time)) test(pass_id) - if pass_id % 10 == 0: + if pass_id % 10 == 0 or pass_id == num_passes - 1: model_path = os.path.join(model_save_dir, str(pass_id)) print 'save models to %s' % (model_path) - fluid.io.save_inference_model(model_path, ['image'], [nmsed_out], - exe) + fluid.io.save_persistables(exe, model_path) + + +def parallel_exe(args, + train_file_list, + val_file_list, + data_args, + learning_rate, + batch_size, + num_passes, + model_save_dir='model', + pretrained_model=None): + image_shape = [3, data_args.resize_h, data_args.resize_w] + if data_args.dataset == 'coco': + num_classes = 81 + elif data_args.dataset == 'pascalvoc': + num_classes = 21 + + image = fluid.layers.data(name='image', shape=image_shape, dtype='float32') + gt_box = fluid.layers.data( + name='gt_box', shape=[4], dtype='float32', lod_level=1) + gt_label = fluid.layers.data( + name='gt_label', shape=[1], dtype='int32', lod_level=1) + difficult = fluid.layers.data( + name='gt_difficult', shape=[1], dtype='int32', lod_level=1) + locs, confs, box, box_var = mobile_net(num_classes, image, image_shape) + nmsed_out = fluid.layers.detection_output( + locs, confs, box, box_var, nms_threshold=0.45) + loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box, + box_var) + loss = fluid.layers.reduce_sum(loss) + + test_program = fluid.default_main_program().clone(for_test=True) + with fluid.program_guard(test_program): + map_eval = fluid.evaluator.DetectionMAP( + nmsed_out, + gt_label, + gt_box, + difficult, + num_classes, + overlap_threshold=0.5, + evaluate_difficult=False, + ap_version='integral') + + if data_args.dataset == 'coco': + # learning rate decay in 12, 19 pass, respectively + if '2014' in train_file_list: + boundaries = [82783 / batch_size * 12, 82783 / batch_size * 19] + elif '2017' in train_file_list: + boundaries = [118287 / batch_size * 12, 118287 / batch_size * 19] + elif data_args.dataset == 'pascalvoc': + boundaries = [40000, 60000] + values = [learning_rate, learning_rate * 0.5, learning_rate * 0.25] + optimizer = fluid.optimizer.RMSProp( + learning_rate=fluid.layers.piecewise_decay(boundaries, values), + regularization=fluid.regularizer.L2Decay(0.00005), ) + + optimizer.minimize(loss) + + place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + if pretrained_model: + def if_exist(var): + return os.path.exists(os.path.join(pretrained_model, var.name)) + fluid.io.load_vars(exe, pretrained_model, predicate=if_exist) + + train_exe = fluid.ParallelExecutor(use_cuda=args.use_gpu, + loss_name=loss.name) + + train_reader = paddle.batch( + reader.train(data_args, train_file_list), batch_size=batch_size) + test_reader = paddle.batch( + reader.test(data_args, val_file_list), batch_size=batch_size) + feeder = fluid.DataFeeder( + place=place, feed_list=[image, gt_box, gt_label, difficult]) + + def test(pass_id): + _, accum_map = map_eval.get_map_var() + map_eval.reset(exe) + test_map = None + for _, data in enumerate(test_reader()): + test_map = exe.run(test_program, + feed=feeder.feed(data), + fetch_list=[accum_map]) + print("Test {0}, map {1}".format(pass_id, test_map[0])) + + for pass_id in range(num_passes): + start_time = time.time() + prev_start_time = start_time + end_time = 0 + test(pass_id) + for batch_id, data in enumerate(train_reader()): + prev_start_time = start_time + start_time = time.time() + loss_v, = train_exe.run(fetch_list=[loss.name], + feed_dict=feeder.feed(data)) + end_time = time.time() + loss_v = np.mean(np.array(loss_v)) + if batch_id % 20 == 0: + print("Pass {0}, batch {1}, loss {2}, time {3}".format( + pass_id, batch_id, loss_v, start_time - prev_start_time)) + + if pass_id % 10 == 0 or pass_id == num_passes - 1: + model_path = os.path.join(model_save_dir, str(pass_id)) + print 'save models to %s' % (model_path) + fluid.io.save_persistables(exe, model_path) if __name__ == '__main__': args = parser.parse_args() print_arguments(args) + + data_dir = 'data/pascalvoc' + train_file_list = 'trainval.txt' + val_file_list = 'test.txt' + label_file = 'label_list' + model_save_dir = args.model_save_dir + if args.dataset == 'coco': + data_dir = './data/COCO17' + train_file_list = 'annotations/instances_train2017.json' + val_file_list = 'annotations/instances_val2017.json' + label_file = 'label_list' + data_args = reader.Settings( - data_dir='./data', - label_file='label_list', - apply_distort=True, - apply_expand=True, - resize_h=300, - resize_w=300, - mean_value=[127.5, 127.5, 127.5]) - train(args, - train_file_list='./data/trainval.txt', - val_file_list='./data/test.txt', - data_args=data_args, - learning_rate=0.001, - batch_size=args.batch_size, - num_passes=300) + dataset=args.dataset, + toy=args.is_toy, + data_dir=data_dir, + label_file=label_file, + apply_distort=args.apply_distort, + apply_expand=args.apply_expand, + resize_h=args.resize_h, + resize_w=args.resize_w, + mean_value=[args.mean_value_B, args.mean_value_G, args.mean_value_R]) + #method = parallel_do + method = parallel_exe + method(args, + train_file_list=train_file_list, + val_file_list=val_file_list, + data_args=data_args, + learning_rate=args.learning_rate, + batch_size=args.batch_size, + num_passes=args.num_passes, + model_save_dir=model_save_dir, + pretrained_model=args.pretrained_model) diff --git a/fluid/policy_gradient/brain.py b/fluid/policy_gradient/brain.py index 8387833065d89e0a61b90734771a8d9db5ac1eb4..ad556902f1f2d9b40e9ce8905373541decffa642 100644 --- a/fluid/policy_gradient/brain.py +++ b/fluid/policy_gradient/brain.py @@ -30,32 +30,28 @@ class PolicyGradient: acts = fluid.layers.data(name='acts', shape=[1], dtype='int64') vt = fluid.layers.data(name='vt', shape=[1], dtype='float32') # fc1 - fc1 = fluid.layers.fc( - input=obs, - size=10, - act="tanh" # tanh activation - ) + fc1 = fluid.layers.fc(input=obs, size=10, act="tanh") # tanh activation # fc2 - self.all_act_prob = fluid.layers.fc(input=fc1, - size=self.n_actions, - act="softmax") + all_act_prob = fluid.layers.fc(input=fc1, + size=self.n_actions, + act="softmax") + self.inferece_program = fluid.defaul_main_program().clone() # to maximize total reward (log_p * R) is to minimize -(log_p * R) neg_log_prob = fluid.layers.cross_entropy( input=self.all_act_prob, label=acts) # this is negative log of chosen action neg_log_prob_weight = fluid.layers.elementwise_mul(x=neg_log_prob, y=vt) loss = fluid.layers.reduce_mean( - x=neg_log_prob_weight) # reward guided loss + neg_log_prob_weight) # reward guided loss sgd_optimizer = fluid.optimizer.SGD(self.lr) sgd_optimizer.minimize(loss) self.exe.run(fluid.default_startup_program()) def choose_action(self, observation): - prob_weights = self.exe.run( - fluid.default_main_program().prune(self.all_act_prob), - feed={"obs": observation[np.newaxis, :]}, - fetch_list=[self.all_act_prob]) + prob_weights = self.exe.run(self.inferece_program, + feed={"obs": observation[np.newaxis, :]}, + fetch_list=[self.all_act_prob]) prob_weights = np.array(prob_weights[0]) action = np.random.choice( range(prob_weights.shape[1]),