diff --git a/fluid/DeepASR/examples/aishell/download_pretrained_model.sh b/fluid/DeepASR/examples/aishell/download_pretrained_model.sh
new file mode 100644
index 0000000000000000000000000000000000000000..a8813e241c4f6e40392dff6f173160d2bbd77175
--- /dev/null
+++ b/fluid/DeepASR/examples/aishell/download_pretrained_model.sh
@@ -0,0 +1,15 @@
+url=http://deep-asr-data.gz.bcebos.com/aishell_pretrained_model.tar.gz
+md5=7b51bde64e884f43901b7a3461ccbfa3
+
+wget -c $url
+
+echo "Checking md5 sum ..."
+md5sum_tmp=`md5sum aishell_pretrained_model.tar.gz | cut -d ' ' -f1`
+
+if [ $md5sum_tmp != $md5 ]; then
+  echo "Md5sum check failed, please remove and redownload" \
+       "aishell_pretrained_model.tar.gz."
+  exit 1
+fi
+
+tar xvf aishell_pretrained_model.tar.gz
diff --git a/fluid/DeepASR/score_error_rate.py b/fluid/DeepASR/score_error_rate.py
index 5ecbca0862e3ea5981ef9ed8537fb98fabf2f62d..dde5a2448afffcae61c4d033159a5b081e6c79e8 100644
--- a/fluid/DeepASR/score_error_rate.py
+++ b/fluid/DeepASR/score_error_rate.py
@@ -16,10 +16,18 @@ def parse_args():
         default='cer',
         choices=['cer', 'wer'],
         help="Error rate type. (default: %(default)s)")
+    parser.add_argument(
+        '--special_tokens',
+        type=str,
+        default='',
+        help="Special tokens in scoring CER, separated by space. "
+        "They shouldn't be split and should each be treated as one special "
+        "character. Example: ' ' "
+        "(default: %(default)s)")
     parser.add_argument(
         '--ref', type=str, required=True, help="The ground truth text.")
     parser.add_argument(
-        '--hyp', type=str, required=True, help="The decoding result.")
+        '--hyp', type=str, required=True, help="The decoding result text.")
     args = parser.parse_args()
     return args
@@ -31,6 +39,8 @@ if __name__ == '__main__':
     sum_errors, sum_ref_len = 0.0, 0
     sent_cnt, not_in_ref_cnt = 0, 0
+    special_tokens = args.special_tokens.split(" ")
+
     with open(args.ref, "r") as ref_txt:
         line = ref_txt.readline()
         while line:
@@ -51,6 +61,8 @@
                 continue
             if args.error_rate_type == 'cer':
+                for sp_tok in special_tokens:
+                    sent = sent.replace(sp_tok, '\0')
                 errors, ref_len = char_errors(
                     ref_dict[key].decode("utf8"),
                     sent.decode("utf8"),
diff --git a/fluid/face_detection/.gitignore b/fluid/face_detection/.gitignore
index ea3e7b052591ddb7d19525a685c13971bededf6f..0636bd5b2995e0a0fa27fe54be6ccbbb78074dca 100644
--- a/fluid/face_detection/.gitignore
+++ b/fluid/face_detection/.gitignore
@@ -9,3 +9,4 @@ log*
 output*
 pred
 eval_tools
+box*
diff --git a/fluid/face_detection/README_cn.md b/fluid/face_detection/README_cn.md
index 1213a59dba4dc7b4c001deef7e2029f45c232ff0..8987b00fb2c66daedd42993214b5c9bab42a99c3 100644
--- a/fluid/face_detection/README_cn.md
+++ b/fluid/face_detection/README_cn.md
@@ -93,7 +93,7 @@ tar -xf vgg_ilsvrc_16_fc_reduced.tar.gz && rm -f vgg_ilsvrc_16_fc_reduced.tar.gz
 `train.py` is the main entry point of the training module; an example invocation is:
 ```bash
-python -u train.py --batch_size=16 --pretrained_model=vgg_ilsvrc_16_fc_reduced
+python -u train.py --batch_size=12 --pretrained_model=vgg_ilsvrc_16_fc_reduced
 ```
 - You can set `export CUDA_VISIBLE_DEVICES=0,1,2,3` to specify which GPUs to use.
 - For more optional arguments, see:
diff --git a/fluid/face_detection/train.py b/fluid/face_detection/train.py
index b62ac26d0d7236421e80ed4396c6ed3d0f72c310..1680dc5ce06a6bd4e7dcc910a68382a6846adc77 100644
--- a/fluid/face_detection/train.py
+++ b/fluid/face_detection/train.py
@@ -16,14 +16,14 @@ add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
 add_arg('parallel', bool, True, "Whether use multi-GPU/threads or not.")
 add_arg('learning_rate', float, 0.001, "The start learning rate.")
-add_arg('batch_size', int, 16, "Minibatch size.")
+add_arg('batch_size', int, 12, "Minibatch size.")
 add_arg('num_passes', int, 160, "Epoch number.")
 add_arg('use_gpu', bool, True, "Whether use GPU.")
 add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.")
 add_arg('model_save_dir', str, 'output', "The path to save model.")
 add_arg('resize_h', int, 640, "The resized image height.")
 add_arg('resize_w', int, 640, "The resized image width.")
-add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.")
+add_arg('with_mem_opt', bool, False, "Whether to use memory optimization or not.")
 add_arg('pretrained_model', str, './vgg_ilsvrc_16_fc_reduced/', "The init model path.")
 #yapf: enable
diff --git a/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py b/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py
index 05fbd6b85c2d70124817e7c5a2d5a90e78ba7847..45b1f5303ce77de7c7f5e3a232517c26e159b2fa 100644
--- a/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py
+++ b/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py
@@ -24,15 +24,10 @@ def calc_diff(f1, f2):
     #print d2.shape
     #print d1[0, 0, 0:10, 0:10]
     #print d2[0, 0, 0:10, 0:10]
-    #d1 = d1[:, :, 1:-2, 1:-2]
-    #d2 = d2[:, :, 1:-2, 1:-2]
     d1 = d1.flatten()
     d2 = d2.flatten()
-    #print d1[:10]
-    #print d2[:10]
-
     d1_num = reduce(lambda x, y: x * y, d1.shape)
     d2_num = reduce(lambda x, y: x * y, d2.shape)
     if d1_num != d2_num:
@@ -41,7 +36,11 @@
     assert (d1_num == d2_num), "their shape is not consistent"
     try:
+        mask = np.abs(d1) >= np.abs(d2)
+        mask = mask.astype('int32')
+
         df = np.abs(d1 - d2)
+        df = df / (1.0e-10 + np.abs(d1) * mask + np.abs(d2) * (1 - mask))
         max_df = np.max(df)
         sq_df = np.mean(df * df)
         return max_df, sq_df
diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/__init__.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/__init__.py
index 703c6a0a8091df79c73465be8c52248af518f3ca..73c7bed2a4ce475c84337b813a5552abc57ab998 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/__init__.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/__init__.py
@@ -8,6 +8,12 @@ import axpy
 import flatten
 import argmax
 import reshape
+import roipooling
+import priorbox
+import permute
+import detection_out
+import normalize
+import select
 
 #custom layer import ends
diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/detection_out.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/detection_out.py
new file mode 100644
index 0000000000000000000000000000000000000000..b59930a74a28fa82b701b413556371075d6e8113
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/detection_out.py
@@ -0,0 +1,79 @@
+""" A custom layer for 'detectionout' used in the 'SSD' model to produce outputs
+    Note: Paddle's implementation of 'detectionout' applies 'flatten' and 'softmax' ops to the 'conf' input,
+    while Caffe's implementation does not. Hence, you should adjust the generated 'ssd.py' to remove the
+    'softmax' and 'flatten' ops applied to the 'conf' input.
+""" + +from .register import register + + +def detectionoutput_shape(input_shape): + """ the output shape of this layer is dynamic and not determined by 'input_shape' + + Args: + @input_shape (list of int): input shape + + Returns: + @output_shape (list of num): a list of numbers represent the output shape + """ + output_shape = [-1, 6] + return output_shape + + +def detectionoutput_layer(inputs, + name, + background_label=0, + share_location=True, + nms_param=None, + keep_top_k=100, + confidence_threshold=0.1): + """ build a layer of type 'detectionout' using fluid + + Args: + @inputs (list of variables): input fluid variables for this layer + @name (str): name for this layer + + Returns: + output (variable): output variable for this layer + """ + import paddle.fluid as fluid + + if nms_param is None: + nms_param = {"nms_threshold": 0.3, "top_k": 10, "eta": 1.0} + + mbox_conf_flatten = inputs[1] + mbox_priorbox = inputs[2] + mbox_priorbox_list = fluid.layers.split(mbox_priorbox, 2, dim=1) + pb = mbox_priorbox_list[0] + pbv = mbox_priorbox_list[1] + pb = fluid.layers.reshape(x=pb, shape=[-1, 4]) + pbv = fluid.layers.reshape(x=pbv, shape=[-1, 4]) + mbox_loc = inputs[0] + mbox_loc = fluid.layers.reshape( + x=mbox_loc, shape=[-1, mbox_conf_flatten.shape[1], 4]) + + default = {"nms_threshold": 0.3, "top_k": 10, "eta": 1.0} + fields = ['eta', 'top_k', 'nms_threshold'] + + for f in default.keys(): + if not nms_param.has_key(f): + nms_param[f] = default[f] + + nmsed_outs = fluid.layers.detection_output( + scores=mbox_conf_flatten, + loc=mbox_loc, + prior_box=pb, + prior_box_var=pbv, + background_label=background_label, + nms_threshold=nms_param["nms_threshold"], + nms_top_k=nms_param["top_k"], + keep_top_k=keep_top_k, + score_threshold=confidence_threshold, + nms_eta=nms_param["eta"]) + + return nmsed_outs + + +register( + kind='DetectionOutput', + shape=detectionoutput_shape, + layer=detectionoutput_layer) diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/flatten.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/flatten.py index 8f7af4266f7fd4b7b6e8ee868f44f1b35f35cb00..ebb97718e3294bb473752bc6235917bed0db0650 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/flatten.py +++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/flatten.py @@ -4,11 +4,6 @@ from .register import register -def import_fluid(): - import paddle.fluid as fluid - return fluid - - def flatten_shape(input_shape, axis=1, end_axis=-1): """ calculate the output shape of this layer using input shape @@ -28,7 +23,7 @@ def flatten_shape(input_shape, axis=1, end_axis=-1): start_axis += len(input_shape) if end_axis < 0: - end_axis += len(input_shape) + end_axis += len(input_shape) + 1 assert start_axis <= end_axis, 'invalid axis[%d] or end_axis[%d] params'\ % (start_axis, end_axis) @@ -52,18 +47,16 @@ def flatten_layer(input, name, axis=1, end_axis=-1): Returns: output (variable): output variable for this layer """ - fluid = import_fluid() + import paddle.fluid as fluid input_shape = list(input.shape) - dims = len(input_shape) - start_axis = axis if axis >= 0 else axis + dims - end_axis = end_axis if end_axis >= 0 else end_axis + dims - assert start_axis <= end_axis, 'invalid axis or end_axis params' - output_shape = input_shape[0:start_axis] - flat_sz = reduce(lambda a, b: a * b, input_shape[start_axis:end_axis]) - output_shape += [flat_sz] - output_shape += input_shape[end_axis:-1] + if input_shape[0] == -1: + input_shape[0] = 1 + output_shape = 
flatten_shape(input_shape, axis=axis, end_axis=end_axis)
+        output_shape[0] = -1
+    else:
+        output_shape = flatten_shape(input_shape, axis=axis, end_axis=end_axis)
 
     output = fluid.layers.reshape(input, shape=output_shape, name=name)
diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/normalize.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/normalize.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6e8c00fb126009c5724fa6509c32c2b8c96bace
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/normalize.py
@@ -0,0 +1,56 @@
+""" A custom layer for 'normalize' op
+"""
+
+from .register import register
+
+
+def normalize_shape(input_shape,
+                    across_spatial=True,
+                    scale_filler=True,
+                    eps=1e-10):
+    """ calculate the output shape of this layer using input shapes
+
+    Args:
+        @input_shape (list of tuples): input shape
+
+    Returns:
+        @output_shape (list of num): a list of numbers represent the output shape
+    """
+    output_shape = input_shape
+    return output_shape
+
+
+def normalize_layer(input,
+                    name,
+                    across_spatial=True,
+                    scale_filler=True,
+                    channel_shared=False,
+                    eps=1e-10):
+    """ build a layer of type 'normalize' using fluid
+
+    Args:
+        @inputs (list of variables): input fluid variables for this layer
+        @name (str): name for this layer
+
+    Returns:
+        output (variable): output variable for this layer
+    """
+    import paddle.fluid as fluid
+
+    param_prefix = name.split('.')[0]
+
+    assert across_spatial == False, "Only support across_spatial == False for Normalize[%s]" % (
+        name)
+    l2_norm = fluid.layers.l2_normalize(input, axis=1)  # l2 norm along channel
+
+    shape = [1] if channel_shared else [input.shape[1]]
+    scale_attr = fluid.ParamAttr(name=param_prefix + '_scale')
+    scale_param = fluid.layers.create_parameter(
+        shape=shape, dtype=input.dtype, name=name, attr=scale_attr)
+
+    out = fluid.layers.elementwise_mul(
+        x=l2_norm, y=scale_param, axis=-1 if channel_shared else 1)
+    return out
+
+
+register(kind='Normalize', shape=normalize_shape, layer=normalize_layer)
diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/permute.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/permute.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0633fd5ff6b24a47adcd765e221e916bb1508f6
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/permute.py
@@ -0,0 +1,40 @@
+""" A custom layer for 'Permute' which is equivalent to transpose in paddle
+"""
+
+from .register import register
+
+
+def permute_shape(input_shape, order):
+    """ calculate the output shape of this layer using input shapes
+
+    Args:
+        @input_shape (list of numbers): input shape
+
+    Returns:
+        @output_shape (list of num): a list of numbers represent the output shape
+    """
+    output_shape = []
+    for ii in order:
+        assert ii < len(input_shape), "invalid order for permute layer"
+        output_shape.append(input_shape[ii])
+    return output_shape
+
+
+def permute_layer(input, name, order):
+    """ build a layer of type 'permute' using fluid
+
+    Args:
+        @input (input variable): input fluid variables for this layer
+        @name (str): name for this layer
+        @order (list of int): order to permute the dims
+
+    Returns:
+        output (variable): output variable for this layer
+    """
+    import paddle.fluid as fluid
+    output = fluid.layers.transpose(input, order, name=name)
+
+    return output
+
+
+register(kind='Permute', shape=permute_shape, layer=permute_layer)
diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/priorbox.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/priorbox.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3c23fbdb17a4992f41946a9889790f0782bd7e7
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/priorbox.py
@@ -0,0 +1,100 @@
+""" A custom layer for 'priorbox' which is used in SSD to generate prior box info
+    Since the order of prior box is different between caffe and paddle,
+    we use 'slice' and 'concat' ops to align them.
+"""
+
+from .register import register
+
+
+def priorbox_shape(input_shapes, min_size, max_size=None, aspect_ratio=None):
+    """ calculate the output shape of this layer using input shapes
+
+    Args:
+        @input_shapes (list of tuples): a list of input shapes
+
+    Returns:
+        @output_shape (list of num): a list of numbers represent the output shape
+    """
+    assert len(input_shapes) == 2, "invalid inputs for Priorbox layer"
+    fc_shape = input_shapes[0]
+    N = 1
+    if max_size is not None:
+        N += 1
+    if aspect_ratio is not None:
+        N += 2 * len(aspect_ratio)
+
+    N_bbx = fc_shape[2] * fc_shape[3] * N
+    output_shape = [1, 2, 4 * N_bbx]
+    return output_shape
+
+
+def priorbox_layer(inputs,
+                   name,
+                   min_size,
+                   step,
+                   max_size=None,
+                   aspect_ratio=None,
+                   flip=True,
+                   clip=False,
+                   variance=[],
+                   offset=0.5):
+    """ build a layer of type 'Priorbox' using fluid
+
+    Args:
+        @inputs (list of variables): input fluid variables for this layer
+        @name (str): name for this layer
+
+    Returns:
+        output (variable): output variable for this layer
+    """
+    import paddle.fluid as fluid
+
+    assert len(inputs) == 2, "invalid inputs for Priorbox[%s]" % (name)
+    input = inputs[0]
+    image = inputs[1]
+    box, variance_ = fluid.layers.prior_box(
+        input,
+        image,
+        min_size,
+        max_size,
+        aspect_ratio,
+        variance,
+        flip,
+        clip, (step, step),
+        offset,
+        min_max_aspect_ratios_order=True)
+    """
+    #adjust layout when the output is not consistent with caffe's
+
+    feat_shape = list(input.shape)
+    H = feat_shape[2]
+    W = feat_shape[3]
+    box_tmp = fluid.layers.reshape(box, [H, W, -1, 4])
+    nb_prior_bbx = int(box_tmp.shape[2])
+    tensor_list = fluid.layers.split(box_tmp, nb_prior_bbx, 2)
+
+    #TODO:
+    #   current implementation for this layer is not efficient
+    #   and we should fix this bug in future when Paddle support the same prior-box layout with Caffe
+    index_list = [0]
+    index_list = index_list * nb_prior_bbx
+    index_offset = 0
+    if max_size is not None:
+        index_list[1] = -1
+        index_offset = 1
+    for ii in xrange(2 * len(aspect_ratio)):
+        index_list[ii + 1 + index_offset] = ii + 1
+
+    tensor_list_gathered = [tensor_list[ii] for ii in index_list]
+    caffe_prior_bbx = fluid.layers.concat(tensor_list_gathered, axis=2)
+    box = fluid.layers.reshape(caffe_prior_bbx, [1, 1, -1])
+    """
+
+    box = fluid.layers.reshape(box, [1, 1, -1])
+    variance_ = fluid.layers.reshape(variance_, [1, 1, -1])
+    output = fluid.layers.concat([box, variance_], axis=1)
+
+    return output
+
+
+register(kind='PriorBox', shape=priorbox_shape, layer=priorbox_layer)
diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/reshape.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/reshape.py
index 6b8d5681ec68c7a899cb3fdbd4fca0249402bfa0..da82e4d67c7cbb558c223bce528cb23c7feb91c8 100644
--- a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/reshape.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/reshape.py
@@ -68,15 +68,23 @@ def reshape_shape(input_sp,
shape, axis=0, num_axes=-1): top_dim = shape['dim'][i] if top_dim == 0: copy_axes.append(i) + copy_axis_index = start_axis + i + output_shape[copy_axis_index] = input_shape[copy_axis_index] elif top_dim == -1: assert inferred_axis == -1, "[Reshape]new shape contains multiple -1 dims" + inferred_axis = i else: constant_count *= top_dim if inferred_axis >= 0: explicit_count = constant_count - explicit_count *= count(input_shape[0:start_axis]) - explicit_count *= count(input_shape[end_axis:]) + l = input_shape[0:start_axis] + if len(l) > 0: + explicit_count *= count(l) + + l = input_shape[end_axis:] + if len(l) > 0: + explicit_count *= count(l) for i in range(len(copy_axes)): explicit_count *= output_shape[start_axis + copy_axes[i]] @@ -84,6 +92,7 @@ def reshape_shape(input_sp, shape, axis=0, num_axes=-1): assert input_count % explicit_count == 0, "[Reshape]botom count[%d] "\ "must be divisible by product of the specified dimensions[%d] "\ % (input_count, explicit_count) + output_shape[start_axis + inferred_axis] = input_count / explicit_count output_count = count(output_shape) assert output_count == input_count, "[Reshape]output count[%d] must match input count[%d]" % ( @@ -117,6 +126,7 @@ def reshape_layer(input, name, shape, axis=0, num_axes=-1): output_shape = reshape_shape(input_shape, shape, axis, num_axes) output = fluid.layers.reshape(input, shape=output_shape, name=name) + return output diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/roipooling.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/roipooling.py new file mode 100644 index 0000000000000000000000000000000000000000..ccbf24ab7e7ed624f76dff3c9392315f8020a6bf --- /dev/null +++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/roipooling.py @@ -0,0 +1,53 @@ +""" a custom layer for 'ROIPooling', maybe we should implement this in standard way. 
+    more info can be found here: http://caffe.berkeleyvision.org/tutorial/layers/ROIPooling.html
+"""
+from .register import register
+
+
+def roipooling_shape(input_shapes, pooled_h, pooled_w, spatial_scale):
+    """ calculate the output shape of this layer using input shape
+
+    Args:
+        @input_shapes (list of tuples): input shapes of the base feature map and the rois
+        @pooled_h (int): parameter from caffe's ROIPooling layer
+        @pooled_w (int): parameter from caffe's ROIPooling layer
+        @spatial_scale (float): parameter from caffe's ROIPooling layer
+
+    Returns:
+        @output_shape (list of num): a list of numbers represent the output shape
+    """
+    assert len(input_shapes) == 2, "not valid input shape for roipooling layer"
+    base_fea_shape = input_shapes[0]
+    rois_shape = input_shapes[1]
+    output_shape = base_fea_shape
+    output_shape[0] = rois_shape[0]
+    output_shape[2] = pooled_h
+    output_shape[3] = pooled_w
+    return output_shape
+
+
+def roipooling_layer(inputs, name, pooled_h, pooled_w, spatial_scale):
+    """ build a layer of type 'ROIPooling' using fluid
+
+    Args:
+        @inputs (list of variables): input fluid variables for this layer
+        @name (str): name for this layer
+        @pooled_h (int): parameter from caffe's ROIPooling layer
+        @pooled_w (int): parameter from caffe's ROIPooling layer
+        @spatial_scale (float): parameter from caffe's ROIPooling layer
+
+    Returns:
+        output (variable): output variable for this layer
+    """
+
+    import paddle.fluid as fluid
+    assert len(inputs) == 2, "not valid input shape for roipooling layer"
+    base_fea = inputs[0]
+    rois = inputs[1][:, 1:5]
+    rois_fea = fluid.layers.roi_pool(base_fea, rois, pooled_h, pooled_w,
+                                     spatial_scale)
+
+    return rois_fea
+
+
+register(kind='ROIPooling', shape=roipooling_shape, layer=roipooling_layer)
diff --git a/fluid/image_classification/caffe2fluid/kaffe/custom_layers/select.py b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/select.py
new file mode 100644
index 0000000000000000000000000000000000000000..708ac64801914fde8792e4f26edf561829063e14
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/custom_layers/select.py
@@ -0,0 +1,67 @@
+""" a custom layer for 'select', used to replace the standard 'Slice' layer
+    when converting a layer with multiple different output tensors
+"""
+from .register import register
+
+
+def select_shape(input_shape, slice_point, axis=1):
+    """ calculate the output shape of this layer using input shape
+
+    Args:
+        @input_shape (list of num): a list of number which represents the input shape
+        @slice_point (list): parameter from caffe's Slice layer
+        @axis (int): parameter from caffe's Slice layer
+
+    Returns:
+        @output_shape (list of num): a list of numbers represent the output shape
+    """
+
+    input_shape = list(input_shape)
+    start = slice_point[0]
+    if len(slice_point) == 2:
+        end = slice_point[1]
+    else:
+        end = input_shape[axis]
+
+    assert end > start, "invalid slice_point with [start:%d, end:%d]"\
+        % (start, end)
+    output_shape = input_shape
+    output_shape[axis] = end - start
+    return output_shape
+
+
+def select_layer(input, name, slice_point, axis=1):
+    """ build a layer of type 'Slice' using fluid
+
+    Args:
+        @input (variable): input fluid variable for this layer
+        @name (str): name for this layer
+        @slice_point (list): parameter from caffe's Slice layer
+        @axis (int): parameter from caffe's Slice layer
+
+    Returns:
+        output (variable): output variable for this layer
+    """
+    import paddle.fluid as fluid
+    input_shape = list(input.shape)
+
+    start = slice_point[0]
+    if len(slice_point) == 2:
+        end = slice_point[1]
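+    # a single slice_point means the selected span runs to the end of this axis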
+ else: + end = input_shape[axis] + + sections = [] + if start > 0: + sections.append(start) + + pos = len(sections) + sections.append(end - start) + if end != input_shape[axis]: + sections.append(input_shape[axis] - end) + + outputs = fluid.layers.split(input, sections, dim=axis, name=name) + return outputs[pos] + + +register(kind='Select', shape=select_shape, layer=select_layer) diff --git a/fluid/image_classification/caffe2fluid/kaffe/layers.py b/fluid/image_classification/caffe2fluid/kaffe/layers.py index f2d54c59fe8ee78840ce7d23a67694e495ceddf8..98ef6b65329dd7ba314efdd638f72313d796e39f 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/layers.py +++ b/fluid/image_classification/caffe2fluid/kaffe/layers.py @@ -16,7 +16,7 @@ LAYER_DESCRIPTORS = { 'Concat': shape_concat, 'ContrastiveLoss': shape_scalar, 'Convolution': shape_convolution, - 'Deconvolution': shape_not_implemented, + 'Deconvolution': shape_deconvolution, 'Data': shape_data, 'Dropout': shape_identity, 'DummyData': shape_data, @@ -39,6 +39,7 @@ LAYER_DESCRIPTORS = { 'Pooling': shape_pool, 'Power': shape_identity, 'ReLU': shape_identity, + 'PReLU': shape_identity, 'Scale': shape_identity, 'Sigmoid': shape_identity, 'SigmoidCrossEntropyLoss': shape_scalar, @@ -179,6 +180,11 @@ class LayerAdapter(object): @property def parameters(self): name = NodeDispatch.get_handler_name(self.kind) + if self.kind.lower() == "normalize": + name = "norm" + elif self.kind.lower() == "deconvolution": + name = "convolution" + name = '_'.join((name, 'param')) try: return getattr(self.layer, name) @@ -207,7 +213,9 @@ class LayerAdapter(object): @property def kernel_parameters(self): - assert self.kind in (NodeKind.Convolution, NodeKind.Pooling) + assert self.kind in (NodeKind.Convolution, NodeKind.Pooling,\ + NodeKind.Deconvolution) + params = self.parameters k_h = self.get_kernel_value(params.kernel_h, params.kernel_size, 0) k_w = self.get_kernel_value(params.kernel_w, params.kernel_size, 1) @@ -217,9 +225,25 @@ class LayerAdapter(object): params.stride_w, params.stride, 1, default=1) p_h = self.get_kernel_value(params.pad_h, params.pad, 0, default=0) p_w = self.get_kernel_value(params.pad_w, params.pad, 1, default=0) - return KernelParameters(k_h, k_w, s_h, s_w, p_h, p_w) - -KernelParameters = namedtuple('KernelParameters', [ - 'kernel_h', 'kernel_w', 'stride_h', 'stride_w', 'pad_h', 'pad_w' -]) + dila_h = dila_w = 1 + if self.kind in (NodeKind.Convolution, NodeKind.Deconvolution): + dila_len = len(params.dilation) + if dila_len == 2: + dila_h = params.dilation[0] + dila_w = params.dilation[1] + elif dila_len == 1: + dila_h = dila_w = params.dilation[0] + else: + assert dila_len == 0, "invalid length[%s] of dilation in convolution" % ( + dila_len) + + return KernelParameters(k_h, k_w, s_h, s_w, p_h, p_w, dila_h, dila_w) + + +KernelParameters = namedtuple( + 'KernelParameters', + [ + 'kernel_h', 'kernel_w', 'stride_h', 'stride_w', 'pad_h', 'pad_w', + 'dila_h', 'dila_w' + ], ) diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py index e8b0f2c3a91aaafcfc0951524ac64ed9723ad902..1fc98b057dbf16228c834674f5aee8c4bd123935 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py +++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py @@ -91,7 +91,7 @@ class Network(object): name = '%s_%s' % (op_name, param_name) v = fluid.global_scope().find_var(name) w = v.get_tensor() - w.set(data, place) + w.set(data.reshape(w.shape()), place) 
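+                # the converted weights may be stored in a different (size-compatible)
+                # layout, so align them with the fluid variable's shape before copying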
except ValueError: if not ignore_missing: raise @@ -144,6 +144,7 @@ class Network(object): relu=True, relu_negative_slope=0.0, padding=None, + dilation=1, group=1, biased=True): if padding is None: @@ -173,6 +174,7 @@ class Network(object): num_filters=c_o, stride=[s_h, s_w], padding=padding, + dilation=dilation, groups=group, param_attr=fluid.ParamAttr(name=prefix + "weights"), bias_attr=fluid.ParamAttr(name=prefix + "biases"), @@ -183,13 +185,71 @@ class Network(object): return output + @layer + def deconv(self, + input, + k_h, + k_w, + c_o, + s_h, + s_w, + name, + relu=True, + relu_negative_slope=0.0, + padding=None, + dilation=1, + biased=True): + if padding is None: + padding = [0, 0] + + # Get the number of channels in the input + c_i, h_i, w_i = input.shape[1:] + + fluid = import_fluid() + prefix = name + '_' + leaky_relu = False + act = 'relu' + if relu is False: + act = None + elif relu_negative_slope != 0.0: + leaky_relu = True + act = None + + p_h = padding[0] + p_w = padding[1] + h_o = (h_i - 1) * s_h - 2 * p_h + dilation * (k_h - 1) + 1 + w_o = (w_i - 1) * s_w - 2 * p_w + dilation * (k_w - 1) + 1 + output = fluid.layers.conv2d_transpose( + name=self.get_unique_output_name(name, 'conv2d_transpose'), + input=input, + num_filters=c_o, + output_size=[h_o, w_o], + filter_size=[k_h, k_w], + padding=padding, + stride=[s_h, s_w], + dilation=dilation, + param_attr=fluid.ParamAttr(name=prefix + "weights"), + bias_attr=fluid.ParamAttr(name=prefix + "biases"), + act=act) + + if leaky_relu: + output = fluid.layers.leaky_relu(output, alpha=relu_negative_slope) + + return output + @layer def relu(self, input, name): fluid = import_fluid() - output = fluid.layers.relu( - name=self.get_unique_output_name(name, 'relu'), x=input) + output = fluid.layers.relu(input) return output + @layer + def prelu(self, input, channel_shared, name): + #fluid = import_fluid() + #output = fluid.layers.relu(input) + #return output + raise NotImplementedError('prelu not implemented') + def pool(self, pool_type, input, k_h, k_w, s_h, s_w, ceil_mode, padding, name): # Get the number of channels in the input @@ -256,6 +316,12 @@ class Network(object): return fluid.layers.sigmoid( input, name=self.get_unique_output_name(name, 'sigmoid')) + @layer + def tanh(self, input, name): + fluid = import_fluid() + return fluid.layers.tanh( + input, name=self.get_unique_output_name(name, 'tanh')) + @layer def lrn(self, input, radius, alpha, beta, name, bias=1.0): fluid = import_fluid() @@ -322,7 +388,8 @@ class Network(object): name, scale_offset=True, eps=1e-5, - relu=False): + relu=False, + relu_negative_slope=0.0): # NOTE: Currently, only inference is supported fluid = import_fluid() prefix = name + '_' @@ -332,6 +399,15 @@ class Network(object): name=prefix + 'offset') mean_name = prefix + 'mean' variance_name = prefix + 'variance' + + leaky_relu = False + act = 'relu' + if relu is False: + act = None + elif relu_negative_slope != 0.0: + leaky_relu = True + act = None + output = fluid.layers.batch_norm( name=self.get_unique_output_name(name, 'batch_norm'), input=input, @@ -341,7 +417,10 @@ class Network(object): moving_mean_name=mean_name, moving_variance_name=variance_name, epsilon=eps, - act='relu' if relu is True else None) + act=act) + + if leaky_relu: + output = fluid.layers.leaky_relu(output, alpha=relu_negative_slope) return output diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py index 
02a600bcd0ac7732b5162070064cd10ff1359dc2..76a318d68de2932c83d158f38a5619043c55f0a8 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py +++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py @@ -9,21 +9,6 @@ from ..transformers import (DataInjector, DataReshaper, NodeRenamer, from . import network -def get_padding_type(kernel_params, input_shape, output_shape): - '''Translates Caffe's numeric padding to one of ('SAME', 'VALID'). - Caffe supports arbitrary padding values, while Paddle only - supports 'SAME' and 'VALID' modes. So, not all Caffe paddings - can be translated to Paddle. There are some subtleties to - how the padding edge-cases are handled. These are described here: - https://github.com/Yangqing/caffe2/blob/master/caffe2/proto/caffe2_legacy.proto - ''' - k_h, k_w, s_h, s_w, p_h, p_w = kernel_params - if p_h > 0 or p_w > 0: - return [p_h, p_w] - else: - return None - - class PaddleNode(object): '''An intermediate representation for Paddle operations.''' @@ -78,10 +63,11 @@ class PaddleMapper(NodeMapper): def get_kernel_params(self, node): kernel_params = node.layer.kernel_parameters input_shape = node.get_only_parent().output_shape - padding = get_padding_type(kernel_params, input_shape, - node.output_shape) - # Only emit the padding if it's not the default value. - padding = {'padding': padding} if padding is not None else {} + padding = [kernel_params.pad_h, kernel_params.pad_w] + if padding[0] == 0 and padding[1] == 0: + padding = {} + else: + padding = {'padding': padding} return (kernel_params, padding) def map_convolution(self, node): @@ -95,15 +81,44 @@ class PaddleMapper(NodeMapper): kwargs['group'] = group if not node.parameters.bias_term: kwargs['biased'] = False + + if kernel_params.dila_h != 1 or kernel_params.dila_w != 1: + kwargs['dilation'] = (kernel_params.dila_h, kernel_params.dila_w) + assert kernel_params.kernel_h == h assert kernel_params.kernel_w == w return MaybeActivated(node)( 'conv', kernel_params.kernel_h, kernel_params.kernel_w, c_o, kernel_params.stride_h, kernel_params.stride_w, **kwargs) + def map_deconvolution(self, node): + (kernel_params, kwargs) = self.get_kernel_params(node) + h = kernel_params.kernel_h + w = kernel_params.kernel_w + c_o = node.output_shape[1] + c_i = node.parents[0].output_shape[1] + if not node.parameters.bias_term: + kwargs['biased'] = False + + if kernel_params.dila_h != 1 or kernel_params.dila_w != 1: + kwargs['dilation'] = (kernel_params.dila_h, kernel_params.dila_w) + + assert kernel_params.kernel_h == h + assert kernel_params.kernel_w == w + return MaybeActivated(node)( + 'deconv', kernel_params.kernel_h, kernel_params.kernel_w, c_o, + kernel_params.stride_h, kernel_params.stride_w, **kwargs) + def map_relu(self, node): return PaddleNode('relu') + def map_prelu(self, node): + channel_shared = getattr(node.parameters, 'channel_shared', False) + return PaddleNode('prelu', channel_shared) + + def map_tanh(self, node): + return PaddleNode('tanh') + def map_pooling(self, node): pool_type = node.parameters.pool if pool_type == 0: diff --git a/fluid/image_classification/caffe2fluid/kaffe/shapes.py b/fluid/image_classification/caffe2fluid/kaffe/shapes.py index 379cfce6dd3d4c562fd5b89d3b13c467f65c83f8..0e00dca55f1c4df7a3ce8924836db42b00641a32 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/shapes.py +++ b/fluid/image_classification/caffe2fluid/kaffe/shapes.py @@ -6,6 +6,8 @@ from .errors import KaffeError Tensor4DShape = namedtuple('Tensor4DShape', ['batch_size', 'channels', 
'height', 'width']) +Tensor3DShape = namedtuple('Tensor3DShape', ['batch_size', 'data1', 'data2']) + Tensor2DShape = namedtuple('Tensor2DShape', ['batch_size', 'data']) ScalarShape = namedtuple('ScalarShape', ['batch_size']) @@ -14,6 +16,8 @@ ScalarShape = namedtuple('ScalarShape', ['batch_size']) def make_tensor(batch_size, d1=None, d2=None, d3=None): if d3 is not None: return Tensor4DShape(batch_size, d1, d2, d3) + elif d1 is not None and d2 is not None: + return Tensor3DShape(batch_size, d1, d2) elif d1 is not None and d2 is None: return Tensor2DShape(batch_size, d1) elif d1 is None and d2 is None and d3 is None: @@ -24,10 +28,14 @@ def make_tensor(batch_size, d1=None, d2=None, d3=None): def get_filter_output_shape(i_h, i_w, params, round_func): - o_h = (i_h + 2 * params.pad_h - params.kernel_h - ) / float(params.stride_h) + 1 - o_w = (i_w + 2 * params.pad_w - params.kernel_w - ) / float(params.stride_w) + 1 + dila_h = getattr(params, 'dila_h', 1) + dila_w = getattr(params, 'dila_w', 1) + + o_h = (i_h + 2 * params.pad_h - + (dila_h * (params.kernel_h - 1) + 1)) / float(params.stride_h) + 1 + o_w = (i_w + 2 * params.pad_w - + (dila_w * (params.kernel_w - 1) + 1)) / float(params.stride_w) + 1 + return (int(round_func(o_h)), int(round_func(o_w))) @@ -97,6 +105,34 @@ def shape_convolution(node): return get_strided_kernel_output_shape(node, math.floor) +def shape_deconvolution(node): + assert node.layer is not None + input_shape = node.get_only_parent().output_shape + h_i = input_shape.height + w_i = input_shape.width + + params = node.layer.kernel_parameters + p_h = params.pad_h + p_w = params.pad_w + + dila_h = params.dila_h + dila_w = params.dila_w + + k_h = params.kernel_h + k_w = params.kernel_w + + s_h = params.stride_h + s_w = params.stride_w + + h_o = (h_i - 1) * s_h - 2 * p_h + dila_h * (k_h - 1) + 1 + w_o = (w_i - 1) * s_w - 2 * p_w + dila_w * (k_w - 1) + 1 + + params = node.layer.parameters + has_c_o = hasattr(params, 'num_output') + c = params.num_output if has_c_o else input_shape.channels + return make_tensor(input_shape.batch_size, c, h_o, w_o) + + def shape_pool(node): global_pool = getattr(node.layer.parameters, 'global_pooling', False) if global_pool: diff --git a/fluid/image_classification/caffe2fluid/kaffe/transformers.py b/fluid/image_classification/caffe2fluid/kaffe/transformers.py index 6b53e05a57a657015662c24ae2d974d6f25e7d00..a07ad42541cd342f70a87974ec140e23a10b4efe 100644 --- a/fluid/image_classification/caffe2fluid/kaffe/transformers.py +++ b/fluid/image_classification/caffe2fluid/kaffe/transformers.py @@ -325,7 +325,8 @@ class ParameterNamer(object): for node in graph.nodes: if node.data is None: continue - if node.kind in (NodeKind.Convolution, NodeKind.InnerProduct): + if node.kind in (NodeKind.Convolution, NodeKind.InnerProduct,\ + NodeKind.Deconvolution): names = ('weights', ) if node.parameters.bias_term: names += ('biases', ) @@ -337,6 +338,8 @@ class ParameterNamer(object): names = ('scale', ) if getattr(node.parameters, 'bias_term', False): names = ('scale', 'offset') + elif node.kind == "Normalize": + names = ('scale', ) else: warn('Unhandled parameters when naming this it[%s]' % (node.kind)) diff --git a/fluid/image_classification/data/ILSVRC2012/download_imagenet2012.sh b/fluid/image_classification/data/ILSVRC2012/download_imagenet2012.sh index 947b8900bd944759437a55c20fb32bca4a1b9380..3e6e0ce6d6df0b8c5a5e7814e510eb64006ce34d 100644 --- a/fluid/image_classification/data/ILSVRC2012/download_imagenet2012.sh +++ 
b/fluid/image_classification/data/ILSVRC2012/download_imagenet2012.sh @@ -34,7 +34,7 @@ tar xf ${valid_tar} -C ${valid_folder} echo "Download imagenet label file: val_list.txt & train_list.txt" label_file=ImageNet_label.tgz -label_url=http://imagenet-data.bj.bcebos.com/${label_file} +label_url=http://paddle-imagenet-models.bj.bcebos.com/${label_file} wget -nd -c ${label_url} tar zxf ${label_file} diff --git a/fluid/image_classification/reader.py b/fluid/image_classification/reader.py index b503b67ce09fba80bc49a07665ba0290e75f1ed1..3ea26ccbfead97dc97e8858ee05a6582f2b3bc9e 100644 --- a/fluid/image_classification/reader.py +++ b/fluid/image_classification/reader.py @@ -160,5 +160,5 @@ def val(file_list=TEST_LIST): return _reader_creator(file_list, 'val', shuffle=False) -def test(file_list): +def test(file_list=TEST_LIST): return _reader_creator(file_list, 'test', shuffle=False) diff --git a/fluid/image_classification/train.py b/fluid/image_classification/train.py index 74588e21c93e40ee7f5bcde7d6cbbc7c873278ba..51bf9901246cb554baaef22a8e526d0ecd81bd0a 100644 --- a/fluid/image_classification/train.py +++ b/fluid/image_classification/train.py @@ -157,7 +157,8 @@ def train(args): test_reader = paddle.batch(reader.val(), batch_size=test_batch_size) feeder = fluid.DataFeeder(place=place, feed_list=[image, label]) - train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) + train_exe = fluid.ParallelExecutor( + use_cuda=True if args.use_gpu else False, loss_name=avg_cost.name) fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name] diff --git a/fluid/language_model/.run_ce.sh b/fluid/language_model/.run_ce.sh new file mode 100644 index 0000000000000000000000000000000000000000..5ee2d8aa0582b2b8504f9ba645b6252aa75f23bf --- /dev/null +++ b/fluid/language_model/.run_ce.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + +cudaid=${language_model:=0} # use 0-th card as default +export CUDA_VISIBLE_DEVICES=$cudaid + +FLAGS_benchmark=true python train.py --enable_ce | python _ce.py + +cudaid=${language_model_m:=0,1,2,3} # use 0,1,2,3 card as default +export CUDA_VISIBLE_DEVICES=$cudaid + +FLAGS_benchmark=true python train.py --enable_ce | python _ce.py diff --git a/fluid/language_model/_ce.py b/fluid/language_model/_ce.py new file mode 100644 index 0000000000000000000000000000000000000000..d4999d7a1e14e333f1c7056b3dc2c5b506682ec6 --- /dev/null +++ b/fluid/language_model/_ce.py @@ -0,0 +1,62 @@ +# this file is only used for continuous evaluation test! + +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import CostKpi +from kpi import DurationKpi + +imikolov_20_avg_ppl_kpi = CostKpi('imikolov_20_avg_ppl', 0.2, 0) +imikolov_20_pass_duration_kpi = DurationKpi( + 'imikolov_20_pass_duration', 0.02, 0, actived=True) +imikolov_20_avg_ppl_kpi_card4 = CostKpi('imikolov_20_avg_ppl_card4', 0.2, 0) +imikolov_20_pass_duration_kpi_card4 = DurationKpi( + 'imikolov_20_pass_duration_card4', 0.03, 0, actived=True) + +tracking_kpis = [ + imikolov_20_avg_ppl_kpi, + imikolov_20_pass_duration_kpi, + imikolov_20_avg_ppl_kpi_card4, + imikolov_20_pass_duration_kpi_card4, +] + + +def parse_log(log): + ''' + This method should be implemented by model developers. 
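+    (Only lines of the exact tab-separated form "kpis\t<kpi_name>\t<kpi_value>"
+    are picked up by the loop below.)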
+
+    The suggestion:
+
+    each line in the log should be key, value, for example:
+
+    "
+    train_cost\t1.0
+    test_cost\t1.0
+    train_cost\t1.0
+    train_cost\t1.0
+    train_acc\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    log_to_ce(log)
diff --git a/fluid/language_model/train.py b/fluid/language_model/train.py
index 59fc3a987746af7aec9b61b5c817400b6b6546d0..f3e7a7398bf13e14c74ce1d10d90b7bf34031698 100644
--- a/fluid/language_model/train.py
+++ b/fluid/language_model/train.py
@@ -1,14 +1,28 @@
+import os
 import sys
 import time
 import numpy as np
 import math
-
+import argparse
 import paddle.fluid as fluid
-import paddle.v2 as paddle
+import paddle
 import utils
+
+SEED = 102
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("language_model benchmark.")
+    parser.add_argument(
+        '--enable_ce',
+        action='store_true',
+        help='If set, run the task with continuous evaluation logs.')
+    args = parser.parse_args()
+    return args
+
 
 def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
     """ network definition """
@@ -63,31 +77,26 @@ def train(train_reader,
           init_low_bound=-0.04,
           init_high_bound=0.04):
     """ train network """
+
+    args = parse_args()
+    if args.enable_ce:
+        # the random seed must be set before configuring the network.
+        fluid.default_startup_program().random_seed = SEED
     vocab_size = len(vocab)
+
+    # Input data
     src_wordseq = fluid.layers.data(
         name="src_wordseq", shape=[1], dtype="int64", lod_level=1)
     dst_wordseq = fluid.layers.data(
         name="dst_wordseq", shape=[1], dtype="int64", lod_level=1)
+
+    # Train program
     avg_cost = None
-    if not parallel:
-        cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
-                       init_low_bound, init_high_bound)
-        avg_cost = fluid.layers.mean(x=cost)
-    else:
-        places = fluid.layers.get_places()
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            cost = network(
-                pd.read_input(src_wordseq),
-                pd.read_input(dst_wordseq), vocab_size, hid_size,
-                init_low_bound, init_high_bound)
-            pd.write_output(cost)
-
-        cost = pd()
-        avg_cost = fluid.layers.mean(x=cost)
+    cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
+                   init_low_bound, init_high_bound)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Optimization to minimize loss
     sgd_optimizer = fluid.optimizer.SGD(
         learning_rate=fluid.layers.exponential_decay(
             learning_rate=base_lr,
@@ -96,39 +105,56 @@ def train(train_reader,
             staircase=True))
     sgd_optimizer.minimize(avg_cost)
 
+    # Initialize executor
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place)
-
     exe.run(fluid.default_startup_program())
+
+    train_exe = fluid.ParallelExecutor(use_cuda=use_cuda, loss_name=avg_cost.name)
+    total_time = 0.0
+    fetch_list = [avg_cost.name]
     for pass_idx in xrange(pass_num):
         epoch_idx = pass_idx + 1
         print "epoch_%d start" % epoch_idx
         t0 = time.time()
         i = 0
+        newest_ppl = 0
         for data in train_reader():
             i += 1
             lod_src_wordseq = utils.to_lodtensor(
                 map(lambda x: x[0], data), place)
             lod_dst_wordseq = utils.to_lodtensor(
                 map(lambda x: x[1], data), place)
-            ret_avg_cost = exe.run(fluid.default_main_program(),
-                                   feed={
-                                       "src_wordseq": lod_src_wordseq,
-                                       "dst_wordseq": lod_dst_wordseq
-                                   },
-                                   fetch_list=[avg_cost],
-                                   use_program_cache=True)
-            avg_ppl = math.exp(ret_avg_cost[0])
+            ret_avg_cost = train_exe.run(feed={
+                "src_wordseq": lod_src_wordseq,
+                "dst_wordseq": lod_dst_wordseq
+            },
+                                         fetch_list=fetch_list)
+            avg_ppl = np.exp(ret_avg_cost[0])
+            newest_ppl = np.mean(avg_ppl)
             if i % 100 == 0:
-                print "step:%d ppl:%.3f" % (i, avg_ppl)
+                print "step:%d ppl:%.3f" % (i, newest_ppl)
 
         t1 = time.time()
         total_time += t1 - t0
         print "epoch:%d num_steps:%d time_cost(s):%f" % (epoch_idx, i,
                                                          total_time / epoch_idx)
+
+        if pass_idx == pass_num - 1 and args.enable_ce:
+            # Note: the following logs are special for CE monitoring.
+            # Other situations do not need to care about these logs.
+            gpu_num = get_cards(args.enable_ce)
+            if gpu_num == 1:
+                print("kpis imikolov_20_pass_duration %s" %
+                      (total_time / epoch_idx))
+                print("kpis imikolov_20_avg_ppl %s" % newest_ppl)
+            else:
+                print("kpis imikolov_20_pass_duration_card%s %s" % \
+                      (gpu_num, total_time / epoch_idx))
+                print("kpis imikolov_20_avg_ppl_card%s %s" %
+                      (gpu_num, newest_ppl))
         save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
         feed_var_names = ["src_wordseq", "dst_wordseq"]
         fetch_vars = [avg_cost]
@@ -138,11 +164,22 @@ def train(train_reader,
     print("finish training")
 
 
+def get_cards(enable_ce):
+    if enable_ce:
+        cards = os.environ.get('CUDA_VISIBLE_DEVICES')
+        num = len(cards.split(","))
+        return num
+    else:
+        return fluid.core.get_cuda_device_count()
+
+
 def train_net():
     """ do training """
     batch_size = 20
+    args = parse_args()
     vocab, train_reader, test_reader = utils.prepare_data(
-        batch_size=batch_size, buffer_size=1000, word_freq_threshold=0)
+        batch_size=batch_size * get_cards(args.enable_ce), buffer_size=1000, \
+        word_freq_threshold=0, enable_ce=args.enable_ce)
     train(
         train_reader=train_reader,
         vocab=vocab,
@@ -152,7 +189,7 @@ def train_net():
         batch_size=batch_size,
         pass_num=12,
         use_cuda=True,
-        parallel=False,
+        parallel=True,
         model_dir="model",
         init_low_bound=-0.1,
         init_high_bound=0.1)
diff --git a/fluid/language_model/utils.py b/fluid/language_model/utils.py
index c5909046176586556a2aedba5dd5d12810b3ea8d..dd03a89835e620dc8432a6ca16392fc5173a12d4 100644
--- a/fluid/language_model/utils.py
+++ b/fluid/language_model/utils.py
@@ -3,7 +3,7 @@ import time
 import numpy as np
 import paddle.fluid as fluid
-import paddle.v2 as paddle
+import paddle
 
 
 def to_lodtensor(data, place):
@@ -22,17 +22,28 @@ def to_lodtensor(data, place):
     return res
 
 
-def prepare_data(batch_size, buffer_size=1000, word_freq_threshold=0):
+def prepare_data(batch_size,
+                 buffer_size=1000,
+                 word_freq_threshold=0,
+                 enable_ce=False):
     """ prepare the English Pann Treebank (PTB) data """
     vocab = paddle.dataset.imikolov.build_dict(word_freq_threshold)
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
+    if enable_ce:
+        train_reader = paddle.batch(
             paddle.dataset.imikolov.train(
                 vocab,
                 buffer_size,
                 data_type=paddle.dataset.imikolov.DataType.SEQ),
-            buf_size=buffer_size),
-        batch_size)
+            batch_size)
+    else:
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.imikolov.train(
+                    vocab,
+                    buffer_size,
+                    data_type=paddle.dataset.imikolov.DataType.SEQ),
+                buf_size=buffer_size),
+            batch_size)
     test_reader = paddle.batch(
         paddle.dataset.imikolov.test(
             vocab, buffer_size, data_type=paddle.dataset.imikolov.DataType.SEQ),
diff --git a/fluid/mnist/.run_ce.sh b/fluid/mnist/.run_ce.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d6ccf429b52da1ff26ac02df5af287461a823a98
--- /dev/null
+++ b/fluid/mnist/.run_ce.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# This file is only used for continuous evaluation.
+
+rm -rf *_factor.txt
+model_file='model.py'
+python $model_file --batch_size 128 --pass_num 5 --device CPU | python _ce.py
diff --git a/fluid/mnist/_ce.py b/fluid/mnist/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c2dba53526d2e976252fce05c7ff7f0f44b39b2
--- /dev/null
+++ b/fluid/mnist/_ce.py
@@ -0,0 +1,61 @@
+# this file is only used for continuous evaluation test!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+# NOTE kpi.py should be shared in models in some way!!!!
+
+train_cost_kpi = CostKpi('train_cost', 0.02, actived=True)
+test_acc_kpi = AccKpi('test_acc', 0.005, actived=True)
+train_duration_kpi = DurationKpi('train_duration', 0.06, actived=True)
+train_acc_kpi = AccKpi('train_acc', 0.005, actived=True)
+
+tracking_kpis = [
+    train_acc_kpi,
+    train_cost_kpi,
+    test_acc_kpi,
+    train_duration_kpi,
+]
+
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    The suggestion:
+
+    each line in the log should be key, value, for example:
+
+    "
+    train_cost\t1.0
+    test_cost\t1.0
+    train_cost\t1.0
+    train_cost\t1.0
+    train_acc\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    log_to_ce(log)
diff --git a/fluid/mnist/model.py b/fluid/mnist/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..e719cca404092fdf96484093d39f2c7c54cd9988
--- /dev/null
+++ b/fluid/mnist/model.py
@@ -0,0 +1,199 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import cProfile
+import time
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+
+SEED = 90
+DTYPE = "float32"
+
+# the random seed must be set before configuring the network.
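+# (a fixed seed keeps parameter initialization deterministic, so CE runs stay comparable)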
+fluid.default_startup_program().random_seed = SEED
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("mnist model benchmark.")
+    parser.add_argument(
+        '--batch_size', type=int, default=128, help='The minibatch size.')
+    parser.add_argument(
+        '--iterations', type=int, default=35, help='The number of minibatches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=5, help='The number of passes.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    parser.add_argument(
+        '--use_cprof', action='store_true', help='If set, use cProfile.')
+    parser.add_argument(
+        '--use_nvprof',
+        action='store_true',
+        help='If set, use nvprof for CUDA.')
+    args = parser.parse_args()
+    return args
+
+
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def cnn_model(data):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=data,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+
+    # TODO(dzhwinter) : refine the initializer and random seed setting
+    SIZE = 10
+    input_shape = conv_pool_2.shape
+    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
+    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
+
+    predict = fluid.layers.fc(
+        input=conv_pool_2,
+        size=SIZE,
+        act="softmax",
+        param_attr=fluid.param_attr.ParamAttr(
+            initializer=fluid.initializer.NormalInitializer(
+                loc=0.0, scale=scale)))
+    return predict
+
+
+def eval_test(exe, batch_acc, batch_size_tensor, inference_program):
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=args.batch_size)
+    test_pass_acc = fluid.average.WeightedAverage()
+    for batch_id, data in enumerate(test_reader()):
+        img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
+                                data)).astype(DTYPE)
+        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+        y_data = y_data.reshape([len(y_data), 1])
+
+        acc, weight = exe.run(inference_program,
+                              feed={"pixel": img_data,
+                                    "label": y_data},
+                              fetch_list=[batch_acc, batch_size_tensor])
+        test_pass_acc.add(value=acc, weight=weight)
+    pass_acc = test_pass_acc.eval()
+    return pass_acc
+
+
+def run_benchmark(model, args):
+    if args.use_cprof:
+        pr = cProfile.Profile()
+        pr.enable()
+    start_time = time.time()
+    # Input data
+    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    # Train program
+    predict = model(images)
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Evaluator
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size_tensor)
+
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc, batch_size_tensor])
+
+    # Optimization
+    opt = fluid.optimizer.AdamOptimizer(
+        learning_rate=0.001, beta1=0.9, beta2=0.999)
+    opt.minimize(avg_cost)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    # Initialize executor
+    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+
+    # Parameter initialization
+    exe.run(fluid.default_startup_program())
+
+    # Reader
+    train_reader = paddle.batch(
+        paddle.dataset.mnist.train(), batch_size=args.batch_size)
+
+    accuracy = fluid.average.WeightedAverage()
+    for pass_id in range(args.pass_num):
+        accuracy.reset()
+        pass_start = time.time()
+        every_pass_loss = []
+        for batch_id, data in enumerate(train_reader()):
+            img_data = np.array(
+                map(lambda x: x[0].reshape([1, 28, 28]), data)).astype(DTYPE)
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = y_data.reshape([len(y_data), 1])
+
+            start = time.time()
+            loss, acc, weight = exe.run(
+                fluid.default_main_program(),
+                feed={"pixel": img_data,
+                      "label": y_data},
+                fetch_list=[avg_cost, batch_acc, batch_size_tensor]
+            )  # the accuracy is accumulated over batches, not just the current batch.
+            end = time.time()
+            accuracy.add(value=acc, weight=weight)
+            every_pass_loss.append(loss)
+            print("Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
+                  (pass_id, batch_id, loss, acc))
+
+        pass_end = time.time()
+
+        train_avg_acc = accuracy.eval()
+        train_avg_loss = np.mean(every_pass_loss)
+        test_avg_acc = eval_test(exe, batch_acc, batch_size_tensor,
+                                 inference_program)
+
+        print(
+            "pass=%d, train_avg_acc=%f,train_avg_loss=%f, test_avg_acc=%f, elapse=%f"
+            % (pass_id, train_avg_acc, train_avg_loss, test_avg_acc,
+               (pass_end - pass_start)))
+        # Note: the following logs are special for CE monitoring.
+        # Other situations do not need to care about these logs.
+        print("kpis train_acc %f" % train_avg_acc)
+        print("kpis train_cost %f" % train_avg_loss)
+        print("kpis test_acc %f" % test_avg_acc)
+        print("kpis train_duration %f" % (pass_end - pass_start))
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+    if args.use_nvprof and args.device == 'GPU':
+        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+            run_benchmark(cnn_model, args)
+    else:
+        run_benchmark(cnn_model, args)
diff --git a/fluid/neural_machine_translation/rnn_search/.run_ce.sh b/fluid/neural_machine_translation/rnn_search/.run_ce.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6be159cb5268ae215998e7a19045f7aa0d620f63
--- /dev/null
+++ b/fluid/neural_machine_translation/rnn_search/.run_ce.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+#### This file is only used for continuous evaluation.
+
+model_file='train.py'
+python $model_file --pass_num 1 --learning_rate 0.001 --save_interval 10 --enable_ce | python _ce.py
diff --git a/fluid/neural_machine_translation/rnn_search/_ce.py b/fluid/neural_machine_translation/rnn_search/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..e948336e82141c4a2072a02f73b51cb7b4396ca0
--- /dev/null
+++ b/fluid/neural_machine_translation/rnn_search/_ce.py
@@ -0,0 +1,63 @@
+#### this file is only used for continuous evaluation test!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+#### NOTE kpi.py should be shared in models in some way!!!!
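+#### each Kpi name below must match the <name> field of a "kpis\t<name>\t<value>"
+#### line emitted by train.py and parsed by parse_log()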
+ +train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True) +test_cost_kpi = CostKpi('test_cost', 0.005, 0, actived=True) +train_duration_kpi = DurationKpi('train_duration', 0.06, 0, actived=True) + +tracking_kpis = [ + train_cost_kpi, + test_cost_kpi, + train_duration_kpi, +] + + +def parse_log(log): + ''' + This method should be implemented by model developers. + + The suggestion: + + each line in the log should be key, value, for example: + + " + train_cost\t1.0 + test_cost\t1.0 + train_cost\t1.0 + train_cost\t1.0 + train_acc\t1.2 + " + ''' + for line in log.split('\n'): + fs = line.strip().split('\t') + print(fs) + if len(fs) == 3 and fs[0] == 'kpis': + print("-----%s" % fs) + kpi_name = fs[1] + kpi_value = float(fs[2]) + yield kpi_name, kpi_value + + +def log_to_ce(log): + kpi_tracker = {} + for kpi in tracking_kpis: + kpi_tracker[kpi.name] = kpi + + for (kpi_name, kpi_value) in parse_log(log): + print(kpi_name, kpi_value) + kpi_tracker[kpi_name].add_record(kpi_value) + kpi_tracker[kpi_name].persist() + + +if __name__ == '__main__': + log = sys.stdin.read() + print("*****") + print(log) + print("****") + log_to_ce(log) diff --git a/fluid/neural_machine_translation/rnn_search/args.py b/fluid/neural_machine_translation/rnn_search/args.py new file mode 100644 index 0000000000000000000000000000000000000000..fc0b61b2da1f1a4c2ddbe5785cb4f2f6aad92af6 --- /dev/null +++ b/fluid/neural_machine_translation/rnn_search/args.py @@ -0,0 +1,97 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import distutils.util + + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--embedding_dim", + type=int, + default=512, + help="The dimension of embedding table. (default: %(default)d)") + parser.add_argument( + "--encoder_size", + type=int, + default=512, + help="The size of encoder bi-rnn unit. (default: %(default)d)") + parser.add_argument( + "--decoder_size", + type=int, + default=512, + help="The size of decoder rnn unit. (default: %(default)d)") + parser.add_argument( + "--batch_size", + type=int, + default=32, + help="The sequence number of a mini-batch data. (default: %(default)d)") + parser.add_argument( + "--dict_size", + type=int, + default=30000, + help="The dictionary capacity. Dictionaries of source sequence and " + "target dictionary have same capacity. (default: %(default)d)") + parser.add_argument( + "--pass_num", + type=int, + default=5, + help="The pass number to train. (default: %(default)d)") + parser.add_argument( + "--learning_rate", + type=float, + default=0.01, + help="Learning rate used to train the model. 
(default: %(default)f)") + parser.add_argument( + "--no_attention", + action='store_true', + help="If set, run no attention model instead of attention model.") + parser.add_argument( + "--beam_size", + type=int, + default=3, + help="The width for beam searching. (default: %(default)d)") + parser.add_argument( + "--use_gpu", + type=distutils.util.strtobool, + default=True, + help="Whether to use gpu. (default: %(default)d)") + parser.add_argument( + "--max_length", + type=int, + default=50, + help="The maximum length of sequence when doing generation. " + "(default: %(default)d)") + parser.add_argument( + "--save_dir", + type=str, + default="model", + help="Specify the path to save trained models.") + parser.add_argument( + "--save_interval", + type=int, + default=1, + help="Save the trained model every n passes." + "(default: %(default)d)") + parser.add_argument( + "--enable_ce", + action='store_true', + help="If set, run the task with continuous evaluation logs.") + args = parser.parse_args() + return args diff --git a/fluid/neural_machine_translation/rnn_search/attention_model.py b/fluid/neural_machine_translation/rnn_search/attention_model.py new file mode 100644 index 0000000000000000000000000000000000000000..3cf23a96efcdf1fe69fbf26905bcd8a113db6a7d --- /dev/null +++ b/fluid/neural_machine_translation/rnn_search/attention_model.py @@ -0,0 +1,221 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle.fluid as fluid +from paddle.fluid.contrib.decoder.beam_search_decoder import * + + +def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): + def linear(inputs): + return fluid.layers.fc(input=inputs, size=size, bias_attr=True) + + forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t])) + + cell_t = fluid.layers.sums(input=[ + fluid.layers.elementwise_mul( + x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul( + x=input_gate, y=cell_tilde) + ]) + + hidden_t = fluid.layers.elementwise_mul( + x=output_gate, y=fluid.layers.tanh(x=cell_t)) + + return hidden_t, cell_t + + +def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, + target_dict_dim, is_generating, beam_size, max_length): + """Construct a seq2seq network.""" + + def bi_lstm_encoder(input_seq, gate_size): + # A bi-directional lstm encoder implementation. + # Linear transformation part for input gate, output gate, forget gate + # and cell activation vectors need be done outside of dynamic_lstm. + # So the output size is 4 times of gate_size. 
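+        # For example, with the default --encoder_size=512 in args.py, the fc
+        # layers below project each step to 4 * 512 = 2048 units, which
+        # dynamic_lstm then splits among the three gates and the cell input.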
+ input_forward_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act='tanh', + bias_attr=False) + forward, _ = fluid.layers.dynamic_lstm( + input=input_forward_proj, size=gate_size * 4, use_peepholes=False) + input_reversed_proj = fluid.layers.fc(input=input_seq, + size=gate_size * 4, + act='tanh', + bias_attr=False) + reversed, _ = fluid.layers.dynamic_lstm( + input=input_reversed_proj, + size=gate_size * 4, + is_reverse=True, + use_peepholes=False) + return forward, reversed + + # The encoding process. Encodes the input words into tensors. + src_word_idx = fluid.layers.data( + name='source_sequence', shape=[1], dtype='int64', lod_level=1) + + src_embedding = fluid.layers.embedding( + input=src_word_idx, + size=[source_dict_dim, embedding_dim], + dtype='float32') + + src_forward, src_reversed = bi_lstm_encoder( + input_seq=src_embedding, gate_size=encoder_size) + + encoded_vector = fluid.layers.concat( + input=[src_forward, src_reversed], axis=1) + + encoded_proj = fluid.layers.fc(input=encoded_vector, + size=decoder_size, + bias_attr=False) + + backward_first = fluid.layers.sequence_pool( + input=src_reversed, pool_type='first') + + decoder_boot = fluid.layers.fc(input=backward_first, + size=decoder_size, + bias_attr=False, + act='tanh') + + cell_init = fluid.layers.fill_constant_batch_size_like( + input=decoder_boot, + value=0.0, + shape=[-1, decoder_size], + dtype='float32') + cell_init.stop_gradient = False + + # Create an RNN state cell by providing the input and hidden states, and + # specify the hidden state as the output. + h = InitState(init=decoder_boot, need_reorder=True) + c = InitState(init=cell_init) + + state_cell = StateCell( + inputs={'x': None, + 'encoder_vec': None, + 'encoder_proj': None}, + states={'h': h, + 'c': c}, + out_state='h') + + def simple_attention(encoder_vec, encoder_proj, decoder_state): + # The implementation of a simple attention model + decoder_state_proj = fluid.layers.fc(input=decoder_state, + size=decoder_size, + bias_attr=False) + decoder_state_expand = fluid.layers.sequence_expand( + x=decoder_state_proj, y=encoder_proj) + # the lod of the concatenation should be inherited from encoder_proj + concatenated = fluid.layers.concat( + input=[encoder_proj, decoder_state_expand], axis=1) + attention_weights = fluid.layers.fc(input=concatenated, + size=1, + bias_attr=False) + attention_weights = fluid.layers.sequence_softmax( + input=attention_weights) + weights_reshape = fluid.layers.reshape(x=attention_weights, shape=[-1]) + scaled = fluid.layers.elementwise_mul( + x=encoder_vec, y=weights_reshape, axis=0) + context = fluid.layers.sequence_pool(input=scaled, pool_type='sum') + return context + + @state_cell.state_updater + def state_updater(state_cell): + # Define the updater of the RNN state cell + current_word = state_cell.get_input('x') + encoder_vec = state_cell.get_input('encoder_vec') + encoder_proj = state_cell.get_input('encoder_proj') + prev_h = state_cell.get_state('h') + prev_c = state_cell.get_state('c') + context = simple_attention(encoder_vec, encoder_proj, prev_h) + decoder_inputs = fluid.layers.concat( + input=[context, current_word], axis=1) + h, c = lstm_step(decoder_inputs, prev_h, prev_c, decoder_size) + state_cell.set_state('h', h) + state_cell.set_state('c', c) + + # Define the decoding process + if not is_generating: + # Training process + trg_word_idx = fluid.layers.data( + name='target_sequence', shape=[1], dtype='int64', lod_level=1) + + trg_embedding = fluid.layers.embedding( + input=trg_word_idx, + size=[target_dict_dim, embedding_dim], + 
dtype='float32') + + # A decoder for training + decoder = TrainingDecoder(state_cell) + + with decoder.block(): + current_word = decoder.step_input(trg_embedding) + encoder_vec = decoder.static_input(encoded_vector) + encoder_proj = decoder.static_input(encoded_proj) + decoder.state_cell.compute_state(inputs={ + 'x': current_word, + 'encoder_vec': encoder_vec, + 'encoder_proj': encoder_proj + }) + h = decoder.state_cell.get_state('h') + decoder.state_cell.update_states() + out = fluid.layers.fc(input=h, + size=target_dict_dim, + bias_attr=True, + act='softmax') + decoder.output(out) + + label = fluid.layers.data( + name='label_sequence', shape=[1], dtype='int64', lod_level=1) + cost = fluid.layers.cross_entropy(input=decoder(), label=label) + avg_cost = fluid.layers.mean(x=cost) + feeding_list = ["source_sequence", "target_sequence", "label_sequence"] + return avg_cost, feeding_list + + else: + # Inference + init_ids = fluid.layers.data( + name="init_ids", shape=[1], dtype="int64", lod_level=2) + init_scores = fluid.layers.data( + name="init_scores", shape=[1], dtype="float32", lod_level=2) + + # A beam search decoder + decoder = BeamSearchDecoder( + state_cell=state_cell, + init_ids=init_ids, + init_scores=init_scores, + target_dict_dim=target_dict_dim, + word_dim=embedding_dim, + input_var_dict={ + 'encoder_vec': encoded_vector, + 'encoder_proj': encoded_proj + }, + topk_size=50, + sparse_emb=True, + max_len=max_length, + beam_size=beam_size, + end_id=1, + name=None) + + decoder.decode() + + translation_ids, translation_scores = decoder() + feeding_list = ["source_sequence"] + + return translation_ids, translation_scores, feeding_list diff --git a/fluid/neural_machine_translation/rnn_search/infer.py b/fluid/neural_machine_translation/rnn_search/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..51bdf9cda4694d4d849ff333e5c8e47978fb8815 --- /dev/null +++ b/fluid/neural_machine_translation/rnn_search/infer.py @@ -0,0 +1,136 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
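The inference entry further down in this file seeds beam search with two-level LoD tensors: every source sentence starts a single beam holding start-token id 0 with score 1.0. A minimal standalone sketch of that construction (assuming a PaddlePaddle Fluid build contemporary with this code; the batch size is illustrative):

```python
import numpy as np
import paddle.fluid as fluid

# Seed tensors for beam search over a batch of 2 source sentences,
# mirroring the construction inside infer() below.
batch_size = 2
init_ids_data = np.zeros((batch_size, 1), dtype='int64')      # start token id 0
init_scores_data = np.ones((batch_size, 1), dtype='float32')  # initial score 1.0

# Two-level LoD: level 0 holds one beam per sentence, level 1 one candidate
# per beam, i.e. recursive sequence lengths [[1, 1], [1, 1]] for this batch.
init_recursive_seq_lens = [[1] * batch_size, [1] * batch_size]

place = fluid.CPUPlace()
init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens, place)
init_scores = fluid.create_lod_tensor(init_scores_data, init_recursive_seq_lens,
                                      place)
```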
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import os + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +from paddle.fluid.executor import Executor +from paddle.fluid.contrib.decoder.beam_search_decoder import * + +from args import * +import attention_model +import no_attention_model + + +def infer(): + args = parse_args() + + # Inference + if args.no_attention: + translation_ids, translation_scores, feed_order = \ + no_attention_model.seq_to_seq_net( + args.embedding_dim, + args.encoder_size, + args.decoder_size, + args.dict_size, + args.dict_size, + True, + beam_size=args.beam_size, + max_length=args.max_length) + else: + translation_ids, translation_scores, feed_order = \ + attention_model.seq_to_seq_net( + args.embedding_dim, + args.encoder_size, + args.decoder_size, + args.dict_size, + args.dict_size, + True, + beam_size=args.beam_size, + max_length=args.max_length) + + test_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), + batch_size=args.batch_size, + drop_last=False) + + place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace() + exe = Executor(place) + exe.run(framework.default_startup_program()) + + model_path = os.path.join(args.save_dir, str(args.pass_num)) + fluid.io.load_persistables( + executor=exe, + dirname=model_path, + main_program=framework.default_main_program()) + + src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size) + + feed_list = [ + framework.default_main_program().global_block().var(var_name) + for var_name in feed_order[0:1] + ] + feeder = fluid.DataFeeder(feed_list, place) + + for batch_id, data in enumerate(test_batch_generator()): + # The value of batch_size may vary in the last batch + batch_size = len(data) + + # Setup initial ids and scores lod tensor + init_ids_data = np.array([0 for _ in range(batch_size)], dtype='int64') + init_scores_data = np.array( + [1. 
for _ in range(batch_size)], dtype='float32') + init_ids_data = init_ids_data.reshape((batch_size, 1)) + init_scores_data = init_scores_data.reshape((batch_size, 1)) + init_recursive_seq_lens = [1] * batch_size + init_recursive_seq_lens = [ + init_recursive_seq_lens, init_recursive_seq_lens + ] + init_ids = fluid.create_lod_tensor(init_ids_data, + init_recursive_seq_lens, place) + init_scores = fluid.create_lod_tensor(init_scores_data, + init_recursive_seq_lens, place) + + # Feed dict for inference + feed_dict = feeder.feed(map(lambda x: [x[0]], data)) + feed_dict['init_ids'] = init_ids + feed_dict['init_scores'] = init_scores + + fetch_outs = exe.run(framework.default_main_program(), + feed=feed_dict, + fetch_list=[translation_ids, translation_scores], + return_numpy=False) + + # Split the output words by lod levels + lod_level_1 = fetch_outs[0].lod()[1] + token_array = np.array(fetch_outs[0]) + result = [] + for i in xrange(len(lod_level_1) - 1): + sentence_list = [ + trg_dict[token] + for token in token_array[lod_level_1[i]:lod_level_1[i + 1]] + ] + sentence = " ".join(sentence_list[1:-1]) + result.append(sentence) + lod_level_0 = fetch_outs[0].lod()[0] + paragraphs = [ + result[lod_level_0[i]:lod_level_0[i + 1]] + for i in xrange(len(lod_level_0) - 1) + ] + + for paragraph in paragraphs: + print(paragraph) + + +if __name__ == '__main__': + infer() diff --git a/fluid/neural_machine_translation/rnn_search/no_attention_model.py b/fluid/neural_machine_translation/rnn_search/no_attention_model.py new file mode 100644 index 0000000000000000000000000000000000000000..57e7dbe42ad37bbd5d4c85ab4d58b2e1dd3d961b --- /dev/null +++ b/fluid/neural_machine_translation/rnn_search/no_attention_model.py @@ -0,0 +1,127 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
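+# The no-attention counterpart of attention_model.py: the source sequence is
+# compressed into its last LSTM step, which seeds the decoder state cell; the
+# seq_to_seq_net interface is the same and is selected via --no_attention.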
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle.fluid.layers as layers +from paddle.fluid.contrib.decoder.beam_search_decoder import * + + +def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim, + target_dict_dim, is_generating, beam_size, max_length): + def encoder(): + # Encoder implementation of the RNN translation model + src_word = layers.data( + name="src_word", shape=[1], dtype='int64', lod_level=1) + src_embedding = layers.embedding( + input=src_word, + size=[source_dict_dim, embedding_dim], + dtype='float32', + is_sparse=True) + + fc1 = layers.fc(input=src_embedding, size=encoder_size * 4, act='tanh') + lstm_hidden0, lstm_0 = layers.dynamic_lstm( + input=fc1, size=encoder_size * 4) + encoder_out = layers.sequence_last_step(input=lstm_hidden0) + return encoder_out + + def decoder_state_cell(context): + # Decoder state cell: specifies the hidden state variable and its updater + h = InitState(init=context, need_reorder=True) + state_cell = StateCell( + inputs={'x': None}, states={'h': h}, out_state='h') + + @state_cell.state_updater + def updater(state_cell): + current_word = state_cell.get_input('x') + prev_h = state_cell.get_state('h') + # make sure the lod of h is inherited from prev_h + h = layers.fc(input=[prev_h, current_word], + size=decoder_size, + act='tanh') + state_cell.set_state('h', h) + + return state_cell + + def decoder_train(state_cell): + # Training decoder implementation of the RNN translation model + trg_word = layers.data( + name="target_word", shape=[1], dtype='int64', lod_level=1) + trg_embedding = layers.embedding( + input=trg_word, + size=[target_dict_dim, embedding_dim], + dtype='float32', + is_sparse=True) + + # A training decoder + decoder = TrainingDecoder(state_cell) + + # Define the computation done by the decoder in each RNN step + with decoder.block(): + current_word = decoder.step_input(trg_embedding) + decoder.state_cell.compute_state(inputs={'x': current_word}) + current_score = layers.fc(input=decoder.state_cell.get_state('h'), + size=target_dict_dim, + act='softmax') + decoder.state_cell.update_states() + decoder.output(current_score) + + return decoder() + + def decoder_infer(state_cell): + # Inference decoder implementation + init_ids = layers.data( + name="init_ids", shape=[1], dtype="int64", lod_level=2) + init_scores = layers.data( + name="init_scores", shape=[1], dtype="float32", lod_level=2) + + # A beam search decoder for inference + decoder = BeamSearchDecoder( + state_cell=state_cell, + init_ids=init_ids, + init_scores=init_scores, + target_dict_dim=target_dict_dim, + word_dim=embedding_dim, + input_var_dict={}, + topk_size=50, + sparse_emb=True, + max_len=max_length, + beam_size=beam_size, + end_id=1, + name=None) + decoder.decode() + translation_ids, translation_scores = decoder() + + return translation_ids, translation_scores + + context = encoder() + state_cell = decoder_state_cell(context) + + if not is_generating: + label = layers.data( + name="target_next_word", shape=[1], dtype='int64', lod_level=1) + + rnn_out = decoder_train(state_cell) + + cost = layers.cross_entropy(input=rnn_out, label=label) + avg_cost = layers.mean(x=cost) + + feeding_list = ['src_word', 'target_word', 'target_next_word'] + return avg_cost, feeding_list + else: + translation_ids, translation_scores = decoder_infer(state_cell) + feeding_list = ['src_word'] + return translation_ids, translation_scores, feeding_list diff --git 
a/fluid/neural_machine_translation/rnn_search/train.py b/fluid/neural_machine_translation/rnn_search/train.py new file mode 100644 index 0000000000000000000000000000000000000000..ade0dd751af1a2e83bb99da22281061dce44fbd1 --- /dev/null +++ b/fluid/neural_machine_translation/rnn_search/train.py @@ -0,0 +1,170 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import time +import os + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +from paddle.fluid.executor import Executor +from paddle.fluid.contrib.decoder.beam_search_decoder import * + +from args import * +import attention_model +import no_attention_model + + +def train(): + args = parse_args() + + if args.enable_ce: + framework.default_startup_program().random_seed = 111 + + # Training process + if args.no_attention: + avg_cost, feed_order = no_attention_model.seq_to_seq_net( + args.embedding_dim, + args.encoder_size, + args.decoder_size, + args.dict_size, + args.dict_size, + False, + beam_size=args.beam_size, + max_length=args.max_length) + else: + avg_cost, feed_order = attention_model.seq_to_seq_net( + args.embedding_dim, + args.encoder_size, + args.decoder_size, + args.dict_size, + args.dict_size, + False, + beam_size=args.beam_size, + max_length=args.max_length) + + # clone from default main program and use it as the validation program + main_program = fluid.default_main_program() + inference_program = fluid.default_main_program().clone() + + optimizer = fluid.optimizer.Adam( + learning_rate=args.learning_rate, + regularization=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=1e-5)) + + optimizer.minimize(avg_cost) + + # Disable shuffle for Continuous Evaluation only + if not args.enable_ce: + train_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(args.dict_size), buf_size=1000), + batch_size=args.batch_size, + drop_last=False) + + test_batch_generator = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.test(args.dict_size), buf_size=1000), + batch_size=args.batch_size, + drop_last=False) + else: + train_batch_generator = paddle.batch( + paddle.dataset.wmt14.train(args.dict_size), + batch_size=args.batch_size, + drop_last=False) + + test_batch_generator = paddle.batch( + paddle.dataset.wmt14.test(args.dict_size), + batch_size=args.batch_size, + drop_last=False) + + place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace() + exe = Executor(place) + exe.run(framework.default_startup_program()) + + feed_list = [ + main_program.global_block().var(var_name) for var_name in feed_order + ] + feeder = fluid.DataFeeder(feed_list, place) + + def validation(): + # Use test set as validation each pass + total_loss = 0.0 + count = 0 + val_feed_list = [ + inference_program.global_block().var(var_name) + for var_name 
in feed_order + ] + val_feeder = fluid.DataFeeder(val_feed_list, place) + + for batch_id, data in enumerate(test_batch_generator()): + val_fetch_outs = exe.run(inference_program, + feed=val_feeder.feed(data), + fetch_list=[avg_cost], + return_numpy=False) + + total_loss += np.array(val_fetch_outs[0])[0] + count += 1 + + return total_loss / count + + for pass_id in range(1, args.pass_num + 1): + pass_start_time = time.time() + words_seen = 0 + for batch_id, data in enumerate(train_batch_generator()): + words_seen += len(data) * 2 + + fetch_outs = exe.run(framework.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost]) + + avg_cost_train = np.array(fetch_outs[0]) + print('pass_id=%d, batch_id=%d, train_loss: %f' % + (pass_id, batch_id, avg_cost_train)) + # This is for continuous evaluation only + if args.enable_ce and batch_id >= 100: + break + + pass_end_time = time.time() + test_loss = validation() + time_consumed = pass_end_time - pass_start_time + words_per_sec = words_seen / time_consumed + print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" % + (pass_id, test_loss, words_per_sec, time_consumed)) + + # This log is for continuous evaluation only + if args.enable_ce: + print("kpis\ttrain_cost\t%f" % avg_cost_train) + print("kpis\ttest_cost\t%f" % test_loss) + print("kpis\ttrain_duration\t%f" % time_consumed) + + if pass_id % args.save_interval == 0: + model_path = os.path.join(args.save_dir, str(pass_id)) + if not os.path.isdir(model_path): + os.makedirs(model_path) + + fluid.io.save_persistables( + executor=exe, + dirname=model_path, + main_program=framework.default_main_program()) + + +if __name__ == '__main__': + train() diff --git a/fluid/neural_machine_translation/transformer/README_cn.md b/fluid/neural_machine_translation/transformer/README_cn.md index 547b525b40abbfc3009e3948273db52ff394e535..561c5c30debc60a07050a2988bde8a70f9bc3bb5 100644 --- a/fluid/neural_machine_translation/transformer/README_cn.md +++ b/fluid/neural_machine_translation/transformer/README_cn.md @@ -9,13 +9,14 @@ ```text . 
├── images # README 文档中的图片 -├── optim.py # learning rate scheduling 计算程序 +├── config.py # 训练、预测以及模型参数配置 ├── infer.py # 预测脚本 ├── model.py # 模型定义 +├── optim.py # learning rate scheduling 计算程序 ├── reader.py # 数据读取接口 ├── README.md # 文档 ├── train.py # 训练脚本 -└── config.py # 训练、预测以及模型参数配置 +└── util.py # wordpiece 数据解码工具 ``` ### 简介 @@ -58,34 +59,43 @@ Decoder 具有和 Encoder 类似的结构,只是相比于组成 Encoder 的 la ### 数据准备 -我们以 [WMT'16 EN-DE 数据集](http://www.statmt.org/wmt16/translation-task.html)作为示例,同时参照论文中的设置使用 BPE(byte-pair encoding)[4]编码的数据,使用这种方式表示的数据能够更好的解决未登录词(out-of-vocabulary,OOV)的问题。用到的 BPE 数据可以参照[这里](https://github.com/google/seq2seq/blob/master/docs/data.md)进行下载,下载后解压,其中 `train.tok.clean.bpe.32000.en` 和 `train.tok.clean.bpe.32000.de` 为使用 BPE 的训练数据(平行语料,分别对应了英语和德语,经过了 tokenize 和 BPE 的处理),`newstest2013.tok.bpe.32000.en` 和 `newstest2013.tok.bpe.32000.de` 等为测试数据(`newstest2013.tok.en` 和 `newstest2013.tok.de` 等则为对应的未使用 BPE 的测试数据),`vocab.bpe.32000` 为相应的词典文件(源语言和目标语言共享该词典文件)。 +WMT 数据集是机器翻译领域公认的主流数据集;WMT 英德和英法数据集也是 Transformer 论文中所用数据集,其中英德数据集使用了 BPE(byte-pair encoding)[4]编码的数据,英法数据集使用了 wordpiece [5]的数据。我们这里也将使用 WMT 英德和英法翻译数据,并和论文保持一致使用 BPE 和 wordpiece 的数据,下面给出了使用的方法。对于其他自定义数据,参照下文遵循或转换为类似的数据格式即可。 + +#### WMT 英德翻译数据 -由于本示例中的数据读取脚本 `reader.py` 使用的样本数据的格式为 `\t` 分隔的的源语言和目标语言句子对(句子中的词之间使用空格分隔), 因此需要将源语言到目标语言的平行语料库文件合并为一个文件,可以执行以下命令进行合并: +[WMT'16 EN-DE 数据集](http://www.statmt.org/wmt16/translation-task.html)是一个中等规模的数据集。参照论文,英德数据集我们使用 BPE 编码的数据,这能够更好的解决未登录词(out-of-vocabulary,OOV)的问题[4]。用到的 BPE 数据可以参照[这里](https://github.com/google/seq2seq/blob/master/docs/data.md)进行下载(如果希望在自定义数据中使用 BPE 编码,可以参照[这里](https://github.com/rsennrich/subword-nmt)进行预处理),下载后解压,其中 `train.tok.clean.bpe.32000.en` 和 `train.tok.clean.bpe.32000.de` 为使用 BPE 的训练数据(平行语料,分别对应了英语和德语,经过了 tokenize 和 BPE 的处理),`newstest2013.tok.bpe.32000.en` 和 `newstest2013.tok.bpe.32000.de` 等为测试数据(`newstest2013.tok.en` 和 `newstest2013.tok.de` 等则为对应的未使用 BPE 的测试数据),`vocab.bpe.32000` 为相应的词典文件(源语言和目标语言共享该词典文件)。 + +由于本示例中的数据读取脚本 `reader.py` 默认使用的样本数据的格式为 `\t` 分隔的的源语言和目标语言句子对(默认句子中的词之间使用空格分隔),因此需要将源语言到目标语言的平行语料库文件合并为一个文件,可以执行以下命令进行合并: ```sh paste -d '\t' train.tok.clean.bpe.32000.en train.tok.clean.bpe.32000.de > train.tok.clean.bpe.32000.en-de ``` -此外,下载的词典文件 `vocab.bpe.32000` 中未包含表示序列开始、序列结束和未登录词的特殊符号,可以使用如下命令在词典中加入 `` 、`` 和 `` 作为这三个特殊符号。 +此外,下载的词典文件 `vocab.bpe.32000` 中未包含表示序列开始、序列结束和未登录词的特殊符号,可以使用如下命令在词典中加入 `` 、`` 和 `` 作为这三个特殊符号(用 BPE 表示数据已有效避免了未登录词的问题,这里加入只是做通用处理)。 ```sh sed -i '1i\\n\n' vocab.bpe.32000 ``` -对于其他自定义数据,遵循或转换为上述的数据格式即可。如果希望在自定义数据中使用 BPE 编码,可以参照[这里](https://github.com/rsennrich/subword-nmt)进行预处理。 +#### WMT 英法翻译数据 + +[WMT'14 EN-FR 数据集](http://www.statmt.org/wmt14/translation-task.html)是一个较大规模的数据集。参照论文,英法数据我们使用 wordpiece 表示的数据,wordpiece 和 BPE 类似同为采用 sub-word units 来解决 OOV 问题的方法[5]。我们提供了已完成预处理的 wordpiece 数据的下载,可以从[这里](http://transformer-data.bj.bcebos.com/wmt14_enfr.tar)下载,其中 `train.wordpiece.en-fr` 为使用 wordpiece 的训练数据,`newstest2014.wordpiece.en-fr` 为测试数据(`newstest2014.tok.en` 和 `newstest2014.tok.fr` 为对应的未经 wordpiece 处理过的测试数据,使用[脚本](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl)进行了 tokenize 的处理),`vocab.wordpiece.en-fr` 为相应的词典文件(源语言和目标语言共享该词典文件)。 + +提供的英法翻译数据无需进行额外的处理,可以直接使用;需要注意的是,这些用 wordpiece 表示的数据中句子内的 token 之间使用 `\x01` 而非空格进行分隔(因部分 token 内包含空格),这需要在训练时进行指定。 ### 模型训练 -`train.py` 是模型训练脚本,可以执行以下命令进行模型训练: +`train.py` 是模型训练脚本。以英德翻译数据为例,可以执行以下命令进行模型训练: ```sh python -u train.py \ --src_vocab_fpath data/vocab.bpe.32000 \ --trg_vocab_fpath data/vocab.bpe.32000 \ --special_token '' '' '' \ --train_file_pattern 
data/train.tok.clean.bpe.32000.en-de \ + --token_delimiter ' ' \ --use_token_batch True \ --batch_size 3200 \ --sort_type pool \ - --pool_size 200000 \ + --pool_size 200000 ``` -上述命令中设置了源语言词典文件路径(`src_vocab_fpath`)、目标语言词典文件路径(`trg_vocab_fpath`)、训练数据文件(`train_file_pattern`,支持通配符)等数据相关的参数和构造 batch 方式(`use_token_batch` 指出数据按照 token 数目或者 sequence 数目组成 batch)等 reader 相关的参数。有关这些参数更详细的信息可以通过执行以下命令查看: +上述命令中设置了源语言词典文件路径(`src_vocab_fpath`)、目标语言词典文件路径(`trg_vocab_fpath`)、训练数据文件(`train_file_pattern`,支持通配符)等数据相关的参数和构造 batch 方式(`use_token_batch` 指定了数据按照 token 数目或者 sequence 数目组成 batch)等 reader 相关的参数。有关这些参数更详细的信息可以通过执行以下命令查看: ```sh python train.py --help ``` @@ -98,19 +108,20 @@ python -u train.py \ --trg_vocab_fpath data/vocab.bpe.32000 \ --special_token '' '' '' \ --train_file_pattern data/train.tok.clean.bpe.32000.en-de \ + --token_delimiter ' ' \ --use_token_batch True \ --batch_size 3200 \ --sort_type pool \ --pool_size 200000 \ - n_layer 8 \ + n_layer 6 \ n_head 16 \ d_model 1024 \ d_inner_hid 4096 \ dropout 0.3 ``` -有关这些参数更详细信息的还请参考 `config.py` 中的注释说明。 +有关这些参数更详细信息的请参考 `config.py` 中的注释说明。对于英法翻译数据,执行训练和英德翻译训练类似,修改命令中的词典和数据文件为英法数据相应文件的路径,另外要注意的是由于英法翻译数据 token 间不是使用空格进行分隔,需要修改 `token_delimiter` 参数的设置为 `--token_delimiter '\x01'`。 -训练时默认使用所有 GPU,可以通过 `CUDA_VISIBLE_DEVICES` 环境变量来设置使用的 GPU 数目。在训练过程中,每个 epoch 结束后将保存模型到参数 `model_dir` 指定的目录,每个 iteration 将打印如下的日志到标准输出: +训练时默认使用所有 GPU,可以通过 `CUDA_VISIBLE_DEVICES` 环境变量来设置使用的 GPU 数目。也可以只使用 CPU 训练(通过参数 `--divice CPU` 设置),训练速度相对较慢。在训练过程中,每个 epoch 结束后将保存模型到参数 `model_dir` 指定的目录,每个 epoch 内也会每隔1000个 iteration 进行一次保存,每个 iteration 将打印如下的日志到标准输出: ```txt epoch: 0, batch: 0, sum loss: 258793.343750, avg loss: 11.069005, ppl: 64151.644531 epoch: 0, batch: 1, sum loss: 256140.718750, avg loss: 11.059616, ppl: 63552.148438 @@ -126,38 +137,120 @@ epoch: 0, batch: 9, sum loss: 245157.500000, avg loss: 10.966562, ppl: 57905.187 ### 模型预测 -`infer.py` 是模型预测脚本,模型训练完成后可以执行以下命令对指定文件中的文本进行翻译: +`infer.py` 是模型预测脚本。以英德翻译数据为例,模型训练完成后可以执行以下命令对指定文件中的文本进行翻译: ```sh python -u infer.py \ --src_vocab_fpath data/vocab.bpe.32000 \ --trg_vocab_fpath data/vocab.bpe.32000 \ --special_token '' '' '' \ --test_file_pattern data/newstest2013.tok.bpe.32000.en-de \ + --use_wordpiece False \ + --token_delimiter ' ' \ --batch_size 4 \ model_path trained_models/pass_20.infer.model \ - beam_size 5 + beam_size 5 \ max_out_len 256 ``` 和模型训练时类似,预测时也需要设置数据和 reader 相关的参数,并可以执行 `python infer.py --help` 查看这些参数的说明(部分参数意义和训练时略有不同);同样可以在预测命令中设置模型超参数,但应与模型训练时的设置一致;此外相比于模型训练,预测时还有一些额外的参数,如需要设置 `model_path` 来给出模型所在目录,可以设置 `beam_size` 和 `max_out_len` 来指定 Beam Search 算法的搜索宽度和最大深度(翻译长度),这些参数也可以在 `config.py` 中的 `InferTaskConfig` 内查阅注释说明并进行更改设置。 -执行以上预测命令会打印翻译结果到标准输出,每行输出是对应行输入的得分最高的翻译。需要注意,对于使用 BPE 的数据,预测出的翻译结果也将是 BPE 表示的数据,要恢复成原始的数据(这里指 tokenize 后的数据)才能进行正确的评估,可以使用以下命令来恢复 `predict.txt` 内的翻译结果到 `predict.tok.txt` 中。 - +执行以上预测命令会打印翻译结果到标准输出,每行输出是对应行输入的得分最高的翻译。对于使用 BPE 的英德数据,预测出的翻译结果也将是 BPE 表示的数据,要还原成原始的数据(这里指 tokenize 后的数据)才能进行正确的评估,可以使用以下命令来恢复 `predict.txt` 内的翻译结果到 `predict.tok.txt` 中(无需再次 tokenize 处理): ```sh sed 's/@@ //g' predict.txt > predict.tok.txt ``` -接下来就可以使用参考翻译(这里使用的是 `newstest2013.tok.de`)对翻译结果进行 BLEU 指标的评估了。计算 BLEU 值的一个较为广泛使用的脚本可以从[这里](https://raw.githubusercontent.com/moses-smt/mosesdecoder/master/scripts/generic/multi-bleu.perl)获取,获取后执行如下命令: +对于英法翻译的 wordpiece 数据,执行预测和英德翻译预测类似,修改命令中的词典和数据文件为英法数据相应文件的路径,另外需要注意修改 `token_delimiter` 参数的设置为 `--token_delimiter '\x01'`;同时要修改 `use_wordpiece` 参数的设置为 `--use_wordpiece True`,这会在预测时将翻译得到的 wordpiece 数据还原为原始数据输出。为了使用 tokenize 的数据进行评估,还需要对翻译结果进行 tokenize 
的处理,[Moses](https://github.com/moses-smt/mosesdecoder) 提供了一系列机器翻译相关的脚本。执行 `git clone https://github.com/moses-smt/mosesdecoder.git` 克隆 mosesdecoder 仓库后,可以使用其中的 `tokenizer.perl` 脚本对 `predict.txt` 内的翻译结果进行 tokenize 处理并输出到 `predict.tok.txt` 中,如下: +```sh +perl mosesdecoder/scripts/tokenizer/tokenizer.perl -l fr < predict.txt > predict.tok.txt +``` + +接下来就可以使用参考翻译对翻译结果进行 BLEU 指标的评估了。计算 BLEU 值的脚本也在 Moses 中包含,以英德翻译 `newstest2013.tok.de` 数据为例,执行如下命令: ```sh -perl multi-bleu.perl data/newstest2013.tok.de < predict.tok.txt +perl mosesdecoder/scripts/generic/multi-bleu.perl data/newstest2013.tok.de < predict.tok.txt ``` 可以看到类似如下的结果。 ``` BLEU = 25.08, 58.3/31.5/19.6/12.6 (BP=0.966, ratio=0.967, hyp_len=61321, ref_len=63412) ``` +目前在未使用 model average 的情况下,使用默认配置单机八卡(同论文中 base model 的配置)进行训练,英德翻译在 `newstest2013` 上测试 BLEU 值为25.,在 `newstest2014` 上测试 BLEU 值为26.;英法翻译在 `newstest2014` 上测试 BLEU 值为36.。 -### 参考文献 +### 分布式训练 + +Transformer 模型支持同步或者异步的分布式训练。分布式的配置主要两个方面: + +1 命令行配置 + + - `--local`,有两个取值,`True`表示单机训练,而`False`表示使用分布式训练。默认为单机训练模式。 + + - `--sync`,有两个取值,但只有当`--local`参数为False才会产生影响,其中`True`表示同步训练模式,`False`表示异步训练模式。默认为同步训练模式。 + +2 环境变量配置 + + 在分布式训练模式下,会手动配置训练的trainer数量和pserver数量。在网络拓扑上,每一个trainer都会和每一个pserver相连,pserver作为服务端,而trainer作为客户端。下面分pserver和trainer说明具体的参数配置: + +1) pserver配置 + +- `PADDLE_IS_LOCAL=[0|1]` 是否是分布式训练,`0`标识是分布式,`1`标识是单机 +- `TRAINING_ROLE=PSERVER` 标识当前节点是pserver + +- `POD_IP=ip` 设置当前pserver使用对外服务的地址 + +- `PADDLE_PORT=port` 设置当前pserver对外服务监听端口号,和`POD_IP`共同构成对外的唯一标识 + +- `PADDLE_TRAINERS_NUM=num` 设置pserver连接的trainer的数量 + +下面是配置的示例, 使用两个pserver, 192.168.2.2上的配置如下: +``` +export PADDLE_PSERVERS=192.168.2.2,192.168.2.3 +export POD_IP=192.168.2.2 +export PADDLE_TRAINERS_NUM=2 +export TRAINING_ROLE=PSERVER +export PADDLE_IS_LOCAL=0 +export PADDLE_PORT=6177 +``` +192.168.2.3上的配置如下: +``` +export PADDLE_PSERVERS=192.168.2.2,192.168.2.3 +export POD_IP=192.168.2.3 +export PADDLE_TRAINERS_NUM=2 +export TRAINING_ROLE=PSERVER +export PADDLE_IS_LOCAL=0 +export PADDLE_PORT=6177 +``` +2) trainer配置 + +- `PADDLE_IS_LOCAL=[0|1]` 是否是分布式训练,`0`标识是分布式,`1`标识是单机 + +- `TRAINING_ROLE=TRAINER` 标识当前节点是trainer + +- `PADDLE_PSERVERS=[ip1,ip2,……]` 设置pserver的ip地址,用于告知trainer互联的pserver的ip, 使用`,`分割 + +- `PADDLE_TRAINER_ID=num` 设置当前节点的编号, 编号的取值范围为0到N-1的整数 + +- `PADDLE_PORT=port` 设置请求的pserver服务端口号 + +下面是配置的示例, 使用两个trainer, trainer 1上的配置如下: +``` +export TRAINING_ROLE=TRAINER +export PADDLE_PSERVERS=192.168.2.2,192.168.2.3 +export PADDLE_TRAINERS_NUM=2 +export PADDLE_TRAINER_ID=0 +export PADDLE_IS_LOCAL=0 +export PADDLE_PORT=6177 +``` +trainer 2上的配置如下: +``` +export TRAINING_ROLE=TRAINER +export PADDLE_PSERVERS=192.168.2.2,192.168.2.3 +export PADDLE_TRAINERS_NUM=2 +export PADDLE_TRAINER_ID=1 +export PADDLE_IS_LOCAL=0 +export PADDLE_PORT=6177 +``` + +### 参考文献 1. Vaswani A, Shazeer N, Parmar N, et al. [Attention is all you need](http://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf)[C]//Advances in Neural Information Processing Systems. 2017: 6000-6010. 2. He K, Zhang X, Ren S, et al. [Deep residual learning for image recognition](http://openaccess.thecvf.com/content_cvpr_2016/papers/He_Deep_Residual_Learning_CVPR_2016_paper.pdf)[C]//Proceedings of the IEEE conference on computer vision and pattern recognition. 2016: 770-778. 3. Ba J L, Kiros J R, Hinton G E. [Layer normalization](https://arxiv.org/pdf/1607.06450.pdf)[J]. arXiv preprint arXiv:1607.06450, 2016. 4. Sennrich R, Haddow B, Birch A. [Neural machine translation of rare words with subword units](https://arxiv.org/pdf/1508.07909)[J]. 
arXiv preprint arXiv:1508.07909, 2015. +5. Wu Y, Schuster M, Chen Z, et al. [Google's neural machine translation system: Bridging the gap between human and machine translation](https://arxiv.org/pdf/1609.08144.pdf)[J]. arXiv preprint arXiv:1609.08144, 2016. diff --git a/fluid/neural_machine_translation/transformer/infer.py b/fluid/neural_machine_translation/transformer/infer.py index 505bf0b0062bda27a0299ed7d844e2f05abd95b8..8ac6837a3bddf0d280e10aee92964c1a501c0626 100644 --- a/fluid/neural_machine_translation/transformer/infer.py +++ b/fluid/neural_machine_translation/transformer/infer.py @@ -1,5 +1,7 @@ import argparse +import ast import numpy as np +from functools import partial import paddle import paddle.fluid as fluid @@ -11,6 +13,7 @@ from model import fast_decode as fast_decoder from config import * from train import pad_batch_data import reader +import util def parse_args(): @@ -46,6 +49,22 @@ def parse_args(): default=["", "", ""], nargs=3, help="The , and tokens in the dictionary.") + parser.add_argument( + "--use_wordpiece", + type=ast.literal_eval, + default=False, + help="The flag indicating whether the data is in wordpiece format. " + "The EN-FR data we provided is wordpiece data. For wordpiece data, " + "converting ids to original words is a little different and some " + "helper code is provided in util.py to do this.") + parser.add_argument( + "--token_delimiter", + type=partial( + str.decode, encoding="string-escape"), + default=" ", + help="The delimiter used to split tokens in source or target sentences. " + "For the EN-DE BPE data we provided, use spaces as the token delimiter; " + "For the EN-FR wordpiece data we provided, use '\x01' as the token delimiter.") parser.add_argument( 'opts', help='See config.py for all options', @@ -320,7 +339,7 @@ def post_process_seq(seq, seq) -def py_infer(test_data, trg_idx2word): +def py_infer(test_data, trg_idx2word, use_wordpiece): """ Inference by beam search implemented in python, while the calculations from symbols to probabilities are executed by Fluid operators. @@ -399,7 +418,10 @@ def py_infer(test_data, trg_idx2word): seqs = map(post_process_seq, batch_seqs[i]) scores = batch_scores[i] for seq in seqs: - print(" ".join([trg_idx2word[idx] for idx in seq])) + if use_wordpiece: + print(util.subtoken_ids_to_str(seq, trg_idx2word)) + else: + print(" ".join([trg_idx2word[idx] for idx in seq])) def prepare_batch_input(insts, data_input_names, util_input_names, src_pad_idx, @@ -465,7 +487,7 @@ def prepare_batch_input(insts, data_input_names, util_input_names, src_pad_idx, return input_dict -def fast_infer(test_data, trg_idx2word): +def fast_infer(test_data, trg_idx2word, use_wordpiece): """ Inference by beam search decoder based solely on Fluid operators. 
""" @@ -520,7 +542,9 @@ def fast_infer(test_data, trg_idx2word): trg_idx2word[idx] for idx in post_process_seq( np.array(seq_ids)[sub_start:sub_end]) - ])) + ]) if not use_wordpiece else util.subtoken_ids_to_str( + post_process_seq(np.array(seq_ids)[sub_start:sub_end]), + trg_idx2word)) scores[i].append(np.array(seq_scores)[sub_end - 1]) print hyps[i][-1] if len(hyps[i]) >= InferTaskConfig.n_best: @@ -534,8 +558,9 @@ def infer(args, inferencer=fast_infer): src_vocab_fpath=args.src_vocab_fpath, trg_vocab_fpath=args.trg_vocab_fpath, fpattern=args.test_file_pattern, - batch_size=args.batch_size, + token_delimiter=args.token_delimiter, use_token_batch=False, + batch_size=args.batch_size, pool_size=args.pool_size, sort_type=reader.SortType.NONE, shuffle=False, @@ -548,7 +573,7 @@ def infer(args, inferencer=fast_infer): clip_last_batch=False) trg_idx2word = test_data.load_dict( dict_path=args.trg_vocab_fpath, reverse=True) - inferencer(test_data, trg_idx2word) + inferencer(test_data, trg_idx2word, args.use_wordpiece) if __name__ == "__main__": diff --git a/fluid/neural_machine_translation/transformer/profile.py b/fluid/neural_machine_translation/transformer/profile.py new file mode 100644 index 0000000000000000000000000000000000000000..caf3125b8ae3ed666eb42b4bbcde73b2f0c42ca3 --- /dev/null +++ b/fluid/neural_machine_translation/transformer/profile.py @@ -0,0 +1,244 @@ +import os +import time +import argparse +import ast +import numpy as np +import multiprocessing + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler + +from train import split_data, read_multiple, prepare_batch_input +from model import transformer, position_encoding_init +from optim import LearningRateScheduler +from config import * +import reader + + +def parse_args(): + parser = argparse.ArgumentParser( + "Profile the training process for Transformer.") + parser.add_argument( + "--src_vocab_fpath", + type=str, + required=True, + help="The path of vocabulary file of source language.") + parser.add_argument( + "--trg_vocab_fpath", + type=str, + required=True, + help="The path of vocabulary file of target language.") + parser.add_argument( + "--train_file_pattern", + type=str, + required=True, + help="The pattern to match training data files.") + parser.add_argument( + "--use_token_batch", + type=ast.literal_eval, + default=True, + help="The flag indicating whether to " + "produce batch data according to token number.") + parser.add_argument( + "--batch_size", + type=int, + default=2048, + help="The number of sequences contained in a mini-batch, or the maximum " + "number of tokens (include paddings) contained in a mini-batch. 
Note " + "that this represents the number on single device and the actual batch " + "size for multi-devices will multiply the device number.") + parser.add_argument( + "--num_iters", + type=int, + default=10, + help="The maximum number of iterations profiling over.") + parser.add_argument( + "--pool_size", + type=int, + default=10000, + help="The buffer size to pool data.") + parser.add_argument( + "--special_token", + type=str, + default=["", "", ""], + nargs=3, + help="The , and tokens in the dictionary.") + parser.add_argument( + 'opts', + help='See config.py for all options', + default=None, + nargs=argparse.REMAINDER) + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help="The device type.") + + args = parser.parse_args() + # Append args related to dict + src_dict = reader.DataReader.load_dict(args.src_vocab_fpath) + trg_dict = reader.DataReader.load_dict(args.trg_vocab_fpath) + dict_args = [ + "src_vocab_size", str(len(src_dict)), "trg_vocab_size", + str(len(trg_dict)), "bos_idx", str(src_dict[args.special_token[0]]), + "eos_idx", str(src_dict[args.special_token[1]]), "unk_idx", + str(src_dict[args.special_token[2]]) + ] + merge_cfg_from_list(args.opts + dict_args, + [TrainTaskConfig, ModelHyperParams]) + return args + + +def train_loop(exe, train_progm, init, num_iters, train_data, dev_count, + sum_cost, avg_cost, lr_scheduler, token_num, predict): + + data_input_names = encoder_data_input_fields + decoder_data_input_fields[: + -1] + label_data_input_fields + util_input_names = encoder_util_input_fields + decoder_util_input_fields + + start_time = time.time() + exec_time = 0.0 + for batch_id, data in enumerate(train_data()): + if batch_id >= num_iters: + break + feed_list = [] + total_num_token = 0 + for place_id, data_buffer in enumerate( + split_data( + data, num_part=dev_count)): + data_input_dict, util_input_dict, num_token = prepare_batch_input( + data_buffer, data_input_names, util_input_names, + ModelHyperParams.eos_idx, ModelHyperParams.eos_idx, + ModelHyperParams.n_head, ModelHyperParams.d_model) + total_num_token += num_token + feed_kv_pairs = data_input_dict.items() + util_input_dict.items() + lr_rate = lr_scheduler.update_learning_rate() + feed_kv_pairs += {lr_scheduler.learning_rate.name: lr_rate}.items() + feed_list.append(dict(feed_kv_pairs)) + + if not init: + for pos_enc_param_name in pos_enc_param_names: + pos_enc = position_encoding_init( + ModelHyperParams.max_length + 1, + ModelHyperParams.d_model) + feed_list[place_id][pos_enc_param_name] = pos_enc + for feed_dict in feed_list: + feed_dict[sum_cost.name + "@GRAD"] = 1. 
/ total_num_token + + exe_start_time = time.time() + if dev_count > 1: + # parallel executor + outs = exe.run(fetch_list=[sum_cost.name, token_num.name], + feed=feed_list) + else: + # executor + outs = exe.run(fetch_list=[sum_cost, token_num], feed=feed_list[0]) + exec_time += time.time() - exe_start_time + + sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1]) + total_sum_cost = sum_cost_val.sum() # sum the cost over multiple devices + total_token_num = token_num_val.sum() + total_avg_cost = total_sum_cost / total_token_num + print("batch: %d, sum loss: %f, avg loss: %f, ppl: %f" % + (batch_id, total_sum_cost, total_avg_cost, + np.exp([min(total_avg_cost, 100)]))) + init = True + return time.time() - start_time, exec_time + + +def profile(args): + print args + + if args.device == 'CPU': + TrainTaskConfig.use_gpu = False + + if not TrainTaskConfig.use_gpu: + place = fluid.CPUPlace() + dev_count = multiprocessing.cpu_count() + else: + place = fluid.CUDAPlace(0) + dev_count = fluid.core.get_cuda_device_count() + + exe = fluid.Executor(place) + + sum_cost, avg_cost, predict, token_num = transformer( + ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, + ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, + ModelHyperParams.n_head, ModelHyperParams.d_key, + ModelHyperParams.d_value, ModelHyperParams.d_model, + ModelHyperParams.d_inner_hid, ModelHyperParams.dropout, + ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps) + lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model, + TrainTaskConfig.warmup_steps, + TrainTaskConfig.learning_rate) + + optimizer = fluid.optimizer.Adam( + learning_rate=lr_scheduler.learning_rate, + beta1=TrainTaskConfig.beta1, + beta2=TrainTaskConfig.beta2, + epsilon=TrainTaskConfig.eps) + optimizer.minimize(sum_cost) + + # Initialize the parameters. + if TrainTaskConfig.ckpt_path: + fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path) + lr_scheduler.current_steps = TrainTaskConfig.start_step + else: + exe.run(fluid.framework.default_startup_program()) + + # Disable all sorting and shuffling, for they would be done in the 1st batch. 
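+    # With shuffle off and sort_type='none' below, repeated profiling runs
+    # consume an identical batch stream, keeping timings comparable.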
+ train_data = reader.DataReader( + src_vocab_fpath=args.src_vocab_fpath, + trg_vocab_fpath=args.trg_vocab_fpath, + fpattern=args.train_file_pattern, + use_token_batch=args.use_token_batch, + batch_size=args.batch_size * (1 if args.use_token_batch else dev_count), + pool_size=args.pool_size, + sort_type='none', + shuffle=False, + shuffle_batch=False, + start_mark=args.special_token[0], + end_mark=args.special_token[1], + unk_mark=args.special_token[2], + # count start and end tokens out + max_length=ModelHyperParams.max_length - 2, + clip_last_batch=False) + train_data = read_multiple( + reader=train_data.batch_generator, + count=dev_count if args.use_token_batch else 1) + + if dev_count > 1: + build_strategy = fluid.BuildStrategy() + build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized + train_exe = fluid.ParallelExecutor( + use_cuda=TrainTaskConfig.use_gpu, + loss_name=sum_cost.name, + main_program=fluid.default_main_program(), + build_strategy=build_strategy) + + print("Warming up ...") + train_loop(exe if dev_count == 1 else train_exe, + fluid.default_main_program(), False, 3, train_data, dev_count, + sum_cost, avg_cost, lr_scheduler, token_num, predict) + + print("\nProfiling ...") + if dev_count == 1: + with profiler.profiler('All', 'total', '/tmp/profile_file'): + total_time, exec_time = train_loop( + exe, + fluid.default_main_program(), True, args.num_iters, train_data, + dev_count, sum_cost, avg_cost, lr_scheduler, token_num, predict) + else: + total_time, exec_time = train_loop( + train_exe, + fluid.default_main_program(), True, args.num_iters, train_data, + dev_count, sum_cost, avg_cost, lr_scheduler, token_num, predict) + print("Elapsed time: total %f s, in executor %f s" % + (total_time, exec_time)) + + +if __name__ == "__main__": + args = parse_args() + profile(args) diff --git a/fluid/neural_machine_translation/transformer/reader.py b/fluid/neural_machine_translation/transformer/reader.py index 27bd82b13a0480e80bdfcdc72eaa670854f4cd3a..a67d0e6d8aa48ef54709b250ce1763c2b0bb524c 100644 --- a/fluid/neural_machine_translation/transformer/reader.py +++ b/fluid/neural_machine_translation/transformer/reader.py @@ -116,9 +116,12 @@ class DataReader(object): :param use_token_batch: Whether to produce batch data according to token number. :type use_token_batch: bool - :param delimiter: The delimiter used to split source and target in each - line of data file. - :type delimiter: basestring + :param field_delimiter: The delimiter used to split source and target in + each line of data file. + :type field_delimiter: basestring + :param token_delimiter: The delimiter used to split tokens in source or + target sentences. + :type token_delimiter: basestring :param start_mark: The token representing for the beginning of sentences in dictionary. 
:type start_mark: basestring @@ -145,7 +148,8 @@ class DataReader(object): shuffle=True, shuffle_batch=False, use_token_batch=False, - delimiter="\t", + field_delimiter="\t", + token_delimiter=" ", start_mark="", end_mark="", unk_mark="", @@ -164,7 +168,8 @@ class DataReader(object): self._shuffle_batch = shuffle_batch self._min_length = min_length self._max_length = max_length - self._delimiter = delimiter + self._field_delimiter = field_delimiter + self._token_delimiter = token_delimiter self._epoch_batches = [] src_seq_words, trg_seq_words = self._load_data(fpattern, tar_fname) @@ -196,7 +201,7 @@ class DataReader(object): trg_seq_words = [] for line in f_obj: - fields = line.strip().split(self._delimiter) + fields = line.strip().split(self._field_delimiter) if (not self._only_src and len(fields) != 2) or (self._only_src and len(fields) != 1): @@ -207,7 +212,7 @@ class DataReader(object): max_len = -1 for i, seq in enumerate(fields): - seq_words = seq.split() + seq_words = seq.split(self._token_delimiter) max_len = max(max_len, len(seq_words)) if len(seq_words) == 0 or \ len(seq_words) < self._min_length or \ @@ -258,9 +263,9 @@ class DataReader(object): with open(dict_path, "r") as fdict: for idx, line in enumerate(fdict): if reverse: - word_dict[idx] = line.strip() + word_dict[idx] = line.strip('\n') else: - word_dict[line.strip()] = idx + word_dict[line.strip('\n')] = idx return word_dict def _sample_generator(self): diff --git a/fluid/neural_machine_translation/transformer/train.py b/fluid/neural_machine_translation/transformer/train.py index cdd7dfed8235a42da867e08e16e0aef4ba500fa1..5175c48e62aa6cc480e766478a5be154791c362e 100644 --- a/fluid/neural_machine_translation/transformer/train.py +++ b/fluid/neural_machine_translation/transformer/train.py @@ -3,6 +3,8 @@ import time import argparse import ast import numpy as np +import multiprocessing +from functools import partial import paddle import paddle.fluid as fluid @@ -75,11 +77,33 @@ def parse_args(): default=["", "", ""], nargs=3, help="The , and tokens in the dictionary.") + parser.add_argument( + "--token_delimiter", + type=partial( + str.decode, encoding="string-escape"), + default=" ", + help="The delimiter used to split tokens in source or target sentences. " + "For EN-DE BPE data we provided, use spaces as token delimiter. " + "For EN-FR wordpiece data we provided, use '\x01' as token delimiter.") parser.add_argument( 'opts', help='See config.py for all options', default=None, nargs=argparse.REMAINDER) + parser.add_argument( + '--local', + type=ast.literal_eval, + default=True, + help='Whether to run as local mode.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help="The device type.") + parser.add_argument( + '--sync', type=ast.literal_eval, default=True, help="sync mode.") + args = parser.parse_args() # Append args related to dict src_dict = reader.DataReader.load_dict(args.src_vocab_fpath) @@ -247,40 +271,81 @@ def split_data(data, num_part): ] -def train(args): - dev_count = fluid.core.get_cuda_device_count() +def test_context(train_progm, avg_cost, train_exe, dev_count, data_input_names, + util_input_names, sum_cost, token_num): + # Context to do validation. 
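+    # The program is cloned from the training program, and the test executor
+    # below shares variables with train_exe, so validation always evaluates
+    # the parameters currently being trained.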
+ test_program = train_progm.clone() + with fluid.program_guard(test_program): + test_program = fluid.io.get_inference_program([avg_cost]) - sum_cost, avg_cost, predict, token_num = transformer( - ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, - ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, - ModelHyperParams.n_head, ModelHyperParams.d_key, - ModelHyperParams.d_value, ModelHyperParams.d_model, - ModelHyperParams.d_inner_hid, ModelHyperParams.dropout, - ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps) + val_data = reader.DataReader( + src_vocab_fpath=args.src_vocab_fpath, + trg_vocab_fpath=args.trg_vocab_fpath, + fpattern=args.val_file_pattern, + token_delimiter=args.token_delimiter, + use_token_batch=args.use_token_batch, + batch_size=args.batch_size * (1 if args.use_token_batch else dev_count), + pool_size=args.pool_size, + sort_type=args.sort_type, + start_mark=args.special_token[0], + end_mark=args.special_token[1], + unk_mark=args.special_token[2], + # count start and end tokens out + max_length=ModelHyperParams.max_length - 2, + clip_last_batch=False, + shuffle=False, + shuffle_batch=False) - lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model, - TrainTaskConfig.warmup_steps, - TrainTaskConfig.learning_rate) - optimizer = fluid.optimizer.Adam( - learning_rate=lr_scheduler.learning_rate, - beta1=TrainTaskConfig.beta1, - beta2=TrainTaskConfig.beta2, - epsilon=TrainTaskConfig.eps) - optimizer.minimize(sum_cost) - - place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace() - exe = fluid.Executor(place) + test_exe = fluid.ParallelExecutor( + use_cuda=TrainTaskConfig.use_gpu, + main_program=test_program, + share_vars_from=train_exe) + + def test(exe=test_exe): + test_total_cost = 0 + test_total_token = 0 + test_data = read_multiple( + reader=val_data.batch_generator, + count=dev_count if args.use_token_batch else 1) + for batch_id, data in enumerate(test_data()): + feed_list = [] + for place_id, data_buffer in enumerate( + split_data( + data, num_part=dev_count)): + data_input_dict, util_input_dict, _ = prepare_batch_input( + data_buffer, data_input_names, util_input_names, + ModelHyperParams.eos_idx, ModelHyperParams.eos_idx, + ModelHyperParams.n_head, ModelHyperParams.d_model) + feed_list.append( + dict(data_input_dict.items() + util_input_dict.items())) + + outs = exe.run(feed=feed_list, + fetch_list=[sum_cost.name, token_num.name]) + sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1]) + test_total_cost += sum_cost_val.sum() + test_total_token += token_num_val.sum() + test_avg_cost = test_total_cost / test_total_token + test_ppl = np.exp([min(test_avg_cost, 100)]) + return test_avg_cost, test_ppl + + return test + + +def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, + token_num, predict): # Initialize the parameters. 
if TrainTaskConfig.ckpt_path: fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path) lr_scheduler.current_steps = TrainTaskConfig.start_step else: + print "init fluid.framework.default_startup_program" exe.run(fluid.framework.default_startup_program()) train_data = reader.DataReader( src_vocab_fpath=args.src_vocab_fpath, trg_vocab_fpath=args.trg_vocab_fpath, fpattern=args.train_file_pattern, + token_delimiter=args.token_delimiter, use_token_batch=args.use_token_batch, batch_size=args.batch_size * (1 if args.use_token_batch else dev_count), pool_size=args.pool_size, @@ -305,77 +370,26 @@ def train(args): train_exe = fluid.ParallelExecutor( use_cuda=TrainTaskConfig.use_gpu, loss_name=sum_cost.name, + main_program=train_progm, build_strategy=build_strategy) - def test_context(): - # Context to do validation. - test_program = fluid.default_main_program().clone(for_test=True) - test_exe = fluid.ParallelExecutor( - use_cuda=TrainTaskConfig.use_gpu, - main_program=test_program, - share_vars_from=train_exe) - - val_data = reader.DataReader( - src_vocab_fpath=args.src_vocab_fpath, - trg_vocab_fpath=args.trg_vocab_fpath, - fpattern=args.val_file_pattern, - use_token_batch=args.use_token_batch, - batch_size=args.batch_size * - (1 if args.use_token_batch else dev_count), - pool_size=args.pool_size, - sort_type=args.sort_type, - start_mark=args.special_token[0], - end_mark=args.special_token[1], - unk_mark=args.special_token[2], - # count start and end tokens out - max_length=ModelHyperParams.max_length - 2, - clip_last_batch=False, - shuffle=False, - shuffle_batch=False) - - def test(exe=test_exe): - test_total_cost = 0 - test_total_token = 0 - test_data = read_multiple( - reader=val_data.batch_generator, - count=dev_count if args.use_token_batch else 1) - for batch_id, data in enumerate(test_data()): - feed_list = [] - for place_id, data_buffer in enumerate( - split_data( - data, num_part=dev_count)): - data_input_dict, util_input_dict, _ = prepare_batch_input( - data_buffer, data_input_names, util_input_names, - ModelHyperParams.eos_idx, ModelHyperParams.eos_idx, - ModelHyperParams.n_head, ModelHyperParams.d_model) - feed_list.append( - dict(data_input_dict.items() + util_input_dict.items())) - - outs = exe.run(feed=feed_list, - fetch_list=[sum_cost.name, token_num.name]) - sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[ - 1]) - test_total_cost += sum_cost_val.sum() - test_total_token += token_num_val.sum() - test_avg_cost = test_total_cost / test_total_token - test_ppl = np.exp([min(test_avg_cost, 100)]) - return test_avg_cost, test_ppl - - return test - - if args.val_file_pattern is not None: - test = test_context() - data_input_names = encoder_data_input_fields + decoder_data_input_fields[: -1] + label_data_input_fields util_input_names = encoder_util_input_fields + decoder_util_input_fields + + if args.val_file_pattern is not None: + test = test_context(train_progm, avg_cost, train_exe, dev_count, + data_input_names, util_input_names, sum_cost, + token_num) + init = False for pass_id in xrange(TrainTaskConfig.pass_num): pass_start_time = time.time() for batch_id, data in enumerate(train_data()): feed_list = [] total_num_token = 0 - lr_rate = lr_scheduler.update_learning_rate() + if args.local: + lr_rate = lr_scheduler.update_learning_rate() for place_id, data_buffer in enumerate( split_data( data, num_part=dev_count)): @@ -384,11 +398,15 @@ def train(args): ModelHyperParams.eos_idx, ModelHyperParams.eos_idx, ModelHyperParams.n_head, ModelHyperParams.d_model) total_num_token 
+= num_token - feed_list.append( - dict(data_input_dict.items() + util_input_dict.items() + - {lr_scheduler.learning_rate.name: lr_rate}.items())) - - if not init: # init the position encoding table + feed_kv_pairs = data_input_dict.items() + util_input_dict.items( + ) + if args.local: + feed_kv_pairs += { + lr_scheduler.learning_rate.name: lr_rate + }.items() + feed_list.append(dict(feed_kv_pairs)) + + if not init: for pos_enc_param_name in pos_enc_param_names: pos_enc = position_encoding_init( ModelHyperParams.max_length + 1, @@ -406,12 +424,16 @@ def train(args): print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" % (pass_id, batch_id, total_sum_cost, total_avg_cost, np.exp([min(total_avg_cost, 100)]))) + if batch_id > 0 and batch_id % 1000 == 0: + fluid.io.save_persistables( + exe, + os.path.join(TrainTaskConfig.ckpt_dir, "latest.checkpoint")) init = True # Validate and save the model for inference. - print("epoch: %d, " % pass_id + ( - "val avg loss: %f, val ppl: %f, " % test() - if args.val_file_pattern is not None else "") + "consumed %fs" % ( - time.time() - pass_start_time)) + print("epoch: %d, " % pass_id + + ("val avg loss: %f, val ppl: %f, " % test() + if args.val_file_pattern is not None else "") + "consumed %fs" % + (time.time() - pass_start_time)) fluid.io.save_persistables( exe, os.path.join(TrainTaskConfig.ckpt_dir, @@ -422,6 +444,107 @@ def train(args): data_input_names[:-2] + util_input_names, [predict], exe) +def train(args): + # priority: ENV > args > config + is_local = os.getenv("PADDLE_IS_LOCAL", "1") + if is_local == '0': + args.local = False + print args + + if args.device == 'CPU': + TrainTaskConfig.use_gpu = False + + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + + if training_role == "PSERVER" or (not TrainTaskConfig.use_gpu): + place = fluid.CPUPlace() + dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + else: + place = fluid.CUDAPlace(0) + dev_count = fluid.core.get_cuda_device_count() + + exe = fluid.Executor(place) + + sum_cost, avg_cost, predict, token_num = transformer( + ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size, + ModelHyperParams.max_length + 1, ModelHyperParams.n_layer, + ModelHyperParams.n_head, ModelHyperParams.d_key, + ModelHyperParams.d_value, ModelHyperParams.d_model, + ModelHyperParams.d_inner_hid, ModelHyperParams.dropout, + ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps) + lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model, + TrainTaskConfig.warmup_steps, + TrainTaskConfig.learning_rate) + + if args.local: + optimizer = fluid.optimizer.Adam( + learning_rate=lr_scheduler.learning_rate, + beta1=TrainTaskConfig.beta1, + beta2=TrainTaskConfig.beta2, + epsilon=TrainTaskConfig.eps) + optimizer.minimize(sum_cost) + elif args.sync == False: + optimizer = fluid.optimizer.SGD(0.003) + optimizer.minimize(sum_cost) + else: + lr_decay = fluid.layers\ + .learning_rate_scheduler\ + .noam_decay(ModelHyperParams.d_model, + TrainTaskConfig.warmup_steps) + + optimizer = fluid.optimizer.Adam( + learning_rate=lr_decay, + beta1=TrainTaskConfig.beta1, + beta2=TrainTaskConfig.beta2, + epsilon=TrainTaskConfig.eps) + optimizer.minimize(sum_cost) + + if args.local: + print("local start_up:") + train_loop(exe, + fluid.default_main_program(), dev_count, sum_cost, avg_cost, + lr_scheduler, token_num, predict) + else: + port = os.getenv("PADDLE_PORT", "6174") + pserver_ips = os.getenv("PADDLE_PSERVERS") # ip,ip... 
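+        # e.g. PADDLE_PSERVERS="192.168.2.2,192.168.2.3" with PADDLE_PORT=6177
+        # becomes "192.168.2.2:6177,192.168.2.3:6177" after the join below.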
+ eplist = [] + for ip in pserver_ips.split(","): + eplist.append(':'.join([ip, port])) + pserver_endpoints = ",".join(eplist) # ip:port,ip:port... + trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0")) + current_endpoint = os.getenv("POD_IP") + ":" + port + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + t = fluid.DistributeTranspiler() + t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + + if training_role == "PSERVER": + current_endpoint = os.getenv("POD_IP") + ":" + os.getenv( + "PADDLE_PORT") + if not current_endpoint: + print("need env POD_IP and PADDLE_PORT") + exit(1) + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, + pserver_prog) + + print "pserver begin run" + with open('pserver_startup.desc', 'w') as f: + f.write(str(pserver_startup)) + with open('pserver_prog.desc', 'w') as f: + f.write(str(pserver_prog)) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + + trainer_prog = t.get_trainer_program() + with open('trainer_prog.desc', 'w') as f: + f.write(str(trainer_prog)) + train_loop(exe, trainer_prog, dev_count, sum_cost, avg_cost, + lr_scheduler, token_num, predict) + else: + print("environment variable TRAINING_ROLE should be TRAINER or PSERVER") + + if __name__ == "__main__": args = parse_args() train(args) diff --git a/fluid/neural_machine_translation/transformer/util.py b/fluid/neural_machine_translation/transformer/util.py new file mode 100644 index 0000000000000000000000000000000000000000..190abf92f4f48bfc943bd99bf61a222cc6c9d2f0 --- /dev/null +++ b/fluid/neural_machine_translation/transformer/util.py @@ -0,0 +1,68 @@ +import sys +import re +import six +import unicodedata + +# Regular expression for unescaping token strings. +# '\u' is converted to '_' +# '\\' is converted to '\' +# '\213;' is converted to unichr(213) +# Inverse of escaping. +_UNESCAPE_REGEX = re.compile(r"\\u|\\\\|\\([0-9]+);") + +# This set contains all letter and number characters. +_ALPHANUMERIC_CHAR_SET = set( + six.unichr(i) for i in range(sys.maxunicode) + if (unicodedata.category(six.unichr(i)).startswith("L") or + unicodedata.category(six.unichr(i)).startswith("N"))) + + +def unescape_token(escaped_token): + """ + Inverse of encoding escaping. + """ + + def match(m): + if m.group(1) is None: + return u"_" if m.group(0) == u"\\u" else u"\\" + + try: + return six.unichr(int(m.group(1))) + except (ValueError, OverflowError) as _: + return u"\u3013" # Unicode for undefined character. + + trimmed = escaped_token[:-1] if escaped_token.endswith( + "_") else escaped_token + return _UNESCAPE_REGEX.sub(match, trimmed) + + +def subtoken_ids_to_str(subtoken_ids, vocabs): + """ + Convert a list of subtoken (word piece) ids to a native string. + Refer to SubwordTextEncoder in Tensor2Tensor. + """ + subtokens = [vocabs.get(subtoken_id, u"") for subtoken_id in subtoken_ids] + + # Convert a list of subtokens to a list of tokens. + concatenated = "".join([ + t if isinstance(t, unicode) else t.decode("utf-8") for t in subtokens + ]) + split = concatenated.split("_") + tokens = [] + for t in split: + if t: + unescaped = unescape_token(t + "_") + if unescaped: + tokens.append(unescaped) + + # Convert a list of tokens to a unicode string (by inserting spaces between + # word tokens).
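+ # Worked example (hypothetical vocab): subtoken ids mapping to [u"hello_", u"world_"] yield + # tokens [u"hello", u"world"]; the loop below joins them into u"hello world", inserting a + # space only between two adjacent alphanumeric tokens.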
+ token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens] + ret = [] + for i, token in enumerate(tokens): + if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]: + ret.append(u" ") + ret.append(token) + seq = "".join(ret) + + return seq.encode("utf-8") diff --git a/fluid/neural_machine_translation_rnn_search b/fluid/neural_machine_translation_rnn_search new file mode 120000 index 0000000000000000000000000000000000000000..29002f1776a3f4e0bfa0b32a1aebc44d66b65628 --- /dev/null +++ b/fluid/neural_machine_translation_rnn_search @@ -0,0 +1 @@ +./neural_machine_translation/rnn_search \ No newline at end of file diff --git a/fluid/object_detection/.run_ce.sh b/fluid/object_detection/.run_ce.sh new file mode 100755 index 0000000000000000000000000000000000000000..50809e77043e0eb0bb5f6bf5a9904d8113c85756 --- /dev/null +++ b/fluid/object_detection/.run_ce.sh @@ -0,0 +1,19 @@ +#!/bin/bash +#### This file is only used for continuous evaluation. + +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + +if [ ! -d "/root/.cache/paddle/dataset/pascalvoc" ];then + mkdir -p /root/.cache/paddle/dataset/pascalvoc + ./data/pascalvoc/download.sh + cp -r ./data/pascalvoc/. /root/.cache/paddle/dataset/pascalvoc +fi + +cudaid=${object_detection_cudaid:=0} +export CUDA_VISIBLE_DEVICES=$cudaid +FLAGS_benchmark=true python train.py --enable_ce=True --batch_size=64 --num_passes=2 --data_dir=/root/.cache/paddle/dataset/pascalvoc/ | python _ce.py + +cudaid=${object_detection_cudaid:=0,1,2,3} +export CUDA_VISIBLE_DEVICES=$cudaid +FLAGS_benchmark=true python train.py --enable_ce=True --batch_size=64 --num_passes=2 --data_dir=/root/.cache/paddle/dataset/pascalvoc/ | python _ce.py diff --git a/fluid/object_detection/_ce.py b/fluid/object_detection/_ce.py new file mode 100644 index 0000000000000000000000000000000000000000..4f17ff324d8c4bb1d0cecca2401e584a7ec5e3af --- /dev/null +++ b/fluid/object_detection/_ce.py @@ -0,0 +1,72 @@ +#### This file is only used for continuous evaluation test! + +import os +import sys +sys.path.append(os.environ['ceroot']) +from kpi import CostKpi, DurationKpi, AccKpi + +#### NOTE: kpi.py should be shared across models in some way!!!! + +train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True) +test_acc_kpi = AccKpi('test_acc', 0.01, 0, actived=True) +train_speed_kpi = AccKpi('train_speed', 0.2, 0, actived=True) +train_cost_card4_kpi = CostKpi('train_cost_card4', 0.02, 0, actived=True) +test_acc_card4_kpi = AccKpi('test_acc_card4', 0.01, 0, actived=True) +train_speed_card4_kpi = AccKpi('train_speed_card4', 0.2, 0, actived=True) + +tracking_kpis = [ + train_cost_kpi, + test_acc_kpi, + train_speed_kpi, + train_cost_card4_kpi, + test_acc_card4_kpi, + train_speed_card4_kpi, +] + + +def parse_log(log): + ''' + This method should be implemented by model developers.
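+ + Note: the parser below only accepts tab-separated lines of the form + 'kpis<TAB>kpi_name<TAB>kpi_value', so each of the suggested lines below + needs a leading 'kpis' field.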
+ + The suggestion: + + each line in the log should be key, value, for example: + + " + train_cost\t1.0 + test_cost\t1.0 + train_cost\t1.0 + train_cost\t1.0 + train_acc\t1.2 + " + ''' + #kpi_map = {} + for line in log.split('\n'): + fs = line.strip().split('\t') + print(fs) + if len(fs) == 3 and fs[0] == 'kpis': + print("-----%s" % fs) + kpi_name = fs[1] + kpi_value = float(fs[2]) + #kpi_map[kpi_name] = kpi_value + yield kpi_name, kpi_value + #return kpi_map + + +def log_to_ce(log): + kpi_tracker = {} + for kpi in tracking_kpis: + kpi_tracker[kpi.name] = kpi + + for (kpi_name, kpi_value) in parse_log(log): + print(kpi_name, kpi_value) + kpi_tracker[kpi_name].add_record(kpi_value) + kpi_tracker[kpi_name].persist() + + +if __name__ == '__main__': + log = sys.stdin.read() + print("*****") + print(log) + print("****") + log_to_ce(log) diff --git a/fluid/object_detection/data/coco/download.sh b/fluid/object_detection/data/coco/download.sh index 50bc8a6894463549a2b18197704450621e969c9d..6f262ccebb635e993b35349890a793430d9ad597 100644 --- a/fluid/object_detection/data/coco/download.sh +++ b/fluid/object_detection/data/coco/download.sh @@ -11,10 +11,10 @@ wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip # Extract the data. echo "Extracting..." -unzip train2014.tar -unzip val2014.tar -unzip train2017.tar -unzip val2017.tar -unzip annotations_trainval2014.tar -unzip annotations_trainval2017.tar +unzip train2014.zip +unzip val2014.zip +unzip train2017.zip +unzip val2017.zip +unzip annotations_trainval2014.zip +unzip annotations_trainval2017.zip diff --git a/fluid/object_detection/mobilenet_ssd.py b/fluid/object_detection/mobilenet_ssd.py index c39883196056aede5d410554e14a0198e540d754..b87c0558447397e0a5b6a7a1e689a316d1ee8e14 100644 --- a/fluid/object_detection/mobilenet_ssd.py +++ b/fluid/object_detection/mobilenet_ssd.py @@ -1,4 +1,3 @@ -import paddle.v2 as paddle import paddle.fluid as fluid from paddle.fluid.initializer import MSRA from paddle.fluid.param_attr import ParamAttr diff --git a/fluid/object_detection/train.py b/fluid/object_detection/train.py index c29bd070eda4cf82f5ac36a3eb5699ae13ae86d2..aadcc904f55f077c06630a1f8e27a6bf4b422c05 100644 --- a/fluid/object_detection/train.py +++ b/fluid/object_detection/train.py @@ -23,7 +23,7 @@ add_arg('dataset', str, 'pascalvoc', "coco2014, coco2017, and pascalv add_arg('model_save_dir', str, 'model', "The path to save model.") add_arg('pretrained_model', str, 'pretrained/ssd_mobilenet_v1_coco/', "The init model path.") add_arg('apply_distort', bool, True, "Whether apply distort.") -add_arg('apply_expand', bool, True, "Whether appley expand.") +add_arg('apply_expand', bool, True, "Whether apply expand.") add_arg('nms_threshold', float, 0.45, "NMS threshold.") add_arg('ap_version', str, '11point', "integral, 11point.") add_arg('resize_h', int, 300, "The resized image height.") @@ -32,6 +32,8 @@ add_arg('mean_value_B', float, 127.5, "Mean value for B channel which will add_arg('mean_value_G', float, 127.5, "Mean value for G channel which will be subtracted.") #116.78 add_arg('mean_value_R', float, 127.5, "Mean value for R channel which will be subtracted.") #103.94 add_arg('is_toy', int, 0, "Toy for quick debug, 0 means using all data, while n means using only n sample.") +add_arg('data_dir', str, 'data/pascalvoc', "data directory") +add_arg('enable_ce', bool, False, "Whether use CE to evaluate the model") #yapf: enable @@ -44,6 +46,9 @@ def 
train(args, num_passes, model_save_dir, pretrained_model=None): + if args.enable_ce: + fluid.framework.default_startup_program().random_seed = 111 + image_shape = [3, data_args.resize_h, data_args.resize_w] if 'coco' in data_args.dataset: num_classes = 91 @@ -117,8 +122,12 @@ def train(args, train_exe = fluid.ParallelExecutor( use_cuda=args.use_gpu, loss_name=loss.name) - train_reader = paddle.batch( - reader.train(data_args, train_file_list), batch_size=batch_size) + if not args.enable_ce: + train_reader = paddle.batch( + reader.train(data_args, train_file_list), batch_size=batch_size) + else: + train_reader = paddle.batch( + reader.train(data_args, train_file_list, False), batch_size=batch_size) test_reader = paddle.batch( reader.test(data_args, val_file_list), batch_size=batch_size) feeder = fluid.DataFeeder( @@ -136,22 +145,29 @@ def train(args, def test(pass_id, best_map): _, accum_map = map_eval.get_map_var() map_eval.reset(exe) + every_pass_map=[] for batch_id, data in enumerate(test_reader()): test_map, = exe.run(test_program, feed=feeder.feed(data), fetch_list=[accum_map]) if batch_id % 20 == 0: + every_pass_map.append(test_map) print("Batch {0}, map {1}".format(batch_id, test_map)) + mean_map = np.mean(every_pass_map) if test_map[0] > best_map: best_map = test_map[0] save_model('best_model') print("Pass {0}, test map {1}".format(pass_id, test_map)) - return best_map + return best_map, mean_map + total_time = 0.0 for pass_id in range(num_passes): + epoch_idx = pass_id + 1 start_time = time.time() prev_start_time = start_time - end_time = 0 + every_pass_loss = [] + iter = 0 + pass_duration = 0.0 for batch_id, data in enumerate(train_reader()): prev_start_time = start_time start_time = time.time() @@ -165,26 +181,40 @@ def train(args, loss_v, = exe.run(fluid.default_main_program(), feed=feeder.feed(data), fetch_list=[loss]) - end_time = time.time() loss_v = np.mean(np.array(loss_v)) + every_pass_loss.append(loss_v) if batch_id % 20 == 0: print("Pass {0}, batch {1}, loss {2}, time {3}".format( pass_id, batch_id, loss_v, start_time - prev_start_time)) - best_map = test(pass_id, best_map) + + end_time = time.time() + best_map, mean_map = test(pass_id, best_map) + if args.enable_ce and pass_id == 1: + total_time += end_time - start_time + train_avg_loss = np.mean(every_pass_loss) + if devices_num == 1: + print ("kpis train_cost %s" % train_avg_loss) + print ("kpis test_acc %s" % mean_map) + print ("kpis train_speed %s" % (total_time / epoch_idx)) + else: + print ("kpis train_cost_card%s %s" % (devices_num, train_avg_loss)) + print ("kpis test_acc_card%s %s" % (devices_num, mean_map)) + print ("kpis train_speed_card%s %f" % (devices_num, total_time / epoch_idx)) + + if pass_id % 10 == 0 or pass_id == num_passes - 1: save_model(str(pass_id)) print("Best test map {0}".format(best_map)) - if __name__ == '__main__': args = parser.parse_args() print_arguments(args) - data_dir = 'data/pascalvoc' - train_file_list = 'trainval.txt' - val_file_list = 'test.txt' + data_dir = args.data_dir label_file = 'label_list' model_save_dir = args.model_save_dir + train_file_list = 'trainval.txt' + val_file_list = 'test.txt' if 'coco' in args.dataset: data_dir = 'data/coco' if '2014' in args.dataset: diff --git a/fluid/ocr_recognition/README.md b/fluid/ocr_recognition/README.md index 4475695fdd06596b44967bd2d5d44530cccbf2e5..50b72440818384a0d8e80ab214faaabddbd93f90 100644 --- a/fluid/ocr_recognition/README.md +++ b/fluid/ocr_recognition/README.md @@ -113,6 +113,10 @@ data/test_images/00003.jpg ``` env 
CUDA_VISIBLE_DEVICES=0 python ctc_train.py ``` +Train with the default data on CPU: +``` +env OMP_NUM_THREADS=<num_of_physical_cores> python ctc_train.py --use_gpu False --parallel=False +``` Train with the default data on multiple GPUs: diff --git a/fluid/ocr_recognition/crnn_ctc_model.py b/fluid/ocr_recognition/crnn_ctc_model.py index 79cf7b23954ce3331f46c50ee165dac720deae43..a5d4c70f868a6c973ff3e8b372a2eb387d1f191f 100644 --- a/fluid/ocr_recognition/crnn_ctc_model.py +++ b/fluid/ocr_recognition/crnn_ctc_model.py @@ -12,7 +12,8 @@ def conv_bn_pool(input, bias=None, param_0=None, is_test=False, - pooling=True): + pooling=True, + use_cudnn=False): tmp = input for i in xrange(group): tmp = fluid.layers.conv2d( @@ -22,7 +23,7 @@ def conv_bn_pool(input, padding=1, param_attr=param if param_0 is None else param_0, act=None, # LinearActivation - use_cudnn=True) + use_cudnn=use_cudnn) tmp = fluid.layers.batch_norm( input=tmp, act=act, @@ -35,13 +36,17 @@ def conv_bn_pool(input, pool_size=2, pool_type='max', pool_stride=2, - use_cudnn=True, + use_cudnn=use_cudnn, ceil_mode=True) return tmp -def ocr_convs(input, regularizer=None, gradient_clip=None, is_test=False): +def ocr_convs(input, + regularizer=None, + gradient_clip=None, + is_test=False, + use_cudnn=False): b = fluid.ParamAttr( regularizer=regularizer, gradient_clip=gradient_clip, @@ -56,12 +61,36 @@ def ocr_convs(input, regularizer=None, gradient_clip=None, is_test=False): initializer=fluid.initializer.Normal(0.0, 0.01)) tmp = input tmp = conv_bn_pool( - tmp, 2, [16, 16], param=w1, bias=b, param_0=w0, is_test=is_test) + tmp, + 2, [16, 16], + param=w1, + bias=b, + param_0=w0, + is_test=is_test, + use_cudnn=use_cudnn) - tmp = conv_bn_pool(tmp, 2, [32, 32], param=w1, bias=b, is_test=is_test) - tmp = conv_bn_pool(tmp, 2, [64, 64], param=w1, bias=b, is_test=is_test) tmp = conv_bn_pool( - tmp, 2, [128, 128], param=w1, bias=b, is_test=is_test, pooling=False) + tmp, + 2, [32, 32], + param=w1, + bias=b, + is_test=is_test, + use_cudnn=use_cudnn) + tmp = conv_bn_pool( + tmp, + 2, [64, 64], + param=w1, + bias=b, + is_test=is_test, + use_cudnn=use_cudnn) + tmp = conv_bn_pool( + tmp, + 2, [128, 128], + param=w1, + bias=b, + is_test=is_test, + pooling=False, + use_cudnn=use_cudnn) return tmp @@ -70,12 +99,14 @@ def encoder_net(images, rnn_hidden_size=200, regularizer=None, gradient_clip=None, - is_test=False): + is_test=False, + use_cudnn=False): conv_features = ocr_convs( images, regularizer=regularizer, gradient_clip=gradient_clip, - is_test=is_test) + is_test=is_test, + use_cudnn=use_cudnn) sliced_feature = fluid.layers.im2sequence( input=conv_features, stride=[1, 1], @@ -142,7 +173,11 @@ def ctc_train_net(images, label, args, num_classes): learning_rate_decay = None regularizer = fluid.regularizer.L2Decay(L2_RATE) - fc_out = encoder_net(images, num_classes, regularizer=regularizer) + fc_out = encoder_net( + images, + num_classes, + regularizer=regularizer, + use_cudnn=True if args.use_gpu else False) cost = fluid.layers.warpctc( input=fc_out, label=label, blank=num_classes, norm_by_times=True) sum_cost = fluid.layers.reduce_sum(cost) @@ -166,19 +201,18 @@ def ctc_train_net(images, label, args, num_classes): if args.average_window > 0: model_average = fluid.optimizer.ModelAverage( args.average_window, - params_grads, min_average_window=args.min_average_window, max_average_window=args.max_average_window) return sum_cost, error_evaluator, inference_program, model_average -def ctc_infer(images, num_classes): - fc_out = encoder_net(images, num_classes, is_test=True) +def ctc_infer(images, num_classes, use_cudnn): + fc_out = 
encoder_net(images, num_classes, is_test=True, use_cudnn=use_cudnn) return fluid.layers.ctc_greedy_decoder(input=fc_out, blank=num_classes) -def ctc_eval(images, label, num_classes): - fc_out = encoder_net(images, num_classes, is_test=True) +def ctc_eval(images, label, num_classes, use_cudnn): + fc_out = encoder_net(images, num_classes, is_test=True, use_cudnn=use_cudnn) decoded_out = fluid.layers.ctc_greedy_decoder( input=fc_out, blank=num_classes) diff --git a/fluid/ocr_recognition/ctc_reader.py b/fluid/ocr_recognition/ctc_reader.py index db05dbeae73b67b12aebacdc84a04d5b180d2132..9cbe310c9656d0501915281a5bc0f5236a42f6b1 100644 --- a/fluid/ocr_recognition/ctc_reader.py +++ b/fluid/ocr_recognition/ctc_reader.py @@ -25,7 +25,7 @@ class DataGenerator(object): def __init__(self): pass - def train_reader(self, img_root_dir, img_label_list, batchsize): + def train_reader(self, img_root_dir, img_label_list, batchsize, cycle): ''' Reader interface for training. @@ -35,6 +35,10 @@ class DataGenerator(object): :param img_label_list: The path of the file for training. :type img_label_list: str + :param cycle: If True, the reader cycles through the dataset when the + number of iterations exceeds dataset_size / batch_size. + :type cycle: bool + ''' img_label_lines = [] @@ -65,24 +69,29 @@ class DataGenerator(object): def reader(): sizes = len(img_label_lines) / batchsize - for i in range(sizes): - result = [] - sz = [0, 0] - for j in range(batchsize): - line = img_label_lines[i * batchsize + j] - # h, w, img_name, labels - items = line.split(' ') - - label = [int(c) for c in items[-1].split(',')] - img = Image.open(os.path.join(img_root_dir, items[ - 2])).convert('L') # convert to grayscale - if j == 0: - sz = img.size - img = img.resize((sz[0], sz[1])) - img = np.array(img) - 127.5 - img = img[np.newaxis, ...] - result.append([img, label]) - yield result + if sizes == 0: + raise ValueError('Batch size is bigger than the dataset size.') + while True: + for i in range(sizes): + result = [] + sz = [0, 0] + for j in range(batchsize): + line = img_label_lines[i * batchsize + j] + # h, w, img_name, labels + items = line.split(' ') + + label = [int(c) for c in items[-1].split(',')] + img = Image.open(os.path.join(img_root_dir, items[ + 2])).convert('L') # convert to grayscale + if j == 0: + sz = img.size + img = img.resize((sz[0], sz[1])) + img = np.array(img) - 127.5 + img = img[np.newaxis, ...] + result.append([img, label]) + yield result + if not cycle: + break return reader @@ -111,7 +120,7 @@ class DataGenerator(object): return reader - def infer_reader(self, img_root_dir=None, img_label_list=None): + def infer_reader(self, img_root_dir=None, img_label_list=None, cycle=False): '''A reader interface for inference. :param img_root_dir: The root path of the images for training. @@ -122,11 +131,15 @@ class DataGenerator(object): was None. If img_label_list was set to None, it will read image path from stdin. :type img_root_dir: str + + :param cycle: If True, the reader cycles through the dataset when the + number of iterations exceeds dataset_size / batch_size. + :type cycle: bool ''' def reader(): - if img_label_list is not None: - for line in open(img_label_list): + def yield_img_and_label(lines): + for line in lines: if img_root_dir is not None: # h, w, img_name, labels img_name = line.split(' ')[2] @@ -138,6 +151,16 @@ class DataGenerator(object): img = img[np.newaxis, ...] 
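# img now has a leading channel axis, giving shape (1, height, width) for a single-channel image.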
label = [int(c) for c in line.split(' ')[3].split(',')] yield img, label + + if img_label_list is not None: + lines = [] + with open(img_label_list) as f: + lines = f.readlines() + for img, label in yield_img_and_label(lines): + yield img, label + while cycle: + for img, label in yield_img_and_label(lines): + yield img, label else: while True: img_path = raw_input("Please input the path of image: ") @@ -161,14 +184,15 @@ def data_shape(): return DATA_SHAPE -def train(batch_size, train_images_dir=None, train_list_file=None): +def train(batch_size, train_images_dir=None, train_list_file=None, cycle=False): generator = DataGenerator() if train_images_dir is None: data_dir = download_data() train_images_dir = path.join(data_dir, TRAIN_DATA_DIR_NAME) if train_list_file is None: train_list_file = path.join(data_dir, TRAIN_LIST_FILE_NAME) - return generator.train_reader(train_images_dir, train_list_file, batch_size) + return generator.train_reader(train_images_dir, train_list_file, batch_size, + cycle) def test(batch_size=1, test_images_dir=None, test_list_file=None): @@ -182,10 +206,14 @@ def test(batch_size=1, test_images_dir=None, test_list_file=None): generator.test_reader(test_images_dir, test_list_file), batch_size) -def inference(infer_images_dir=None, infer_list_file=None): +def inference(batch_size=1, + infer_images_dir=None, + infer_list_file=None, + cycle=False): generator = DataGenerator() return paddle.batch( - generator.infer_reader(infer_images_dir, infer_list_file), 1) + generator.infer_reader(infer_images_dir, infer_list_file, cycle), + batch_size) def download_data(): diff --git a/fluid/ocr_recognition/ctc_train.py b/fluid/ocr_recognition/ctc_train.py index dc68cc2e2e9f6e98d2331ff926109d5df56d1df6..f9a5427240acb1dac6cc40ae58b263f08204169b 100644 --- a/fluid/ocr_recognition/ctc_train.py +++ b/fluid/ocr_recognition/ctc_train.py @@ -1,5 +1,6 @@ """Trainer for OCR CTC model.""" import paddle.fluid as fluid +import paddle.fluid.profiler as profiler from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data from crnn_ctc_model import ctc_train_net import ctc_reader @@ -14,7 +15,7 @@ parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable add_arg('batch_size', int, 32, "Minibatch size.") -add_arg('total_step', int, 720000, "Number of training iterations.") +add_arg('total_step', int, 720000, "The number of iterations. Zero or less means whole training set. More than 0 means the training set might be looped until # of iterations is reached.") add_arg('log_period', int, 1000, "Log period.") add_arg('save_model_period', int, 15000, "Save model period. '-1' means never saving the model.") add_arg('eval_period', int, 15000, "Evaluate period. '-1' means never evaluating the model.") @@ -25,6 +26,9 @@ add_arg('min_average_window',int, 10000, "Min average window.") add_arg('max_average_window',int, 12500, "Max average window. 
It is proposed to be set as the number of minibatches in a pass.") add_arg('average_window', float, 0.15, "Average window.") add_arg('parallel', bool, False, "Whether use parallel training.") +add_arg('profile', bool, False, "Whether to use profiling.") +add_arg('skip_batch_num', int, 0, "The number of first minibatches to skip as warm-up for better performance test.") +add_arg('skip_test', bool, False, "Whether to skip test phase.") # yapf: enable @@ -49,7 +53,8 @@ def train(args, data_reader=ctc_reader): train_reader = data_reader.train( args.batch_size, train_images_dir=train_images, - train_list_file=train_list) + train_list_file=train_list, + cycle=args.total_step > 0) test_reader = data_reader.test( test_images_dir=test_images, test_list_file=test_list) @@ -74,7 +79,7 @@ def train(args, data_reader=ctc_reader): error_evaluator.reset(exe) if args.parallel: train_exe = fluid.ParallelExecutor( - use_cuda=True, loss_name=sum_cost.name) + use_cuda=True if args.use_gpu else False, loss_name=sum_cost.name) fetch_vars = [sum_cost] + error_evaluator.metrics @@ -85,8 +90,8 @@ def train(args, data_reader=ctc_reader): feed=get_feeder_data(data, place)) results = [np.array(result).sum() for result in results] else: - results = exe.run(feed=get_feeder_data(data, place), - fetch_list=fetch_vars) + results = train_exe.run(feed=get_feeder_data(data, place), + fetch_list=fetch_vars) results = [result[0] for result in results] return results @@ -109,17 +114,29 @@ def train(args, data_reader=ctc_reader): print "Saved model to: %s/%s." % (args.save_model_dir, filename) iter_num = 0 - while True: + stop = False + while not stop: total_loss = 0.0 total_seq_error = 0.0 + batch_times = [] # train a pass for data in train_reader(): - iter_num += 1 - if iter_num > args.total_step: - return + if args.total_step > 0 and iter_num == args.total_step + args.skip_batch_num: + stop = True + break + if iter_num < args.skip_batch_num: + print("Warm-up iteration") + if iter_num == args.skip_batch_num: + profiler.reset_profiler() + start = time.time() results = train_one_batch(data) + batch_time = time.time() - start + fps = args.batch_size / batch_time + batch_times.append(batch_time) total_loss += results[0] total_seq_error += results[2] + + iter_num += 1 # training log if iter_num % args.log_period == 0: print "\nTime: %s; Iter[%d]; Avg Warp-CTC loss: %.3f; Avg seq err: %.3f" % ( @@ -131,7 +148,7 @@ def train(args, data_reader=ctc_reader): total_seq_error = 0.0 # evaluate - if iter_num % args.eval_period == 0: + if not args.skip_test and iter_num % args.eval_period == 0: if model_average: with model_average.apply(exe): test(iter_num) @@ -145,12 +162,35 @@ def train(args, data_reader=ctc_reader): save_model(args, exe, iter_num) else: save_model(args, exe, iter_num) + # Postprocess benchmark data + latencies = batch_times[args.skip_batch_num:] + latency_avg = np.average(latencies) + latency_pc99 = np.percentile(latencies, 99) + fpses = np.divide(args.batch_size, latencies) + fps_avg = np.average(fpses) + fps_pc99 = np.percentile(fpses, 1) + + # Benchmark output + print('\nTotal examples (incl. 
warm-up): %d' % + (iter_num * args.batch_size)) + print('average latency: %.5f s, 99pc latency: %.5f s' % (latency_avg, + latency_pc99)) + print('average fps: %.5f, fps for 99pc latency: %.5f' % (fps_avg, + fps_pc99)) def main(): args = parser.parse_args() print_arguments(args) - train(args, data_reader=ctc_reader) + if args.profile: + if args.use_gpu: + with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: + train(args, data_reader=ctc_reader) + else: + with profiler.profiler("CPU", sorted_key='total') as cpuprof: + train(args, data_reader=ctc_reader) + else: + train(args, data_reader=ctc_reader) if __name__ == "__main__": diff --git a/fluid/ocr_recognition/infer.py b/fluid/ocr_recognition/infer.py index 080e3f5f84efbb73e3c2381e809222fd2a90c416..154242c9e3ca8fea26f34b5cda0c2bac5a3d0ef1 100644 --- a/fluid/ocr_recognition/infer.py +++ b/fluid/ocr_recognition/infer.py @@ -1,5 +1,6 @@ import paddle.v2 as paddle import paddle.fluid as fluid +import paddle.fluid.profiler as profiler from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data from crnn_ctc_model import ctc_infer import numpy as np @@ -7,6 +8,7 @@ import ctc_reader import argparse import functools import os +import time parser = argparse.ArgumentParser(description=__doc__) add_arg = functools.partial(add_arguments, argparser=parser) @@ -16,6 +18,10 @@ add_arg('input_images_dir', str, None, "The directory of images.") add_arg('input_images_list', str, None, "The list file of images.") add_arg('dict', str, None, "The dictionary. The result of inference will be index sequence if the dictionary was None.") add_arg('use_gpu', bool, True, "Whether use GPU to infer.") +add_arg('iterations', int, 0, "The number of iterations. Zero or less means whole test set. More than 0 means the test set might be looped until # of iterations is reached.") +add_arg('profile', bool, False, "Whether to use profiling.") +add_arg('skip_batch_num', int, 0, "The number of first minibatches to skip as warm-up for better performance test.") +add_arg('batch_size', int, 1, "The minibatch size.") # yapf: enable @@ -25,11 +31,14 @@ def inference(args, infer=ctc_infer, data_reader=ctc_reader): data_shape = data_reader.data_shape() # define network images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') - sequence = infer(images, num_classes) + sequence = infer( + images, num_classes, use_cudnn=True if args.use_gpu else False) # data reader infer_reader = data_reader.inference( + batch_size=args.batch_size, infer_images_dir=args.input_images_dir, - infer_list_file=args.input_images_list) + infer_list_file=args.input_images_list, + cycle=True if args.iterations > 0 else False) # prepare environment place = fluid.CPUPlace() if args.use_gpu: @@ -56,23 +65,67 @@ def inference(args, infer=ctc_infer, data_reader=ctc_reader): fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name) print "Init model from: %s." 
% args.model_path + batch_times = [] + iters = 0 for data in infer_reader(): + if args.iterations > 0 and iters == args.iterations + args.skip_batch_num: + break + if iters < args.skip_batch_num: + print("Warm-up iteration") + if iters == args.skip_batch_num: + profiler.reset_profiler() + + start = time.time() result = exe.run(fluid.default_main_program(), feed=get_feeder_data( data, place, need_label=False), fetch_list=[sequence], return_numpy=False) + batch_time = time.time() - start + fps = args.batch_size / batch_time + batch_times.append(batch_time) indexes = np.array(result[0]).flatten() if dict_map is not None: - print "result: %s" % ([dict_map[index] for index in indexes], ) + print "Iteration %d, latency: %.5f s, fps: %f, result: %s" % ( + iters, + batch_time, + fps, + [dict_map[index] for index in indexes], ) else: - print "result: %s" % (indexes, ) + print "Iteration %d, latency: %.5f s, fps: %f, result: %s" % ( + iters, + batch_time, + fps, + indexes, ) + + iters += 1 + + latencies = batch_times[args.skip_batch_num:] + latency_avg = np.average(latencies) + latency_pc99 = np.percentile(latencies, 99) + fpses = np.divide(args.batch_size, latencies) + fps_avg = np.average(fpses) + fps_pc99 = np.percentile(fpses, 1) + + # Benchmark output + print('\nTotal examples (incl. warm-up): %d' % (iters * args.batch_size)) + print('average latency: %.5f s, 99pc latency: %.5f s' % (latency_avg, + latency_pc99)) + print('average fps: %.5f, fps for 99pc latency: %.5f' % (fps_avg, fps_pc99)) def main(): args = parser.parse_args() print_arguments(args) - inference(args, data_reader=ctc_reader) + if args.profile: + if args.use_gpu: + with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: + inference(args, data_reader=ctc_reader) + else: + with profiler.profiler("CPU", sorted_key='total') as cpuprof: + inference(args, data_reader=ctc_reader) + else: + inference(args, data_reader=ctc_reader) if __name__ == "__main__": diff --git a/fluid/ocr_recognition/scripts/README.md b/fluid/ocr_recognition/scripts/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6320834e6c29fa24496527b265e2aa0d52475ad6 --- /dev/null +++ b/fluid/ocr_recognition/scripts/README.md @@ -0,0 +1,46 @@ +## Introduction +The scripts in this folder are example commands for starting training and +inference of the model; they can be customised further. + +## Running with MKL-DNN +To run training or inference with the MKL-DNN library, set the +`FLAGS_use_mkldnn=1` environment variable. + +## Prerequisites +There are no special requirements for running the training and inference. 
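+ +For example (the flag values below are illustrative, not tuned), MKL-DNN can also be enabled without the helper scripts by setting the flag inline: +```sh +FLAGS_use_mkldnn=1 python ../ctc_train.py --use_gpu False --parallel=False +```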
+ +## Training +To run training on *CPU*, please execute: + +```sh +source train.sh CPU +``` + +To run training on *CPU* with MKL-DNN, please execute: + +```sh +source train.sh MKLDNN +``` + +To run training on *GPU*, please execute: + +```sh +source train.sh GPU +``` + +## Inference +To perform inference on the trained model using *CPU*, please run: +```sh +source infer.sh CPU +``` + +To perform inference on the trained model using *CPU* with MKL-DNN, please run: +```sh +source infer.sh MKLDNN +``` + +To perform inference on the trained model using *GPU*, please run: + +```sh +source infer.sh GPU +``` diff --git a/fluid/ocr_recognition/scripts/infer.sh b/fluid/ocr_recognition/scripts/infer.sh new file mode 100644 index 0000000000000000000000000000000000000000..6273ad153157138b3b7ecafece461ee01eda2955 --- /dev/null +++ b/fluid/ocr_recognition/scripts/infer.sh @@ -0,0 +1,42 @@ +#!/bin/bash +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + +mode=$1 # GPU, CPU, MKLDNN +if [ "$mode" = "CPU" ]; then + use_gpu="False" + model_path="cpu_model" +elif [ "$mode" = "GPU" ]; then + use_gpu="True" + model_path="gpu_model" +elif [ "$mode" = "MKLDNN" ]; then + use_gpu="False" + model_path="mkldnn_model" + export FLAGS_use_mkldnn=1 +else + echo "Invalid mode provided. Please use one of {GPU, CPU, MKLDNN}" + exit 1 +fi + +ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` +if [ $ht -eq 1 ]; then # HT is OFF + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,0,0" + fi + if [ -z "$OMP_DYNAMIC" ]; then + export OMP_DYNAMIC="FALSE" + fi +else # HT is ON + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,1,0" + fi +fi + +python ../infer.py \ + --model_path $model_path/model_00001 \ + --input_images_list ~/.cache/paddle/dataset/ctc_data/data/test.list \ + --input_images_dir ~/.cache/paddle/dataset/ctc_data/data/test_images \ + --use_gpu $use_gpu \ + --batch_size 32 \ + --iterations 5 \ + --skip_batch_num 2 diff --git a/fluid/ocr_recognition/scripts/train.sh b/fluid/ocr_recognition/scripts/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..ceb7c06c0548e1d5ff90651d2fd8624288cc8804 --- /dev/null +++ b/fluid/ocr_recognition/scripts/train.sh @@ -0,0 +1,55 @@ +#!/bin/bash +export MKL_NUM_THREADS=1 +export OMP_NUM_THREADS=1 + +batch_size=32 +core_num=`lscpu |grep -m1 "CPU(s)"|awk -F':' '{print $2}'|xargs` +mode=$1 # GPU, CPU, MKLDNN +if [ "$mode" = "CPU" ]; then + if [ $core_num -gt $batch_size ]; then + echo "Batch size should be greater than or equal to the number of + available cores, when parallel mode is set to True." + fi + use_gpu="False" + save_model_dir="cpu_model" + parallel="True" +elif [ "$mode" = "GPU" ]; then + use_gpu="True" + save_model_dir="gpu_model" + parallel="True" +elif [ "$mode" = "MKLDNN" ]; then + if [ $core_num -gt $batch_size ]; then + echo "Batch size should be greater than or equal to the number of + available cores, when parallel mode is set to True." + fi + use_gpu="False" + save_model_dir="mkldnn_model" + parallel="False" + export FLAGS_use_mkldnn=1 +else + echo "Invalid mode provided. 
Please use one of {GPU, CPU, MKLDNN}" + exit 1 +fi + +ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` +if [ $ht -eq 1 ]; then # HT is OFF + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,0,0" + fi + if [ -z "$OMP_DYNAMIC" ]; then + export OMP_DYNAMIC="FALSE" + fi +else # HT is ON + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,1,0" + fi +fi + +python ../ctc_train.py \ + --use_gpu $use_gpu \ + --parallel $parallel \ + --batch_size $batch_size \ + --save_model_period 1 \ + --total_step 1 \ + --save_model_dir $save_model_dir +
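+ +# Note: '--total_step 1' together with '--save_model_period 1' makes this a quick +# smoke run; infer.sh then expects the resulting checkpoint at $save_model_dir/model_00001. +# For a real training run one might restore the defaults (the values below are the +# script defaults, shown for illustration): +# python ../ctc_train.py --use_gpu $use_gpu --parallel $parallel --batch_size $batch_size \ +#     --total_step 720000 --save_model_period 15000 --save_model_dir $save_model_dir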