diff --git a/fluid/ocr_ctc/dummy_reader.py b/fluid/ocr_ctc/dummy_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..f514fdbfc260585661192409902bbc6b69f25451
--- /dev/null
+++ b/fluid/ocr_ctc/dummy_reader.py
@@ -0,0 +1,26 @@
+import numpy as np
+DATA_SHAPE = [1, 512, 512]
+
+
+def _read_creater(num_sample=1024, num_class=20, min_seq_len=1, max_seq_len=10):
+    def reader():
+        for i in range(num_sample):
+            sequence_len = np.random.randint(min_seq_len, max_seq_len)
+            x = np.random.uniform(0.1, 1, DATA_SHAPE).astype("float32")
+            y = np.random.randint(0, num_class + 1,
+                                  [sequence_len]).astype("int32")
+            yield x, y
+
+    return reader
+
+
+def train(num_sample=16):
+    return _read_creater(num_sample=num_sample)
+
+
+def test(num_sample=16):
+    return _read_creater(num_sample=num_sample)
+
+
+def data_shape():
+    return DATA_SHAPE
diff --git a/fluid/ocr_ctc/train.py b/fluid/ocr_ctc/train.py
index d70b2546196976739a85d9a489d67574b9d64c44..6018192066d1717b7319603ee2627ca4987be152 100644
--- a/fluid/ocr_ctc/train.py
+++ b/fluid/ocr_ctc/train.py
@@ -12,22 +12,29 @@
 #See the License for the specific language governing permissions and
 #limitations under the License.
 import sys
-
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+from paddle.v2.fluid import core
 import numpy as np
+import dummy_reader
 
 
-def random_reader(num_class):
-    def reader():
-        sequence_len = np.random.randint(5, 10)
-        yield np.random.uniform(0.1, 1, [1, 512, 512]), np.random.randint(
-            0, num_class + 1, [sequence_len])
-
-    return reader
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int32")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
 
 
-def ocr_conv(input, num, with_bn):
+def ocr_conv(input, num, with_bn, param_attrs):
     assert (num % 4 == 0)
 
     def conv_block(input, filter_size, group_size, with_bn):
@@ -40,7 +47,8 @@ def ocr_conv(input, num, with_bn):
             conv_filter_size=3,
             conv_act='relu',
             conv_with_batchnorm=with_bn,
-            pool_type='max')
+            pool_type='max',
+            param_attr=param_attrs)
 
     conv1 = conv_block(input, 16, (num / 4), with_bn)
     conv2 = conv_block(conv1, 32, (num / 4), with_bn)
@@ -49,62 +57,101 @@ def ocr_conv(input, num, with_bn):
     return conv4
 
 
-num_classes = 9054
-data_shape = [1, 512, 512]
-
-images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-# encoder part
-conv_features = ocr_conv(images, 8, True)
-
-sliced_feature = fluid.layers.im2sequence(
-    input=conv_features, stride=[1, 1], filter_size=[1, 3])
-
-# TODO(wanghaoshuang): repaced by GRU
-gru_forward, _ = fluid.layers.dynamic_lstm(input=sliced_feature, size=3 * 128)
-gru_backward, _ = fluid.layers.dynamic_lstm(
-    input=sliced_feature, size=3 * 128, is_reverse=True)
-
-fc_out = fluid.layers.fc(input=[gru_forward, gru_backward],
-                         size=num_classes + 1)
-
-cost = fluid.layers.warpctc(
-    input=fc_out,
-    label=label,
-    size=num_classes + 1,
-    blank=num_classes,
-    norm_by_times=True)
-avg_cost = fluid.layers.mean(x=cost)
-
-# TODO(wanghaoshuang): set clipping
-optimizer = fluid.optimizer.Momentum(
-    learning_rate=((1.0e-3) / 16), momentum=0.9)
-opts = optimizer.minimize(cost)
-
-decoded_out = fluid.layers.ctc_greedy_decoder(input=fc_out, blank=num_classes)
-error_evaluator = fluid.evaluator.EditDistance(input=decoded_out, label=label)
-
-BATCH_SIZE = 16
-PASS_NUM = 1
-
-# TODO(wanghaoshuang): replaced by correct data reader
-train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        random_reader(num_classes), buf_size=128 * 10),
-    batch_size=BATCH_SIZE)
-
-place = fluid.CPUPlace()
-exe = fluid.Executor(place)
-feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
-exe.run(fluid.default_startup_program())
-
-for pass_id in range(PASS_NUM):
-    error_evaluator.reset(exe)
-    for data in train_reader():
-        loss, error = exe.run(fluid.default_main_program(),
-                              feed=feeder.feed(data),
-                              fetch_list=[avg_cost] + error_evaluator.metrics)
-        pass_error = error_evaluator.eval(exe)
-        print "loss: %s; distance error: %s; pass_dis_error: %s;" % (
-            str(loss), str(error), str(pass_error))
+def ocr_ctc_net(images, num_classes, param_attrs):
+    conv_features = ocr_conv(images, 8, True, param_attrs)
+    sliced_feature = fluid.layers.im2sequence(
+        input=conv_features, stride=[1, 1], filter_size=[1, 3])
+    gru_forward = fluid.layers.dynamic_gru(
+        input=sliced_feature, size=128, param_attr=param_attrs)
+    gru_backward = fluid.layers.dynamic_gru(
+        input=sliced_feature, size=128, is_reverse=True, param_attr=param_attrs)
+
+    fc_out = fluid.layers.fc(input=[gru_forward, gru_backward],
+                             size=num_classes + 1,
+                             param_attr=param_attrs)
+    return fc_out
+
+
+def get_feeder_data(data, place):
+    pixel_tensor = core.LoDTensor()
+    pixel_data = np.concatenate(
+        map(lambda x: x[0][np.newaxis, :], data), axis=0).astype("float32")
+    pixel_tensor.set(pixel_data, place)
+    label_tensor = to_lodtensor(map(lambda x: x[1], data), place)
+    return {"pixel": pixel_tensor, "label": label_tensor}
+
+
+def train(num_classes=20,
+          l2=0.0005 * 16,
+          clip_threshold=10,
+          data_reader=dummy_reader,
+          learning_rate=((1.0e-3) / 16),
+          momentum=0.9,
+          batch_size=4,
+          pass_num=2):
+
+    param_attrs = fluid.ParamAttr(
+        regularizer=fluid.regularizer.L2Decay(l2),
+        gradient_clip=fluid.clip.GradientClipByValue(clip_threshold))
+    data_shape = data_reader.data_shape()
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    label = fluid.layers.data(
+        name='label', shape=[1], dtype='int32', lod_level=1)
+
+    fc_out = ocr_ctc_net(images, num_classes, param_attrs)
+
+    cost = fluid.layers.warpctc(
+        input=fc_out,
+        label=label,
+        size=num_classes + 1,
+        blank=num_classes,
+        norm_by_times=True)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    optimizer = fluid.optimizer.Momentum(
+        learning_rate=learning_rate, momentum=momentum)
+    opts = optimizer.minimize(cost)
+
+    decoded_out = fluid.layers.ctc_greedy_decoder(
+        input=fc_out, blank=num_classes)
+    casted_label = fluid.layers.cast(x=label, dtype='int64')
+    error_evaluator = fluid.evaluator.EditDistance(
+        input=decoded_out, label=casted_label)
+
+    train_reader = paddle.batch(data_reader.train(), batch_size=batch_size)
+    test_reader = paddle.batch(data_reader.test(), batch_size=batch_size)
+
+    #place = fluid.CPUPlace()
+    place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
+    exe.run(fluid.default_startup_program())
+
+    inference_program = fluid.io.get_inference_program(error_evaluator)
+    for pass_id in range(pass_num):
+        error_evaluator.reset(exe)
+        batch_id = 0
+        for data in train_reader():
+            loss, batch_edit_distance, _, _ = exe.run(
+                fluid.default_main_program(),
+                feed=get_feeder_data(data, place),
+                fetch_list=[avg_cost] + error_evaluator.metrics)
+            print "Pass[%d], batch[%d]; loss: %s; edit distance: %s" % (
+                pass_id, batch_id, loss[0], batch_edit_distance[0])
+            batch_id += 1
+
+        train_edit_distance = error_evaluator.eval(exe)
+        print "End pass[%d]; train data edit_distance: %s" % (
+            pass_id, str(train_edit_distance))
+
+        # test
+        error_evaluator.reset(exe)
+        for data in test_reader():
+            exe.run(inference_program, feed=get_feeder_data(data, place))
+        test_edit_distance = error_evaluator.eval(exe)
+        print "End pass[%d]; test data edit_distance: %s" % (
+            pass_id, str(test_edit_distance))
+
+
+if __name__ == "__main__":
+    train()
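A minimal plain-numpy sketch (not part of the patch) of the LoD layout that to_lodtensor builds for a batch of variable-length int32 label sequences; mock_lod and the three toy sequences below are hypothetical and only mirror the cumulative-offset computation and the [sum(lens), 1] reshape used above.

    import numpy as np

    def mock_lod(label_batch):
        # Cumulative offsets [0, len(seq0), len(seq0)+len(seq1), ...], i.e. the
        # list that to_lodtensor() hands to res.set_lod([lod]).
        lod = [0]
        for seq in label_batch:
            lod.append(lod[-1] + len(seq))
        # Labels are flattened into a single [sum(lens), 1] int32 column, as in
        # the reshape before res.set(flattened_data, place).
        flat = np.concatenate(label_batch).astype("int32").reshape([-1, 1])
        return flat, [lod]

    labels = [np.array([3, 7]), np.array([1]), np.array([5, 5, 2])]
    flat, lod = mock_lod(labels)
    assert flat.shape == (6, 1)
    assert lod == [[0, 2, 3, 6]]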