Merge pull request #199 from jacquesqiao/clear-machine-translation

clean v1 script in machine translation

Merge pull request #199 from jacquesqiao/clear-machine-translation
clean v1 script in machine translation
27615cbf · jacquesqiao · GitHub · 00eeb5f4 · e247bb3a · 27615cbf
5 changed files
--- a/machine_translation/api_train.py
+++ b/machine_translation/api_train.py
+import sys
 import paddle.v2 as paddle


@@ -104,7 +105,9 @@ def main():
    parameters = paddle.parameters.create(cost)

    # define optimize method and trainer
-    optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=5e-5,
+        regularization=paddle.optimizer.L2Regularization(rate=1e-3))
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

@@ -124,8 +127,11 @@ def main():
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 10 == 0:
-                print "Pass %d, Batch %d, Cost %f, %s" % (
+                print "\nPass %d, Batch %d, Cost %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics)
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()

    # start to train
    trainer.train(

--- a/machine_translation/dataprovider.py
+++ b/machine_translation/dataprovider.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.PyDataProvider2 import *
-
-UNK_IDX = 2
-START = "<s>"
-END = "<e>"
-
-
-def hook(settings, src_dict_path, trg_dict_path, is_generating, file_list,
-         **kwargs):
-    # job_mode = 1: training mode
-    # job_mode = 0: generating mode
-    settings.job_mode = not is_generating
-
-    def fun(dict_path):
-        out_dict = dict()
-        with open(dict_path, "r") as fin:
-            out_dict = {
-                line.strip(): line_count
-                for line_count, line in enumerate(fin)
-            }
-        return out_dict
-
-    settings.src_dict = fun(src_dict_path)
-    settings.trg_dict = fun(trg_dict_path)
-
-    settings.logger.info("src dict len : %d" % (len(settings.src_dict)))
-
-    if settings.job_mode:
-        settings.slots = {
-            'source_language_word':
-            integer_value_sequence(len(settings.src_dict)),
-            'target_language_word':
-            integer_value_sequence(len(settings.trg_dict)),
-            'target_language_next_word':
-            integer_value_sequence(len(settings.trg_dict))
-        }
-        settings.logger.info("trg dict len : %d" % (len(settings.trg_dict)))
-    else:
-        settings.slots = {
-            'source_language_word':
-            integer_value_sequence(len(settings.src_dict)),
-            'sent_id':
-            integer_value_sequence(len(open(file_list[0], "r").readlines()))
-        }
-
-
-def _get_ids(s, dictionary):
-    words = s.strip().split()
-    return [dictionary[START]] + \
-           [dictionary.get(w, UNK_IDX) for w in words] + \
-           [dictionary[END]]
-
-
-@provider(init_hook=hook, pool_size=50000)
-def process(settings, file_name):
-    with open(file_name, 'r') as f:
-        for line_count, line in enumerate(f):
-            line_split = line.strip().split('\t')
-            if settings.job_mode and len(line_split) != 2:
-                continue
-            src_seq = line_split[0]  # one source sequence
-            src_ids = _get_ids(src_seq, settings.src_dict)
-
-            if settings.job_mode:
-                trg_seq = line_split[1]  # one target sequence
-                trg_words = trg_seq.split()
-                trg_ids = [settings.trg_dict.get(w, UNK_IDX) for w in trg_words]
-
-                # remove sequence whose length > 80 in training mode
-                if len(src_ids) > 80 or len(trg_ids) > 80:
-                    continue
-                trg_ids_next = trg_ids + [settings.trg_dict[END]]
-                trg_ids = [settings.trg_dict[START]] + trg_ids
-                yield {
-                    'source_language_word': src_ids,
-                    'target_language_word': trg_ids,
-                    'target_language_next_word': trg_ids_next
-                }
-            else:
-                yield {'source_language_word': src_ids, 'sent_id': [line_count]}
--- a/machine_translation/gen.sh
+++ b/machine_translation/gen.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-paddle train \
-    --job=test \
-    --config='seqToseq_net.py' \
-    --save_dir='pretrained/wmt14_model' \
-    --use_gpu=false \
-    --num_passes=13 \
-    --test_pass=12 \
-    --trainer_count=1 \
-    --config_args=is_generating=1,gen_trans_file="gen_result" \
-    2>&1 | tee 'gen.log'
--- a/machine_translation/seqToseq_net.py
+++ b/machine_translation/seqToseq_net.py
-# edit-mode: -*- python -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-from paddle.trainer_config_helpers import *
-
-### Data Definiation
-data_dir = "./data/pre-wmt14"
-src_lang_dict = os.path.join(data_dir, 'src.dict')
-trg_lang_dict = os.path.join(data_dir, 'trg.dict')
-is_generating = get_config_arg("is_generating", bool, False)
-
-if not is_generating:
-    train_list = os.path.join(data_dir, 'train.list')
-    test_list = os.path.join(data_dir, 'test.list')
-else:
-    train_list = None
-    test_list = os.path.join(data_dir, 'gen.list')
-
-define_py_data_sources2(
-    train_list,
-    test_list,
-    module="dataprovider",
-    obj="process",
-    args={
-        "src_dict_path": src_lang_dict,
-        "trg_dict_path": trg_lang_dict,
-        "is_generating": is_generating
-    })
-
-### Algorithm Configuration
-settings(learning_method=AdamOptimizer(), batch_size=50, learning_rate=5e-4)
-
-### Network Architecture
-source_dict_dim = len(open(src_lang_dict, "r").readlines())
-target_dict_dim = len(open(trg_lang_dict, "r").readlines())
-word_vector_dim = 512  # dimension of word vector
-decoder_size = 512  # dimension of hidden unit in GRU Decoder network
-encoder_size = 512  # dimension of hidden unit in GRU Encoder network
-
-if is_generating:
-    beam_size = 3  # expand width in beam search
-    max_length = 250  # a stop condition of sequence generation
-    gen_trans_file = get_config_arg("gen_trans_file", str, None)
-
-#### Encoder
-src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
-src_embedding = embedding_layer(
-    input=src_word_id,
-    size=word_vector_dim,
-    param_attr=ParamAttr(name='_source_language_embedding'))
-src_forward = simple_gru(input=src_embedding, size=encoder_size)
-src_backward = simple_gru(input=src_embedding, size=encoder_size, reverse=True)
-encoded_vector = concat_layer(input=[src_forward, src_backward])
-
-#### Decoder
-with mixed_layer(size=decoder_size) as encoded_proj:
-    encoded_proj += full_matrix_projection(input=encoded_vector)
-
-backward_first = first_seq(input=src_backward)
-with mixed_layer(
-        size=decoder_size,
-        act=TanhActivation(), ) as decoder_boot:
-    decoder_boot += full_matrix_projection(input=backward_first)
-
-
-def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-    decoder_mem = memory(
-        name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
-
-    context = simple_attention(
-        encoded_sequence=enc_vec,
-        encoded_proj=enc_proj,
-        decoder_state=decoder_mem, )
-
-    with mixed_layer(size=decoder_size * 3) as decoder_inputs:
-        decoder_inputs += full_matrix_projection(input=context)
-        decoder_inputs += full_matrix_projection(input=current_word)
-
-    gru_step = gru_step_layer(
-        name='gru_decoder',
-        input=decoder_inputs,
-        output_mem=decoder_mem,
-        size=decoder_size)
-
-    with mixed_layer(
-            size=target_dict_dim, bias_attr=True,
-            act=SoftmaxActivation()) as out:
-        out += full_matrix_projection(input=gru_step)
-    return out
-
-
-decoder_group_name = "decoder_group"
-group_input1 = StaticInput(input=encoded_vector, is_seq=True)
-group_input2 = StaticInput(input=encoded_proj, is_seq=True)
-group_inputs = [group_input1, group_input2]
-
-if not is_generating:
-    trg_embedding = embedding_layer(
-        input=data_layer(name='target_language_word', size=target_dict_dim),
-        size=word_vector_dim,
-        param_attr=ParamAttr(name='_target_language_embedding'))
-    group_inputs.append(trg_embedding)
-
-    # For decoder equipped with attention mechanism, in training,
-    # target embeding (the groudtruth) is the data input,
-    # while encoded source sequence is accessed to as an unbounded memory.
-    # Here, the StaticInput defines a read-only memory
-    # for the recurrent_group.
-    decoder = recurrent_group(
-        name=decoder_group_name,
-        step=gru_decoder_with_attention,
-        input=group_inputs)
-
-    lbl = data_layer(name='target_language_next_word', size=target_dict_dim)
-    cost = classification_cost(input=decoder, label=lbl)
-    outputs(cost)
-else:
-    # In generation, the decoder predicts a next target word based on
-    # the encoded source sequence and the last generated target word.
-
-    # The encoded source sequence (encoder's output) must be specified by
-    # StaticInput, which is a read-only memory.
-    # Embedding of the last generated word is automatically gotten by
-    # GeneratedInputs, which is initialized by a start mark, such as <s>,
-    # and must be included in generation.
-
-    trg_embedding = GeneratedInput(
-        size=target_dict_dim,
-        embedding_name='_target_language_embedding',
-        embedding_size=word_vector_dim)
-    group_inputs.append(trg_embedding)
-
-    beam_gen = beam_search(
-        name=decoder_group_name,
-        step=gru_decoder_with_attention,
-        input=group_inputs,
-        bos_id=0,
-        eos_id=1,
-        beam_size=beam_size,
-        max_length=max_length)
-
-    seqtext_printer_evaluator(
-        input=beam_gen,
-        id_input=data_layer(name="sent_id", size=1),
-        dict_file=trg_lang_dict,
-        result_file=gen_trans_file)
-    outputs(beam_gen)
--- a/machine_translation/train.sh
+++ b/machine_translation/train.sh
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-paddle train \
--config='seqToseq_net.py' \
--save_dir='model' \
--use_gpu=false \
--num_passes=16 \
--show_parameter_stats_period=100 \
--trainer_count=4 \
--log_period=10 \
--dot_period=5 \
-2>&1 | tee 'train.log'