提交 27615cbf 编写于 作者: J jacquesqiao 提交者: GitHub

Merge pull request #199 from jacquesqiao/clear-machine-translation

clean v1 script in machine translation
import sys
import paddle.v2 as paddle
......@@ -104,7 +105,9 @@ def main():
parameters = paddle.parameters.create(cost)
# define optimize method and trainer
optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
optimizer = paddle.optimizer.Adam(
learning_rate=5e-5,
regularization=paddle.optimizer.L2Regularization(rate=1e-3))
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
......@@ -124,8 +127,11 @@ def main():
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 10 == 0:
print "Pass %d, Batch %d, Cost %f, %s" % (
print "\nPass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
else:
sys.stdout.write('.')
sys.stdout.flush()
# start to train
trainer.train(
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.PyDataProvider2 import *
UNK_IDX = 2
START = "<s>"
END = "<e>"
def hook(settings, src_dict_path, trg_dict_path, is_generating, file_list,
**kwargs):
# job_mode = 1: training mode
# job_mode = 0: generating mode
settings.job_mode = not is_generating
def fun(dict_path):
out_dict = dict()
with open(dict_path, "r") as fin:
out_dict = {
line.strip(): line_count
for line_count, line in enumerate(fin)
}
return out_dict
settings.src_dict = fun(src_dict_path)
settings.trg_dict = fun(trg_dict_path)
settings.logger.info("src dict len : %d" % (len(settings.src_dict)))
if settings.job_mode:
settings.slots = {
'source_language_word':
integer_value_sequence(len(settings.src_dict)),
'target_language_word':
integer_value_sequence(len(settings.trg_dict)),
'target_language_next_word':
integer_value_sequence(len(settings.trg_dict))
}
settings.logger.info("trg dict len : %d" % (len(settings.trg_dict)))
else:
settings.slots = {
'source_language_word':
integer_value_sequence(len(settings.src_dict)),
'sent_id':
integer_value_sequence(len(open(file_list[0], "r").readlines()))
}
def _get_ids(s, dictionary):
words = s.strip().split()
return [dictionary[START]] + \
[dictionary.get(w, UNK_IDX) for w in words] + \
[dictionary[END]]
@provider(init_hook=hook, pool_size=50000)
def process(settings, file_name):
with open(file_name, 'r') as f:
for line_count, line in enumerate(f):
line_split = line.strip().split('\t')
if settings.job_mode and len(line_split) != 2:
continue
src_seq = line_split[0] # one source sequence
src_ids = _get_ids(src_seq, settings.src_dict)
if settings.job_mode:
trg_seq = line_split[1] # one target sequence
trg_words = trg_seq.split()
trg_ids = [settings.trg_dict.get(w, UNK_IDX) for w in trg_words]
# remove sequence whose length > 80 in training mode
if len(src_ids) > 80 or len(trg_ids) > 80:
continue
trg_ids_next = trg_ids + [settings.trg_dict[END]]
trg_ids = [settings.trg_dict[START]] + trg_ids
yield {
'source_language_word': src_ids,
'target_language_word': trg_ids,
'target_language_next_word': trg_ids_next
}
else:
yield {'source_language_word': src_ids, 'sent_id': [line_count]}
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
paddle train \
--job=test \
--config='seqToseq_net.py' \
--save_dir='pretrained/wmt14_model' \
--use_gpu=false \
--num_passes=13 \
--test_pass=12 \
--trainer_count=1 \
--config_args=is_generating=1,gen_trans_file="gen_result" \
2>&1 | tee 'gen.log'
# edit-mode: -*- python -*-
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from paddle.trainer_config_helpers import *
### Data Definiation
data_dir = "./data/pre-wmt14"
src_lang_dict = os.path.join(data_dir, 'src.dict')
trg_lang_dict = os.path.join(data_dir, 'trg.dict')
is_generating = get_config_arg("is_generating", bool, False)
if not is_generating:
train_list = os.path.join(data_dir, 'train.list')
test_list = os.path.join(data_dir, 'test.list')
else:
train_list = None
test_list = os.path.join(data_dir, 'gen.list')
define_py_data_sources2(
train_list,
test_list,
module="dataprovider",
obj="process",
args={
"src_dict_path": src_lang_dict,
"trg_dict_path": trg_lang_dict,
"is_generating": is_generating
})
### Algorithm Configuration
settings(learning_method=AdamOptimizer(), batch_size=50, learning_rate=5e-4)
### Network Architecture
source_dict_dim = len(open(src_lang_dict, "r").readlines())
target_dict_dim = len(open(trg_lang_dict, "r").readlines())
word_vector_dim = 512 # dimension of word vector
decoder_size = 512 # dimension of hidden unit in GRU Decoder network
encoder_size = 512 # dimension of hidden unit in GRU Encoder network
if is_generating:
beam_size = 3 # expand width in beam search
max_length = 250 # a stop condition of sequence generation
gen_trans_file = get_config_arg("gen_trans_file", str, None)
#### Encoder
src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
src_embedding = embedding_layer(
input=src_word_id,
size=word_vector_dim,
param_attr=ParamAttr(name='_source_language_embedding'))
src_forward = simple_gru(input=src_embedding, size=encoder_size)
src_backward = simple_gru(input=src_embedding, size=encoder_size, reverse=True)
encoded_vector = concat_layer(input=[src_forward, src_backward])
#### Decoder
with mixed_layer(size=decoder_size) as encoded_proj:
encoded_proj += full_matrix_projection(input=encoded_vector)
backward_first = first_seq(input=src_backward)
with mixed_layer(
size=decoder_size,
act=TanhActivation(), ) as decoder_boot:
decoder_boot += full_matrix_projection(input=backward_first)
def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
decoder_mem = memory(
name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
context = simple_attention(
encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem, )
with mixed_layer(size=decoder_size * 3) as decoder_inputs:
decoder_inputs += full_matrix_projection(input=context)
decoder_inputs += full_matrix_projection(input=current_word)
gru_step = gru_step_layer(
name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
with mixed_layer(
size=target_dict_dim, bias_attr=True,
act=SoftmaxActivation()) as out:
out += full_matrix_projection(input=gru_step)
return out
decoder_group_name = "decoder_group"
group_input1 = StaticInput(input=encoded_vector, is_seq=True)
group_input2 = StaticInput(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
if not is_generating:
trg_embedding = embedding_layer(
input=data_layer(name='target_language_word', size=target_dict_dim),
size=word_vector_dim,
param_attr=ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# For decoder equipped with attention mechanism, in training,
# target embeding (the groudtruth) is the data input,
# while encoded source sequence is accessed to as an unbounded memory.
# Here, the StaticInput defines a read-only memory
# for the recurrent_group.
decoder = recurrent_group(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
lbl = data_layer(name='target_language_next_word', size=target_dict_dim)
cost = classification_cost(input=decoder, label=lbl)
outputs(cost)
else:
# In generation, the decoder predicts a next target word based on
# the encoded source sequence and the last generated target word.
# The encoded source sequence (encoder's output) must be specified by
# StaticInput, which is a read-only memory.
# Embedding of the last generated word is automatically gotten by
# GeneratedInputs, which is initialized by a start mark, such as <s>,
# and must be included in generation.
trg_embedding = GeneratedInput(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)
beam_gen = beam_search(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs,
bos_id=0,
eos_id=1,
beam_size=beam_size,
max_length=max_length)
seqtext_printer_evaluator(
input=beam_gen,
id_input=data_layer(name="sent_id", size=1),
dict_file=trg_lang_dict,
result_file=gen_trans_file)
outputs(beam_gen)
#!/bin/bash
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
paddle train \
--config='seqToseq_net.py' \
--save_dir='model' \
--use_gpu=false \
--num_passes=16 \
--show_parameter_stats_period=100 \
--trainer_count=4 \
--log_period=10 \
--dot_period=5 \
2>&1 | tee 'train.log'
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册