diff --git a/dygraph/seq2seq/args.py b/dygraph/seq2seq/args.py
index b754702e21a6fd2d05cd6b7c6191356db21b2269..df787c8c2d947fc2a2716df734d6b4ff9c6559e7 100644
--- a/dygraph/seq2seq/args.py
+++ b/dygraph/seq2seq/args.py
@@ -34,12 +34,6 @@ def parse_args():
     parser.add_argument("--src_lang", type=str, help="source language suffix")
     parser.add_argument("--tar_lang", type=str, help="target language suffix")
 
-    parser.add_argument(
-        "--attention",
-        type=eval,
-        default=False,
-        help="Whether use attention model")
-
     parser.add_argument(
         "--optimizer",
         type=str,
diff --git a/dygraph/seq2seq/attention_model.py b/dygraph/seq2seq/attention_model.py
index bae46d47219bf68288b91b1aeecc138b9f98945d..7fc2e337c911514f49bc03c8a38172fbeb2f7beb 100644
--- a/dygraph/seq2seq/attention_model.py
+++ b/dygraph/seq2seq/attention_model.py
@@ -28,13 +28,12 @@ class AttentionModel(Layer):
                  trg_vocab_size,
                  num_layers=1,
                  init_scale=0.1,
-                 padding_idx=2,
+                 padding_idx=0,
                  dropout=None,
                  beam_size=1,
                  beam_start_token=1,
                  beam_end_token=2,
                  beam_max_step_num=100,
-                 mode='train',
                  dtype="float32"):
         super(AttentionModel, self).__init__()
         self.hidden_size = hidden_size
@@ -47,7 +46,6 @@ class AttentionModel(Layer):
         self.beam_start_token = beam_start_token
         self.beam_end_token = beam_end_token
         self.beam_max_step_num = beam_max_step_num
-        self.mode = mode
         self.kinf = 1e9
 
         self.encoder = Encoder(src_vocab_size, hidden_size, num_layers,
diff --git a/dygraph/seq2seq/reader.py b/dygraph/seq2seq/reader.py
index c76fe7cfb932165db77d755a99e5553f8698e5dc..907cf262262961e7d1e2d8790fa85d7923a5558f 100644
--- a/dygraph/seq2seq/reader.py
+++ b/dygraph/seq2seq/reader.py
@@ -125,14 +125,14 @@ def raw_data(src_lang,
     src_vocab = _build_vocab(src_vocab_file)
     tar_vocab = _build_vocab(tar_vocab_file)
 
-    train_src, train_tar = _para_file_to_ids( src_train_file, tar_train_file, \
-                                              src_vocab, tar_vocab )
+    train_src, train_tar = _para_file_to_ids(src_train_file, tar_train_file, \
+                                             src_vocab, tar_vocab)
     train_src, train_tar = filter_len(
         train_src, train_tar, max_sequence_len=max_sequence_len)
-    eval_src, eval_tar = _para_file_to_ids( src_eval_file, tar_eval_file, \
-                                            src_vocab, tar_vocab )
+    eval_src, eval_tar = _para_file_to_ids(src_eval_file, tar_eval_file, \
+                                           src_vocab, tar_vocab)
 
-    test_src, test_tar = _para_file_to_ids( src_test_file, tar_test_file, \
+    test_src, test_tar = _para_file_to_ids(src_test_file, tar_test_file, \
                                             src_vocab, tar_vocab )
 
     return (train_src, train_tar), (eval_src, eval_tar), (test_src, test_tar),\
@@ -143,8 +143,8 @@ def raw_mono_data(vocab_file, file_path):
 
     src_vocab = _build_vocab(vocab_file)
 
-    test_src, test_tar = _para_file_to_ids( file_path, file_path, \
-                                            src_vocab, src_vocab )
+    test_src, test_tar = _para_file_to_ids(file_path, file_path, \
+                                           src_vocab, src_vocab)
 
     return (test_src, test_tar)
 
@@ -160,7 +160,8 @@ class IWSLTDataset(Dataset):
         src_data, trg_data = raw_data
         data_pair = []
         for src, trg in zip(src_data, trg_data):
-            data_pair.append([src, trg])
+            if len(src) > 0:
+                data_pair.append([src, trg])
 
         sorted_data_pair = sorted(data_pair, key=lambda k: len(k[0]))
         src_data = [data_pair[0] for data_pair in sorted_data_pair]
diff --git a/dygraph/seq2seq/run.sh b/dygraph/seq2seq/run.sh
index 8005ec26af0b415085c02d364c07eecb54af3d72..d8eb0800f717ba6e7191b35552cccdd58ad634e2 100644
--- a/dygraph/seq2seq/run.sh
+++ b/dygraph/seq2seq/run.sh
@@ -3,13 +3,12 @@ export CUDA_VISIBLE_DEVICES=0
 
 python train.py \
     --src_lang en --tar_lang vi \
-    --attention True \
     --num_layers 2 \
     --hidden_size 512 \
     --src_vocab_size 17191 \
     --tar_vocab_size 7709 \
     --batch_size 128 \
-    --dropout 0.0 \
+    --dropout 0.2 \
     --init_scale 0.2 \
     --max_grad_norm 5.0 \
     --train_data_prefix data/en-vi/train \
@@ -20,6 +19,7 @@ python train.py \
     --model_path attention_models \
     --enable_ce \
     --learning_rate 0.002 \
-    --dtype float64 \
+    --dtype float32 \
     --optimizer adam \
-    --max_epoch 1
+    --max_epoch 12 \
+    --padding_idx 2