Commit a4a7df17 authored by Bruce, committed by Yibing Liu

fix #2217 (#2229)

Parent f13e3681
@@ -54,7 +54,7 @@ class Dataset(object):
def get_num_examples(self, filename):
"""num of line of file"""
return sum(1 for line in open(filename, "r"))
return sum(1 for line in io.open(filename, "r", encoding='utf-8'))
def word_to_ids(self, words):
"""convert word to word index"""
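The bare `open(filename, "r")` depends on the interpreter and locale: on Python 3 it decodes with the locale's default encoding, which can fail on UTF-8 data, whereas `io.open` with an explicit `encoding='utf-8'` behaves the same on Python 2 and 3. A minimal sketch of the fixed pattern that also closes the file handle (the helper name is illustrative, not code from this commit):

```python
import io

def count_lines(filename):
    """Count the lines of a UTF-8 text file, on both Python 2 and 3."""
    # io.open decodes with an explicit encoding instead of the locale default,
    # and the with-block releases the file handle deterministically.
    with io.open(filename, "r", encoding="utf-8") as f:
        return sum(1 for _ in f)
```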
@@ -2,7 +2,7 @@
export FLAGS_fraction_of_gpu_memory_to_use=0.5
export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_fast_eager_deletion_mode=1
#export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES=2 # which GPU to use
#alias python='./anaconda2/bin/python'
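Note that `export CUDA_VISIBLE_DEVICES=2` leaves a single GPU visible to the process, and CUDA re-indexes it as device 0; that is why the device-selection code further down can default `FLAGS_selected_gpus` to `0` and still run on physical GPU 2. A purely illustrative check:

```python
import os

# The physical id exported in the script above; inside the process that same
# card is addressed as device 0 because CUDA re-indexes the visible devices.
print(os.getenv("CUDA_VISIBLE_DEVICES", "<all devices visible>"))
print(os.getenv("FLAGS_fraction_of_gpu_memory_to_use", "<paddle default>"))
```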
@@ -19,7 +19,7 @@ function run_train() {
--save_model_per_batches 10000 \
--batch_size 100 \
--epoch 10 \
--use_gpu 0 \
--use_cuda true \
--traindata_shuffle_buffer 200000 \
--word_emb_dim 768 \
--grnn_hidden_dim 768 \
@@ -43,7 +43,7 @@ function run_eval() {
--word_emb_dim 768 \
--grnn_hidden_dim 768 \
--bigru_num 2 \
--use_gpu 0 \
--use_cuda True \
--init_checkpoint ./model_baseline \
--test_data ./data/test.tsv \
--word_dict_path ./conf/word.dic \
@@ -62,7 +62,7 @@ function run_infer() {
--word_emb_dim 768 \
--grnn_hidden_dim 768 \
--bigru_num 2 \
--use_gpu 0 \
--use_cuda True \
--init_checkpoint ./model_baseline/ \
--infer_data ./data/test.tsv \
--word_dict_path ./conf/word.dic \
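The scripts above pass the flag as `--use_cuda true` and `--use_cuda True`, while the argument definitions below declare `use_cuda` with type `bool`. Plain `argparse` would treat any non-empty string (including "False") as truthy, so argument wrappers in this style typically substitute a string-to-bool converter for `bool` arguments; a minimal sketch of that idea, with `str2bool` as an illustrative helper name rather than the repository's actual implementation:

```python
import argparse

def str2bool(v):
    """Map the usual CLI spellings of a boolean onto a Python bool."""
    return v.lower() in ("true", "t", "1", "yes", "y")

parser = argparse.ArgumentParser()
parser.add_argument("--use_cuda", type=str2bool, default=False)

print(parser.parse_args(["--use_cuda", "true"]).use_cuda)  # True
print(parser.parse_args(["--use_cuda", "True"]).use_cuda)  # True
print(parser.parse_args([]).use_cuda)                      # False
```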
@@ -40,23 +40,17 @@ data_g = utils.ArgumentGroup(parser, "data", "data paths")
data_g.add_arg("word_dict_path", str, "./conf/word.dic", "The path of the word dictionary.")
data_g.add_arg("label_dict_path", str, "./conf/tag.dic", "The path of the label dictionary.")
data_g.add_arg("word_rep_dict_path", str, "./conf/q2b.dic", "The path of the word replacement Dictionary.")
data_g.add_arg("train_data", str, "./data/train_data", "The folder where the training data is located.")
data_g.add_arg("test_data", str, "./data/test_data", "The folder where the training data is located.")
data_g.add_arg("train_data", str, "./data/train.tsv", "The folder where the training data is located.")
data_g.add_arg("test_data", str, "./data/test.tsv", "The folder where the training data is located.")
data_g.add_arg("infer_data", str, "./data/test.tsv", "The folder where the training data is located.")
data_g.add_arg("model_save_dir", str, "./models", "The model will be saved in this path.")
data_g.add_arg("init_checkpoint", str, "", "Path to init model")
data_g.add_arg("corpus_type_list", str, ["human", "feed", "query", "title", "news"],
"The pattern list of different types of corpus used in training.", nargs='+')
data_g.add_arg("corpus_proportion_list", float, [0.2, 0.2, 0.2, 0.2, 0.2],
"The proportion list of different types of corpus used in training.", nargs='+')
# 3. train parameters
train_g = utils.ArgumentGroup(parser, "training", "training options")
train_g.add_arg("do_train", bool, True, "whether to perform training")
train_g.add_arg("do_valid", bool, False, "whether to perform validation")
train_g.add_arg("do_test", bool, True, "whether to perform validation")
train_g.add_arg("do_test", bool, True, "whether to perform testing")
train_g.add_arg("do_infer", bool, False, "whether to perform inference")
train_g.add_arg("random_seed", int, 0, "random seed for training")
train_g.add_arg("save_model_per_batches", int, 10000, "Save the model once per xxxx batch of training")
@@ -64,7 +58,7 @@ train_g.add_arg("valid_model_per_batches", int, 1000, "Do the validation once pe
train_g.add_arg("batch_size", int, 80, "The number of sequences contained in a mini-batch, "
"or the maximum number of tokens (include paddings) contained in a mini-batch.")
train_g.add_arg("epoch", int, 10, "corpus iteration num")
train_g.add_arg("use_gpu", int, -1, "Whether or not to use GPU. -1 means CPU, else GPU id")
train_g.add_arg("use_cuda", bool, False, "If set, use GPU for training.")
train_g.add_arg("traindata_shuffle_buffer", int, 200, "The buffer size used in shuffle the training data.")
train_g.add_arg("base_learning_rate", float, 1e-3, "The basic learning rate that affects the entire network.")
train_g.add_arg("emb_learning_rate", float, 5,
@@ -76,12 +70,6 @@ train_g.add_arg("crf_learning_rate", float, 0.2,
args = parser.parse_args()
# yapf: enable.
if len(args.corpus_proportion_list) != len(args.corpus_type_list):
sys.stderr.write(
"The length of corpus_proportion_list should be equal to the length of corpus_type_list.\n"
)
exit(-1)
print(args)
@@ -217,8 +205,8 @@ def main(args):
# init executor
if args.use_gpu >= 0:
place = fluid.CUDAPlace(args.use_gpu)
if args.use_cuda:
place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
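For reference, a self-contained sketch of how the new `use_cuda` flag typically drives device selection, modeled on the hunk above; the `create_place` helper and the CPU-side `dev_count` fallback are illustrative assumptions, not code from this commit:

```python
import os

import paddle.fluid as fluid

def create_place(use_cuda):
    """Choose the execution place and device count from the use_cuda flag."""
    if use_cuda:
        # FLAGS_selected_gpus indexes into the devices left visible by
        # CUDA_VISIBLE_DEVICES and defaults to the first visible GPU.
        place = fluid.CUDAPlace(int(os.getenv("FLAGS_selected_gpus", "0")))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        # A common CPU-side convention; not shown in the diff above.
        dev_count = int(os.getenv("CPU_NUM", "1"))
    return place, dev_count

place, dev_count = create_place(use_cuda=True)
exe = fluid.Executor(place)
```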