Revert "Add build_raw_data.py" (#1739)

44786e9c · qingqing01 · GitHub · ef06ad53 · ef06ad53
隐藏空白更改
内联并排

Showing with 0 addition and 62 deletion

fluid/PaddleNLP/text_classification/async_executor/data_generator/build_raw_data.py ...ification/async_executor/data_generator/build_raw_data.py +0 -62

未找到文件。
--- a/fluid/PaddleNLP/text_classification/async_executor/data_generator/build_raw_data.py
+++ b/fluid/PaddleNLP/text_classification/async_executor/data_generator/build_raw_data.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Build raw data
-"""
-from __future__ import print_function
-import sys
-import os
-import random
-import re
-data_type = sys.argv[1]
-
-if not (data_type == "train" or data_type == "test"):
-    print("python %s [test/train]" % sys.argv[0], file=sys.stderr)
-    sys.exit(-1)
-
-pos_folder = "aclImdb/" + data_type + "/pos/"
-neg_folder = "aclImdb/" + data_type + "/neg/"
-
-pos_train_list = [(pos_folder + x, "1") for x in os.listdir(pos_folder)]
-neg_train_list = [(neg_folder + x, "0") for x in os.listdir(neg_folder)]
-
-all_train_list = pos_train_list + neg_train_list
-random.shuffle(all_train_list)
-
-
-def load_dict(dictfile):
-    """
-    Load word id dict
-    """
-    vocab = {}
-    wid = 0
-    with open(dictfile) as f:
-        for line in f:
-            vocab[line.strip()] = str(wid)
-            wid += 1
-    return vocab
-
-
-vocab = load_dict("aclImdb/imdb.vocab")
-unk_id = str(len(vocab))
-print("vocab size: ", len(vocab), file=sys.stderr)
-pattern = re.compile(r'(;|,|\.|\?|!|\s|\(|\))')
-
-for fitem in all_train_list:
-    label = str(fitem[1])
-    fname = fitem[0]
-    with open(fname) as f:
-        sent = f.readline().lower().replace("<br />", " ").strip()
-        out_s = "%s | %s" % (sent, label)
-        print(out_s, file=sys.stdout)