# -*- coding: utf-8 -*-
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for parsing PTB text files."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys
import io
from collections import Counter

import numpy as np

Py3 = sys.version_info[0] == 3

if Py3:
    line_tok = '\n'
else:
    line_tok = u'\n'

PAD_ID = 0
BOS_ID = 1
EOS_ID = 2
UNK_ID = 3


def _read_words(filename):
    # io.open with an explicit encoding already returns unicode text on
    # both Python 2 and Python 3, so no extra decode step is needed.
    with io.open(filename, "r", encoding='utf-8') as f:
        return f.read().replace(u"\n", u"").split()


def read_all_line(filename):
    data = []
    with io.open(filename, "r", encoding='utf-8') as f:
        for line in f.readlines():
            data.append(line.strip())
    return data


def _vocab(vocab_file, train_file, max_vocab_cnt):
    lines = read_all_line(train_file)
    all_words = []
    for line in lines:
        all_words.extend(line.split())
    vocab_count = Counter(all_words).most_common()
    # A negative max_vocab_cnt means "keep the full vocabulary"; a bare
    # slice with -1 would silently drop the least frequent word.
    if max_vocab_cnt < 0:
        max_vocab_cnt = len(vocab_count)
    with io.open(vocab_file, "w", encoding='utf-8') as f:
        for voc, fre in vocab_count[:max_vocab_cnt]:
            f.write(voc)
            f.write(line_tok)


def _build_vocab(vocab_file, train_file=None, max_vocab_cnt=-1):
    if not os.path.exists(vocab_file):
        _vocab(vocab_file, train_file, max_vocab_cnt)
    # Reserve the first four ids for the special tokens so they line up
    # with PAD_ID, BOS_ID, EOS_ID and UNK_ID defined above.
    vocab_dict = {
        "<pad>": PAD_ID,
        "<bos>": BOS_ID,
        "<eos>": EOS_ID,
        "<unk>": UNK_ID
    }
    ids = 4
    with io.open(vocab_file, "r", encoding='utf-8') as f:
        for line in f.readlines():
            vocab_dict[line.strip()] = ids
            ids += 1

    print("vocab word num", ids)

    return vocab_dict


def _para_file_to_ids(src_file, src_vocab):
    src_data = []
    with io.open(src_file, "r", encoding='utf-8') as f_src:
        for line in f_src.readlines():
            words = line.strip().split()
            ids = [BOS_ID]
            ids.extend(
                [src_vocab[w] if w in src_vocab else UNK_ID for w in words])
            ids.append(EOS_ID)
            src_data.append(ids)
    return src_data


def filter_len(src, max_sequence_len=128):
    # Truncate any sentence longer than max_sequence_len tokens.
    new_src = []
    for ids in src:
        if len(ids) > max_sequence_len:
            ids = ids[:max_sequence_len]
        new_src.append(ids)
    return new_src


def raw_data(dataset_prefix, max_sequence_len=50, max_vocab_cnt=-1):
    src_vocab_file = dataset_prefix + ".vocab.txt"
    src_train_file = dataset_prefix + ".train.txt"
    src_eval_file = dataset_prefix + ".valid.txt"
    src_test_file = dataset_prefix + ".test.txt"

    src_vocab = _build_vocab(src_vocab_file, src_train_file, max_vocab_cnt)

    train_src = _para_file_to_ids(src_train_file, src_vocab)
    train_src = filter_len(train_src, max_sequence_len=max_sequence_len)
    eval_src = _para_file_to_ids(src_eval_file, src_vocab)
    test_src = _para_file_to_ids(src_test_file, src_vocab)

    return train_src, eval_src, test_src, src_vocab


def get_vocab(dataset_prefix, max_sequence_len=50):
    src_vocab_file = dataset_prefix + ".vocab.txt"
    src_vocab = _build_vocab(src_vocab_file)
    rev_vocab = {}
    for key, value in src_vocab.items():
        rev_vocab[value] = key
    return src_vocab, rev_vocab
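
# The helper below is an illustrative sketch, not part of the original
# module: it shows how the rev_vocab mapping returned by get_vocab can
# turn an id sequence produced by _para_file_to_ids back into text.
# The name ids_to_text is hypothetical.
def ids_to_text(ids, rev_vocab):
    # Drop the special pad/bos/eos markers and look up each remaining
    # id, falling back to "<unk>" for anything outside the vocabulary.
    words = [
        rev_vocab.get(i, u"<unk>") for i in ids
        if i not in (PAD_ID, BOS_ID, EOS_ID)
    ]
    return u" ".join(words)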
def raw_mono_data(vocab_file, file_path):
    src_vocab = _build_vocab(vocab_file)
    # Monolingual data: the source and target sides are read from the
    # same file with the same vocabulary.
    test_src = _para_file_to_ids(file_path, src_vocab)
    test_tar = _para_file_to_ids(file_path, src_vocab)
    return (test_src, test_tar)


def get_data_iter(raw_data,
                  batch_size,
                  sort_cache=False,
                  cache_num=1,
                  mode='train',
                  enable_ce=False):
    src_data = raw_data
    data_len = len(src_data)
    index = np.arange(data_len)
    if mode == "train" and not enable_ce:
        np.random.shuffle(index)

    def to_pad_np(data):
        max_len = 0
        for ele in data:
            if len(ele) > max_len:
                max_len = len(ele)

        # Size the arrays to the actual batch, so a final partial batch
        # does not carry empty all-PAD rows.
        ids = np.ones((len(data), max_len), dtype='int64') * PAD_ID
        mask = np.zeros((len(data)), dtype='int32')

        for i, ele in enumerate(data):
            ids[i, :len(ele)] = ele
            mask[i] = len(ele)

        return ids, mask

    b_src = []

    if mode != "train":
        cache_num = 1
    for j in range(data_len):
        if len(b_src) == batch_size * cache_num:
            # Optionally sort the cached sentences by length so that
            # each batch needs less padding.
            if sort_cache:
                new_cache = sorted(b_src, key=lambda k: len(k))
            else:
                new_cache = b_src

            for i in range(cache_num):
                batch_data = new_cache[i * batch_size:(i + 1) * batch_size]
                src_ids, src_mask = to_pad_np(batch_data)
                yield (src_ids, src_mask)

            b_src = []

        b_src.append(src_data[index[j]])
    # Flush the final, possibly partial, cache.
    if len(b_src) > 0:
        if sort_cache:
            new_cache = sorted(b_src, key=lambda k: len(k))
        else:
            new_cache = b_src
        for i in range(0, len(b_src), batch_size):
            batch_data = new_cache[i:i + batch_size]
            src_ids, src_mask = to_pad_np(batch_data)
            yield (src_ids, src_mask)
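

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. It assumes
    # PTB-style files named <prefix>.train.txt / .valid.txt / .test.txt
    # and <prefix>.vocab.txt; the prefix "data/ptb" is a hypothetical
    # path, and batch_size / cache_num values are illustrative.
    train_src, eval_src, test_src, vocab = raw_data(
        "data/ptb", max_sequence_len=50)
    for src_ids, src_mask in get_data_iter(
            train_src, batch_size=32, sort_cache=True, cache_num=20):
        # src_ids: int64 array [batch, max_len] padded with PAD_ID;
        # src_mask: int32 array [batch] holding true sentence lengths.
        print(src_ids.shape, src_mask.shape)
        break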