# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module provides utilities for data readers and optimizer definition.
"""

import sys
import time

import numpy as np

import paddle
import paddle.fluid as fluid

import quora_question_pairs


def to_lodtensor(data, place):
    """Convert a list of int sequences to a fluid LoDTensor on the given place."""
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for l in seq_lens:
        cur_len += l
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res
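
# A minimal usage sketch for to_lodtensor (illustrative only, not part of the
# original training pipeline): the level-0 LoD records cumulative sequence
# offsets, so two sequences of lengths 3 and 4 produce lod [[0, 3, 7]].
#
#   place = fluid.CPUPlace()
#   tensor = to_lodtensor([[1, 3, 2], [7, 5, 4, 99]], place)
#   # tensor.lod() == [[0, 3, 7]]; the data is a [7, 1] int64 array.
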
def getOptimizer(global_config):
    """Get an optimizer with exponentially decayed learning rate from config."""
    decayed_lr = fluid.layers.exponential_decay(
        learning_rate=global_config.learning_rate,
        decay_steps=global_config.train_samples_num // global_config.batch_size,
        decay_rate=global_config.lr_decay)
    if global_config.optimizer_type == "adam":
        optimizer = fluid.optimizer.Adam(learning_rate=decayed_lr)
    elif global_config.optimizer_type == "sgd":
        optimizer = fluid.optimizer.SGD(learning_rate=decayed_lr)
    elif global_config.optimizer_type == "adagrad":
        optimizer = fluid.optimizer.Adagrad(learning_rate=decayed_lr)
    else:
        raise RuntimeError(
            "No such optimizer: " + global_config.optimizer_type)
    return optimizer


def get_pretrained_word_embedding(word2vec, word2id, config):
    """Get a pretrained embedding matrix of shape [config.dict_dim, config.emb_dim]."""
    print("preparing pretrained word embedding ...")
    assert config.dict_dim >= len(word2id)
    word2id = sorted(word2id.items(), key=lambda x: x[1])
    words = [x[0] for x in word2id]
    # pad the vocab with empty slots up to dict_dim
    words = words + [''] * (config.dict_dim - len(words))
    pretrained_emb = []
    for word in words:
        if word in word2vec:
            assert len(word2vec[word]) == config.emb_dim
            if config.embedding_norm:
                pretrained_emb.append(
                    word2vec[word] / np.linalg.norm(word2vec[word]))
            else:
                pretrained_emb.append(word2vec[word])
        elif config.OOV_fill == 'uniform':
            pretrained_emb.append(
                np.random.uniform(
                    -0.05, 0.05, size=[config.emb_dim]).astype(np.float32))
        elif config.OOV_fill == 'normal':
            pretrained_emb.append(
                np.random.normal(
                    loc=0.0, scale=0.1,
                    size=[config.emb_dim]).astype(np.float32))
        else:
            print("Unknown OOV fill method: ", config.OOV_fill)
            sys.exit(1)
    word_embedding = np.stack(pretrained_emb)
    return word_embedding


def getDict(data_type="quora_question_pairs"):
    """Get the word2id dict from the quora dataset."""
    print("Generating word dict...")
    if data_type == "quora_question_pairs":
        word_dict = quora_question_pairs.word_dict()
    else:
        raise RuntimeError("No such dataset")
    print("Vocab size: ", len(word_dict))
    return word_dict


def duplicate(reader):
    """
    Duplicate each quora question pair, since there are 2 questions in a sample.
    Input: reader, which yields (question1, question2, label)
    Output: reader, which yields (question1, question2, label)
            and yields (question2, question1, label)
    """

    def duplicated_reader():
        for data in reader():
            (q1, q2, label) = data
            yield (q1, q2, label)
            yield (q2, q1, label)

    return duplicated_reader


def pad(reader, PAD_ID):
    """
    Input: reader, which yields batches of [(question1, question2, label), ...]
    Output: padded_reader, which yields batches of
            [(padded_question1, padded_question2, mask1, mask2, label), ...]
    """
    assert isinstance(PAD_ID, int)

    def padded_reader():
        for batch in reader():
            max_len1 = max([len(data[0]) for data in batch])
            max_len2 = max([len(data[1]) for data in batch])
            padded_batch = []
            for data in batch:
                question1, question2, label = data
                seq_len1 = len(question1)
                seq_len2 = len(question2)
                mask1 = [1] * seq_len1 + [0] * (max_len1 - seq_len1)
                mask2 = [1] * seq_len2 + [0] * (max_len2 - seq_len2)
                padded_question1 = question1 + [PAD_ID] * (max_len1 - seq_len1)
                padded_question2 = question2 + [PAD_ID] * (max_len2 - seq_len2)
                # the last dim of the questions must be 1, as fluid requires
                padded_question1 = [[x] for x in padded_question1]
                padded_question2 = [[x] for x in padded_question2]
                assert len(mask1) == max_len1
                assert len(mask2) == max_len2
                assert len(padded_question1) == max_len1
                assert len(padded_question2) == max_len2
                padded_batch.append(
                    (padded_question1, padded_question2, mask1, mask2, label))
            yield padded_batch

    return padded_reader


def prepare_data(data_type,
                 word_dict,
                 batch_size,
                 buf_size=50000,
                 duplicate_data=False,
                 use_pad=False):
    """Prepare batched train/dev/test readers for the given dataset."""
    PAD_ID = word_dict['']
    if data_type == "quora_question_pairs":
        # train/dev/test readers are batched iterators which yield a batch of
        # (question1, question2, label) each time.
        # question1 and question2 are lists of word IDs; label is 0 or 1.
        # For example: ([1, 3, 2], [7, 5, 4, 99], 1)

        def prepare_reader(reader):
            if duplicate_data:
                reader = duplicate(reader)
            reader = paddle.batch(
                paddle.reader.shuffle(
                    reader, buf_size=buf_size),
                batch_size=batch_size,
                drop_last=False)
            if use_pad:
                reader = pad(reader, PAD_ID=PAD_ID)
            return reader

        train_reader = prepare_reader(quora_question_pairs.train(word_dict))
        dev_reader = prepare_reader(quora_question_pairs.dev(word_dict))
        test_reader = prepare_reader(quora_question_pairs.test(word_dict))
    else:
        raise RuntimeError("No such dataset")

    return train_reader, dev_reader, test_reader
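

# A usage sketch showing how these utilities are assumed to fit together
# (illustrative only; batch_size=128 is an arbitrary choice, not a value
# prescribed by this module):
#
#   word_dict = getDict("quora_question_pairs")
#   train_reader, dev_reader, test_reader = prepare_data(
#       "quora_question_pairs", word_dict, batch_size=128, use_pad=True)
#   for batch in train_reader():
#       # with use_pad=True, each element is
#       # (padded_question1, padded_question2, mask1, mask2, label)
#       pass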