#!/usr/bin/env python # -*- coding: utf-8 -*- """ # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ from __future__ import unicode_literals import sys import os import random import paddle import logging import paddle.fluid as fluid import numpy as np import collections import six import codecs try: import configparser as cp except ImportError: import ConfigParser as cp random_seed = 7 logger = logging.getLogger() format = "%(asctime)s - %(name)s - %(levelname)s -%(filename)s-%(lineno)4d -%(message)s" # format = "%(levelname)8s: %(asctime)s: %(filename)s:%(lineno)4d %(message)s" logging.basicConfig(format=format) logger.setLevel(logging.INFO) logger = logging.getLogger('Paddle-DDC') def str2bool(v): """[ because argparse does not support to parse "true, False" as python boolean directly] Arguments: v {[type]} -- [description] Returns: [type] -- [description] """ return v.lower() in ("true", "t", "1") def to_lodtensor(data, place): """ convert ot LODtensor """ seq_lens = [len(seq) for seq in data] cur_len = 0 lod = [cur_len] for l in seq_lens: cur_len += l lod.append(cur_len) flattened_data = np.concatenate(data, axis=0).astype("int64") flattened_data = flattened_data.reshape([len(flattened_data), 1]) res = fluid.LoDTensor() res.set(flattened_data, place) res.set_lod([lod]) return res class ArgumentGroup(object): """[ArgumentGroup] Arguments: object {[type]} -- [description] """ def __init__(self, parser, title, des): self._group = parser.add_argument_group(title=title, description=des) def add_arg(self, name, type, default, help, **kwargs): """[add_arg] Arguments: name {[type]} -- [description] type {[type]} -- [description] default {[type]} -- [description] help {[type]} -- [description] """ type = str2bool if type == bool else type self._group.add_argument( "--" + name, default=default, type=type, help=help + ' Default: %(default)s.', **kwargs) class DataReader(object): """[get data generator for dataset] Arguments: object {[type]} -- [description] Returns: [type] -- [description] """ def __init__(self, char_vocab, intent_dict, max_len): self._char_vocab = char_vocab self._intent_dict = intent_dict self._oov_id = 0 self.intent_size = len(intent_dict) self.all_data = [] self.max_len = max_len self.padding_id = 0 def _get_num_examples(self): return len(self.all_data) def prepare_data(self, data_path, batch_size, mode): """ prepare data """ # print word_dict_path # assert os.path.exists( # word_dict_path), "The given word dictionary dose not exist." assert os.path.exists(data_path), "The given data file does not exist." if mode == "train": train_reader = fluid.io.batch( paddle.reader.shuffle( self.data_reader( data_path, self.max_len, shuffle=True), buf_size=batch_size * 100), batch_size) # train_reader = fluid.io.batch(self.data_reader(data_path), batch_size) return train_reader else: test_reader = fluid.io.batch( self.data_reader(data_path, self.max_len), batch_size) return test_reader def data_reader(self, file_path, max_len, shuffle=False): """ Convert query into id list use fixed voc """ for line in codecs.open(file_path, "r", encoding="utf8"): line = line.strip() if isinstance(line, six.binary_type): line = line.decode("utf8", errors="ignore") query, intent = line.split("\t") char_id_list = list(map(lambda x: 0 if x not in self._char_vocab else int(self._char_vocab[x]), \ list(query))) if len(char_id_list) < max_len: char_id_list.extend([self.padding_id] * (max_len - len(char_id_list))) char_id_list = char_id_list[:max_len] intent_id_list = [self.padding_id] * self.intent_size for item in intent.split('\2'): intent_id_list[int(self._intent_dict[item])] = 1 self.all_data.append([char_id_list, intent_id_list]) if shuffle: random.seed(random_seed) random.shuffle(self.all_data) def reader(): """ reader """ for char_id_list, intent_id_list in self.all_data: # print char_id_list, intent_id yield char_id_list, intent_id_list return reader class DataProcesser(object): """[file process methods] Arguments: object {[type]} -- [description] Returns: [type] -- [description] """ @staticmethod def read_dict(filename): """ read_dict: key\2value """ res_dict = {} for line in codecs.open(filename, encoding="utf8"): try: if isinstance(line, six.binary_type): line = line.strip().decode("utf8") line = line.strip() key, value = line.strip().split("\2") res_dict[key] = value except Exception as err: logger.error(str(err)) logger.error("read dict[%s] failed" % filename) return res_dict @staticmethod def build_dict(filename, save_dir, min_num_char=2, min_num_intent=2): """[build_dict from file] Arguments: filename {[type]} -- [description] save_dir {[type]} -- [description] Keyword Arguments: min_num_char {int} -- [description] (default: {2}) min_num_intent {int} -- [description] (default: {2}) """ char_dict = {} intent_dict = {} # readfile for line in codecs.open(filename): line = line.strip() if isinstance(line, six.binary_type): line = line.strip().decode("utf8", errors="ignore") query, intents = line.split("\t") # read query for char_item in list(query): if char_item not in char_dict: char_dict[char_item] = 0 char_dict[char_item] += 1 # read intents for intent in intents.split('\002'): if intent not in intent_dict: intent_dict[intent] = 0 intent_dict[intent] += 1 # save char dict with codecs.open( "%s/char.dict" % save_dir, "w", encoding="utf8") as f_out: f_out.write("PAD\0020\n") f_out.write("OOV\0021\n") char_id = 2 for key, value in char_dict.items(): if value >= min_num_char: if isinstance(key, six.binary_type): key = key.encode("utf8") f_out.write("%s\002%d\n" % (key, char_id)) char_id += 1 # save intent dict with codecs.open( "%s/domain.dict" % save_dir, "w", encoding="utf8") as f_out: f_out.write("SYS_OTHER\0020\n") intent_id = 1 for key, value in intent_dict.items(): if value >= min_num_intent and key != u'SYS_OTHER': if isinstance(key, six.binary_type): key = key.encode("utf8") f_out.write("%s\002%d\n" % (key, intent_id)) intent_id += 1 class ConfigReader(object): """[read model config file] Arguments: object {[type]} -- [description] Returns: [type] -- [description] """ @staticmethod def read_conf(conf_file): """[read_conf] Arguments: conf_file {[type]} -- [description] Returns: [type] -- [description] """ flow_data = collections.defaultdict(lambda: {}) class2key = set(["model"]) param_conf = cp.ConfigParser() param_conf.read(conf_file) for section in param_conf.sections(): if section not in class2key: continue for option in param_conf.items(section): flow_data[section][option[0]] = eval(option[1]) return flow_data def init_checkpoint(exe, init_checkpoint_path, main_program): """ Init CheckPoint """ fluid.load(main_program, init_checkpoint_path, exe) print("Load model from {}".format(init_checkpoint_path)) def print_arguments(args): """ Print Arguments """ print('----------- Configuration Arguments -----------') for arg, value in sorted(six.iteritems(vars(args))): print('%s: %s' % (arg, value)) print('------------------------------------------------') def check_version(version='1.6.0'): """ Log error and exit when the installed version of paddlepaddle is not satisfied. """ err = "PaddlePaddle version 1.6 or higher is required, " \ "or a suitable develop version is satisfied as well. \n" \ "Please make sure the version is good with your code." \ try: fluid.require_version(version) except Exception as e: logger.error(err) sys.exit(1)