diff --git a/generate_sequence_by_rnn_lm/.gitignore b/generate_sequence_by_rnn_lm/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..203ec9a67426fee99e6228716433bb1bec8ff14f
--- /dev/null
+++ b/generate_sequence_by_rnn_lm/.gitignore
@@ -0,0 +1,3 @@
+*.pyc
+*.tar.gz
+models
diff --git a/language_model/README.md b/generate_sequence_by_rnn_lm/README.md
similarity index 100%
rename from language_model/README.md
rename to generate_sequence_by_rnn_lm/README.md
diff --git a/language_model/beam_search.py b/generate_sequence_by_rnn_lm/beam_search.py
similarity index 93%
rename from language_model/beam_search.py
rename to generate_sequence_by_rnn_lm/beam_search.py
index 59767d4c2a71d96c6fc920963fee502fe95443df..b0bfa2b3c3a814454ad2f847347ead3848d13ec2 100644
--- a/language_model/beam_search.py
+++ b/generate_sequence_by_rnn_lm/beam_search.py
@@ -13,7 +13,7 @@ __all__ = ["BeamSearch"]
 
 class BeamSearch(object):
     """
-    generating sequence by using beam search
+    Generate sequences by beam search.
 
     NOTE: this class only implements generating one sentence at a time.
     """
@@ -21,14 +21,14 @@ class BeamSearch(object):
         """
         constructor method.
 
-        :param inferer: object of paddle.Inference that represent the entire
-                        network to forward compute the test batch.
+        :param inferer: object of paddle.Inference that represents the entire
+                        network to forward compute the test batch
         :type inferer: paddle.Inference
         :param word_dict_file: path of word dictionary file
         :type word_dict_file: str
         :param beam_size: expansion width in each iteration
         :type param beam_size: int
-        :param max_gen_len: the maximum number of iterations.
+        :param max_gen_len: the maximum number of iterations
         :type max_gen_len: int
         """
         self.inferer = inferer
@@ -43,7 +43,7 @@ class BeamSearch(object):
             self.unk_id = next(x[0] for x in self.ids_2_word.iteritems()
                                if x[1] == "<unk>")
         except StopIteration:
-            logger.fatal(("the word dictionay must contains an ending mark "
+            logger.fatal(("the word dictionary must contain an ending mark "
                           "in the text generation task."))
 
         self.candidate_paths = []
@@ -52,7 +52,7 @@ class BeamSearch(object):
     def _top_k(self, softmax_out, k):
         """
        get indices of the words with k highest probablities.
-        NOTE: <unk> will be exclued if it is among the top k words, then word
+        NOTE: <unk> will be excluded if it is among the top k words; the word
              with (k + 1)th highest probability will be returned.
 
        :param softmax_out: probablity over the dictionary
@@ -71,7 +71,7 @@ class BeamSearch(object):
 
         :params batch: the input data batch
         :type batch: list
-        :return: probalities of the predicted word
+        :return: probabilities of the predicted word
         :rtype: ndarray
         """
         return self.inferer.infer(input=batch, field=["value"])
diff --git a/language_model/config.py b/generate_sequence_by_rnn_lm/config.py
similarity index 100%
rename from language_model/config.py
rename to generate_sequence_by_rnn_lm/config.py
diff --git a/language_model/data/train_data_examples.txt b/generate_sequence_by_rnn_lm/data/train_data_examples.txt
similarity index 100%
rename from language_model/data/train_data_examples.txt
rename to generate_sequence_by_rnn_lm/data/train_data_examples.txt
diff --git a/language_model/generate.py b/generate_sequence_by_rnn_lm/generate.py
similarity index 100%
rename from language_model/generate.py
rename to generate_sequence_by_rnn_lm/generate.py
diff --git a/language_model/images/ngram.png b/generate_sequence_by_rnn_lm/images/ngram.png
similarity index 100%
rename from language_model/images/ngram.png
rename to generate_sequence_by_rnn_lm/images/ngram.png
diff --git a/language_model/images/rnn.png b/generate_sequence_by_rnn_lm/images/rnn.png
similarity index 100%
rename from language_model/images/rnn.png
rename to generate_sequence_by_rnn_lm/images/rnn.png
diff --git a/language_model/index.html b/generate_sequence_by_rnn_lm/index.html
similarity index 100%
rename from language_model/index.html
rename to generate_sequence_by_rnn_lm/index.html
diff --git a/language_model/network_conf.py b/generate_sequence_by_rnn_lm/network_conf.py
similarity index 81%
rename from language_model/network_conf.py
rename to generate_sequence_by_rnn_lm/network_conf.py
index 0a9be317dc0f7c6c5a77ea83bd689e00dfba903a..7306337bf7515ddfd4df137c3ee81f8aa4fa7b90 100644
--- a/language_model/network_conf.py
+++ b/generate_sequence_by_rnn_lm/network_conf.py
@@ -12,12 +12,18 @@ def rnn_lm(vocab_dim,
     """
     RNN language model definition.
 
-    :param vocab_dim: size of vocab.
-    :param emb_dim: embedding vector's dimension.
+    :param vocab_dim: size of the vocabulary
+    :type vocab_dim: int
+    :param emb_dim: dimension of the embedding vector
+    :type emb_dim: int
     :param rnn_type: the type of RNN cell.
-    :param hidden_size: number of unit.
-    :param stacked_rnn_num: layer number.
+    :type rnn_type: str
+    :param hidden_size: number of hidden units
+    :type hidden_size: int
+    :param stacked_rnn_num: number of stacked RNN cells
+    :type stacked_rnn_num: int
     :return: cost and output layer of model.
+    :rtype: LayerOutput
     """
 
     # input layers
diff --git a/language_model/reader.py b/generate_sequence_by_rnn_lm/reader.py
similarity index 100%
rename from language_model/reader.py
rename to generate_sequence_by_rnn_lm/reader.py
diff --git a/language_model/train.py b/generate_sequence_by_rnn_lm/train.py
similarity index 92%
rename from language_model/train.py
rename to generate_sequence_by_rnn_lm/train.py
index da83da15d27ccc0a5ea58420690cd436e5784371..2958592748c0fe982972717017dede11c04ebb7e 100644
--- a/language_model/train.py
+++ b/generate_sequence_by_rnn_lm/train.py
@@ -20,12 +20,16 @@ def train(topology,
     """
     train model.
 
-    :param model_cost: cost layer of the model to train.
+    :param topology: cost layer of the model to train.
+    :type topology: LayerOutput
     :param train_reader: train data reader.
+    :type train_reader: collections.Iterable
     :param test_reader: test data reader.
-    :param model_file_name_prefix: model's prefix name.
-    :param num_passes: epoch.
-    :return:
+    :type test_reader: collections.Iterable
+    :param model_save_dir: path to save the trained model
+    :type model_save_dir: str
+    :param num_passes: number of training epochs
+    :type num_passes: int
     """
     if not os.path.exists(model_save_dir):
         os.mkdir(model_save_dir)
diff --git a/language_model/utils.py b/generate_sequence_by_rnn_lm/utils.py
similarity index 70%
rename from language_model/utils.py
rename to generate_sequence_by_rnn_lm/utils.py
index 8e3152e8bffebefa60f7ebfb1eec5c73a2b86e6c..179b68f8234caa776c03729aff7bfab22b8e5592 100644
--- a/language_model/utils.py
+++ b/generate_sequence_by_rnn_lm/utils.py
@@ -17,14 +17,19 @@ def build_dict(data_file,
                insert_extra_words=["<unk>", "<e>"]):
     """
     :param data_file: path of data file
+    :type data_file: str
     :param save_path: path to save the word dictionary
+    :type save_path: str
     :param vocab_max_size: if vocab_max_size is set, top vocab_max_size words
         will be added into word vocabulary
+    :type vocab_max_size: int
     :param cutoff_thd: if cutoff_thd is set, words whose frequencies are less
-        than cutoff_thd will not added into word vocabulary.
+        than cutoff_thd will not be added into word vocabulary.
         NOTE that: vocab_max_size and cutoff_thd cannot be set at the same time
+    :type cutoff_thd: int
     :param extra_keys: extra keys defined by users that added into the word
-        dictionary, ususally these keys includes <unk>, start and ending marks
+        dictionary, usually these keys include <unk>, start and ending marks
+    :type extra_keys: list
     """
     word_count = defaultdict(int)
     with open(data_file, "r") as f:
@@ -53,12 +58,29 @@ def build_dict(data_file,
 
 def load_dict(dict_path):
     """
+    load the word dictionary from the given file. Each line of the given
+    file is a word in the word dictionary. The first column of the line,
+    separated by TAB, is the key, while the line index is the value.
+
+    :param dict_path: path of word dictionary
+    :type dict_path: str
+    :return: the dictionary
+    :rtype: dict
     """
     return dict((line.strip().split("\t")[0], idx)
                 for idx, line in enumerate(open(dict_path, "r").readlines()))
 
 
 def load_reverse_dict(dict_path):
+    """
+    load the word dictionary from the given file. Each line of the given
+    file is a word in the word dictionary. The line index is the key, while
+    the first column of the line, separated by TAB, is the value.
+
+    :param dict_path: path of word dictionary
+    :type dict_path: str
+    :return: the dictionary
+    :rtype: dict
+    """
     return dict((idx, line.strip().split("\t")[0])
                 for idx, line in enumerate(open(dict_path, "r").readlines()))
 
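A minimal usage sketch of the dictionary helpers documented above (not part of the patch): it assumes the renamed module generate_sequence_by_rnn_lm/utils.py is importable as utils, and the file name tiny_dict.txt and its contents are hypothetical. Each line of the dictionary file carries the word in its first TAB-separated column; any further columns (a toy frequency here) are ignored by the loaders.

    # sketch only: exercise load_dict()/load_reverse_dict() on a toy dictionary
    from utils import load_dict, load_reverse_dict

    # hypothetical dictionary file, one "word<TAB>frequency" entry per line
    with open("tiny_dict.txt", "w") as f:
        f.write("<unk>\t128\n")
        f.write("the\t97\n")
        f.write("<e>\t42\n")

    word_2_ids = load_dict("tiny_dict.txt")          # {"<unk>": 0, "the": 1, "<e>": 2}
    ids_2_word = load_reverse_dict("tiny_dict.txt")  # {0: "<unk>", 1: "the", 2: "<e>"}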