diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4efc176ff712c7d212dc0bec9a7d1aefc9d86d7f..5ed1f4c4beb6eae16d319b25ec9959b61fe3fbc3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -25,3 +25,11 @@
     files: \.md$
 - id: remove-tabs
     files: \.md$
+- repo: local
+  hooks:
+  - id: convert-markdown-into-html
+    name: convert-markdown-into-html
+    description: Convert README.md into index.html
+    entry: python .pre-commit-hooks/convert_markdown_into_html.py
+    language: system
+    files: .+README\.md$
diff --git a/.pre-commit-hooks/convert_markdown_into_html.py b/.pre-commit-hooks/convert_markdown_into_html.py
new file mode 100644
index 0000000000000000000000000000000000000000..66f44ef23c5d9a82436dfbe4b6bcdfc4e69ab55a
--- /dev/null
+++ b/.pre-commit-hooks/convert_markdown_into_html.py
@@ -0,0 +1,95 @@
+import argparse
+import re
+import sys
+
+# HTML wrapper emitted around the raw markdown so it renders client-side.
+# The actual markup of both templates did not survive extraction and is
+# elided here.
+HEAD = """
+[HTML page header template elided]
+"""
+
+TAIL = """
+[HTML page footer template elided]
+"""
+
+
+def convert_markdown_into_html(argv=None):
+    parser = argparse.ArgumentParser()
+    parser.add_argument('filenames', nargs='*', help='Filenames to fix')
+    args = parser.parse_args(argv)
+
+    retv = 0
+
+    for filename in args.filenames:
+        # README.md -> index.html, with the markdown wrapped between HEAD and TAIL
+        with open(
+                re.sub(r"README", "index", re.sub(r"\.md$", ".html", filename)),
+                "w") as output:
+            output.write(HEAD)
+            with open(filename) as input:
+                for line in input:
+                    output.write(line)
+            output.write(TAIL)
+
+    return retv
+
+
+if __name__ == '__main__':
+    sys.exit(convert_markdown_into_html())
diff --git a/.travis.yml b/.travis.yml
index 42c07b39c9a2e95c905144d2cfdec8e2100e49ef..f069c12016c0bc0cac26bf028e4f8fd025b2842a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,6 +2,8 @@ language: cpp
 cache: ccache
 sudo: required
 dist: trusty
+services:
+  - docker
 os:
   - linux
 env:
@@ -16,8 +18,13 @@ addons:
     - python2.7-dev
 before_install:
   - pip install -U virtualenv pre-commit pip
+  - docker pull paddlepaddle/paddle:latest
 script:
   - .travis/precommit.sh
+  - docker run -i --rm -v "$PWD:/py_unittest" paddlepaddle/paddle:latest /bin/bash -c
+    "cd /py_unittest && find . -name 'tests' -type d -print0 | xargs -0 -I{} -n1 bash -c 'cd {};
+    python -m unittest discover -v'"
+
 notifications:
   email:
     on_success: change
diff --git a/ctr/index.html b/ctr/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..ff0c5d9b19ec046b61f7f38d6eb9e70dff33e1ec
--- /dev/null
+++ b/ctr/index.html
@@ -0,0 +1,300 @@
+[Generated HTML wrapper around ctr/README.md; markup elided]
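Reviewer note: for anyone who wants to exercise the new hook locally, a minimal sketch of how pre-commit drives it (assuming the `.pre-commit-hooks` directory is put on `sys.path`; the README path is a stand-in):

```python
import sys
sys.path.insert(0, ".pre-commit-hooks")
from convert_markdown_into_html import convert_markdown_into_html

# pre-commit passes the matched filenames as argv; exit code 0 means success.
exit_code = convert_markdown_into_html(["ctr/README.md"])
print(exit_code)  # ctr/index.html now wraps the raw markdown between HEAD and TAIL
```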
diff --git a/deep_speech_2/README.md b/deep_speech_2/README.md
index bb1815c0087064730818f56150445bacc0919cd4..7a372e9bed262d2ee5bc8640a0f480b9ce34cd34 100644
--- a/deep_speech_2/README.md
+++ b/deep_speech_2/README.md
@@ -18,9 +18,14 @@ For some machines, we also need to install libsndfile1. Details to be added.
 ```
 cd data
 python librispeech.py
+cat manifest.libri.train-* > manifest.libri.train-all
 cd ..
 ```

+After running librispeech.py, we will have several "manifest" json files whose names share the prefix `manifest.libri.`. A manifest file summarizes a speech data set, with each line containing the meta data (i.e. audio filepath, transcription text, audio duration) of one audio file in the data set, in json format.
+
+With `cat manifest.libri.train-* > manifest.libri.train-all`, we simply merge the three separate training subsets of LibriSpeech (train-clean-100, train-clean-360, train-other-500) into one training set. This is a simple way to merge different data sets.
+
 More help for arguments:

 ```
@@ -32,13 +37,13 @@ python librispeech.py --help
 For GPU Training:

 ```
-CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4
+CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4 --train_manifest_path ./data/manifest.libri.train-all
 ```

 For CPU Training:

 ```
-python train.py --trainer_count 8 --use_gpu False
+python train.py --trainer_count 8 --use_gpu False --train_manifest_path ./data/manifest.libri.train-all
 ```

 More help for arguments:
diff --git a/deep_speech_2/audio_data_utils.py b/deep_speech_2/audio_data_utils.py
index c717bcf182811d1b043bf0e83e2e31209be18e46..1cd29be114a416636db8c2d7e888d0d8d6c2a8a8 100644
--- a/deep_speech_2/audio_data_utils.py
+++ b/deep_speech_2/audio_data_utils.py
@@ -8,6 +8,7 @@ import json
 import random
 import soundfile
 import numpy as np
+import itertools
 import os

 RANDOM_SEED = 0
@@ -62,6 +63,7 @@ class DataGenerator(object):
         self.__stride_ms__ = stride_ms
         self.__window_ms__ = window_ms
         self.__max_frequency__ = max_frequency
+        self.__epoch__ = 0
         self.__random__ = random.Random(RANDOM_SEED)
         # load vocabulary (dictionary)
         self.__vocab_dict__, self.__vocab_list__ = \
@@ -245,10 +247,42 @@ class DataGenerator(object):
             new_batch.append((padded_audio, text))
         return new_batch

-    def instance_reader_creator(self,
-                                manifest_path,
-                                sort_by_duration=True,
-                                shuffle=False):
+    def __batch_shuffle__(self, manifest, batch_size):
+        """
+        The instances have different lengths and cannot be combined into a
+        single matrix multiplication. Training therefore usually sorts the
+        examples by length, combines only similarly-sized instances into
+        minibatches, and pads with silence where necessary so that all
+        instances in a batch have the same length. This batch shuffle
+        function builds minibatches of similarly-sized instances and
+        shuffles them batch-wise:
+
+        1. Sort the audio clips by duration.
+        2. Generate a random number `k`, k in [0, batch_size).
+        3. Shift the sorted list by `k` instances (the first `k` are moved
+           to the end) so that batch boundaries differ between epochs, then
+           group the rest into minibatches of size batch_size.
+        4. Shuffle the minibatches.
+
+        :param manifest: Manifest contents (a list of parsed manifest lines).
+        :type manifest: list
+        :param batch_size: Batch size. This size is also used to generate
+                           the random shift for batch shuffle.
+        :type batch_size: int
+        :return: Batch shuffled manifest.
+        :rtype: list
+        """
+        manifest.sort(key=lambda x: x["duration"])
+        shift_len = self.__random__.randint(0, batch_size - 1)
+        batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size)
+        self.__random__.shuffle(batch_manifest)
+        batch_manifest = list(sum(batch_manifest, ()))
+        res_len = len(manifest) - shift_len - len(batch_manifest)
+        # slice from the front so that res_len == 0 appends nothing
+        batch_manifest.extend(manifest[len(manifest) - res_len:])
+        batch_manifest.extend(manifest[0:shift_len])
+        return batch_manifest
+
+    def instance_reader_creator(self, manifest):
         """
         Instance reader creator for audio data. Creat a callable function to
         produce instances of data.

         Instance: a tuple of a numpy ndarray of audio spectrogram and a list
         of tokenized and indexed transcription text.

-        :param manifest_path: Filepath of manifest for audio clip files.
-        :type manifest_path: basestring
-        :param sort_by_duration: Sort the audio clips by duration if set True
-                                 (for SortaGrad).
-        :type sort_by_duration: bool
-        :param shuffle: Shuffle the audio clips if set True.
-        :type shuffle: bool
+        :param manifest: Manifest contents (a list of parsed manifest lines).
+        :type manifest: list
         :return: Data reader function.
         :rtype: callable
         """
-        if sort_by_duration and shuffle:
-            sort_by_duration = False
-            logger.warn("When shuffle set to true, "
-                        "sort_by_duration is forced to set False.")

         def reader():
-            # read manifest
-            manifest = self.__read_manifest__(
-                manifest_path=manifest_path,
-                max_duration=self.__max_duration__,
-                min_duration=self.__min_duration__)
-            # sort (by duration) or shuffle manifest
-            if sort_by_duration:
-                manifest.sort(key=lambda x: x["duration"])
-            if shuffle:
-                self.__random__.shuffle(manifest)
             # extract spectrogram feature
             for instance in manifest:
                 spectrogram = self.__audio_featurize__(
@@ -296,8 +311,8 @@ class DataGenerator(object):
                              batch_size,
                              padding_to=-1,
                              flatten=False,
-                             sort_by_duration=True,
-                             shuffle=False):
+                             sortagrad=False,
+                             batch_shuffle=False):
         """
         Batch data reader creator for audio data. Creat a callable function to
         produce batches of data.
@@ -317,20 +332,32 @@ class DataGenerator(object):
         :param flatten: If set True, audio data will be flatten to be a 1-dim
                         ndarray. Otherwise, 2-dim ndarray. Default is False.
         :type flatten: bool
-        :param sort_by_duration: Sort the audio clips by duration if set True
-                                 (for SortaGrad).
-        :type sort_by_duration: bool
-        :param shuffle: Shuffle the audio clips if set True.
-        :type shuffle: bool
+        :param sortagrad: Sort the audio clips by duration in the first epoch
+                          if set True.
+        :type sortagrad: bool
+        :param batch_shuffle: Shuffle the audio clips if set True. It is
+                              not a thorough instance-wise shuffle, but a
+                              specific batch-wise shuffle. For more details,
+                              please see the `__batch_shuffle__` function.
+        :type batch_shuffle: bool
         :return: Batch reader function, producing batches of data when called.
         :rtype: callable
         """

         def batch_reader():
-            instance_reader = self.instance_reader_creator(
+            # read manifest
+            manifest = self.__read_manifest__(
                 manifest_path=manifest_path,
-                sort_by_duration=sort_by_duration,
-                shuffle=shuffle)
+                max_duration=self.__max_duration__,
+                min_duration=self.__min_duration__)
+
+            # sort (by duration) or batch-wise shuffle the manifest
+            if self.__epoch__ == 0 and sortagrad:
+                manifest.sort(key=lambda x: x["duration"])
+            elif batch_shuffle:
+                manifest = self.__batch_shuffle__(manifest, batch_size)
+
+            instance_reader = self.instance_reader_creator(manifest)
             batch = []
             for instance in instance_reader():
                 batch.append(instance)
@@ -339,6 +366,7 @@ class DataGenerator(object):
                 batch = []
             if len(batch) > 0:
                 yield self.__padding_batch__(batch, padding_to, flatten)
+            self.__epoch__ += 1

         return batch_reader
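Reviewer note: the four steps in `__batch_shuffle__`'s docstring are easier to see on a toy manifest. A standalone sketch of the same scheme (plain Python, no Paddle dependency; the durations are made up):

```python
import random

def batch_shuffle(manifest, batch_size, rng=None):
    # Same scheme as __batch_shuffle__ above: sort by duration, shift by a
    # random k < batch_size so batch boundaries differ between epochs,
    # shuffle whole batches, then re-attach the leftover instances.
    rng = rng or random.Random(0)
    manifest = sorted(manifest, key=lambda x: x["duration"])
    k = rng.randint(0, batch_size - 1)
    batches = list(zip(*[iter(manifest[k:])] * batch_size))
    rng.shuffle(batches)
    shuffled = [instance for batch in batches for instance in batch]
    res_len = len(manifest) - k - len(shuffled)
    shuffled.extend(manifest[len(manifest) - res_len:])  # tail short of a batch
    shuffled.extend(manifest[:k])                        # the shifted-off head
    return shuffled

toy = [{"duration": d} for d in [3.2, 1.1, 2.4, 0.9, 5.0, 1.7, 2.9]]
print([m["duration"] for m in batch_shuffle(toy, batch_size=2)])
```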
diff --git a/deep_speech_2/data/librispeech.py b/deep_speech_2/data/librispeech.py
index 838fee59786d244ccd0e9ea487911791c52c7cda..653caa9267b62aa8415a26be2143de874bb15e88 100644
--- a/deep_speech_2/data/librispeech.py
+++ b/deep_speech_2/data/librispeech.py
@@ -1,13 +1,14 @@
 """
-    Download, unpack and create manifest for Librespeech dataset.
+    Download, unpack and create manifest json files for the LibriSpeech dataset.

-    Manifest is a json file with each line containing one audio clip filepath,
-    its transcription text string, and its duration. It servers as a unified
-    interfance to organize different data sets.
+    A manifest is a json file summarizing the filelist of a data set, with each
+    line containing the meta data (i.e. audio filepath, transcription text,
+    audio duration) of one audio file in the data set.
 """

 import paddle.v2 as paddle
 from paddle.v2.dataset.common import md5file
+import distutils.util
 import os
 import wget
 import tarfile
@@ -27,7 +28,9 @@ URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz"
 URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz"

 MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
+MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135"
 MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
+MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931"
 MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
 MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
 MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
@@ -44,6 +47,13 @@ parser.add_argument(
     default="manifest.libri",
     type=str,
     help="Filepath prefix for output manifests. (default: %(default)s)")
+parser.add_argument(
+    "--full_download",
+    default="True",
+    type=distutils.util.strtobool,
+    help="Download all datasets for LibriSpeech."
+    " If False, only download a minimal requirement (test-clean, dev-clean,"
+    " train-clean-100). (default: %(default)s)")
 args = parser.parse_args()


@@ -57,7 +67,10 @@ def download(url, md5sum, target_dir):
         print("Downloading %s ..." % url)
         wget.download(url, target_dir)
         print("\nMD5 Chesksum %s ..." % filepath)
-        assert md5file(filepath) == md5sum, "MD5 checksum failed."
+        if not md5file(filepath) == md5sum:
+            raise RuntimeError("MD5 checksum failed.")
+    else:
+        print("File exists, skip downloading. (%s)" % filepath)
     return filepath


@@ -69,21 +82,17 @@ def unpack(filepath, target_dir):
     tar = tarfile.open(filepath)
     tar.extractall(target_dir)
     tar.close()
-    return target_dir


 def create_manifest(data_dir, manifest_path):
     """
-    Create a manifest file summarizing the dataset (list of filepath and meta
-    data).
-
-    Each line of the manifest contains one audio clip filepath, its
-    transcription text string, and its duration. Manifest file servers as a
-    unified interfance to organize data sets.
+    Create a manifest json file summarizing the data set, with each line
+    containing the meta data (i.e. audio filepath, transcription text, audio
+    duration) of one audio file in the data set.
     """
     print("Creating manifest %s ..." % manifest_path)
     json_lines = []
-    for subfolder, _, filelist in os.walk(data_dir):
+    for subfolder, _, filelist in sorted(os.walk(data_dir)):
         text_filelist = [
             filename for filename in filelist if filename.endswith('trans.txt')
         ]
@@ -111,9 +120,16 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path):
     """
     Download, unpack and create summmary manifest file.
     """
-    filepath = download(url, md5sum, target_dir)
-    unpacked_dir = unpack(filepath, target_dir)
-    create_manifest(unpacked_dir, manifest_path)
+    if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
+        # download
+        filepath = download(url, md5sum, target_dir)
+        # unpack
+        unpack(filepath, target_dir)
+    else:
+        print("Skip downloading and unpacking. Data already exists in %s." %
+              target_dir)
+    # create the manifest json file
+    create_manifest(target_dir, manifest_path)


 def main():
@@ -132,6 +148,27 @@ def main():
             md5sum=MD5_TRAIN_CLEAN_100,
             target_dir=os.path.join(args.target_dir, "train-clean-100"),
             manifest_path=args.manifest_prefix + ".train-clean-100")
+    if args.full_download:
+        prepare_dataset(
+            url=URL_TEST_OTHER,
+            md5sum=MD5_TEST_OTHER,
+            target_dir=os.path.join(args.target_dir, "test-other"),
+            manifest_path=args.manifest_prefix + ".test-other")
+        prepare_dataset(
+            url=URL_DEV_OTHER,
+            md5sum=MD5_DEV_OTHER,
+            target_dir=os.path.join(args.target_dir, "dev-other"),
+            manifest_path=args.manifest_prefix + ".dev-other")
+        prepare_dataset(
+            url=URL_TRAIN_CLEAN_360,
+            md5sum=MD5_TRAIN_CLEAN_360,
+            target_dir=os.path.join(args.target_dir, "train-clean-360"),
+            manifest_path=args.manifest_prefix + ".train-clean-360")
+        prepare_dataset(
+            url=URL_TRAIN_OTHER_500,
+            md5sum=MD5_TRAIN_OTHER_500,
+            target_dir=os.path.join(args.target_dir, "train-other-500"),
+            manifest_path=args.manifest_prefix + ".train-other-500")


 if __name__ == '__main__':
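Reviewer note: concretely, each manifest line is one standalone json object. A hedged example of reading one — the key names here are assumptions based on the fields described above; librispeech.py defines the actual ones:

```python
import json

# One line of a manifest file: filepath, transcription, duration.
line = json.dumps({
    "audio_filepath": "LibriSpeech/dev-clean/777/126732/777-126732-0068.flac",
    "text": "the quick brown fox jumps over the lazy dog",
    "duration": 3.27,
})
entry = json.loads(line)  # every line parses independently
print(entry["duration"])
```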
""" + # initialize data generator - data_generator = DataGenerator( - vocab_filepath=args.vocab_filepath, - normalizer_manifest_path=args.normalizer_manifest_path, - normalizer_num_samples=200, - max_duration=20.0, - min_duration=0.0, - stride_ms=10, - window_ms=20) + def data_generator(): + return DataGenerator( + vocab_filepath=args.vocab_filepath, + normalizer_manifest_path=args.normalizer_manifest_path, + normalizer_num_samples=200, + max_duration=20.0, + min_duration=0.0, + stride_ms=10, + window_ms=20) + train_generator = data_generator() + test_generator = data_generator() # create network config - dict_size = data_generator.vocabulary_size() + dict_size = train_generator.vocabulary_size() + # paddle.data_type.dense_array is used for variable batch input. + # the size 161 * 161 is only an placeholder value and the real shape + # of input batch data will be set at each batch. audio_data = paddle.layer.data( - name="audio_spectrogram", - height=161, - width=2000, - type=paddle.data_type.dense_vector(322000)) + name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161)) text_data = paddle.layer.data( name="transcript_text", type=paddle.data_type.integer_value_sequence(dict_size)) @@ -114,36 +126,30 @@ def train(): rnn_size=args.rnn_layer_size, is_inference=False) - # create parameters and optimizer - parameters = paddle.parameters.create(cost) + # create/load parameters and optimizer + if args.init_model_path is None: + parameters = paddle.parameters.create(cost) + else: + if not os.path.isfile(args.init_model_path): + raise IOError("Invalid model!") + parameters = paddle.parameters.Parameters.from_tar( + gzip.open(args.init_model_path)) optimizer = paddle.optimizer.Adam( learning_rate=args.adam_learning_rate, gradient_clipping_threshold=400) trainer = paddle.trainer.SGD( cost=cost, parameters=parameters, update_equation=optimizer) # prepare data reader - train_batch_reader_sortagrad = data_generator.batch_reader_creator( - manifest_path=args.train_manifest_path, - batch_size=args.batch_size, - padding_to=2000, - flatten=True, - sort_by_duration=True, - shuffle=False) - train_batch_reader_nosortagrad = data_generator.batch_reader_creator( + train_batch_reader = train_generator.batch_reader_creator( manifest_path=args.train_manifest_path, batch_size=args.batch_size, - padding_to=2000, - flatten=True, - sort_by_duration=False, - shuffle=True) - test_batch_reader = data_generator.batch_reader_creator( + sortagrad=True if args.init_model_path is None else False, + batch_shuffle=True) + test_batch_reader = test_generator.batch_reader_creator( manifest_path=args.dev_manifest_path, batch_size=args.batch_size, - padding_to=2000, - flatten=True, - sort_by_duration=False, - shuffle=False) - feeding = data_generator.data_name_feeding() + batch_shuffle=False) + feeding = train_generator.data_name_feeding() # create event handler def event_handler(event): @@ -169,17 +175,8 @@ def train(): time.time() - start_time, event.pass_id, result.cost) # run train - # first pass with sortagrad - if args.use_sortagrad: - trainer.train( - reader=train_batch_reader_sortagrad, - event_handler=event_handler, - num_passes=1, - feeding=feeding) - args.num_passes -= 1 - # other passes without sortagrad trainer.train( - reader=train_batch_reader_nosortagrad, + reader=train_batch_reader, event_handler=event_handler, num_passes=args.num_passes, feeding=feeding) diff --git a/image_classification/caffe2paddle/README.md b/image_classification/caffe2paddle/README.md new file mode 100644 index 
diff --git a/image_classification/caffe2paddle/README.md b/image_classification/caffe2paddle/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c90e000186e974803494cd5d25df1fc71004c37b
--- /dev/null
+++ b/image_classification/caffe2paddle/README.md
@@ -0,0 +1,39 @@
+## Usage
+
+`caffe2paddle.py` provides `ModelConverter`, an interface for converting models trained with Caffe into models usable by PaddlePaddle. It wraps conversion functions for layers commonly used in vision models, such as Convolution and BatchNorm, and can convert common models such as VGG and ResNet. The conversion works as follows: load the model with Caffe's Python API, walk the layers one by one, adapt each layer's parameters to PaddlePaddle according to the layer type, and serialize them (layers with no trainable parameters, such as Pooling, are skipped). The output is a model file that PaddlePaddle's Python API can load directly.
+
+`ModelConverter` can be used as follows:
+
+```python
+# Set the following variables to the corresponding paths and file names
+caffe_model_file = "./ResNet-50-deploy.prototxt"       # path of the Caffe network config file
+caffe_pretrained_file = "./ResNet-50-model.caffemodel" # path of the Caffe model file
+paddle_tar_name = "Paddle_ResNet50.tar.gz"             # file name of the output Paddle model
+
+# Initialize, loading the model from the given files
+converter = ModelConverter(caffe_model_file=caffe_model_file,
+                           caffe_pretrained_file=caffe_pretrained_file,
+                           paddle_tar_name=paddle_tar_name)
+# Run the conversion
+converter.convert()
+```
+
+These steps are already provided in `caffe2paddle.py`; after setting the file-related variables, run `python caffe2paddle.py` to complete the conversion. In addition, to help verify the result, `ModelConverter` wraps `caffe_predict`, an interface for prediction with the Caffe API. Used as shown below, it prints a list of (class id, probability) pairs sorted by class probability:
+
+```python
+# img is the image path; mean_file is the path of the image mean file
+converter.caffe_predict(img="./cat.jpg", mean_file="./imagenet/ilsvrc_2012_mean.npy")
+```
+
+Note that layer parameters are named during conversion. By default this follows PaddlePaddle's own naming rules for layers and parameters: the layer name is built from the default value in `wrap_name_default` plus a per-type call counter, and parameter names use that layer name as a prefix. For example, the bias parameter of the first InnerProduct layer (see the corresponding conversion function below) will be named `___fc_layer_0__.wbias`.
+
+```python
+# Convert the parameters of an InnerProduct layer; the name value is used
+# to build the layer's parameter names.
+# wrap_name_default sets the default name value to fc_layer.
+@wrap_name_default("fc_layer")
+def convert_InnerProduct_layer(self, params, name=None)
+```
+
+Therefore, when verifying and using a converted model, a PaddlePaddle network config written without explicit layer names must follow the same topological order as the Caffe model. This matters especially for branching structures such as ResNet: the branches must appear in the same order in PaddlePaddle as in Caffe, otherwise the parameters cannot be loaded correctly.
+
+If the default naming is not desired and layer names are given explicitly in the PaddlePaddle config, build a `dict` mapping layer names between the Caffe and PaddlePaddle configs and pass it as `name_map` when calling `ModelConverter.convert`. The mapped layer names are then used when naming and saving each layer's parameters, independent of topological order. The `name_map` only needs to cover Convolution, InnerProduct, and BatchNorm layers in the Caffe config: layers such as Pooling have no trainable parameters and need no conversion interface, while Caffe's Scale layers are handled specially because, due to implementation differences between the two frameworks, PaddlePaddle's batch_norm layer is a composite of Caffe's BatchNorm and Scale layers.
diff --git a/image_classification/caffe2paddle/caffe2paddle.py b/image_classification/caffe2paddle/caffe2paddle.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4011f281538d31d076f7f554d1dbb8a2ceb1d5a
--- /dev/null
+++ b/image_classification/caffe2paddle/caffe2paddle.py
@@ -0,0 +1,187 @@
+import os
+import struct
+import gzip
+import tarfile
+import cStringIO
+import numpy as np
+import cv2
+import caffe
+from paddle.proto.ParameterConfig_pb2 import ParameterConfig
+from paddle.trainer_config_helpers.default_decorators import wrap_name_default
+
+
+class ModelConverter(object):
+    def __init__(self, caffe_model_file, caffe_pretrained_file,
+                 paddle_tar_name):
+        self.net = caffe.Net(caffe_model_file, caffe_pretrained_file,
+                             caffe.TEST)
+        self.tar_name = paddle_tar_name
+        self.params = dict()
+        self.pre_layer_name = ""
+        self.pre_layer_type = ""
+
+    def convert(self, name_map=None):
+        layer_dict = self.net.layer_dict
+        for layer_name in layer_dict.keys():
+            layer = layer_dict[layer_name]
+            layer_params = layer.blobs
+            layer_type = layer.type
+            if len(layer_params) > 0:
+                # dispatch to convert_<LayerType>_layer
+                self.pre_layer_name = getattr(
+                    self, "convert_" + layer_type + "_layer")(
+                        layer_params,
+                        name=None
+                        if name_map is None else name_map.get(layer_name))
+            self.pre_layer_type = layer_type
+        with gzip.open(self.tar_name, 'w') as f:
+            self.to_tar(f)
+        return
+
+    def to_tar(self, f):
+        tar = tarfile.TarFile(fileobj=f, mode='w')
+        for param_name in self.params.keys():
+            param_conf, param_data = self.params[param_name]
+
+            # serialized protobuf config for this parameter
+            confStr = param_conf.SerializeToString()
+            tarinfo = tarfile.TarInfo(name="%s.protobuf" % param_name)
+            tarinfo.size = len(confStr)
+            buf = cStringIO.StringIO(confStr)
+            buf.seek(0)
+            tar.addfile(tarinfo, fileobj=buf)
+
+            # raw parameter values
+            buf = cStringIO.StringIO()
+            self.serialize(param_data, buf)
+            tarinfo = tarfile.TarInfo(name=param_name)
+            buf.seek(0)
+            tarinfo.size = len(buf.getvalue())
+            tar.addfile(tarinfo, buf)
+
+    @staticmethod
+    def serialize(data, f):
+        f.write(struct.pack("IIQ", 0, 4, data.size))
+        f.write(data.tobytes())
+
+    @wrap_name_default("conv")
+    def convert_Convolution_layer(self, params, name=None):
+        for i in range(len(params)):
+            data = np.array(params[i].data)
+            if len(params) == 2:
+                suffix = "0" if i == 0 else "bias"
+                file_name = "_%s.w%s" % (name, suffix)
+            else:
+                file_name = "_%s.w%s" % (name, str(i))
+            param_conf = ParameterConfig()
+            param_conf.name = file_name
+            param_conf.size = reduce(lambda a, b: a * b, data.shape)
+            self.params[file_name] = (param_conf, data.flatten())
+
+        return name
+
+    @wrap_name_default("fc_layer")
+    def convert_InnerProduct_layer(self, params, name=None):
+        for i in range(len(params)):
+            data = np.array(params[i].data)
+            if len(params) == 2:
+                suffix = "0" if i == 0 else "bias"
+                file_name = "_%s.w%s" % (name, suffix)
+            else:
+                file_name = "_%s.w%s" % (name, str(i))
+            data = np.transpose(data)
+            param_conf = ParameterConfig()
+            param_conf.name = file_name
+            dims = list(data.shape)
+            if len(dims) < 2:
+                dims.insert(0, 1)
+            param_conf.size = reduce(lambda a, b: a * b, dims)
+            param_conf.dims.extend(dims)
+            self.params[file_name] = (param_conf, data.flatten())
+        return name
+
+    @wrap_name_default("batch_norm")
+    def convert_BatchNorm_layer(self, params, name=None):
+        scale = 1 / np.array(params[-1].data)[0] if np.array(
+            params[-1].data)[0] != 0 else 0
+        for i in range(2):
+            data = np.array(params[i].data) * scale
+            file_name = "_%s.w%s" % (name, str(i + 1))
+            param_conf = ParameterConfig()
+            param_conf.name = file_name
+            dims = list(data.shape)
+            assert len(dims) == 1
+            dims.insert(0, 1)
+            param_conf.size = reduce(lambda a, b: a * b, dims)
+            param_conf.dims.extend(dims)
+            self.params[file_name] = (param_conf, data.flatten())
+        return name
+
+    def convert_Scale_layer(self, params, name=None):
+        # Scale is folded into the preceding BatchNorm (PaddlePaddle's
+        # batch_norm is a composite of Caffe's BatchNorm and Scale).
+        assert self.pre_layer_type == "BatchNorm"
+        name = self.pre_layer_name
+        for i in range(len(params)):
+            data = np.array(params[i].data)
+            suffix = "0" if i == 0 else "bias"
+            file_name = "_%s.w%s" % (name, suffix)
+            param_conf = ParameterConfig()
+            param_conf.name = file_name
+            dims = list(data.shape)
+            assert len(dims) == 1
+            dims.insert(0, 1)
+            param_conf.size = reduce(lambda a, b: a * b, dims)
+            if i == 1:
+                param_conf.dims.extend(dims)
+            self.params[file_name] = (param_conf, data.flatten())
+        return name
+
+    def caffe_predict(self,
+                      img,
+                      mean_file='./caffe/imagenet/ilsvrc_2012_mean.npy'):
+        net = self.net
+
+        net.blobs['data'].data[...] = load_image(img, mean_file=mean_file)
+        out = net.forward()
+
+        output_prob = net.blobs['prob'].data[0].flatten()
+        print zip(np.argsort(output_prob)[::-1], np.sort(output_prob)[::-1])
+
+
+def load_image(file, resize_size=256, crop_size=224, mean_file=None):
+    # load image
+    im = cv2.imread(file)
+    # resize so that the short side becomes resize_size
+    h, w = im.shape[:2]
+    h_new, w_new = resize_size, resize_size
+    if h > w:
+        h_new = resize_size * h / w
+    else:
+        w_new = resize_size * w / h
+    im = cv2.resize(im, (w_new, h_new), interpolation=cv2.INTER_CUBIC)  # cv2 expects (width, height)
+    # center crop
+    h, w = im.shape[:2]
+    h_start = (h - crop_size) / 2
+    w_start = (w - crop_size) / 2
+    h_end, w_end = h_start + crop_size, w_start + crop_size
+    im = im[h_start:h_end, w_start:w_end, :]
+    # transpose to CHW order
+    im = im.transpose((2, 0, 1))
+
+    if mean_file:
+        mu = np.load(mean_file)
+        mu = mu.mean(1).mean(1)
+        im = im - mu[:, None, None]
+    im = im / 255.0
+    return im
+
+
+if __name__ == "__main__":
+    caffe_model_file = "./ResNet-50-deploy.prototxt"
+    caffe_pretrained_file = "./ResNet-50-model.caffemodel"
+    paddle_tar_name = "Paddle_ResNet50.tar.gz"
+
+    converter = ModelConverter(
+        caffe_model_file=caffe_model_file,
+        caffe_pretrained_file=caffe_pretrained_file,
+        paddle_tar_name=paddle_tar_name)
+    converter.convert()
+
+    converter.caffe_predict("./cat.jpg",
+                            "./caffe/imagenet/ilsvrc_2012_mean.npy")
diff --git a/ltr/index.html b/ltr/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..cce283e33d1865f862578ae17e1dc54a54e9ba72
--- /dev/null
+++ b/ltr/index.html
@@ -0,0 +1,418 @@
+[Generated HTML wrapper around ltr/README.md; markup elided]
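Reviewer note: a quick, hedged sanity check for the converter above — the produced archive should load back with the same gzip + `Parameters.from_tar` pattern the training scripts in this repo use (assuming the v2 `Parameters` object exposes `names()`; `Paddle_ResNet50.tar.gz` is the output name from the README example):

```python
import gzip
import paddle.v2 as paddle

with gzip.open("Paddle_ResNet50.tar.gz") as f:
    parameters = paddle.parameters.Parameters.from_tar(f)
# parameter names should follow the naming rules described in the README,
# e.g. ___fc_layer_0__.wbias for the first InnerProduct layer's bias
print(parameters.names())
```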
diff --git a/nce_cost/README.md b/nce_cost/README.md
index a0990367ef8b03c70c29d285e22ef85907e1d0b7..fce8bdaf80501e5bed650e93efc6c438284031c9 100644
--- a/nce_cost/README.md
+++ b/nce_cost/README.md
@@ -1 +1,115 @@
-TBD
+# Accelerating word embedding training with noise-contrastive estimation
+## Background
+In natural language processing, a word is usually represented by a feature vector, but representing its semantics accurately is hard; see the [word embedding chapter](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.cn.md) for details, where a Neural Probabilistic Language Model (NPLM) is trained to obtain word embeddings. Although NPLM is very accurate, training it takes far longer than a traditional N-gram statistical model \[[3](#references)\]. Two algorithms are commonly used to address this: hierarchical-sigmoid \[[2](#references)\] and noise-contrastive estimation (NCE) \[[1](#references)\]. To overcome the problem, this article adopts NCE, using NPLM training as the running example of how to apply it.
+
+## NCE overview
+NCE is a fast method for estimating a discrete distribution. Applied to our problem: training an NPLM is expensive because the softmax needs the exponential term of every class, i.e. every word in the dictionary, and dictionaries over ordinary corpora are very large \[[3](#references)\], so the whole training process is slow. Compared with the common hierarchical-sigmoid method \[[2](#references)\], NCE does not build a complex binary tree for the objective function; it uses relatively simple random negative sampling instead, which greatly improves efficiency.
+
+Suppose a specific context $h$ is given and its data distribution is $P^h(w)$; samples drawn from it serve as positive examples, while samples drawn from a noise distribution $P_n(w)$ serve as negative examples. Any suitable noise distribution can be chosen; the default is an unbiased uniform distribution. Assuming k noise samples per data sample, the probability that an example came from the data is \[[1](#references)\]:
+
+$$P^h(D=1|w,\theta)=\frac { P_\theta^h(w) }{ P^h_\theta(w)+kP_n(w) } =\sigma (\Delta s_\theta(w,h))$$
+
+where $\Delta s_\theta(w,h)=s_\theta(w,h)-\log (kP_n(w))$, and $s_\theta(w,h)$ is the score of generating word $w$ in context $h$. The overall objective raises the probability of positive examples while lowering that of negative ones [[1](#references)]:
+
+$$
+J^h(\theta )=E_{ P_d^h }\left[ \log { P^h(D=1|w,\theta ) } \right] +kE_{ P_n }\left[ \log P^h (D=0|w,\theta ) \right]$$
+$$ \\\\\qquad =E_{ P_d^h }\left[ \log { \sigma (\Delta s_\theta(w,h)) } \right] +kE_{ P_n }\left[ \log (1-\sigma (\Delta s_\theta(w,h))) \right]$$
+
+In short, NCE sets up a logistic regression that classifies positive against negative examples: for each sample, its own target word label is the positive example and $k$ other sampled word labels are the negative examples, so only the probabilities over these $k+1$ labels need to be computed. Compared with the original softmax, which scores every class and then normalizes, this saves a large amount of computation.
+
+## Data
+This article trains the language model on the Penn Treebank (PTB) dataset ([Tomas Mikolov's preprocessed version](http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz)). PaddlePaddle provides the [paddle.dataset.imikolov](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/imikolov.py) interface for convenient access: if the data is not found locally it is downloaded automatically and its integrity verified, and a sliding window of size 5 is applied as preprocessing. The corpus is English, with 42068 training sentences and 3761 test sentences.
+
+## Network structure
+Figure 1 shows the detailed network structure of the N-gram neural probabilistic language model:
+
+<p align="center">
+<img src="images/network_conf.png">
+</p>
+<p align="center">Figure 1. Network configuration</p>
+
+The model consists of the following parts:
+
+1. **Input layer**: the input PTB samples are raw English words; each word is mapped to its dictionary id, which uniquely identifies it.
+
+2. **Word embedding layer**: compared with the raw id, an embedding better captures semantic relations between words. A trainable embedding matrix maps each id to a fixed-dimension word vector. After training, semantic similarity between words can be measured by the distance between their embeddings: the more similar, the closer.
+
+3. **Concatenation layer**: the word vectors are concatenated end to end into one long vector, which is convenient for the fully connected layer that follows.
+
+4. **Fully connected hidden layer**: the long vector is fed through one hidden layer, producing a feature vector. The fully connected hidden layer strengthens the model's capacity to learn.
+
+5. **NCE layer**: during training, PaddlePaddle's built-in NCE Layer can be used directly.
+
+
+## Training
+Run ``` python train.py ``` to train. On first run, the program checks the user's cache directory for the PTB dataset and downloads it automatically if missing. During training, model statistics, mainly the training loss, are printed every 1000 iterations; after each pass, the loss on the test set is computed and the latest model snapshot is saved. PaddlePaddle ships a ready-made NCE Layer, but several parameters must be chosen for the scenario at hand; a reference configuration follows:
+
+| Parameter | Role | Notes |
+|:------ |:-------| :--------|
+| param\_attr / bias\_attr | Sets the parameter names. | Named parameters make it easy to share the network's parameters at prediction time; details are given in the next section. |
+| num\_neg\_samples | Controls how many negative examples are sampled. | Controls the positive/negative ratio; valid values lie in [1, dictionary size - 1]. More negatives slow training down but raise model accuracy. |
+| neg\_distribution | Distribution used to sample negative labels; uniform by default. | Lets you weight classes when sampling negatives. For example, if the positive example is "sunny" and you want "flood" to be distinguished more sharply during training, raise the sampling weight of the "flood" class. |
+| act | The activation function to use. | By the derivation of NCE, this should be the sigmoid function. |
+
+The corresponding code:
+
+```python
+cost = paddle.layer.nce(
+    input=hidden_layer,
+    label=next_word,
+    num_classes=dict_size,
+    param_attr=paddle.attr.Param(name='nce_w'),
+    bias_attr=paddle.attr.Param(name='nce_b'),
+    act=paddle.activation.Sigmoid(),
+    num_neg_samples=25,
+    neg_distribution=None)
+```
+
+
+## Prediction
+Run ` python infer.py ` to predict. The program loads the most recent model, predicts batch by batch, and prints the results. Because the computation differs between training and prediction, the prediction stage must share the logistic-regression parameters learned inside the NCE Layer during training, so an inference layer is written whose parameters are the pretrained ones.
+
+Concretely, the inference layer obtains the parameter values via `paddle.attr.Param`, then uses `paddle.layer.trans_full_matrix_projection` to right-multiply the hidden output vector `hidden_layer` by that matrix; PaddlePaddle looks up parameters with the same name in the model and reuses them. The product is the class score vector, which is fed through a softmax to normalize it (summing to 1), giving the final class probability distribution:
+
+```python
+with paddle.layer.mixed(
+        size=dict_size,
+        act=paddle.activation.Softmax(),
+        bias_attr=paddle.attr.Param(name='nce_b')) as prediction:
+    prediction += paddle.layer.trans_full_matrix_projection(
+        input=hidden_layer, param_attr=paddle.attr.Param(name='nce_w'))
+```
+
+The prediction output looks like (the `<unk>` tokens, stripped during extraction, are reconstructed here from the sliding-window context):
+
+```
+--------------------------
+No.68 Input: ' <unk> for possible
+Ground Truth Output: <unk>
+Predict Output: <unk>
+
+--------------------------
+No.69 Input: <unk> for possible <unk>
+Ground Truth Output: on
+Predict Output: <unk>
+
+--------------------------
+No.70 Input: for possible <unk> on
+Ground Truth Output: the
+Predict Output: the
+
+```
+
+Each dashed block is one prediction: the first line shows the index of the test sample and its four input words, the second line the ground-truth label, and the third line the predicted label.
+
+## References
+1. Mnih A, Kavukcuoglu K. [Learning word embeddings efficiently with noise-contrastive estimation](https://papers.nips.cc/paper/5165-learning-word-embeddings-efficiently-with-noise-contrastive-estimation.pdf)[C]//Advances in Neural Information Processing Systems. 2013: 2265-2273.
+
+2. Morin F, Bengio Y. [Hierarchical Probabilistic Neural Network Language Model](http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf)[C]//AISTATS. 2005, 5: 246-252.
+
+3. Mnih A, Teh Y W. [A Fast and Simple Algorithm for Training Neural Probabilistic Language Models](http://arxiv.org/abs/1206.6426)[J]. Computer Science, 2012: 1751-1758.
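Reviewer note: the NCE posterior above is easy to check numerically. A small sketch with assumed values (k = 25 matches `num_neg_samples` in the config; the score s is made up):

```python
import math

vocab_size = 10000           # |V|; PTB's dictionary is on this order
k = 25                       # noise samples per data sample (num_neg_samples)
p_n = 1.0 / vocab_size       # uniform noise distribution P_n(w)
s = 2.0                      # assumed model score s_theta(w, h)

# Delta s = s - log(k * P_n(w));  P(D=1|w,h) = sigmoid(Delta s)
delta_s = s - math.log(k * p_n)
p_data = 1.0 / (1.0 + math.exp(-delta_s))
print("P(D=1|w,h) = %.4f" % p_data)  # ~0.9997 for these values
```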
diff --git a/nce_cost/images/network_conf.png b/nce_cost/images/network_conf.png
new file mode 100644
index 0000000000000000000000000000000000000000..749f8a365db1e1c18d829a460de7c45b27892d19
Binary files /dev/null and b/nce_cost/images/network_conf.png differ
diff --git a/nce_cost/infer.py b/nce_cost/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..53e3aef45fc02ac008caa7102836ac47915be1fc
--- /dev/null
+++ b/nce_cost/infer.py
@@ -0,0 +1,70 @@
+# -*- encoding:utf-8 -*-
+import numpy as np
+import glob
+import gzip
+import paddle.v2 as paddle
+from nce_conf import network_conf
+
+
+def main():
+    paddle.init(use_gpu=False, trainer_count=1)
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict)
+
+    prediction_layer = network_conf(
+        is_train=False,
+        hidden_size=128,
+        embedding_size=512,
+        dict_size=dict_size)
+
+    # load the most recent model snapshot
+    models_list = sorted(glob.glob('./models/*'))
+    with gzip.open(models_list[-1], 'r') as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+
+    idx_word_dict = dict((v, k) for k, v in word_dict.items())
+
+    infer_data = []
+    infer_data_label = []
+    for item in paddle.dataset.imikolov.test(word_dict, 5)():
+        infer_data.append((item[:4]))
+        infer_data_label.append(item[4])
+        # Choose 100 samples from the test set to show how to infer.
+        if len(infer_data_label) == 100:
+            break
+
+    feeding = {
+        'firstw': 0,
+        'secondw': 1,
+        'thirdw': 2,
+        'fourthw': 3,
+        'fifthw': 4
+    }
+
+    predictions = paddle.infer(
+        output_layer=prediction_layer,
+        parameters=parameters,
+        input=infer_data,
+        feeding=feeding,
+        field=['value'])
+
+    for i, (prob, data, label) in enumerate(
+            zip(predictions, infer_data, infer_data_label)):
+        print '--------------------------'
+        print "No.%d Input: " % (i + 1) + \
+            idx_word_dict[data[0]] + ' ' + \
+            idx_word_dict[data[1]] + ' ' + \
+            idx_word_dict[data[2]] + ' ' + \
+            idx_word_dict[data[3]]
+        print 'Ground Truth Output: ' + idx_word_dict[label]
+        print 'Predict Output: ' + idx_word_dict[prob.argsort(
+            kind='heapsort', axis=0)[-1]]
+        print
+
+
+if __name__ == '__main__':
+    main()
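Reviewer note: how the `Predict Output` line in infer.py picks its word — argsort ascending, then take the last index. A standalone check with toy probabilities:

```python
import numpy as np

prob = np.array([0.05, 0.7, 0.05, 0.2])
best = prob.argsort(kind='heapsort', axis=0)[-1]  # index of the maximum
assert best == 1
print(best, prob[best])  # 1 0.7
```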
diff --git a/nce_cost/nce_conf.py b/nce_cost/nce_conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..962a9ccc80906bc2272245d0e297142397ffb024
--- /dev/null
+++ b/nce_cost/nce_conf.py
@@ -0,0 +1,61 @@
+# -*- encoding:utf-8 -*-
+import math
+import paddle.v2 as paddle
+
+
+def network_conf(hidden_size, embedding_size, dict_size, is_train):
+
+    first_word = paddle.layer.data(
+        name="firstw", type=paddle.data_type.integer_value(dict_size))
+    second_word = paddle.layer.data(
+        name="secondw", type=paddle.data_type.integer_value(dict_size))
+    third_word = paddle.layer.data(
+        name="thirdw", type=paddle.data_type.integer_value(dict_size))
+    fourth_word = paddle.layer.data(
+        name="fourthw", type=paddle.data_type.integer_value(dict_size))
+    next_word = paddle.layer.data(
+        name="fifthw", type=paddle.data_type.integer_value(dict_size))
+
+    # all four context words share one embedding matrix, "_proj"
+    embed_param_attr = paddle.attr.Param(
+        name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
+    first_embedding = paddle.layer.embedding(
+        input=first_word, size=embedding_size, param_attr=embed_param_attr)
+    second_embedding = paddle.layer.embedding(
+        input=second_word, size=embedding_size, param_attr=embed_param_attr)
+    third_embedding = paddle.layer.embedding(
+        input=third_word, size=embedding_size, param_attr=embed_param_attr)
+    fourth_embedding = paddle.layer.embedding(
+        input=fourth_word, size=embedding_size, param_attr=embed_param_attr)
+
+    context_embedding = paddle.layer.concat(input=[
+        first_embedding, second_embedding, third_embedding, fourth_embedding
+    ])
+
+    hidden_layer = paddle.layer.fc(
+        input=context_embedding,
+        size=hidden_size,
+        act=paddle.activation.Tanh(),
+        bias_attr=paddle.attr.Param(learning_rate=1),
+        param_attr=paddle.attr.Param(
+            initial_std=1. / math.sqrt(embedding_size * 8), learning_rate=1))
+
+    if is_train:
+        cost = paddle.layer.nce(
+            input=hidden_layer,
+            label=next_word,
+            num_classes=dict_size,
+            param_attr=paddle.attr.Param(name='nce_w'),
+            bias_attr=paddle.attr.Param(name='nce_b'),
+            act=paddle.activation.Sigmoid(),
+            num_neg_samples=25,
+            neg_distribution=None)
+        return cost
+    else:
+        # share nce_w / nce_b with the training network by parameter name
+        with paddle.layer.mixed(
+                size=dict_size,
+                act=paddle.activation.Softmax(),
+                bias_attr=paddle.attr.Param(name='nce_b')) as prediction:
+            prediction += paddle.layer.trans_full_matrix_projection(
+                input=hidden_layer, param_attr=paddle.attr.Param(name='nce_w'))
+
+        return prediction
diff --git a/nce_cost/train.py b/nce_cost/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8b437c1dd9bfc89fd03598b9a4201693c3074d7
--- /dev/null
+++ b/nce_cost/train.py
@@ -0,0 +1,52 @@
+# -*- encoding:utf-8 -*-
+import paddle.v2 as paddle
+import gzip
+
+from nce_conf import network_conf
+
+
+def main():
+    paddle.init(use_gpu=False, trainer_count=1)
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict)
+
+    cost = network_conf(
+        is_train=True, hidden_size=128, embedding_size=512, dict_size=dict_size)
+
+    parameters = paddle.parameters.create(cost)
+    adam_optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
+    trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer)
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 1000 == 0:
+                print "Pass %d, Batch %d, Cost %f" % (
+                    event.pass_id, event.batch_id, event.cost)
+
+        if isinstance(event, paddle.event.EndPass):
+            result = trainer.test(
+                paddle.batch(paddle.dataset.imikolov.test(word_dict, 5), 64))
+            print "Test Pass %d, Cost %f" % (event.pass_id, result.cost)
+
+            model_name = "./models/model_pass_%05d.tar.gz" % event.pass_id
+            print "Save model into %s ..." % model_name
+            with gzip.open(model_name, 'w') as f:
+                parameters.to_tar(f)
+
+    feeding = {
+        'firstw': 0,
+        'secondw': 1,
+        'thirdw': 2,
+        'fourthw': 3,
+        'fifthw': 4
+    }
+
+    trainer.train(
+        paddle.batch(paddle.dataset.imikolov.train(word_dict, 5), 64),
+        num_passes=1000,
+        event_handler=event_handler,
+        feeding=feeding)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/nmt_without_attention/README.md b/nmt_without_attention/README.md
index 38361bbfbc3e029de872eba967a17453c5e7dac1..a54b715102574dae1b619997a1ed7a2bfc14131c 100644
--- a/nmt_without_attention/README.md
+++ b/nmt_without_attention/README.md
@@ -91,11 +91,11 @@ PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08
 ```python
 #### Decoder
 encoder_last = paddle.layer.last_seq(input=encoded_vector)
-with paddle.layer.mixed(
+encoder_last_projected = paddle.layer.mixed(
     size=decoder_size,
-        act=paddle.activation.Tanh()) as encoder_last_projected:
-    encoder_last_projected += paddle.layer.full_matrix_projection(
-        input=encoder_last)
+    act=paddle.activation.Tanh(),
+    input=paddle.layer.full_matrix_projection(input=encoder_last))
+
 # gru step
 def gru_decoder_without_attention(enc_vec, current_word):
     '''
@@ -112,10 +112,12 @@ def gru_decoder_without_attention(enc_vec, current_word):

     context = paddle.layer.last_seq(input=enc_vec)

-    with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-        decoder_inputs += paddle.layer.full_matrix_projection(input=context)
-        decoder_inputs += paddle.layer.full_matrix_projection(
-            input=current_word)
+    decoder_inputs = paddle.layer.mixed(
+        size=decoder_size * 3,
+        input=[
+            paddle.layer.full_matrix_projection(input=context),
+            paddle.layer.full_matrix_projection(input=current_word)
+        ])

     gru_step = paddle.layer.gru_step(
         name='gru_decoder',
@@ -125,24 +127,24 @@ def gru_decoder_without_attention(enc_vec, current_word):
         output_mem=decoder_mem,
         size=decoder_size)

-    with paddle.layer.mixed(
-            size=target_dict_dim,
-            bias_attr=True,
-            act=paddle.activation.Softmax()) as out:
-        out += paddle.layer.full_matrix_projection(input=gru_step)
+    out = paddle.layer.mixed(
+        size=target_dict_dim,
+        bias_attr=True,
+        act=paddle.activation.Softmax(),
+        input=paddle.layer.full_matrix_projection(input=gru_step))
     return out
 ```

 The decoder behaves very differently during training and during testing:

 - **Training**: the word embeddings of the target translation, `trg_embedding`, are passed as parameters to the step function `gru_decoder_without_attention()`; `recurrent_group()` calls the step function in a loop, and finally the cost between the target translation and the actual decoding is computed and returned;
-- **Testing**: the decoder predicts the next word from the last generated word; `GeneratedInputV2()` automatically fetches the embeddings of the $k$ words the model predicts with the highest probability and passes them to the step function, and `beam_search()` calls the step function `gru_decoder_without_attention()` to carry out beam search and returns it as the result.
+- **Testing**: the decoder predicts the next word from the last generated word; `GeneratedInput()` automatically fetches the embeddings of the $k$ words the model predicts with the highest probability and passes them to the step function, and `beam_search()` calls the step function `gru_decoder_without_attention()` to carry out beam search and returns it as the result.

 The training and generation logic are implemented in the following `if-else` branches:

 ```python
 decoder_group_name = "decoder_group"
-group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
+group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
 group_inputs = [group_input1]

 if not generating:
     trg_embedding = paddle.layer.embedding(
@@ -166,7 +168,7 @@ if not generating:

     return cost
 else:
-    trg_embedding = paddle.layer.GeneratedInputV2(
+    trg_embedding = paddle.layer.GeneratedInput(
         size=target_dict_dim,
         embedding_name='_target_language_embedding',
         embedding_size=word_vector_dim)
diff --git a/nmt_without_attention/index.html b/nmt_without_attention/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..35177ee5a679fe4f826dfd219721ef2e36b7df83
--- /dev/null
+++ b/nmt_without_attention/index.html
@@ -0,0 +1,419 @@
+[Generated HTML wrapper around nmt_without_attention/README.md; markup elided]
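Reviewer note: the generation path that main() below selects can be sketched end to end. This is hedged — the model path matches the one in main(), the `paddle.infer` fields follow the repo's other examples, and the source word ids are hypothetical:

```python
import gzip
import paddle.v2 as paddle
from nmt_without_attention import seq2seq_net

paddle.init(use_gpu=False, trainer_count=1)
# build the decoder in generating mode: beam search instead of a cost layer
beam_gen = seq2seq_net(30000, 30000, True)
with gzip.open('models/nmt_without_att_params_batch_1800.tar.gz') as f:
    parameters = paddle.parameters.Parameters.from_tar(f)

test_batch = [[(0, 12, 7, 1)]]  # hypothetical source word ids
# prob is the prediction probabilities, and id is the predicted word ids
beam_result = paddle.infer(
    output_layer=beam_gen,
    parameters=parameters,
    input=test_batch,
    field=['prob', 'id'])
```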
diff --git a/nmt_without_attention/nmt_without_attention.py b/nmt_without_attention/nmt_without_attention.py
index e5a4e1b602226da802c5903d83c0d963ae37bd44..5a61b525e67f7d07f66ae8cc5064c0244bc0b6f3 100644
--- a/nmt_without_attention/nmt_without_attention.py
+++ b/nmt_without_attention/nmt_without_attention.py
@@ -16,7 +16,7 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
     '''
     Define the network structure of NMT, including encoder and decoder.

-    :param source_dict_dim: size of source dictionary 
+    :param source_dict_dim: size of source dictionary
     :type source_dict_dim : int
     :param target_dict_dim: size of target dictionary
     :type target_dict_dim: int
@@ -41,11 +41,11 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
         return_seq=True)
     #### Decoder
     encoder_last = paddle.layer.last_seq(input=encoded_vector)
-    with paddle.layer.mixed(
-            size=decoder_size,
-            act=paddle.activation.Tanh()) as encoder_last_projected:
-        encoder_last_projected += paddle.layer.full_matrix_projection(
-            input=encoder_last)
+    encoder_last_projected = paddle.layer.mixed(
+        size=decoder_size,
+        act=paddle.activation.Tanh(),
+        input=paddle.layer.full_matrix_projection(input=encoder_last))
+
     # gru step
     def gru_decoder_without_attention(enc_vec, current_word):
         '''
@@ -63,10 +63,12 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):

         context = paddle.layer.last_seq(input=enc_vec)

-        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=current_word)
+        decoder_inputs = paddle.layer.mixed(
+            size=decoder_size * 3,
+            input=[
+                paddle.layer.full_matrix_projection(input=context),
+                paddle.layer.full_matrix_projection(input=current_word)
+            ])

         gru_step = paddle.layer.gru_step(
             name='gru_decoder',
@@ -76,15 +78,15 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):
             output_mem=decoder_mem,
             size=decoder_size)

-        with paddle.layer.mixed(
-                size=target_dict_dim,
-                bias_attr=True,
-                act=paddle.activation.Softmax()) as out:
-            out += paddle.layer.full_matrix_projection(input=gru_step)
+        out = paddle.layer.mixed(
+            size=target_dict_dim,
+            bias_attr=True,
+            act=paddle.activation.Softmax(),
+            input=paddle.layer.full_matrix_projection(input=gru_step))
         return out

     decoder_group_name = "decoder_group"
-    group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
     group_inputs = [group_input1]

     if not generating:
@@ -109,7 +111,7 @@ def seq2seq_net(source_dict_dim, target_dict_dim, generating=False):

         return cost
     else:
-        trg_embedding = paddle.layer.GeneratedInputV2(
+        trg_embedding = paddle.layer.GeneratedInput(
             size=target_dict_dim,
             embedding_name='_target_language_embedding',
             embedding_size=word_vector_dim)
@@ -194,7 +196,7 @@ def generate(source_dict_dim, target_dict_dim, init_models_path):
     beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, True)
     with gzip.open(init_models_path) as f:
         parameters = paddle.parameters.Parameters.from_tar(f)
-    # prob is the prediction probabilities, and id is the prediction word. 
+    # prob is the prediction probabilities, and id is the prediction word.
     beam_result = paddle.infer(
         output_layer=beam_gen,
         parameters=parameters,
@@ -244,10 +246,10 @@ def main():
     target_language_dict_dim = 30000

     if generating:
-        # shoud pass the right generated model's path here
+        # modify this path to specify a trained model
         init_models_path = 'models/nmt_without_att_params_batch_1800.tar.gz'
         if not os.path.exists(init_models_path):
-            print "Cannot find models for generation"
+            print "Trained model cannot be found."
             exit(1)
         generate(source_language_dict_dim, target_language_dict_dim,
                  init_models_path)
diff --git a/sequence_tagging_for_ner/index.html b/sequence_tagging_for_ner/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..b7c6c8994abdbcd80ff7347960d984e5528311a1
--- /dev/null
+++ b/sequence_tagging_for_ner/index.html
@@ -0,0 +1,314 @@
+[Generated HTML wrapper around sequence_tagging_for_ner/README.md; markup elided]
diff --git a/text_classification/index.html b/text_classification/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..3ee660d8471269bfebf2444fb7c4a97deb550561
--- /dev/null
+++ b/text_classification/index.html
@@ -0,0 +1,302 @@
+[Generated HTML wrapper around text_classification/README.md; markup elided]
diff --git a/word_embedding/index.html b/word_embedding/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..83f6809d669d9ec6e0dd002f414ba8247068e270
--- /dev/null
+++ b/word_embedding/index.html
@@ -0,0 +1,227 @@
+[Generated HTML wrapper around word_embedding/README.md; markup elided]