From b69ef47800b232f3267a73c7c93f57246c202a81 Mon Sep 17 00:00:00 2001
From: sandyhouse
Date: Thu, 27 Feb 2020 20:59:57 +0800
Subject: [PATCH] add advanced_en.md

---
 docs/source/md/advanced_en.md | 264 ++++++++++++++++++++++++++++++++++
 1 file changed, 264 insertions(+)
 create mode 100644 docs/source/md/advanced_en.md

diff --git a/docs/source/md/advanced_en.md b/docs/source/md/advanced_en.md
new file mode 100644
index 0000000..0a2c22b
--- /dev/null
+++ b/docs/source/md/advanced_en.md
@@ -0,0 +1,264 @@
# Advanced Usage

## Checkpoint Uploading and Downloading with HDFS
With PLSC, checkpoints can be uploaded to or downloaded from an HDFS file system automatically once the HDFS information is set via the *set_hdfs_info* API.

### Uploading Checkpoints
The following example code shows how to upload checkpoints to HDFS:
```python
from plsc import Entry

if __name__ == "__main__":
    ins = Entry()
    # Local directory where checkpoints are saved.
    ins.set_model_save_dir('./saved_model')
    # HDFS address, user credentials ("name,passwd"), and the target directory on HDFS.
    ins.set_hdfs_info("your_hdfs_addr", "name,passwd", "some_dir_on_hdfs")
    ins.train()
```

### Downloading Checkpoints
The following code snippet shows how to download checkpoints from HDFS:
```python
from plsc import Entry

if __name__ == "__main__":
    ins = Entry()
    # Local directory the checkpoints will be downloaded to.
    ins.set_checkpoint_dir('./saved_model')
    ins.set_hdfs_info("your_hdfs_addr",
                      "name,passwd",
                      fs_checkpoint_dir="some_dir")
```

With the above code, checkpoints in "some_dir" on HDFS will be downloaded to the local "./saved_model" directory. Please make sure the local directory "./saved_model" exists.

## Pre-processing for images in base64 format
In practice, base64 is a common format for storing images. All image data is stored in a set of data files, and each line of a data file contains one image in base64 format together with its corresponding label.

The following gives an example structure of such a dataset:
```shell script
dataset
    |-- file_list.txt
    |-- dataset.part1
    |-- dataset.part2
    |    ....
    `-- dataset.part10
```

The file file_list.txt lists all data files, one per line, e.g., dataset.part1.

For distributed training, every GPU card has to process the same number of samples, and a global shuffle over all images is usually applied.

The provided pre-processing tool performs a global shuffle over all training images and splits them evenly into groups; the number of groups equals the number of GPU cards used.

### How to use
The pre-processing tool is located in the "tools" directory. It requires the sqlite3 module, which is part of the Python standard library, so no separate installation is normally needed.

Use the following command to show the help message:
```shell script
python tools/process_base64_files.py --help
```

The tool provides the following options:
- data_dir: the root directory of the dataset
- file_list: the file that lists the data files, e.g., file_list.txt
- nranks: the number of output data files (usually the number of GPU cards)

The tool is used as follows:
```shell script
python tools/process_base64_files.py --data_dir=./dataset --file_list=file_list.txt --nranks=8
```

Eight data files will then be generated, each containing the same number of samples.

Note: plsc.utils.base64_reader can be used to read images stored in base64 format.
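For reference, the snippet below sketches how one line of such a base64 data file could be decoded using only the standard library, Pillow, and NumPy. The function name decode_base64_line is hypothetical, and it assumes each line stores the base64-encoded image bytes and the label separated by a tab, which may differ from the exact layout of your files:
```python
import base64
import io

import numpy as np
from PIL import Image


def decode_base64_line(line):
    # Assumed layout: "<base64-encoded image bytes>\t<label>" on each line.
    img_b64, label = line.rstrip('\n').split('\t')
    img_bytes = base64.b64decode(img_b64)
    img = Image.open(io.BytesIO(img_bytes)).convert('RGB')
    # HWC uint8 -> CHW float32, the layout used elsewhere in this document.
    data = np.array(img).astype('float32').transpose((2, 0, 1))
    return data, int(label)
```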
## Mixed Precision Training
PLSC supports mixed precision training, which can be used to improve training speed and reduce memory usage.

### How to use
Use the following code to enable mixed precision training:
```python
from plsc import Entry

def main():
    ins = Entry()
    # Enable mixed precision (FP16/FP32) training.
    ins.set_mixed_precision(True)
    ins.train()

if __name__ == "__main__":
    main()
```

For more information about mixed precision training, please refer to:
- Paper: [Mixed Precision Training](https://arxiv.org/abs/1710.03740)
- NVIDIA guide: [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html)

## User-defined Models
By default, PLSC uses the ResNet50 model; ResNet101 and ResNet152 models are provided as well.

Users can define their own models by subclassing plsc.models.base_model.BaseModel and implementing the *build_network* method.

The following code shows how to define a custom model:
```python
import paddle.fluid as fluid
from plsc import Entry
from plsc.models.base_model import BaseModel

class ResNet(BaseModel):
    def __init__(self, layers=50, emb_dim=512):
        super(ResNet, self).__init__()
        self.layers = layers
        self.emb_dim = emb_dim

    def build_network(self,
                      input,
                      label,
                      is_train):
        layers = self.layers
        supported_layers = [50, 101, 152]
        assert layers in supported_layers, \
            "supported layers {}, but given {}".format(supported_layers, layers)

        if layers == 50:
            depth = [3, 4, 14, 3]
            num_filters = [64, 128, 256, 512]
        elif layers == 101:
            depth = [3, 4, 23, 3]
            num_filters = [256, 512, 1024, 2048]
        elif layers == 152:
            depth = [3, 8, 36, 3]
            num_filters = [256, 512, 1024, 2048]

        conv = self.conv_bn_layer(input=input,
                                  num_filters=64,
                                  filter_size=3,
                                  stride=1,
                                  pad=1,
                                  act='prelu',
                                  is_train=is_train)

        for block in range(len(depth)):
            for i in range(depth[block]):
                conv = self.bottleneck_block(
                    input=conv,
                    num_filters=num_filters[block],
                    stride=2 if i == 0 else 1,
                    is_train=is_train)

        bn = fluid.layers.batch_norm(input=conv,
                                     act=None,
                                     epsilon=2e-05,
                                     is_test=False if is_train else True)
        drop = fluid.layers.dropout(
            x=bn,
            dropout_prob=0.4,
            dropout_implementation='upscale_in_train',
            is_test=False if is_train else True)
        fc = fluid.layers.fc(
            input=drop,
            size=self.emb_dim,
            act=None,
            param_attr=fluid.param_attr.ParamAttr(
                initializer=fluid.initializer.Xavier(uniform=False, fan_in=0.0)),
            bias_attr=fluid.param_attr.ParamAttr(
                initializer=fluid.initializer.ConstantInitializer()))
        emb = fluid.layers.batch_norm(input=fc,
                                      act=None,
                                      epsilon=2e-05,
                                      is_test=False if is_train else True)
        return emb

    def conv_bn_layer(
        ... ...  # definitions of conv_bn_layer and bottleneck_block are omitted here

if __name__ == "__main__":
    ins = Entry()
    # Use the custom model for training.
    ins.set_model(ResNet())
    ins.train()
```
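The ResNet example above omits its helper methods for brevity. To make the required interface explicit, here is a minimal, self-contained sketch of a custom model; the SimpleCNN class and its layer choices are hypothetical and only illustrate that *build_network* receives the input and label variables plus an is_train flag and returns the embedding tensor, mirroring the example above:
```python
import paddle.fluid as fluid
from plsc.models.base_model import BaseModel


class SimpleCNN(BaseModel):
    """A deliberately small, hypothetical model illustrating the build_network contract."""

    def __init__(self, emb_dim=512):
        super(SimpleCNN, self).__init__()
        self.emb_dim = emb_dim

    def build_network(self, input, label, is_train):
        # Two conv/pool stages followed by a fully connected embedding layer.
        conv = fluid.layers.conv2d(input=input, num_filters=32, filter_size=3,
                                   stride=1, padding=1, act='relu')
        pool = fluid.layers.pool2d(input=conv, pool_size=2, pool_stride=2,
                                   pool_type='max')
        conv = fluid.layers.conv2d(input=pool, num_filters=64, filter_size=3,
                                   stride=1, padding=1, act='relu')
        pool = fluid.layers.pool2d(input=conv, pool_size=2, pool_stride=2,
                                   pool_type='max')
        # The returned tensor is used by PLSC as the feature embedding.
        emb = fluid.layers.fc(input=pool, size=self.emb_dim, act=None)
        return emb
```

As with the ResNet example, such a model is plugged in via ins.set_model(SimpleCNN()).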
## How to use custom training data
By default, PLSC assumes the training dataset is organized in the following structure:
```shell script
train_data/
|-- images
`-- label.txt
```
All images are stored in the directory 'images', and the file 'label.txt' indexes them: each line contains the relative path of an image and its corresponding label.

When your dataset is organized in a different structure, use the following steps to plug it in (a full example follows below):
1. Define a generator that pre-processes the images (e.g., resizing) and yields samples one by one using *yield*;
   * A sample is a tuple of (data, label), where data is an image after decoding and pre-processing.
2. Wrap the generator with paddle.batch to obtain a batched reader.
3. Assign the batched reader to the 'train_reader' member of plsc.Entry.

For the directory layout shown above, a generator can be defined as follows:
```python
import os
import random

import numpy as np
from PIL import Image


def arc_train(data_dir):
    # Each line of label.txt: "<relative/path/to/image> <label>"
    label_file = os.path.join(data_dir, 'label.txt')
    with open(label_file, 'r') as f:
        train_image_list = [line.strip().split() for line in f if line.strip()]

    def reader():
        for path, label in train_image_list:
            path = os.path.join(data_dir, path)
            img = Image.open(path)
            # Random horizontal flip for data augmentation.
            if random.randint(0, 1) == 1:
                img = img.transpose(Image.FLIP_LEFT_RIGHT)
            if img.mode != 'RGB':
                img = img.convert('RGB')
            # HWC uint8 -> CHW float32
            img = np.array(img).astype('float32').transpose((2, 0, 1))
            yield img, int(label)

    return reader
```

The following example code shows how to use the custom dataset (the generator above is assumed to be saved as reader.py):
```python
import argparse

import paddle
from plsc import Entry

import reader

parser = argparse.ArgumentParser()
parser.add_argument("--data_dir",
                    type=str,
                    default="./data",
                    help="Directory for datasets.")
args = parser.parse_args()


def main():
    global args
    ins = Entry()
    ins.set_dataset_dir(args.data_dir)
    train_reader = reader.arc_train(args.data_dir)
    # Batch the samples yielded by the generator above.
    batched_train_reader = paddle.batch(train_reader,
                                        ins.train_batch_size)
    # Use the batched reader during training.
    ins.train_reader = batched_train_reader

    ins.train()


if __name__ == "__main__":
    main()
```

For more examples, please refer to [example](../../../demo/custom_reader.py).
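Finally, the features described above can be combined in a single training script. The sketch below uses only APIs already shown in this document; the HDFS address, credentials, directory names, and data paths are placeholders, and the custom reader is assumed to live in reader.py as above:
```python
import paddle
from plsc import Entry

import reader  # the module that defines arc_train() above


def main():
    ins = Entry()
    ins.set_model_save_dir('./saved_model')
    # Upload saved checkpoints to HDFS, as described in the first section.
    ins.set_hdfs_info("your_hdfs_addr", "name,passwd", "some_dir_on_hdfs")
    # Enable mixed precision training.
    ins.set_mixed_precision(True)
    # Use the custom reader defined above.
    ins.set_dataset_dir("./data")
    ins.train_reader = paddle.batch(reader.arc_train("./data"),
                                    ins.train_batch_size)
    ins.train()


if __name__ == "__main__":
    main()
```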