diff --git a/doc/fluid/howto/cluster/fluid_recordio.md b/doc/fluid/howto/cluster/fluid_recordio.md new file mode 100644 index 0000000000000000000000000000000000000000..55ce63ec193948424cd0b87f13d56b9cf6154dfc --- /dev/null +++ b/doc/fluid/howto/cluster/fluid_recordio.md @@ -0,0 +1,127 @@ +# How to use RecordIO in Fluid + +If you want to use RecordIO as your training data format, you need to convert to your training data +to RecordIO files and reading them in the process of training, PaddlePaddle Fluid provides some +interface to deal with the RecordIO files. + +## Generate RecordIO File + +Before start training with RecordIO files, you need to convert your training data +to RecordIO format by `fluid.recordio_writer.convert_reader_to_recordio_file`, the sample codes +as follows: + +```python + reader = paddle.batch(mnist.train(), batch_size=1) + feeder = fluid.DataFeeder( + feed_list=[ # order is image and label + fluid.layers.data( + name='image', shape=[784]), + fluid.layers.data( + name='label', shape=[1], dtype='int64'), + ], + place=fluid.CPUPlace()) + fluid.recordio_writer.convert_reader_to_recordio_file('./mnist.recordio', reader, feeder) +``` + +The above code snippet would generate a RecordIO `./mnist.recordio` on your host. + +**NOTE**: we recommend users to set `batch_size=1` when generating the recordio files so that users can +adjust it flexibly while reading it. + +## Use the RecordIO file in a Local Training Job + +PaddlePaddle Fluid provides an interface `fluid.layers.io.open_recordio_file` to load your RecordIO file +and then you can use them as a Layer in your network configuration, the sample codes as follows: + +```python + data_file = fluid.layers.io.open_recordio_file( + filename="./mnist.recordio", + shapes=[(-1, 784),(-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int32"]) + data_file = fluid.layers.io.batch(data_file, batch_size=4) + + img, label = fluid.layers.io.read_file(data_file) + hidden = fluid.layers.fc(input=img, size=100, act='tanh') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + + fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_loss) + + place = fluid.CPUPlace() + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + avg_loss_np = [] + + # train a pass + batch_id = 0 + while True: + tmp, = exe.run(fetch_list=[avg_loss]) + + avg_loss_np.append(tmp) + print(batch_id) + batch_id += 1 +``` + +## Use the RecordIO files in Distributed Training + +1. generate multiple RecordIO files + +For a distributed training job, you may have multiple trainer nodes, +and one or more RecordIO files for one trainer node, you can use the interface +`fluid.recordio_writer.convert_reader_to_recordio_files` to convert your training data +into multiple RecordIO files, the sample codes as follows: + +```python + reader = paddle.batch(mnist.train(), batch_size=1) + feeder = fluid.DataFeeder( + feed_list=[ # order is image and label + fluid.layers.data( + name='image', shape=[784]), + fluid.layers.data( + name='label', shape=[1], dtype='int64'), + ], + place=fluid.CPUPlace()) + fluid.recordio_writer.convert_reader_to_recordio_files( + filename_suffix='./mnist.recordio', batch_per_file=100, reader, feeder) +``` + +The above codes would generate multiple RecordIO files on your host like: + +```bash +. + \_mnist-00000.recordio + |-mnist-00001.recordio + |-mnist-00002.recordio + |-mnist-00003.recordio + |-mnist-00004.recordio +``` + +2. open multiple RecordIO files by `fluid.layers.io.open_files` + +For a distributed training job, the distributed operator system will schedule trainer process on multiple nodes, +each trainer process reads parts of the whole training data, we usually take the following approach to make the training +data allocated by each trainer process as uniform as possiable: + +```python +def gen_train_list(file_pattern, trainers, trainer_id): + file_list = glob.glob(file_pattern) + ret_list = [] + for idx, f in enumerate(file_list): + if (idx + trainers) % trainers == trainer_id: + ret_list.append(f) + return ret_list + +trainers = int(os.getenv("TRAINERS")) +trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID")) +data_file = fluid.layers.io.open_files( + filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0), + thread_num=1, + shapes=[(-1, 784),(-1, 1)], + lod_levels=[0, 0], + dtypes=["float32", "int32"]) +img, label = fluid.layers.io.read_file(data_files) +... +``` diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py index 5accaacd5361165d30b92c71ae4fd62e23e44e07..8d48e9abef0fb9861284c6302b30efb0e3994989 100644 --- a/python/paddle/fluid/recordio_writer.py +++ b/python/paddle/fluid/recordio_writer.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import core import contextlib - -__all__ = ['convert_reader_to_recordio_file'] +__all__ = [ + 'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files' +] @contextlib.contextmanager @@ -46,3 +48,36 @@ def convert_reader_to_recordio_file( writer.complete_append_tensor() counter += 1 return counter + + +def convert_reader_to_recordio_files( + filename, + batch_per_file, + reader_creator, + feeder, + compressor=core.RecordIOWriter.Compressor.Snappy, + max_num_records=1000, + feed_order=None): + if feed_order is None: + feed_order = feeder.feed_names + f_name, f_ext = os.path.splitext(filename) + assert (f_ext == ".recordio") + + lines = [] + f_idx = 0 + counter = 0 + for idx, batch in enumerate(reader_creator()): + lines.append(batch) + if idx >= batch_per_file and idx % batch_per_file == 0: + filename = "%s-%05d%s" % (f_name, f_idx, f_ext) + with create_recordio_writer(filename, compressor, + max_num_records) as writer: + for l in lines: + res = feeder.feed(l) + for each in feed_order: + writer.append_tensor(res[each]) + writer.complete_append_tensor() + counter += 1 + lines = [] + f_idx += 1 + return counter diff --git a/tools/codestyle/.gitignore b/tools/codestyle/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..0d20b6487c61e7d1bde93acf4a14b7a89083a16d --- /dev/null +++ b/tools/codestyle/.gitignore @@ -0,0 +1 @@ +*.pyc