未验证 提交 cddc7096 编写于 作者: S ShenLiang 提交者: GitHub

fix InMemoryDataset doc (#28688)

* add Inmemorydataset
上级 bb5f8e35
...@@ -242,12 +242,15 @@ class InMemoryDataset(DatasetBase): ...@@ -242,12 +242,15 @@ class InMemoryDataset(DatasetBase):
""" """
:api_attr: Static Graph :api_attr: Static Graph
InMemoryDataset, it will load data into memory It will load data into memory and shuffle data before training.
and shuffle data before training.
Examples:
.. code-block:: python
Example:
import paddle import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
""" """
def __init__(self): def __init__(self):
...@@ -288,6 +291,7 @@ class InMemoryDataset(DatasetBase): ...@@ -288,6 +291,7 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
dataset.init( dataset.init(
batch_size=1, batch_size=1,
...@@ -329,11 +333,11 @@ class InMemoryDataset(DatasetBase): ...@@ -329,11 +333,11 @@ class InMemoryDataset(DatasetBase):
""" """
:api_attr: Static Graph :api_attr: Static Graph
should be called in user's python scripts to update settings of dataset instance should be called in user's python scripts to update settings of dataset instance.
Args: Args:
kwargs: Keyword arguments. Currently, we support following keys in **kwargs, kwargs: Keyword arguments. Currently, we support following keys in **kwargs,
including single node settings and advanced distributed related settings: including single node settings and advanced distributed related settings:
batch_size(int): batch size. It will be effective during training. default is 1. batch_size(int): batch size. It will be effective during training. default is 1.
thread_num(int): thread num, it is the num of readers. default is 1. thread_num(int): thread num, it is the num of readers. default is 1.
use_var(list): list of variables. Variables which you will use. default is []. use_var(list): list of variables. Variables which you will use. default is [].
...@@ -360,6 +364,8 @@ class InMemoryDataset(DatasetBase): ...@@ -360,6 +364,8 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
dataset.init( dataset.init(
batch_size=1, batch_size=1,
...@@ -409,6 +415,7 @@ class InMemoryDataset(DatasetBase): ...@@ -409,6 +415,7 @@ class InMemoryDataset(DatasetBase):
:api_attr: Static Graph :api_attr: Static Graph
should be called only once in user's python scripts to initialize settings of dataset instance should be called only once in user's python scripts to initialize settings of dataset instance
Args: Args:
kwargs: Keyword arguments. Currently, we support following keys in **kwargs: kwargs: Keyword arguments. Currently, we support following keys in **kwargs:
...@@ -427,23 +434,20 @@ class InMemoryDataset(DatasetBase): ...@@ -427,23 +434,20 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python .. code-block:: python
import paddle import paddle
import os
paddle.enable_static()
with open("test_queue_dataset_run_a.txt", "w") as f: with open("test_queue_dataset_run_a.txt", "w") as f:
data = "2 1 2 2 5 4 2 2 7 2 1 3\n" data = "2 1 2 2 5 4 2 2 7 2 1 3"
data += "2 6 2 2 1 4 2 2 4 2 2 3\n"
data += "2 5 2 2 9 9 2 2 7 2 1 3\n"
data += "2 7 2 2 1 9 2 3 7 2 5 3\n"
f.write(data) f.write(data)
with open("test_queue_dataset_run_b.txt", "w") as f: with open("test_queue_dataset_run_b.txt", "w") as f:
data = "2 1 2 2 5 4 2 2 7 2 1 3\n" data = "2 1 2 2 5 4 2 2 7 2 1 3"
data += "2 6 2 2 1 4 2 2 4 2 2 3\n"
data += "2 5 2 2 9 9 2 2 7 2 1 3\n"
data += "2 7 2 2 1 9 2 3 7 2 5 3\n"
f.write(data) f.write(data)
slots = ["slot1", "slot2", "slot3", "slot4"] slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = [] slots_vars = []
for slot in slots: for slot in slots:
var = fluid.data( var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1) name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var) slots_vars.append(var)
...@@ -458,9 +462,7 @@ class InMemoryDataset(DatasetBase): ...@@ -458,9 +462,7 @@ class InMemoryDataset(DatasetBase):
["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"]) ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
dataset.load_into_memory() dataset.load_into_memory()
paddle.enable_static() place = paddle.CPUPlace()
place = paddle.CUDAPlace(0) if paddle.fluid.core.is_compiled_with_cuda() else paddle.CPUPlace()
exe = paddle.static.Executor(place) exe = paddle.static.Executor(place)
startup_program = paddle.static.Program() startup_program = paddle.static.Program()
main_program = paddle.static.Program() main_program = paddle.static.Program()
...@@ -470,6 +472,7 @@ class InMemoryDataset(DatasetBase): ...@@ -470,6 +472,7 @@ class InMemoryDataset(DatasetBase):
os.remove("./test_queue_dataset_run_a.txt") os.remove("./test_queue_dataset_run_a.txt")
os.remove("./test_queue_dataset_run_b.txt") os.remove("./test_queue_dataset_run_b.txt")
""" """
batch_size = kwargs.get("batch_size", 1) batch_size = kwargs.get("batch_size", 1)
thread_num = kwargs.get("thread_num", 1) thread_num = kwargs.get("thread_num", 1)
...@@ -545,6 +548,7 @@ class InMemoryDataset(DatasetBase): ...@@ -545,6 +548,7 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
dataset._set_queue_num(12) dataset._set_queue_num(12)
...@@ -563,6 +567,7 @@ class InMemoryDataset(DatasetBase): ...@@ -563,6 +567,7 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
dataset._set_parse_ins_id(True) dataset._set_parse_ins_id(True)
...@@ -580,6 +585,7 @@ class InMemoryDataset(DatasetBase): ...@@ -580,6 +585,7 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
dataset._set_parse_content(True) dataset._set_parse_content(True)
...@@ -597,6 +603,7 @@ class InMemoryDataset(DatasetBase): ...@@ -597,6 +603,7 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
dataset._set_fleet_send_batch_size(800) dataset._set_fleet_send_batch_size(800)
...@@ -614,6 +621,7 @@ class InMemoryDataset(DatasetBase): ...@@ -614,6 +621,7 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
dataset._set_fleet_send_sleep_seconds(2) dataset._set_fleet_send_sleep_seconds(2)
...@@ -632,6 +640,7 @@ class InMemoryDataset(DatasetBase): ...@@ -632,6 +640,7 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
dataset._set_merge_by_lineid() dataset._set_merge_by_lineid()
...@@ -660,7 +669,21 @@ class InMemoryDataset(DatasetBase): ...@@ -660,7 +669,21 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset.init(
batch_size=1,
thread_num=2,
input_type=1,
pipe_command="cat",
use_var=slots_vars)
filelist = ["a.txt", "b.txt"] filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist) dataset.set_filelist(filelist)
dataset.load_into_memory() dataset.load_into_memory()
...@@ -681,7 +704,21 @@ class InMemoryDataset(DatasetBase): ...@@ -681,7 +704,21 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset.init(
batch_size=1,
thread_num=2,
input_type=1,
pipe_command="cat",
use_var=slots_vars)
filelist = ["a.txt", "b.txt"] filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist) dataset.set_filelist(filelist)
dataset.preload_into_memory() dataset.preload_into_memory()
...@@ -704,7 +741,21 @@ class InMemoryDataset(DatasetBase): ...@@ -704,7 +741,21 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset.init(
batch_size=1,
thread_num=2,
input_type=1,
pipe_command="cat",
use_var=slots_vars)
filelist = ["a.txt", "b.txt"] filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist) dataset.set_filelist(filelist)
dataset.preload_into_memory() dataset.preload_into_memory()
...@@ -723,7 +774,21 @@ class InMemoryDataset(DatasetBase): ...@@ -723,7 +774,21 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset.init(
batch_size=1,
thread_num=2,
input_type=1,
pipe_command="cat",
use_var=slots_vars)
filelist = ["a.txt", "b.txt"] filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist) dataset.set_filelist(filelist)
dataset.load_into_memory() dataset.load_into_memory()
...@@ -744,12 +809,25 @@ class InMemoryDataset(DatasetBase): ...@@ -744,12 +809,25 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python .. code-block:: python
import paddle import paddle
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset.init(
batch_size=1,
thread_num=2,
input_type=1,
pipe_command="cat",
use_var=slots_vars)
filelist = ["a.txt", "b.txt"] filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist) dataset.set_filelist(filelist)
dataset.load_into_memory() dataset.load_into_memory()
dataset.global_shuffle(fleet) dataset.global_shuffle()
Args: Args:
fleet(Fleet): fleet singleton. Default None. fleet(Fleet): fleet singleton. Default None.
...@@ -788,12 +866,25 @@ class InMemoryDataset(DatasetBase): ...@@ -788,12 +866,25 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python .. code-block:: python
import paddle import paddle
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset.init(
batch_size=1,
thread_num=2,
input_type=1,
pipe_command="cat",
use_var=slots_vars)
filelist = ["a.txt", "b.txt"] filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist) dataset.set_filelist(filelist)
dataset.load_into_memory() dataset.load_into_memory()
dataset.global_shuffle(fleet) dataset.global_shuffle()
exe = paddle.static.Executor(paddle.CPUPlace()) exe = paddle.static.Executor(paddle.CPUPlace())
startup_program = paddle.static.Program() startup_program = paddle.static.Program()
main_program = paddle.static.Program() main_program = paddle.static.Program()
...@@ -824,12 +915,25 @@ class InMemoryDataset(DatasetBase): ...@@ -824,12 +915,25 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python .. code-block:: python
import paddle import paddle
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset.init(
batch_size=1,
thread_num=2,
input_type=1,
pipe_command="cat",
use_var=slots_vars)
filelist = ["a.txt", "b.txt"] filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist) dataset.set_filelist(filelist)
dataset.load_into_memory() dataset.load_into_memory()
print(dataset.get_memory_data_size(fleet)) print(dataset.get_memory_data_size())
""" """
import numpy as np import numpy as np
...@@ -863,13 +967,27 @@ class InMemoryDataset(DatasetBase): ...@@ -863,13 +967,27 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python .. code-block:: python
import paddle import paddle
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset.init(
batch_size=1,
thread_num=2,
input_type=1,
pipe_command="cat",
use_var=slots_vars)
filelist = ["a.txt", "b.txt"] filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist) dataset.set_filelist(filelist)
dataset.load_into_memory() dataset.load_into_memory()
dataset.global_shuffle(fleet) dataset.global_shuffle()
print(dataset.get_shuffle_data_size(fleet)) print(dataset.get_shuffle_data_size())
""" """
import numpy as np import numpy as np
...@@ -897,6 +1015,7 @@ class InMemoryDataset(DatasetBase): ...@@ -897,6 +1015,7 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python .. code-block:: python
import paddle import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
dataset._set_fea_eval(1000000, True) dataset._set_fea_eval(1000000, True)
...@@ -917,11 +1036,29 @@ class InMemoryDataset(DatasetBase): ...@@ -917,11 +1036,29 @@ class InMemoryDataset(DatasetBase):
slots(list[string]): the set of slots(string) to do slots shuffle. slots(list[string]): the set of slots(string) to do slots shuffle.
Examples: Examples:
.. code-block:: python
import paddle import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset() dataset = paddle.distributed.InMemoryDataset()
dataset.set_merge_by_lineid() dataset._init_distributed_settings(fea_eval=True)
#suppose there is a slot 0 slots = ["slot1", "slot2", "slot3", "slot4"]
dataset.slots_shuffle(['0']) slots_vars = []
for slot in slots:
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset.init(
batch_size=1,
thread_num=2,
input_type=1,
pipe_command="cat",
use_var=slots_vars)
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
dataset.slots_shuffle(['slot1'])
""" """
if self.fea_eval: if self.fea_eval:
slots_set = set(slots) slots_set = set(slots)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册