未验证 提交 cddc7096 编写于 作者: S ShenLiang 提交者: GitHub

fix InMemoryDataset doc (#28688)

* add Inmemorydataset
上级 bb5f8e35
......@@ -242,12 +242,15 @@ class InMemoryDataset(DatasetBase):
"""
:api_attr: Static Graph
InMemoryDataset, it will load data into memory
and shuffle data before training.
It will load data into memory and shuffle data before training.
Examples:
.. code-block:: python
Example:
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
"""
def __init__(self):
......@@ -288,6 +291,7 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
dataset.init(
batch_size=1,
......@@ -329,11 +333,11 @@ class InMemoryDataset(DatasetBase):
"""
:api_attr: Static Graph
should be called in user's python scripts to update settings of dataset instance
should be called in user's python scripts to update settings of dataset instance.
Args:
kwargs: Keyword arguments. Currently, we support the following keys in **kwargs,
including single node settings and advanced distributed related settings:
batch_size(int): batch size. It will be effective during training. default is 1.
thread_num(int): thread num, it is the num of readers. default is 1.
use_var(list): list of variables. Variables which you will use. default is [].
......@@ -360,6 +364,8 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
dataset.init(
batch_size=1,
......@@ -409,6 +415,7 @@ class InMemoryDataset(DatasetBase):
:api_attr: Static Graph
should be called only once in user's python scripts to initialize settings of dataset instance
Args:
kwargs: Keyword arguments. Currently, we support the following keys in **kwargs:
......@@ -427,23 +434,20 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python
import paddle
import os
paddle.enable_static()
with open("test_queue_dataset_run_a.txt", "w") as f:
data = "2 1 2 2 5 4 2 2 7 2 1 3\n"
data += "2 6 2 2 1 4 2 2 4 2 2 3\n"
data += "2 5 2 2 9 9 2 2 7 2 1 3\n"
data += "2 7 2 2 1 9 2 3 7 2 5 3\n"
data = "2 1 2 2 5 4 2 2 7 2 1 3"
f.write(data)
with open("test_queue_dataset_run_b.txt", "w") as f:
data = "2 1 2 2 5 4 2 2 7 2 1 3\n"
data += "2 6 2 2 1 4 2 2 4 2 2 3\n"
data += "2 5 2 2 9 9 2 2 7 2 1 3\n"
data += "2 7 2 2 1 9 2 3 7 2 5 3\n"
data = "2 1 2 2 5 4 2 2 7 2 1 3"
f.write(data)
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = fluid.data(
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
......@@ -458,9 +462,7 @@ class InMemoryDataset(DatasetBase):
["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"])
dataset.load_into_memory()
paddle.enable_static()
place = paddle.CUDAPlace(0) if paddle.fluid.core.is_compiled_with_cuda() else paddle.CPUPlace()
place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
startup_program = paddle.static.Program()
main_program = paddle.static.Program()
......@@ -470,6 +472,7 @@ class InMemoryDataset(DatasetBase):
os.remove("./test_queue_dataset_run_a.txt")
os.remove("./test_queue_dataset_run_b.txt")
"""
batch_size = kwargs.get("batch_size", 1)
thread_num = kwargs.get("thread_num", 1)
......@@ -545,6 +548,7 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
dataset._set_queue_num(12)
......@@ -563,6 +567,7 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
dataset._set_parse_ins_id(True)
......@@ -580,6 +585,7 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
dataset._set_parse_content(True)
......@@ -597,6 +603,7 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
dataset._set_fleet_send_batch_size(800)
......@@ -614,6 +621,7 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
dataset._set_fleet_send_sleep_seconds(2)
......@@ -632,6 +640,7 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
dataset._set_merge_by_lineid()
......@@ -660,7 +669,21 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset.init(
batch_size=1,
thread_num=2,
input_type=1,
pipe_command="cat",
use_var=slots_vars)
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
......@@ -681,7 +704,21 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset.init(
batch_size=1,
thread_num=2,
input_type=1,
pipe_command="cat",
use_var=slots_vars)
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.preload_into_memory()
......@@ -704,7 +741,21 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset.init(
batch_size=1,
thread_num=2,
input_type=1,
pipe_command="cat",
use_var=slots_vars)
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.preload_into_memory()
......@@ -723,7 +774,21 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset.init(
batch_size=1,
thread_num=2,
input_type=1,
pipe_command="cat",
use_var=slots_vars)
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
......@@ -744,12 +809,25 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python
import paddle
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset.init(
batch_size=1,
thread_num=2,
input_type=1,
pipe_command="cat",
use_var=slots_vars)
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
dataset.global_shuffle(fleet)
dataset.global_shuffle()
Args:
fleet(Fleet): fleet singleton. Default None.
......@@ -788,12 +866,25 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python
import paddle
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset.init(
batch_size=1,
thread_num=2,
input_type=1,
pipe_command="cat",
use_var=slots_vars)
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
dataset.global_shuffle(fleet)
dataset.global_shuffle()
exe = paddle.static.Executor(paddle.CPUPlace())
startup_program = paddle.static.Program()
main_program = paddle.static.Program()
......@@ -824,12 +915,25 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python
import paddle
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset.init(
batch_size=1,
thread_num=2,
input_type=1,
pipe_command="cat",
use_var=slots_vars)
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
print dataset.get_memory_data_size(fleet)
print(dataset.get_memory_data_size())
"""
import numpy as np
......@@ -863,13 +967,27 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python
import paddle
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
dataset = paddle.distributed.InMemoryDataset()
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset.init(
batch_size=1,
thread_num=2,
input_type=1,
pipe_command="cat",
use_var=slots_vars)
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
dataset.global_shuffle(fleet)
print dataset.get_shuffle_data_size(fleet)
dataset.global_shuffle()
print(dataset.get_shuffle_data_size())
"""
import numpy as np
......@@ -897,6 +1015,7 @@ class InMemoryDataset(DatasetBase):
.. code-block:: python
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
dataset._set_fea_eval(1000000, True)
......@@ -917,11 +1036,29 @@ class InMemoryDataset(DatasetBase):
slots(list[string]): the set of slots(string) to do slots shuffle.
Examples:
.. code-block:: python
import paddle
paddle.enable_static()
dataset = paddle.distributed.InMemoryDataset()
dataset.set_merge_by_lineid()
#suppose there is a slot 0
dataset.slots_shuffle(['0'])
dataset._init_distributed_settings(fea_eval=True)
slots = ["slot1", "slot2", "slot3", "slot4"]
slots_vars = []
for slot in slots:
var = paddle.static.data(
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset.init(
batch_size=1,
thread_num=2,
input_type=1,
pipe_command="cat",
use_var=slots_vars)
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
dataset.slots_shuffle(['slot1'])
"""
if self.fea_eval:
slots_set = set(slots)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册