未验证 提交 20c8432a 编写于 作者: X xujiaqi01 提交者: GitHub

move dataset from paddfle.fluid to paddle.fleet (#25887)

* move dataset to fleet
test=develop

* fix
test=develop

* fix
test=develop

* fix
test=develop

* test=develop

* test=develop

* test=develop

* test=develop

* test=develop

* test=develop

* test=develop
上级 68c6160e
...@@ -16,10 +16,13 @@ ...@@ -16,10 +16,13 @@
from .base.distributed_strategy import DistributedStrategy from .base.distributed_strategy import DistributedStrategy
from .base.fleet_base import Fleet from .base.fleet_base import Fleet
from .base.util_factory import UtilBase from .base.util_factory import UtilBase
from .dataset import *
#from .base.role_maker import PaddleCloudRoleMaker #from .base.role_maker import PaddleCloudRoleMaker
__all__ = ["DistributedStrategy", "UtilBase"] __all__ = [
"DistributedStrategy", "UtilBase", "DatasetFactory", "DatasetBase",
"InMemoryDataset", "QueueDataset"
]
fleet = Fleet() fleet = Fleet()
init = fleet.init init = fleet.init
......
...@@ -10,3 +10,5 @@ ...@@ -10,3 +10,5 @@
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
from .dataset import *
此差异已折叠。
...@@ -28,7 +28,6 @@ from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_bu ...@@ -28,7 +28,6 @@ from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_bu
from .unique_name import UniqueNameGenerator from .unique_name import UniqueNameGenerator
import logging import logging
import warnings import warnings
from .dataset import DatasetBase, InMemoryDataset
### Dygraph DataLoader configs ### ### Dygraph DataLoader configs ###
import os import os
...@@ -1670,7 +1669,7 @@ class PyReader(DataLoaderBase): ...@@ -1670,7 +1669,7 @@ class PyReader(DataLoaderBase):
class DatasetLoader(DataLoaderBase): class DatasetLoader(DataLoaderBase):
def __init__(self, dataset, places, drop_last): def __init__(self, dataset, places, drop_last):
assert isinstance(dataset, assert isinstance(dataset, paddle.fleet.dataset.
DatasetBase), "dataset must be type of DatasetBase" DatasetBase), "dataset must be type of DatasetBase"
assert not in_dygraph_mode( assert not in_dygraph_mode(
), "DatasetLoader is not supported in dygraph mode yet" ), "DatasetLoader is not supported in dygraph mode yet"
...@@ -1686,7 +1685,7 @@ class DatasetLoader(DataLoaderBase): ...@@ -1686,7 +1685,7 @@ class DatasetLoader(DataLoaderBase):
dataset.set_thread(thread_num) dataset.set_thread(thread_num)
if isinstance(dataset, if isinstance(dataset, paddle.fleet.dataset.
InMemoryDataset) and dataset.queue_num > thread_num: InMemoryDataset) and dataset.queue_num > thread_num:
logging.warn("queue_num {} which is set in Dataset is ignored". logging.warn("queue_num {} which is set in Dataset is ignored".
format(dataset.queue_num)) format(dataset.queue_num))
......
...@@ -210,7 +210,7 @@ class TestDistCTR2x2(FleetDistRunnerBase): ...@@ -210,7 +210,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
filelist.append(train_file_path) filelist.append(train_file_path)
# config dataset # config dataset
dataset = fluid.DatasetFactory().create_dataset() dataset = paddle.fleet.DatasetFactory().create_dataset()
dataset.set_batch_size(batch_size) dataset.set_batch_size(batch_size)
dataset.set_use_var(self.feeds) dataset.set_use_var(self.feeds)
pipe_command = 'python ctr_dataset_reader.py' pipe_command = 'python ctr_dataset_reader.py'
......
...@@ -17,6 +17,7 @@ including create, config, run, etc. ...@@ -17,6 +17,7 @@ including create, config, run, etc.
""" """
from __future__ import print_function from __future__ import print_function
import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.compat as cpt import paddle.compat as cpt
import paddle.fluid.core as core import paddle.fluid.core as core
...@@ -37,23 +38,26 @@ class TestDataset(unittest.TestCase): ...@@ -37,23 +38,26 @@ class TestDataset(unittest.TestCase):
def test_dataset_create(self): def test_dataset_create(self):
""" Testcase for dataset create. """ """ Testcase for dataset create. """
try: try:
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
except: except:
self.assertTrue(False) self.assertTrue(False)
try: try:
dataset = fluid.DatasetFactory().create_dataset("QueueDataset") dataset = paddle.fleet.DatasetFactory().create_dataset(
"QueueDataset")
except: except:
self.assertTrue(False) self.assertTrue(False)
try: try:
dataset = fluid.DatasetFactory().create_dataset( dataset = paddle.fleet.DatasetFactory().create_dataset(
"FileInstantDataset") "FileInstantDataset")
except: except:
self.assertTrue(False) self.assertTrue(False)
try: try:
dataset = fluid.DatasetFactory().create_dataset("MyOwnDataset") dataset = paddle.fleet.DatasetFactory().create_dataset(
"MyOwnDataset")
self.assertTrue(False) self.assertTrue(False)
except: except:
self.assertTrue(True) self.assertTrue(True)
...@@ -91,7 +95,8 @@ class TestDataset(unittest.TestCase): ...@@ -91,7 +95,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1) name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var) slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32) dataset.set_batch_size(32)
dataset.set_thread(3) dataset.set_thread(3)
dataset.set_filelist( dataset.set_filelist(
...@@ -125,7 +130,7 @@ class TestDataset(unittest.TestCase): ...@@ -125,7 +130,7 @@ class TestDataset(unittest.TestCase):
dataset.set_trainer_num(4) dataset.set_trainer_num(4)
dataset.set_hdfs_config("my_fs_name", "my_fs_ugi") dataset.set_hdfs_config("my_fs_name", "my_fs_ugi")
dataset.set_download_cmd("./read_from_afs my_fs_name my_fs_ugi") dataset.set_download_cmd("./read_from_afs my_fs_name my_fs_ugi")
dataset.enable_pv_merge() dataset.set_enable_pv_merge(False)
thread_num = dataset.get_thread_num() thread_num = dataset.get_thread_num()
self.assertEqual(thread_num, 12) self.assertEqual(thread_num, 12)
...@@ -171,7 +176,8 @@ class TestDataset(unittest.TestCase): ...@@ -171,7 +176,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1) name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var) slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32) dataset.set_batch_size(32)
dataset.set_thread(3) dataset.set_thread(3)
dataset.set_filelist([filename1, filename2]) dataset.set_filelist([filename1, filename2])
...@@ -222,7 +228,8 @@ class TestDataset(unittest.TestCase): ...@@ -222,7 +228,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1) name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var) slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32) dataset.set_batch_size(32)
dataset.set_thread(3) dataset.set_thread(3)
dataset.set_filelist([ dataset.set_filelist([
...@@ -293,7 +300,8 @@ class TestDataset(unittest.TestCase): ...@@ -293,7 +300,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="float32", lod_level=1) name=slot, shape=[1], dtype="float32", lod_level=1)
slots_vars.append(var) slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32) dataset.set_batch_size(32)
dataset.set_thread(1) dataset.set_thread(1)
dataset.set_parse_ins_id(True) dataset.set_parse_ins_id(True)
...@@ -359,7 +367,8 @@ class TestDataset(unittest.TestCase): ...@@ -359,7 +367,8 @@ class TestDataset(unittest.TestCase):
name="slot4", shape=[1], dtype="float32", lod_level=0) name="slot4", shape=[1], dtype="float32", lod_level=0)
slots_vars = [var1, var2, var3, var4] slots_vars = [var1, var2, var3, var4]
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32) dataset.set_batch_size(32)
dataset.set_thread(1) dataset.set_thread(1)
dataset.set_parse_ins_id(True) dataset.set_parse_ins_id(True)
...@@ -414,7 +423,8 @@ class TestDataset(unittest.TestCase): ...@@ -414,7 +423,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="float32", lod_level=1) name=slot, shape=[1], dtype="float32", lod_level=1)
slots_vars.append(var) slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32) dataset.set_batch_size(32)
dataset.set_thread(3) dataset.set_thread(3)
dataset.set_filelist([ dataset.set_filelist([
...@@ -507,7 +517,7 @@ class TestDataset(unittest.TestCase): ...@@ -507,7 +517,7 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1) name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var) slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("QueueDataset") dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
dataset.set_batch_size(32) dataset.set_batch_size(32)
dataset.set_thread(3) dataset.set_thread(3)
dataset.set_filelist( dataset.set_filelist(
...@@ -532,7 +542,7 @@ class TestDataset(unittest.TestCase): ...@@ -532,7 +542,7 @@ class TestDataset(unittest.TestCase):
except Exception as e: except Exception as e:
self.assertTrue(False) self.assertTrue(False)
dataset2 = fluid.DatasetFactory().create_dataset("QueueDataset") dataset2 = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
dataset2.set_use_var(slots_vars) dataset2.set_use_var(slots_vars)
dataset2.set_batch_size(32) dataset2.set_batch_size(32)
dataset2.set_thread(3) dataset2.set_thread(3)
...@@ -573,7 +583,7 @@ class TestDataset(unittest.TestCase): ...@@ -573,7 +583,7 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="float32", lod_level=1) name=slot, shape=[1], dtype="float32", lod_level=1)
slots_vars.append(var) slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("QueueDataset") dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
dataset.set_batch_size(32) dataset.set_batch_size(32)
dataset.set_thread(3) dataset.set_thread(3)
dataset.set_filelist( dataset.set_filelist(
...@@ -628,7 +638,8 @@ class TestDataset(unittest.TestCase): ...@@ -628,7 +638,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[None, 1], dtype="int64", lod_level=1) name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var) slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_input_type(1) dataset.set_input_type(1)
dataset.set_batch_size(1) dataset.set_batch_size(1)
dataset.set_thread(2) dataset.set_thread(2)
...@@ -707,7 +718,7 @@ class TestDatasetWithFetchHandler(unittest.TestCase): ...@@ -707,7 +718,7 @@ class TestDatasetWithFetchHandler(unittest.TestCase):
inputs(list): inputs of get_dataset inputs(list): inputs of get_dataset
files(list): files of get_dataset files(list): files of get_dataset
""" """
dataset = fluid.DatasetFactory().create_dataset("QueueDataset") dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
dataset.set_batch_size(32) dataset.set_batch_size(32)
dataset.set_thread(3) dataset.set_thread(3)
dataset.set_filelist(files) dataset.set_filelist(files)
...@@ -864,7 +875,8 @@ class TestDataset2(unittest.TestCase): ...@@ -864,7 +875,8 @@ class TestDataset2(unittest.TestCase):
except ImportError as e: except ImportError as e:
print("warning: no mpi4py") print("warning: no mpi4py")
exe.run(startup_program) exe.run(startup_program)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32) dataset.set_batch_size(32)
dataset.set_thread(3) dataset.set_thread(3)
dataset.set_filelist([ dataset.set_filelist([
...@@ -884,9 +896,6 @@ class TestDataset2(unittest.TestCase): ...@@ -884,9 +896,6 @@ class TestDataset2(unittest.TestCase):
""" """
Testcase for InMemoryDataset from create to run. Testcase for InMemoryDataset from create to run.
""" """
self.skipTest("parameter server will add pslib UT later")
with open("test_in_memory_dataset2_run2_a.txt", "w") as f: with open("test_in_memory_dataset2_run2_a.txt", "w") as f:
data = "1 1 2 3 3 4 5 5 5 5 1 1\n" data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
data += "1 2 2 3 4 4 6 6 6 6 1 2\n" data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
...@@ -902,7 +911,7 @@ class TestDataset2(unittest.TestCase): ...@@ -902,7 +911,7 @@ class TestDataset2(unittest.TestCase):
train_program = fluid.Program() train_program = fluid.Program()
startup_program = fluid.Program() startup_program = fluid.Program()
scope = fluid.Scope() scope = fluid.Scope()
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
with fluid.program_guard(train_program, startup_program): with fluid.program_guard(train_program, startup_program):
slots = ["slot1_ff", "slot2_ff", "slot3_ff", "slot4_ff"] slots = ["slot1_ff", "slot2_ff", "slot3_ff", "slot4_ff"]
slots_vars = [] slots_vars = []
...@@ -936,7 +945,8 @@ class TestDataset2(unittest.TestCase): ...@@ -936,7 +945,8 @@ class TestDataset2(unittest.TestCase):
except ImportError as e: except ImportError as e:
print("warning: no mpi4py") print("warning: no mpi4py")
exe.run(startup_program) exe.run(startup_program)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32) dataset.set_batch_size(32)
dataset.set_thread(3) dataset.set_thread(3)
dataset.set_filelist([ dataset.set_filelist([
...@@ -952,6 +962,63 @@ class TestDataset2(unittest.TestCase): ...@@ -952,6 +962,63 @@ class TestDataset2(unittest.TestCase):
print("warning: catch expected error") print("warning: catch expected error")
fleet._opt_info = None fleet._opt_info = None
fleet._fleet_ptr = None fleet._fleet_ptr = None
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_rank_offset("")
dataset.set_pv_batch_size(1)
dataset.set_hdfs_config("", "")
d = paddle.fleet.DatasetBase()
try:
dataset.set_feed_type("MultiSlotInMemoryDataFeed")
except:
print("warning: catch expected error")
dataset.thread_num = 0
try:
dataset._prepare_to_run()
except:
print("warning: catch expected error")
dataset.set_parse_logkey(True)
dataset.set_merge_by_sid(True)
dataset.set_enable_pv_merge(True)
try:
dataset.preprocess_instance()
except:
print("warning: catch expected error")
try:
dataset.set_current_phase(1)
except:
print("warning: catch expected error")
try:
dataset.postprocess_instance()
except:
print("warning: catch expected error")
dataset.set_fleet_send_batch_size(1024)
try:
dataset.global_shuffle()
except:
print("warning: catch expected error")
dataset.get_pv_data_size()
dataset.get_memory_data_size()
dataset.get_shuffle_data_size()
dataset = paddle.fleet.DatasetFactory().create_dataset(
"QueueDataset")
try:
dataset.local_shuffle()
except:
print("warning: catch expected error")
try:
dataset.global_shuffle()
except:
print("warning: catch expected error")
dataset = paddle.fleet.FileInstantDataset()
try:
dataset.local_shuffle()
except:
print("warning: catch expected error")
try:
dataset.global_shuffle()
except:
print("warning: catch expected error")
os.remove("./test_in_memory_dataset2_run2_a.txt") os.remove("./test_in_memory_dataset2_run2_a.txt")
os.remove("./test_in_memory_dataset2_run2_b.txt") os.remove("./test_in_memory_dataset2_run2_b.txt")
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import numpy as np import numpy as np
import six import six
...@@ -96,7 +97,8 @@ class DatasetLoaderTestBase(unittest.TestCase): ...@@ -96,7 +97,8 @@ class DatasetLoaderTestBase(unittest.TestCase):
def check_batch_number(self, place, randomize_batch_num=False): def check_batch_number(self, place, randomize_batch_num=False):
main_prog, startup_prog, feeds = self.build_network() main_prog, startup_prog, feeds = self.build_network()
dataset = fluid.DatasetFactory().create_dataset(self.dataset_name) dataset = paddle.fleet.DatasetFactory().create_dataset(
self.dataset_name)
dataset.set_batch_size(BATCH_SIZE) dataset.set_batch_size(BATCH_SIZE)
if isinstance(place, fluid.CPUPlace): if isinstance(place, fluid.CPUPlace):
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
"""Test cases for role makers.""" """Test cases for role makers."""
from __future__ import print_function from __future__ import print_function
import paddle
import os import os
import unittest import unittest
...@@ -162,7 +163,8 @@ class TestCloudRoleMaker2(unittest.TestCase): ...@@ -162,7 +163,8 @@ class TestCloudRoleMaker2(unittest.TestCase):
data = "1 1 1 1\n" data = "1 1 1 1\n"
f.write(data) f.write(data)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"]) dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"])
dataset.set_use_var([show, label]) dataset.set_use_var([show, label])
dataset.load_into_memory() dataset.load_into_memory()
......
...@@ -16,6 +16,7 @@ TestCases for Monitor ...@@ -16,6 +16,7 @@ TestCases for Monitor
""" """
from __future__ import print_function from __future__ import print_function
import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
import numpy as np import numpy as np
...@@ -51,7 +52,8 @@ class TestDatasetWithStat(unittest.TestCase): ...@@ -51,7 +52,8 @@ class TestDatasetWithStat(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1) name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var) slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32) dataset.set_batch_size(32)
dataset.set_thread(3) dataset.set_thread(3)
dataset.set_filelist([ dataset.set_filelist([
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
import numpy as np import numpy as np
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册