未验证 提交 20c8432a 编写于 作者: X xujiaqi01 提交者: GitHub

move dataset from paddfle.fluid to paddle.fleet (#25887)

* move dataset to fleet
test=develop

* fix
test=develop

* fix
test=develop

* fix
test=develop

* test=develop

* test=develop

* test=develop

* test=develop

* test=develop

* test=develop

* test=develop
上级 68c6160e
......@@ -16,10 +16,13 @@
from .base.distributed_strategy import DistributedStrategy
from .base.fleet_base import Fleet
from .base.util_factory import UtilBase
from .dataset import *
#from .base.role_maker import PaddleCloudRoleMaker
__all__ = ["DistributedStrategy", "UtilBase"]
__all__ = [
"DistributedStrategy", "UtilBase", "DatasetFactory", "DatasetBase",
"InMemoryDataset", "QueueDataset"
]
fleet = Fleet()
init = fleet.init
......
......@@ -10,3 +10,5 @@
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
from .dataset import *
此差异已折叠。
......@@ -28,7 +28,6 @@ from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_bu
from .unique_name import UniqueNameGenerator
import logging
import warnings
from .dataset import DatasetBase, InMemoryDataset
### Dygraph DataLoader configs ###
import os
......@@ -1670,7 +1669,7 @@ class PyReader(DataLoaderBase):
class DatasetLoader(DataLoaderBase):
def __init__(self, dataset, places, drop_last):
assert isinstance(dataset,
assert isinstance(dataset, paddle.fleet.dataset.
DatasetBase), "dataset must be type of DatasetBase"
assert not in_dygraph_mode(
), "DatasetLoader is not supported in dygraph mode yet"
......@@ -1686,7 +1685,7 @@ class DatasetLoader(DataLoaderBase):
dataset.set_thread(thread_num)
if isinstance(dataset,
if isinstance(dataset, paddle.fleet.dataset.
InMemoryDataset) and dataset.queue_num > thread_num:
logging.warn("queue_num {} which is set in Dataset is ignored".
format(dataset.queue_num))
......
......@@ -210,7 +210,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
filelist.append(train_file_path)
# config dataset
dataset = fluid.DatasetFactory().create_dataset()
dataset = paddle.fleet.DatasetFactory().create_dataset()
dataset.set_batch_size(batch_size)
dataset.set_use_var(self.feeds)
pipe_command = 'python ctr_dataset_reader.py'
......
......@@ -17,6 +17,7 @@ including create, config, run, etc.
"""
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import paddle.compat as cpt
import paddle.fluid.core as core
......@@ -37,23 +38,26 @@ class TestDataset(unittest.TestCase):
def test_dataset_create(self):
""" Testcase for dataset create. """
try:
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
except:
self.assertTrue(False)
try:
dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"QueueDataset")
except:
self.assertTrue(False)
try:
dataset = fluid.DatasetFactory().create_dataset(
dataset = paddle.fleet.DatasetFactory().create_dataset(
"FileInstantDataset")
except:
self.assertTrue(False)
try:
dataset = fluid.DatasetFactory().create_dataset("MyOwnDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"MyOwnDataset")
self.assertTrue(False)
except:
self.assertTrue(True)
......@@ -91,7 +95,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist(
......@@ -125,7 +130,7 @@ class TestDataset(unittest.TestCase):
dataset.set_trainer_num(4)
dataset.set_hdfs_config("my_fs_name", "my_fs_ugi")
dataset.set_download_cmd("./read_from_afs my_fs_name my_fs_ugi")
dataset.enable_pv_merge()
dataset.set_enable_pv_merge(False)
thread_num = dataset.get_thread_num()
self.assertEqual(thread_num, 12)
......@@ -171,7 +176,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist([filename1, filename2])
......@@ -222,7 +228,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist([
......@@ -293,7 +300,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="float32", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(1)
dataset.set_parse_ins_id(True)
......@@ -359,7 +367,8 @@ class TestDataset(unittest.TestCase):
name="slot4", shape=[1], dtype="float32", lod_level=0)
slots_vars = [var1, var2, var3, var4]
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(1)
dataset.set_parse_ins_id(True)
......@@ -414,7 +423,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="float32", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist([
......@@ -507,7 +517,7 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist(
......@@ -532,7 +542,7 @@ class TestDataset(unittest.TestCase):
except Exception as e:
self.assertTrue(False)
dataset2 = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset2 = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
dataset2.set_use_var(slots_vars)
dataset2.set_batch_size(32)
dataset2.set_thread(3)
......@@ -573,7 +583,7 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[1], dtype="float32", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist(
......@@ -628,7 +638,8 @@ class TestDataset(unittest.TestCase):
name=slot, shape=[None, 1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_input_type(1)
dataset.set_batch_size(1)
dataset.set_thread(2)
......@@ -707,7 +718,7 @@ class TestDatasetWithFetchHandler(unittest.TestCase):
inputs(list): inputs of get_dataset
files(list): files of get_dataset
"""
dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist(files)
......@@ -864,7 +875,8 @@ class TestDataset2(unittest.TestCase):
except ImportError as e:
print("warning: no mpi4py")
exe.run(startup_program)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist([
......@@ -884,9 +896,6 @@ class TestDataset2(unittest.TestCase):
"""
Testcase for InMemoryDataset from create to run.
"""
self.skipTest("parameter server will add pslib UT later")
with open("test_in_memory_dataset2_run2_a.txt", "w") as f:
data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
......@@ -902,7 +911,7 @@ class TestDataset2(unittest.TestCase):
train_program = fluid.Program()
startup_program = fluid.Program()
scope = fluid.Scope()
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
with fluid.program_guard(train_program, startup_program):
slots = ["slot1_ff", "slot2_ff", "slot3_ff", "slot4_ff"]
slots_vars = []
......@@ -936,7 +945,8 @@ class TestDataset2(unittest.TestCase):
except ImportError as e:
print("warning: no mpi4py")
exe.run(startup_program)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist([
......@@ -952,6 +962,63 @@ class TestDataset2(unittest.TestCase):
print("warning: catch expected error")
fleet._opt_info = None
fleet._fleet_ptr = None
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_rank_offset("")
dataset.set_pv_batch_size(1)
dataset.set_hdfs_config("", "")
d = paddle.fleet.DatasetBase()
try:
dataset.set_feed_type("MultiSlotInMemoryDataFeed")
except:
print("warning: catch expected error")
dataset.thread_num = 0
try:
dataset._prepare_to_run()
except:
print("warning: catch expected error")
dataset.set_parse_logkey(True)
dataset.set_merge_by_sid(True)
dataset.set_enable_pv_merge(True)
try:
dataset.preprocess_instance()
except:
print("warning: catch expected error")
try:
dataset.set_current_phase(1)
except:
print("warning: catch expected error")
try:
dataset.postprocess_instance()
except:
print("warning: catch expected error")
dataset.set_fleet_send_batch_size(1024)
try:
dataset.global_shuffle()
except:
print("warning: catch expected error")
dataset.get_pv_data_size()
dataset.get_memory_data_size()
dataset.get_shuffle_data_size()
dataset = paddle.fleet.DatasetFactory().create_dataset(
"QueueDataset")
try:
dataset.local_shuffle()
except:
print("warning: catch expected error")
try:
dataset.global_shuffle()
except:
print("warning: catch expected error")
dataset = paddle.fleet.FileInstantDataset()
try:
dataset.local_shuffle()
except:
print("warning: catch expected error")
try:
dataset.global_shuffle()
except:
print("warning: catch expected error")
os.remove("./test_in_memory_dataset2_run2_a.txt")
os.remove("./test_in_memory_dataset2_run2_b.txt")
......
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.fluid as fluid
import numpy as np
import six
......@@ -96,7 +97,8 @@ class DatasetLoaderTestBase(unittest.TestCase):
def check_batch_number(self, place, randomize_batch_num=False):
main_prog, startup_prog, feeds = self.build_network()
dataset = fluid.DatasetFactory().create_dataset(self.dataset_name)
dataset = paddle.fleet.DatasetFactory().create_dataset(
self.dataset_name)
dataset.set_batch_size(BATCH_SIZE)
if isinstance(place, fluid.CPUPlace):
......
......@@ -14,6 +14,7 @@
"""Test cases for role makers."""
from __future__ import print_function
import paddle
import os
import unittest
......@@ -162,7 +163,8 @@ class TestCloudRoleMaker2(unittest.TestCase):
data = "1 1 1 1\n"
f.write(data)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"])
dataset.set_use_var([show, label])
dataset.load_into_memory()
......
......@@ -16,6 +16,7 @@ TestCases for Monitor
"""
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
......@@ -51,7 +52,8 @@ class TestDatasetWithStat(unittest.TestCase):
name=slot, shape=[1], dtype="int64", lod_level=1)
slots_vars.append(var)
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset = paddle.fleet.DatasetFactory().create_dataset(
"InMemoryDataset")
dataset.set_batch_size(32)
dataset.set_thread(3)
dataset.set_filelist([
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -13,6 +13,7 @@
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import numpy as np
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册