/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include #include #include // NOLINT #include #include // NOLINT #include #include "paddle/fluid/framework/data_feed.h" namespace paddle { namespace framework { // Dataset is a abstract class, which defines user interfaces // Example Usage: // Dataset* dataset = DatasetFactory::CreateDataset("InMemoryDataset") // dataset->SetFileList(std::vector{"a.txt", "b.txt"}) // dataset->SetThreadNum(1) // dataset->CreateReaders(); // dataset->SetDataFeedDesc(your_data_feed_desc); // dataset->LoadIntoMemory(); // dataset->SetTrainerNum(2); // dataset->GlobalShuffle(); class Dataset { public: Dataset() {} virtual ~Dataset() {} virtual void SetFileList(const std::vector& filelist) = 0; virtual void SetThreadNum(int thread_num) = 0; virtual void SetTrainerNum(int trainer_num) = 0; virtual void SetHdfsConfig(const std::string& fs_name, const std::string& fs_ugi) = 0; virtual void SetDataFeedDesc(const std::string& data_feed_desc_str) = 0; virtual const std::vector& GetFileList() = 0; virtual int GetThreadNum() = 0; virtual int GetTrainerNum() = 0; virtual const paddle::framework::DataFeedDesc& GetDataFeedDesc() = 0; virtual std::vector>& GetReaders() = 0; virtual void LoadIntoMemory() = 0; virtual void LocalShuffle() = 0; virtual void GlobalShuffle() = 0; virtual void CreateReaders() = 0; virtual void DestroyReaders() = 0; protected: virtual int ReceiveFromClient(int msg_type, int client_id, const std::string& msg) = 0; }; // DatasetImpl is the implementation of Dataset, // it holds memory data if user calls load_into_memory template class DatasetImpl : public Dataset { public: DatasetImpl(); virtual ~DatasetImpl() {} virtual void SetFileList(const std::vector& filelist); virtual void SetThreadNum(int thread_num); virtual void SetTrainerNum(int trainer_num); virtual void SetHdfsConfig(const std::string& fs_name, const std::string& fs_ugi); virtual void SetDataFeedDesc(const std::string& data_feed_desc_str); virtual const std::vector& GetFileList() { return filelist_; } virtual int GetThreadNum() { return thread_num_; } virtual int GetTrainerNum() { return trainer_num_; } virtual const paddle::framework::DataFeedDesc& GetDataFeedDesc() { return data_feed_desc_; } virtual std::vector>& GetReaders(); virtual void LoadIntoMemory(); virtual void LocalShuffle(); virtual void GlobalShuffle(); virtual void CreateReaders(); virtual void DestroyReaders(); protected: virtual int ReceiveFromClient(int msg_type, int client_id, const std::string& msg); std::vector> readers_; std::vector memory_data_; std::mutex mutex_for_update_memory_data_; int thread_num_; paddle::framework::DataFeedDesc data_feed_desc_; int trainer_num_; std::vector filelist_; size_t file_idx_; std::mutex mutex_for_pick_file_; }; // use std::vector as data type class MultiSlotDataset : public DatasetImpl> { public: MultiSlotDataset() {} virtual ~MultiSlotDataset() {} }; } // end namespace framework } // end namespace paddle