for feed_trainer api

f07724da · xiexionghang · e3c436c5 · f07724da · f07724da · f07724da
27 changed file
--- a/BCLOUD
+++ b/BCLOUD
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,6 +54,7 @@ option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FO
 option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
 option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
+option(WITH_CUSTOM_TRAINER  "Turn on trainer implement by custom"       OFF)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
 option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
@@ -146,10 +147,12 @@ include(external/dlpack)
 include(external/snappy)    # download snappy
 include(external/snappystream) # download snappystream
 include(external/warpctc)   # download, build, install warpctc
+include(external/yaml-cpp)    # download yaml

 if (NOT WIN32)
 # there is no official support of nccl, cupti in windows
 include(cupti)
+include(external/gzstream)
 endif (NOT WIN32)

 if(WITH_PSLIB)

--- a/cmake/external/yaml-cpp.cmake
+++ b/cmake/external/yaml-cpp.cmake
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include(ExternalProject)
+
+# yaml-cpp is only consumed by the custom trainer, so skip the download and
+# build entirely when that feature is off. The variable name is passed
+# unexpanded so that if() evaluates the variable itself instead of whatever
+# its value happens to expand to.
+if(NOT WITH_CUSTOM_TRAINER)
+  return()
+endif()
+
+# Download/build tree and install prefix of the external project.
+set(YAML_SOURCES_DIR ${THIRD_PARTY_PATH}/yaml-cpp)
+set(YAML_INSTALL_DIR ${THIRD_PARTY_PATH}/install/yaml-cpp)
+set(YAML_INCLUDE_DIR "${YAML_INSTALL_DIR}/include" CACHE PATH "yaml include directory." FORCE)
+
+set(YAML_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+
+# Build yaml-cpp as a static, position-independent library with the same
+# compilers and flags as the main project.
+ExternalProject_Add(
+    extern_yaml
+    GIT_REPOSITORY "https://github.com/jbeder/yaml-cpp"
+    GIT_TAG "yaml-cpp-0.6.2"
+    PREFIX          ${YAML_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS=${YAML_CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+                    -DCMAKE_INSTALL_PREFIX=${YAML_INSTALL_DIR}
+                    -DCMAKE_INSTALL_LIBDIR=${YAML_INSTALL_DIR}/lib
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DBUILD_TESTING=OFF
+                    # yaml-cpp 0.6.x names its test switch YAML_CPP_BUILD_TESTS;
+                    # the previous YAML_BUILD_TESTS spelling was ignored.
+                    -DYAML_CPP_BUILD_TESTS:BOOL=OFF
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                    ${EXTERNAL_OPTIONAL_ARGS}
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${YAML_INSTALL_DIR}
+                     -DCMAKE_INSTALL_LIBDIR:PATH=${YAML_INSTALL_DIR}/lib
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+)
+set(YAML_LIBRARIES "${YAML_INSTALL_DIR}/lib/libyaml-cpp.a")
+
+# Imported target that the rest of the build links against; it points at the
+# archive installed by extern_yaml.
+add_library(yaml-cpp STATIC IMPORTED GLOBAL)
+set_property(TARGET yaml-cpp PROPERTY IMPORTED_LOCATION ${YAML_LIBRARIES})
+
+include_directories(${YAML_INCLUDE_DIR})
+# Anything depending on yaml-cpp must wait for the external build to finish.
+add_dependencies(yaml-cpp extern_yaml)
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -21,4 +21,7 @@ cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
 if (WITH_MKLML)
    cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
 endif()
+if (WITH_CUSTOM_TRAINER)
+    cc_library(dynload_custom_trainer SRCS custom_trainer.cc DEPS dynamic_loader)
+endif()
 # TODO(TJ): add iomp, mkldnn?
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -48,6 +48,7 @@ DEFINE_string(
    "Specify path for loading tensorrt library, such as libnvinfer.so.");

 DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
+DEFINE_string(custom_trainer_dir, "", "Specify path for loading custom_trainer.so.");

 namespace paddle {
 namespace platform {
@@ -253,6 +254,10 @@ void* GetMKLMLDsoHandle() {
 #endif
 }

+void* GetCustomTrainerDsoHandle() {
+  return GetDsoHandleFromSearchPath(FLAGS_custom_trainer_dir, "custom_trainer.so");
+}
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -32,6 +32,7 @@ void* GetWarpCTCDsoHandle();
 void* GetNCCLDsoHandle();
 void* GetTensorRtDsoHandle();
 void* GetMKLMLDsoHandle();
+void* GetCustomTrainerDsoHandle();

 }  // namespace dynload
 }  // namespace platform

--- a/paddle/fluid/train/CMakeLists.txt
+++ b/paddle/fluid/train/CMakeLists.txt
@@ -29,3 +29,7 @@ endfunction(train_test)
 if(WITH_TESTING)
  train_test(recognize_digits ARGS mlp conv)
 endif()
+
+if(WITH_CUSTOM_TRAINER)
+   add_subdirectory(custom_trainer)
+endif()
--- a/paddle/fluid/train/custom_trainer/CMakeLists.txt
+++ b/paddle/fluid/train/custom_trainer/CMakeLists.txt
+add_subdirectory(feed)
--- a/paddle/fluid/train/custom_trainer/feed/CMakeLists.txt
+++ b/paddle/fluid/train/custom_trainer/feed/CMakeLists.txt
+add_subdirectory(common)
+add_subdirectory(process)
+cc_library(custom_trainer_main SRCS main.cc DEPS custom_trainer_process custom_trainer_common)
+
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+
+# Linker switches that force every object of the archives listed between them
+# into the executable.
+set(ARCHIVE_START "-Wl,--whole-archive")
+set(ARCHIVE_END "-Wl,--no-whole-archive")
+set(EXTERNAL_LIB "-lrt -ldl -lpthread")
+
+# feed_trainer executable.
+# The custom trainer archives register their factories from
+# __attribute__((constructor)) functions (see REGISTER_CLASS). Nothing
+# references those functions by name, so the archives must sit inside the
+# whole-archive section or the linker is free to drop the registering objects.
+add_executable(feed_trainer main.cc)
+target_link_libraries(feed_trainer
+        ${MACOS_LD_FLAGS}
+        ${ARCHIVE_START}
+        custom_trainer_common custom_trainer_process
+        ${ARCHIVE_END}
+        glog gflags protobuf snappystream snappy z xxhash yaml-cpp
+        paddle_fluid
+        ${EXTERNAL_LIB})
--- a/paddle/fluid/train/custom_trainer/feed/common/CMakeLists.txt
+++ b/paddle/fluid/train/custom_trainer/feed/common/CMakeLists.txt
+cc_library(custom_trainer_common SRCS registerer.cc DEPS memory)
--- a/paddle/fluid/train/custom_trainer/feed/common/registerer.cc
+++ b/paddle/fluid/train/custom_trainer/feed/common/registerer.cc
+#include "paddle/fluid/train/custom_trainer/feed/common/registerer.h"
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+
+// Returns the process-wide registry mapping base-class name -> FactoryMap.
+// The map is allocated on first use and never deleted.
+BaseClassMap& global_factory_map() {
+    static BaseClassMap* const registry = new BaseClassMap();
+    return *registry;
+}
+
+// Forwards to global_factory_map(); this is the accessor used by the
+// registration macros.
+BaseClassMap& global_factory_map_cpp() {
+    BaseClassMap& registry = global_factory_map();
+    return registry;
+}
+
+}// feed
+}// namespace custom_trainer
+}// namespace paddle
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
+
--- a/paddle/fluid/train/custom_trainer/feed/common/registerer.h
+++ b/paddle/fluid/train/custom_trainer/feed/common/registerer.h
+#pragma once
+
+#include <map>
+#include <string>
+#include <iostream>
+#include <vector>
+#include <glog/logging.h>
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+
+// Minimal type-erased value holder.
+// An Any owns a heap-allocated Holder<T> wrapping a copy of the stored value;
+// copying an Any deep-copies the holder via clone().
+class Any {
+public:
+    // Creates an empty Any that holds no value.
+    Any() : content_(NULL) {}
+
+    // Stores a copy of `value`.
+    template<typename ValueType>
+        Any(const ValueType &value) : content_(new Holder<ValueType>(value)) {}
+
+    // Deep-copies the value held by `other` (or stays empty if `other` is empty).
+    Any(const Any &other) : content_(other.content_ ? other.content_->clone() : NULL) {}
+
+    // Copy assignment. Without it the compiler-generated version would copy the
+    // raw pointer, and both objects would delete the same holder.
+    Any &operator=(const Any &other) {
+        if (this != &other) {
+            // Clone first so *this is left untouched if clone() throws.
+            PlaceHolder *copy = other.content_ ? other.content_->clone() : NULL;
+            delete content_;
+            content_ = copy;
+        }
+        return *this;
+    }
+
+    ~Any() {
+        delete content_;
+    }
+
+    // Returns a pointer to the held value, or NULL when empty.
+    // No type check is performed: ValueType must be exactly the type the Any
+    // was constructed with, otherwise the behavior is undefined.
+    template<typename ValueType> ValueType *any_cast() {
+        return content_ ? &static_cast<Holder<ValueType> *>(content_)->held_ : NULL;
+    }
+
+private:
+    // Type-independent base of the holder, so Any can delete/clone it.
+    class PlaceHolder {
+    public:
+        virtual ~PlaceHolder() {}
+        virtual PlaceHolder *clone() const = 0;
+    };
+
+    // Concrete holder carrying a value of type ValueType.
+    template<typename ValueType>
+        class Holder : public PlaceHolder {
+        public:
+            explicit Holder(const ValueType &value) : held_(value) {}
+            virtual PlaceHolder *clone() const {
+                return new Holder(held_);
+            }
+
+            ValueType held_;
+        };
+
+    PlaceHolder *content_;
+};
+
+// Base class of the per-class factories generated by REGISTER_CLASS.
+// The default NewInstance() yields an empty Any; generated subclasses
+// override it to wrap a freshly allocated object.
+class ObjectFactory {
+public:
+    ObjectFactory() {}
+    virtual ~ObjectFactory() {}
+
+    virtual Any NewInstance() {
+        Any empty;
+        return empty;
+    }
+};
+
+typedef std::map<std::string, ObjectFactory*> FactoryMap;
+typedef std::map<std::string, FactoryMap> BaseClassMap;
+#ifdef __cplusplus
+extern "C" {
+#endif
+BaseClassMap& global_factory_map();
+#ifdef __cplusplus
+}
+#endif
+
+BaseClassMap& global_factory_map_cpp();
+
+// Declares `<base_class>Registerer`, whose static CreateInstanceByName(name)
+// looks `name` up among the factories registered for `base_class` and returns
+// a newly created instance, or NULL (after logging an error) when the base
+// class or the name is unknown or the factory produced no object.
+// Must be expanded where Any, FactoryMap, BaseClassMap and
+// global_factory_map_cpp are visible unqualified.
+#define REGISTER_REGISTERER(base_class) \
+    class base_class ## Registerer { \
+        public: \
+            static base_class *CreateInstanceByName(const ::std::string &name) { \
+                BaseClassMap &factories = global_factory_map_cpp(); \
+                BaseClassMap::iterator base_iter = factories.find(#base_class); \
+                if (base_iter == factories.end()) { \
+                    LOG(ERROR) << "Can't Find BaseClass For CreateClass with:" << #base_class; \
+                    return NULL; \
+                } \
+                FactoryMap &map = base_iter->second; \
+                FactoryMap::iterator iter = map.find(name); \
+                if (iter == map.end()) { \
+                    LOG(ERROR) << "Can't Find Class For Create with:" << name; \
+                    return NULL; \
+                } \
+                Any object = iter->second->NewInstance(); \
+                base_class **instance = object.any_cast<base_class*>(); \
+                if (instance == NULL) { \
+                    LOG(ERROR) << "Factory returned no instance for:" << name; \
+                    return NULL; \
+                } \
+                return *instance; \
+            } \
+    };
+
+// Registers class `name` under base class `clazz`.
+// Generates ObjectFactory<name>, whose NewInstance() allocates a `name`, plus
+// a constructor-attribute function that inserts the factory into the global
+// map at load time (only if `name` is not registered yet).
+// The new object is stored as `clazz*`, because CreateInstanceByName reads the
+// Any back as `clazz*` and Any::any_cast performs no type conversion; storing
+// `name*` would make that read a cast between unrelated holder types.
+// `name` must therefore be `clazz` or publicly derived from it.
+#define REGISTER_CLASS(clazz, name) \
+    class ObjectFactory##name : public ObjectFactory { \
+        public: \
+            Any NewInstance() { \
+                return Any(static_cast<clazz*>(new name())); \
+            } \
+    }; \
+    void register_factory_##name() { \
+        FactoryMap &map = global_factory_map_cpp()[#clazz]; \
+        if (map.find(#name) == map.end()) { \
+            map[#name] = new ObjectFactory##name(); \
+        } \
+    } \
+    void register_factory_##name() __attribute__((constructor));
+
+// Expands to `<base_class>Registerer::CreateInstanceByName(name);`.
+// NOTE(review): the expansion already ends with ';', so the macro can only be
+// used where a trailing empty statement is harmless (e.g. `T* p = CREATE_CLASS(T, n);`),
+// not inside a larger expression or argument list.
+#define CREATE_CLASS(base_class, name) \
+    base_class##Registerer::CreateInstanceByName(name);
+    
+}//namespace feed
+}//namespace custom_trainer
+}//namespace paddle
+/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */
--- a/paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h
+++ b/paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h
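+/*
+ *Author: xiexionghang
+ *Runtime environment that hides the differences between running under MPI and running locally.
+ *To stay compatible with every underlying implementation, the calling conditions of each Env
+ *interface are as strict as the combined restrictions of all environments, i.e. sum(limit(env[n])).
+ *Example: under MPI the write interfaces may only be called from a single thread, so that
+ *restriction is assumed for every Env by default.
+ */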
+#pragma once
+#include "paddle/fluid/string/string_helper.h"
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+
+// Abstract interface of the runtime environment the trainer executes in.
+class RuntimeEnvironment {
+public:
+    RuntimeEnvironment() {}
+    virtual ~RuntimeEnvironment() {}
+    // Initialize from configuration.
+    virtual int initialize(YAML::Node& config) = 0;
+    // Environment setup; called after all dependent modules have been initialized.
+    virtual int wireup() = 0;
+    
+    // Interfaces callable from multiple threads: start
+    // rank_idx of the current environment.
+    virtual uint32_t rank_idx() = 0;
+    // Environment-specific logging: formats the arguments with
+    // paddle::string::format_string and hands the result to print_log().
+    template<class... ARGS>
+    void log(int log_type, const char* fmt, ARGS && ... args) {
+        print_log(log_type, paddle::string::format_string(fmt, args...));
+    }
+    // Interfaces callable from multiple threads: end
+
+
+    // Interfaces that may only be called from the main thread: start
+    // barrier
+    virtual void barrier_all() = 0;
+    // Interfaces that may only be called from the main thread: end
+protected:
+    // Sink for already-formatted log lines; implemented by each environment.
+    virtual void print_log(int log_type, const std::string& log_str) = 0;
+};
+
+// MPI flavour of RuntimeEnvironment. As declared here it is still abstract:
+// it re-declares initialize/wireup/rank_idx as pure virtual and does not
+// implement barrier_all or print_log.
+class MPIRuntimeEnvironment : public RuntimeEnvironment {
+public:
+    MPIRuntimeEnvironment() {}
+    virtual ~MPIRuntimeEnvironment() {}
+    // Initialize from configuration.
+    virtual int initialize(YAML::Node& config) = 0;
+    // Environment setup; called after all dependent modules have been initialized.
+    virtual int wireup() = 0;
+    // rank_idx of the current environment.
+    virtual uint32_t rank_idx() = 0;
+};
+
+}  // namespace feed
+}  // namespace custom_trainer
+}  // namespace paddle
--- a/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.cc
+++ b/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.cc
+/* DatasetContainer
+ * Holds the samples of one data source and drives their asynchronous loading.
+ */
+#include <map>
+#include <string>
+#include <vector>
+#include <memory>
+#include <yaml-cpp/yaml.h>
+#include "paddle/fluid/framework/io/shell.h"
+#include "paddle/fluid/string/string_helper.h"
+#include "paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.h"
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+     
+// Returns the sample channel for `epoch_id`.
+// The returned Channel is default-constructed (empty) when the request cannot
+// be served: the epoch id is negative, the container has no prefetch slots
+// configured, or the epoch has not finished downloading yet.
+// On success the current epoch id and dataset slot index are updated; handing
+// out the actual dataset is still commented out below.
+paddle::framework::Channel<DataItem> DatasetContainer::fetch(int epoch_id) {
+    paddle::framework::Channel<DataItem> result;
+    // _prefetch_num defaults to 0, which would make the modulo below a division
+    // by zero; a negative epoch_id would wrap around when stored in the unsigned
+    // slot index.
+    if (epoch_id < 0 || _prefetch_num <= 0) {
+        return result;
+    }
+    if (_ready_epoch_id < epoch_id) {
+        return result;
+    }
+    _current_epoch_id = epoch_id;
+    _current_dataset_idx = epoch_id % _prefetch_num;
+    //result = _dataset_list[_current_dataset_idx].fetch();
+    //_dataset_list[_current_dataset_idx].reset((decltype(result.get())*)NULL);
+    return result;
+}  
+
+// Background download loop. The download itself is not implemented yet, so
+// the loop only sleeps 30 seconds per iteration and never returns.
+void DatasetContainer::async_download_data() {
+    for (;;) {
+        //do download
+        sleep(30);
+    }
+}
+
+}//namespace feed
+}//namespace custom_trainer
+}//namespace paddle
--- a/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.h
+++ b/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.h
+/* DatasetContainer
+ * Holds the samples of one data source and drives their asynchronous loading.
+ */
+#pragma once
+#include <map>
+#include <string>
+#include <vector>
+#include <memory>
+#include <yaml-cpp/yaml.h>
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/data_set.h"
+#include "paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h"
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+
+//单条样本的原始数据
+// Raw data of a single sample.
+class DataItem {
+public:
+    DataItem() {}
+    virtual ~DataItem() {}
+    std::string id;  // sample id; can be used for shuffling
+    std::string data;// complete sample data
+};
+
+// Holds the samples of one data source and drives their asynchronous loading.
+class DatasetContainer {
+public:
+    DatasetContainer() {}
+    virtual ~DatasetContainer() {}
+    // Reads "prefetch_num", "root_path" and "_data_path_generater" from `config`
+    // and keeps a copy of the node.
+    // Returns 0 on success and -1 when prefetch_num is not positive, since
+    // fetch() uses it as a modulus. Missing keys surface as yaml-cpp exceptions
+    // from as<>().
+    virtual int initialize(const YAML::Node& config) {
+        _dataset_config = config;
+        _prefetch_num = config["prefetch_num"].as<int>();
+        _data_root_path = config["root_path"].as<std::string>();
+        // NOTE(review): unlike the other keys this one carries a leading
+        // underscore (it matches the member name) - confirm this is the intended
+        // config key before writing config files against it.
+        _data_path_generater = config["_data_path_generater"].as<std::string>();
+        if (_prefetch_num <= 0) {
+            return -1;
+        }
+        return 0;
+    }  
+    virtual void run();
+    // Fetches the samples of a given epoch; if the data is not ready yet the
+    // returned Channel holds a null pointer.
+    virtual ::paddle::framework::Channel<DataItem> fetch(int epoch_id);
+    // Triggers the check for data that can be prefetched.
+    virtual void pre_detect_data(RuntimeEnvironment* env);
+    
+protected:
+    // Asynchronous sample download.
+    virtual void async_download_data();
+    virtual void download(int epoch_id, const std::vector<std::string>& paths);
+   
+    int _prefetch_num = 0;
+    YAML::Node _dataset_config;
+    std::string _data_root_path;
+    std::string _data_path_generater;
+    
+    // Index of the dataset slot currently in use. Initialized so that reading
+    // it before the first successful fetch() is well defined.
+    uint32_t _current_dataset_idx = 0;
+    int _current_epoch_id = -1;  
+    int _ready_epoch_id = -1; // epoch_id whose download has completed
+    std::vector<std::shared_ptr<::paddle::framework::Dataset>> _dataset_list;
+};
+
+}//namespace feed
+}//namespace custom_trainer
+}//namespace paddle
--- a/paddle/fluid/train/custom_trainer/feed/main.cc
+++ b/paddle/fluid/train/custom_trainer/feed/main.cc
+#include <time.h>
+#include <fstream>
+#include <yaml-cpp/yaml.h>
+#include "paddle/fluid/platform/init.h"
+#include "paddle/fluid/train/custom_trainer/feed/trainer_context.h"
+#include "paddle/fluid/train/custom_trainer/feed/process/process.h"
+#include "paddle/fluid/train/custom_trainer/feed/process/init_env_process.h"
+
+using namespace paddle::custom_trainer::feed;
+
+DEFINE_string(feed_trainer_conf_path, "./conf/trainer.yaml", "path of trainer conf");
+
+// Entry point of feed_trainer.
+// Parses gflags, loads the trainer YAML config, instantiates and initializes
+// every process named in process_name_list, then runs them in order.
+// Returns 0 on success and -1 on any configuration, creation, initialization
+// or run failure.
+int main(int argc, char* argv[]) {
+    //gflags
+    google::ParseCommandLineFlags(&argc, &argv, true);
+    std::string gflag_conf = "./conf/gflags.conf";
+    google::SetCommandLineOption("flagfile", gflag_conf.c_str()); 
+
+    //load trainer config
+    auto trainer_context_ptr = std::make_shared<TrainerContext>();
+    try {
+        trainer_context_ptr->trainer_config = YAML::LoadFile(FLAGS_feed_trainer_conf_path);
+    } catch (const YAML::Exception& e) {
+        // A missing or malformed config would otherwise terminate the program
+        // with an uncaught exception and no hint about which file was at fault.
+        LOG(ERROR) << "Load trainer conf:" << FLAGS_feed_trainer_conf_path
+                   << " failed: " << e.what();
+        return -1;
+    }
+ 
+    std::vector<std::string> process_name_list = {
+        "InitEnvProcess"
+    };
+    // NOTE(review): this instance is run without initialize(); presumably it is
+    // only here to keep the InitEnvProcess object file linked in - confirm.
+    InitEnvProcess init_process;
+    init_process.run();
+
+    for (const auto& process_name : process_name_list) {
+        // CREATE_CLASS expands with its own trailing ';', so it has to be used
+        // as a plain initializer before the pointer is handed to shared_ptr.
+        Process* raw_process = CREATE_CLASS(Process, process_name);
+        if (raw_process == NULL) {
+            LOG(ERROR) << "Process:" << process_name << " does not exist"; 
+            return -1;
+        }
+        // Take ownership immediately so the object is released on every path.
+        std::shared_ptr<Process> process(raw_process);
+        if (process->initialize(trainer_context_ptr) != 0) {
+            LOG(ERROR) << "Process:" << process_name << " initialize failed"; 
+            return -1;
+        }
+        trainer_context_ptr->process_list.push_back(process);
+    } 
+
+    for (auto& process : trainer_context_ptr->process_list) {
+        if (process->run() != 0) {
+            LOG(ERROR) << "Process run failed";
+            return -1;
+        }
+    }
+
+    return 0;
+}
--- a/paddle/fluid/train/custom_trainer/feed/process/CMakeLists.txt
+++ b/paddle/fluid/train/custom_trainer/feed/process/CMakeLists.txt
+# The process sources include <yaml-cpp/yaml.h> through trainer_context.h, so
+# the library must depend on yaml-cpp to make sure the external project has
+# installed its headers before these files are compiled.
+cc_library(custom_trainer_process SRCS process.cc init_env_process.cc DEPS memory yaml-cpp)
--- a/paddle/fluid/train/custom_trainer/feed/process/data_set_process.h
+++ b/paddle/fluid/train/custom_trainer/feed/process/data_set_process.h
+/*
+ *Author: xiexionghang
+ *Organizes the reading of training samples.
+ */
+#pragma once
+#include "paddle/fluid/train/custom_trainer/feed/process/process.h"
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+
+// Process that owns the dataset containers, keyed by name.
+// NOTE(review): this header only includes process.h; DatasetContainer and
+// std::map are not brought in by any include visible here - confirm that
+// dataset_container.h and <map> are included before this class is used.
+class DatasetProcess : public Process {
+public:
+    DatasetProcess() {}
+    virtual ~DatasetProcess() {}
+    virtual int initialize(std::shared_ptr<TrainerContext> context_ptr);
+private:
+    std::map<std::string, DatasetContainer> _dataset_map;
+};
+
+}  // namespace feed
+}  // namespace custom_trainer
+}  // namespace paddle
--- a/paddle/fluid/train/custom_trainer/feed/process/init_env_process.cc
+++ b/paddle/fluid/train/custom_trainer/feed/process/init_env_process.cc
+/*
+ *Author: xiexionghang
+ *Reads the overall configuration of the training environment and performs environment initialization.
+ */
+#include "paddle/fluid/platform/init.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/train/custom_trainer/feed/process/init_env_process.h"
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+
+// Initializes Paddle devices and stores a CPUPlace in the trainer context.
+// Always returns 0.
+int InitEnvProcess::initialize(std::shared_ptr<TrainerContext> context_ptr) {
+    // NOTE(review): the meaning of the `false` argument is defined by
+    // paddle::framework::InitDevices and is not visible here.
+    paddle::framework::InitDevices(false);
+    context_ptr->cpu_place = paddle::platform::CPUPlace();
+    VLOG(3) << "Env initialize success"; 
+    return 0;
+}
+
+}  // namespace feed
+}  // namespace custom_trainer
+}  // namespace paddle
--- a/paddle/fluid/train/custom_trainer/feed/process/init_env_process.h
+++ b/paddle/fluid/train/custom_trainer/feed/process/init_env_process.h
+/*
+ *Author: xiexionghang
+ *Reads the overall configuration of the training environment and performs environment initialization.
+ */
+#pragma once
+#include "paddle/fluid/train/custom_trainer/feed/process/process.h"
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+
+// Process responsible for environment initialization; only initialize() is
+// overridden, run() is inherited from Process.
+class InitEnvProcess : public Process {
+public:
+    InitEnvProcess() {}
+    virtual ~InitEnvProcess() {}
+    // Returns 0 on success (see init_env_process.cc).
+    virtual int initialize(std::shared_ptr<TrainerContext> context_ptr);
+};
+
+}  // namespace feed
+}  // namespace custom_trainer
+}  // namespace paddle
--- a/paddle/fluid/train/custom_trainer/feed/process/process.cc
+++ b/paddle/fluid/train/custom_trainer/feed/process/process.cc
+#include "paddle/fluid/train/custom_trainer/feed/process/process.h"
+#include "paddle/fluid/train/custom_trainer/feed/process/init_env_process.h"
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+// Make InitEnvProcess creatable by name through ProcessRegisterer.
+REGISTER_CLASS(Process, InitEnvProcess);
+// Default run(): does nothing and returns 0.
+int Process::run() {
+    return 0;
+}
+
+
+}  // namespace feed
+}  // namespace custom_trainer
+}  // namespace paddle
--- a/paddle/fluid/train/custom_trainer/feed/process/process.h
+++ b/paddle/fluid/train/custom_trainer/feed/process/process.h
+#pragma once
+#include "paddle/fluid/train/custom_trainer/feed/common/registerer.h"
+#include "paddle/fluid/train/custom_trainer/feed/trainer_context.h"
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+
+// Base class of the steps executed by the trainer's main().
+// main() treats a non-zero return from initialize() as failure.
+class Process {
+public:
+    Process() {}
+    virtual ~Process() {}
+    // Prepares the process using the shared trainer context.
+    virtual int initialize(std::shared_ptr<TrainerContext> context_ptr) = 0;
+    // Executes the process; the default implementation returns 0.
+    virtual int run();
+};
+// Generates ProcessRegisterer, used to create Process subclasses by name.
+REGISTER_REGISTERER(Process);
+
+}  // namespace feed
+}  // namespace custom_trainer
+}  // namespace paddle
--- a/paddle/fluid/train/custom_trainer/feed/scripts/start_feed_trainer.sh
+++ b/paddle/fluid/train/custom_trainer/feed/scripts/start_feed_trainer.sh
+#!/bin/bash
+# Launches feed_trainer with the bundled shared libraries in ./so added to the
+# dynamic loader search path. Any existing LD_LIBRARY_PATH is kept in front;
+# when it is unset no empty leading entry is produced.
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}./so"
+./bin/feed_trainer
--- a/paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp
+++ b/paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <time.h>
+#include <fstream>
+
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/init.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace train {
+
+// Reads the whole file `filename` in binary mode into `*contents`, replacing
+// whatever it held before. An empty file yields an empty string.
+// Fails through PADDLE_ENFORCE when the file cannot be opened or its size
+// cannot be determined.
+void ReadBinaryFile(const std::string& filename, std::string* contents) {
+  std::ifstream fin(filename, std::ios::in | std::ios::binary);
+  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
+  fin.seekg(0, std::ios::end);
+  // tellg() reports -1 on failure; resizing with that value would request an
+  // enormous allocation.
+  const std::streamoff file_size = fin.tellg();
+  PADDLE_ENFORCE(file_size >= 0, "Cannot get size of file %s", filename);
+  contents->clear();
+  contents->resize(static_cast<size_t>(file_size));
+  fin.seekg(0, std::ios::beg);
+  // at(0) throws on an empty string, so only touch the buffer when there is
+  // something to read.
+  if (!contents->empty()) {
+    fin.read(&(contents->at(0)), contents->size());
+  }
+  fin.close();
+}
+
+// Reads a serialized program from `model_filename` and builds a ProgramDesc
+// from it. `executor` is accepted but not used.
+std::unique_ptr<paddle::framework::ProgramDesc> Load(
+    paddle::framework::Executor* executor, const std::string& model_filename) {
+  VLOG(3) << "loading model from " << model_filename;
+
+  std::string serialized_program;
+  ReadBinaryFile(model_filename, &serialized_program);
+
+  return std::unique_ptr<paddle::framework::ProgramDesc>(
+      new paddle::framework::ProgramDesc(serialized_program));
+}
+
+}  // namespace train
+}  // namespace paddle
+
+// Stand-alone training demo: loads "startup_program" and "main_program" from
+// the working directory, feeds fixed data into variables "x" and "y", runs
+// ten training steps on CPU and prints the loss and elapsed clock ticks.
+int main() {
+  paddle::framework::InitDevices(false);
+
+  const auto cpu_place = paddle::platform::CPUPlace();
+
+  paddle::framework::Executor executor(cpu_place);
+  paddle::framework::Scope scope;
+  auto startup_program = paddle::train::Load(&executor, "startup_program");
+  auto train_program = paddle::train::Load(&executor, "main_program");
+
+  // The loss variable is taken to be the first output of the first "mean" op
+  // in block 0.
+  std::string loss_name = "";
+  for (auto op_desc : train_program->Block(0).AllOps()) {
+    if (op_desc->Type() == "mean") {
+      loss_name = op_desc->Output("Out")[0];
+      break;
+    }
+  }
+
+  PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
+
+  // init all parameters
+  executor.Run(*startup_program, &scope, 0);
+
+  // prepare data
+  // x: 2 x 13 tensor filled with 0..25
+  auto x_var = scope.Var("x");
+  auto x_tensor = x_var->GetMutable<paddle::framework::LoDTensor>();
+  x_tensor->Resize({2, 13});
+
+  auto x_data = x_tensor->mutable_data<float>(cpu_place);
+  for (int i = 0; i < 2 * 13; ++i) {
+    x_data[i] = static_cast<float>(i);
+  }
+
+  // y: 2 x 1 tensor filled with 0..1
+  auto y_var = scope.Var("y");
+  auto y_tensor = y_var->GetMutable<paddle::framework::LoDTensor>();
+  y_tensor->Resize({2, 1});
+  auto y_data = y_tensor->mutable_data<float>(cpu_place);
+  for (int i = 0; i < 2 * 1; ++i) {
+    y_data[i] = static_cast<float>(i);
+  }
+
+  auto loss_var = scope.Var(loss_name);
+
+  // Profile the training loop on CPU and time it with clock().
+  paddle::platform::ProfilerState pf_state;
+  pf_state = paddle::platform::ProfilerState::kCPU;
+  paddle::platform::EnableProfiler(pf_state);
+  clock_t t1 = clock();
+
+  for (int i = 0; i < 10; ++i) {
+    executor.Run(*train_program, &scope, 0, false, true);
+    std::cout << "step: " << i << " loss: "
+              << loss_var->Get<paddle::framework::LoDTensor>().data<float>()[0]
+              << std::endl;
+  }
+
+  clock_t t2 = clock();
+  paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kTotal,
+                                    "run_paddle_op_profiler");
+  // Elapsed time is printed in raw clock() ticks, not seconds.
+  std::cout << "run_time = " << t2 - t1 << std::endl;
+  return 0;
+}
--- a/paddle/fluid/train/custom_trainer/feed/trainer_context.h
+++ b/paddle/fluid/train/custom_trainer/feed/trainer_context.h
+#pragma once
+#include <string>
+#include <memory>
+#include <vector>
+#include <yaml-cpp/yaml.h>
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h"
+
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+
+class Process;
+
+// Shared state handed to every Process::initialize() call.
+class TrainerContext {
+public:
+// Trainer configuration; main() fills it from the YAML file given by
+// --feed_trainer_conf_path.
+YAML::Node trainer_config;
+// CPU place; set by InitEnvProcess::initialize().
+paddle::platform::CPUPlace cpu_place;
+// Runtime environment abstraction (see runtime_environment.h).
+std::shared_ptr<RuntimeEnvironment> environment;
+// Processes created by main(), in execution order.
+std::vector<std::shared_ptr<Process>> process_list;
+};
+
+}  // namespace feed
+}  // namespace custom_trainer
+}  // namespace paddle
--- a/publish_include.sh
+++ b/publish_include.sh
+#!/bin/bash
+# Publishes the feed trainer headers into the build output include tree:
+# the top-level *.h files plus those of each sub-directory in SUB_DIR_LIST.
+OUTPUT_PATH=../../../bc_out/baidu/feed-mlarch/paddle-trainer/output/include/
+INCLUDE_DIR=paddle/fluid/train/custom_trainer/feed/
+SUB_DIR_LIST=(common dataset params_accessor process shuffler)
+DEST_DIR="${OUTPUT_PATH}/${INCLUDE_DIR}"
+
+# Start from an empty destination. ":?" aborts instead of expanding to "/*"
+# should the variable ever be empty.
+rm -rf "${DEST_DIR:?}"/*
+# cp does not create missing parent directories, so make sure the target exists.
+mkdir -p "${DEST_DIR}"
+
+cp "${INCLUDE_DIR}"/*.h "${DEST_DIR}/"
+for sub_name in "${SUB_DIR_LIST[@]}"
+do
+    # Listed sub-directories may not exist in the source tree; skip those
+    # instead of creating empty output directories and failing the copy.
+    if [ ! -d "${INCLUDE_DIR}/${sub_name}" ]; then
+        continue
+    fi
+    mkdir -p "${DEST_DIR}/${sub_name}"
+    cp "${INCLUDE_DIR}/${sub_name}"/*.h "${DEST_DIR}/${sub_name}/"
+done
--- a/release.bcloud
+++ b/release.bcloud
+#!/bin/bash
+mkdir -p so
+
+cp baidu_third-party_mklml/so/* so
+rm -rf baidu_third-party_mklml