diff --git a/BCLOUD b/BCLOUD
index e64102a0fe7c5821610606f8232b36efa025756a..7b7aae714aa0a8f6cdbc8e49527f2ea27bb251bf 100755
--- a/BCLOUD
+++ b/BCLOUD
@@ -1,7 +1,7 @@
 WORKROOT('../../../')
 COMPILER('gcc482')
 CPPFLAGS('-D_GNU_SOURCE -DNDEBUG')
-GLOBAL_CFLAGS_STR = '-g -O0 -pipe -fopenmp '
+GLOBAL_CFLAGS_STR = '-g -O3 -pipe -fopenmp '
 CFLAGS(GLOBAL_CFLAGS_STR)
 GLOBAL_CXXFLAGS_STR = GLOBAL_CFLAGS_STR + ' -std=c++11 '
 CXXFLAGS(GLOBAL_CXXFLAGS_STR)
@@ -35,6 +35,7 @@ CONFIGS('baidu/third-party/pybind11@v2.2.4@git_branch')
 CONFIGS('baidu/third-party/python@gcc482output@git_branch')
 CONFIGS('baidu/third-party/yaml-cpp@yaml-cpp_0-6-2-0_GEN_PD_BL@git_tag')
 CONFIGS('baidu/third-party/openmpi@openmpi_1-4-5-0-feed_mlarch@git_branch')
+CONFIGS('baidu/paddlepaddle/pslib@no_abacus_in_proto@git_branch')
 CONFIGS('third-64/gtest@base')
 HEADERS('paddle/fluid/memory/*.h', '$INC/paddle/fluid/memory/')
 HEADERS('paddle/fluid/memory/detail/*.h', '$INC/paddle/fluid/memory/detail/')
@@ -74,6 +75,7 @@ NEED_OUTPUT("baidu/third-party/openmpi")
 OUTPUT('paddle/fluid/train/custom_trainer/feed/conf', '$OUT')
 OUTPUT('paddle/fluid/train/custom_trainer/feed/scripts', '$OUT')
 
+
 def UT_FILE(filename):
     UT_DIR = 'paddle/fluid/train/custom_trainer/feed/unit_test'
     import os
@@ -81,7 +83,7 @@ def UT_FILE(filename):
 custom_trainer_src = GLOB('paddle/fluid/train/custom_trainer/feed/*/*.cc', Exclude(UT_FILE('*')))
 CPPFLAGS_STR = '-DHPPL_STUB_FUNC -DLAPACK_FOUND -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DCUSTOM_TRAINER -DPADDLE_ON_INFERENCE -DPADDLE_USE_DSO -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_WITH_AVX -DPADDLE_WITH_MKLML -DPADDLE_WITH_XBYAK -DXBYAK64 -DXBYAK_NO_OP_NAMES -D_GNU_SOURCE -D__STDC_LIMIT_MACROS -DPYBIND_AVX_MKLML' + r" -DPADDLE_REVISION=\"%s@%s@%s\"" % (REPO_URL(), REPO_BRANCH(), REPO_REVISION())
 
-CFLAGS_STR = '-m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -fopenmp -mavx -O0 -DNDEBUG '
+CFLAGS_STR = '-m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -Wno-narrowing -Wnarrowing -fopenmp -mavx -O3 -DNDEBUG '
 CXXFLAGS_STR = '-std=c++11 ' + CFLAGS_STR
 
 SharedLibrary("paddle_fluid_avx_mklml", PreBuilt(True))
diff --git a/paddle/fluid/string/to_string.h b/paddle/fluid/string/to_string.h
index 8caf149420393ec81131389d7787bee925f4a27d..9378a0fe61280340b53c3f7afde819c61d2aa0db 100644
--- a/paddle/fluid/string/to_string.h
+++ b/paddle/fluid/string/to_string.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <sstream>
 #include <string>
+#include <vector>
 #include <typeindex>
 
 namespace paddle {
@@ -31,6 +32,15 @@ inline std::string to_string(T v) {
   return sout.str();
 }
 
+// Streams every element of `v_list` through operator<<; each element (including the last)
+// is followed by a single space.
+template <typename T>
+inline std::string to_string(const std::vector<T>& v_list) {
+    std::ostringstream joined;
+    for (auto it = v_list.begin(); it != v_list.end(); ++it) { joined << *it << " "; }
+    return joined.str();
+}
+
 template <>
 inline std::string to_string(std::type_index t) {
   return t.name();
diff --git a/paddle/fluid/train/custom_trainer/feed/accessor/dense_input_accessor.cc b/paddle/fluid/train/custom_trainer/feed/accessor/dense_input_accessor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f1ca59ecd0fa5582f4d83a11c842473f1538e4bd
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/accessor/dense_input_accessor.cc
@@ -0,0 +1,152 @@
+#include "paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h"
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+    
+// Runs the base initialization, then records name, gradient name, shape and flattened dim
+// of every entry in the "input" list of `config`. Always returns 0.
+int DenseInputAccessor::initialize(YAML::Node config,
+        std::shared_ptr<TrainerContext> context_ptr) {
+    CHECK(DataInputAccessor::initialize(config, context_ptr) == 0);
+    _pull_request_num.store(0);
+    _total_dim = 0;
+    for (const auto& input_conf : config["input"]) {
+        DenseInputVariable var;
+        var.name = input_conf["name"].as<std::string>();
+        var.gradient_name = paddle::framework::GradVarName(var.name);
+        var.shape = input_conf["shape"].as<std::vector<int>>();
+        var.dim = 1;
+        for (auto& extent : var.shape) {
+            // Non-positive extents are stored and counted as 1.
+            extent = extent > 0 ? extent : 1;
+            var.dim *= extent;
+        }
+        _total_dim += var.dim;
+        _x_variables.emplace_back(var);
+    }
+    // "async_pull" is optional; the flag is only ever switched on here.
+    if (config["async_pull"] && config["async_pull"].as<bool>()) { _need_async_pull = true; }
+    return 0;
+}
+
+// Sends the current float data of every dense variable in `scope` to the parameter server
+// via push_dense_param and returns the status delivered by the resulting future.
+int32_t DenseInputAccessor::create(::paddle::framework::Scope* scope) {
+    std::vector<paddle::ps::Region> regions;
+    regions.reserve(_x_variables.size());
+    for (auto& variable : _x_variables) {
+        auto* tensor = scope->Var(variable.name)->GetMutable<paddle::framework::LoDTensor>();
+        regions.emplace_back(tensor->data<float>(), variable.dim);
+    }
+    auto* ps_client = _trainer_context->pslib->ps_client();
+    auto push_status = ps_client->push_dense_param(regions.data(), regions.size(), _table_id);
+    return push_status.get();
+}
+
+// Pulls every dense variable of `table_id` over RPC into the flat buffer; must only run on one thread at a time.
+int32_t DenseInputAccessor::pull_dense(size_t table_id) {
+    // forward() treats a non-null _data_buffer as "first pull done", so a freshly allocated buffer is
+    // published only after the pull succeeded. TODO: double-buffer to avoid rewrites during training (tolerable under async SGD).
+    const bool first_pull = (_data_buffer == nullptr);
+    float* data_buffer = first_pull ? new float[_total_dim] : _data_buffer;
+    size_t data_buffer_idx = 0;
+    std::vector<paddle::ps::Region> regions;
+    for (auto& variable : _x_variables) {
+        regions.emplace_back(data_buffer + data_buffer_idx, variable.dim);
+        data_buffer_idx += variable.dim;
+    }
+    auto* ps_client = _trainer_context->pslib->ps_client();
+    auto pull_status = ps_client->pull_dense(regions.data(), regions.size(), table_id);
+    const int32_t ret = pull_status.get();
+    if (first_pull && ret == 0) { _data_buffer = data_buffer; }
+    else if (first_pull) { delete[] data_buffer; }
+    return ret;
+}
+
+// Copies the locally buffered dense values into the variables of `scope` and sizes the matching
+// gradient tensors. The first call pulls synchronously; afterwards pulls are only requested.
+int32_t DenseInputAccessor::forward(SampleInstance* samples, size_t num,
+    paddle::framework::Scope* scope) {
+    if (_data_buffer == nullptr) {
+        std::lock_guard<std::mutex> guard(_pull_mutex);  // released on every exit path of this scope
+        if (_data_buffer == nullptr) {
+            CHECK(pull_dense(_table_id) == 0);
+            _async_pull_thread = std::make_shared<std::thread>(
+                [this]() {
+                while (_need_async_pull) {
+                    if (_pull_request_num > 0) {
+                        pull_dense(_table_id);
+                        _pull_request_num = 0; 
+                    } else {
+                        usleep(50000);
+                    }     
+                }
+            });
+        }
+    }
+
+    size_t data_buffer_idx = 0;
+    for (auto& variable : _x_variables) {
+        paddle::framework::DDim ddim(variable.shape.data(), variable.shape.size());
+        auto* tensor = ScopeHelper::resize_lod_tensor(scope, variable.name, ddim);
+        // Only sizes the gradient tensor; the returned pointer is not needed here.
+        ScopeHelper::resize_lod_tensor(scope, variable.gradient_name, ddim);
+        VLOG(5) << "fill scope variable:" << variable.name << ", " << variable.gradient_name;
+        auto* var_data = tensor->mutable_data<float>(_trainer_context->cpu_place);
+        memcpy(var_data, _data_buffer + data_buffer_idx, variable.dim * sizeof(float));
+        data_buffer_idx += variable.dim;
+    }
+    if (_need_async_pull) {
+        ++_pull_request_num;
+    }
+    return 0;
+}
+
+// Pushes the gradient tensors of all dense variables in `scope`; does nothing unless _need_gradient is set. Always returns 0.
+int32_t DenseInputAccessor::backward(SampleInstance* samples, size_t num,
+        paddle::framework::Scope* scope) {
+    if (!_need_gradient) {
+        return 0;
+    }
+    std::vector<paddle::ps::Region> regions;
+    regions.reserve(_x_variables.size());
+    for (auto& variable : _x_variables) {
+        auto* tensor = scope->Var(variable.gradient_name)->GetMutable<paddle::framework::LoDTensor>();
+        auto* grad_data = tensor->mutable_data<float>(_trainer_context->cpu_place);
+        regions.emplace_back(grad_data, variable.dim);
+    }
+    auto* ps_client = _trainer_context->pslib->ps_client();
+    // The push result is not awaited here, so a failed push is not reported to the caller.
+    auto push_status = ps_client->push_dense(regions.data(), regions.size(), _table_id);
+    return 0;
+}
+
+// Copies each sample's embedx vector into consecutive rows of the single configured variable.
+int32_t EbdVariableInputAccessor::forward(SampleInstance* samples, size_t num,
+    paddle::framework::Scope* scope) {
+    CHECK(_x_variables.size() == 1);
+    auto& variable = _x_variables.front();
+    CHECK(variable.shape.size() == 1);
+    auto* tensor = ScopeHelper::resize_lod_tensor(scope, 
+        variable.name, {num, variable.shape[0]});
+    auto* row = tensor->mutable_data<float>(_trainer_context->cpu_place);
+    const size_t row_bytes = variable.dim * sizeof(float);
+    for (SampleInstance* sample = samples; sample != samples + num; ++sample, row += variable.dim) {
+        CHECK(sample->embedx.size() == variable.dim);
+        memcpy(row, sample->embedx.data(), row_bytes);
+    }
+    return 0;
+}
+
+int32_t EbdVariableInputAccessor::backward(SampleInstance* samples, size_t num,
+    paddle::framework::Scope* scope) {
+    return 0;  // No backward work: neither the samples nor the scope are touched.
+}
+
+REGIST_CLASS(DataInputAccessor, DenseInputAccessor);
+REGIST_CLASS(DataInputAccessor, EbdVariableInputAccessor);
+
+}  // namespace feed
+}  // namespace custom_trainer
+}  // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc b/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc
index 79fe2fc5594554a00633b04a53782ddde7437482..6ef4c398d389a727a86a1241c1155fcc5fec55ab 100644
--- a/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc
+++ b/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc
@@ -23,6 +23,7 @@ namespace feed {
         
         if (!fs->exists(_done_file_path)) {
             VLOG(0) << "missing done file, path:" << _done_file_path;
+            return -1;
         }
 
         std::string done_text = fs->tail(_done_file_path);
@@ -32,43 +33,90 @@ namespace feed {
         _last_checkpoint_path = get_status<std::string>(EpochStatusFiled::CheckpointPathField);
         return 0;
     }
+    
+    // Records completion of `epoch_id` in the done-status fields. On the master worker the done
+    // file is then rewritten: the tail of the old content (fs->tail(path, 1000)) plus the new record.
+    int32_t EpochAccessor::epoch_done(uint64_t epoch_id) {
+        struct timeval now;
+        gettimeofday(&now, NULL);
+        if (need_save_model(epoch_id, ModelSaveWay::ModelSaveTrainCheckpoint)) {
+            _last_checkpoint_epoch_id = epoch_id;
+            _last_checkpoint_path = model_save_path(epoch_id, ModelSaveWay::ModelSaveTrainCheckpoint);
+        }
+        set_status(EpochStatusFiled::EpochIdField, epoch_id);
+        set_status(EpochStatusFiled::TimestampField, now.tv_sec);
+        set_status(EpochStatusFiled::CheckpointIdField, _last_checkpoint_epoch_id);
+        set_status(EpochStatusFiled::CheckpointPathField, _last_checkpoint_path);
+        set_status(EpochStatusFiled::DateField, format_timestamp(epoch_id, "%Y%m%d"));
 
-    int HourlyEpochAccessor::initialize(YAML::Node config,
-        std::shared_ptr<TrainerContext> context_ptr) {
-        EpochAccessor::initialize(config, context_ptr); 
+        // Nodes other than the master worker do not persist the status.
+        if (!_trainer_context->environment->is_master_node(EnvironmentRole::WORKER)) {
+            return 0;
+        }
+        auto fs = _trainer_context->file_system.get();
+        std::string done_str = paddle::string::join_strings(_done_status, '\t');
+        // Keep the tail of the existing done file and append the new record on its own line.
+        std::string tail_done_info = paddle::string::trim_spaces(fs->tail(_done_file_path, 1000));
+        if (!tail_done_info.empty()) {
+            tail_done_info += "\n";
+        }
+        tail_done_info += done_str;
+        VLOG(2) << "Write epoch donefile to " << _done_file_path << ", str:" << done_str;
+        while (true) {  // retry every 10s until everything is written; a null handle is retried, never dereferenced
+            fs->remove(_done_file_path);
+            auto fp = fs->open_write(_done_file_path, "");
+            if (fp && fwrite(tail_done_info.data(), 1, tail_done_info.size(), &*fp) == tail_done_info.size()) {
+                break;
+            }
+            sleep(10);
+        }
+        VLOG(2) << "Write epoch donefile success";
         return 0;
     }
 
-    void HourlyEpochAccessor::next_epoch() {
+    int TimelyEpochAccessor::initialize(YAML::Node config,
+        std::shared_ptr<TrainerContext> context_ptr) {  // reads the timing settings, then runs the base initialize
+        _time_zone_seconds = config["time_zone_seconds"].as<int>();
+        _train_time_interval = config["train_time_interval"].as<int>();
+        CHECK(_train_time_interval > 0 && (_train_time_interval % SecondsPerMin) == 0);  // positive multiple of SecondsPerMin
+        _train_num_per_day = SecondsPerDay / _train_time_interval;  // integer division
+        return EpochAccessor::initialize(config, context_ptr); 
+    }
+
+    void TimelyEpochAccessor::next_epoch() {  // replaces _current_epoch_id with next_epoch_id(_current_epoch_id)
         _current_epoch_id = next_epoch_id(_current_epoch_id);
     }
 
-    std::string HourlyEpochAccessor::text(uint64_t epoch_id) {
-        return format_timestamp(epoch_id, "%Y%m%d delta-%H");
+    std::string TimelyEpochAccessor::text(uint64_t epoch_id) {
+        // delta_id() returns size_t; narrow it so the argument matches the "%d" conversion.
+        const int32_t delta = static_cast<int32_t>(delta_id(epoch_id));
+        return string::format_string("%s delta-%d", format_timestamp(epoch_id, "%Y%m%d%H%M").c_str(), delta);
     }
 
-    uint64_t HourlyEpochAccessor::next_epoch_id(uint64_t epoch_id) {
+    uint64_t TimelyEpochAccessor::next_epoch_id(uint64_t epoch_id) {  // id of the epoch that follows `epoch_id`
         if (epoch_id == 0) {
             struct timeval now; 
             gettimeofday(&now, NULL); 
-            return now.tv_sec / (24 * 3600) * (24 * 3600);
+            // Round the current time down to a whole multiple of SecondsPerDay.
+            return now.tv_sec / SecondsPerDay * SecondsPerDay;
         } 
-        return epoch_id + 3600;
+        return epoch_id + _train_time_interval;
     }
 
-    bool HourlyEpochAccessor::is_last_epoch(uint64_t epoch_id) {
-        return ((epoch_id / 3600) % 24) == 23;
+    bool TimelyEpochAccessor::is_last_epoch(uint64_t epoch_id) {
+        // delta_id() is zero-based (0 .. _train_num_per_day - 1), so the last epoch of a day is N - 1.
+        return delta_id(epoch_id) + 1 == _train_num_per_day;
     }
  
-    uint64_t HourlyEpochAccessor::epoch_time_interval() {
-        return 3600;
+    uint64_t TimelyEpochAccessor::epoch_time_interval() {  // seconds between consecutive epochs
+        return _train_time_interval;  // value of the "train_time_interval" config entry
     }
 
-    uint64_t HourlyEpochAccessor::epoch_timestamp(uint64_t epoch_id) {
+    uint64_t TimelyEpochAccessor::epoch_timestamp(uint64_t epoch_id) {  // the epoch id is returned unchanged as the timestamp
         return epoch_id;
     }
  
-    bool HourlyEpochAccessor::need_save_model(uint64_t epoch_id, ModelSaveWay save_way) {
+    bool TimelyEpochAccessor::need_save_model(uint64_t epoch_id, ModelSaveWay save_way) {
         if (epoch_id == 0) {
             return false;
         }
@@ -78,24 +126,30 @@ namespace feed {
             case ModelSaveWay::ModelSaveInferenceBase:
                 return is_last_epoch(epoch_id);
             case ModelSaveWay::ModelSaveTrainCheckpoint:
-                return ((epoch_id / 3600) % 8) == 0;
+                return ((epoch_id / SecondsPerHour) % 8) == 0;
         }
         return false;
     }
 
-    std::string HourlyEpochAccessor::model_save_path(uint64_t epoch_id, ModelSaveWay save_way) {
+    std::string TimelyEpochAccessor::model_save_path(uint64_t epoch_id, ModelSaveWay save_way) {  // path under _model_root_path for `save_way`
+        int32_t delta = delta_id(epoch_id);  // narrowed to int32_t for the "%d" conversions below
+        std::string date = format_timestamp(epoch_id, "%Y%m%d");
+        std::string date_with_hour = format_timestamp(epoch_id, "%Y%m%d%H");
         switch (save_way) {
             case ModelSaveWay::ModelSaveInferenceDelta:
-                return _trainer_context->file_system->path_join(_model_root_path, "/xbox/delta-" + std::to_string(epoch_id));
+                return _trainer_context->file_system->path_join(_model_root_path, 
+                    string::format_string("xbox/%s/delta-%d", date.c_str(), delta));  // xbox/<date>/delta-<delta>
             case ModelSaveWay::ModelSaveInferenceBase:
-                return _trainer_context->file_system->path_join(_model_root_path, "/xbox/base");
+                return _trainer_context->file_system->path_join(_model_root_path, 
+                    string::format_string("xbox/%s/base", date.c_str()));  // xbox/<date>/base
             case ModelSaveWay::ModelSaveTrainCheckpoint:
-                return _trainer_context->file_system->path_join(_model_root_path, "/xbox/checkpoint");
+                return _trainer_context->file_system->path_join(_model_root_path, 
+                    string::format_string("batch_model/%s", date_with_hour.c_str()));  // batch_model/<date+hour>
         }
         return "";
     }
 
-    REGISTER_CLASS(EpochAccessor, HourlyEpochAccessor);
+    REGIST_CLASS(EpochAccessor, TimelyEpochAccessor);
 
 }  // namespace feed
 }  // namespace custom_trainer
diff --git a/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h b/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h
index 8646893c64d4ab9f2701b5b4ee6cc4fde25eaefe..07b15c62c59228b3c6532550b4554542648f91e5 100644
--- a/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h
+++ b/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h
@@ -31,24 +31,35 @@ public:
     virtual const std::string& checkpoint_path() {
         return _last_checkpoint_path;
     }
-    
+
+    virtual int32_t epoch_done(uint64_t epoch_id);
+
     template <class T>
     T get_status(EpochStatusFiled field) {
         auto status = paddle::string::trim_spaces(_done_status[static_cast<int>(field)]);
         return boost::lexical_cast<T>(status.c_str());
     }
-    
-    virtual void next_epoch()                     = 0;
+    template <class T>
+    void set_status(EpochStatusFiled field, const T& status) {
+        // Stores `status` as text in the entry of _done_status selected by `field`.
+        const auto slot = static_cast<int>(field);
+        _done_status[slot] = paddle::string::to_string(status);
+    }
     virtual std::string model_root_path() {
         return _model_root_path;
     }
+
+    virtual void next_epoch()                     = 0;
+    
     virtual std::string text(uint64_t epoch_id)   = 0;
     virtual uint64_t next_epoch_id(uint64_t epoch_id)  = 0;
     virtual bool is_last_epoch(uint64_t epoch_id) = 0; 
+
     //epoch间的数据时间间隔（秒）
     virtual uint64_t epoch_time_interval() = 0;
     //获取epoch的样本数据时间
     virtual uint64_t epoch_timestamp(uint64_t epoch_id) = 0; 
+
     virtual bool need_save_model(uint64_t epoch_id, ModelSaveWay save_way) = 0;
     virtual std::string model_save_path(uint64_t epoch_id, ModelSaveWay save_way) = 0;
 protected:
@@ -61,12 +72,12 @@ protected:
     std::vector<std::string> _done_status; //当前完成状态，统一存成string
     
 };
-REGISTER_REGISTERER(EpochAccessor);
+REGIST_REGISTERER(EpochAccessor);
 
-class HourlyEpochAccessor : public EpochAccessor {
+class TimelyEpochAccessor : public EpochAccessor {
 public:
-    HourlyEpochAccessor() {}
-    virtual ~HourlyEpochAccessor() {}
+    TimelyEpochAccessor() {}
+    virtual ~TimelyEpochAccessor() {}
     virtual int initialize(YAML::Node config,
         std::shared_ptr<TrainerContext> context_ptr);
     virtual void next_epoch();
@@ -77,6 +88,14 @@ public:
     virtual uint64_t epoch_timestamp(uint64_t epoch_id); 
     virtual bool need_save_model(uint64_t epoch_id, ModelSaveWay save_way);
     virtual std::string model_save_path(uint64_t epoch_id, ModelSaveWay save_way);
+
+private:
+    inline size_t delta_id(uint64_t epoch_id) {  // zero-based index of the epoch's interval within its time-zone-shifted day
+        return ((epoch_id + _time_zone_seconds) % SecondsPerDay) / _train_time_interval; 
+    }  // NOTE(review): _time_zone_seconds is uint32_t but loaded via as<int>(); a negative offset would wrap - confirm
+    uint32_t _time_zone_seconds;   // 相对UTC时差(秒)
+    uint32_t _train_time_interval; // 训练时间间隔(秒)
+    uint32_t _train_num_per_day;   // 天级训练总轮数
 };
 
 }  // namespace feed
diff --git a/paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h b/paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc2063c4288b2623bfd477c65afc5ad0be1dac19
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h
@@ -0,0 +1,168 @@
+#pragma once
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/train/custom_trainer/feed/accessor/accessor.h"
+#include "paddle/fluid/train/custom_trainer/feed/dataset/data_reader.h"
+#include "paddle/fluid/train/custom_trainer/feed/common/scope_helper.h"
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+
+class DataInputAccessor : public Accessor {  // base interface of the accessors that fill inputs (forward) and handle gradients (backward)
+public:
+    DataInputAccessor() {}
+    virtual ~DataInputAccessor() {}
+
+    virtual int initialize(YAML::Node config,
+        std::shared_ptr<TrainerContext> context_ptr) {  // reads "table_id" and "need_gradient"; always returns 0
+        _trainer_context = context_ptr.get();
+        _table_id = config["table_id"].as<int>();
+        _need_gradient = config["need_gradient"].as<bool>();
+        return 0;
+    }
+
+    // Create: generally used for the random initialization of a cold-started model.
+    virtual int32_t create(::paddle::framework::Scope* scope) {
+        return 0;
+    }
+
+    // Forward: generally used to fill the inputs; called before the training network runs.
+    virtual int32_t forward(SampleInstance* samples, size_t num,
+        ::paddle::framework::Scope* scope) = 0;
+
+    // Backward: generally used to update gradients; called after the training network has run.
+    virtual int32_t backward(SampleInstance* samples, size_t num,
+        ::paddle::framework::Scope* scope) = 0;
+protected:
+    size_t _table_id = 0;
+    bool _need_gradient = false;
+    TrainerContext* _trainer_context = nullptr;  // raw pointer taken from context_ptr.get(); not owned by this object
+};
+REGIST_REGISTERER(DataInputAccessor);
+
+struct LabelInputVariable {   // one entry of the label accessor's "input" config list
+    std::string label_name;   // scope variable that LabelInputAccessor::forward fills with the sample labels
+    std::string output_name;  // scope variable that LabelInputAccessor::backward copies into sample.predicts
+    size_t label_dim = 0;     // floats per sample: sum of the positive entries of the configured shape
+};
+class LabelInputAccessor : public DataInputAccessor {  // moves labels into the scope and network outputs back into the samples
+public:
+    LabelInputAccessor() {}
+    virtual ~LabelInputAccessor() {}
+    
+    virtual int initialize(YAML::Node config,
+         std::shared_ptr<TrainerContext> context_ptr);  // loads the label list from config["input"]
+
+    virtual int32_t forward(SampleInstance* samples, size_t num,
+        ::paddle::framework::Scope* scope);  // sample.labels -> label tensors
+
+    virtual int32_t backward(SampleInstance* samples, size_t num,
+        ::paddle::framework::Scope* scope);  // output tensors -> sample.predicts
+protected:
+    size_t _label_total_dim = 0;  // sum of label_dim over _labels
+    std::vector<LabelInputVariable> _labels;
+};
+
+struct SparseInputVariable {  // one entry of the sparse accessor's "input" config list
+    size_t slot_dim;    // value of the "slot_dim" config entry
+    size_t total_dim;   // slot_list.size() * slot_dim
+    std::string name;
+    std::string gradient_name;        // paddle::framework::GradVarName(name)
+    std::vector<int32_t> slot_idx;    // indexed by slot id: position of that slot in slot_list, -1 if not configured
+    std::vector<uint16_t> slot_list;  // configured slot ids, in config order
+};
+
+struct SparseVarRuntimeData {  // NOTE(review): no use is visible in this view; the field notes below are presumed from the names - confirm
+    uint32_t row_size;     // presumably elements per row
+    uint32_t total_size;   // presumably total element count
+    float* variable_data;  // presumably the variable tensor's data (ownership not visible here)
+    float* gradient_data;  // presumably the gradient tensor's data (ownership not visible here)
+};
+
+class BaseSparseInputAccessor : public DataInputAccessor {
+public:
+    BaseSparseInputAccessor() {}
+    virtual ~BaseSparseInputAccessor() {}
+
+    virtual int initialize(YAML::Node config,
+        std::shared_ptr<TrainerContext> context_ptr);
+
+    // Fills the inputs during the forward pass.
+    virtual int32_t forward(SampleInstance* samples, size_t num,
+        paddle::framework::Scope* scope);
+    // Receives the pulled value of a single sparse key and fills that single sparse value.
+    virtual void fill_input(float* var_data, const float* pull_raw,
+        paddle::ps::ValueAccessor&, SparseInputVariable&, SampleInstance&) = 0;
+    // Called after all sparse values have been filled; allows further global processing.
+    virtual void post_process_input(float* var_data, SparseInputVariable&, SampleInstance*, size_t num) = 0;
+
+    // Pushes the gradients during the backward pass.
+    virtual int32_t backward(SampleInstance* samples, size_t num,
+        paddle::framework::Scope* scope);    
+    // Called for each sparse gradient value in turn, to arrange the gradient that is pushed.
+    virtual void fill_gradient(float* push_value, const float* gradient_raw, 
+        paddle::ps::ValueAccessor&, SparseInputVariable&, SampleInstance&) = 0;
+
+protected:
+    // List of input layers.
+    std::vector<SparseInputVariable> _x_variables;       
+};
+
+struct DenseInputVariable {  // one entry of the dense accessor's "input" config list
+    size_t dim;                 // product of the shape entries (after non-positive ones were replaced by 1)
+    std::string name;
+    std::vector<int> shape;     // configured shape; DenseInputAccessor::initialize rewrites non-positive entries to 1
+    std::string gradient_name;  // paddle::framework::GradVarName(name)
+};
+
+// Dense-parameter accessor: keeps a flat local copy (_data_buffer) of the table's variables, refreshed by pull_dense().
+class DenseInputAccessor : public DataInputAccessor {
+public:
+    DenseInputAccessor() {}
+    virtual ~DenseInputAccessor() {
+        // Stop and join the puller thread first: it may still be writing into _data_buffer.
+        _need_async_pull = false;
+        if (_async_pull_thread) {
+            _async_pull_thread->join();
+        }
+        delete[] _data_buffer;
+    }
+    
+    virtual int initialize(YAML::Node config,
+        std::shared_ptr<TrainerContext> context_ptr);
+    
+    virtual int32_t create(::paddle::framework::Scope* scope);
+
+    virtual int32_t forward(SampleInstance* samples, size_t num,
+        paddle::framework::Scope* scope);
+
+    virtual int32_t backward(SampleInstance* samples, size_t num,
+        paddle::framework::Scope* scope);
+protected:
+    virtual int32_t pull_dense(size_t table_id);
+
+    size_t _total_dim = 0;                  // sum of dim over _x_variables
+    std::mutex _pull_mutex;                 // guards the first, synchronous pull in forward()
+    bool _need_async_pull = false;          // NOTE(review): plain bool shared with the puller thread - consider std::atomic<bool>
+    float* _data_buffer = nullptr;          // owned; _total_dim floats once allocated
+    std::atomic<int> _pull_request_num;     // set to 0 in initialize(); bumped by forward(), reset by the puller
+    std::vector<DenseInputVariable> _x_variables; 
+    std::shared_ptr<std::thread> _async_pull_thread;
+};
+
+class EbdVariableInputAccessor : public DenseInputAccessor {  // feeds each sample's embedx vector into one configured variable
+public:
+    EbdVariableInputAccessor() {}
+    virtual ~EbdVariableInputAccessor() {}
+
+    virtual int32_t forward(SampleInstance* samples, size_t num,
+        paddle::framework::Scope* scope);  // copies sample.embedx row by row into the variable
+
+    virtual int32_t backward(SampleInstance* samples, size_t num,
+        paddle::framework::Scope* scope);  // returns 0 without doing any work
+};
+
+}  // namespace feed
+}  // namespace custom_trainer
+}  // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/accessor/label_input_accessor.cc b/paddle/fluid/train/custom_trainer/feed/accessor/label_input_accessor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1a18dfea48c03029bd45d3bfb83e21a42eb38b0b
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/accessor/label_input_accessor.cc
@@ -0,0 +1,74 @@
+#include "paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h"
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+    
+// Loads the label list from config["input"]; a label's dim is the sum of its positive shape entries.
+int LabelInputAccessor::initialize(YAML::Node config,
+        std::shared_ptr<TrainerContext> context_ptr) {
+    _trainer_context = context_ptr.get();
+    _label_total_dim = 0;
+    for (const auto& input_conf : config["input"]) {
+        LabelInputVariable label;
+        label.label_name = input_conf["label_name"].as<std::string>();
+        label.output_name = input_conf["output_name"].as<std::string>();
+        label.label_dim = 0;
+        for (auto extent : input_conf["shape"].as<std::vector<int>>()) {
+            if (extent > 0) { label.label_dim += extent; }
+        }
+        _label_total_dim += label.label_dim;
+        _labels.emplace_back(label);
+    }
+    return 0;
+}
+
+// Copies every sample's labels into the label tensors (one row per sample) and sizes the output tensors.
+int32_t LabelInputAccessor::forward(SampleInstance* samples, size_t num,
+    paddle::framework::Scope* scope) {
+    if (num < 1) {
+        return 0;
+    }
+    size_t sample_label_data_idx = 0;
+    for (auto& label : _labels) {
+        auto* tensor = ScopeHelper::resize_lod_tensor(scope, label.label_name, {num, label.label_dim});
+        ScopeHelper::resize_lod_tensor(scope, label.output_name, {num, label.label_dim});  // only sized, nothing is written
+        auto* var_data = tensor->mutable_data<float>(_trainer_context->cpu_place);
+        for (size_t i = 0; i < num; ++i) {
+            // The whole [idx, idx + label_dim) slice is read below, so all of it must exist.
+            CHECK(samples[i].labels.size() >= sample_label_data_idx + label.label_dim);
+            memcpy(var_data + i * label.label_dim,
+                samples[i].labels.data() + sample_label_data_idx, label.label_dim * sizeof(float));
+        }
+        sample_label_data_idx += label.label_dim;
+    }
+    return 0;
+}
+
+// Copies the output tensors back into each sample's `predicts`, one label slice after another.
+int32_t LabelInputAccessor::backward(SampleInstance* samples, size_t num,
+        paddle::framework::Scope* scope) {
+    if (num < 1) {
+        return 0;
+    }
+    for (size_t i = 0; i < num; ++i) {
+        samples[i].predicts.resize(_label_total_dim);
+    }
+    size_t sample_predict_data_idx = 0;
+    for (auto& label : _labels) {
+        auto* tensor = scope->Var(label.output_name)->GetMutable<paddle::framework::LoDTensor>();  // looked up once per label
+        auto* var_data = tensor->mutable_data<float>(_trainer_context->cpu_place);
+        for (size_t i = 0; i < num; ++i) {
+            memcpy(samples[i].predicts.data() + sample_predict_data_idx,
+                var_data + i * label.label_dim, label.label_dim * sizeof(float));
+        }
+        sample_predict_data_idx += label.label_dim;
+    }
+    return 0;
+}
+
+REGIST_CLASS(DataInputAccessor, LabelInputAccessor);
+
+}  // namespace feed
+}  // namespace custom_trainer
+}  // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/accessor/sparse_input_accessor.cc b/paddle/fluid/train/custom_trainer/feed/accessor/sparse_input_accessor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e51fd1b59b52e8d4a39e655bd3a8a7abdc5de5f6
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/accessor/sparse_input_accessor.cc
@@ -0,0 +1,224 @@
+#include <math.h>
+#include <vector>
+#include <utility>
+#include "paddle/fluid/string/string_helper.h"
+#include "paddle/fluid/train/custom_trainer/feed/common/scope_helper.h"
+#include "paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h"
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+
+// Reads the "input" list of the accessor config and builds one
+// SparseInputVariable per entry (name, gradient name, slot table, dimensions).
+//   config:      YAML node with an "input" sequence; each entry provides
+//                "name", "slots" and "slot_dim"
+//   context_ptr: forwarded to DataInputAccessor::initialize
+// Returns 0; aborts via CHECK when the base initialisation fails or a slot id
+// does not fit into 16 bits.
+int BaseSparseInputAccessor::initialize(YAML::Node config,
+    std::shared_ptr<TrainerContext> context_ptr) {
+    CHECK(DataInputAccessor::initialize(config, context_ptr) == 0);
+    for (const auto& input : config["input"]) {
+        SparseInputVariable variable;
+        variable.name = input["name"].as<std::string>();
+        variable.gradient_name = paddle::framework::GradVarName(variable.name);
+        auto slots = input["slots"].as<std::vector<int>>();
+        // slot_idx is indexed directly by a 16-bit slot id, so it needs an entry
+        // for every value in [0, UINT16_MAX]; UINT16_MAX entries left the last
+        // slot id out of bounds
+        variable.slot_idx.resize(static_cast<size_t>(UINT16_MAX) + 1, -1);
+        const int slot_count = static_cast<int>(slots.size());
+        for (int i = 0; i < slot_count; ++i) {
+            // A silent cast would alias an out-of-range id onto another slot
+            CHECK(slots[i] >= 0 && slots[i] <= UINT16_MAX)
+                << "slot id out of uint16 range:" << slots[i];
+            uint16_t slot = static_cast<uint16_t>(slots[i]);
+            variable.slot_idx[slot] = i;
+            variable.slot_list.push_back(slot);
+        }
+        variable.slot_dim = input["slot_dim"].as<int>();
+        variable.total_dim = variable.slot_list.size() * variable.slot_dim; 
+        _x_variables.push_back(variable);
+    }
+    return 0;
+}
+
+// Pulls the sparse parameters of every feature in the batch from the parameter
+// server and scatters them into the scope variables listed in _x_variables.
+//   samples: batch of `num` samples; each feature's `weights` buffer receives
+//            the pulled values
+//   scope:   receives one input tensor and one zeroed gradient tensor per
+//            variable, plus "sparse_runtime_data" (address of the per-batch
+//            bookkeeping vector)
+// Returns 0 on success, or the non-zero status reported by pull_sparse.
+// The bookkeeping vector allocated here is deleted by backward().
+int32_t BaseSparseInputAccessor::forward(SampleInstance* samples,
+    size_t num, paddle::framework::Scope* scope) {
+    CHECK(num > 0);
+    auto* ps_client = _trainer_context->pslib->ps_client();
+    auto* value_accessor = ps_client->table_accessor(_table_id);
+    size_t key_num = 0;
+    for (size_t i = 0; i < num; ++i) {
+        key_num += samples[i].features.size();
+    }
+    std::vector<uint64_t> keys(key_num);
+    // Held in a vector so the pointer table is released on every exit path
+    std::vector<float*> pull_values(key_num);
+    auto pull_value_dim = value_accessor->select_dim();
+
+    // Fill the sparse key request
+    size_t key_idx = 0;
+    for (size_t i = 0; i < num; ++i) {
+        auto& features = samples[i].features;
+        for (auto& feature_item : features) {
+            feature_item.weights.resize(pull_value_dim, 0.0);
+            keys[key_idx] = feature_item.sign();
+            pull_values[key_idx++] = &(feature_item.weights[0]);
+        }
+    }
+    auto pull_status = ps_client->pull_sparse(pull_values.data(), _table_id, keys.data(), key_num);
+    auto ret = pull_status.get();
+    if (ret != 0) {
+        VLOG(0) << "pull sparse failed, table_id:" << _table_id << ", key_num:" << key_num << ", ret:" << ret;
+        return ret;
+    }
+
+    // Ownership of this vector travels through the scope as an integer address
+    // and is reclaimed in backward()
+    auto* runtime_data_ptr = new std::vector<SparseVarRuntimeData>();
+    auto& var_runtime_data = *runtime_data_ptr;
+    var_runtime_data.resize(_x_variables.size());
+    int64_t runtime_data_for_scope = (int64_t)runtime_data_ptr;
+    ScopeHelper::fill_value(scope, _trainer_context->cpu_place,
+        "sparse_runtime_data", runtime_data_for_scope);
+    // Allocate and zero the variable storage
+    for (size_t i = 0; i < _x_variables.size(); ++i) {
+        const auto& variable = _x_variables[i];
+        var_runtime_data[i].row_size = num;
+        var_runtime_data[i].total_size = num * variable.total_dim;
+        auto* tensor = ScopeHelper::resize_lod_tensor(
+            scope, variable.name, {num, variable.total_dim});
+        auto* grad_tensor = ScopeHelper::resize_lod_tensor(
+            scope, variable.gradient_name, {num, variable.total_dim});
+        VLOG(5) << "fill scope variable:" << variable.name << ", " << variable.gradient_name;
+        var_runtime_data[i].variable_data = tensor->mutable_data<float>(_trainer_context->cpu_place);
+        var_runtime_data[i].gradient_data = grad_tensor->mutable_data<float>(_trainer_context->cpu_place);
+        memset((void*) var_runtime_data[i].variable_data, 0, var_runtime_data[i].total_size * sizeof(float)); 
+        memset((void*) var_runtime_data[i].gradient_data, 0, var_runtime_data[i].total_size * sizeof(float)); 
+    }
+    // Scatter the pulled parameters into the variables
+    for (size_t samp_idx = 0; samp_idx < num; ++samp_idx) {
+        auto& features = samples[samp_idx].features;
+        for (auto& feature_item : features) {
+            for (size_t i = 0; i < _x_variables.size(); ++i) {
+                auto& variable = _x_variables[i];
+                auto slot_idx = variable.slot_idx[feature_item.slot()]; 
+                if (slot_idx < 0) {
+                    // This variable does not consume the feature's slot
+                    continue;
+                }
+                // Row of the sample, then the slot's slice inside that row
+                float* item_data =  var_runtime_data[i].variable_data +  
+                    samp_idx * variable.total_dim + variable.slot_dim * slot_idx; 
+                fill_input(item_data, &(feature_item.weights[0]), *value_accessor, variable, samples[samp_idx]);
+            }
+        }
+    }
+    // Post-process each variable
+    for (size_t i = 0; i < _x_variables.size(); ++i) {
+        auto& variable = _x_variables[i];
+        post_process_input(var_runtime_data[i].variable_data, variable, samples, num);
+    }
+    return 0;
+}
+
+// Pushes the sparse gradients of the batch to the parameter server.
+//   samples: batch of `num` samples; each feature's `gradients` buffer is
+//            resized to the accessor's update dimension and accumulated into
+//   scope:   must hold "sparse_runtime_data" written by forward(); the
+//            bookkeeping vector it points to is deleted when this returns
+// Always returns 0.
+// NOTE(review): the future returned by push_sparse is not waited on, so push
+// failures are not reported to the caller.
+int32_t BaseSparseInputAccessor::backward(SampleInstance* samples,
+    size_t num, paddle::framework::Scope* scope) {
+    int64_t runtime_data_for_scope = *ScopeHelper::get_value<int64_t>(
+            scope, _trainer_context->cpu_place, "sparse_runtime_data");
+    auto* runtime_data_ptr = (std::vector<SparseVarRuntimeData>*)runtime_data_for_scope;
+    auto& var_runtime_data = *runtime_data_ptr;
+    // Release the per-batch bookkeeping on every exit path
+    DoneGuard guard([runtime_data_ptr](){
+        delete runtime_data_ptr;
+    });
+    if (!_need_gradient) {
+        return 0;
+    }
+    auto* ps_client = _trainer_context->pslib->ps_client();
+    auto* value_accessor = ps_client->table_accessor(_table_id);
+
+    size_t key_num = 0;
+    for (size_t i = 0; i < num; ++i) {
+        key_num += samples[i].features.size();
+    }
+    std::vector<uint64_t> keys(key_num);
+    std::vector<const float*> push_values(key_num);
+    auto push_value_dim = value_accessor->update_dim();
+
+    size_t key_idx = 0;
+    for (size_t samp_idx = 0; samp_idx < num; ++samp_idx) {
+        auto& features = samples[samp_idx].features;
+        for (auto& feature_item : features) {
+            feature_item.gradients.resize(push_value_dim, 0.0);
+            bool has_gradient = false;
+            for (size_t i = 0; i < _x_variables.size(); ++i) {
+                auto& variable = _x_variables[i];
+                auto slot_idx = variable.slot_idx[feature_item.slot()]; 
+                if (slot_idx < 0) {
+                    continue;
+                }
+                const float* grad_data = var_runtime_data[i].gradient_data +  
+                    samp_idx * variable.total_dim + variable.slot_dim * slot_idx; 
+                fill_gradient(&(feature_item.gradients[0]), grad_data, 
+                    *value_accessor, variable, samples[samp_idx]);
+                has_gradient = true;
+            }
+            // Register each feature at most once: keys/push_values only have
+            // room for one entry per feature, and the gradients buffer already
+            // accumulates the contribution of every matching variable
+            if (has_gradient) {
+                keys[key_idx] = feature_item.sign();
+                push_values[key_idx++] = &(feature_item.gradients[0]);
+            }
+        }
+    }
+    auto push_status = ps_client->push_sparse(_table_id, 
+        keys.data(), push_values.data(), key_idx);
+    return 0;
+} 
+
+// Sparse input accessor for the join stage: sums the pulled values per slot,
+// log-transforms the first two columns and never pushes gradients.
+class AbacusSparseJoinAccessor : public BaseSparseInputAccessor {
+public:
+    AbacusSparseJoinAccessor() {}
+    virtual ~AbacusSparseJoinAccessor() {}
+
+    // Accumulates the first slot_dim pulled values into the slot's slice
+    virtual void fill_input(float* var_data, const float* pull_raw,
+        paddle::ps::ValueAccessor& value_accessor, 
+        SparseInputVariable& variable, SampleInstance& sample) {
+        const size_t dim = variable.slot_dim;
+        for (size_t col = 0; col < dim; ++col) {
+            var_data[col] += pull_raw[col];
+        }
+    }
+
+    // Walks every (sample, slot) slice and rewrites its first two columns
+    virtual void post_process_input(float* var_data, 
+        SparseInputVariable& variable, SampleInstance* samples, size_t num) {
+        const size_t slice_count = num * variable.slot_list.size();
+        float* slice = var_data;
+        for (size_t n = 0; n < slice_count; ++n, slice += variable.slot_dim) {
+            slice[0] = log(slice[0] + 1);                // show
+            slice[1] = log(slice[1] + 1) - slice[0];     // ctr
+        }
+    }
+
+    // The join stage does not back-fill gradients, so reaching this is a bug
+    virtual void fill_gradient(float* push_value, const float* gradient_raw,
+        paddle::ps::ValueAccessor& value_accessor, 
+        SparseInputVariable& variable, SampleInstance& sample) {
+        CHECK(false);
+    }
+};
+REGIST_CLASS(DataInputAccessor, AbacusSparseJoinAccessor);
+
+// Sparse input accessor for the update stage: feeds the pulled values that
+// follow the first two columns into the network and pushes show/label counts
+// together with the gradients.
+class AbacusSparseUpdateAccessor : public BaseSparseInputAccessor {
+public:
+    AbacusSparseUpdateAccessor() {}
+    virtual ~AbacusSparseUpdateAccessor() {}
+
+    // Accumulates slot_dim pulled values, skipping the two leading columns
+    virtual void fill_input(float* var_data, const float* pull_raw,
+        paddle::ps::ValueAccessor& value_accessor, 
+        SparseInputVariable& variable, SampleInstance& sample) {
+        const float* payload = pull_raw + 2;
+        const size_t dim = variable.slot_dim;
+        for (size_t col = 0; col < dim; ++col) {
+            var_data[col] += payload[col];
+        }
+    }
+
+    // Nothing to post-process in the update stage
+    virtual void post_process_input(float* var_data, 
+        SparseInputVariable& variable, SampleInstance* samples, size_t num) {
+    }
+
+    // push_value layout: [0] += 1, [1] += first label, [2..] += gradients
+    virtual void fill_gradient(float* push_value, const float* gradient_raw,
+        paddle::ps::ValueAccessor& value_accessor, 
+        SparseInputVariable& variable, SampleInstance& sample) {
+        push_value[0] += 1;
+        push_value[1] += sample.labels[0];
+        float* grad_out = push_value + 2;
+        const size_t dim = variable.slot_dim;
+        for (size_t col = 0; col < dim; ++col) {
+            grad_out[col] += gradient_raw[col];
+        }
+    }
+};
+REGIST_CLASS(DataInputAccessor, AbacusSparseUpdateAccessor);
+
+}  // namespace feed
+}  // namespace custom_trainer
+}  // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/common/pipeline.h b/paddle/fluid/train/custom_trainer/feed/common/pipeline.h
index 39e669954d99b1aa7500fb3d7c2c83ce5eeda0c5..52ff8e4685cc6fcc41a2d016b0142c35a853abd0 100644
--- a/paddle/fluid/train/custom_trainer/feed/common/pipeline.h
+++ b/paddle/fluid/train/custom_trainer/feed/common/pipeline.h
@@ -6,12 +6,22 @@ namespace paddle {
 namespace custom_trainer {
 namespace feed {
 
+// Scope guard: runs the stored callback when the guard is destroyed.
+// The guard is copyable; every copy invokes the callback from its own
+// destructor, so keep a single instance per cleanup action.
+class DoneGuard {
+public:
+    DoneGuard(std::function<void()> func) : _func(func) {}
+    virtual ~DoneGuard() {
+        // Calling an empty std::function throws std::bad_function_call, which
+        // would terminate the process when raised from a destructor
+        if (_func) {
+            _func();
+        }
+    }
+private:
+    std::function<void()>  _func;
+};
+
 class PipelineOptions {
 public:
     PipelineOptions() = default;
-    uint32_t buffer_data_num       = 400  ;  //缓冲区数据个数，需大于batch_size
-    uint32_t batch_size            = 100  ;  //从pipe读数据的batch大小
-    bool need_hold_input_data      = false;  //是否保存input流数据，否则消费后释放
+    uint32_t batch_size        = 10;        // batch size of the pipe output
+    uint32_t thread_num        = 1;         // number of concurrent converter threads
+    float input_output_rate    = 1;         // input/output qps traffic ratio
+    uint32_t buffer_batch_count    = 4;     // number of batches the pipe buffers ahead
+    bool need_hold_input_data      = false; // whether to keep the input stream data; otherwise it is released after being consumed
 };
 
 /*
@@ -29,7 +39,8 @@ public:
     Pipeline() {}
     Pipeline(Pipeline&&) = delete; 
     Pipeline(const Pipeline&) = delete; 
-    typedef std::function<int(const TypeIn*, TypeOut*, size_t num)> PipeDataConverter; 
+    typedef std::function<int(TypeIn*, size_t in_num,
+        TypeOut*, size_t* out_num, size_t thread_idx)> PipeDataConverter; 
     
     int initialize(const PipelineOptions& options,
         ::paddle::framework::Channel<TypeIn> input_channel, 
@@ -42,18 +53,16 @@ public:
         _converter = data_converter;
         _input_channel = input_channel;
         _output_channel = ::paddle::framework::MakeChannel<TypeOut>();
-
-        auto batch_size = options.batch_size;
-        auto buffer_data_num = options.buffer_data_num;
-        _input_channel->SetBlockSize(batch_size);
-        _output_channel->SetBlockSize(batch_size);
-        _input_data_buffer.resize(buffer_data_num);
-        _output_data_buffer.resize(buffer_data_num);
-        if (buffer_data_num / batch_size < 3) {
-            buffer_data_num = batch_size * 3;
+        _output_channel->SetBlockSize(options.batch_size);
+        size_t input_batch_size = options.batch_size * options.input_output_rate;
+        _input_channel->SetBlockSize(input_batch_size);
+        _input_data_buffer.resize(input_batch_size * options.buffer_batch_count);
+        _output_data_buffer.resize(options.batch_size * options.buffer_batch_count);
+        _output_channel->SetCapacity(_output_data_buffer.size());
+        if (_options.need_hold_input_data) {
+            _input_channel_backup = ::paddle::framework::MakeChannel<TypeIn>();
+            _input_channel_backup->SetBlockSize(input_batch_size);
         }
-        buffer_data_num = (buffer_data_num / batch_size) * batch_size;
-        _output_channel->SetCapacity(buffer_data_num);
         CHECK(_input_channel != nullptr) << " Input Channel is null";
         _convert_thread = std::make_shared<std::thread>([this](){
             async_convert_data();
@@ -63,7 +72,9 @@ public:
 
     template <class PreTypeIn>
     int connect_to(Pipeline<PreTypeIn, TypeIn>& pre_pipeline, 
-        PipeDataConverter data_converter) {
+        PipelineOptions& options, PipeDataConverter data_converter) {
+        // 保证全局batch一致
+        options.batch_size = pre_pipeline.options().batch_size / options.input_output_rate;
         return initialize(pre_pipeline.options(), pre_pipeline.output_chnnel(), data_converter);
     }
     
@@ -87,30 +98,36 @@ public:
     inline ::paddle::framework::Channel<TypeOut> output_chnnel() {
         return _output_channel;
     }
+
+    // Returns the backup channel that receives the input data after it has been
+    // consumed from input_channel.
+    // NOTE(review): the channel is only created when need_hold_input_data is set;
+    // presumably it is null otherwise, so confirm before dereferencing.
+    inline ::paddle::framework::Channel<TypeIn> backup_channel() {
+        return _input_channel_backup;
+    }
 private:
+    // Worker loop: reads batches from the input channel, converts them and writes
+    // the result to the output channel until the input yields no more data, then
+    // closes the output channel and the backup channel if one exists.
     void async_convert_data() {
-        size_t convete_batch_size =  _input_data_buffer.size() / 4;
-        if (convete_batch_size < _options.batch_size * 3) {
-            convete_batch_size = 3 * _options.batch_size;
-        }
-        convete_batch_size = (convete_batch_size / _options.batch_size) * _options.batch_size;
+        // Number of input items requested per read
+        size_t input_batch_size = _options.batch_size * _options.input_output_rate;
         while (!_is_read_end) {
+            // Keep converting while the output channel holds fewer items than the input buffer size
             while (_output_channel->Size() < _input_data_buffer.size()) {
                 size_t read_size = _input_channel->
-                    Read(convete_batch_size, &_input_data_buffer[0]);
+                    Read(input_batch_size, &_input_data_buffer[0]);
                 if (read_size == 0) {
+                    // A zero-sized read is treated as end of input
                     _is_read_end = true;
                     break;
                 }
-                CHECK(_converter(&_input_data_buffer[0], &_output_data_buffer[0], 
-                    read_size) == 0) << "Data Converter Do Failed";
-                _output_channel->WriteMove(read_size, &_output_data_buffer[0]);
-                if (_options.need_hold_input_data) {
+                // The converter reports how many output items it produced; thread index is fixed to 0
+                size_t write_size = 0;
+                CHECK(_converter(&_input_data_buffer[0], read_size,
+                    &_output_data_buffer[0], &write_size, 0) == 0) << "Data Converter Do Failed";
+                _output_channel->WriteMove(write_size, &_output_data_buffer[0]);
+                if (_input_channel_backup) {
                     _input_channel_backup->WriteMove(read_size, &_input_data_buffer[0]);
                 }
             }  
             sleep(1);
         }
+        _output_channel->Close();
+        if (_input_channel_backup) {
+            _input_channel_backup->Close();
+        }
     }    
    
     
diff --git a/paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.cc b/paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fd7bd2d659b0ad0e87376517ee4ba16122630359
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.cc
@@ -0,0 +1,81 @@
+#include <fcntl.h>
+#include <fstream>
+#include <sstream>
+#include "json2pb/json_to_pb.h"
+#include <google/protobuf/text_format.h>
+#include <google/protobuf/io/zero_copy_stream_impl.h>
+#include "paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.h"
+#include "paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h"
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+
+// Loads the text-format PSParameter config at `conf_path`, then starts the
+// server (only on nodes with the PSERVER role) and the client.
+//   conf_path:   path of the text-format protobuf config
+//   environment: runtime environment used for role and rank queries; stored,
+//                not owned
+// Returns 0 on success, -1 when the file cannot be opened or parsed or when
+// server/client start-up reports an error.
+int PSlib::initialize(const std::string& conf_path, 
+    RuntimeEnvironment* environment) {
+    _environment = environment;
+    init_gflag();    
+    int file_descriptor = open(conf_path.c_str(), O_RDONLY);
+    if (file_descriptor == -1){
+        LOG(ERROR) << "FATAL: cant open " << conf_path;
+        return -1;
+    }
+    bool parse_ok = false;
+    {
+        // Scoped so the stream is destroyed before the descriptor is closed
+        google::protobuf::io::FileInputStream fileInput(file_descriptor);
+        parse_ok = google::protobuf::TextFormat::Parse(&fileInput, &_ps_param);
+    }
+    // Close on the failure path too; returning before close leaked the descriptor
+    close(file_descriptor); 
+    if (!parse_ok) {
+        LOG(ERROR) << "FATAL: fail to parse " << conf_path;
+        return -1;
+    }
+    if (init_server() != 0 || init_client() != 0) {
+        return -1;
+    }
+    return 0;
+}
+        
+// Creates, configures and starts the server when this node has the PSERVER
+// role, then gathers the server list through the PS environment. Returns 0.
+int PSlib::init_server() {
+    const bool runs_server = _environment->is_role(EnvironmentRole::PSERVER);
+    if (runs_server) {
+        _server_ptr.reset(paddle::ps::PSServerFactory::create(_ps_param));
+        auto& ps_env = *(_environment->ps_environment());
+        _server_ptr->configure(_ps_param, ps_env,
+            _environment->rank_id(EnvironmentRole::PSERVER));
+        _server_ptr->start();
+    }
+    // Runs on every node, whether or not it hosts a server
+    auto* env = _environment->ps_environment();
+    env->gather_ps_servers();
+    return 0;
+}
+
+// Creates the client and configures it with this node's rank among all nodes.
+// Returns 0.
+int PSlib::init_client() {
+    _client_ptr.reset(paddle::ps::PSClientFactory::create(_ps_param));
+    auto& ps_env = *(_environment->ps_environment());
+    _client_ptr->configure(_ps_param, ps_env,
+        _environment->rank_id(EnvironmentRole::ALL));
+    return 0;
+}
+
+// Returns the server instance, still owned by this object. The pointer is
+// null unless init_server() created a server, which it does only for nodes
+// with the PSERVER role.
+paddle::ps::PSServer* PSlib::ps_server() {
+    return _server_ptr.get();
+}
+
+// Returns the client instance, still owned by this object; null until
+// init_client() has run.
+paddle::ps::PSClient* PSlib::ps_client() {
+    return _client_ptr.get();
+}
+
+// Returns the parsed parameter config held by this object.
+paddle::PSParameter* PSlib::get_param() {
+    return &_ps_param;
+}
+
+// Feeds a fixed set of flags to gflags as if they had been given on the
+// command line.
+void PSlib::init_gflag() {
+    char p0[] = "exe default";
+    char p1[] = "-max_body_size=314217728";
+    char p2[] = "-bthread_concurrency=40";
+    char p3[] = "-socket_max_unwritten_bytes=2048000000";
+    // A plain array replaces the former shared_ptr<char*>, which owned a
+    // new[] allocation but released it with scalar delete
+    char* params[] = {p0, p1, p2, p3};
+    int cnt = sizeof(params) / sizeof(params[0]);
+    char** params_ptr = params;
+    ::google::ParseCommandLineFlags(&cnt, &params_ptr, true);
+}
+
+}  // namespace feed
+}  // namespace custom_trainer
+}  // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.h b/paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.h
new file mode 100644
index 0000000000000000000000000000000000000000..d35aa9a70173163e5a55461966922b0efb9a57e6
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// Hide BLOG
+#define BUTIL_LOGGING_H_
+#define COMPACT_GOOGLE_LOG_NOTICE COMPACT_GOOGLE_LOG_INFO
+#include "communicate/ps_server.h"
+#include "communicate/ps_client.h"
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+    
+class RuntimeEnvironment;
+// Owns the pslib server/client pair and the parsed parameter config.
+class PSlib {
+public:
+    PSlib() {}
+    virtual ~PSlib() {}
+    // Parses the config at conf_path and starts the server and client;
+    // returns 0 on success, -1 on failure
+    int initialize(const std::string& conf_path, 
+        RuntimeEnvironment* environment);
+        
+    // Accessors return pointers that remain owned by this object
+    virtual paddle::ps::PSServer* ps_server();
+    virtual paddle::ps::PSClient* ps_client();
+    virtual paddle::PSParameter* get_param();
+private:
+    void init_gflag();
+    virtual int init_server();
+    virtual int init_client();
+
+    paddle::PSParameter _ps_param;
+    // Not owned; set by initialize(). Initialised so that it is never an
+    // indeterminate pointer before initialize() runs
+    RuntimeEnvironment* _environment = nullptr;
+    std::shared_ptr<paddle::ps::PSServer> _server_ptr;
+    std::shared_ptr<paddle::ps::PSClient> _client_ptr;  
+};
+
+}  // namespace feed
+}  // namespace custom_trainer
+}  // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/common/registerer.cc b/paddle/fluid/train/custom_trainer/feed/common/registerer.cc
index 04382b47eecf8437828e845137a0bf23d485c638..c2dff1517dfbe88634e16f2bd1068b0688d6113d 100644
--- a/paddle/fluid/train/custom_trainer/feed/common/registerer.cc
+++ b/paddle/fluid/train/custom_trainer/feed/common/registerer.cc
@@ -3,12 +3,12 @@ namespace paddle {
 namespace custom_trainer {
 namespace feed {
 
-BaseClassMap& global_factory_map() {
+// Returns the process-wide registry map. It is allocated on first use and
+// never deleted.
+BaseClassMap& global_reg_factory_map() {
     static BaseClassMap *base_class = new BaseClassMap();
     return *base_class;
 }
-BaseClassMap& global_factory_map_cpp() {
-    return global_factory_map();
+// Forwards to global_reg_factory_map() and returns the same map.
+BaseClassMap& global_reg_factory_map_cpp() {
+    return global_reg_factory_map();
 }
 
 }// feed
diff --git a/paddle/fluid/train/custom_trainer/feed/common/registerer.h b/paddle/fluid/train/custom_trainer/feed/common/registerer.h
index eb57cabea97398b620e94c03fc38975947fd60dd..b5399fdc9df1dafa87fea896a91f3855ff5605af 100644
--- a/paddle/fluid/train/custom_trainer/feed/common/registerer.h
+++ b/paddle/fluid/train/custom_trainer/feed/common/registerer.h
@@ -63,23 +63,23 @@ typedef std::map<std::string, FactoryMap> BaseClassMap;
 #ifdef __cplusplus
 extern "C" {
 #endif
-BaseClassMap& global_factory_map();
+BaseClassMap& global_reg_factory_map();
 #ifdef __cplusplus
 }
 #endif
 
-BaseClassMap& global_factory_map_cpp();
+BaseClassMap& global_reg_factory_map_cpp();
 
-#define REGISTER_REGISTERER(base_class) \
+#define REGIST_REGISTERER(base_class) \
     class base_class ## Registerer { \
         public: \
             static base_class *CreateInstanceByName(const ::std::string &name) { \
-                if (global_factory_map_cpp().find(#base_class) \
-                        == global_factory_map_cpp().end()) { \
+                if (global_reg_factory_map_cpp().find(#base_class) \
+                        == global_reg_factory_map_cpp().end()) { \
                     LOG(ERROR) << "Can't Find BaseClass For CreateClass with:" << #base_class; \
                     return NULL; \
                 } \
-                FactoryMap &map = global_factory_map_cpp()[#base_class]; \
+                FactoryMap &map = global_reg_factory_map_cpp()[#base_class]; \
                 FactoryMap::iterator iter = map.find(name); \
                 if (iter == map.end()) { \
                     LOG(ERROR) << "Can't Find Class For Create with:" << name; \
@@ -90,7 +90,7 @@ BaseClassMap& global_factory_map_cpp();
             } \
     };
 
-#define REGISTER_CLASS(clazz, name) \
+#define REGIST_CLASS(clazz, name) \
     class ObjectFactory##name : public ObjectFactory { \
         public: \
             Any NewInstance() { \
@@ -98,14 +98,14 @@ BaseClassMap& global_factory_map_cpp();
             } \
     }; \
     void register_factory_##name() { \
-        FactoryMap &map = global_factory_map_cpp()[#clazz]; \
+        FactoryMap &map = global_reg_factory_map_cpp()[#clazz]; \
             if (map.find(#name) == map.end()) { \
                 map[#name] = new ObjectFactory##name(); \
             } \
     } \
     void register_factory_##name() __attribute__((constructor)); 
 
-#define CREATE_CLASS(base_class, name) \
+#define CREATE_INSTANCE(base_class, name) \
     base_class##Registerer::CreateInstanceByName(name)
     
 }//namespace feed
diff --git a/paddle/fluid/train/custom_trainer/feed/common/runtime_environment.cc b/paddle/fluid/train/custom_trainer/feed/common/runtime_environment.cc
index fd5aae3db1466952967b44230577e10f245ecf5e..ee9b0c73f8a2f17622ed369129fcc328ec7f6f2e 100644
--- a/paddle/fluid/train/custom_trainer/feed/common/runtime_environment.cc
+++ b/paddle/fluid/train/custom_trainer/feed/common/runtime_environment.cc
@@ -93,9 +93,14 @@ public:
             return -1;
         }
         _roles_node_info.resize(static_cast<int>(EnvironmentRole::ALL) + 1);
-        set_role(EnvironmentRole::ALL);
+        add_role(EnvironmentRole::ALL);
         return 0;
     }
+    
+    // Returns the MPI-backed PS environment. The object is a function-local
+    // static, so every call and every instance gets the same pointer.
+    virtual paddle::ps::PSEnvironment* ps_environment() {
+        static paddle::ps::MpiPSEnvironment ps_environment;
+        return &ps_environment;
+    }
 
     virtual uint32_t rank_id(EnvironmentRole role) {
         return mpi_node_info(role).rank_id;
@@ -103,7 +108,7 @@ public:
     virtual uint32_t node_num(EnvironmentRole role) {
         return mpi_node_info(role).node_num;
     }
-    virtual int set_role(EnvironmentRole role) {
+    virtual int add_role(EnvironmentRole role) {
         auto& node_info = mpi_node_info(role);
         if (node_info.rank_id < 0) {
             if (role == EnvironmentRole::ALL) {
@@ -115,8 +120,12 @@ public:
             MPI_Comm_rank(node_info.mpi_comm, &(node_info.rank_id));
             MPI_Comm_size(node_info.mpi_comm, &(node_info.node_num));
         }
+        _role_set.insert(role);
         return 0;
     }
+    // True when `role` has been registered through add_role()
+    virtual bool is_role(EnvironmentRole role) {
+        return _role_set.find(role) != _role_set.end();
+    }
 
     virtual void barrier(EnvironmentRole role) {
         MPI_Barrier(mpi_node_info(role).mpi_comm);
@@ -154,9 +163,10 @@ protected:
     }
 
 private:
+    std::set<EnvironmentRole> _role_set;
     std::vector<MpiNodeInfo> _roles_node_info;
 };
-REGISTER_CLASS(RuntimeEnvironment, MPIRuntimeEnvironment);
+REGIST_CLASS(RuntimeEnvironment, MPIRuntimeEnvironment);
 
 //用于本地模式单机训练
 class LocalRuntimeEnvironment : public RuntimeEnvironment {
@@ -169,15 +179,22 @@ public:
     virtual int wireup() {
         return 0;
     }
+    // Returns the local PS environment. The object is a function-local static,
+    // so every call and every instance gets the same pointer.
+    virtual paddle::ps::PSEnvironment* ps_environment() {
+        static paddle::ps::LocalPSEnvironment ps_environment;
+        return &ps_environment;
+    }
     virtual uint32_t rank_id(EnvironmentRole role) {
         return 0;
     }
     virtual uint32_t node_num(EnvironmentRole role) {
         return 1;
     }
-    virtual int set_role(EnvironmentRole role) {
+    // Local mode records nothing for a role and always reports success
+    virtual int add_role(EnvironmentRole role) {
         return 0;
     }
+    // Local mode answers true for every role
+    virtual bool is_role(EnvironmentRole role) {
+        return true;
+    }
     virtual void barrier(EnvironmentRole role) {
         return;
     }
@@ -196,7 +213,7 @@ protected:
         VLOG(static_cast<int>(level)) << log_str;
     }
 };
-REGISTER_CLASS(RuntimeEnvironment, LocalRuntimeEnvironment);
+REGIST_CLASS(RuntimeEnvironment, LocalRuntimeEnvironment);
 
 }  // namespace feed
 }  // namespace custom_trainer
diff --git a/paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h b/paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h
index 1107dbefd083a730873f82be06fd8d8276b41bcb..aaf602bc4f0d50d9c4b10a904280dcb50f093c81 100644
--- a/paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h
+++ b/paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h
@@ -6,6 +6,7 @@
  */
 #pragma once
 #include <yaml-cpp/yaml.h>
+#include "communicate/ps_env.h"
 #include "paddle/fluid/framework/archive.h"
 #include "paddle/fluid/string/string_helper.h"
 #include "paddle/fluid/train/custom_trainer/feed/common/registerer.h"
@@ -38,45 +39,49 @@ class RuntimeEnvironment {
 public:
     RuntimeEnvironment();
     virtual ~RuntimeEnvironment();
-    //配置初始化
+    // 配置初始化
     virtual int initialize(YAML::Node config) = 0;
-    //设置role
-    virtual int set_role(EnvironmentRole role) = 0;
-    //环境初始化，会在所有依赖模块initialize后调用
+    // 设置role
+    virtual int add_role(EnvironmentRole role) = 0;
+    // 判断role
+    virtual bool is_role(EnvironmentRole role) = 0;
+    // 环境初始化，会在所有依赖模块initialize后调用
     virtual int wireup() = 0;
     
-    //多线程可调用接口  Start
-    //当前环境rank_idx
+    // 多线程可调用接口  Start
+    // 当前环境rank_idx
     virtual uint32_t rank_id(EnvironmentRole role) = 0;
-    //运行环境节点数
+    // 运行环境节点数
     virtual uint32_t node_num(EnvironmentRole role) = 0;
-    //环境内主节点
+    // 环境内主节点
     virtual bool is_master_node(EnvironmentRole role);
+    //For PS
+    virtual paddle::ps::PSEnvironment* ps_environment() = 0;
     
-    //环境定制化log
+    // 环境定制化log
     template<class... ARGS>
     void log(EnvironmentRole role, EnvironmentLogType type, 
         EnvironmentLogLevel level, const char* fmt, ARGS && ... args) {
         print_log(role, type, level, paddle::string::format_string(fmt, args...));
     }
-    //多线程可调用接口      End
+    // 多线程可调用接口      End
 
 
-    //接口只允许在主线程调用   Start
-    //barrier 指定role的节点
+    // 接口只允许在主线程调用   Start
+    // barrier 指定role的节点
     virtual void barrier(EnvironmentRole role) = 0;
-    //bcast 广播
+    // bcast 广播
     virtual void bcast(paddle::framework::BinaryArchive& ar, int root_id, EnvironmentRole role) = 0;
-    //all_reduce sum element 规约元素
+    // all_reduce sum element 规约元素
     virtual double all_reduce_ele(double x) = 0;
-    //all_reduce sum array 规约数组
+    // all_reduce sum array 规约数组
     virtual void all_reduce_arr(double* x, int n) = 0;
-    //接口只允许在主线程调用   End
+    // 接口只允许在主线程调用   End
 protected:
     virtual void print_log(EnvironmentRole role, EnvironmentLogType type, 
         EnvironmentLogLevel level,  const std::string& log_str) = 0;
 };
-REGISTER_REGISTERER(RuntimeEnvironment);
+REGIST_REGISTERER(RuntimeEnvironment);
 
 std::string format_timestamp(time_t time, const char* format);
 inline std::string format_timestamp(time_t time, const std::string& format) {
diff --git a/paddle/fluid/train/custom_trainer/feed/common/scope_helper.h b/paddle/fluid/train/custom_trainer/feed/common/scope_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..307a835ac91ff10bfc85e57e039a3b00e326a113
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/common/scope_helper.h
@@ -0,0 +1,55 @@
+#pragma once
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+    
+// Static helpers for reading and writing variables of a framework Scope.
+// Every helper goes through Scope::Var(name).
+class ScopeHelper {
+public:
+    // Direct read-only access to the variable `name` as type T
+    template <class T>
+    static const T& var(paddle::framework::Scope* scope, const std::string& name) {
+        auto* variable = scope->Var(name);
+        return variable->Get<T>();
+    }
+
+    // Mutable access to the variable `name` as type T
+    template <class T>
+    static T* mutable_var(paddle::framework::Scope* scope, const std::string& name) {
+        auto* variable = scope->Var(name);
+        return variable->GetMutable<T>();
+    }
+
+    // Resizes the variable `name` (held as T) to `dim` and returns it
+    template <class T>
+    static T* resize_variable(paddle::framework::Scope* scope,
+        const std::string& name, const paddle::framework::DDim& dim) {
+        T* target = scope->Var(name)->GetMutable<T>();
+        target->Resize(dim);
+        return target;
+    }
+
+    // resize_variable specialised for LoDTensor
+    static paddle::framework::LoDTensor* resize_lod_tensor(
+        paddle::framework::Scope* scope,
+        const std::string& name, const paddle::framework::DDim& dim) {
+        return resize_variable<paddle::framework::LoDTensor>(scope, name, dim);
+    }
+
+    // Stores `value` as the only element of a one-element tensor named `name`
+    template <class T>
+    static void fill_value(paddle::framework::Scope* scope,
+        paddle::platform::Place place, const std::string& name, T& value) {
+        auto* holder = resize_variable<paddle::framework::Tensor>(scope, name, { 1 });
+        *(holder->mutable_data<T>(place)) = value;
+    }
+
+    // Returns a pointer to the data of the tensor named `name`, typed as T
+    template <class T>
+    static T* get_value(paddle::framework::Scope* scope,
+        paddle::platform::Place place, const std::string& name) {
+        auto* holder = scope->Var(name)->GetMutable<paddle::framework::Tensor>();
+        T* data = holder->mutable_data<T>(place);
+        return data;
+    }
+
+};
+
+}  // namespace feed
+}  // namespace custom_trainer
+}  // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/common/yaml_helper.h b/paddle/fluid/train/custom_trainer/feed/common/yaml_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..71c38a29746b15ae0ab4c11d0234271a94ec8f61
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/common/yaml_helper.h
@@ -0,0 +1,32 @@
+#pragma once
+#include <glog/logging.h>
+#include <yaml-cpp/yaml.h>
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+    
+/// Static helpers for querying YAML::Node mappings without modifying them.
+class YamlHelper {
+public:
+    /// Returns true when map `node` contains a scalar key equal to `key`.
+    ///
+    /// Testing with node["key"] directly would add the key to the node, so the
+    /// entries are scanned instead. Aborts via CHECK when `node` is not a map.
+    static bool has_key(const YAML::Node& node, const std::string& key) {
+        CHECK(node.Type() == YAML::NodeType::Map);
+        for (const auto& itr : node) {
+            // A non-scalar key (null, sequence, map) can never equal a string
+            // key; skip it instead of letting as<std::string>() throw.
+            if (itr.first.IsScalar() && itr.first.Scalar() == key) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /// Returns node[key] converted to T, or `default_v` when `key` is absent.
+    /// The node is taken by const reference, so the lookup cannot insert `key`.
+    template <class T>
+    static T get_with_default(const YAML::Node& node, const std::string& key, const T& default_v) {
+        if (has_key(node, key)) {
+            return node[key].as<T>();
+        }
+        return default_v;
+    }
+};
+
+}  // namespace feed
+}  // namespace custom_trainer
+}  // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/conf/gflags.conf b/paddle/fluid/train/custom_trainer/feed/conf/gflags.conf
index 3a3c0b05a8f1849c4fda9d9a9d94dea89b73bf1e..c7b8a66344993b10337eec12ad77a6f2b4560c7b 100644
--- a/paddle/fluid/train/custom_trainer/feed/conf/gflags.conf
+++ b/paddle/fluid/train/custom_trainer/feed/conf/gflags.conf
@@ -1,2 +1,5 @@
 -log_dir=log
--v=10
+-v=4
+-logbufsecs=0
+-pslib_push_dense_merge_limit=1
+-pslib_push_sparse_merge_limit=1
diff --git a/paddle/fluid/train/custom_trainer/feed/conf/ps_table_config b/paddle/fluid/train/custom_trainer/feed/conf/ps_table_config
new file mode 100644
index 0000000000000000000000000000000000000000..d21b1e1127c2972d548a67fc429cd37745810d95
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/conf/ps_table_config
@@ -0,0 +1,120 @@
+server_param {
+  downpour_server_param {
+    downpour_table_param {
+      table_id: 0
+      table_class: "DownpourSparseTable"
+      shard_num: 1950
+      accessor {
+        accessor_class: "DownpourCtrAccessor"
+        sparse_sgd_param {
+          learning_rate: 0.05
+          initial_g2sum: 3.0
+          initial_range: 0.0001
+          weight_bounds: -10.0
+          weight_bounds: 10.0
+        }
+        fea_dim: 11
+        embedx_dim: 8
+        embedx_threshold: 10
+        downpour_accessor_param {
+          nonclk_coeff: 0.1
+          click_coeff: 1
+          base_threshold: 1.5
+          delta_threshold: 0.25
+          delta_keep_days: 16
+          delete_after_unseen_days: 30
+          show_click_decay_rate: 0.98
+          delete_threshold: 0.8
+        }
+        table_accessor_save_param {
+          param: 1
+          converter: "(tool/xbox_compressor_mf.py | tool/xbox_pb_converter)"
+          deconverter:  "(tool/xbox_pb_deconverter | tool/xbox_decompressor_mf.awk)"
+        }   
+        table_accessor_save_param {
+          param: 2
+          converter: "(tool/xbox_compressor_mf.py | tool/xbox_pb_converter)"
+          deconverter:  "(tool/xbox_pb_deconverter | tool/xbox_decompressor_mf.awk)"
+        }
+      }
+      type: PS_SPARSE_TABLE
+      compress_in_save: true
+    }
+    downpour_table_param {
+      table_id: 1
+      table_class: "DownpourDenseTable"
+      accessor {
+        accessor_class: "DownpourDenseValueAccessor"
+        dense_sgd_param {
+          name: "adam"
+          adam {
+            learning_rate: 5e-06
+            avg_decay_rate: 0.999993
+            ada_decay_rate: 0.9999
+            ada_epsilon: 1e-08
+            mom_decay_rate: 0.99
+          }
+          naive {
+            learning_rate: 0.0002
+          }
+        }
+        fea_dim: 2571127
+      }
+      type: PS_DENSE_TABLE
+      compress_in_save: true
+    }
+    downpour_table_param {
+      table_id: 2
+      table_class: "DownpourDenseDoubleTable"
+      accessor {
+        accessor_class: "DownpourDenseValueDoubleAccessor"
+        dense_sgd_param {
+          name: "summarydouble"
+          summary {
+            summary_decay_rate: 0.999999
+          }
+        }
+        fea_dim: 13464
+      }
+      type: PS_DENSE_TABLE
+      compress_in_save: true
+    }
+    downpour_table_param {
+      table_id: 3
+      table_class: "DownpourDenseTable"
+      accessor {
+        accessor_class: "DownpourDenseValueAccessor"
+        dense_sgd_param {
+          name: "adam"
+          adam {
+            learning_rate: 5e-06
+            avg_decay_rate: 0.999993
+            ada_decay_rate: 0.9999
+            ada_epsilon: 1e-08
+            mom_decay_rate: 0.99
+          }
+          naive {
+            learning_rate: 0.0002
+          }
+        }
+        fea_dim: 2072615
+      }
+      type: PS_DENSE_TABLE
+      compress_in_save: true
+    }
+    service_param {
+      server_class: "DownpourBrpcPsServer"
+      client_class: "DownpourBrpcPsClient"
+      service_class: "DownpourPsService"
+      start_server_port: 0
+      server_thread_num: 12
+    }
+  }
+}
+
+fs_client_param {
+  uri: "afs://xingtian.afs.baidu.com:9902"
+  user: "mlarch"
+  passwd: "Fv1M87"  # NOTE(review): plaintext credential committed to the repository; consider supplying it at deploy time instead
+  hadoop_bin: "$HADOOP_HOME/bin/hadoop"
+}
diff --git a/paddle/fluid/train/custom_trainer/feed/conf/trainer.yaml b/paddle/fluid/train/custom_trainer/feed/conf/trainer.yaml
index 71817d4acc0d5d990df38e0a69813f392e76dfbf..c71205b2468b16226143f660fa806ecd959f9080 100644
--- a/paddle/fluid/train/custom_trainer/feed/conf/trainer.yaml
+++ b/paddle/fluid/train/custom_trainer/feed/conf/trainer.yaml
@@ -1,34 +1,56 @@
-train_thread_num : 10
+train_thread_num: 10
 
-environment :
-    environment_class : LocalRuntimeEnvironment
+environment:
+    environment_class: LocalRuntimeEnvironment
+    ps: ./conf/ps_table_config
+        
+    
 
-io :
-    file_systems :
-        afs :
-            class : HadoopFileSystem
-            buffer_size : 1024000
-            ugis : 
+io:
+    file_systems:
+        afs:
+            class: HadoopFileSystem
+            buffer_size: 1024000
+            ugis: 
                 'default': 'feed_video,D3a0z8'
                 'xingtian.afs.baidu.com:9902': 'feed_video,D3a0z8'
-        default :
-            class : LocalFileSystem
-            buffer_size : 1024000
-dataset :
-    data_list :
-        train_sample :
-            prefetch_num : 2
+        default:
+            class: LocalFileSystem
+            buffer_size: 1024000
+dataset:
+    data_list:
+        train_sample:
+            prefetch_num: 2
             root_path : [./sample]
-            data_spit_interval : 300
-            data_path_formater : '%Y%m%d/%H%M' 
-            data_reader : LineDataReader 
-            done_file : to.hadoop.done 
-            filename_prefix : part
-            pipeline_cmd : cat
-            parser :
-                class : LineDataParser 
+            data_spit_interval: 300
+            data_path_formater: '%Y%m%d/%H%M' 
+            data_reader: LineDataReader 
+            done_file: to.hadoop.done 
+            filename_prefix: part
+            pipeline_cmd: './tool/ins_weight.py | awk -f ./tool/format_newcate_hotnews.awk'
+            parser:
+                class: AbacusTextDataParser 
     
 epoch:
-    epoch_class : HourlyEpochAccessor 
-    model_root_path : ./model/
-    
+    epoch_class: TimelyEpochAccessor 
+    model_root_path: ./model/
+    train_time_interval: 600
+    time_zone_seconds: 28800
+
+executor:
+- name: join
+  class: SimpleExecutor
+  train_data_name: train_sample
+  train_batch_size: 32
+  input_parse_thread_num: 10
+  push_gradient_thread_num: 16
+  train_thread_num: 16
+  need_dump_all_model: true
+- name: update
+  class: SimpleExecutor
+  train_data_name: train_sample
+  train_batch_size: 32
+  input_parse_thread_num: 10
+  push_gradient_thread_num: 16
+  train_thread_num: 16
+  need_dump_all_model: false 
diff --git a/paddle/fluid/train/custom_trainer/feed/dataset/abacus_data_reader.cc b/paddle/fluid/train/custom_trainer/feed/dataset/abacus_data_reader.cc
new file mode 100755
index 0000000000000000000000000000000000000000..55ce639888ec5d4442a30ba1f704fd7a5d422b18
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/dataset/abacus_data_reader.cc
@@ -0,0 +1,76 @@
+#include "paddle/fluid/train/custom_trainer/feed/dataset/data_reader.h"
+
+#include <cstdio>
+#include <atomic>
+
+#include <glog/logging.h>
+#include <omp.h>
+
+#include "paddle/fluid/train/custom_trainer/feed/io/file_system.h"
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+
+/**
+ * Parser for plain-text Abacus-format feasign samples.
+ *
+ * Line splitting (id vs. payload) is inherited from LineDataParser; this class
+ * only converts the payload of a DataItem into a SampleInstance.
+ */
+class AbacusTextDataParser : public LineDataParser {
+public:
+    AbacusTextDataParser() {}
+    virtual ~AbacusTextDataParser() {}
+
+    /**
+     * Fills `instance` from `data`.
+     *
+     * The payload starts with two integers, show and click; click becomes
+     * labels[0]. Every following "<sign>:<slot>" token is appended to
+     * instance.features.
+     *
+     * @return always 0. A feature token whose sign is not followed by
+     *         ":<slot>" aborts the process through CHECK.
+     */
+    virtual int parse_to_sample(const DataItem& data, SampleInstance& instance) const override {
+        instance.id = data.id;
+        instance.labels.resize(1);
+        const char* str = data.data.c_str();
+
+        char* cursor = NULL;
+        // The leading "show" count is not stored; parse it only to step over it.
+        (void)strtol(str, &cursor, 10);
+        str = cursor;
+        instance.labels[0] = (float)strtol(str, &cursor, 10);// click
+        str = cursor;
+
+        // NOTE(review): the loop header advances with count_nonspaces(), which
+        // presumably stops at the first whitespace or NUL. If so, *str is never
+        // '*', '$', '#' or '@' inside the body, those four branches are dead,
+        // and marker tokens are skipped through the FIXME path below instead.
+        // Confirm whether a whitespace skip was intended before changing this:
+        // it would make the '$' CHECK and the '#' break live on existing data.
+        while (*(str += paddle::string::count_nonspaces(str)) != 0) {
+            if (*str == '*') {
+                // '*' token: step over the marker and the rest of the token.
+                str++;
+                str += paddle::string::count_nonspaces(str);
+            } else if (*str == '$') {
+                // '$' token: must be followed by an integer sample type.
+                str++;
+                CHECK(((int)strtol(str, &cursor, 10), cursor != str))<<" sample type parse err:" << str;
+                str = cursor;
+            } else if (*str == '#') {
+                // '#' ends feature parsing for this line.
+                str++;
+                break;
+            } else if (*str == '@') {
+                // '@' token: step over the marker and the rest of the token.
+                str++;
+                str += paddle::string::count_nonspaces(str);
+            } else {
+                FeatureItem feature_item;
+                feature_item.sign() = (uint64_t)strtoull(str, &cursor, 10);
+                if (cursor == str) { // FIXME: Abacus data does not have this case
+                    // No digits here: advance one character and retry.
+                    str++;
+                    continue;
+                }
+                str = cursor;
+                CHECK(*str++ == ':');
+                // isspace() is undefined for negative char values, hence the cast.
+                CHECK(!isspace(static_cast<unsigned char>(*str)));
+                CHECK((feature_item.slot() = (int) strtol(str, &cursor, 10), cursor != str)) << " format error: " << str;
+                str = cursor;
+                instance.features.emplace_back(feature_item);
+            }
+        }
+        VLOG(5) << "parse sample success, id:" << instance.id << ", fea_sum:" 
+            << instance.features.size() << ", label:" << instance.labels[0];
+        return 0;
+    }
+};
+REGIST_CLASS(DataParser, AbacusTextDataParser);
+
+}  // namespace feed
+}  // namespace custom_trainer
+}  // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc b/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc
index a84e9467a25c92e119f12f0b52bcfe1f73aa8917..3898583d82ccc3f08787d5eaddd260c3e83a2aa3 100755
--- a/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc
+++ b/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc
@@ -12,51 +12,21 @@ namespace paddle {
 namespace custom_trainer {
 namespace feed {
 
-class LineDataParser : public DataParser {
-public:
-    LineDataParser() {}
-
-    virtual ~LineDataParser() {}
-
-    virtual int initialize(const YAML::Node& config, std::shared_ptr<TrainerContext> context) {
-        return 0;
-    }
-
-    virtual int parse(const char* str, size_t len, DataItem& data) const {
-        size_t pos = 0;
-        while (pos < len && str[pos] != ' ') {
-            ++pos;
-        }
-        if (pos >= len) {
-            VLOG(2) << "fail to parse line: " << std::string(str, len) << ", strlen: " << len;
-            return -1;
-        }
-        VLOG(5) << "getline: " << str << " , pos: " << pos << ", len: " << len;
-        data.id.assign(str, pos);
-        data.data.assign(str + pos + 1, len - pos - 1);
-        return 0;
-    }
-
-    virtual int parse(const char* str, DataItem& data) const {
-        size_t pos = 0;
-        while (str[pos] != '\0' && str[pos] != ' ') {
-            ++pos;
-        }
-        if (str[pos] == '\0') {
-            VLOG(2) << "fail to parse line: " << str << ", get '\\0' at pos: " << pos;
-            return -1;
-        }
-        VLOG(5) << "getline: " << str << " , pos: " << pos;
-        data.id.assign(str, pos);
-        data.data.assign(str + pos + 1);
-        return 0;
+// Splits one raw line of `len` bytes at its first space: the text before the
+// space becomes data.id and everything after it becomes data.data.
+// Returns 0 on success, -1 when the line contains no space.
+int LineDataParser::parse(const char* str, size_t len, DataItem& data) const {
+    size_t pos = 0;
+    while (pos < len && str[pos] != ' ') {
+        ++pos;
     }
-
-    virtual int parse_to_sample(const DataItem& data, SampleInstance& instance) const {
-        return 0;
+    if (pos >= len) {
+        VLOG(2) << "fail to parse line: " << std::string(str, len) << ", strlen: " << len;
+        return -1;
     }
-};
-REGISTER_CLASS(DataParser, LineDataParser);
+    // Bound the logged text by len; streaming the raw pointer would keep
+    // reading until it meets a NUL byte.
+    VLOG(5) << "getline: " << std::string(str, len) << " , pos: " << pos << ", len: " << len;
+    data.id.assign(str, pos);
+    data.data.assign(str + pos + 1, len - pos - 1);
+    return 0;
+}
+REGIST_CLASS(DataParser, LineDataParser);
 
 /********************************
  * feasign压缩格式
@@ -335,10 +305,6 @@ public:
         return 0;
     }
 
-    virtual int parse(const char* str, DataItem& data) const {
-
-    }
-
     virtual int parse_to_sample(const DataItem& data, SampleInstance& instance) const {
         instance.id = data.id;
         if (data.data.empty()) {
@@ -428,10 +394,10 @@ private:
     std::shared_ptr<SignCacheDict> _index;
 
 };
-REGISTER_CLASS(DataParser, ArchiveDataParse);
+REGIST_CLASS(DataParser, ArchiveDataParse);
 
 int DataReader::initialize(const YAML::Node& config, std::shared_ptr<TrainerContext> context) {
-    _parser.reset(CREATE_CLASS(DataParser, config["parser"]["class"].as<std::string>()));
+    _parser.reset(CREATE_INSTANCE(DataParser, config["parser"]["class"].as<std::string>()));
     if (_parser == nullptr) {
         VLOG(2) << "fail to get parser: " << config["parser"]["class"].as<std::string>();
         return -1;
@@ -457,7 +423,7 @@ public:
 
         if (config["file_system"] && config["file_system"]["class"]) {
             _file_system.reset(
-                    CREATE_CLASS(FileSystem, config["file_system"]["class"].as<std::string>()));
+                    CREATE_INSTANCE(FileSystem, config["file_system"]["class"].as<std::string>()));
             if (_file_system == nullptr ||
                 _file_system->initialize(config["file_system"], context) != 0) {
                 VLOG(2) << "fail to create class: "
@@ -467,7 +433,7 @@ public:
         } else if (context->file_system != nullptr) { 
             _file_system = context->file_system;
         } else {
-            _file_system.reset(CREATE_CLASS(FileSystem, "LocalFileSystem"));
+            _file_system.reset(CREATE_INSTANCE(FileSystem, "LocalFileSystem"));
             if (_file_system == nullptr || _file_system->initialize(YAML::Load(""), context) != 0) {
                 VLOG(2) << "fail to init file system";
                 return -1;
@@ -565,11 +531,6 @@ public:
                 is_failed = true;
                 continue;
             }
-            if (_file_system->err_no() != 0) {
-                _file_system->reset_err_no();
-                is_failed = true;
-                continue;
-            }
         }
         // omp end
 
@@ -593,7 +554,7 @@ private:
     std::string _filename_prefix;
     std::shared_ptr<FileSystem> _file_system;
 };
-REGISTER_CLASS(DataReader, LineDataReader);
+REGIST_CLASS(DataReader, LineDataReader);
 
 }  // namespace feed
 }  // namespace custom_trainer
diff --git a/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.h b/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.h
index 109bcbb0e0b4c560383ce52e57787071c609181e..9bd824f6c92e607a34284b0efabac7bdf1eefa5a 100755
--- a/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.h
+++ b/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.h
@@ -20,6 +20,8 @@ namespace feed {
 class TrainerContext;
 
 struct FeatureItem {
+    std::vector<float> weights;
+    std::vector<float> gradients;
 public:
     FeatureItem() {
     }
@@ -76,13 +78,12 @@ public:
     virtual ~DataParser() {}
     virtual int initialize(const YAML::Node& config, std::shared_ptr<TrainerContext> context) = 0;
     virtual int parse(const std::string& str, DataItem& data) const {
-        return parse(str.c_str(), data);
+        return parse(str.c_str(), str.size(), data);
     }
     virtual int parse(const char* str, size_t len, DataItem& data) const = 0;
-    virtual int parse(const char* str, DataItem& data) const = 0;
     virtual int parse_to_sample(const DataItem& data, SampleInstance& instance) const = 0;  
 };
-REGISTER_REGISTERER(DataParser);
+REGIST_REGISTERER(DataParser);
 
 class DataReader {
 public:
@@ -104,7 +105,24 @@ protected:
     std::shared_ptr<DataParser> _parser;//数据格式转换
     std::string _pipeline_cmd; //将文件流，重定向到pipeline_cmd，再读入
 };
-REGISTER_REGISTERER(DataReader);
+REGIST_REGISTERER(DataReader);
+
+/// Base parser for line-oriented samples.
+///
+/// parse() is defined out of line. initialize() and parse_to_sample() do
+/// nothing and return 0.
+class LineDataParser : public DataParser {
+public:
+    LineDataParser() {}
+
+    virtual ~LineDataParser() {}
+
+    // Declaring parse() below would otherwise hide the inherited
+    // parse(const std::string&, DataItem&) overload for code that calls it
+    // through a LineDataParser (or a class derived from it).
+    using DataParser::parse;
+
+    /// No configuration is needed; always returns 0.
+    virtual int initialize(const YAML::Node& config, std::shared_ptr<TrainerContext> context) override {
+        return 0;
+    }
+
+    /// Parses the `len` bytes at `str` into `data`.
+    virtual int parse(const char* str, size_t len, DataItem& data) const override;
+
+    /// Leaves `instance` untouched; always returns 0.
+    virtual int parse_to_sample(const DataItem& data, SampleInstance& instance) const override {
+        return 0;
+    }
+};
 
 }//namespace feed
 }//namespace custom_trainer
diff --git a/paddle/fluid/train/custom_trainer/feed/dataset/dataset.cc b/paddle/fluid/train/custom_trainer/feed/dataset/dataset.cc
index 602fab3c6f50832e8c918f595b825b39eb3e6b03..a4081c0f5296730209afb237a790aa43246c0a16 100644
--- a/paddle/fluid/train/custom_trainer/feed/dataset/dataset.cc
+++ b/paddle/fluid/train/custom_trainer/feed/dataset/dataset.cc
@@ -48,30 +48,30 @@ inline DatasetStatus Dataset::epoch_data_status(
     return _data_containers[data_name]->epoch_data_status(epoch_id);
 }
 
+// Returns the sample directories of `epoch_id` for the container registered
+// under `data_name`. An unknown name yields an empty list.
+inline std::vector<std::string> Dataset::epoch_data_path(
+    const std::string& data_name, uint64_t epoch_id) {
+    // find() rather than operator[]: the latter would insert a null container
+    // for an unknown name and then dereference it.
+    auto it = _data_containers.find(data_name);
+    if (it == _data_containers.end() || it->second == nullptr) {
+        return {};
+    }
+    return it->second->epoch_data_path(epoch_id);
+}
+
+// Collects the sample directories of `epoch_id` from every registered
+// container into one list.
+inline std::vector<std::string> Dataset::epoch_data_path(uint64_t epoch_id) {
+    std::vector<std::string> results;
+    for (auto& entry : _data_containers) {
+        if (entry.second == nullptr) {
+            // Skip a name that was registered without a container.
+            continue;
+        }
+        // Bind the returned vector directly; wrapping a temporary in std::move
+        // gains nothing and only blocks copy elision.
+        auto items = entry.second->epoch_data_path(epoch_id);
+        for (auto& item : items) {
+            // `items` is a local about to be discarded, so its strings can be moved.
+            results.emplace_back(std::move(item));
+        }
+    }
+    return results;
+}
+
 inline ::paddle::framework::Channel<DataItem> Dataset::fetch_data(
     const std::string& data_name, uint64_t epoch_id) {
     return _data_containers[data_name]->fetch(epoch_id);
 }
 
-SampleInstancePipe Dataset::fetch_sample(
-    const std::string& data_name, uint32_t batch_size, uint64_t epoch_id) {
+inline const DataParser* Dataset::data_parser(const std::string& data_name) {
     auto* data_container = _data_containers[data_name].get();
-    auto data_channel = data_container->fetch(epoch_id);
-    const auto* data_parser = data_container->data_parser();
-    PipelineOptions options;
-    options.batch_size = batch_size;
-    options.need_hold_input_data = true;
-    options.buffer_data_num = batch_size * 10;
-    SampleInstancePipe pipe = make_sample_instance_channel();
-    pipe->initialize(options, data_channel, 
-        [data_parser] (const DataItem* data, SampleInstance* sample, size_t num) -> int {
-            int ret = 0;
-            for (int i = 0; i < num; ++i, ++data, ++sample) {
-                ret |= data_parser->parse_to_sample(*data, *sample);
-            }
-            return ret;
-    });
-    return pipe;
+    return data_container->data_parser();
 }
      
 
diff --git a/paddle/fluid/train/custom_trainer/feed/dataset/dataset.h b/paddle/fluid/train/custom_trainer/feed/dataset/dataset.h
index f8ea62c65202cf04622bb3e5e39ae82913a13feb..eeda375204dd4a731996ba46d91ffd16ea3eb8d2 100644
--- a/paddle/fluid/train/custom_trainer/feed/dataset/dataset.h
+++ b/paddle/fluid/train/custom_trainer/feed/dataset/dataset.h
@@ -29,14 +29,17 @@ public:
     virtual DatasetStatus epoch_data_status(uint64_t epoch_id);
     virtual DatasetStatus epoch_data_status(const std::string& data_name, uint64_t epoch_id);
 
+    //获取数据路径
+    virtual std::vector<std::string> epoch_data_path(uint64_t epoch_id);
+    virtual std::vector<std::string> epoch_data_path(const std::string& data_name, uint64_t epoch_id);
+
     //返回各DataContainer内的原始数据(maybe 压缩格式)
     virtual ::paddle::framework::Channel<DataItem> fetch_data(
             const std::string& data_name, uint64_t epoch_id);
 
-    //以管道形式返回标准样本流，管道内会对数据做异步转换
-    virtual SampleInstancePipe fetch_sample(
-            const std::string& data_name, uint32_t batch_size, uint64_t epoch_id);
-     
+    //获取DataItem解析器
+    virtual const DataParser* data_parser(const std::string& data_name);
+    
 private: 
     std::unordered_map<std::string, std::shared_ptr<DatasetContainer>> _data_containers;
 };
diff --git a/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.cc b/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.cc
index 14d3062b4c329844811d6a1bd1a85948f2f10c09..f702511b678a7046bee7ea827c53fe7da3d8a321 100755
--- a/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.cc
+++ b/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.cc
@@ -31,7 +31,7 @@ int DatasetContainer::initialize(
     _data_split_interval = config["data_spit_interval"].as<int>();
     _data_path_formater = config["data_path_formater"].as<std::string>();
     std::string data_reader_class = config["data_reader"].as<std::string>();
-    DataReader* data_reader = CREATE_CLASS(DataReader, data_reader_class);
+    DataReader* data_reader = CREATE_INSTANCE(DataReader, data_reader_class);
     _data_reader.reset(data_reader);
     return _data_reader->initialize(config, context);
 }   
@@ -41,6 +41,21 @@ std::shared_ptr<DatasetInfo> DatasetContainer::dataset(uint64_t timestamp) {
     auto data_idx = timestamp / epoch_accessor->epoch_time_interval();
     return _dataset_list[data_idx % _prefetch_num];
 }
+// Builds the sample directories that cover `epoch_id`: for every root path,
+// one directory per data split, starting at the first split boundary at or
+// after the epoch timestamp.
+std::vector<std::string> DatasetContainer::epoch_data_path(uint64_t epoch_id) {
+    std::vector<std::string> results;
+    auto* epoch_accessor = _trainer_context->epoch_accessor.get();
+    const time_t timestamp = epoch_accessor->epoch_timestamp(epoch_id);
+    const size_t data_num = data_num_for_train(
+        timestamp, epoch_accessor->epoch_time_interval(), _data_split_interval);
+    // Round the epoch timestamp up to a multiple of the split interval.
+    const uint64_t first_split = timestamp % _data_split_interval == 0
+        ? timestamp : (timestamp / _data_split_interval + 1) * _data_split_interval;
+    for (const auto& root_path : _data_root_paths) {
+        // size_t counter: same type as data_num, and the offset below is no
+        // longer computed in int.
+        for (size_t j = 0; j < data_num; ++j) {
+            const uint64_t split_timestamp = first_split + j * _data_split_interval;
+            std::string path_suffix = format_timestamp(split_timestamp, _data_path_formater);
+            results.emplace_back(
+                _trainer_context->file_system->path_join(root_path, path_suffix));
+        }
+    }
+    return results;
+}
 
 void DatasetContainer::pre_detect_data(uint64_t epoch_id) {
     int status = 0;
@@ -55,7 +70,7 @@ void DatasetContainer::pre_detect_data(uint64_t epoch_id) {
             async_download_data(timestamp);
         }));
     }
-    for (int detect_idx = 0 ; detect_idx < _prefetch_num; ++detect_idx) {
+    for (int detect_idx = 0 ; detect_idx < _prefetch_num; ++detect_idx, ++epoch_id) {
         if (DatasetStatus::Empty != data_status(timestamp)) {
             continue;
         }
@@ -74,6 +89,7 @@ void DatasetContainer::pre_detect_data(uint64_t epoch_id) {
             dataset_info->timestamp = timestamp;
             dataset_info->file_path_list = std::move(data_path_list);
             dataset_info->status = DatasetStatus::Detected;
+            VLOG(2) << epoch_accessor->text(epoch_id) << ", data is detected";
         }
         timestamp += epoch_accessor->epoch_time_interval();
     }
@@ -149,16 +165,25 @@ void DatasetContainer::async_download_data(uint64_t start_timestamp) {
     }
     while (!_stop_download) {
         auto dataset_info = dataset(start_timestamp);
-        while (data_status(start_timestamp) != DatasetStatus::Detected) {
+        while (data_status(start_timestamp) == DatasetStatus::Empty) {
             sleep(30);
         }
+        dataset_info->status = DatasetStatus::Downloding;
+
+        VLOG(2) << "Start download data, data_timestap:" << start_timestamp
+            << ", for epoch:" << epoch_accessor->text(start_timestamp);
         const auto& file_list = dataset_info->file_path_list;
         dataset_info->data_channel->Clear();
         while (_data_reader->read_all(file_list, dataset_info->data_channel) != 0) {
             dataset_info->data_channel->Clear();
-            VLOG(0) << "timestamp:" << start_timestamp << " data read failed, retry";
+            VLOG(0) << "Failed download data, data_timestap:" << start_timestamp
+                << ", for epoch:" << epoch_accessor->text(start_timestamp) << ", Retry it";
             sleep(30); 
         }
+        VLOG(2) << "End download data num:" << dataset_info->data_channel->Size()
+            << ", data_timestap:" << start_timestamp
+            << ", for epoch:" << epoch_accessor->text(start_timestamp);
+        dataset_info->status = DatasetStatus::Ready;
         start_timestamp += epoch_accessor->epoch_time_interval();
     }
 }
diff --git a/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.h b/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.h
index 7ed455a4d81fc257aecca2627f3853936ca97514..0215a3563f0d519085c9ec84a13c4eddd8ab631f 100644
--- a/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.h
+++ b/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.h
@@ -49,20 +49,22 @@ public:
     }
     virtual int initialize(
         const YAML::Node& config, std::shared_ptr<TrainerContext> context);
-    //触发可预取的数据判断
+    // 触发可预取的数据判断
     virtual void pre_detect_data(uint64_t epoch_id);
-    //获取数据状态
+    // 获取epoch对应的样本数据目录
+    std::vector<std::string> epoch_data_path(uint64_t epoch_id);
+    // 获取数据状态
     virtual DatasetStatus epoch_data_status(uint64_t epoch_id);
-    //获取特定epoch_i样本，如果数据未ready，Channel内为空指针
+    // 获取特定epoch_i样本，如果数据未ready，Channel内为空指针
     virtual ::paddle::framework::Channel<DataItem> fetch(uint64_t epoch_id);
-    //获取DataItem解析器
+    // 获取DataItem解析器
     virtual const DataParser* data_parser() {
         return _data_reader->get_parser();
     }
 protected:
     virtual DatasetStatus data_status(uint64_t timestamp);
     virtual int read_data_list(const std::string& data_dir, std::vector<std::string>& data_list);
-    //异步样本download
+    // 异步样本download
     virtual void async_download_data(uint64_t start_timestamp);
     virtual std::shared_ptr<DatasetInfo> dataset(uint64_t timestamp);
    
diff --git a/paddle/fluid/train/custom_trainer/feed/executor/executor.cc b/paddle/fluid/train/custom_trainer/feed/executor/executor.cc
index 2692cacabda81fe8af2b27c98c2795e6b5e8f73d..246d4a36a2243cb9f888c81afaf30f4bad7f0a5f 100644
--- a/paddle/fluid/train/custom_trainer/feed/executor/executor.cc
+++ b/paddle/fluid/train/custom_trainer/feed/executor/executor.cc
@@ -1,3 +1,4 @@
+#include <sstream>
 #include "paddle/fluid/train/custom_trainer/feed/executor/executor.h"
 
 #include "paddle/fluid/framework/program_desc.h"
@@ -50,57 +51,53 @@ public:
     virtual ~SimpleExecutor() {};
     virtual int initialize(YAML::Node exe_config,
         std::shared_ptr<TrainerContext> context_ptr) {
-        
         paddle::framework::InitDevices(false);
-        if (exe_config["num_threads"]) {
-            paddle::platform::SetNumThreads(exe_config["num_threads"].as<int>());
-        }
-
-        if (!exe_config["startup_program"] || 
-            !exe_config["main_program"]) {
-            VLOG(2) << "fail to load config";
-            return -1;
-        }
-
+        //if (exe_config["num_threads"]) {
+            
+        //}
+        paddle::platform::SetNumThreads(1);
+        std::string name = exe_config["name"].as<std::string>();
+        std::string main_program = YamlHelper::get_with_default(exe_config, "main_program",
+            string::format_string("./model/%s/main_program", name.c_str()));
+        std::string startup_program = YamlHelper::get_with_default(exe_config, "startup_program",
+            string::format_string("./model/%s/startup_program", name.c_str()));
         try {
             _context.reset(new SimpleExecutor::Context(context_ptr->cpu_place));
-            auto startup_program = Load(&_context->executor, exe_config["startup_program"].as<std::string>());
-            if (startup_program == nullptr) {
-                VLOG(2) << "fail to load startup_program: " << exe_config["startup_program"].as<std::string>();
+            _context->startup_program = Load(&_context->executor, startup_program);
+            if (_context->startup_program == nullptr) {
+                VLOG(0) << "fail to load startup_program: " << startup_program;
                 return -1;
             }
-            
-            _context->executor.Run(*startup_program, this->scope(), 0, false, true);
-
-            _context->main_program = Load(&_context->executor, exe_config["main_program"].as<std::string>());
+            _context->main_program = Load(&_context->executor, main_program);
             if (_context->main_program == nullptr) {
-                VLOG(2) << "fail to load main_program: " << exe_config["main_program"].as<std::string>();
+                VLOG(0) << "fail to load main_program: " << main_program;
                 return -1;
             }
             _context->prepare_context = _context->executor.Prepare(*_context->main_program, 0);
-
-
-            _context->executor.CreateVariables(*_context->main_program, this->scope(), 0);
         } catch (::paddle::platform::EnforceNotMet& err) {
-            VLOG(2) << err.what();
+            VLOG(0) << err.what();
             _context.reset(nullptr);
             return -1;
         }
-
         return 0;
     }
-    virtual int run() {
+    // Prepares `scope` for training: runs the startup program on it, then
+    // creates the variables of the main program in it.
+    // Returns 0 on success, -1 when initialize() has not succeeded (no
+    // context) or when Paddle raises EnforceNotMet.
+    virtual int initialize_scope(::paddle::framework::Scope* scope) {
+        if (_context == nullptr) {
+            // initialize() resets _context on failure; do not dereference it.
+            VLOG(2) << "need initialize before initialize_scope";
+            return -1;
+        }
+        try {
+            _context->executor.Run(*_context->startup_program, scope, 0, false, true);
+            _context->executor.CreateVariables(*_context->main_program, scope, 0);
+        } catch (::paddle::platform::EnforceNotMet& err) {
+            // Report the failure through the return code, as run() does.
+            VLOG(0) << err.what();
+            return -1;
+        }
+        return 0;
+    }
+    virtual int run(::paddle::framework::Scope* scope) {
         if (_context == nullptr) {
             VLOG(2) << "need initialize before run";
             return -1;
         }
         try {
-            _context->executor.RunPreparedContext(_context->prepare_context.get(), this->scope(),
-                                    false, /* don't create local scope each time*/
-                                    false /* don't create variable each time */);
+            _context->executor.RunPreparedContext(_context->prepare_context.get(), scope,
+                false, /* don't create local scope each time*/
+                false /* don't create variable each time */);
 
             // For some other vector like containers not cleaned after each batch.
-            _context->tensor_array_batch_cleaner.CollectNoTensorVars(this->scope());
+            _context->tensor_array_batch_cleaner.CollectNoTensorVars(scope);
             _context->tensor_array_batch_cleaner.ResetNoTensorVars();
         } catch (::paddle::platform::EnforceNotMet& err) {
             VLOG(2) << err.what();
@@ -115,13 +112,14 @@ protected:
         const ::paddle::platform::Place& place;
         ::paddle::framework::Executor executor;
         ::std::unique_ptr<::paddle::framework::ProgramDesc> main_program;
+        ::std::unique_ptr<::paddle::framework::ProgramDesc> startup_program;
         ::std::unique_ptr<framework::ExecutorPrepareContext> prepare_context;
         details::TensorArrayBatchCleaner tensor_array_batch_cleaner;
     };
     std::unique_ptr<Context> _context;
 };
 
-REGISTER_CLASS(Executor, SimpleExecutor);
+REGIST_CLASS(Executor, SimpleExecutor);
     
 }  // namespace feed
 }  // namespace custom_trainer
diff --git a/paddle/fluid/train/custom_trainer/feed/executor/executor.h b/paddle/fluid/train/custom_trainer/feed/executor/executor.h
index 0c8237b813a2d59abbbcb522013db77330845176..52c14cd977951d20aa23b8f6aa0ebf4f0463d043 100644
--- a/paddle/fluid/train/custom_trainer/feed/executor/executor.h
+++ b/paddle/fluid/train/custom_trainer/feed/executor/executor.h
@@ -13,30 +13,16 @@ public:
     Executor() {}
     virtual ~Executor() {}
 
-    //初始化，包括进行训练网络&配置加载工作
+    // Initialization: loads the training network and the configuration.
     virtual int initialize(YAML::Node exe_config, 
         std::shared_ptr<TrainerContext> context_ptr) = 0;
     
-    //scope 可用于填充&取 var
-    virtual ::paddle::framework::Scope* scope() {
-        return &_scope;
-    }
-    //直接取var
-    template <class T>
-    const T& var(const std::string& name) {
-        return _scope.Var(name)->Get<T>();
-    }
-    template <class T>
-    T* mutable_var(const std::string& name) {
-        return _scope.Var(name)->GetMutable<T>();
-    }
+    // Initialize the scope once; training is then run repeatedly without re-initializing it.
+    virtual int initialize_scope(::paddle::framework::Scope* scope) = 0;
+
+    // Run training on the given scope.
+    virtual int run(::paddle::framework::Scope* scope) = 0;
 
-    //执行训练
-    virtual int run() = 0;
-    
-    virtual bool is_dump_all_model() {
-        return false;
-    }
     // cost time millisecond
     virtual uint64_t epoch_cost() const {
         return 0;
@@ -44,7 +30,7 @@ public:
 protected:
     ::paddle::framework::Scope _scope;
 };
-REGISTER_REGISTERER(Executor);
+REGIST_REGISTERER(Executor);
 
 }  // namespace feed
 }  // namespace custom_trainer
diff --git a/paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.cc b/paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..27c7127b6f73ea9367ce3516930bee810fad018a
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.cc
@@ -0,0 +1,150 @@
+#include "paddle/fluid/train/custom_trainer/feed/io/file_system.h"
+#include "paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.h"
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+
+// Reads the executor settings from exe_config, creates one Executor per training thread, builds
+// the Scope pool and loads ./model/<name>/model.yaml to construct the input accessors.
+// Executor/accessor failures abort through CHECK, so the value actually returned is 0.
+int MultiThreadExecutor::initialize(YAML::Node exe_config,
+    std::shared_ptr<TrainerContext> context_ptr) {
+    int ret = 0;
+    _trainer_context = context_ptr.get();
+    _train_data_name = exe_config["train_data_name"].as<std::string>();
+    _train_batch_size = exe_config["train_batch_size"].as<int>();
+    _input_parse_thread_num = exe_config["input_parse_thread_num"].as<int>();
+    _push_gradient_thread_num = exe_config["push_gradient_thread_num"].as<int>();
+    _train_thread_num = exe_config["train_thread_num"].as<int>();
+    _need_dump_all_model = exe_config["need_dump_all_model"].as<bool>();
+    CHECK(_train_thread_num > 0 && _train_batch_size > 0);
+    _thread_executors.resize(_train_thread_num);
+    auto e_class = exe_config["class"].as<std::string>();
+    _train_exe_name = exe_config["name"].as<std::string>();
+
+    omp_set_num_threads(_train_thread_num);
+    #pragma omp parallel for
+    for (int i = 0; i < _train_thread_num; ++i) {
+        auto* e_ptr = CREATE_INSTANCE(Executor, e_class);
+        _thread_executors[i].reset(e_ptr);
+        if (e_ptr == nullptr || e_ptr->initialize(exe_config, context_ptr) != 0) {  // nullptr: class name not registered
+            VLOG(0) << "executor initialize failed, name:" << _train_exe_name
+                << " class:" << e_class;
+            ret = -1;
+        }
+    }
+    CHECK(ret == 0);
+
+    // buffer: pool of Scopes, each prepared by the first executor's initialize_scope()
+    _scope_obj_pool.reset(new paddle::ps::ObjectPool<::paddle::framework::Scope>(
+        [this]() -> ::paddle::framework::Scope* {
+            auto* scope = new ::paddle::framework::Scope();
+            _thread_executors[0]->initialize_scope(scope);
+            return scope;
+        }, _train_thread_num * 8, 0, _train_thread_num * 8));
+
+    // Load the model network config.
+    std::string model_config_path = _trainer_context->file_system->path_join(
+        "./model", string::format_string("%s/model.yaml", _train_exe_name.c_str()));
+    CHECK(_trainer_context->file_system->exists(model_config_path))
+        << "miss model config file:" << model_config_path;
+    _model_config = YAML::LoadFile(model_config_path);
+    for (const auto& accessor_config : _model_config["input_accessor"]) {
+        auto accessor_class = accessor_config["class"].as<std::string>();
+        auto* accessor_ptr = CREATE_INSTANCE(DataInputAccessor, accessor_class);
+        CHECK(accessor_ptr != nullptr) << "InputAccessor class not registered:" << accessor_class;
+        _input_accessors.emplace_back(accessor_ptr);
+        CHECK(accessor_ptr->initialize(accessor_config, context_ptr) == 0)
+            << "InputAccessor init Failed, class:" << accessor_class;
+        if (accessor_config["table_id"]) {
+            auto table_id = accessor_config["table_id"].as<int>();
+            _table_to_accessors[table_id].push_back(accessor_ptr);  // operator[] creates the empty list on first use
+        }
+    }
+
+    return ret;
+}
+
+// Streams `input` through parse/forward -> train -> backward pipelines, drains them, and returns input_pipe->backup_channel().
+paddle::framework::Channel<DataItem> MultiThreadExecutor::run(
+    paddle::framework::Channel<DataItem> input, const DataParser* parser) {
+    PipelineOptions input_pipe_option;  // stage 1: parse DataItems and run every accessor's forward()
+    input_pipe_option.need_hold_input_data = true;
+    input_pipe_option.batch_size = 1;
+    input_pipe_option.thread_num = _input_parse_thread_num;
+    input_pipe_option.input_output_rate = _train_batch_size;
+    input_pipe_option.buffer_batch_count = _train_thread_num;
+    auto input_pipe = std::make_shared<Pipeline<DataItem, ScopePoolObj>>();
+    input_pipe->initialize(input_pipe_option, input,
+        [this, parser](DataItem* item, size_t item_num,
+            ScopePoolObj* scope, size_t* scope_num, size_t thread_idx) -> int {
+            *scope_num = 1;
+            auto scope_obj = _scope_obj_pool->get();
+            auto* samples = new SampleInstance[item_num];
+            for (size_t i = 0; i < item_num; ++i) {
+                CHECK(parser->parse_to_sample(item[i], samples[i]) == 0);
+            }
+            for (size_t i = 0; i < _input_accessors.size(); ++i) {
+                _input_accessors[i]->forward(samples, item_num, scope_obj.get());
+            }
+            int64_t data_for_scope = (int64_t)samples;  // the array address rides in the scope; stage 3 reads it back and delete[]s it
+            ScopeHelper::fill_value(scope_obj.get(), _trainer_context->cpu_place,
+                "sample_data", data_for_scope);
+            data_for_scope = (int64_t)item_num;
+            ScopeHelper::fill_value(scope_obj.get(), _trainer_context->cpu_place,
+                "sample_num", data_for_scope);
+            *scope = std::move(scope_obj);
+            return 0;
+        });
+
+    PipelineOptions train_pipe_option;  // stage 2: one Executor per pipeline thread (indexed by thread_idx)
+    train_pipe_option.input_output_rate = 1;
+    train_pipe_option.thread_num = _train_thread_num;
+    train_pipe_option.buffer_batch_count = 2 * _train_thread_num;
+    auto train_pipe = std::make_shared<Pipeline<ScopePoolObj, ScopePoolObj>>();
+    train_pipe->connect_to(*input_pipe, train_pipe_option,
+        [this] (ScopePoolObj* in_items, size_t in_num,
+            ScopePoolObj* out_items, size_t* out_num, size_t thread_idx) -> int {
+            auto* executor = _thread_executors[thread_idx].get();
+            size_t& out_idx = *out_num;
+            for (out_idx = 0; out_idx < in_num; ++out_idx) {
+                // NOTE(review): the training call is disabled, scopes are only forwarded: CHECK(executor->run(in_items[out_idx].get()) == 0);
+                out_items[out_idx] = std::move(in_items[out_idx]);
+            }
+            return 0;
+        });
+
+    PipelineOptions gradient_pipe_option;  // stage 3: accessor backward() and release of the sample array
+    gradient_pipe_option.input_output_rate = 1;
+    gradient_pipe_option.thread_num = _push_gradient_thread_num;
+    gradient_pipe_option.buffer_batch_count = 2 * _train_thread_num;
+    auto gradient_pipe = std::make_shared<Pipeline<ScopePoolObj, int>>();
+    gradient_pipe->connect_to(*train_pipe, gradient_pipe_option,
+        [this] (ScopePoolObj* in_items, size_t in_num,
+            int* out_items, size_t* out_num, size_t thread_idx) -> int {
+            size_t& out_idx = *out_num;
+            for (out_idx = 0; out_idx < in_num; ++out_idx) {
+                auto* scope = in_items[out_idx].get();
+                auto sample_num = *ScopeHelper::get_value<int64_t>(
+                    scope, _trainer_context->cpu_place, "sample_num");
+                auto* samples = (SampleInstance*)(*ScopeHelper::get_value<int64_t>(
+                    scope, _trainer_context->cpu_place, "sample_data"));
+                out_items[out_idx] = 0;  // status stays defined even when there are no accessors
+                for (size_t i = 0; i < _input_accessors.size(); ++i) {
+                    int ret = _input_accessors[i]->backward(samples, sample_num, scope);
+                    if (ret != 0) { out_items[out_idx] = ret; }  // a later success must not hide a failure
+                }
+                delete[] samples;  // release the samples only after every pipe stage is done with them
+            }
+            return 0;
+        });
+
+    std::vector<int> gradient_status;
+    while (gradient_pipe->read(gradient_status) > 0) {}  // drain stage 3; presumably read() stops returning > 0 once the pipe is exhausted
+    return input_pipe->backup_channel();
+}
+
+}  // namespace feed
+}  // namespace custom_trainer
+}  // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.h b/paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..68c9f88fe3505634e3a429066250cada227a5e04
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.h
@@ -0,0 +1,61 @@
+#pragma once
+#include <functional>
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/train/custom_trainer/feed/executor/executor.h"
+#include "paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h"
+
+namespace paddle {
+namespace custom_trainer {
+namespace feed {
+
+typedef paddle::ps::ObjectPool<::paddle::framework::Scope>::PooledObject ScopePoolObj;  // handle type for a Scope taken from the object pool
+
+class MultiThreadExecutor {  // Owns the per-thread Executors, the input accessors and the Scope pool of one configured executor entry.
+public:
+    MultiThreadExecutor() {}
+    virtual ~MultiThreadExecutor() {}
+
+    // Initialization: loads the training network and the configuration.
+    virtual int initialize(YAML::Node exe_config, 
+        std::shared_ptr<TrainerContext> context_ptr);
+
+    // Run training over `input`; returns the input pipeline's backup channel.
+    virtual paddle::framework::Channel<DataItem> run(
+        paddle::framework::Channel<DataItem> input, const DataParser* parser);
+    
+    virtual bool is_dump_all_model() {  // value of exe_config["need_dump_all_model"]
+        return _need_dump_all_model;
+    }
+    virtual const std::string& train_exe_name() {  // value of exe_config["name"]
+        return _train_exe_name;
+    }
+    virtual const std::string& train_data_name() {  // value of exe_config["train_data_name"]
+        return _train_data_name;
+    }
+    virtual const std::map<uint32_t, std::vector<DataInputAccessor*>>& table_accessors() {  // table_id -> accessors configured with that table_id
+        return _table_to_accessors;
+    }
+    virtual ScopePoolObj fetch_scope() {  // takes one Scope from the pool
+        ScopePoolObj scope_obj(_scope_obj_pool->get());
+        return scope_obj;
+    }
+protected:
+    std::string _train_data_name;
+    size_t _train_batch_size = 32;  // this and the three thread counts below are overwritten from exe_config in initialize()
+    size_t _train_thread_num = 12;
+    size_t _input_parse_thread_num = 10;
+    size_t _push_gradient_thread_num = 10;
+    bool _need_dump_all_model = false;
+
+    YAML::Node _model_config;  // parsed ./model/<train_exe_name>/model.yaml
+    std::string _train_exe_name;
+    TrainerContext* _trainer_context = nullptr;  // non-owning; set from context_ptr.get() in initialize()
+    std::vector<std::shared_ptr<Executor>> _thread_executors;  // one Executor per training thread
+    std::vector<std::shared_ptr<DataInputAccessor>> _input_accessors;  // owns every accessor listed under "input_accessor"
+    std::map<uint32_t, std::vector<DataInputAccessor*>> _table_to_accessors;  // raw pointers into _input_accessors, grouped by table_id
+    std::shared_ptr<paddle::ps::ObjectPool<::paddle::framework::Scope>> _scope_obj_pool;  // Scopes prepared through Executor::initialize_scope()
+};
+
+}  // namespace feed
+}  // namespace custom_trainer
+}  // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc b/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc
index db41b6d6390fe42e67cf73855430710fece3fec3..48476a145097ab0cb71a8c885bcad29ec85bc16c 100644
--- a/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc
+++ b/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc
@@ -17,7 +17,7 @@ public:
         _file_system.clear();
         if (config && config["file_systems"] && config["file_systems"].Type() == YAML::NodeType::Map) {
             for (auto& prefix_fs: config["file_systems"]) {
-                std::unique_ptr<FileSystem> fs(CREATE_CLASS(FileSystem, prefix_fs.second["class"].as<std::string>("")));
+                std::unique_ptr<FileSystem> fs(CREATE_INSTANCE(FileSystem, prefix_fs.second["class"].as<std::string>("")));
                 if (fs == nullptr) {
                     LOG(FATAL)  << "fail to create class: " << prefix_fs.second["class"].as<std::string>("");
                     return -1;
@@ -31,7 +31,7 @@ public:
         }
         if (_file_system.find("default") == _file_system.end()) {
             LOG(WARNING) << "miss default file_system, use LocalFileSystem as default";
-            std::unique_ptr<FileSystem> fs(CREATE_CLASS(FileSystem, "LocalFileSystem"));
+            std::unique_ptr<FileSystem> fs(CREATE_INSTANCE(FileSystem, "LocalFileSystem"));
             if (fs == nullptr || fs->initialize(YAML::Load(""), context) != 0) {
                 return -1;
             }
@@ -62,8 +62,8 @@ public:
         return get_file_system(path)->list(path);
     }
 
-    std::string tail(const std::string& path) override {
-        return get_file_system(path)->tail(path);
+    std::string tail(const std::string& path,  size_t tail_num = 1) override {  // forwards to the file system selected for `path`
+        return get_file_system(path)->tail(path, tail_num);
     }
 
     bool exists(const std::string& path) override {
@@ -86,29 +86,10 @@ public:
         return _file_system["default"].get();
     }
 
-    int err_no() const override {
-        if (_err_no == 0) {
-            for (const auto& file_system : _file_system) {
-                if (file_system.second->err_no() != 0) {
-                    const_cast<int&>(_err_no) = -1;
-                    break;
-                }
-            }
-        }
-        return FileSystem::err_no();
-    }
-
-    void reset_err_no() override {
-        _err_no = 0;
-        for (auto& file_system : _file_system) {
-            file_system.second->reset_err_no();
-        }
-    }
-
 private:
     std::unordered_map<std::string, std::unique_ptr<FileSystem>> _file_system;
 };
-REGISTER_CLASS(FileSystem, AutoFileSystem);
+REGIST_CLASS(FileSystem, AutoFileSystem);
 
 }  // namespace feed
 }  // namespace custom_trainer
diff --git a/paddle/fluid/train/custom_trainer/feed/io/file_system.h b/paddle/fluid/train/custom_trainer/feed/io/file_system.h
index 482ab30de779655d1aeb9dd17b2c00cbbe63babd..0ef5a37b0c0a3e04d2f20d2b036ff2541b4f0f48 100644
--- a/paddle/fluid/train/custom_trainer/feed/io/file_system.h
+++ b/paddle/fluid/train/custom_trainer/feed/io/file_system.h
@@ -21,24 +21,14 @@ public:
     virtual int64_t file_size(const std::string& path) = 0;
     virtual void remove(const std::string& path) = 0;
     virtual std::vector<std::string> list(const std::string& path) = 0;
-    virtual std::string tail(const std::string& path) = 0;
+    virtual std::string tail(const std::string& path, size_t tail_num = 1) = 0;  // output of `tail -<tail_num>` on path in the local/hadoop implementations
     virtual bool exists(const std::string& path) = 0;
     virtual void mkdir(const std::string& path) = 0;
     virtual std::string path_join(const std::string& dir, const std::string& path);
     virtual std::pair<std::string, std::string> path_split(const std::string& path);
-    virtual int err_no() const {
-        return _err_no;
-    }
-    inline operator bool() {
-        return err_no() == 0;
-    }
-    virtual void reset_err_no() {
-        _err_no = 0;
-    }
 protected:
-    int _err_no = 0;
 };
-REGISTER_REGISTERER(FileSystem);
+REGIST_REGISTERER(FileSystem);
 
 }  // namespace feed
 }  // namespace custom_trainer
diff --git a/paddle/fluid/train/custom_trainer/feed/io/hadoop_file_system.cc b/paddle/fluid/train/custom_trainer/feed/io/hadoop_file_system.cc
index 7e6d42eba9f39ce37ff6d44a5d9491919e6ac171..2af8e08231f114166ac39808079528f981101a2d 100644
--- a/paddle/fluid/train/custom_trainer/feed/io/hadoop_file_system.cc
+++ b/paddle/fluid/train/custom_trainer/feed/io/hadoop_file_system.cc
@@ -33,6 +33,7 @@ public:
 
     std::shared_ptr<FILE> open_read(const std::string& path, const std::string& converter)
             override {
+        int err_no = 0;
         std::string cmd;
         if (string::end_with(path, ".gz")) {
             cmd = string::format_string(
@@ -43,11 +44,12 @@ public:
 
         bool is_pipe = true;
         shell_add_read_converter(cmd, is_pipe, converter);
-        return shell_open(cmd, is_pipe, "r", _buffer_size, &_err_no);
+        return shell_open(cmd, is_pipe, "r", _buffer_size, &err_no);
     }
 
     std::shared_ptr<FILE> open_write(const std::string& path, const std::string& converter)
             override {
+        int err_no = 0;
         std::string cmd =
                 string::format_string("%s -put - \"%s\"", hdfs_command(path).c_str(), path.c_str());
         bool is_pipe = true;
@@ -57,11 +59,10 @@ public:
         }
 
         shell_add_write_converter(cmd, is_pipe, converter);
-        return shell_open(cmd, is_pipe, "w", _buffer_size, &_err_no);
+        return shell_open(cmd, is_pipe, "w", _buffer_size, &err_no);
     }
 
     int64_t file_size(const std::string& path) override {
-        _err_no = -1;
         LOG(FATAL) << "not support";
         return 0;
     }
@@ -107,13 +108,13 @@ public:
         return list;
     }
 
-    std::string tail(const std::string& path) override {
+    std::string tail(const std::string& path, size_t tail_num = 1) override {  // last tail_num lines of `<hdfs> -text path`; "" for an empty path
         if (path == "") {
             return "";
         }
 
         return shell_get_command_output(string::format_string(
-                "%s -text %s | tail -1 ", hdfs_command(path).c_str(), path.c_str()));
+                "%s -text %s | tail -%zu", hdfs_command(path).c_str(), path.c_str(), tail_num));  // %zu matches size_t
     }
 
     bool exists(const std::string& path) override {
@@ -189,7 +190,7 @@ private:
     std::string _hdfs_command;
     std::unordered_map<std::string, std::string> _ugi;
 };
-REGISTER_CLASS(FileSystem, HadoopFileSystem);
+REGIST_CLASS(FileSystem, HadoopFileSystem);
 
 }  // namespace feed
 }  // namespace custom_trainer
diff --git a/paddle/fluid/train/custom_trainer/feed/io/local_file_system.cc b/paddle/fluid/train/custom_trainer/feed/io/local_file_system.cc
index 7fb4eaa881eff777fa77543e496a28dcc5999eda..0b5e5cce0ab62000deee36da92b898e00431195f 100644
--- a/paddle/fluid/train/custom_trainer/feed/io/local_file_system.cc
+++ b/paddle/fluid/train/custom_trainer/feed/io/local_file_system.cc
@@ -64,10 +64,10 @@ public:
         if (path == "") {
             return {};
         }
-
+        int err_no;
         std::shared_ptr<FILE> pipe;
         pipe = shell_popen(
-                string::format_string("find %s -maxdepth 1 -type f", path.c_str()), "r", &_err_no);
+                string::format_string("find %s -maxdepth 1 -type f", path.c_str()), "r", &err_no);
         string::LineFileReader reader;
         std::vector<std::string> list;
 
@@ -78,12 +78,12 @@ public:
         return list;
     }
 
-    std::string tail(const std::string& path) override {
+    std::string tail(const std::string& path, size_t tail_num = 1) override {  // last tail_num lines of the local file; "" for an empty path
         if (path == "") {
             return "";
         }
 
-        return shell_get_command_output(string::format_string("tail -1 %s ", path.c_str()));
+        return shell_get_command_output(string::format_string("tail -%zu %s ", tail_num, path.c_str()));  // %zu matches size_t
     }
 
     bool exists(const std::string& path) override {
@@ -115,7 +115,7 @@ public:
 private:
     size_t _buffer_size = 0;
 };
-REGISTER_CLASS(FileSystem, LocalFileSystem);
+REGIST_CLASS(FileSystem, LocalFileSystem);
 
 }  // namespace feed
 }  // namespace custom_trainer
diff --git a/paddle/fluid/train/custom_trainer/feed/main.cc b/paddle/fluid/train/custom_trainer/feed/main.cc
index 1ce087262b73e5b9dcf91c78649e0f1cb121bb2b..8e8c9851db6560c9bf9a29b322dc163ad49410cd 100644
--- a/paddle/fluid/train/custom_trainer/feed/main.cc
+++ b/paddle/fluid/train/custom_trainer/feed/main.cc
@@ -1,8 +1,8 @@
 #include <time.h>
 #include <fstream>
 #include <yaml-cpp/yaml.h>
-#include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/train/custom_trainer/feed/trainer_context.h"
+#include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/train/custom_trainer/feed/process/process.h"
 #include "paddle/fluid/train/custom_trainer/feed/process/init_env_process.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -22,28 +22,57 @@ int main(int argc, char* argv[]) {
     //load trainer config
     auto trainer_context_ptr = std::make_shared<TrainerContext>();
     trainer_context_ptr->trainer_config = YAML::LoadFile(FLAGS_feed_trainer_conf_path);    
- 
-    std::vector<std::string> process_name_list = {
-        "InitEnvProcess",
-        "LearnerProcess"
-    };
 
-    for (const auto& process_name : process_name_list) {
-        Process* process = CREATE_CLASS(Process, process_name);
-        if (process == NULL) {
-            VLOG(1) << "Process:" << process_name << " does not exist"; 
-            return -1;
-        }
-        if (process->initialize(trainer_context_ptr) != 0) {
-            VLOG(1) << "Process:" << process_name << " initialize failed"; 
-            return -1;
-        }
-        trainer_context_ptr->process_list.push_back(std::shared_ptr<Process>(process));
+    //environment
+    auto& config = trainer_context_ptr->trainer_config;
+    std::string env_class = config["environment"]["environment_class"].as<std::string>();
+    trainer_context_ptr->environment.reset(CREATE_INSTANCE(RuntimeEnvironment, env_class));
+    if (trainer_context_ptr->environment->initialize(config["environment"]) != 0) {
+        return -1;
+    }
+    auto* environment = trainer_context_ptr->environment.get();
+    environment->wireup();
+    if (environment->node_num(EnvironmentRole::ALL) == 1) {
+        environment->add_role(EnvironmentRole::WORKER);
+        environment->add_role(EnvironmentRole::PSERVER);
+    } else if (environment->rank_id(EnvironmentRole::ALL) % 2 == 0) {
+        environment->add_role(EnvironmentRole::WORKER);
+    } else {
+        environment->add_role(EnvironmentRole::PSERVER);
     } 
-
-    for (auto& process : trainer_context_ptr->process_list) {
-        process->run();
+    trainer_context_ptr->pslib.reset(new PSlib());
+    std::string ps_config = config["environment"]["ps"].as<std::string>();
+    trainer_context_ptr->pslib->initialize(ps_config, environment);
+    //VLOG(3) << "Node Start With Role:" << role;    
+     
+    
+    if (environment->is_role(EnvironmentRole::WORKER)) {
+        std::vector<std::string> process_name_list = {
+            "InitEnvProcess",
+            "LearnerProcess"
+        };
+        for (const auto& process_name : process_name_list) {
+            Process* process = CREATE_INSTANCE(Process, process_name);
+            if (process == NULL) {
+                VLOG(1) << "Process:" << process_name << " does not exist"; 
+                return -1;
+            }
+            if (process->initialize(trainer_context_ptr) != 0) {
+                VLOG(1) << "Process:" << process_name << " initialize failed"; 
+                return -1;
+            }
+            trainer_context_ptr->process_list.push_back(std::shared_ptr<Process>(process));
+        } 
+        for (auto& process : trainer_context_ptr->process_list) {
+            process->run();
+        }
+     
+    }
+    
+    //TODO exit control
+    bool running = true;
+    while (running) {
+        sleep(10000);
     }
-
     return 0;
 }
diff --git a/paddle/fluid/train/custom_trainer/feed/monitor/monitor.h b/paddle/fluid/train/custom_trainer/feed/monitor/monitor.h
index d6dcdf3885fa04e66ba1d20edffadd3457a25b6f..d205c23d7b1ae2c8897295d0b55770459d24e873 100644
--- a/paddle/fluid/train/custom_trainer/feed/monitor/monitor.h
+++ b/paddle/fluid/train/custom_trainer/feed/monitor/monitor.h
@@ -43,7 +43,7 @@ protected:
     std::shared_ptr<TrainerContext> _context_ptr;
 };
 
-REGISTER_REGISTERER(Monitor);
+REGIST_REGISTERER(Monitor);
 
 }  // namespace feed
 }  // namespace custom_trainer
diff --git a/paddle/fluid/train/custom_trainer/feed/process/init_env_process.cc b/paddle/fluid/train/custom_trainer/feed/process/init_env_process.cc
index 0ec8d011c1407a4afa70ce136bea24feacfd9ffa..05b398d3de16778a37c3e2316333db9e6730529d 100644
--- a/paddle/fluid/train/custom_trainer/feed/process/init_env_process.cc
+++ b/paddle/fluid/train/custom_trainer/feed/process/init_env_process.cc
@@ -20,22 +20,16 @@ int InitEnvProcess::initialize(std::shared_ptr<TrainerContext> context_ptr) {
     context_ptr->cpu_place = paddle::platform::CPUPlace();
     
     YAML::Node config = _context_ptr->trainer_config;
-    //environment
-    std::string env_class = config["environment"]["environment_class"].as<std::string>();
-    context_ptr->environment.reset(CREATE_CLASS(RuntimeEnvironment, env_class));
-    if (context_ptr->environment->initialize(config["environment"]) != 0) {
-        return -1;
-    }
 
     //file_system
-    context_ptr->file_system.reset(CREATE_CLASS(FileSystem, "AutoFileSystem"));
+    context_ptr->file_system.reset(CREATE_INSTANCE(FileSystem, "AutoFileSystem"));
     if (context_ptr->file_system->initialize(config["io"], context_ptr) != 0) {
         return -1;
     }
 
     //epoch
     std::string epoch_class = config["epoch"]["epoch_class"].as<std::string>();
-    context_ptr->epoch_accessor.reset(CREATE_CLASS(EpochAccessor, epoch_class));
+    context_ptr->epoch_accessor.reset(CREATE_INSTANCE(EpochAccessor, epoch_class));
     if (context_ptr->epoch_accessor->initialize(config["epoch"], context_ptr) != 0) {
         return -1;
     }
@@ -55,12 +49,6 @@ int InitEnvProcess::run() {
     VLOG(3) << "Trainer Resume From epoch:" << epoch_accessor->current_epoch_id();
     auto next_epoch_id = epoch_accessor->next_epoch_id(epoch_accessor->current_epoch_id());
     _context_ptr->dataset->pre_detect_data(next_epoch_id);
-    //step 1. psserver init
-    //step2. psserver load
-    VLOG(3) << "Psserver Start Success";
-    
-    //context_ptr->pslib_client()->load_model();
-    VLOG(3) << "Psserver Load Model Success";
     return 0;
 }
 
diff --git a/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc b/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc
index 71568eade83ca5bad02d72cc09aacc8a0ddd1c4e..6eb8fb653e1c310271a98a43d0c342587389f64e 100755
--- a/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc
+++ b/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc
@@ -3,6 +3,7 @@
  *Train样本
  */
 #include <omp.h>
+#include "paddle/fluid/train/custom_trainer/feed/io/file_system.h"
 #include "paddle/fluid/train/custom_trainer/feed/dataset/dataset.h"
 #include "paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h"
 #include "paddle/fluid/train/custom_trainer/feed/process/learner_process.h"
@@ -14,23 +15,11 @@ namespace feed {
 int LearnerProcess::initialize(std::shared_ptr<TrainerContext> context_ptr) {
     int ret = Process::initialize(context_ptr);
     auto& config = _context_ptr->trainer_config;
-    _train_thread_num = config["train_thread_num"].as<int>();
-    _threads_executor.resize(_train_thread_num);
-    
     if (config["executor"]) {
-        _executor_num = config["executor"].size();
-        omp_set_num_threads(_train_thread_num);
-        #pragma omp parallel for
-        for (int i = 0; i < _train_thread_num; ++i) {
-            _threads_executor[i].resize(_executor_num);
-            for (int e = 0; e < _executor_num; ++e) {
-                auto e_class = config["executor"][e]["class"].as<std::string>();
-                auto* e_ptr = CREATE_CLASS(Executor, e_class);
-                _threads_executor[i][e].reset(e_ptr);  
-                if (e_ptr->initialize(config["executor"][e], context_ptr) != 0) {
-                    ret = -1;
-                }
-            }
+        _executors.resize(config["executor"].size());
+        for (size_t i = 0; i < _executors.size(); ++i) {
+            _executors[i].reset(new MultiThreadExecutor());
+            CHECK(_executors[i]->initialize(config["executor"][i], context_ptr) == 0);
         }
     }
     return 0;
@@ -39,9 +28,12 @@ int LearnerProcess::initialize(std::shared_ptr<TrainerContext> context_ptr) {
 std::future<int> LearnerProcess::save_model(uint64_t epoch_id, int table_id, ModelSaveWay way) {
     std::promise<int> p;
     auto ret = p.get_future();
-    if (_context_ptr->epoch_accessor->need_save_model(epoch_id, way)) {
-        //TODO
-        //context_ptr->pslib_client()->save();
+    auto* ps_client = _context_ptr->pslib->ps_client();
+    auto* epoch_accessor = _context_ptr->epoch_accessor.get();
+    if (epoch_accessor->need_save_model(epoch_id, way)) {
+        VLOG(2) << "Start save model, table_id:" << table_id;
+        auto model_dir = epoch_accessor->model_save_path(epoch_id, way);
+        return ps_client->save(table_id, model_dir, std::to_string((int)way));
     } else {
         p.set_value(0);
     }
@@ -53,14 +45,19 @@ int LearnerProcess::wait_save_model(uint64_t epoch_id, ModelSaveWay way) {
     if (!environment->is_master_node(EnvironmentRole::WORKER)) {
         return 0;
     }
+    std::set<uint32_t> table_set;
+    for (auto& executor : _executors) {
+        const auto& table_accessors = executor->table_accessors();
+        for (auto& itr : table_accessors) {
+            table_set.insert(itr.first);
+        }
+    }
     int ret_size = 0;
-    auto table_num = _context_ptr->params_table_list.size();
+    auto table_num = table_set.size();
     std::future<int> rets[table_num];
-    for (int i = 0; i < table_num; ++i) {
-        auto table_id = _context_ptr->params_table_list[i].table_id();
+    for (auto table_id : table_set) {
         rets[ret_size++] = save_model(epoch_id, table_id, way); 
     }
-
     int all_ret = 0;
     for (int i = 0; i < ret_size; ++i) {
         rets[i].wait();
@@ -69,6 +66,36 @@ int LearnerProcess::wait_save_model(uint64_t epoch_id, ModelSaveWay way) {
     return all_ret;
 }
 
+// Loads each table referenced by the executors' accessors from the checkpoint dir, or creates it by default when no files exist.
+// Only the master WORKER node does the work. NOTE(review): epoch_id is unused; the directory comes from checkpoint_path().
+int LearnerProcess::load_model(uint64_t epoch_id) {
+    auto* environment = _context_ptr->environment.get();
+    if (!environment->is_master_node(EnvironmentRole::WORKER)) {
+        return 0;
+    }
+    std::set<uint32_t> loaded_table_set;
+    auto model_dir = _context_ptr->epoch_accessor->checkpoint_path();
+    for (auto& executor : _executors) {
+        const auto& table_accessors = executor->table_accessors();
+        for (auto& itr : table_accessors) {
+            if (loaded_table_set.count(itr.first)) { continue; }  // table shared with an earlier executor
+            auto table_model_path = _context_ptr->file_system->path_join(
+                model_dir, string::format_string("%03u", itr.first));  // table id is uint32_t, hence %u
+            if (_context_ptr->file_system->list(table_model_path).empty()) {
+                VLOG(2) << "miss table_model:" << table_model_path << ", initialize by default";
+                auto scope = executor->fetch_scope();
+                CHECK(itr.second[0]->create(scope.get()) == 0);
+            } else {
+                auto status = _context_ptr->ps_client()->load(itr.first,
+                    model_dir, std::to_string((int)ModelSaveWay::ModelSaveTrainCheckpoint));
+                CHECK(status.get() == 0) << "table load failed, id:" << itr.first;
+            }
+            loaded_table_set.insert(itr.first);
+        }
+    }
+    return 0;
+}
+
 int LearnerProcess::run() {
     auto* dataset = _context_ptr->dataset.get();
     auto* environment = _context_ptr->environment.get();
@@ -76,61 +103,82 @@ int LearnerProcess::run() {
     uint64_t epoch_id = epoch_accessor->current_epoch_id();
 
     environment->log(EnvironmentRole::WORKER, EnvironmentLogType::MASTER_LOG, EnvironmentLogLevel::NOTICE, 
-        "Resume train with epoch_id:%d label:%s", epoch_id, _context_ptr->epoch_accessor->text(epoch_id).c_str());
+        "Resume train with epoch_id:%llu %s", static_cast<unsigned long long>(epoch_id), _context_ptr->epoch_accessor->text(epoch_id).c_str());  // epoch_id is uint64_t, so %d would mismatch
     
+    // Try to load the model, or initialize it; all workers then wait at the barrier
+    CHECK(load_model(epoch_id) == 0);
+    environment->barrier(EnvironmentRole::WORKER); 
+
     //判断是否先dump出base
     wait_save_model(epoch_id, ModelSaveWay::ModelSaveInferenceBase);
     environment->barrier(EnvironmentRole::WORKER); 
     
     while (true) {
         epoch_accessor->next_epoch();
+        bool already_dump_inference_model = false;
         epoch_id = epoch_accessor->current_epoch_id();
-        std::string epoch_log_title= paddle::string::format_string(
+        std::string epoch_log_title = paddle::string::format_string(
             "train epoch_id:%d label:%s", epoch_id, epoch_accessor->text(epoch_id).c_str());
+        std::string data_path = paddle::string::to_string<std::string>(dataset->epoch_data_path(epoch_id));
         
         //Step1. 等待样本ready
-        environment->log(EnvironmentRole::WORKER, EnvironmentLogType::MASTER_LOG, EnvironmentLogLevel::NOTICE, 
-            "Start %s, wait data ready", epoch_log_title.c_str());
-        while (dataset->epoch_data_status(epoch_id) != DatasetStatus::Ready) {
-            sleep(30);  
-            dataset->pre_detect_data(epoch_id);
+        {
             environment->log(EnvironmentRole::WORKER, EnvironmentLogType::MASTER_LOG, EnvironmentLogLevel::NOTICE, 
-                "%s, data not ready, wait 30s", epoch_log_title.c_str());
-        } 
-        environment->log(EnvironmentRole::WORKER, EnvironmentLogType::MASTER_LOG, EnvironmentLogLevel::NOTICE, 
-            "%s, data is ready, start traning", epoch_log_title.c_str());
-        environment->barrier(EnvironmentRole::WORKER); 
-
+                "%s, wait data ready:%s", epoch_log_title.c_str(), data_path.c_str());
+            while (dataset->epoch_data_status(epoch_id) != DatasetStatus::Ready) {
+                sleep(30);  
+                dataset->pre_detect_data(epoch_id);
+                environment->log(EnvironmentRole::WORKER, EnvironmentLogType::MASTER_LOG, EnvironmentLogLevel::NOTICE, 
+                "data not ready, wait 30s");
+            } 
+            environment->log(EnvironmentRole::WORKER, EnvironmentLogType::MASTER_LOG, EnvironmentLogLevel::NOTICE, 
+                "Start %s, data is ready", epoch_log_title.c_str());
+            environment->barrier(EnvironmentRole::WORKER); 
+        }
+    
         //Step2. 运行训练网络
-        bool already_dump_inference_model = false;
-        for (int i = 0; i < _executor_num; ++i) {
-            std::vector<std::shared_ptr<std::thread>> train_threads(_train_thread_num);
-            for (int thread_id = 0; thread_id < _train_thread_num; ++thread_id) {
-                train_threads[i].reset(new std::thread([this](int exe_idx, int thread_idx) {
-                    auto* executor = _threads_executor[thread_idx][exe_idx].get();
-                    run_executor(executor);
-                }, i, thread_id));
-            }   
-            for (int i = 0; i < _train_thread_num; ++i) {
-                train_threads[i]->join();
+        {
+            std::map<std::string, paddle::framework::Channel<DataItem>> backup_input_map;
+            for (auto& executor : _executors) {
+                environment->barrier(EnvironmentRole::WORKER); 
+                VLOG(2) << "Start executor:" << executor->train_exe_name();
+                auto data_name = executor->train_data_name();
+                paddle::framework::Channel<DataItem> input_channel;
+                if (backup_input_map.count(data_name)) {
+                    input_channel = backup_input_map[data_name];
+                } else {
+                    input_channel = dataset->fetch_data(data_name, epoch_id);
+                }
+                input_channel = executor->run(input_channel, dataset->data_parser(data_name));
+                VLOG(2) << "End executor:" << executor->train_exe_name();
+
+                // 等待异步梯度完成
+                _context_ptr->ps_client()->flush();
+                environment->barrier(EnvironmentRole::WORKER); 
+
+                if (executor->is_dump_all_model()) {
+                    already_dump_inference_model = true;
+                    wait_save_model(epoch_id, ModelSaveWay::ModelSaveInferenceDelta);
+                }
+                backup_input_map[data_name] = input_channel;
+                environment->barrier(EnvironmentRole::WORKER); 
             }
-            environment->barrier(EnvironmentRole::WORKER); 
+        }
 
-            if (_threads_executor[0][i]->is_dump_all_model()) {
+        //Step3. Dump Model For Delta&&Checkpoint
+        {
+            if (!already_dump_inference_model) {
                 already_dump_inference_model = true;
                 wait_save_model(epoch_id, ModelSaveWay::ModelSaveInferenceDelta);
-            }
+            } 
+            wait_save_model(epoch_id, ModelSaveWay::ModelSaveTrainCheckpoint);
             environment->barrier(EnvironmentRole::WORKER); 
-        }
 
-        //Step3. Dump Model For Delta&&Checkpoint
-        if (!already_dump_inference_model) {
-            already_dump_inference_model = true;
-            wait_save_model(epoch_id, ModelSaveWay::ModelSaveInferenceDelta);
-        } 
-        wait_save_model(epoch_id, ModelSaveWay::ModelSaveTrainCheckpoint);
-        environment->barrier(EnvironmentRole::WORKER); 
-        
+            epoch_accessor->epoch_done(epoch_id);
+            environment->barrier(EnvironmentRole::WORKER); 
+
+        }
+    
         //Step4. Output Monitor && RunStatus
         //TODO
     }
@@ -138,11 +186,6 @@ int LearnerProcess::run() {
     return 0;
 }
 
-int LearnerProcess::run_executor(Executor* executor) {
-    //TODO
-    return 0;
-}
-
 }  // namespace feed
 }  // namespace custom_trainer
 }  // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/process/learner_process.h b/paddle/fluid/train/custom_trainer/feed/process/learner_process.h
index 7addb601e9cc2cee07194ae262fe622c00aab4bf..86f0378a305d2604082a4f1b03be9e7f1e93ab19 100644
--- a/paddle/fluid/train/custom_trainer/feed/process/learner_process.h
+++ b/paddle/fluid/train/custom_trainer/feed/process/learner_process.h
@@ -4,13 +4,11 @@
  */
 #pragma once
 #include "paddle/fluid/train/custom_trainer/feed/process/process.h"
-#include "paddle/fluid/train/custom_trainer/feed/executor/executor.h"
+#include "paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.h"
 
 namespace paddle {
 namespace custom_trainer {
 namespace feed {
-
-typedef std::vector<std::shared_ptr<Executor>> MultiExecutor;
 class LearnerProcess : public Process {
 public:
     LearnerProcess() {}
@@ -20,19 +18,15 @@ public:
     virtual int initialize(std::shared_ptr<TrainerContext> context_ptr);
 
 protected:
-//同步保存所有模型
+// Load all models
+virtual int load_model(uint64_t epoch_id);
+// Save all models synchronously
 virtual int wait_save_model(uint64_t epoch_id, ModelSaveWay way);
-//异步保存指定模型
+// Save the specified model asynchronously
 virtual std::future<int> save_model(uint64_t epoch_id, int table_id, ModelSaveWay way);
-//执行指定训练网络
-virtual int run_executor(Executor* executor);
-
-
 
 private:
-    int _executor_num = 0;    //需要执行训练的网络个数
-    int _train_thread_num = 1;//并行训练线程数
-    std::vector<MultiExecutor> _threads_executor;
+    std::vector<std::shared_ptr<MultiThreadExecutor>> _executors;
 };
 
 }  // namespace feed
diff --git a/paddle/fluid/train/custom_trainer/feed/process/process.cc b/paddle/fluid/train/custom_trainer/feed/process/process.cc
index 5226c8c59228d89aaa5347e64402f3731f0ae102..0e1cd5fcbeb9bbca14a5822431347cd05a7f2dfd 100644
--- a/paddle/fluid/train/custom_trainer/feed/process/process.cc
+++ b/paddle/fluid/train/custom_trainer/feed/process/process.cc
@@ -5,8 +5,8 @@
 namespace paddle {
 namespace custom_trainer {
 namespace feed {
-REGISTER_CLASS(Process, InitEnvProcess);
-REGISTER_CLASS(Process, LearnerProcess);
+REGIST_CLASS(Process, InitEnvProcess);
+REGIST_CLASS(Process, LearnerProcess);
 int Process::run() {
     return 0;
 }
diff --git a/paddle/fluid/train/custom_trainer/feed/process/process.h b/paddle/fluid/train/custom_trainer/feed/process/process.h
index 2e83e63cbb1e0907e6e62d227d27047dbd350444..127481e9371ca704c2b3cef991241c6d542b4be3 100644
--- a/paddle/fluid/train/custom_trainer/feed/process/process.h
+++ b/paddle/fluid/train/custom_trainer/feed/process/process.h
@@ -18,7 +18,7 @@ public:
 protected:
     TrainerContext* _context_ptr = NULL;
 };
-REGISTER_REGISTERER(Process);
+REGIST_REGISTERER(Process);
 
 }  // namespace feed
 }  // namespace custom_trainer
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/create_programs.py b/paddle/fluid/train/custom_trainer/feed/scripts/create_programs.py
index e2b1cb7caa7d9a2589a448cd046812ad34805a6c..af0de70ddf070b2a66f2a47423066b59ba02b168 100644
--- a/paddle/fluid/train/custom_trainer/feed/scripts/create_programs.py
+++ b/paddle/fluid/train/custom_trainer/feed/scripts/create_programs.py
@@ -95,7 +95,7 @@ class ModelBuilder:
         main_program = fluid.Program()
         startup_program = fluid.Program()
         with fluid.program_guard(main_program, startup_program):
-            inputs, outputs = self._inference()
+            input_accessor, sparses, inputs, outputs = self._inference()
             test_program = main_program.clone(for_test=True)
             loss, labels = self._loss_function(*outputs)
 
@@ -115,14 +115,34 @@ class ModelBuilder:
                 f.write(program.desc.serialize_to_string())
 
         params = filter(fluid.io.is_parameter, main_program.list_vars())
+        # Parameters whose name starts with "bn" are listed under "sums"; all others under "vars".
+        dense_vars = []
+        norm_sums = []
+        for param in params:
+            desc = {"name": param.name, "shape": param.shape}
+            (norm_sums if param.name.startswith("bn") else dense_vars).append(desc)
+
+        # Resolve each accessor's symbolic "input" name into the concrete list it stands for.
+        for accessor in input_accessor:
+            source = accessor["input"]
+            if source == "sparses":
+                accessor["input"] = sparses
+            elif source == "vars":
+                accessor["input"] = dense_vars
+            elif source == "sums":
+                accessor["input"] = norm_sums
+            elif source == "labels":
+                accessor["input"] = [
+                    {"label_name": label.name, "shape": label.shape, "output_name": output.name}
+                    for (label, output) in zip(labels, outputs)]
 
         model_desc_path = os.path.join(self._save_path, 'model.yaml')
         model_desc = {
             'inputs': [{"name": var.name, "shape": var.shape} for var in inputs],
             'outputs': [{"name": var.name, "shape": var.shape} for var in outputs],
             'labels': [{"name": var.name, "shape": var.shape} for var in labels],
-            'vars': [{"name": var.name, "shape": var.shape} for var in params],
             'loss': loss.name,
+            'input_accessor': input_accessor
         }
 
         with open(model_desc_path, 'w') as f:
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/example.py b/paddle/fluid/train/custom_trainer/feed/scripts/example.py
index 96db068ab58f4bdb8b5f46c88bc34039d333f81e..4805e853b1daf9687c78814cab0fcfecbde738ba 100644
--- a/paddle/fluid/train/custom_trainer/feed/scripts/example.py
+++ b/paddle/fluid/train/custom_trainer/feed/scripts/example.py
@@ -18,9 +18,11 @@ def inference():
         list<Variable>: outputs
     """
     # TODO: build network here
-    cvm_input = fluid.layers.data(name='cvm_input', shape=[4488], dtype='float32')
+    cvm_input = fluid.layers.data(name='cvm_input', shape=[4488], dtype='float32', stop_gradient=False)
 
     net = cvm_input
+    net = fluid.layers.data_norm(input=net, name="bn6048", epsilon=1e-4,
+        param_attr={"batch_size":1e4, "batch_sum_default":0.0, "batch_square":1e4})
     net = fluid.layers.fc(net, 512, act='relu', name='fc_1')
     net = fluid.layers.fc(net, 256, act='relu', name='fc_2')
     net = fluid.layers.fc(net, 256, act='relu', name='fc_3')
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/join.py b/paddle/fluid/train/custom_trainer/feed/scripts/join.py
new file mode 100644
index 0000000000000000000000000000000000000000..f96bdaa423ce01f48e8f956b01c9892efa8c4d52
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/scripts/join.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+#-*- coding:utf-8 -*-
+
+"""
+This is an example of network building
+"""
+
+from __future__ import print_function, division
+import paddle
+from paddle import fluid
+
+def sparse_cvm_dim(sparse_info):  # slot_dim times the number of slots; used below as the cvm_input width
+    return sparse_info['slot_dim'] * len(sparse_info['slots'])
+
+def inference():
+    """Build inference network(without loss and optimizer)
+
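+    Returns four values, in this order (create_programs.py unpacks them
+    as input_accessor, sparses, inputs, outputs):
+        list<Dict>: input accessor configs
+        list<Dict>: sparse_inputs
+        list<Variable>: inputs
+        list<Variable>: outputs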
+    """
+    sparse_cvm = { "name": "cvm_input", "slot_dim" : 11, "slots": [6048,6002,6145,6202,6201,6121,6738,6119,6146,6120,6147,6122,6123,6118,6142,6143,6008,6148,6151,6127,6144,6094,6083,6952,6739,6150,6109,6003,6099,6149,6129,6203,6153,6152,6128,6106,6251,7082,7515,6951,6949,7080,6066,7507,6186,6007,7514,6125,7506,10001,6006,7023,6085,10000,6098,6250,6110,6124,6090,6082,6067,6101,6004,6191,7075,6948,6157,6126,6188,7077,6070,6111,6087,6103,6107,6194,6156,6005,6247,6814,6158,7122,6058,6189,7058,6059,6115,7079,7081,6833,7024,6108,13342,13345,13412,13343,13350,13346,13409,6009,6011,6012,6013,6014,6015,6019,6023,6024,6027,6029,6031,6050,6060,6068,6069,6089,6095,6105,6112,6130,6131,6132,6134,6161,6162,6163,6166,6182,6183,6185,6190,6212,6213,6231,6233,6234,6236,6238,6239,6240,6241,6242,6243,6244,6245,6354,7002,7005,7008,7010,7012,7013,7015,7016,7017,7018,7019,7020,7045,7046,7048,7049,7052,7054,7056,7064,7066,7076,7078,7083,7084,7085,7086,7087,7088,7089,7090,7099,7100,7101,7102,7103,7104,7105,7109,7124,7126,7136,7142,7143,7144,7145,7146,7147,7148,7150,7151,7152,7153,7154,7155,7156,7157,7047,7050,6253,6254,6255,6256,6257,6259,6260,6261,7170,7185,7186,6751,6755,6757,6759,6760,6763,6764,6765,6766,6767,6768,6769,6770,7502,7503,7504,7505,7510,7511,7512,7513,6806,6807,6808,6809,6810,6811,6812,6813,6815,6816,6817,6819,6823,6828,6831,6840,6845,6875,6879,6881,6888,6889,6947,6950,6956,6957,6959,10006,10008,10009,10010,10011,10016,10017,10018,10019,10020,10021,10022,10023,10024,10029,10030,10031,10032,10033,10034,10035,10036,10037,10038,10039,10040,10041,10042,10044,10045,10046,10051,10052,10053,10054,10055,10056,10057,10060,10066,10069,6820,6821,6822,13333,13334,13335,13336,13337,13338,13339,13340,13341,13351,13352,13353,13359,13361,13362,13363,13366,13367,13368,13369,13370,13371,13375,13376,5700,5702,13400,13401,13402,13403,13404,13406,13407,13408,13410,13417,13418,13419,13420,13422,13425,13427,13428,13429,13430,13431,13433,13434,13436,13437,13326,13330,13331,5717,13442,13451,13452,134
55,13456,13457,13458,13459,13460,13461,13462,13463,13464,13465,13466,13467,13468,1104,1106,1107,1108,1109,1110,1111,1112,1113,1114,1115,1116,1117,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128,1129,13812,13813,6740,1490,1491]} 
+
+    # TODO: build network here
+    cvm_input = fluid.layers.data(name='cvm_input', shape=[sparse_cvm_dim(sparse_cvm)], dtype='float32', stop_gradient=False)
+
+    net = cvm_input
+    net = fluid.layers.data_norm(input=net, name="bn6048", epsilon=1e-4,
+        param_attr={"batch_size":1e4, "batch_sum_default":0.0, "batch_square":1e4})
+    net = fluid.layers.fc(net, 511, act='relu', name='fc_1')
+    net = fluid.layers.fc(net, 255, act='relu', name='fc_2')
+    net = fluid.layers.fc(net, 255, act='relu', name='fc_3')
+    net = fluid.layers.fc(net, 127, act='relu', name='fc_4')
+    net = fluid.layers.fc(net, 127, act='relu', name='fc_5')
+    net = fluid.layers.fc(net, 127, act='relu', name='fc_6')
+    net = fluid.layers.fc(net, 127, act='relu', name='fc_7')
+
+    ctr_output = fluid.layers.fc(net, 1, act='sigmoid', name='ctr')
+    
+    accessors = [
+        { "class": "AbacusSparseUpdateAccessor", "input": "sparses", "table_id": 0, "need_gradient": False},
+        { "class": "DenseInputAccessor", "input": "vars", "table_id": 1, "need_gradient": True, "async_pull": True},
+        { "class": "DenseInputAccessor", "input": "sums", "table_id": 2, "need_gradient": True, "async_pull": True},
+        { "class": "LabelInputAccessor", "input": "labels"}
+    ]
+    return accessors, [sparse_cvm], [cvm_input], [ctr_output]
+
+def loss_function(ctr_output):
+    """Build the loss: mean of the squared error between ctr_output and a new 'label_ctr' input.
+    Args:
+        ctr_output: the Variable in the outputs list (fourth return value) of inference()
+
+    Returns:
+        Variable: loss
+        and
+        list<Variable>: labels
+    """
+    # The label input is declared with the same shape as the prediction.
+
+    label = fluid.layers.data(name='label_ctr', shape=ctr_output.shape, dtype='float32')
+    loss = fluid.layers.square_error_cost(input=ctr_output, label=label)
+    loss = fluid.layers.mean(loss, name='loss_ctr')
+
+    return loss, [label]
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/example/main_program b/paddle/fluid/train/custom_trainer/feed/scripts/model/example/main_program
new file mode 100644
index 0000000000000000000000000000000000000000..cc20c06bd5bff5e10fae6efdcff8d78cffc2aa7c
Binary files /dev/null and b/paddle/fluid/train/custom_trainer/feed/scripts/model/example/main_program differ
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/example/model.yaml b/paddle/fluid/train/custom_trainer/feed/scripts/model/example/model.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..849c39d23c12d4f005ffb4a7b703e1e784d52b2e
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/scripts/model/example/model.yaml
@@ -0,0 +1,49 @@
+inputs:
+- name: cvm_input
+  shape: [-1, 4488]
+labels:
+- name: label_ctr
+  shape: [-1, 1]
+loss: loss_ctr
+outputs:
+- name: ctr.tmp_2
+  shape: [-1, 1]
+vars:
+- name: bn6048.batch_size
+  shape: [4488]
+- name: bn6048.batch_sum
+  shape: [4488]
+- name: bn6048.batch_square_sum
+  shape: [4488]
+- name: fc_1.w_0
+  shape: [4488, 512]
+- name: fc_1.b_0
+  shape: [512]
+- name: fc_2.w_0
+  shape: [512, 256]
+- name: fc_2.b_0
+  shape: [256]
+- name: fc_3.w_0
+  shape: [256, 256]
+- name: fc_3.b_0
+  shape: [256]
+- name: fc_4.w_0
+  shape: [256, 128]
+- name: fc_4.b_0
+  shape: [128]
+- name: fc_5.w_0
+  shape: [128, 128]
+- name: fc_5.b_0
+  shape: [128]
+- name: fc_6.w_0
+  shape: [128, 128]
+- name: fc_6.b_0
+  shape: [128]
+- name: fc_7.w_0
+  shape: [128, 128]
+- name: fc_7.b_0
+  shape: [128]
+- name: ctr.w_0
+  shape: [128, 1]
+- name: ctr.b_0
+  shape: [1]
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/example/startup_program b/paddle/fluid/train/custom_trainer/feed/scripts/model/example/startup_program
new file mode 100644
index 0000000000000000000000000000000000000000..259839f93f4f390a55c4589d14f4a4cef3b07652
Binary files /dev/null and b/paddle/fluid/train/custom_trainer/feed/scripts/model/example/startup_program differ
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/example/test_program b/paddle/fluid/train/custom_trainer/feed/scripts/model/example/test_program
new file mode 100644
index 0000000000000000000000000000000000000000..147833363f095e719093aa22b4da2ac31b847cfb
Binary files /dev/null and b/paddle/fluid/train/custom_trainer/feed/scripts/model/example/test_program differ
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/join/main_program b/paddle/fluid/train/custom_trainer/feed/scripts/model/join/main_program
new file mode 100644
index 0000000000000000000000000000000000000000..9d5a954648abd1895f32c5aab6ec502ec6767695
Binary files /dev/null and b/paddle/fluid/train/custom_trainer/feed/scripts/model/join/main_program differ
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/join/model.yaml b/paddle/fluid/train/custom_trainer/feed/scripts/model/join/model.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..284827fa35573c42f6fb4745eff4674b153d8dd4
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/scripts/model/join/model.yaml
@@ -0,0 +1,103 @@
+input_accessor:
+- class: AbacusSparseUpdateAccessor
+  input:
+  - name: cvm_input
+    slot_dim: 11
+    slots: [6048, 6002, 6145, 6202, 6201, 6121, 6738, 6119, 6146, 6120, 6147, 6122,
+      6123, 6118, 6142, 6143, 6008, 6148, 6151, 6127, 6144, 6094, 6083, 6952, 6739,
+      6150, 6109, 6003, 6099, 6149, 6129, 6203, 6153, 6152, 6128, 6106, 6251, 7082,
+      7515, 6951, 6949, 7080, 6066, 7507, 6186, 6007, 7514, 6125, 7506, 10001, 6006,
+      7023, 6085, 10000, 6098, 6250, 6110, 6124, 6090, 6082, 6067, 6101, 6004, 6191,
+      7075, 6948, 6157, 6126, 6188, 7077, 6070, 6111, 6087, 6103, 6107, 6194, 6156,
+      6005, 6247, 6814, 6158, 7122, 6058, 6189, 7058, 6059, 6115, 7079, 7081, 6833,
+      7024, 6108, 13342, 13345, 13412, 13343, 13350, 13346, 13409, 6009, 6011, 6012,
+      6013, 6014, 6015, 6019, 6023, 6024, 6027, 6029, 6031, 6050, 6060, 6068, 6069,
+      6089, 6095, 6105, 6112, 6130, 6131, 6132, 6134, 6161, 6162, 6163, 6166, 6182,
+      6183, 6185, 6190, 6212, 6213, 6231, 6233, 6234, 6236, 6238, 6239, 6240, 6241,
+      6242, 6243, 6244, 6245, 6354, 7002, 7005, 7008, 7010, 7012, 7013, 7015, 7016,
+      7017, 7018, 7019, 7020, 7045, 7046, 7048, 7049, 7052, 7054, 7056, 7064, 7066,
+      7076, 7078, 7083, 7084, 7085, 7086, 7087, 7088, 7089, 7090, 7099, 7100, 7101,
+      7102, 7103, 7104, 7105, 7109, 7124, 7126, 7136, 7142, 7143, 7144, 7145, 7146,
+      7147, 7148, 7150, 7151, 7152, 7153, 7154, 7155, 7156, 7157, 7047, 7050, 6253,
+      6254, 6255, 6256, 6257, 6259, 6260, 6261, 7170, 7185, 7186, 6751, 6755, 6757,
+      6759, 6760, 6763, 6764, 6765, 6766, 6767, 6768, 6769, 6770, 7502, 7503, 7504,
+      7505, 7510, 7511, 7512, 7513, 6806, 6807, 6808, 6809, 6810, 6811, 6812, 6813,
+      6815, 6816, 6817, 6819, 6823, 6828, 6831, 6840, 6845, 6875, 6879, 6881, 6888,
+      6889, 6947, 6950, 6956, 6957, 6959, 10006, 10008, 10009, 10010, 10011, 10016,
+      10017, 10018, 10019, 10020, 10021, 10022, 10023, 10024, 10029, 10030, 10031,
+      10032, 10033, 10034, 10035, 10036, 10037, 10038, 10039, 10040, 10041, 10042,
+      10044, 10045, 10046, 10051, 10052, 10053, 10054, 10055, 10056, 10057, 10060,
+      10066, 10069, 6820, 6821, 6822, 13333, 13334, 13335, 13336, 13337, 13338, 13339,
+      13340, 13341, 13351, 13352, 13353, 13359, 13361, 13362, 13363, 13366, 13367,
+      13368, 13369, 13370, 13371, 13375, 13376, 5700, 5702, 13400, 13401, 13402, 13403,
+      13404, 13406, 13407, 13408, 13410, 13417, 13418, 13419, 13420, 13422, 13425,
+      13427, 13428, 13429, 13430, 13431, 13433, 13434, 13436, 13437, 13326, 13330,
+      13331, 5717, 13442, 13451, 13452, 13455, 13456, 13457, 13458, 13459, 13460,
+      13461, 13462, 13463, 13464, 13465, 13466, 13467, 13468, 1104, 1106, 1107, 1108,
+      1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1119, 1120, 1121, 1122,
+      1123, 1124, 1125, 1126, 1127, 1128, 1129, 13812, 13813, 6740, 1490, 1491]
+  need_gradient: false
+  table_id: 0
+- async_pull: true
+  class: DenseInputAccessor
+  input:
+  - name: fc_1.w_0
+    shape: [4488, 511]
+  - name: fc_1.b_0
+    shape: [511]
+  - name: fc_2.w_0
+    shape: [511, 255]
+  - name: fc_2.b_0
+    shape: [255]
+  - name: fc_3.w_0
+    shape: [255, 255]
+  - name: fc_3.b_0
+    shape: [255]
+  - name: fc_4.w_0
+    shape: [255, 127]
+  - name: fc_4.b_0
+    shape: [127]
+  - name: fc_5.w_0
+    shape: [127, 127]
+  - name: fc_5.b_0
+    shape: [127]
+  - name: fc_6.w_0
+    shape: [127, 127]
+  - name: fc_6.b_0
+    shape: [127]
+  - name: fc_7.w_0
+    shape: [127, 127]
+  - name: fc_7.b_0
+    shape: [127]
+  - name: ctr.w_0
+    shape: [127, 1]
+  - name: ctr.b_0
+    shape: [1]
+  need_gradient: true
+  table_id: 1
+- async_pull: true
+  class: DenseInputAccessor
+  input:
+  - name: bn6048.batch_size
+    shape: [4488]
+  - name: bn6048.batch_sum
+    shape: [4488]
+  - name: bn6048.batch_square_sum
+    shape: [4488]
+  need_gradient: true
+  table_id: 2
+- class: LabelInputAccessor
+  input:
+  - label_name: label_ctr
+    output_name: ctr.tmp_2
+    shape: [-1, 1]
+inputs:
+- name: cvm_input
+  shape: [-1, 4488]
+labels:
+- name: label_ctr
+  shape: [-1, 1]
+loss: loss_ctr
+outputs:
+- name: ctr.tmp_2
+  shape: [-1, 1]
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/join/startup_program b/paddle/fluid/train/custom_trainer/feed/scripts/model/join/startup_program
new file mode 100644
index 0000000000000000000000000000000000000000..1e0b9f82c10f926a5bf2a08abfcdc9b8cdd810cb
Binary files /dev/null and b/paddle/fluid/train/custom_trainer/feed/scripts/model/join/startup_program differ
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/join/test_program b/paddle/fluid/train/custom_trainer/feed/scripts/model/join/test_program
new file mode 100644
index 0000000000000000000000000000000000000000..f056e29f9a0bd56332391fd16aee5212ce5e3a20
Binary files /dev/null and b/paddle/fluid/train/custom_trainer/feed/scripts/model/join/test_program differ
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/update/main_program b/paddle/fluid/train/custom_trainer/feed/scripts/model/update/main_program
new file mode 100644
index 0000000000000000000000000000000000000000..ab8290d6940dc12a2fb052df286f44a4abf8cb10
Binary files /dev/null and b/paddle/fluid/train/custom_trainer/feed/scripts/model/update/main_program differ
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/update/model.yaml b/paddle/fluid/train/custom_trainer/feed/scripts/model/update/model.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9bf93e56714fcbd4d30363fa39fddd12b1147c32
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/scripts/model/update/model.yaml
@@ -0,0 +1,84 @@
+input_accessor:
+- class: AbacusSparseUpdateAccessor
+  input:
+  - name: cvm_input
+    slot_dim: 9
+    slots: [6048, 6002, 6145, 6202, 6201, 6121, 6738, 6119, 6146, 6120, 6147, 6122,
+      6123, 6118, 6142, 6143, 6008, 6148, 6151, 6127, 6144, 6094, 6083, 6952, 6739,
+      6150, 6109, 6003, 6099, 6149, 6129, 6203, 6153, 6152, 6128, 6106, 6251, 7082,
+      7515, 6951, 6949, 7080, 6066, 7507, 6186, 6007, 7514, 6125, 7506, 10001, 6006,
+      7023, 6085, 10000, 6098, 6250, 6110, 6124, 6090, 6082, 6067, 6101, 6004, 6191,
+      7075, 6948, 6157, 6126, 6188, 7077, 6070, 6111, 6087, 6103, 6107, 6194, 6156,
+      6005, 6247, 6814, 6158, 7122, 6058, 6189, 7058, 6059, 6115, 7079, 7081, 6833,
+      7024, 6108, 13342, 13345, 13412, 13343, 13350, 13346, 13409, 6009, 6011, 6012,
+      6013, 6014, 6015, 6019, 6023, 6024, 6027, 6029, 6031, 6050, 6060, 6068, 6069,
+      6089, 6095, 6105, 6112, 6130, 6131, 6132, 6134, 6161, 6162, 6163, 6166, 6182,
+      6183, 6185, 6190, 6212, 6213, 6231, 6233, 6234, 6236, 6238, 6239, 6240, 6241,
+      6242, 6243, 6244, 6245, 6354, 7002, 7005, 7008, 7010, 7012, 7013, 7015, 7016,
+      7017, 7018, 7019, 7020, 7045, 7046, 7048, 7049, 7052, 7054, 7056, 7064, 7066,
+      7076, 7078, 7083, 7084, 7085, 7086, 7087, 7088, 7089, 7090, 7099, 7100, 7101,
+      7102, 7103, 7104, 7105, 7109, 7124, 7126, 7136, 7142, 7143, 7144, 7145, 7146,
+      7147, 7148, 7150, 7151, 7152, 7153, 7154, 7155, 7156, 7157, 7047, 7050, 6253,
+      6254, 6255, 6256, 6257, 6259, 6260, 6261, 7170, 7185, 7186, 6751, 6755, 6757,
+      6759, 6760, 6763, 6764, 6765, 6766, 6767, 6768, 6769, 6770, 7502, 7503, 7504,
+      7505, 7510, 7511, 7512, 7513, 6806, 6807, 6808, 6809, 6810, 6811, 6812, 6813,
+      6815, 6816, 6817, 6819, 6823, 6828, 6831, 6840, 6845, 6875, 6879, 6881, 6888,
+      6889, 6947, 6950, 6956, 6957, 6959, 10006, 10008, 10009, 10010, 10011, 10016,
+      10017, 10018, 10019, 10020, 10021, 10022, 10023, 10024, 10029, 10030, 10031,
+      10032, 10033, 10034, 10035, 10036, 10037, 10038, 10039, 10040, 10041, 10042,
+      10044, 10045, 10046, 10051, 10052, 10053, 10054, 10055, 10056, 10057, 10060,
+      10066, 10069, 6820, 6821, 6822, 13333, 13334, 13335, 13336, 13337, 13338, 13339,
+      13340, 13341, 13351, 13352, 13353, 13359, 13361, 13362, 13363, 13366, 13367,
+      13368, 13369, 13370, 13371, 13375, 13376, 5700, 5702, 13400, 13401, 13402, 13403,
+      13404, 13406, 13407, 13408, 13410, 13417, 13418, 13419, 13420, 13422, 13425,
+      13427, 13428, 13429, 13430, 13431, 13433, 13434, 13436, 13437, 13326, 13330,
+      13331, 5717, 13442, 13451, 13452, 13455, 13456, 13457, 13458, 13459, 13460,
+      13461, 13462, 13463, 13464, 13465, 13466, 13467, 13468, 1104, 1106, 1107, 1108,
+      1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1119, 1120, 1121, 1122,
+      1123, 1124, 1125, 1126, 1127, 1128, 1129, 13812, 13813, 6740, 1490, 1491]
+  need_gradient: true
+  table_id: 0
+- async_pull: true
+  class: DenseInputAccessor
+  input:
+  - name: fc_1.w_0
+    shape: [3672, 511]
+  - name: fc_1.b_0
+    shape: [511]
+  - name: fc_2.w_0
+    shape: [511, 255]
+  - name: fc_2.b_0
+    shape: [255]
+  - name: fc_3.w_0
+    shape: [255, 127]
+  - name: fc_3.b_0
+    shape: [127]
+  - name: fc_4.w_0
+    shape: [127, 127]
+  - name: fc_4.b_0
+    shape: [127]
+  - name: fc_5.w_0
+    shape: [127, 127]
+  - name: fc_5.b_0
+    shape: [127]
+  - name: ctr.w_0
+    shape: [127, 1]
+  - name: ctr.b_0
+    shape: [1]
+  need_gradient: true
+  table_id: 3
+- class: LabelInputAccessor
+  input:
+  - label_name: label_ctr
+    output_name: ctr.tmp_2
+    shape: [-1, 1]
+inputs:
+- name: cvm_input
+  shape: [-1, 3672]
+labels:
+- name: label_ctr
+  shape: [-1, 1]
+loss: loss_ctr
+outputs:
+- name: ctr.tmp_2
+  shape: [-1, 1]
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/update/startup_program b/paddle/fluid/train/custom_trainer/feed/scripts/model/update/startup_program
new file mode 100644
index 0000000000000000000000000000000000000000..ad56050377a0dce6412cfdab650483f88ab5f43a
Binary files /dev/null and b/paddle/fluid/train/custom_trainer/feed/scripts/model/update/startup_program differ
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/update/test_program b/paddle/fluid/train/custom_trainer/feed/scripts/model/update/test_program
new file mode 100644
index 0000000000000000000000000000000000000000..5e6ef75f3b3f7bb164aae179dd5951deecc3b04a
Binary files /dev/null and b/paddle/fluid/train/custom_trainer/feed/scripts/model/update/test_program differ
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/update.py b/paddle/fluid/train/custom_trainer/feed/scripts/update.py
new file mode 100644
index 0000000000000000000000000000000000000000..635253482f2c1e2e435354964d4953aa7c4b27ca
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/scripts/update.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+#-*- coding:utf-8 -*-
+
+"""
+This is an example of network building
+"""
+
+from __future__ import print_function, division
+import paddle
+from paddle import fluid
+
+def sparse_cvm_dim(sparse_info):
+    return sparse_info['slot_dim'] * len(sparse_info['slots'])
+
+def inference():
+    """Build inference network(without loss and optimizer)
+
+    Returns:
+        list<dict>: accessors, list<dict>: sparse inputs,
+        list<Variable>: inputs
+        and list<Variable>: outputs
+    """
+    sparse_cvm = { "name": "cvm_input", "slot_dim" : 9, "slots": [6048,6002,6145,6202,6201,6121,6738,6119,6146,6120,6147,6122,6123,6118,6142,6143,6008,6148,6151,6127,6144,6094,6083,6952,6739,6150,6109,6003,6099,6149,6129,6203,6153,6152,6128,6106,6251,7082,7515,6951,6949,7080,6066,7507,6186,6007,7514,6125,7506,10001,6006,7023,6085,10000,6098,6250,6110,6124,6090,6082,6067,6101,6004,6191,7075,6948,6157,6126,6188,7077,6070,6111,6087,6103,6107,6194,6156,6005,6247,6814,6158,7122,6058,6189,7058,6059,6115,7079,7081,6833,7024,6108,13342,13345,13412,13343,13350,13346,13409,6009,6011,6012,6013,6014,6015,6019,6023,6024,6027,6029,6031,6050,6060,6068,6069,6089,6095,6105,6112,6130,6131,6132,6134,6161,6162,6163,6166,6182,6183,6185,6190,6212,6213,6231,6233,6234,6236,6238,6239,6240,6241,6242,6243,6244,6245,6354,7002,7005,7008,7010,7012,7013,7015,7016,7017,7018,7019,7020,7045,7046,7048,7049,7052,7054,7056,7064,7066,7076,7078,7083,7084,7085,7086,7087,7088,7089,7090,7099,7100,7101,7102,7103,7104,7105,7109,7124,7126,7136,7142,7143,7144,7145,7146,7147,7148,7150,7151,7152,7153,7154,7155,7156,7157,7047,7050,6253,6254,6255,6256,6257,6259,6260,6261,7170,7185,7186,6751,6755,6757,6759,6760,6763,6764,6765,6766,6767,6768,6769,6770,7502,7503,7504,7505,7510,7511,7512,7513,6806,6807,6808,6809,6810,6811,6812,6813,6815,6816,6817,6819,6823,6828,6831,6840,6845,6875,6879,6881,6888,6889,6947,6950,6956,6957,6959,10006,10008,10009,10010,10011,10016,10017,10018,10019,10020,10021,10022,10023,10024,10029,10030,10031,10032,10033,10034,10035,10036,10037,10038,10039,10040,10041,10042,10044,10045,10046,10051,10052,10053,10054,10055,10056,10057,10060,10066,10069,6820,6821,6822,13333,13334,13335,13336,13337,13338,13339,13340,13341,13351,13352,13353,13359,13361,13362,13363,13366,13367,13368,13369,13370,13371,13375,13376,5700,5702,13400,13401,13402,13403,13404,13406,13407,13408,13410,13417,13418,13419,13420,13422,13425,13427,13428,13429,13430,13431,13433,13434,13436,13437,13326,13330,13331,5717,13442,13451,13452,1345
5,13456,13457,13458,13459,13460,13461,13462,13463,13464,13465,13466,13467,13468,1104,1106,1107,1108,1109,1110,1111,1112,1113,1114,1115,1116,1117,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128,1129,13812,13813,6740,1490,1491]} 
+    # TODO: build network here
+    cvm_input = fluid.layers.data(name='cvm_input', shape=[sparse_cvm_dim(sparse_cvm)], dtype='float32', stop_gradient=False)
+
+    net = cvm_input
+    net = fluid.layers.fc(net, 511, act='relu', name='fc_1')
+    net = fluid.layers.fc(net, 255, act='relu', name='fc_2')
+    net = fluid.layers.fc(net, 127, act='relu', name='fc_3')
+    net = fluid.layers.fc(net, 127, act='relu', name='fc_4')
+    net = fluid.layers.fc(net, 127, act='relu', name='fc_5')
+
+    ctr_output = fluid.layers.fc(net, 1, act='sigmoid', name='ctr')
+
+    accessors = [
+        { "class": "AbacusSparseUpdateAccessor", "input": "sparses", "table_id": 0, "need_gradient": True},
+        { "class": "DenseInputAccessor", "input": "vars", "table_id": 3, "need_gradient": True, "async_pull": True},
+        { "class": "LabelInputAccessor", "input": "labels"}
+    ]
+
+    return accessors, [sparse_cvm], [cvm_input], [ctr_output]
+
+def loss_function(ctr_output):
+    """Build the CTR loss: mean of the squared error between output and label.
+    Args:
+        ctr_output: output Variable, the element of inference()'s last list
+
+    Returns:
+        Variable: loss (named 'loss_ctr')
+        and
+        list<Variable>: labels (the single 'label_ctr' data layer)
+    """
+    # The label data layer reuses the shape of the output it is compared with.
+
+    label = fluid.layers.data(name='label_ctr', shape=ctr_output.shape, dtype='float32')
+    loss = fluid.layers.square_error_cost(input=ctr_output, label=label)
+    loss = fluid.layers.mean(loss, name='loss_ctr')
+
+    return loss, [label]
diff --git a/paddle/fluid/train/custom_trainer/feed/tool/format_newcate_hotnews.awk b/paddle/fluid/train/custom_trainer/feed/tool/format_newcate_hotnews.awk
new file mode 100755
index 0000000000000000000000000000000000000000..7820d4050110a1e1b59d739c126648d24681dd18
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/tool/format_newcate_hotnews.awk
@@ -0,0 +1,21 @@
+#!/bin/awk -f
+{
+    if ($1 !~ /^([0-9a-zA-Z])+$/ || $2 !~ /^([0-9])+$/ || $3 !~ /^([0-9])+$/) {
+        next;
+    }
+    show = $2;
+    clk = $3;
+    if (clk > show) {
+        clk = show;
+    }
+    for (i = 0; i < clk; i++) {
+        $2 = "1";
+        $3 = "1";
+        print $0;
+    }
+    for (i = 0; i < show - clk; i++) {
+        $2 = "1";
+        $3 = "0";
+        print $0;
+    }
+}
diff --git a/paddle/fluid/train/custom_trainer/feed/tool/gdbinit b/paddle/fluid/train/custom_trainer/feed/tool/gdbinit
new file mode 100644
index 0000000000000000000000000000000000000000..1979250bd0771d5a6ac3e17aeab4187a5605272f
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/tool/gdbinit
@@ -0,0 +1,697 @@
+#                                                                                                        
+#   STL GDB evaluators/views/utilities - 1.03
+#
+#   The new GDB commands:                                                         
+# 	    are entirely non instrumental                                             
+# 	    do not depend on any "inline"(s) - e.g. size(), [], etc
+#       are extremely tolerant to debugger settings
+#                                                                                 
+#   This file should be "included" in .gdbinit as following:
+#   source stl-views.gdb or just paste it into your .gdbinit file
+#
+#   The following STL containers are currently supported:
+#
+#       std::vector<T> -- via pvector command
+#       std::list<T> -- via plist or plist_member command
+#       std::map<T,T> -- via pmap or pmap_member command
+#       std::multimap<T,T> -- via pmap or pmap_member command
+#       std::set<T> -- via pset command
+#       std::multiset<T> -- via pset command
+#       std::deque<T> -- via pdequeue command
+#       std::stack<T> -- via pstack command
+#       std::queue<T> -- via pqueue command
+#       std::priority_queue<T> -- via ppqueue command
+#       std::bitset<n> -- via pbitset command
+#       std::string -- via pstring command
+#       std::wstring -- via pwstring command
+#
+#   The end of this file contains (optional) C++ beautifiers
+#   Make sure your debugger supports $argc
+#
+#   Simple GDB Macros written by Dan Marinescu (H-PhD) - License GPL
+#   Inspired by initial work of Tom Malnar, 
+#     Tony Novac (PhD) / Cornell / Stanford,
+#     Gilad Mishne (PhD) and Many Many Others.
+#   Contact: dan_c_marinescu@yahoo.com (Subject: STL)
+#
+#   Modified to work with g++ 4.3 by Anders Elton
+#   Also added _member functions, that instead of printing the entire class in map, prints a member.
+
+
+
+#
+# std::vector<>
+#
+
+define pvector
+	if $argc == 0
+		help pvector
+	else
+		set $size = $arg0._M_impl._M_finish - $arg0._M_impl._M_start
+		set $capacity = $arg0._M_impl._M_end_of_storage - $arg0._M_impl._M_start
+		set $size_max = $size - 1
+	end
+	if $argc == 1
+		set $i = 0
+		while $i < $size
+			printf "elem[%u]: ", $i
+			p *($arg0._M_impl._M_start + $i)
+			set $i++
+		end
+	end
+	if $argc == 2
+		set $idx = $arg1
+		if $idx < 0 || $idx > $size_max
+			printf "idx1, idx2 are not in acceptable range: [0..%u].\n", $size_max
+		else
+			printf "elem[%u]: ", $idx
+			p *($arg0._M_impl._M_start + $idx)
+		end
+	end
+	if $argc == 3
+	  set $start_idx = $arg1
+	  set $stop_idx = $arg2
+	  if $start_idx > $stop_idx
+	    set $tmp_idx = $start_idx
+	    set $start_idx = $stop_idx
+	    set $stop_idx = $tmp_idx
+	  end
+	  if $start_idx < 0 || $stop_idx < 0 || $start_idx > $size_max || $stop_idx > $size_max
+	    printf "idx1, idx2 are not in acceptable range: [0..%u].\n", $size_max
+	  else
+	    set $i = $start_idx
+		while $i <= $stop_idx
+			printf "elem[%u]: ", $i
+			p *($arg0._M_impl._M_start + $i)
+			set $i++
+		end
+	  end
+	end
+	if $argc > 0
+		printf "Vector size = %u\n", $size
+		printf "Vector capacity = %u\n", $capacity
+		printf "Element "
+		whatis $arg0._M_impl._M_start
+	end
+end
+
+document pvector
+	Prints std::vector<T> information.
+	Syntax: pvector <vector> <idx1> <idx2>
+	Note: idx, idx1 and idx2 must be in acceptable range [0..<vector>.size()-1].
+	Examples:
+	pvector v - Prints vector content, size, capacity and T typedef
+	pvector v 0 - Prints element[idx] from vector
+	pvector v 1 2 - Prints elements in range [idx1..idx2] from vector
+end 
+
+#
+# std::list<>
+#
+
+define plist
+	if $argc == 0
+		help plist
+	else
+		set $head = &$arg0._M_impl._M_node
+		set $current = $arg0._M_impl._M_node._M_next
+		set $size = 0
+		while $current != $head
+			if $argc == 2
+				printf "elem[%u]: ", $size
+				p *($arg1*)($current + 1)
+			end
+			if $argc == 3
+				if $size == $arg2
+					printf "elem[%u]: ", $size
+					p *($arg1*)($current + 1)
+				end
+			end
+			set $current = $current._M_next
+			set $size++
+		end
+		printf "List size = %u \n", $size
+		if $argc == 1
+			printf "List "
+			whatis $arg0
+			printf "Use plist <variable_name> <element_type> to see the elements in the list.\n"
+		end
+	end
+end
+
+document plist
+	Prints std::list<T> information.
+	Syntax: plist <list> <T> <idx>: Prints list size, if T defined all elements or just element at idx
+	Examples:
+	plist l - prints list size and definition
+	plist l int - prints all elements and list size
+	plist l int 2 - prints the third element in the list (if exists) and list size
+end
+
+define plist_member
+	if $argc == 0
+		help plist_member
+	else
+		set $head = &$arg0._M_impl._M_node
+		set $current = $arg0._M_impl._M_node._M_next
+		set $size = 0
+		while $current != $head
+			if $argc == 3
+				printf "elem[%u]: ", $size
+				p (*($arg1*)($current + 1)).$arg2
+			end
+			if $argc == 4
+				if $size == $arg3
+					printf "elem[%u]: ", $size
+					p (*($arg1*)($current + 1)).$arg2
+				end
+			end
+			set $current = $current._M_next
+			set $size++
+		end
+		printf "List size = %u \n", $size
+		if $argc == 1
+			printf "List "
+			whatis $arg0
+			printf "Use plist_member <variable_name> <element_type> <member> to see the elements in the list.\n"
+		end
+	end
+end
+
+document plist_member
+	Prints std::list<T> information.
+	Syntax: plist_member <list> <T> <member> <idx>: Prints list size, if T and member defined that member of all elements or just of the element at idx
+	Examples:
+	plist_member l int member - prints all elements and list size
+	plist_member l int member 2 - prints the third element in the list (if exists) and list size
+end
+
+
+#
+# std::map and std::multimap
+#
+
+define pmap
+	if $argc == 0
+		help pmap
+	else
+		set $tree = $arg0
+		set $i = 0
+		set $node = $tree._M_t._M_impl._M_header._M_left
+		set $end = $tree._M_t._M_impl._M_header
+		set $tree_size = $tree._M_t._M_impl._M_node_count
+		if $argc == 1
+			printf "Map "
+			whatis $tree
+			printf "Use pmap <variable_name> <left_element_type> <right_element_type> to see the elements in the map.\n"
+		end
+		if $argc == 3
+			while $i < $tree_size
+				set $value = (void *)($node + 1)
+				printf "elem[%u].left: ", $i
+				p *($arg1*)$value
+				set $value = $value + sizeof($arg1)
+				printf "elem[%u].right: ", $i
+				p *($arg2*)$value
+				if $node._M_right != 0
+					set $node = $node._M_right
+					while $node._M_left != 0
+						set $node = $node._M_left
+					end
+				else
+					set $tmp_node = $node._M_parent
+					while $node == $tmp_node._M_right
+						set $node = $tmp_node
+						set $tmp_node = $tmp_node._M_parent
+					end
+					if $node._M_right != $tmp_node
+						set $node = $tmp_node
+					end
+				end
+				set $i++
+			end
+		end
+		if $argc == 4
+			set $idx = $arg3
+			set $ElementsFound = 0
+			while $i < $tree_size
+				set $value = (void *)($node + 1)
+				if *($arg1*)$value == $idx
+					printf "elem[%u].left: ", $i
+					p *($arg1*)$value
+					set $value = $value + sizeof($arg1)
+					printf "elem[%u].right: ", $i
+					p *($arg2*)$value
+					set $ElementsFound++
+				end
+				if $node._M_right != 0
+					set $node = $node._M_right
+					while $node._M_left != 0
+						set $node = $node._M_left
+					end
+				else
+					set $tmp_node = $node._M_parent
+					while $node == $tmp_node._M_right
+						set $node = $tmp_node
+						set $tmp_node = $tmp_node._M_parent
+					end
+					if $node._M_right != $tmp_node
+						set $node = $tmp_node
+					end
+				end
+				set $i++
+			end
+			printf "Number of elements found = %u\n", $ElementsFound
+		end
+		if $argc == 5
+			set $idx1 = $arg3
+			set $idx2 = $arg4
+			set $ElementsFound = 0
+			while $i < $tree_size
+				set $value = (void *)($node + 1)
+				set $valueLeft = *($arg1*)$value
+				set $valueRight = *($arg2*)($value + sizeof($arg1))
+				if $valueLeft == $idx1 && $valueRight == $idx2
+					printf "elem[%u].left: ", $i
+					p $valueLeft
+					printf "elem[%u].right: ", $i
+					p $valueRight
+					set $ElementsFound++
+				end
+				if $node._M_right != 0
+					set $node = $node._M_right
+					while $node._M_left != 0
+						set $node = $node._M_left
+					end
+				else
+					set $tmp_node = $node._M_parent
+					while $node == $tmp_node._M_right
+						set $node = $tmp_node
+						set $tmp_node = $tmp_node._M_parent
+					end
+					if $node._M_right != $tmp_node
+						set $node = $tmp_node
+					end
+				end
+				set $i++
+			end
+			printf "Number of elements found = %u\n", $ElementsFound
+		end
+		printf "Map size = %u\n", $tree_size
+	end
+end
+
+document pmap
+	Prints std::map<TLeft and TRight> or std::multimap<TLeft and TRight> information. Works for std::multimap as well.
+	Syntax: pmap <map> <TypeLeft> <TypeRight> <valLeft> <valRight>: Prints map size, if T defined all elements or just element(s) with val(s)
+	Examples:
+	pmap m - prints map size and definition
+	pmap m int int - prints all elements and map size
+	pmap m int int 20 - prints the element(s) with left-value = 20 (if any) and map size
+	pmap m int int 20 200 - prints the element(s) with left-value = 20 and right-value = 200 (if any) and map size
+end
+
+
+define pmap_member
+	if $argc == 0
+		help pmap_member
+	else
+		set $tree = $arg0
+		set $i = 0
+		set $node = $tree._M_t._M_impl._M_header._M_left
+		set $end = $tree._M_t._M_impl._M_header
+		set $tree_size = $tree._M_t._M_impl._M_node_count
+		if $argc == 1
+			printf "Map "
+			whatis $tree
+			printf "Use pmap <variable_name> <left_element_type> <right_element_type> to see the elements in the map.\n"
+		end
+		if $argc == 5
+			while $i < $tree_size
+				set $value = (void *)($node + 1)
+				printf "elem[%u].left: ", $i
+				p (*($arg1*)$value).$arg2
+				set $value = $value + sizeof($arg1)
+				printf "elem[%u].right: ", $i
+				p (*($arg3*)$value).$arg4
+				if $node._M_right != 0
+					set $node = $node._M_right
+					while $node._M_left != 0
+						set $node = $node._M_left
+					end
+				else
+					set $tmp_node = $node._M_parent
+					while $node == $tmp_node._M_right
+						set $node = $tmp_node
+						set $tmp_node = $tmp_node._M_parent
+					end
+					if $node._M_right != $tmp_node
+						set $node = $tmp_node
+					end
+				end
+				set $i++
+			end
+		end
+		if $argc == 6
+			set $idx = $arg5
+			set $ElementsFound = 0
+			while $i < $tree_size
+				set $value = (void *)($node + 1)
+				if *($arg1*)$value == $idx
+					printf "elem[%u].left: ", $i
+					p (*($arg1*)$value).$arg2
+					set $value = $value + sizeof($arg1)
+					printf "elem[%u].right: ", $i
+					p (*($arg3*)$value).$arg4
+					set $ElementsFound++
+				end
+				if $node._M_right != 0
+					set $node = $node._M_right
+					while $node._M_left != 0
+						set $node = $node._M_left
+					end
+				else
+					set $tmp_node = $node._M_parent
+					while $node == $tmp_node._M_right
+						set $node = $tmp_node
+						set $tmp_node = $tmp_node._M_parent
+					end
+					if $node._M_right != $tmp_node
+						set $node = $tmp_node
+					end
+				end
+				set $i++
+			end
+			printf "Number of elements found = %u\n", $ElementsFound
+		end
+		printf "Map size = %u\n", $tree_size
+	end
+end
+
+document pmap_member
+	Prints std::map<TLeft and TRight> or std::multimap<TLeft and TRight> information. Works for std::multimap as well.
+	Syntax: pmap_member <map> <TypeLeft> <memberLeft> <TypeRight> <memberRight> <valLeft>: Prints map size, if types and members defined those members of all elements or just of element(s) whose left value equals valLeft
+	Examples:
+	pmap_member m class1 member1 class2 member2 - prints class1.member1 : class2.member2
+	pmap_member m class1 member1 class2 member2 lvalue - prints class1.member1 : class2.member2 where class1 == lvalue
+end
+
+
+#
+# std::set and std::multiset
+#
+
+define pset
+	if $argc == 0
+		help pset
+	else
+		set $tree = $arg0
+		set $i = 0
+		set $node = $tree._M_t._M_impl._M_header._M_left
+		set $end = $tree._M_t._M_impl._M_header
+		set $tree_size = $tree._M_t._M_impl._M_node_count
+		if $argc == 1
+			printf "Set "
+			whatis $tree
+			printf "Use pset <variable_name> <element_type> to see the elements in the set.\n"
+		end
+		if $argc == 2
+			while $i < $tree_size
+				set $value = (void *)($node + 1)
+				printf "elem[%u]: ", $i
+				p *($arg1*)$value
+				if $node._M_right != 0
+					set $node = $node._M_right
+					while $node._M_left != 0
+						set $node = $node._M_left
+					end
+				else
+					set $tmp_node = $node._M_parent
+					while $node == $tmp_node._M_right
+						set $node = $tmp_node
+						set $tmp_node = $tmp_node._M_parent
+					end
+					if $node._M_right != $tmp_node
+						set $node = $tmp_node
+					end
+				end
+				set $i++
+			end
+		end
+		if $argc == 3
+			set $idx = $arg2
+			set $ElementsFound = 0
+			while $i < $tree_size
+				set $value = (void *)($node + 1)
+				if *($arg1*)$value == $idx
+					printf "elem[%u]: ", $i
+					p *($arg1*)$value
+					set $ElementsFound++
+				end
+				if $node._M_right != 0
+					set $node = $node._M_right
+					while $node._M_left != 0
+						set $node = $node._M_left
+					end
+				else
+					set $tmp_node = $node._M_parent
+					while $node == $tmp_node._M_right
+						set $node = $tmp_node
+						set $tmp_node = $tmp_node._M_parent
+					end
+					if $node._M_right != $tmp_node
+						set $node = $tmp_node
+					end
+				end
+				set $i++
+			end
+			printf "Number of elements found = %u\n", $ElementsFound
+		end
+		printf "Set size = %u\n", $tree_size
+	end
+end
+
+document pset
+	Prints std::set<T> or std::multiset<T> information. Works for std::multiset as well.
+	Syntax: pset <set> <T> <val>: Prints set size, if T defined all elements or just element(s) having val
+	Examples:
+	pset s - prints set size and definition
+	pset s int - prints all elements and the size of s
+	pset s int 20 - prints the element(s) with value = 20 (if any) and the size of s
+end
+
+
+
+#
+# std::deque
+#
+
+define pdequeue
+	if $argc == 0
+		help pdequeue
+	else
+		set $size = 0
+		set $start_cur = $arg0._M_impl._M_start._M_cur
+		set $start_last = $arg0._M_impl._M_start._M_last
+		set $start_stop = $start_last
+		while $start_cur != $start_stop
+			p *$start_cur
+			set $start_cur++
+			set $size++
+		end
+		set $finish_first = $arg0._M_impl._M_finish._M_first
+		set $finish_cur = $arg0._M_impl._M_finish._M_cur
+		set $finish_last = $arg0._M_impl._M_finish._M_last
+		if $finish_cur < $finish_last
+			set $finish_stop = $finish_cur
+		else
+			set $finish_stop = $finish_last
+		end
+		while $finish_first != $finish_stop
+			p *$finish_first
+			set $finish_first++
+			set $size++
+		end
+		printf "Dequeue size = %u\n", $size
+	end
+end
+
+document pdequeue
+	Prints std::deque<T> information.
+	Syntax: pdequeue <dequeue>: Prints dequeue size, if T defined all elements
+	Deque elements are listed "left to right" (left-most stands for front and right-most stands for back)
+	Example:
+	pdequeue d - prints all elements and size of d
+end
+
+
+
+#
+# std::stack
+#
+
+define pstack
+	if $argc == 0
+		help pstack
+	else
+		set $start_cur = $arg0.c._M_impl._M_start._M_cur
+		set $finish_cur = $arg0.c._M_impl._M_finish._M_cur
+		set $size = $finish_cur - $start_cur
+        set $i = $size - 1
+        while $i >= 0
+            p *($start_cur + $i)
+            set $i--
+        end
+		printf "Stack size = %u\n", $size
+	end
+end
+
+document pstack
+	Prints std::stack<T> information.
+	Syntax: pstack <stack>: Prints all elements and size of the stack
+	Stack elements are listed "top to bottom" (top-most element is the first to come on pop)
+	Example:
+	pstack s - prints all elements and the size of s
+end
+
+
+
+#
+# std::queue
+#
+
+define pqueue
+	if $argc == 0
+		help pqueue
+	else
+		set $start_cur = $arg0.c._M_impl._M_start._M_cur
+		set $finish_cur = $arg0.c._M_impl._M_finish._M_cur
+		set $size = $finish_cur - $start_cur
+        set $i = 0
+        while $i < $size
+            p *($start_cur + $i)
+            set $i++
+        end
+		printf "Queue size = %u\n", $size
+	end
+end
+
+document pqueue
+	Prints std::queue<T> information.
+	Syntax: pqueue <queue>: Prints all elements and the size of the queue
+	Queue elements are listed "top to bottom" (top-most element is the first to come on pop)
+	Example:
+	pqueue q - prints all elements and the size of q
+end
+
+
+
+#
+# std::priority_queue
+#
+
+define ppqueue
+	if $argc == 0
+		help ppqueue
+	else
+		set $size = $arg0.c._M_impl._M_finish - $arg0.c._M_impl._M_start
+		set $capacity = $arg0.c._M_impl._M_end_of_storage - $arg0.c._M_impl._M_start
+		set $i = $size - 1
+		while $i >= 0
+			p *($arg0.c._M_impl._M_start + $i)
+			set $i--
+		end
+		printf "Priority queue size = %u\n", $size
+		printf "Priority queue capacity = %u\n", $capacity
+	end
+end
+
+document ppqueue
+	Prints std::priority_queue<T> information.
+	Syntax: ppqueue <priority_queue>: Prints all elements, size and capacity of the priority_queue
+	Priority_queue elements are listed "top to bottom" (top-most element is the first to come on pop)
+	Example:
+	ppqueue pq - prints all elements, size and capacity of pq
+end
+
+
+
+#
+# std::bitset
+#
+
+define pbitset
+	if $argc == 0
+		help pbitset
+	else
+        p /t $arg0._M_w
+	end
+end
+
+document pbitset
+	Prints std::bitset<n> information.
+	Syntax: pbitset <bitset>: Prints all bits in bitset
+	Example:
+	pbitset b - prints all bits in b
+end
+
+
+
+#
+# std::string
+#
+
+define pstring
+	if $argc == 0
+		help pstring
+	else
+		printf "String \t\t\t= \"%s\"\n", $arg0._M_data()
+		printf "String size/length \t= %u\n", $arg0._M_rep()._M_length
+		printf "String capacity \t= %u\n", $arg0._M_rep()._M_capacity
+		printf "String ref-count \t= %d\n", $arg0._M_rep()._M_refcount
+	end
+end
+
+document pstring
+	Prints std::string information.
+	Syntax: pstring <string>
+	Example:
+	pstring s - Prints content, size/length, capacity and ref-count of string s
+end 
+
+#
+# std::wstring
+#
+
+define pwstring
+	if $argc == 0
+		help pwstring
+	else
+		call printf("WString \t\t= \"%ls\"\n", $arg0._M_data())
+		printf "WString size/length \t= %u\n", $arg0._M_rep()._M_length
+		printf "WString capacity \t= %u\n", $arg0._M_rep()._M_capacity
+		printf "WString ref-count \t= %d\n", $arg0._M_rep()._M_refcount
+	end
+end
+
+document pwstring
+	Prints std::wstring information.
+	Syntax: pwstring <wstring>
+	Example:
+	pwstring s - Prints content, size/length, capacity and ref-count of wstring s
+end 
+
+#
+# C++ related beautifiers (optional)
+#
+
+set print pretty on
+set print object on
+set print static-members on
+set print vtbl on
+set print demangle on
+set demangle-style gnu-v3
+set print sevenbit-strings off
+
+set follow-fork-mode child
+set detach-on-fork off
diff --git a/paddle/fluid/train/custom_trainer/feed/tool/ins_weight.py b/paddle/fluid/train/custom_trainer/feed/tool/ins_weight.py
new file mode 100755
index 0000000000000000000000000000000000000000..8b4d87c34300aaea048c07fd9e9c50aa70e3a07c
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/tool/ins_weight.py
@@ -0,0 +1,122 @@
+#!/usr/bin/python
+import sys
+import re
+import math
+
+del_text_slot = True
+g_ratio = 1
+w_ratio = 0.01
+slots_str = "6048 6145 6202 6201 6121 6119 6146 6120 6147 6122 6123 6118 6142 6143 6008 6148 6151 6127 6144 6150 6109 6003 6096 6149 6129 6203 6153 6152 6128 6106 6251 7082 7515 7080 6066 7507 6186 6007 7514 6054 6125 7506 10001 6006 6080 7023 6085 10000 6250 6110 6124 6090 6082 6067 7516 6101 6004 6191 6188 6070 6194 6247 6814 7512 10007 6058 6189 6059 7517 10005 7510 7024 7502 7503 6183 7511 6060 6806 7504 6185 6810 6248 10004 6815 6182 10068 6069 6073 6196 6816 7513 6071 6809 6072 6817 6190 7505 6813 6192 6807 6808 6195 6826 6184 6197 6068 6812 7107 6811 6823 6824 6819 6818 6821 6822 6820 6094 6083 6952 6099 6951 6949 6098 7075 6948 6157 6126 7077 6111 6087 6103 6107 6156 6005 6158 7122 6155 7058 6115 7079 7081 6833 6108 6840 6837 7147 7129 6097 6231 6957 7145 6956 7143 6130 7149 7142 6212 6827 7144 6089 6161 7055 6233 6105 7057 6237 6828 6850 6163 7124 6354 6162 7146 6830 7123 6160 6235 7056 6081 6841 6132 6954 6131 6236 6831 6845 6832 6953 6839 6950 7125 7054 6138 6166 6076 6851 6353 7076 7148 6858 6842 6860 7126 6829 6835 7078 6866 6869 6871 7052 6134 6855 6947 6862 6215 6852 7128 6092 6112 6213 6232 6863 6113 6165 6214 6216 6873 6865 6870 6077 6234 6861 6164 6217 7127 6218 6962 7053 7051 6961 6002 6738 6739 10105 7064 6751 6770 7100 6014 6765 6755 10021 10022 6010 10056 6011 6756 10055 6768 10024 6023 10003 6769 10002 6767 6759 10018 6024 6064 6012 6050 10042 6168 6253 10010 10020 6015 6018 10033 10041 10039 10031 10016 6764 7083 7152 7066 6171 7150 7085 6255 10044 10008 7102 6167 6240 6238 6095 10017 10046 6019 6031 6763 6256 6169 6254 10034 7108 7186 6257 10019 6757 10040 6025 7019 7086 10029 10011 7104 6261 6013 6766 10106 7105 7153 7089 6057 7134 7151 7045 7005 7008 7101 6035 7137 10023 6036 6172 7099 7087 6239 7185 6170 10006 6243 6350 7103 7090 7157 6259 7171 6875 7084 7154 6242 6260 7155 7017 7048 7156 6959 7047 10053 7135 6244 7136 10030 7063 6760 7016 7065 7179 6881 7018 6876 10081 10052 10054 10038 6886 10069 7004 10051 7007 7109 10057 6029 6888 
10009 6889 7021 10047 6245 6878 10067 6879 6884 7180 7182 10071 7002 6880 6890 6887 10061 6027 6877 6892 10060 6893 7050 10036 7049 10012 10025 7012 7183 10058 7181 10086 6891 6258 6894 6883 7046 6037 7106 10043 10048 10045 10087 6885 10013 10028 7187 10037 10035 10050 6895 7011 7170 7172 10026 10063 10095 10082 10084 6960 10092 10075 6038 7010 7015 10015 10027 10064 7184 10014 10059 7013 7020 10072 10066 10080 6896 10083 10090 6039 10049 7164 7165 10091 10099 6963 7166 10079 10103 7006 7009 7169 6034 7028 7029 7030 7034 7035 7036 7040 7041 7042 10032 6009 6241 7003 7014 7088 13326 13330 13331 13352 13353 6198"
+slot_whitelist = slots_str.split(" ")
+
+def calc_ins_weight(params, label):
+    """calc ins weight"""
+    global g_ratio
+    global w_ratio
+    slots = []
+    s_clk_num = 0
+    s_show_num = 0
+    active = 0
+    attclk_num = 0
+    attshow_num = 0
+    attclk_avg = 0
+    for items in params:
+        if len(items) != 2:
+            continue
+        slot_name = items[0]
+        slot_val = items[1]
+        if slot_name not in slots:
+            slots.append(slot_name)
+        if slot_name == "session_click_num":
+            s_clk_num = int(slot_val)
+        if slot_name == "session_show_num":
+            s_show_num = int(slot_val)
+        if slot_name == "activity":
+            active = float(slot_val) / 10000.0
+    w = 1
+    # for inactive user 
+    if active >= 0 and active < 0.4 and s_show_num >=0 and s_show_num < 20:
+        w = math.log(w_ratio * (420 - (active * 50 + 1) * (s_show_num + 1)) + math.e)
+        if label == "0":
+            w = 1 + (w - 1) * g_ratio
+    return w
+
+def filter_whitelist_slot(tmp_line):
+    terms = tmp_line.split()
+    line = "%s %s %s" % (terms[0], terms[1], terms[2])
+    for item in terms[3:]:
+        feasign = item.split(':')
+        if len(feasign) == 2 and \
+            feasign[1] in slot_whitelist:
+            line = "%s %s" %(line, item)
+    return line
+
+def get_sample_type(line):
+    # vertical_type = 20
+    # if line.find("13038012583501790:6738") > 0:
+    #     return 30
+    # vertical_type = 0/5/1/2/9/11/13/16/29/-1
+    if (line.find("7408512894065610:6738") > 0) or \
+        (line.find("8815887816424655:6738") > 0) or \
+        (line.find("7689987878537419:6738") > 0) or \
+        (line.find("7971462863009228:6738") > 0) or \
+        (line.find("9941787754311891:6738") > 0) or \
+        (line.find("10504737723255509:6738") > 0) or \
+        (line.find("11067687692199127:6738") > 0) or \
+        (line.find("11912112645614554:6738") > 0) or \
+        (line.find("15571287443748071:6738") > 0) or \
+        (line.find("7127025017546227:6738") > 0): 
+        return 20
+    return -1
+
+def main():
+    """ins adjust"""
+    global del_text_slot
+    for l in sys.stdin:
+        l = l.rstrip("\n")
+        items = l.split(" ")
+        if len(items) < 3:
+            continue
+        label = items[2]
+        lines = l.split("\t")
+        line = lines[0]
+        # the stream contains all instances; sample_type is only computed for NEWS instances
+        sample_type = -1
+        if 'NEWS' in l:
+            sample_type = get_sample_type(line)
+        #line = filter_whitelist_slot(tmp_line)
+        if len(lines) >= 4:
+            if 'VIDEO' in lines[3]:
+                continue
+            params = lines[2]
+            params = params.split(" ")
+            m = [tuple(i.split(":")) for i in params]
+            if m is None or len(m) == 0:
+                if sample_type > 0:
+                    print "%s $%s *1" % (line, sample_type)
+                else:
+                    print "%s *1" % line
+                sys.stdout.flush()
+                continue
+            weight = calc_ins_weight(m, label)
+            if sample_type > 0:
+                print "%s $%s *%s" % (line, sample_type, weight)
+            else:
+                print "%s *%s" % (line, weight)
+            sys.stdout.flush()
+        else:
+            if sample_type > 0:
+                print "%s $%s *1" % (line, sample_type)
+            else:
+                print "%s *1" % line
+            sys.stdout.flush()
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        if sys.argv[1] == "0":
+            del_text_slot = False
+        if len(sys.argv) > 2:
+            g_ratio = float(sys.argv[2])
+        if len(sys.argv) > 3:
+            w_ratio = float(sys.argv[3])
+    main()
diff --git a/paddle/fluid/train/custom_trainer/feed/tool/xbox_compressor_mf.py b/paddle/fluid/train/custom_trainer/feed/tool/xbox_compressor_mf.py
new file mode 100755
index 0000000000000000000000000000000000000000..b306ddfeb183515c7652b2f0d08cbe98f95033b4
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/tool/xbox_compressor_mf.py
@@ -0,0 +1,162 @@
+#!/usr/bin/python
+"""
+xbox model compressor
+"""
+
+import sys
+import math
+import time
+import re
+
+#WISE
+#SHOW_COMPRESS_RATIO : 8192
+#CLICK_COMPRESS_RATIO : 8192
+#LR_COMPRESS_RATIO : 1048576
+#MIO_COMPRESS_RATIO:8192
+
+#PC
+#MIO_COMPRESS_RATIO : 1024
+#SHOW_COMPRESS_RATIO : 128
+#CLICK_COMPRESS_RATIO : 1024
+#LR_COMPRESS_RATIO : 8192
+
+#STAMP_COL = 2
+SHOW_COL = 3  # index of the show value in the whitespace-split input columns
+CLICK_COL = 4  # index of the click value
+LR_W_COL = 5  # index of the lr weight
+LR_G2SUM_COL = 6  # NOTE(review): not referenced anywhere in this script
+FM_COL = 9  # index of the first mf value; every column from here on is read as mf
+
+#DAY_SPAN = 300
+
+# Scaling factors: compress_* multiplies each value by its ratio and rounds half up.
+SHOW_RATIO = 1
+#SHOW_RATIO = 1024
+CLK_RATIO = 8
+#CLK_RATIO = 1024
+LR_RATIO = 1024
+MF_RATIO = 1024
+
+base_update_threshold=0.965  # only appears in a commented-out filter in the main loop
+base_xbox_clk_cof=1  # presumably the click weight of the show/click score - TODO confirm
+base_xbox_nonclk_cof=0.2  # presumably the non-click weight of the show/click score - TODO confirm
+
+def as_num(x):
+    """Format x as a string with five digits after the decimal point."""
+    return format(x, '.5f')
+
+def compress_show(xx):
+    """
+    Multiply the show value by SHOW_RATIO, round half up, return it as a string.
+    """
+    scaled = float(xx) * SHOW_RATIO
+    rounded = int(math.floor(scaled + 0.5))
+
+    return str(rounded)
+
+
+def compress_clk(xx):
+    """
+    Multiply the click value by CLK_RATIO and round half up.
+
+    Returns the result as a string, or "" when it rounds to zero.
+    """
+    quantized = int(math.floor(float(xx) * CLK_RATIO + 0.5))
+    if quantized != 0:
+        return str(quantized)
+
+    return ""
+
+
+def compress_lr(xx):
+    """
+    Multiply the lr weight by LR_RATIO and round half up.
+
+    Returns the result as a string, or "" when it rounds to zero.
+    """
+    scaled = float(xx) * LR_RATIO
+    quantized = int(math.floor(scaled + 0.5))
+
+    # a weight that rounds to zero is emitted as an empty field
+    return str(quantized) if quantized else ""
+
+def compress_mf(xx):
+    """
+    Multiply an mf value by MF_RATIO and round half up.
+
+    Unlike the other compress_* helpers this returns an int, not a string.
+    """
+    scaled = float(xx) * MF_RATIO
+    return int(math.floor(scaled + 0.5))
+
+
+def show_clk_score(show, clk):
+    """
+    Show/click score: (show - clk) weighted by base_xbox_nonclk_cof plus clk weighted by base_xbox_clk_cof.
+    """
+    return (show - clk) * base_xbox_nonclk_cof + clk * base_xbox_clk_cof
+
+
+for l in sys.stdin:  # stream records from stdin and write one compressed record per input line
+    cols = re.split(r'\s+', l.strip())  # columns are separated by runs of whitespace
+    key = cols[0].strip()
+
+    #day = int(cols[STAMP_COL].strip())
+    #cur_day = int(time.time()/3600/24)
+    #if (day + DAY_SPAN) <= cur_day:
+    #    continue
+
+    # cvm features
+    show = cols[SHOW_COL]
+    click = cols[CLICK_COL]
+    pred = ""  # the pred field is always written empty
+
+    f_show = float(show)
+    f_clk = float(click)
+    # Disabled log transform, kept for reference:
+    # if f_show != 0:
+    #     show_log = math.log(f_show)
+    # else:
+    #     show_log = 0
+    #
+    # if f_clk != 0:
+    #     click_log =  math.log(f_clk) - show_log
+    # else:
+    #     click_log = 0
+    # The raw show/click values are compressed instead.
+    show_log = f_show
+    click_log = f_clk
+    #print f_show, f_clk
+    #if show_clk_score(f_show, f_clk) < base_update_threshold:
+    #    continue
+
+    #show = compress_show(show)
+    show = compress_show(show_log)
+    #clk = compress_clk(click)
+    clk = compress_clk(click_log)
+
+    # personal lr weight
+    lr_w = cols[LR_W_COL].strip()
+    lr_wei = compress_lr(lr_w)
+
+    # fm weight
+    fm_wei = []
+    fm_sum = 0
+    if len(cols) > FM_COL:  # mf values start at FM_COL; shorter records carry none
+    #fm_dim = int(cols[FM_COL].strip())
+    #if fm_dim != 0:
+        for v in xrange(FM_COL, len(cols), 1):
+            mf_v = compress_mf(cols[v])
+            #print mf_v
+            fm_wei.append(str(mf_v))
+            fm_sum += (mf_v * mf_v)  # sum of squares: zero only when every mf value rounds to 0
+
+    sys.stdout.write("%s\t%s\t%s\t%s" % (key, show, clk, pred))
+    sys.stdout.write("\t")
+    sys.stdout.write("%s" % lr_wei)
+    if len(fm_wei) > 0 and fm_sum > 0:
+        sys.stdout.write("\t%s" % "\t".join(fm_wei))
+    else:
+        sys.stdout.write("\t[\t]")  # placeholder for a missing or all-zero mf vector
+    sys.stdout.write("\n")
+
diff --git a/paddle/fluid/train/custom_trainer/feed/tool/xbox_decompressor_mf.awk b/paddle/fluid/train/custom_trainer/feed/tool/xbox_decompressor_mf.awk
new file mode 100755
index 0000000000000000000000000000000000000000..61b2f831cf8354dc5ee95487b3bfd0b280ceb998
--- /dev/null
+++ b/paddle/fluid/train/custom_trainer/feed/tool/xbox_decompressor_mf.awk
@@ -0,0 +1,52 @@
+#!/bin/awk -f
+BEGIN {  # run once before any input is read instead of once per record
+    OFS="\t";
+    SHOW_RATIO = 1;     # the ratios carry the same values as in xbox_compressor_mf.py
+    CLK_RATIO = 8;
+    LR_RATIO = 1024;
+    MF_RATIO = 1024;
+}
+
+function decompress_show(x) {
+    # Divide the stored show value by SHOW_RATIO to undo the compressor's scaling.
+    return x * 1.0 / SHOW_RATIO;
+}
+
+function decompress_clk(x) {
+    # The compressor writes an empty field when the click count rounds to zero,
+    # so map "" back to 0 before undoing the CLK_RATIO scaling.
+    if (x == "")
+        x = 0;
+    return x * 1.0 / CLK_RATIO;
+}
+
+function decompress_lr(x) {
+    return (1.0 * x) / LR_RATIO;    # undo LR_RATIO scaling; an empty field evaluates to 0
+}
+
+function decompress_mf(x) {
+    return (1.0 * x) / MF_RATIO;    # undo the MF_RATIO scaling of one mf value
+}
+
+function show_clk_sore(show, clk, nonclk_coeff, clk_coeff) {
+    return clk * clk_coeff + (show - clk) * nonclk_coeff;    # weighted score; not called in this script
+}
+
+# Input fields: key, show, clk, pred, lr_w, then either mf weights or the "[", "]" placeholder pair.
+{
+    l=split($0, a, "\t");
+
+    show = decompress_show(a[2]);
+    click = decompress_clk(a[3]);
+    lr = decompress_lr(a[5]);
+    printf("%s\t0\t0\t%s\t%s\t%s\t0\t", a[1], show, click, lr);
+    if (l == 7 && a[6] == "[" && a[7] == "]") {  # explicit "no mf" placeholder, not a 2-value mf vector
+        printf("0\n");
+    } else {
+        printf("%d", l-5)                         # number of mf values that follow
+        for(i = 6; i <= l; i++) {
+            printf("\t%s", decompress_mf(a[i]));
+        }
+        printf("\t0\n");
+    }
+}
diff --git a/paddle/fluid/train/custom_trainer/feed/tool/xbox_pb_converter b/paddle/fluid/train/custom_trainer/feed/tool/xbox_pb_converter
new file mode 100755
index 0000000000000000000000000000000000000000..04d925a88cd58ee3719a34b3e51fcacd3c8757da
Binary files /dev/null and b/paddle/fluid/train/custom_trainer/feed/tool/xbox_pb_converter differ
diff --git a/paddle/fluid/train/custom_trainer/feed/tool/xbox_pb_deconverter b/paddle/fluid/train/custom_trainer/feed/tool/xbox_pb_deconverter
new file mode 100755
index 0000000000000000000000000000000000000000..e0c1a18c4ddce4badfa14f1c68e7055a74aeb158
Binary files /dev/null and b/paddle/fluid/train/custom_trainer/feed/tool/xbox_pb_deconverter differ
diff --git a/paddle/fluid/train/custom_trainer/feed/trainer_context.h b/paddle/fluid/train/custom_trainer/feed/trainer_context.h
index 1cde2d2bad7a822b2c17007bce0b421a7df7fc9c..f9c86a45a307791d6700e8bac9759e2184022368 100755
--- a/paddle/fluid/train/custom_trainer/feed/trainer_context.h
+++ b/paddle/fluid/train/custom_trainer/feed/trainer_context.h
@@ -2,8 +2,9 @@
 #include <string>
 #include <memory>
 #include <vector>
-#include <yaml-cpp/yaml.h>
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/train/custom_trainer/feed/common/yaml_helper.h"
+#include "paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.h"
 #include "paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h"
 
 
@@ -16,23 +17,16 @@ class Dataset;
 class FileSystem;
 class EpochAccessor;
 
+const uint32_t SecondsPerMin = 60;          // namespace-scope const: internal linkage, one copy per translation unit
+const uint32_t SecondsPerHour = 3600;       // 60 * 60
+const uint32_t SecondsPerDay = 24 * 3600;   // 24 hours of 3600 seconds
+
 enum class ModelSaveWay {
     ModelSaveTrainCheckpoint = 0,
     ModelSaveInferenceDelta = 1,
     ModelSaveInferenceBase = 2
 };
 
-class TableMeta {
-public:
-    TableMeta() {}
-    ~TableMeta() {}
-    int table_id() {
-        return _id;
-    }
-private:
-    int _id;
-};
-
 class SignCacheDict {
 public:
     int32_t sign2index(uint64_t sign) {
@@ -46,16 +40,20 @@ public:
 
 class TrainerContext {
 public:
-YAML::Node trainer_config;
-paddle::platform::CPUPlace cpu_place;
-
-std::shared_ptr<Dataset> dataset;                          //训练样本
-std::shared_ptr<FileSystem> file_system;                   //文件操作辅助类
-std::vector<TableMeta> params_table_list;                  //参数表
-std::shared_ptr<EpochAccessor> epoch_accessor;             //训练轮次控制
-std::shared_ptr<RuntimeEnvironment> environment;           //运行环境
-std::vector<std::shared_ptr<Process>> process_list;        //训练流程
-std::shared_ptr<SignCacheDict> cache_dict;                 //大模型cache词典
+    inline paddle::ps::PSClient* ps_client() {  // forwards to pslib; pslib is dereferenced without a null check
+        return pslib->ps_client();
+    }
+
+    YAML::Node trainer_config;
+    paddle::platform::CPUPlace cpu_place;
+
+    std::shared_ptr<PSlib> pslib;
+    std::shared_ptr<Dataset> dataset;                          // training samples
+    std::shared_ptr<FileSystem> file_system;                   // file operation helper
+    std::shared_ptr<EpochAccessor> epoch_accessor;             // training epoch control
+    std::shared_ptr<RuntimeEnvironment> environment;           // runtime environment
+    std::vector<std::shared_ptr<Process>> process_list;        // training processes
+    std::shared_ptr<SignCacheDict> cache_dict;                 // cache dictionary for large models
 };
 
 }  // namespace feed
diff --git a/paddle/fluid/train/custom_trainer/feed/unit_test/test_create_programs.cc b/paddle/fluid/train/custom_trainer/feed/unit_test/test_create_programs.cc
index cfd001f0374578bb7319d49563722c5732da1bd8..bff27b2ae2e725f1ca20428c244c1f064b16a7d5 100644
--- a/paddle/fluid/train/custom_trainer/feed/unit_test/test_create_programs.cc
+++ b/paddle/fluid/train/custom_trainer/feed/unit_test/test_create_programs.cc
@@ -9,6 +9,7 @@
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/train/custom_trainer/feed/io/file_system.h"
 #include "paddle/fluid/train/custom_trainer/feed/io/shell.h"
+#include "paddle/fluid/train/custom_trainer/feed/common/scope_helper.h"
 #include "paddle/fluid/string/string_helper.h"
 
 namespace paddle {
@@ -28,7 +29,7 @@ class CreateProgramsTest : public testing::Test
 public:
     static void SetUpTestCase()
     {
-        std::unique_ptr<FileSystem> fs(CREATE_CLASS(FileSystem, "LocalFileSystem"));
+        std::unique_ptr<FileSystem> fs(CREATE_INSTANCE(FileSystem, "LocalFileSystem"));
         if (fs->exists("./scripts/create_programs.py")) {
             shell_execute(string::format_string("python ./scripts/create_programs.py ./scripts/example.py %s", test_data_dir));
         } else if (fs->exists(string::format_string("%s/scripts/create_programs.py", feed_path))) {
@@ -38,7 +39,7 @@ public:
 
     static void TearDownTestCase()
     {
-        std::unique_ptr<FileSystem> fs(CREATE_CLASS(FileSystem, "LocalFileSystem"));
+        std::unique_ptr<FileSystem> fs(CREATE_INSTANCE(FileSystem, "LocalFileSystem"));
         fs->remove(test_data_dir);
     }
 
@@ -61,7 +62,7 @@ public:
 };
 
 TEST_F(CreateProgramsTest, example_network) {
-    std::unique_ptr<Executor> executor(CREATE_CLASS(Executor, "SimpleExecutor"));
+    std::unique_ptr<Executor> executor(CREATE_INSTANCE(Executor, "SimpleExecutor"));
     ASSERT_NE(nullptr, executor);
 
     auto config = YAML::Load(string::format_string("{thread_num: 2, startup_program: %s, main_program: %s}", startup_program_path, main_program_path));
@@ -108,8 +109,10 @@ TEST_F(CreateProgramsTest, example_network) {
     ASSERT_EQ(-1, output_shape[0]);
     ASSERT_EQ(1, output_shape[1]);
 
-    auto input_var = executor->mutable_var<::paddle::framework::LoDTensor>(input_name);
-    auto label_var = executor->mutable_var<::paddle::framework::LoDTensor>(label_name);
+    paddle::framework::Scope scope;
+    executor->initialize_scope(&scope);
+    auto input_var = ScopeHelper::mutable_var<::paddle::framework::LoDTensor>(&scope, input_name);
+    auto label_var = ScopeHelper::mutable_var<::paddle::framework::LoDTensor>(&scope, label_name);
     ASSERT_NE(nullptr, input_var);
     ASSERT_NE(nullptr, label_var);
 
@@ -125,12 +128,12 @@ TEST_F(CreateProgramsTest, example_network) {
     ASSERT_NE(nullptr, label_data);
     label_data[0] = random();
 
-    ASSERT_EQ(0, executor->run());
+    ASSERT_EQ(0, executor->run(&scope));
 
-    auto loss_var = executor->var<::paddle::framework::LoDTensor>(loss_name);
+    auto loss_var = ScopeHelper::var<::paddle::framework::LoDTensor>(&scope, loss_name);
     auto loss = loss_var.data<float>()[0];
 
-    auto output_var = executor->var<::paddle::framework::LoDTensor>(output_name);
+    auto output_var = ScopeHelper::var<::paddle::framework::LoDTensor>(&scope, output_name);
     auto output = output_var.data<float>()[0];
 
     LOG(INFO) << "loss: " << loss << std::endl;
diff --git a/paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader.cc b/paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader.cc
index 8ad66b5df514160516755dbbb363f2e4f98d3457..9e941326ae36251ea2cbf2a05957ef21b71d3b99 100644
--- a/paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader.cc
+++ b/paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader.cc
@@ -3,6 +3,7 @@
 #include <gtest/gtest.h>
 #include <omp.h>
 
+#include "paddle/fluid/train/custom_trainer/feed/trainer_context.h"
 #include "paddle/fluid/train/custom_trainer/feed/executor/executor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -22,7 +23,7 @@ const char test_data_dir[] = "test_data";
 class DataReaderTest : public testing::Test {
 public:
     static void SetUpTestCase() {
-        std::unique_ptr<FileSystem> fs(CREATE_CLASS(FileSystem, "LocalFileSystem"));
+        std::unique_ptr<FileSystem> fs(CREATE_INSTANCE(FileSystem, "LocalFileSystem"));
         fs->mkdir(test_data_dir);
         shell_set_verbose(true);
 
@@ -42,14 +43,14 @@ public:
     }
 
     static void TearDownTestCase() {
-        std::unique_ptr<FileSystem> fs(CREATE_CLASS(FileSystem, "LocalFileSystem"));
+        std::unique_ptr<FileSystem> fs(CREATE_INSTANCE(FileSystem, "LocalFileSystem"));
         fs->remove(test_data_dir);
     }
 
     virtual void SetUp() {
         thread_num = omp_get_max_threads();
         omp_set_num_threads(1);
-        fs.reset(CREATE_CLASS(FileSystem, "LocalFileSystem"));
+        fs.reset(CREATE_INSTANCE(FileSystem, "LocalFileSystem"));
         context_ptr.reset(new TrainerContext());
     }
 
@@ -65,7 +66,7 @@ public:
 };
 
 TEST_F(DataReaderTest, LineDataParser) {
-    std::unique_ptr<DataParser> data_parser(CREATE_CLASS(DataParser, "LineDataParser"));
+    std::unique_ptr<DataParser> data_parser(CREATE_INSTANCE(DataParser, "LineDataParser"));
 
     ASSERT_NE(nullptr, data_parser);
     auto config = YAML::Load("");
@@ -94,7 +95,7 @@ TEST_F(DataReaderTest, LineDataParser) {
 }
 
 TEST_F(DataReaderTest, LineDataReader) {
-    std::unique_ptr<DataReader> data_reader(CREATE_CLASS(DataReader, "LineDataReader"));
+    std::unique_ptr<DataReader> data_reader(CREATE_INSTANCE(DataReader, "LineDataReader"));
     ASSERT_NE(nullptr, data_reader);
 
     auto config = YAML::Load(
@@ -147,7 +148,7 @@ TEST_F(DataReaderTest, LineDataReader) {
 }
 
 TEST_F(DataReaderTest, LineDataReader_filename_prefix) {
-    std::unique_ptr<DataReader> data_reader(CREATE_CLASS(DataReader, "LineDataReader"));
+    std::unique_ptr<DataReader> data_reader(CREATE_INSTANCE(DataReader, "LineDataReader"));
     ASSERT_NE(nullptr, data_reader);
     auto config = YAML::Load(
             "parser:\n"
@@ -182,7 +183,7 @@ TEST_F(DataReaderTest, LineDataReader_filename_prefix) {
 }
 
 TEST_F(DataReaderTest, LineDataReader_FileSystem) {
-    std::unique_ptr<DataReader> data_reader(CREATE_CLASS(DataReader, "LineDataReader"));
+    std::unique_ptr<DataReader> data_reader(CREATE_INSTANCE(DataReader, "LineDataReader"));
     ASSERT_NE(nullptr, data_reader);
     auto config = YAML::Load(
             "parser:\n"
diff --git a/paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader_omp.cc b/paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader_omp.cc
index 62aed43912c1fdf1a72d7438e25a4bd8b273f1c9..bee271ac80736759b0bc72ac57f6113c05629474 100644
--- a/paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader_omp.cc
+++ b/paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader_omp.cc
@@ -4,6 +4,7 @@
 #include <gtest/gtest.h>
 #include <omp.h>
 
+#include "paddle/fluid/train/custom_trainer/feed/trainer_context.h"
 #include "paddle/fluid/train/custom_trainer/feed/executor/executor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -23,7 +24,7 @@ const char test_data_dir[] = "test_data";
 class DataReaderOmpTest : public testing::Test {
 public:
     static void SetUpTestCase() {
-        std::unique_ptr<FileSystem> fs(CREATE_CLASS(FileSystem, "LocalFileSystem"));
+        std::unique_ptr<FileSystem> fs(CREATE_INSTANCE(FileSystem, "LocalFileSystem"));
         if (fs->exists(test_data_dir)) {
             fs->remove(test_data_dir);
         }
@@ -50,14 +51,14 @@ public:
     }
 
     static void TearDownTestCase() {
-        std::unique_ptr<FileSystem> fs(CREATE_CLASS(FileSystem, "LocalFileSystem"));
+        std::unique_ptr<FileSystem> fs(CREATE_INSTANCE(FileSystem, "LocalFileSystem"));
         fs->remove(test_data_dir);
     }
 
     virtual void SetUp() {
         thread_num = omp_get_max_threads();
         omp_set_num_threads(1);
-        fs.reset(CREATE_CLASS(FileSystem, "LocalFileSystem"));
+        fs.reset(CREATE_INSTANCE(FileSystem, "LocalFileSystem"));
         context_ptr.reset(new TrainerContext());
     }
 
@@ -117,7 +118,7 @@ std::vector<DataItem> DataReaderOmpTest::std_items;
 std::vector<DataItem> DataReaderOmpTest::sorted_std_items;
 
 TEST_F(DataReaderOmpTest, LineDataReaderSingleThread) {
-    std::unique_ptr<DataReader> data_reader(CREATE_CLASS(DataReader, "LineDataReader"));
+    std::unique_ptr<DataReader> data_reader(CREATE_INSTANCE(DataReader, "LineDataReader"));
     ASSERT_NE(nullptr, data_reader);
 
     auto config = YAML::Load(
@@ -148,7 +149,7 @@ TEST_F(DataReaderOmpTest, LineDataReaderSingleThread) {
 }
 
 TEST_F(DataReaderOmpTest, LineDataReaderMuiltThread) {
-    std::unique_ptr<DataReader> data_reader(CREATE_CLASS(DataReader, "LineDataReader"));
+    std::unique_ptr<DataReader> data_reader(CREATE_INSTANCE(DataReader, "LineDataReader"));
     ASSERT_NE(nullptr, data_reader);
 
     auto config = YAML::Load(
diff --git a/paddle/fluid/train/custom_trainer/feed/unit_test/test_executor.cc b/paddle/fluid/train/custom_trainer/feed/unit_test/test_executor.cc
index 2b47e4dbb42b1deaaff61f7d35436aa07ccd1d29..50313f35983ead3126dc9ebd4daf871a09b9795a 100644
--- a/paddle/fluid/train/custom_trainer/feed/unit_test/test_executor.cc
+++ b/paddle/fluid/train/custom_trainer/feed/unit_test/test_executor.cc
@@ -2,7 +2,9 @@
 #include <fstream>
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/train/custom_trainer/feed/trainer_context.h"
 #include "paddle/fluid/train/custom_trainer/feed/executor/executor.h"
+#include "paddle/fluid/train/custom_trainer/feed/common/scope_helper.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/train/custom_trainer/feed/io/file_system.h"
@@ -24,7 +26,7 @@ class SimpleExecutorTest : public testing::Test
 public:
     static void SetUpTestCase()
     {
-        std::unique_ptr<FileSystem> fs(CREATE_CLASS(FileSystem, "LocalFileSystem"));
+        std::unique_ptr<FileSystem> fs(CREATE_INSTANCE(FileSystem, "LocalFileSystem"));
         fs->mkdir(test_data_dir);
         shell_set_verbose(true);
 
@@ -57,7 +59,7 @@ public:
 
     static void TearDownTestCase()
     {
-        std::unique_ptr<FileSystem> fs(CREATE_CLASS(FileSystem, "LocalFileSystem"));
+        std::unique_ptr<FileSystem> fs(CREATE_INSTANCE(FileSystem, "LocalFileSystem"));
         fs->remove(test_data_dir);
     }
 
@@ -75,7 +77,7 @@ public:
 };
 
 TEST_F(SimpleExecutorTest, initialize) {
-    std::unique_ptr<Executor> executor(CREATE_CLASS(Executor, "SimpleExecutor"));
+    std::unique_ptr<Executor> executor(CREATE_INSTANCE(Executor, "SimpleExecutor"));
     ASSERT_NE(nullptr, executor);
     YAML::Node config = YAML::Load("[1, 2, 3]");
     ASSERT_NE(0, executor->initialize(config, context_ptr));
@@ -86,13 +88,14 @@ TEST_F(SimpleExecutorTest, initialize) {
 }
 
 TEST_F(SimpleExecutorTest, run) {
-    std::unique_ptr<Executor> executor(CREATE_CLASS(Executor, "SimpleExecutor"));
+    std::unique_ptr<Executor> executor(CREATE_INSTANCE(Executor, "SimpleExecutor"));
     ASSERT_NE(nullptr, executor);
 
     auto config = YAML::Load(string::format_string("{thread_num: 2, startup_program: %s, main_program: %s}", startup_program_path, main_program_path));
     ASSERT_EQ(0, executor->initialize(config, context_ptr));
-
-    auto x_var = executor->mutable_var<::paddle::framework::LoDTensor>("x");
+    paddle::framework::Scope scope;
+    executor->initialize_scope(&scope);
+    auto x_var = ScopeHelper::mutable_var<::paddle::framework::LoDTensor>(&scope, std::string("x"));
     ASSERT_NE(nullptr, x_var);
 
     int x_len = 10;
@@ -106,9 +109,9 @@ TEST_F(SimpleExecutorTest, run) {
     }
     std::cout << std::endl;
 
-    ASSERT_EQ(0, executor->run());
+    ASSERT_EQ(0, executor->run(&scope));
 
-    auto mean_var = executor->var<::paddle::framework::LoDTensor>("mean");
+    auto mean_var = ScopeHelper::var<::paddle::framework::LoDTensor>(&scope, std::string("mean"));
     auto mean = mean_var.data<float>()[0];
     std::cout << "mean: " << mean << std::endl;
     ASSERT_NEAR(4.5, mean, 1e-9);