Commit b1a8a46e authored by xiexionghang

for runnable trainer

#ifdef PYBIND_AVX_MKLML
// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT!
USE_NO_KERNEL_OP(feed);
USE_NO_KERNEL_OP(while);
USE_NO_KERNEL_OP(get_places);
USE_NO_KERNEL_OP(fetch);
USE_NO_KERNEL_OP(conditional_block_infer);
USE_NO_KERNEL_OP(conditional_block);
USE_OP(less_than);
USE_OP(logical_and);
USE_NO_KERNEL_OP(read_from_array);
USE_CPU_ONLY_OP(bipartite_match);
USE_OP(box_coder);
USE_OP(iou_similarity);
USE_CPU_ONLY_OP(mine_hard_examples);
USE_CPU_ONLY_OP(multiclass_nms);
USE_OP(prior_box);
USE_OP(density_prior_box);
USE_OP(anchor_generator);
USE_OP(target_assign);
USE_OP(polygon_box_transform);
USE_CPU_ONLY_OP(rpn_target_assign);
USE_CPU_ONLY_OP(generate_proposal_labels);
USE_OP(box_clip);
USE_CPU_ONLY_OP(yolov3_loss);
USE_OP(yolo_box);
USE_OP(box_decoder_and_assign);
USE_OP(sigmoid_focal_loss);
USE_CPU_ONLY_OP(retinanet_detection_output);
USE_CPU_ONLY_OP(generate_proposals);
USE_CPU_ONLY_OP(distribute_fpn_proposals);
USE_CPU_ONLY_OP(collect_fpn_proposals);
USE_OP(roi_perspective_transform);
USE_CPU_ONLY_OP(generate_mask_labels);
USE_OP(elementwise_mod);
USE_OP(elementwise_floordiv);
USE_OP(elementwise_max);
USE_OP(elementwise_pow);
USE_OP(elementwise_sub_grad);
USE_OP(elementwise_add_grad);
USE_OP(elementwise_min);
USE_OP(elementwise_div);
USE_OP(elementwise_mul);
USE_CPU_ONLY_OP(fusion_squared_mat_sub);
USE_CPU_ONLY_OP(fusion_seqpool_concat);
USE_CPU_ONLY_OP(fused_embedding_fc_lstm);
USE_CPU_ONLY_OP(fusion_seqexpand_concat_fc);
USE_CPU_ONLY_OP(fused_embedding_seq_pool);
USE_CPU_ONLY_OP(fusion_seqconv_eltadd_relu);
USE_CPU_ONLY_OP(fusion_gru);
USE_CPU_ONLY_OP(fusion_repeated_fc_relu);
USE_CPU_ONLY_OP(fusion_lstm);
USE_OP(fused_elemwise_activation);
USE_OP(accuracy);
USE_CPU_ONLY_OP(precision_recall);
USE_CPU_ONLY_OP(auc);
USE_OP(adamax);
USE_OP(sgd);
USE_OP(lars_momentum);
USE_OP(adagrad);
USE_OP(ftrl);
USE_OP(momentum);
USE_OP(adadelta);
USE_OP(rmsprop);
USE_OP(lamb);
USE_OP(proximal_gd);
USE_OP(proximal_adagrad);
USE_OP(adam);
USE_OP(decayed_adagrad);
USE_OP(reduce_all);
USE_OP(reduce_min);
USE_OP(reduce_sum);
USE_OP(reduce_any);
USE_OP(reduce_max);
USE_OP(reduce_mean);
USE_OP(reduce_prod);
USE_OP(sequence_erase);
USE_OP(sequence_unpad);
USE_OP(sequence_mask);
USE_OP(sequence_expand);
USE_OP(sequence_pad);
USE_OP(sequence_enumerate);
USE_OP(sequence_slice);
USE_OP(sequence_softmax);
USE_OP(sequence_expand_as);
USE_OP(sequence_pool);
USE_OP(sequence_reverse);
USE_CPU_ONLY_OP(sequence_scatter);
USE_OP(sequence_conv);
USE_OP(sequence_concat);
USE_OP(sequence_reshape);
USE_NO_KERNEL_OP(open_files);
USE_NO_KERNEL_OP(create_random_data_generator);
USE_NO_KERNEL_OP(create_shuffle_reader);
USE_NO_KERNEL_OP(create_batch_reader);
USE_NO_KERNEL_OP(create_recordio_file_reader);
USE_NO_KERNEL_OP(create_double_buffer_reader);
USE_NO_KERNEL_OP(create_multi_pass_reader);
USE_NO_KERNEL_OP(create_custom_reader);
USE_NO_KERNEL_OP(create_py_reader);
USE_NO_KERNEL_OP(read);
USE_OP(increment);
USE_OP(stack);
USE_CPU_ONLY_OP(fc);
USE_NO_KERNEL_OP(assign);
USE_OP(load);
USE_NO_KERNEL_OP(fill);
USE_NO_KERNEL_OP(reorder_lod_tensor_by_rank);
USE_OP(conv_shift);
USE_OP(fill_zeros_like);
USE_CPU_ONLY_OP(hash);
USE_NO_KERNEL_OP(dequantize);
USE_OP(fake_quantize_abs_max);
USE_OP(size);
USE_OP(scatter);
USE_OP(uniform_random);
USE_OP(beam_search);
USE_NO_KERNEL_OP(beam_search_decode);
USE_OP(dropout);
USE_OP(bilinear_interp);
USE_OP(sampling_id);
USE_OP(lstm);
USE_OP(modified_huber_loss);
USE_OP(temporal_shift);
USE_OP(sum);
USE_OP(arg_min);
USE_OP(psroi_pool);
USE_NO_KERNEL_OP(uniform_random_batch_size_like);
USE_NO_KERNEL_OP(rnn_memory_helper);
USE_CPU_ONLY_OP(crf_decoding);
USE_OP(where);
USE_OP(fake_dequantize_max_abs);
USE_OP(mean_iou);
USE_OP(roi_align);
USE_OP(range);
USE_OP(edit_distance);
USE_OP(multiplex);
USE_OP(clip);
USE_OP(gaussian_random);
USE_OP(norm);
USE_OP(rank_loss);
USE_CPU_ONLY_OP(detection_map);
USE_OP(lstm_unit);
USE_OP(shard_index);
USE_OP(shape);
USE_OP(arg_max);
USE_OP(average_accumulates);
USE_NO_KERNEL_OP(requantize);
USE_OP(conv2d);
USE_CPU_ONLY_OP(add_position_encoding);
USE_OP(gru_unit);
USE_OP(batch_norm);
USE_CPU_ONLY_OP(chunk_eval);
USE_NO_KERNEL_OP(lod_rank_table);
USE_NO_KERNEL_OP(unsqueeze);
USE_CPU_ONLY_OP(positive_negative_pair);
USE_OP(im2sequence);
USE_OP(margin_rank_loss);
USE_OP(hinge_loss);
USE_CPU_ONLY_OP(cvm);
USE_OP(huber_loss);
USE_OP(crop);
USE_OP(relu_grad);
USE_CPU_ONLY_OP(hierarchical_sigmoid);
USE_OP(unfold);
USE_NO_KERNEL_OP(max_sequence_len);
USE_OP(mul);
USE_CPU_ONLY_OP(attention_lstm);
USE_OP(top_k);
USE_OP(group_norm);
USE_OP(selu);
USE_OP(lstmp);
USE_NO_KERNEL_OP(merge_lod_tensor);
USE_OP(truncated_gaussian_random);
USE_OP(label_smooth);
USE_CPU_ONLY_OP(matmul);
USE_OP(spp);
USE_NO_KERNEL_OP(unstack);
USE_OP(conv2d_transpose);
USE_OP(diag);
USE_OP(unpool);
USE_NO_KERNEL_OP(lod_array_length);
USE_OP(affine_channel);
USE_OP(log_loss);
USE_OP(concat);
USE_NO_KERNEL_OP(lod_tensor_to_array);
USE_OP(gru);
USE_CPU_ONLY_OP(coalesce_tensor);
USE_OP(fsp);
USE_OP(linspace);
USE_OP(reverse);
USE_NO_KERNEL_OP(recurrent);
USE_OP(split_selected_rows);
USE_OP(dgc_clip_by_norm);
USE_OP(scale);
USE_OP(save);
USE_OP(load_combine);
USE_OP(merge_selected_rows);
USE_OP(split);
USE_OP(cumsum);
USE_OP(deformable_psroi_pooling);
USE_CPU_ONLY_OP(teacher_student_sigmoid_loss);
USE_OP(transpose);
USE_OP(fill_constant_batch_size_like);
USE_OP(sigmoid_cross_entropy_with_logits);
USE_OP(shuffle_channel);
USE_CPU_ONLY_OP(affine_grid);
USE_NO_KERNEL_OP(split_lod_tensor);
USE_CPU_ONLY_OP(grid_sampler);
USE_OP(lookup_table);
USE_OP(cos_sim);
USE_NO_KERNEL_OP(quantize);
USE_OP(spectral_norm);
USE_OP(cross_entropy);
USE_NO_KERNEL_OP(print);
USE_OP(lrn);
USE_CPU_ONLY_OP(nce);
USE_CPU_ONLY_OP(similarity_focus);
USE_CPU_ONLY_OP(get_tensor_from_selected_rows);
USE_OP(squared_l2_distance);
USE_OP(cudnn_lstm);
USE_OP(tree_conv);
USE_OP(one_hot);
USE_NO_KERNEL_OP(lookup_sparse_table);
USE_CPU_ONLY_OP(unique);
USE_OP(mean);
USE_OP(prelu);
USE_NO_KERNEL_OP(delete_var);
USE_OP(ctc_align);
USE_OP(argsort);
USE_CPU_ONLY_OP(data_norm);
USE_OP(minus);
USE_NO_KERNEL_OP(shrink_rnn_memory);
USE_OP(lod_reset);
USE_OP(l1_norm);
USE_NO_KERNEL_OP(gaussian_random_batch_size_like);
USE_OP(is_empty);
USE_OP(bilinear_tensor_product);
USE_OP(kldiv_loss);
USE_NO_KERNEL_OP(squeeze);
USE_OP(softmax);
USE_OP(clip_by_norm);
USE_OP(max_pool2d_with_index);
USE_OP(linear_chain_crf);
USE_CPU_ONLY_OP(reshape);
USE_OP(fill_constant);
USE_OP(space_to_depth);
USE_OP(gather);
USE_OP(softmax_with_cross_entropy);
USE_OP(slice);
USE_OP(sign);
USE_OP(expand);
USE_OP(smooth_l1_loss);
USE_NO_KERNEL_OP(tensor_array_to_tensor);
USE_OP(row_conv);
USE_OP(pad2d);
USE_OP(pixel_shuffle);
USE_OP(assign_value);
USE_OP(random_crop);
USE_OP(squared_l2_norm);
USE_OP(save_combine);
USE_OP(pool2d);
USE_OP(cast);
USE_NO_KERNEL_OP(array_to_lod_tensor);
USE_OP(fill_any_like);
USE_NO_KERNEL_OP(flatten);
USE_OP(sample_logits);
USE_OP(pad);
USE_CPU_ONLY_OP(bpr_loss);
USE_OP(roi_pool);
USE_OP(pad_constant_like);
USE_OP(isfinite);
USE_OP(layer_norm);
USE_OP(maxout);
USE_OP(warpctc);
#elif defined PYBIND_NOAVX_OPENBLAS
// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT!
USE_NO_KERNEL_OP(feed);
USE_NO_KERNEL_OP(while);
USE_NO_KERNEL_OP(get_places);
USE_NO_KERNEL_OP(fetch);
USE_NO_KERNEL_OP(conditional_block_infer);
USE_NO_KERNEL_OP(conditional_block);
USE_OP(less_than);
USE_OP(logical_and);
USE_NO_KERNEL_OP(read_from_array);
USE_CPU_ONLY_OP(bipartite_match);
USE_OP(box_coder);
USE_OP(iou_similarity);
USE_CPU_ONLY_OP(mine_hard_examples);
USE_CPU_ONLY_OP(multiclass_nms);
USE_OP(prior_box);
USE_OP(density_prior_box);
USE_OP(anchor_generator);
USE_OP(target_assign);
USE_OP(polygon_box_transform);
USE_CPU_ONLY_OP(rpn_target_assign);
USE_CPU_ONLY_OP(generate_proposal_labels);
USE_OP(box_clip);
USE_CPU_ONLY_OP(yolov3_loss);
USE_OP(yolo_box);
USE_OP(box_decoder_and_assign);
USE_OP(sigmoid_focal_loss);
USE_CPU_ONLY_OP(retinanet_detection_output);
USE_CPU_ONLY_OP(generate_proposals);
USE_CPU_ONLY_OP(distribute_fpn_proposals);
USE_CPU_ONLY_OP(collect_fpn_proposals);
USE_OP(roi_perspective_transform);
USE_CPU_ONLY_OP(generate_mask_labels);
USE_OP(elementwise_mod);
USE_OP(elementwise_floordiv);
USE_OP(elementwise_max);
USE_OP(elementwise_pow);
USE_OP(elementwise_sub_grad);
USE_OP(elementwise_add_grad);
USE_OP(elementwise_min);
USE_OP(elementwise_div);
USE_OP(elementwise_mul);
USE_CPU_ONLY_OP(fusion_squared_mat_sub);
USE_CPU_ONLY_OP(fusion_seqpool_concat);
USE_CPU_ONLY_OP(fused_embedding_fc_lstm);
USE_CPU_ONLY_OP(fusion_seqexpand_concat_fc);
USE_CPU_ONLY_OP(fused_embedding_seq_pool);
USE_CPU_ONLY_OP(fusion_seqconv_eltadd_relu);
USE_CPU_ONLY_OP(fusion_gru);
USE_CPU_ONLY_OP(fusion_repeated_fc_relu);
USE_CPU_ONLY_OP(fusion_lstm);
USE_OP(fused_elemwise_activation);
USE_OP(accuracy);
USE_CPU_ONLY_OP(precision_recall);
USE_CPU_ONLY_OP(auc);
USE_OP(adamax);
USE_OP(sgd);
USE_OP(lars_momentum);
USE_OP(adagrad);
USE_OP(ftrl);
USE_OP(momentum);
USE_OP(adadelta);
USE_OP(rmsprop);
USE_OP(lamb);
USE_OP(proximal_gd);
USE_OP(proximal_adagrad);
USE_OP(adam);
USE_OP(decayed_adagrad);
USE_OP(reduce_all);
USE_OP(reduce_min);
USE_OP(reduce_sum);
USE_OP(reduce_any);
USE_OP(reduce_max);
USE_OP(reduce_mean);
USE_OP(reduce_prod);
USE_OP(sequence_erase);
USE_OP(sequence_unpad);
USE_OP(sequence_mask);
USE_OP(sequence_expand);
USE_OP(sequence_pad);
USE_OP(sequence_enumerate);
USE_OP(sequence_slice);
USE_OP(sequence_softmax);
USE_OP(sequence_expand_as);
USE_OP(sequence_pool);
USE_OP(sequence_reverse);
USE_CPU_ONLY_OP(sequence_scatter);
USE_OP(sequence_conv);
USE_OP(sequence_concat);
USE_OP(sequence_reshape);
USE_NO_KERNEL_OP(open_files);
USE_NO_KERNEL_OP(create_random_data_generator);
USE_NO_KERNEL_OP(create_shuffle_reader);
USE_NO_KERNEL_OP(create_batch_reader);
USE_NO_KERNEL_OP(create_recordio_file_reader);
USE_NO_KERNEL_OP(create_double_buffer_reader);
USE_NO_KERNEL_OP(create_multi_pass_reader);
USE_NO_KERNEL_OP(create_custom_reader);
USE_NO_KERNEL_OP(create_py_reader);
USE_NO_KERNEL_OP(read);
USE_OP(increment);
USE_OP(stack);
USE_CPU_ONLY_OP(fc);
USE_NO_KERNEL_OP(assign);
USE_OP(load);
USE_NO_KERNEL_OP(fill);
USE_NO_KERNEL_OP(reorder_lod_tensor_by_rank);
USE_OP(conv_shift);
USE_OP(fill_zeros_like);
USE_CPU_ONLY_OP(hash);
USE_NO_KERNEL_OP(dequantize);
USE_OP(fake_quantize_abs_max);
USE_OP(size);
USE_OP(scatter);
USE_OP(uniform_random);
USE_OP(beam_search);
USE_NO_KERNEL_OP(beam_search_decode);
USE_OP(dropout);
USE_OP(bilinear_interp);
USE_OP(sampling_id);
USE_OP(lstm);
USE_OP(modified_huber_loss);
USE_OP(temporal_shift);
USE_OP(sum);
USE_OP(arg_min);
USE_OP(psroi_pool);
USE_NO_KERNEL_OP(uniform_random_batch_size_like);
USE_NO_KERNEL_OP(rnn_memory_helper);
USE_CPU_ONLY_OP(crf_decoding);
USE_OP(where);
USE_OP(fake_dequantize_max_abs);
USE_OP(mean_iou);
USE_OP(roi_align);
USE_OP(range);
USE_OP(edit_distance);
USE_OP(multiplex);
USE_OP(clip);
USE_OP(gaussian_random);
USE_OP(norm);
USE_OP(rank_loss);
USE_CPU_ONLY_OP(detection_map);
USE_OP(lstm_unit);
USE_OP(shard_index);
USE_OP(shape);
USE_OP(arg_max);
USE_OP(average_accumulates);
USE_NO_KERNEL_OP(requantize);
USE_OP(conv2d);
USE_CPU_ONLY_OP(add_position_encoding);
USE_OP(gru_unit);
USE_OP(batch_norm);
USE_CPU_ONLY_OP(chunk_eval);
USE_NO_KERNEL_OP(lod_rank_table);
USE_NO_KERNEL_OP(unsqueeze);
USE_CPU_ONLY_OP(positive_negative_pair);
USE_OP(im2sequence);
USE_OP(margin_rank_loss);
USE_OP(hinge_loss);
USE_CPU_ONLY_OP(cvm);
USE_OP(huber_loss);
USE_OP(crop);
USE_OP(relu_grad);
USE_CPU_ONLY_OP(hierarchical_sigmoid);
USE_OP(unfold);
USE_NO_KERNEL_OP(max_sequence_len);
USE_OP(mul);
USE_CPU_ONLY_OP(attention_lstm);
USE_OP(top_k);
USE_OP(group_norm);
USE_OP(selu);
USE_OP(lstmp);
USE_NO_KERNEL_OP(merge_lod_tensor);
USE_OP(truncated_gaussian_random);
USE_OP(label_smooth);
USE_CPU_ONLY_OP(matmul);
USE_OP(spp);
USE_NO_KERNEL_OP(unstack);
USE_OP(conv2d_transpose);
USE_OP(diag);
USE_OP(unpool);
USE_NO_KERNEL_OP(lod_array_length);
USE_OP(affine_channel);
USE_OP(log_loss);
USE_OP(concat);
USE_NO_KERNEL_OP(lod_tensor_to_array);
USE_OP(gru);
USE_CPU_ONLY_OP(coalesce_tensor);
USE_OP(fsp);
USE_OP(linspace);
USE_OP(reverse);
USE_NO_KERNEL_OP(recurrent);
USE_OP(split_selected_rows);
USE_OP(dgc_clip_by_norm);
USE_OP(scale);
USE_OP(save);
USE_OP(load_combine);
USE_OP(merge_selected_rows);
USE_OP(split);
USE_OP(cumsum);
USE_OP(deformable_psroi_pooling);
USE_CPU_ONLY_OP(teacher_student_sigmoid_loss);
USE_OP(transpose);
USE_OP(fill_constant_batch_size_like);
USE_OP(sigmoid_cross_entropy_with_logits);
USE_OP(shuffle_channel);
USE_CPU_ONLY_OP(affine_grid);
USE_NO_KERNEL_OP(split_lod_tensor);
USE_CPU_ONLY_OP(grid_sampler);
USE_OP(lookup_table);
USE_OP(cos_sim);
USE_NO_KERNEL_OP(quantize);
USE_OP(spectral_norm);
USE_OP(cross_entropy);
USE_NO_KERNEL_OP(print);
USE_OP(lrn);
USE_CPU_ONLY_OP(nce);
USE_CPU_ONLY_OP(similarity_focus);
USE_CPU_ONLY_OP(get_tensor_from_selected_rows);
USE_OP(squared_l2_distance);
USE_OP(cudnn_lstm);
USE_OP(tree_conv);
USE_OP(one_hot);
USE_NO_KERNEL_OP(lookup_sparse_table);
USE_CPU_ONLY_OP(unique);
USE_OP(mean);
USE_OP(prelu);
USE_NO_KERNEL_OP(delete_var);
USE_OP(ctc_align);
USE_OP(argsort);
USE_CPU_ONLY_OP(data_norm);
USE_OP(minus);
USE_NO_KERNEL_OP(shrink_rnn_memory);
USE_OP(lod_reset);
USE_OP(l1_norm);
USE_NO_KERNEL_OP(gaussian_random_batch_size_like);
USE_OP(is_empty);
USE_OP(bilinear_tensor_product);
USE_OP(kldiv_loss);
USE_NO_KERNEL_OP(squeeze);
USE_OP(softmax);
USE_OP(clip_by_norm);
USE_OP(max_pool2d_with_index);
USE_OP(linear_chain_crf);
USE_CPU_ONLY_OP(reshape);
USE_OP(fill_constant);
USE_OP(space_to_depth);
USE_OP(gather);
USE_OP(softmax_with_cross_entropy);
USE_OP(slice);
USE_OP(sign);
USE_OP(expand);
USE_OP(smooth_l1_loss);
USE_NO_KERNEL_OP(tensor_array_to_tensor);
USE_OP(row_conv);
USE_OP(pad2d);
USE_OP(pixel_shuffle);
USE_OP(assign_value);
USE_OP(random_crop);
USE_OP(squared_l2_norm);
USE_OP(save_combine);
USE_OP(pool2d);
USE_OP(cast);
USE_NO_KERNEL_OP(array_to_lod_tensor);
USE_OP(fill_any_like);
USE_NO_KERNEL_OP(flatten);
USE_OP(sample_logits);
USE_OP(pad);
USE_CPU_ONLY_OP(bpr_loss);
USE_OP(roi_pool);
USE_OP(pad_constant_like);
USE_OP(isfinite);
USE_OP(layer_norm);
USE_OP(maxout);
USE_OP(warpctc);
#endif
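The USE_OP / USE_CPU_ONLY_OP / USE_NO_KERNEL_OP lists above force-link each operator's static registrar into this standalone trainer binary, so every operator built for the Python wheel is also usable from pure C++. A minimal sketch of the idiom, simplified from the macros in paddle/fluid/framework/op_registry.h (names abbreviated; the real macros also touch per-device kernel registrars and add compile-time checks):

// Each operator's .cc defines a registrar object whose constructor inserts
// the op into the global registry, plus a "touch" function next to it:
//   int TouchOpRegistrar_mul() { return 0; }
// Referencing that symbol from the final binary forces the linker to keep
// the operator's object file, so the registrar's constructor actually runs.
#define USE_OP_SKETCH(op_type)                              \
  extern int TouchOpRegistrar_##op_type();                  \
  static int use_op_##op_type##_ __attribute__((unused)) =  \
      TouchOpRegistrar_##op_type()

// USE_OP_SKETCH(mul);  // expands to an extern decl + a forced static init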
#include "paddle/fluid/train/custom_trainer/feed/executor/executor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace custom_trainer {
namespace feed {
namespace {
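// Reads an entire file into *contents in binary mode.
// Returns 0 on success, -1 if the file cannot be opened.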
int ReadBinaryFile(const std::string& filename, std::string* contents) {
std::ifstream fin(filename, std::ios::in | std::ios::binary);
if (!fin) {
VLOG(2) << "Cannot open file " << filename;
return -1;
}
fin.seekg(0, std::ios::end);
contents->clear();
contents->resize(fin.tellg());
fin.seekg(0, std::ios::beg);
fin.read(&(contents->at(0)), contents->size());
fin.close();
return 0;
}
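// Deserializes a ProgramDesc from a saved model file.
// Returns nullptr if the file cannot be read; the executor argument is unused.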
std::unique_ptr<paddle::framework::ProgramDesc> Load(
paddle::framework::Executor* /*executor*/, const std::string& model_filename) {
VLOG(3) << "loading model from " << model_filename;
std::string program_desc_str;
if (ReadBinaryFile(model_filename, &program_desc_str) != 0) {
return nullptr;
}
std::unique_ptr<paddle::framework::ProgramDesc> main_program(
new paddle::framework::ProgramDesc(program_desc_str));
return main_program;
}
}  // namespace
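// Bundles the per-executor Paddle state: the device place, a framework
// executor bound to it, the loaded main ProgramDesc with its prepared
// execution context (reused across run() calls), and a cleaner that
// resets tensor-array variables between batches.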
struct SimpleExecute::Context {
Context(const ::paddle::platform::Place& place) : place(place), executor(place) {
}
const ::paddle::platform::Place& place;
::paddle::framework::Executor executor;
::std::unique_ptr<::paddle::framework::ProgramDesc> main_program;
::std::unique_ptr<framework::ExecutorPrepareContext> prepare_context;
details::TensorArrayBatchCleaner tensor_array_batch_cleaner;
};
SimpleExecute::SimpleExecute() {
}
SimpleExecute::~SimpleExecute() {
}
int SimpleExecute::initialize(YAML::Node exe_config,
std::shared_ptr<TrainerContext> context_ptr) {
paddle::framework::InitDevices(false);
if (exe_config["num_threads"]) {
paddle::platform::SetNumThreads(exe_config["num_threads"].as<int>());
} else {
paddle::platform::SetNumThreads(1);
}
if (!exe_config["startup_program"] ||
!exe_config["main_program"]) {
VLOG(2) << "fail to load config";
return -1;
}
try {
_context.reset(new SimpleExecute::Context(context_ptr->cpu_place));
auto startup_program = Load(&_context->executor, exe_config["startup_program"].as<std::string>());
if (startup_program == nullptr) {
VLOG(2) << "fail to load startup_program: " << exe_config["startup_program"].as<std::string>();
return -1;
}
_context->executor.Run(*startup_program, this->scope(), 0, false, true);
_context->main_program = Load(&_context->executor, exe_config["main_program"].as<std::string>());
if (_context->main_program == nullptr) {
VLOG(2) << "fail to load main_program: " << exe_config["main_program"].as<std::string>();
return -1;
}
_context->prepare_context = _context->executor.Prepare(*_context->main_program, 0);
_context->executor.CreateVariables(*_context->main_program, this->scope(), 0);
} catch (::paddle::platform::EnforceNotMet& err) {
VLOG(2) << err.what();
_context.reset(nullptr);
return -1;
}
return 0;
}
int SimpleExecute::run() {
if (_context == nullptr) {
VLOG(2) << "need initialize before run";
return -1;
}
try {
_context->executor.RunPreparedContext(_context->prepare_context.get(), this->scope(),
false, /* don't create local scope each time */
false /* don't create variables each time */);
// Collect and reset other vector-like container variables that are not
// cleaned up automatically after each batch.
_context->tensor_array_batch_cleaner.CollectNoTensorVars(this->scope());
_context->tensor_array_batch_cleaner.ResetNoTensorVars();
} catch (::paddle::platform::EnforceNotMet& err) {
VLOG(2) << err.what();
return -1;
}
return 0;
}
} // namespace feed
} // namespace custom_trainer
} // namespace paddle
#pragma once
#include <functional>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/train/custom_trainer/feed/common/registerer.h"
#include "paddle/fluid/train/custom_trainer/feed/trainer_context.h"
@@ -23,7 +23,7 @@ public:
}
// Fetch a var directly
template <class T>
-T* var(const std::string& name) {
+const T& var(const std::string& name) {
return _scope.Var(name)->Get<T>();
}
template <class T>
@@ -31,8 +31,8 @@ public:
return _scope.Var(name)->GetMutable<T>();
}
-// Train for epoch_num epochs, invoking the callback (epoch_id, _scope) after each epoch
-virtual int run(uint32_t epoch_num, std::function<void(uint32_t, ::paddle::framework::Scope*)>) = 0;
+// Run training
+virtual int run() = 0;
virtual bool is_dump_all_model() {
return false;
@@ -44,13 +44,14 @@ REGISTER_REGISTERER(Executor);
-class SimpleExecutor : public Executor {
+class SimpleExecute : public Executor {
public:
-SimpleExecutor() {}
-virtual ~SimpleExecutor() {}
+SimpleExecute();
+virtual ~SimpleExecute();
virtual int initialize(YAML::Node exe_config,
std::shared_ptr<TrainerContext> context_ptr);
-virtual int run(uint32_t epoch_num, std::function<void(uint32_t, ::paddle::framework::Scope*)>) = 0;
+virtual int run();
protected:
-std::shared_ptr<::paddle::framework::Executor> _executor;
+struct Context;
+std::unique_ptr<Context> _context;
};
} // namespace feed
@@ -5,6 +5,8 @@
#include "paddle/fluid/train/custom_trainer/feed/trainer_context.h"
#include "paddle/fluid/train/custom_trainer/feed/process/process.h"
#include "paddle/fluid/train/custom_trainer/feed/process/init_env_process.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/pybind/pybind.h"
using namespace paddle::custom_trainer::feed;
@@ -19,7 +21,6 @@ int main(int argc, char* argv[]) {
//load trainer config
auto trainer_context_ptr = std::make_shared<TrainerContext>();
trainer_context_ptr->trainer_config = YAML::LoadFile(FLAGS_feed_trainer_conf_path);
-VLOG(3) << "yaml node size" << trainer_context_ptr->trainer_config.size();
std::vector<std::string> process_name_list = {
"InitEnvProcess",
@@ -17,10 +17,7 @@ int InitEnvProcess::initialize(std::shared_ptr<TrainerContext> context_ptr) {
paddle::framework::InitDevices(false);
context_ptr->cpu_place = paddle::platform::CPUPlace();
-YAML::Node config;
-config.reset(_context_ptr->trainer_config);
-VLOG(3) << "yaml node size : " << config.size();
+YAML::Node config = _context_ptr->trainer_config;
//environment
std::string env_class = config["environment"]["environment_class"].as<std::string>();
auto* environment = CREATE_CLASS(RuntimeEnvironment, env_class);
#include <gtest/gtest.h>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/pybind/pybind.h"
int main(int argc, char** argv) {
::google::InitGoogleLogging(argv[0]);
::testing::InitGoogleTest(&argc, argv);
::google::ParseCommandLineFlags(&argc, &argv, true);
return RUN_ALL_TESTS();
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstdlib>
#include <iostream>
#include <gtest/gtest.h>
#include "paddle/fluid/train/custom_trainer/feed/executor/executor.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace paddle {
namespace custom_trainer {
namespace feed {
TEST(testSimpleExecute, initialize) {
SimpleExecute execute;
auto context_ptr = std::make_shared<TrainerContext>();
YAML::Node config = YAML::Load("[1, 2, 3]");
ASSERT_NE(0, execute.initialize(config, context_ptr));
config = YAML::Load("{startup_program: ./data/startup_program, main_program: ./data/main_program}");
ASSERT_EQ(0, execute.initialize(config, context_ptr));
config = YAML::Load("{thread_num: 2, startup_program: ./data/startup_program, main_program: ./data/main_program}");
ASSERT_EQ(0, execute.initialize(config, context_ptr));
}
float uniform(float min, float max) {
float result = (float)rand() / RAND_MAX;
return min + result * (max - min);
}
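// Builds a toy batch: x gets batch_size random 2-D points in [-2, 2]^2 and
// y labels each point 1.0 if it falls inside the unit circle, else 0.0.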
void next_batch(int batch_size, const paddle::platform::Place& place, paddle::framework::LoDTensor* x_tensor, paddle::framework::LoDTensor* y_tensor) {
x_tensor->Resize({batch_size, 2});
auto x_data = x_tensor->mutable_data<float>(place);
y_tensor->Resize({batch_size, 1});
auto y_data = y_tensor->mutable_data<float>(place);
for (int i = 0; i < batch_size; ++i) {
x_data[i * 2] = uniform(-2, 2);
x_data[i * 2 + 1] = uniform(-2, 2);
float dis = x_data[i * 2] * x_data[i * 2] + x_data[i * 2 + 1] * x_data[i * 2 + 1];
y_data[i] = dis < 1.0 ? 1.0 : 0.0;
}
}
TEST(testSimpleExecute, run) {
SimpleExecute execute;
auto context_ptr = std::make_shared<TrainerContext>();
auto config = YAML::Load("{num_threads: 2, startup_program: ./data/startup_program, main_program: ./data/main_program}");
ASSERT_EQ(0, execute.initialize(config, context_ptr));
auto x_var = execute.mutable_var<::paddle::framework::LoDTensor>("x");
auto y_var = execute.mutable_var<::paddle::framework::LoDTensor>("y");
ASSERT_NE(nullptr, x_var);
ASSERT_NE(nullptr, y_var);
next_batch(1024, context_ptr->cpu_place, x_var, y_var);
ASSERT_EQ(0, execute.run());
const auto& loss_var = execute.var<::paddle::framework::LoDTensor>("loss");
auto loss = loss_var.data<float>()[0];
std::cout << "loss: " << loss << std::endl;
}
} // namespace feed
} // namespace custom_trainer
} // namespace paddle