diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 3c73b6cc55c187c3f6e7edd1ce38cc58f4e8413d..4fb4ec38ee965a2790d11378a1ce6befa0ef5a00 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -25,11 +25,12 @@ else()
     cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 endif()
 
+cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor)
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
 
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
-        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle)
+        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle)
 
 
 cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..786d95acb1815e856f684ee6c64664a3297160b2
--- /dev/null
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@@ -0,0 +1,138 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/data_balance_op_handle.h"
+#include <algorithm>
+#include "paddle/fluid/framework/details/container_cast.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+DataBalanceOpHandle::DataBalanceOpHandle(
+    const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places)
+    : local_scopes_(local_scopes), places_(places) {}
+
+std::string DataBalanceOpHandle::Name() const { return "data balance"; }
+
+std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
+    const std::vector<int> &device_sizes) {
+  int device_num = device_sizes.size();
+  int total_size = 0;
+  int empty_num = 0;
+  std::vector<std::array<int, 2>> size_device_vec;
+  size_device_vec.reserve(device_num);
+  for (int i = 0; i < device_num; ++i) {
+    if (device_sizes[i] == 0) {
+      ++empty_num;
+    }
+    total_size += device_sizes[i];
+    size_device_vec.push_back({{device_sizes[i], i}});
+  }
+  std::vector<std::array<int, 3>> res;
+  if (empty_num == 0) {
+    // No need to do data balance.
+    return res;
+  }
+  if (total_size < device_num) {
+    // No enough data.
+    PADDLE_THROW("There is no next data.");
+  }
+  std::sort(size_device_vec.begin(), size_device_vec.end(),
+            [](const std::array<int, 2> &a, const std::array<int, 2> &b) {
+              return a[0] > b[0];
+            });
+  int expected_device_size = total_size / device_num;
+  int src_idx = 0;
+  for (int dst_idx = device_num - empty_num; dst_idx < device_num; ++dst_idx) {
+    if (size_device_vec[src_idx][0] <= expected_device_size) {
+      ++src_idx;
+      PADDLE_ENFORCE_LT(src_idx, device_num - empty_num);
+    }
+    size_device_vec[src_idx][0] -= expected_device_size;
+    size_device_vec[dst_idx][0] += expected_device_size;
+    res.push_back({{size_device_vec[src_idx][1], size_device_vec[dst_idx][1],
+                    expected_device_size}});
+  }
+  return res;
+}
+
+void DataBalanceOpHandle::RunImpl() {
+  if (places_.size() == 1) {
+    return;
+  }
+  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
+  auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+  PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size(), out_var_handles.size(),
+      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
+  int data_num = in_var_handles.size() / places_.size();
+  WaitInputVarGenerated();
+
+  std::vector<std::vector<LoDTensor *>> lod_tensors;
+  std::vector<int> device_sizes;
+  for (int i = 0; i < static_cast<int>(in_var_handles.size()); ++i) {
+    PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
+                      "The name of input and output should be equal.");
+    int place_idx = i / data_num;
+    int data_idx = i % data_num;
+    auto *local_scope =
+        local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get<Scope *>();
+    auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name_);
+    PADDLE_ENFORCE(tensor_var->IsType<LoDTensor>());
+    auto *tensor = tensor_var->GetMutable<LoDTensor>();
+    PADDLE_ENFORCE(places_[place_idx] == tensor->place());
+    lod_tensors[data_idx].push_back(tensor);
+    int ins_size =
+        tensor->lod().empty() ? tensor->dims()[0] : tensor->NumElements();
+    if (data_idx == 0) {
+      device_sizes.emplace_back(ins_size);
+    } else {
+      PADDLE_ENFORCE_EQ(ins_size, device_sizes.at(place_idx));
+    }
+  }
+  const auto &balance_plan = GetBalancePlan(device_sizes);
+
+  for (const auto &trans : balance_plan) {
+    for (int data_idx = 0; data_idx < data_num; ++data_idx) {
+      LoDTensor *src_tensor = lod_tensors[data_idx][trans[0]];
+      LoDTensor *dst_tensor = lod_tensors[data_idx][trans[1]];
+      int trans_ins_size = trans[2];
+      LoD src_lod = src_tensor->lod();
+      int src_ins_size =
+          src_lod.empty() ? src_tensor->dims()[0] : src_tensor->NumElements();
+      int cut_point = src_ins_size - trans_ins_size;
+      if (!src_lod.empty()) {
+        for (auto &level : src_lod) {
+          cut_point = level[cut_point];
+        }
+      }
+      TensorCopySync(src_tensor->Slice(cut_point, src_tensor->dims()[0]),
+                     dst_tensor->place(), dst_tensor);
+      src_tensor->ShareDataWith(src_tensor->Slice(0, cut_point));
+      if (!src_lod.empty()) {
+        dst_tensor->set_lod(SliceInLevel(
+            src_lod, 0, src_ins_size - trans_ins_size, src_ins_size));
+        src_tensor->set_lod(
+            SliceInLevel(src_lod, 0, 0, src_ins_size - trans_ins_size));
+      }
+    }
+  }
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.h b/paddle/fluid/framework/details/data_balance_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..00bc4837d66f40e06b332dff92429d85f3b2acfc
--- /dev/null
+++ b/paddle/fluid/framework/details/data_balance_op_handle.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct DataBalanceOpHandle : public OpHandleBase {
+ public:
+  DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
+                      const std::vector<platform::Place> &places);
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+  void RunImpl() override;
+
+ private:
+  // std::vector<(src_dev_id, dst_dev_id, trans_size)>
+  std::vector<std::array<int, 3>> GetBalancePlan(
+      const std::vector<int> &batch_size_per_device);
+
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index e7063fb0423cbe8ad79e46ee8bcb0dcca79b1067..357f6ff5dc294a7bc6edd2d61bdd61ee580ac2d2 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -20,6 +20,7 @@
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/data_balance_op_handle.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/rpc_op_handle.h"
@@ -217,6 +218,11 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
         // gradients.
         CreateComputationalOps(&result, *op, places_.size());
 
+        if (op->Type() == "read") {
+          const auto &data_var_names = op->Output("Out");
+          InsertDataBalanceOp(&result, data_var_names);
+        }
+
         if (!is_forwarding && places_.size() > 1) {
           // Currently, we assume that once gradient is generated, it can be
           // broadcast, and each gradient is only broadcast once.
@@ -360,6 +366,24 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result,
   }
 }
 
+void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
+    SSAGraph *result, const std::vector<std::string> &datas) const {
+  result->ops_.emplace_back(new DataBalanceOpHandle(local_scopes_, places_));
+  auto *op_handle = result->ops_.back().get();
+  for (size_t i = 0; i < places_.size(); ++i) {
+    auto &p = places_[i];
+    SetCommunicationContext(op_handle, p);
+    for (const std::string &d_name : datas) {
+      auto &vars = result->vars_[i][d_name];
+      PADDLE_ENFORCE(!vars.empty());
+      op_handle->AddInput(vars.back().get());
+      auto var = new VarHandle(vars.size(), i, d_name, p);
+      vars.emplace_back(var);
+      op_handle->AddOutput(var);
+    }
+  }
+}
+
 bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
     const std::string &og,
     std::unordered_set<std::string> *og_has_been_broadcast) const {
@@ -509,7 +533,8 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
     op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
     // the variable name which contains .block means it was splited by
     // split_byref op
-    // so that we can balance the variable blocks to all the pserver instances.
+    // so that we can balance the variable blocks to all the pserver
+    // instances.
     if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce &&
         op.InputArgumentNames()[0].find(".block") == std::string::npos) {
       op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index 0b6347bf51dc1c347073a0fdcf4ddd91865d846d..a964e024885e56693224a6199e00ff30beaa1df4 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -101,6 +101,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 
   void InsertAllReduceOp(SSAGraph *result, const std::string &og) const;
 
+  void InsertDataBalanceOp(SSAGraph *result,
+                           const std::vector<std::string> &datas) const;
+
   void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
                          size_t src_dev_id) const;
 
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index d29d8ce1c561e45980d10c17c984ca2ed3b453f3..49672e11818266b6f0328206a9985ca6c9bd4287 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -68,7 +68,7 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
   // only print first ten elements
   int64_t size = t.numel() < 10 ? t.numel() : 10;
   for (int64_t i = 0; i < size; ++i) {
-    if (t.type().hash_code() == typeid(float).hash_code()) {
+    if (t.type().hash_code() == typeid(float).hash_code()) {  // NOLINT
       os << t.data<float>()[i] << " ";
     } else if (t.type().hash_code() == typeid(int64_t).hash_code()) {
       os << t.data<int64_t>()[i] << " ";
@@ -89,6 +89,7 @@ std::string LoDToString(const LoD &lod) {
 LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
                  size_t elem_end) {
   PADDLE_ENFORCE_LT(level, in.size());
+  PADDLE_ENFORCE_LT(elem_begin, elem_end);
   PADDLE_ENFORCE_LT(elem_end, in[level].size());
 
   LoD res;
diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc
index 72a27d43584d55cd0859c63577ae85ff0f5fdfa8..8e9f91c185dda4729230bc9a89eb6441cbc6205b 100644
--- a/paddle/fluid/operators/read_op.cc
+++ b/paddle/fluid/operators/read_op.cc
@@ -66,9 +66,15 @@ class ReadOp : public framework::OperatorBase {
     std::vector<std::string> out_arg_names = Outputs("Out");
     std::vector<framework::LoDTensor> ins;
     reader->ReadNext(&ins);
-    PADDLE_ENFORCE(!ins.empty(), "There is no next data.");
+    if (ins.empty()) {
+      ins.resize(out_arg_names.size());
+      for (auto& tensor : ins) {
+        // data type is not important for subsequent DataBalanceOpHandle
+        tensor.mutable_data<float>(framework::make_ddim({0}), dev_place);
+      }
+    }
     PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
-    for (size_t i = 0; i < ins.size(); ++i) {
+    for (size_t i = 0; i < out_arg_names.size(); ++i) {
       auto* out =
           scope.FindVar(out_arg_names[i])->GetMutable<framework::LoDTensor>();
       out->ShareDataWith(ins[i]);