diff --git a/doc/design/model_format.md b/doc/design/model_format.md
index a1c086775acbee6d05ebf69d7de9c7c3ac2cd36e..e29129fddf775939c9f7a8b49d850d523e6e5a45 100644
--- a/doc/design/model_format.md
+++ b/doc/design/model_format.md
@@ -12,24 +12,22 @@ The topology is saved as a plain text in a detailed self-contain protobuf file.
 
 The parameters are saved as a binary file. As we all know, the protobuf message has a limit of [64M size](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for the task.
 
-As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, the `name` of the tensor, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is, 
-
-|HeaderLength|ContentLength|**LoDTensorDesc**|**TensorValue**|
+As a result, we designed a dedicated format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), which is described by a [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99) proto. We save this description proto as the byte-string header. It contains all the necessary information, such as the `dims` and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores its values in a contiguous memory buffer. For speed, we dump the raw memory to disk and save it as the byte-string content. So, the binary format of one tensor is as follows.
 
 The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format.
 
-```text
-[offset] [type]              [description] 
-0004     4 bytes integer      HeaderLength, the length of LoDTensorDesc
-0008     4 bytes integer      ContentLength, the length of LodTensor Buffer
-0009     1 bytes char         TensorDesc
-00010    1 bytes char         TensorDesc
-...
-00100    1 bytes char         TensorValue
-00101    1 bytes char         TensorValue
-00102    1 bytes char         TensorValue              ..
-...
-```
+|field name  | type | description |
+| --- | --- | --- |
+| version | uint32_t | Version of saved file. Always 0 now. |
+| tensor desc length | uint32_t | TensorDesc(Protobuf message) length in bytes. |
+| tensor desc | void* | TensorDesc protobuf binary message |
+| tensor data | void* | Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` |
+| lod_level | uint64_t | Level of LoD |
+| length of lod[0] | uint64_t | [Optional] length of lod[0] in bytes. |
+| data of lod[0] | uint64_t*  | [Optional] lod[0].data() |
+| ... | ... | ... |
+
+
 
 ## Summary
 
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 85374a476d51dc4c0e22793e8b53d6d7ba21c8da..0a77859d6148f636dacef2c6759fc00d387f5d5d 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -1,6 +1,5 @@
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)
-proto_library(saver_proto SRCS framework.proto saver.proto)
 
 cc_library(ddim SRCS ddim.cc DEPS eigen3)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
@@ -10,7 +9,7 @@ cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
-cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor saver_proto framework_proto)
+cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
 
diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h
index c25a62c2b11ead614d93a4be8d63d40d0cc0165a..bafb4fbd480bf2a28e3aa3dc615a310f80cec493 100644
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <typeindex>
 #include "paddle/framework/framework.pb.h"
+#include "paddle/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 731235cd986c152c9504a49c6c07ed17d16bfdfb..584308a5388da0d02d29f71a28097b02b6ea825f 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -13,7 +13,6 @@
    limitations under the License. */
 
 #include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/saver.pb.h"
 
 #include "paddle/memory/memcpy.h"
 #include "paddle/memory/memory.h"
@@ -136,141 +135,5 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
   PADDLE_ENFORCE_LT(begin, end, "Cannot shrink, the result tensor is empty.");
   ShareDataWith(Slice(begin, end));
 }
-
-std::string LoDTensor::SerializeToString() const {
-  LoDTensorProto desc;
-
-  // set data_type
-  if (this->type() == typeid(int8_t)) desc.set_data_type(DataType::BOOL);
-  if (this->type() == typeid(int16_t)) desc.set_data_type(DataType::INT16);
-  if (this->type() == typeid(int32_t)) desc.set_data_type(DataType::INT32);
-  if (this->type() == typeid(int64_t)) desc.set_data_type(DataType::INT64);
-  // FIXME(dzh): there is no fp16 in standard c++
-
-  if (this->type() == typeid(float))  // NOLINT
-    desc.set_data_type(DataType::FP32);
-  if (this->type() == typeid(double))  // NOLINT
-    desc.set_data_type(DataType::FP64);
-
-  for (int i = 0; i < dims().size(); ++i) {
-    desc.add_dims(dims()[i]);
-  }
-
-  // set lod information
-  desc.set_lod_level(this->NumLevels());
-  for (size_t i = 0; i < this->NumLevels(); ++i) {
-    LoDInfo* lod = desc.add_levels();
-    for (size_t j = 0; j < lod_[i].size(); ++j) {
-      lod->add_level(lod_[i][j]);
-    }
-  }
-
-  desc.set_version(0);
-
-  std::string desc_bytes = desc.SerializeAsString();
-
-  // FIXME(dzh) : implement fix chunk size buffer.
-  size_t DESC_SIZE = desc_bytes.size();
-  size_t DATA_SIZE = holder_->size() - offset_;
-
-  const size_t BUFFER_SIZE = DESC_SIZE + DATA_SIZE + 2 * sizeof(size_t);
-  char* buffer =
-      static_cast<char*>(memory::Alloc(platform::CPUPlace(), BUFFER_SIZE));
-
-  // format: desc_size data_size, desc_bytes, data_bytes.
-  platform::CPUPlace src_place;
-  platform::CPUPlace dst_place;
-
-  memory::Copy(dst_place, buffer, src_place, &BUFFER_SIZE, sizeof(size_t));
-  memory::Copy(dst_place, buffer + sizeof(size_t), src_place, &DESC_SIZE,
-               sizeof(size_t));
-  memory::Copy(dst_place, buffer + sizeof(size_t) * 2, src_place,
-               desc_bytes.c_str(), desc_bytes.size());
-
-  PADDLE_ENFORCE(this->numel() != 0, "Serialize a empty Tensor!");
-
-  platform::Place place = holder_->place();
-  int element_width = holder_->size() / this->numel();
-
-  if (platform::is_cpu_place(place)) {
-    memory::Copy(dst_place, buffer + sizeof(size_t) * 2 + desc_bytes.size(),
-                 boost::get<platform::CPUPlace>(place),
-                 static_cast<char*>(holder_->ptr()) + offset_ / element_width,
-                 DATA_SIZE);
-  }
-#ifdef PADDLE_WITH_GPU
-  if (platform::is_gpu_place(place)) {
-    memory::Copy(dst_place, buffer + sizeof(size_t) * 2 + desc_bytes.size(),
-                 boost::get<platform::GPUPlace>(place),
-                 static_cast<char*>(holder_->ptr()) + offset_ / element_width,
-                 DATA_SIZE);
-  }
-#endif
-
-  std::string ret(buffer, BUFFER_SIZE);
-  memory::Free(platform::CPUPlace(), buffer);
-  return ret;
-}
-
-void LoDTensor::DeserializeFromString(const std::string& s,
-                                      const platform::Place& dst_place) {
-  size_t DESC_SIZE, BUFFER_SIZE;
-  platform::CPUPlace src_place;
-
-  memory::Copy(src_place, &BUFFER_SIZE, src_place, s.c_str(), sizeof(size_t));
-  memory::Copy(src_place, &DESC_SIZE, src_place, s.c_str() + sizeof(size_t),
-               sizeof(size_t));
-
-  const size_t DATA_SIZE = BUFFER_SIZE - DESC_SIZE - sizeof(size_t) * 2;
-
-  // parse LoDTensorDesc
-  LoDTensorProto desc;
-  desc.ParseFromArray(s.c_str() + sizeof(size_t) * 2, DESC_SIZE);
-
-  std::vector<int64_t> dims;
-  std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
-  this->Resize(make_ddim(dims));
-
-  // parse data type
-  void* ptr = nullptr;
-  if (desc.data_type() == DataType::BOOL)
-    ptr = this->mutable_data<bool>(dst_place);
-  if (desc.data_type() == DataType::INT16)
-    ptr = this->mutable_data<int16_t>(dst_place);
-  if (desc.data_type() == DataType::INT32)
-    ptr = this->mutable_data<int32_t>(dst_place);
-  if (desc.data_type() == DataType::INT64)
-    ptr = this->mutable_data<int64_t>(dst_place);
-  // FIXME(dzh): there is no fp16 in standard c++
-
-  if (desc.data_type() == DataType::FP32)
-    ptr = this->mutable_data<float>(dst_place);
-  if (desc.data_type() == DataType::FP64)
-    ptr = this->mutable_data<double>(dst_place);
-
-  LoD lod;
-  std::vector<size_t> levels;
-  for (int i = 0; i < desc.levels().size(); ++i) {
-    auto current_level = desc.levels()[i].level();
-    std::copy(current_level.begin(), current_level.end(),
-              std::back_inserter(levels));
-    lod.emplace_back(levels);
-    levels.clear();
-  }
-
-  this->set_lod(lod);
-
-  if (platform::is_cpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::CPUPlace>(dst_place), ptr, src_place,
-                 s.c_str() + sizeof(size_t) * 2 + DESC_SIZE, DATA_SIZE);
-  }
-#ifdef PADDLE_WITH_GPU
-  if (platform::is_gpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::GPUPlace>(dst_place), ptr, src_place,
-                 s.c_str() + sizeof(size_t) * 2 + DESC_SIZE, DATA_SIZE);
-  }
-#endif
-}
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 735d85f750c30c78e74018b971f8e32fe9f4c8bb..f4fe4cdac6019a1899fd3db8e1b6ca588be0d436 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -85,7 +85,9 @@ class LoDTensor : public Tensor {
 
   void set_lod(const LoD& lod) { lod_ = lod; }
 
-  LoD lod() const { return lod_; }
+  const LoD& lod() const { return lod_; }  // read-only reference, no copy
+
+  LoD* mutable_lod() { return &lod_; }  // pointer to the LoD for in-place edits
 
   /*
    * Get the start offset and end offset of an  element from LoD.
@@ -139,27 +141,6 @@ class LoDTensor : public Tensor {
    */
   void ShrinkInLevel(size_t level, size_t elem_begin, size_t elem_end);
 
-  /**
-   *  @brief Serialize tensor to char bytes.
-   *  Please check model_format.md for the format detail.
-   *  NOTE: GPUTensor will copy data to cpu implicitly.
-   *  @return return string
-   */
-
-  // FIXME(dzh) : Currently, this interface should only be used in
-  // save/restore model and checkpoint. ParameterServer do not use shape
-  // information to do the optimization, as a result, when we serialize
-  // parameter/gradient to string, we should serialize the tensor
-  // to string in the ps trainer instead of LoDTensor.
-  std::string SerializeToString() const;
-
-  /**
-   *  @brief Deserialize char bytes to tensor.
-   *  @return return string
-   */
-  void DeserializeFromString(const std::string& s,
-                             const platform::Place& dst_place);
-
  private:
   LoD lod_;
 };
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index f309376c8b65e2ce83d0df20496d53cf7e9f3ea9..aa2f6c993d41ae98e0769d470dccad3b410da53e 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -144,21 +144,5 @@ TEST(LodExpand, test) {
   }
 }
 
-TEST_F(LoDTensorTester, SerializeDeserialize) {
-  LoDTensor new_lod_tensor = lod_tensor_;
-  float* src_ptr = lod_tensor_.data<float>();
-  std::string s = lod_tensor_.SerializeToString();
-  LoDTensor dst;
-  dst.DeserializeFromString(s, platform::CPUPlace());
-  float* dst_ptr = dst.data<float>();
-  for (int i = 0; i < kLodTensorSize; ++i) {
-    EXPECT_EQ(dst_ptr[i], src_ptr[i]);
-  }
-
-  ASSERT_EQ(dst.NumElements(0), 2UL);
-  ASSERT_EQ(dst.NumElements(1), 3UL);
-  ASSERT_EQ(dst.NumElements(2), 8UL);
-}
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/lod_tensor_test.cu b/paddle/framework/lod_tensor_test.cu
index 11659be02ac340728150cf0a6438db8626c8e611..c79c4d0c721f9e568c937cb9e524e925fcdc83d0 100644
--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
@@ -47,31 +47,4 @@ TEST(LoDTensor, LoDInGPU) {
   for (size_t i = 0; i < src_lod[0].size(); ++i) {
     CHECK_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
   }
-}
-
-TEST(LoDTensor, SerializeDeserialize) {
-  paddle::framework::LoDTensor lod_tensor;
-  paddle::platform::GPUPlace place(0);
-
-  paddle::framework::LoD src_lod;
-  src_lod.push_back(std::vector<size_t>{0, 2, 4, 6, 8, 10, 12, 14});
-
-  lod_tensor.Resize({14, 16});
-  lod_tensor.mutable_data<float>(place);
-
-  lod_tensor.set_lod(src_lod);
-  CHECK_EQ(lod_tensor.lod_element(0, 2).first, 4UL);
-  CHECK_EQ(lod_tensor.lod_element(0, 4).first, 8UL);
-
-  test<<<1, 8>>>(src_lod[0].data(), src_lod[0].size());
-  cudaDeviceSynchronize();
-
-  std::string s = lod_tensor.SerializeToString();
-  paddle::framework::LoDTensor dst;
-  dst.DeserializeFromString(s, place);
-  paddle::framework::LoD dst_lod = dst.lod();
-
-  for (size_t i = 0; i < dst_lod[0].size(); ++i) {
-    CHECK_EQ(src_lod[0].data()[i], dst_lod[0].data()[i] * 2);
-  }
-}
+}
\ No newline at end of file
diff --git a/paddle/framework/saver.proto b/paddle/framework/saver.proto
deleted file mode 100644
index 90a191a6a79250761489b68916b1fa09116830f2..0000000000000000000000000000000000000000
--- a/paddle/framework/saver.proto
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-syntax = "proto2";
-option optimize_for = LITE_RUNTIME;
-package paddle.framework;
-
-import "framework.proto";
-
-/**
- * This file contains necessary information for model, checkpoint.
- * etc.
- */
-
-message LoDInfo { repeated int64 level = 1; }
-
-/**
- * Save the LoDTensorDesc information through LoDTensorProto, its data memory
- * is copyed to c buffer immediately. See model_format.md for details.
- */
-
-message LoDTensorProto {
-  optional DataType data_type = 1;
-  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
-  repeated LoDInfo levels = 3;
-  optional int32 lod_level = 4 [ default = 0 ];
-  optional int32 version = 5;
-}
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index e31472327dbca45dc12ea2c9e494beddd36860dc..9d2dc6a32bb2d4f6368fd9c7264c55fb9588819c 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -132,6 +132,8 @@ class Tensor {
 
   std::type_index type() const { return holder_->type(); }
 
+  size_t memory_size() const;
+
  private:
   inline void check_memory_size() const;
 
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index f6e801bbb4a056b5590da95a4b140cb90638f322..29ac683f48fcde4dd3b5ad7f04b5d1d7434706ba 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -62,12 +62,16 @@ inline void Tensor::check_memory_size() const {
   PADDLE_ENFORCE_NOT_NULL(
       holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
   PADDLE_ENFORCE_GE(
-      holder_->size(), numel() * SizeOfType(type()) + offset_,
+      holder_->size(), memory_size() + offset_,
       "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
       "first to re-allocate memory.\n"
       "or maybe the required data-type mismatches the data already stored.");
 }
 
+inline size_t Tensor::memory_size() const {
+  return holder_ ? numel() * SizeOfType(type()) : 0UL;  // 0 if unallocated
+}
+
 template <typename T>
 inline const T* Tensor::data() const {
   check_memory_size();
diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h
index a80f0e66b5a59bf95efc200d159ad5dd9cf4111a..cde5ec2413ad01a0396e19fa617688af0eafbc75 100644
--- a/paddle/framework/variable.h
+++ b/paddle/framework/variable.h
@@ -46,6 +46,8 @@ class Variable {
            std::type_index(typeid(T)) == std::type_index(holder_->Type());
   }
 
+  void Clear() { holder_.reset(); }  // resets holder_, dropping the held value
+
  private:
   struct Placeholder {
     virtual ~Placeholder() {}
diff --git a/paddle/memory/memcpy.h b/paddle/memory/memcpy.h
index 9b36182c2b619317da31310141823442d8fd3f94..29c20e18601b71bac5201df8ff0c7ce0bed702dc 100644
--- a/paddle/memory/memcpy.h
+++ b/paddle/memory/memcpy.h
@@ -54,6 +54,5 @@ void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
           cudaStream_t stream);
 
 #endif
-
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index d2d70d8be71208cfa9673f6a6936b1bca16d7426..1ca4ba29d7f1b5e4aeecf7d352f68c1717f288a4 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -82,7 +82,7 @@ function(op_library TARGET)
         # It's enough to just adding one operator to pybind
         file(APPEND ${pybind_file} "USE_OP(sigmoid);\n")
     endif()
-    
+
     # reduce_op contains several operators
     if ("${TARGET}" STREQUAL "reduce_op")
         set(pybind_flag 1)
@@ -148,3 +148,4 @@ cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
 cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc DEPS dynamic_recurrent_op recurrent_op tensor_array)
+cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2d4eff0c35af520dd27b9eb197937026a8fbdff9
--- /dev/null
+++ b/paddle/operators/load_op.cc
@@ -0,0 +1,132 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+
+#include <fstream>
+
+namespace paddle {
+namespace operators {
+
+// Reads one LoDTensor from the file named by attribute "file_path" into "Out".
+// Layout: uint32 version | int32 desc size | TensorDesc | tensor data |
+// uint64 lod_level | per level: uint64 size in bytes, then that level's data.
+class LoadOp : public framework::OperatorBase {
+ public:
+  LoadOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto filename = Attr<std::string>("file_path");
+    std::ifstream fin(filename, std::ios::binary);  // raw bytes, no text mode
+    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
+                   filename);
+
+    auto out_var_name = Output("Out");
+    auto *out_var = scope.FindVar(out_var_name);
+    PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
+                   out_var_name);
+    auto *tensor = out_var->GetMutable<framework::LoDTensor>();
+    uint32_t version = 0;
+    fin.read(reinterpret_cast<char *>(&version), sizeof(version));
+    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read file version");
+    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+    framework::TensorDesc desc;
+    {  // int32_t size, followed by the serialized TensorDesc message
+      int32_t size = -1;
+      fin.read(reinterpret_cast<char *>(&size), sizeof(size));
+      PADDLE_ENFORCE(fin && size >= 0, "Cannot read tensor desc size");
+      std::unique_ptr<char[]> buf(new char[size]);
+      fin.read(buf.get(), size);
+      PADDLE_ENFORCE(fin && desc.ParseFromArray(buf.get(), size),
+                     "Cannot parse tensor desc");
+    }
+    {  // read tensor
+      std::vector<int64_t> dims(desc.dims().begin(), desc.dims().end());
+      tensor->Resize(framework::make_ddim(dims));
+      void *buf = nullptr;
+      platform::Place cpu = platform::CPUPlace();
+      switch (desc.data_type()) {
+        case framework::FP32:
+          buf = tensor->mutable_data<float>(cpu);
+          break;
+        case framework::FP64:
+          buf = tensor->mutable_data<double>(cpu);
+          break;
+        case framework::INT32:
+          buf = tensor->mutable_data<int>(cpu);
+          break;
+        case framework::INT64:
+          buf = tensor->mutable_data<int64_t>(cpu);
+          break;
+        default:
+          PADDLE_THROW("DataType %d not supported", desc.data_type());
+      }
+      fin.read(static_cast<char *>(buf), tensor->memory_size());
+      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read tensor data");
+    }
+    {  // read lod
+      uint64_t lod_level = 0;
+      fin.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
+      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read lod level");
+      auto &lod = *tensor->mutable_lod();
+      lod.resize(lod_level);
+      for (auto &level : lod) {
+        uint64_t size = 0;  // bytes; must hold whole size_t items
+        fin.read(reinterpret_cast<char *>(&size), sizeof(size));
+        PADDLE_ENFORCE(fin && size % sizeof(size_t) == 0, "Bad lod size");
+        level.resize(size / sizeof(size_t));
+        fin.read(reinterpret_cast<char *>(level.data()),
+                 static_cast<std::streamsize>(size));
+        PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read lod data");
+      }
+    }
+
+    auto place = dev_ctx.GetPlace();
+    if (platform::is_gpu_place(place)) {
+      // copy CPU to GPU
+      framework::LoDTensor cpu_tensor;
+      cpu_tensor.ShareDataWith(*tensor);
+      cpu_tensor.set_lod(tensor->lod());
+      // reset tensor
+      out_var->Clear();
+      tensor = out_var->GetMutable<framework::LoDTensor>();
+      tensor->set_lod(cpu_tensor.lod());
+      tensor->CopyFrom(cpu_tensor, place, dev_ctx);
+    }
+  }
+};
+
+class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LoadOpProtoMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Out", "The tensor need to be loaded");
+    AddAttr<std::string>("file_path",
+                         "Variable will be loaded from \"file_path\".")
+        .AddCustomChecker(
+            [](const std::string &p) { return !p.empty(); });
+    AddComment(R"DOC(Load Operator
+Load operator will load a tensor variable from disk file.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker);
diff --git a/paddle/operators/save_load_op_test.cc b/paddle/operators/save_load_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fe2b15ec09c6d29ad5f78e5c36f534c6a88497e6
--- /dev/null
+++ b/paddle/operators/save_load_op_test.cc
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "gtest/gtest.h"
+#include "paddle/framework/op_registry.h"
+
+USE_NO_KERNEL_OP(save);
+USE_NO_KERNEL_OP(load);
+
+TEST(SaveLoadOp, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+  paddle::platform::CPUDeviceContext ctx(place);
+  auto var = scope.Var("test_var");
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({10, 10});
+  paddle::framework::LoD expect_lod;
+  expect_lod.resize(1);
+  expect_lod[0].push_back(0);
+  expect_lod[0].push_back(1);
+  expect_lod[0].push_back(2);
+  expect_lod[0].push_back(3);
+
+  tensor->set_lod(expect_lod);
+  int* expect = tensor->mutable_data<int>(place);
+  const size_t numel =
+      static_cast<size_t>(paddle::framework::product(tensor->dims()));
+  for (size_t i = 0; i < numel; ++i) expect[i] = static_cast<int>(i);
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string("tensor.save")});
+
+  auto save_op = paddle::framework::OpRegistry::CreateOp(
+      "save", {{"X", {"test_var"}}}, {}, attrs);
+  save_op->Run(scope, ctx);
+
+  auto load_var = scope.Var("out_var");
+  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
+  auto load_op = paddle::framework::OpRegistry::CreateOp(
+      "load", {}, {{"Out", {"out_var"}}}, attrs);
+  load_op->Run(scope, ctx);
+  ASSERT_EQ(tensor->memory_size(), target->memory_size());  // guards reads
+  int* actual = target->data<int>();
+  for (size_t i = 0; i < numel; ++i) EXPECT_EQ(expect[i], actual[i]);
+  auto& actual_lod = target->lod();
+  ASSERT_EQ(expect_lod.size(), actual_lod.size());
+  for (size_t i = 0; i < expect_lod.size(); ++i) {
+    ASSERT_EQ(expect_lod[i].size(), actual_lod[i].size());
+    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
+      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
+    }
+  }
+}
diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..490256dfa1cf9b891713dac264e9260906ce1025
--- /dev/null
+++ b/paddle/operators/save_op.cc
@@ -0,0 +1,184 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <stdint.h>
+#include <sys/stat.h>
+#include <fstream>
+#include <numeric>
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// TODO(yuyang18): If the functions below are needed by other files, move them
+// to paddle::filesystem namespace.
+constexpr char kSEP = '/';
+static bool FileExists(const std::string &filepath) {
+  struct stat st;  // contents unused; only stat()'s return code matters
+  return ::stat(filepath.c_str(), &st) == 0;
+}
+
+// Returns everything before the last kSEP in `filepath`; yields an empty
+// string when `filepath` contains no separator (or only a leading one).
+static std::string DirName(const std::string &filepath) {
+  const auto sep = filepath.rfind(kSEP);
+  if (sep == std::string::npos) return "";
+  return filepath.substr(0, sep);
+}
+
+// Creates `path` with mode 0755; an already-existing path is not an error.
+static void MkDir(const char *path) {
+  if (mkdir(path, 0755) == 0) return;
+  PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
+}
+
+// Creates `fullpath` and any missing ancestors, parents first. Recursion stops
+// at an empty path or at the first path that already exists.
+static void MkDirRecursively(const char *fullpath) {
+  if (fullpath[0] == '\0' || FileExists(fullpath)) return;
+  MkDirRecursively(DirName(fullpath).c_str());
+  MkDir(fullpath);
+}
+
+// Writes the LoDTensor variable "X" to the file named by "file_path". Layout:
+// uint32 version | int32 desc size | TensorDesc | data | uint64 lod_level | lod
+class SaveOp : public framework::OperatorBase {
+ public:
+  SaveOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto filename = Attr<std::string>("file_path");
+    auto overwrite = Attr<bool>("overwrite");
+    if (FileExists(filename) && !overwrite) {
+      PADDLE_THROW("%s is existed, cannot save to it when overwrite=false",
+                   filename);
+    }
+
+    // Validate the input first so a bad variable cannot truncate the file.
+    auto iname = Input("X");
+    auto *var = scope.FindVar(iname);
+    PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op",
+                   iname);
+    PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
+                   "SaveOp only support LoDTensor, %s has wrong type", iname);
+    auto &tensor = var->Get<framework::LoDTensor>();
+
+    MkDirRecursively(DirName(filename).c_str());
+
+    // FIXME(yuyang18): We save variable to local file now, but we should change
+    // it to save an output stream.
+    std::ofstream fout(filename, std::ios::binary);  // raw bytes, no text mode
+    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+                   filename);
+
+    {  // the 1st field, uint32_t version
+      constexpr uint32_t version = 0;
+      fout.write(reinterpret_cast<const char *>(&version), sizeof(version));
+    }
+    {  // the 2nd field, tensor description
+       // int32_t  size
+       // void*    protobuf message
+      framework::TensorDesc desc;
+      desc.set_data_type(framework::ToDataType(tensor.type()));
+      auto dims = framework::vectorize(tensor.dims());
+      auto *pb_dims = desc.mutable_dims();
+      pb_dims->Resize(static_cast<int>(dims.size()), 0);
+      std::copy(dims.begin(), dims.end(), pb_dims->begin());
+      auto out = desc.SerializeAsString();
+      int32_t size = static_cast<int32_t>(out.size());  // bytes written below
+      fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
+      fout.write(out.data(), size);
+    }
+    {  // the 3rd field, tensor data
+      uint64_t size = tensor.memory_size();
+      auto *data_ptr = tensor.data<void>();
+      PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
+                     "Index overflow when writing tensor");
+      if (platform::is_gpu_place(tensor.place())) {
+#ifdef PADDLE_WITH_CUDA
+        constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
+        std::unique_ptr<char[]> buf(new char[kBufSize]);
+        auto &gpu_dev_ctx =
+            static_cast<const platform::CUDADeviceContext &>(dev_ctx);
+        platform::CPUPlace cpu;
+        uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
+        while (size != 0) {
+          size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
+          memory::Copy(cpu, buf.get(),
+                       boost::get<platform::GPUPlace>(tensor.place()),
+                       reinterpret_cast<const void *>(data), size_to_write,
+                       gpu_dev_ctx.stream());
+          gpu_dev_ctx.Wait();
+          fout.write(buf.get(), size_to_write);
+          data += size_to_write;
+          size -= size_to_write;
+        }
+#else
+        PADDLE_THROW("Unexpected branch");
+#endif
+      } else {
+        fout.write(static_cast<const char *>(data_ptr),
+                   static_cast<std::streamsize>(size));
+      }
+    }
+    {  // the 4th field, lod information
+       // uint64_t lod_level
+       // uint64_t lod_level_1 size in byte.
+       // int*     lod_level_1 data
+       // ...
+      const auto &lod = tensor.lod();  // bind by reference; no LoD copy
+      uint64_t size = lod.size();
+      fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
+      for (auto &each : lod) {
+        size = each.size() * sizeof(framework::LoD::value_type::value_type);
+        fout.write(reinterpret_cast<const char *>(&size), sizeof(size));
+        fout.write(reinterpret_cast<const char *>(each.data()),
+                   static_cast<std::streamsize>(size));
+      }
+    }
+    PADDLE_ENFORCE(static_cast<bool>(fout.flush()), "Cannot save %s", filename);
+  }
+};
+
+class SaveOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SaveOpProtoMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The tensor need to be saved");
+    AddAttr<bool>("overwrite", "Overwrite the output file if exist")
+        .SetDefault(true);
+    AddAttr<std::string>("file_path",
+                         "Variable will be saved to \"file_path\".")
+        .AddCustomChecker(
+            [](const std::string &p) { return !p.empty(); });
+    AddComment(R"DOC(Save operator
+Save operator will serialize and write a tensor variable to disk file.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(save, ops::SaveOp, ops::SaveOpProtoMaker);
diff --git a/paddle/operators/save_restore_op.cc b/paddle/operators/save_restore_op.cc
deleted file mode 100644
index 314e4e927924bf0442b7afe0184bf344e24c1521..0000000000000000000000000000000000000000
--- a/paddle/operators/save_restore_op.cc
+++ /dev/null
@@ -1,147 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-
-#include <fstream>
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-using framework::LoDTensor;
-
-inline static std::string VarToFileName(const std::string& folder_path,
-                                        const std::string& var_name) {
-  return folder_path + "/__" + var_name + "__";
-}
-
-class SaveOp : public framework::OperatorBase {
- public:
-  SaveOp(const std::string& type, const framework::VariableNameMap& inputs,
-         const framework::VariableNameMap& outputs,
-         const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
-    const auto& var_names = this->Inputs("X");
-    for (const auto& name : var_names) {
-      PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
-                              "Can not find variable '%s' in the scope.", name);
-    }
-    std::string folder_path = this->Attr<std::string>("folderPath");
-    PADDLE_ENFORCE(!folder_path.empty(),
-                   "'folderPath' of SaveOp shouldn't be empty.");
-
-    VLOG(1) << "Save variables to folder: " << folder_path;
-    for (const auto& name : var_names) {
-      std::string file_name = VarToFileName(folder_path, name);
-      std::ofstream fout(file_name, std::ofstream::out);
-      PADDLE_ENFORCE(fout.is_open(), "Fail to create file %s.", file_name);
-      const LoDTensor& tensor = scope.FindVar(name)->Get<LoDTensor>();
-      std::string bytes = tensor.SerializeToString();
-      fout << bytes;
-      fout.close();
-    }
-    VLOG(1) << "Compelete saving variables. Items count: " << var_names.size();
-  }
-};
-
-class SaveOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  SaveOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X",
-             "(tensor), the tensor count can be 1~INT_MAX, tensors names which "
-             "values will be saved.")
-        .AsDuplicable();
-    AddAttr<std::string>("folderPath", "the folderPath for save model.");
-    AddComment(R"DOC(
-Save the input tensors to a binary file based on input tensor names and absolute path.
-
-All the inputs can carry the LoD (Level of Details) information,
-or not.
-)DOC");
-  }
-};
-
-class RestoreOp : public framework::OperatorBase {
- public:
-  RestoreOp(const std::string& type, const framework::VariableNameMap& inputs,
-            const framework::VariableNameMap& outputs,
-            const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void Run(const framework::Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {
-    const auto& var_names = this->Outputs("Out");
-    for (const auto& name : var_names) {
-      PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
-                              "Can not find variable '%s' in the scope.", name);
-    }
-    std::string folder_path = this->Attr<std::string>("folderPath");
-    PADDLE_ENFORCE(!folder_path.empty(),
-                   "'folderPath' of RestoreOp shouldn't be empty.");
-
-    VLOG(1) << "Try loading variables from folder: " << folder_path;
-
-    for (const auto& name : var_names) {
-      std::string file_name = VarToFileName(folder_path, name);
-      std::ifstream fin(file_name, std::ifstream::in);
-      PADDLE_ENFORCE(fin.is_open(), "Fail to open file %s.", file_name);
-      const size_t kBufferSize = 4096;  // equal to linux page size
-      char buffer[kBufferSize];
-      std::string cache;
-      while (!fin.eof()) {
-        fin.read(buffer, kBufferSize);
-        cache.append(buffer, fin.gcount());
-      }
-      LoDTensor* tensor = scope.FindVar(name)->GetMutable<LoDTensor>();
-      tensor->DeserializeFromString(cache, dev_ctx.GetPlace());
-      fin.close();
-    }
-    VLOG(1) << "Complete loading variables.";
-  }
-};
-
-class RestoreOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  RestoreOpMaker(framework::OpProto* proto,
-                 framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddOutput("Out",
-              "(tensor), the tensor count can be 1~INT_MAX, tensors which "
-              "values will be restores.")
-        .AsDuplicable();
-    AddAttr<std::string>("folderPath", "the folderPath for model file.");
-    AddAttr<int>("data_type", "output tensor data type")
-        .SetDefault(framework::DataType::FP32);
-    AddComment(R"DOC(
-Restore the tensors from model file based on absolute path.
-
-All the tensors outputs may carry the LoD (Level of Details) information,
-or not.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(save, paddle::operators::SaveOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::SaveOpMaker);
-
-REGISTER_OPERATOR(restore, paddle::operators::RestoreOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::RestoreOpMaker);
diff --git a/python/paddle/v2/framework/framework.py b/python/paddle/v2/framework/framework.py
index b3f8be8be9ac5c0c6c15646d39d4796df0fd87e2..8f28d3e76688234747c75dda53e7316a202dfd14 100644
--- a/python/paddle/v2/framework/framework.py
+++ b/python/paddle/v2/framework/framework.py
@@ -261,7 +261,7 @@ class Operator(object):
                     self.desc.set_attr(attr_name, attrs[attr_name])
 
         self.desc.check_attrs()
-        no_kernel_op_set = {'feed', 'fetch', 'save', 'restore'}
+        no_kernel_op_set = {'feed', 'fetch', 'save', 'load'}
         if type not in no_kernel_op_set:
             self.desc.infer_var_type(self.block.desc)
             self.desc.infer_shape(self.block.desc)
diff --git a/python/paddle/v2/framework/tests/test_save_restore_op.py b/python/paddle/v2/framework/tests/test_save_restore_op.py
deleted file mode 100644
index 3a36d03f62a7ad50f656e5c3fdb8c87548a120e8..0000000000000000000000000000000000000000
--- a/python/paddle/v2/framework/tests/test_save_restore_op.py
+++ /dev/null
@@ -1,71 +0,0 @@
-import paddle.v2.framework.core as core
-import paddle.v2.framework.framework as framework
-import paddle.v2.framework.executor as executor
-
-import numpy as np
-import unittest
-import os
-import sys
-import shutil
-
-FOLDER_PATH = "./tmp_test_dir"
-
-
-class TestSaveRestoreOp(unittest.TestCase):
-    def test_save_restore_op(self):
-        tensor_1_val = np.random.rand(3, 9).astype("float32")
-        tensor_2_val = np.random.randint(0, 20, size=(4, 2)).astype("int32")
-        place = core.CPUPlace()
-
-        program = framework.Program()
-        block = program.global_block()
-        v_a = block.create_var(
-            dtype="float32", shape=[3, 9], lod_level=0, name="tensor_1")
-        v_b = block.create_var(
-            dtype="int32", shape=[4, 2], lod_level=0, name="tensor_2")
-
-        t_1 = core.LoDTensor()
-        t_1.set(tensor_1_val, place)
-        t_2 = core.LoDTensor()
-        t_2.set(tensor_2_val, place)
-        block.append_op(
-            type="save",
-            inputs={"X": [v_a, v_b]},
-            attrs={"folderPath": FOLDER_PATH})
-        block.append_op(
-            type="fill_constant",
-            outputs={"Out": [v_a]},
-            attrs={"shape": [2, 2],
-                   "value": 0.0})
-        block.append_op(
-            type="fill_constant",
-            outputs={"Out": [v_b]},
-            attrs={"shape": [2, 2],
-                   "value": 0.0})
-        block.append_op(
-            type="restore",
-            outputs={"Out": [v_a, v_b]},
-            attrs={"folderPath": FOLDER_PATH})
-
-        if os.path.exists(FOLDER_PATH):
-            shutil.rmtree(FOLDER_PATH)
-        os.makedirs(FOLDER_PATH)
-
-        exe = executor.Executor(place)
-        out = exe.run(program,
-                      feed={"tensor_1": t_1,
-                            "tensor_2": t_2},
-                      fetch_list=[v_a, v_b])
-
-        self.assertTrue(os.path.isdir(FOLDER_PATH))
-        self.assertTrue(os.path.isfile(FOLDER_PATH + "/__tensor_1__"))
-        self.assertTrue(os.path.isfile(FOLDER_PATH + "/__tensor_2__"))
-
-        self.assertTrue(np.array_equal(np.array(out[0]), tensor_1_val))
-        self.assertTrue(np.array_equal(np.array(out[1]), tensor_2_val))
-
-        shutil.rmtree(FOLDER_PATH)
-
-
-if __name__ == "__main__":
-    unittest.main()