diff --git a/benchmark/fluid/machine_translation.py b/benchmark/fluid/machine_translation.py
index d7a421c10979c3b9d6865a8c0b99a6410e0f46a8..adde5f21acd4e77d58a453d6868abeccfca4bb5a 100644
--- a/benchmark/fluid/machine_translation.py
+++ b/benchmark/fluid/machine_translation.py
@@ -21,7 +21,7 @@ import argparse
 import time
 import distutils.util
 
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.framework as framework
diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
index dc10ac2ec195acc9a5693718141ddb32417dfb71..1e2185dfac1072d1f1046f4616a9d53a8fc76061 100644
--- a/benchmark/fluid/mnist.py
+++ b/benchmark/fluid/mnist.py
@@ -20,7 +20,7 @@ import numpy as np
 import argparse
 import time
 
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.profiler as profiler
 
diff --git a/benchmark/fluid/resnet.py b/benchmark/fluid/resnet.py
index 1af5eaf6b46be47cb6b778cedcf53830c201ef39..831fa2c019fc2868cd85b1ca7b2c8c76a2f1628c 100644
--- a/benchmark/fluid/resnet.py
+++ b/benchmark/fluid/resnet.py
@@ -23,7 +23,7 @@ import time
 
 import cProfile, pstats, StringIO
 
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.profiler as profiler
diff --git a/benchmark/fluid/stacked_dynamic_lstm.py b/benchmark/fluid/stacked_dynamic_lstm.py
index 5fcbdd64af9dc196c9d5b2b82ce4213478ea1418..73bcc47b4d404af2c01d61ca3dfb11971bbcfe9c 100644
--- a/benchmark/fluid/stacked_dynamic_lstm.py
+++ b/benchmark/fluid/stacked_dynamic_lstm.py
@@ -23,10 +23,10 @@ import random
 import time
 
 import numpy
-import paddle.v2 as paddle
-import paddle.v2.dataset.imdb as imdb
+import paddle
+import paddle.dataset.imdb as imdb
 import paddle.fluid as fluid
-from paddle.v2 import batch
+from paddle import batch
 import paddle.fluid.profiler as profiler
 
 
diff --git a/benchmark/fluid/vgg.py b/benchmark/fluid/vgg.py
index 9d990eff62ec368dc7033f55cc0862fa974a64e0..53e34e0cbd15914791c305db6797f826ebfae34e 100644
--- a/benchmark/fluid/vgg.py
+++ b/benchmark/fluid/vgg.py
@@ -17,7 +17,7 @@ from __future__ import print_function
 import sys
 import time
 import numpy as np
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import argparse
diff --git a/doc/v2/api/data/dataset.rst b/doc/v2/api/data/dataset.rst
index 02e41564b1e48c07da6ac071fc4b60089169e05a..e7c8be4452bf55e0967d750c2e624e8e316e9330 100644
--- a/doc/v2/api/data/dataset.rst
+++ b/doc/v2/api/data/dataset.rst
@@ -1,82 +1,82 @@
 Dataset
 =======
 
-..  automodule:: paddle.v2.dataset
+..  automodule:: paddle.dataset
     :members:
     :noindex:
 
 mnist
 +++++
 
-..  automodule:: paddle.v2.dataset.mnist
+..  automodule:: paddle.dataset.mnist
     :members:
     :noindex:
 
 cifar
 +++++
 
-..  automodule:: paddle.v2.dataset.cifar
+..  automodule:: paddle.dataset.cifar
     :members:
     :noindex:
 
 conll05
 +++++++
 
-..  automodule:: paddle.v2.dataset.conll05
+..  automodule:: paddle.dataset.conll05
     :members: get_dict,get_embedding,test
     :noindex:
 
 imdb
 ++++
 
-..  automodule:: paddle.v2.dataset.imdb
+..  automodule:: paddle.dataset.imdb
     :members:
     :noindex:
 
 imikolov
 ++++++++
 
-..  automodule:: paddle.v2.dataset.imikolov
+..  automodule:: paddle.dataset.imikolov
     :members:
     :noindex:
 
 movielens
 +++++++++
 
-..  automodule:: paddle.v2.dataset.movielens
+..  automodule:: paddle.dataset.movielens
     :members:
     :noindex:
 
-..  autoclass:: paddle.v2.dataset.movielens.MovieInfo
+..  autoclass:: paddle.dataset.movielens.MovieInfo
     :noindex:
-    
-..  autoclass:: paddle.v2.dataset.movielens.UserInfo
+
+..  autoclass:: paddle.dataset.movielens.UserInfo
     :noindex:
 
 sentiment
 +++++++++
 
-..  automodule:: paddle.v2.dataset.sentiment
+..  automodule:: paddle.dataset.sentiment
     :members:
     :noindex:
 
 uci_housing
 +++++++++++
 
-..  automodule:: paddle.v2.dataset.uci_housing
+..  automodule:: paddle.dataset.uci_housing
     :members:
     :noindex:
 
 wmt14
 +++++
 
-..  automodule:: paddle.v2.dataset.wmt14
+..  automodule:: paddle.dataset.wmt14
     :members:
     :noindex:
 
 wmt16
 +++++
 
-..  automodule:: paddle.v2.dataset.wmt16
+..  automodule:: paddle.dataset.wmt16
     :members:
     :noindex:
diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt
index 845528860f91d0b479bb3c4dbbe05e32c68dc16f..3106978eb0149b14849dfd1aaad8bbe76791f2f6 100644
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
@@ -23,5 +23,7 @@ reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_o
 reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc)
 reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
 reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc)
+
+cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
 # Export local libraries to parent
 set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE)
diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..71684b14176edc8f71efbefa9a7decffc8f3011e
--- /dev/null
+++ b/paddle/fluid/operators/reader/blocking_queue.h
@@ -0,0 +1,116 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <condition_variable>  // NOLINT
+#include <deque>
+#include <mutex>  // NOLINT
+#include <utility>
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+// BlockingQueue is a bounded FIFO queue guarded by a mutex. It is for buffered
+// reading and is meant to be used only inside the reader package.
+// framework::Channel would be the natural choice, but it currently has a
+// deadlock bug, so BlockingQueue is a workaround: a simplified Channel that
+// does not support GPU and only implements the buffered blocking queue.
+template <typename T>
+class BlockingQueue {
+ public:
+  // Creates an open, empty queue that holds at most `capacity` elements.
+  // Enforces capacity > 0.
+  explicit BlockingQueue(size_t capacity)
+      : capacity_(capacity), closed_(false) {
+    PADDLE_ENFORCE_GT(
+        capacity_, 0,
+        "The capacity of a reader::BlockingQueue must be greater than 0.");
+  }
+
+  // Appends a copy of `elem`, blocking while the queue is full. Returns false
+  // without enqueuing anything if the queue is (or becomes) closed.
+  bool Send(const T& elem) { return SendImpl(elem); }
+
+  // Same as above, but moves `elem` into the queue instead of copying it.
+  bool Send(T&& elem) { return SendImpl(std::move(elem)); }
+
+  // Pops the oldest element into `*elem`, blocking while the queue is empty
+  // and still open. Elements that are already queued can still be received
+  // after Close(). Returns false once the queue is both closed and empty.
+  bool Receive(T* elem) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    receive_cv_.wait(lock, [&] { return !queue_.empty() || closed_; });
+    if (queue_.empty()) {
+      PADDLE_ENFORCE(closed_);
+      return false;
+    }
+    PADDLE_ENFORCE_NOT_NULL(elem);
+    // Move rather than copy: the front element is popped right below.
+    *elem = std::move(queue_.front());
+    queue_.pop_front();
+    send_cv_.notify_one();
+    return true;
+  }
+
+  // Closes the queue and wakes up every blocked sender and receiver.
+  void Close() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    closed_ = true;
+    send_cv_.notify_all();
+    receive_cv_.notify_all();
+  }
+
+  // Returns whether Close() has been called.
+  bool IsClosed() const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return closed_;
+  }
+
+  // Returns the capacity, which is fixed at construction (no lock needed).
+  size_t Cap() const { return capacity_; }
+
+ private:
+  // Shared body of both Send() overloads; forwards `elem` so that lvalues are
+  // copied and rvalues are moved into the queue.
+  template <typename U>
+  bool SendImpl(U&& elem) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; });
+    if (closed_) {
+      VLOG(5)
+          << "WARNING: Sending an element to a closed reader::BlockingQueue.";
+      return false;
+    }
+    PADDLE_ENFORCE_LT(queue_.size(), capacity_);
+    queue_.emplace_back(std::forward<U>(elem));
+    receive_cv_.notify_one();
+    return true;
+  }
+
+  const size_t capacity_;
+  bool closed_;
+  std::deque<T> queue_;
+
+  // mutable so that const accessors can lock it.
+  mutable std::mutex mutex_;
+  std::condition_variable receive_cv_;
+  std::condition_variable send_cv_;
+};
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index 4372f23fc1dbd85e43b04a9d644977392316c2e9..3fdc31dfa5242b6487c308d395d70d7ff348bc73 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -14,7 +14,7 @@
 
 #include <thread>  // NOLINT
 
-#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/operators/reader/blocking_queue.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 
 namespace paddle {
@@ -23,13 +23,13 @@ namespace reader {
 
 // 'Double buffer' means we shall maintain two batches of input data at the same
 // time. So the kCacheSize shoul be at least 2.
-static constexpr size_t kCacheSize = 2;
+static constexpr size_t kCacheSize = 3;
 // There will be two bacthes out of the channel during training:
 // 1. the one waiting to be sent to the channel
 // 2. the one just be received from the channel, which is also being used by
 // subsequent operators.
 // So the channel size should be kChacheSize - 2
-static constexpr size_t kChannelSize = 0;  // kCacheSize - 2
+static constexpr size_t kChannelSize = 1;  // kCacheSize - 2
 
 class DoubleBufferReader : public framework::DecoratedReader {
  public:
@@ -55,10 +55,8 @@ class DoubleBufferReader : public framework::DecoratedReader {
   ~DoubleBufferReader() { EndPrefetcher(); }
 
  private:
-  bool HasNext() const;
-
   void StartPrefetcher() {
-    channel_ = framework::MakeChannel<size_t>(kChannelSize);
+    channel_ = new reader::BlockingQueue<size_t>(kChannelSize);
     prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
   }
 
@@ -74,7 +72,7 @@ class DoubleBufferReader : public framework::DecoratedReader {
   void PrefetchThreadFunc();
 
   std::thread prefetcher_;
-  framework::Channel<size_t>* channel_;
+  reader::BlockingQueue<size_t>* channel_;
   platform::Place place_;
   std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache_;
   std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache_;
@@ -139,17 +137,16 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
 };
 
 void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
-  out->clear();
-  if (HasNext()) {
-    size_t cached_tensor_id;
-    channel_->Receive(&cached_tensor_id);
+  size_t cached_tensor_id;
+  if (channel_->Receive(&cached_tensor_id)) {
     if (platform::is_gpu_place(place_)) {
       *out = gpu_tensor_cache_[cached_tensor_id];
-      ctxs_[cached_tensor_id]->Wait();
     } else {
       // CPU place
       *out = cpu_tensor_cache_[cached_tensor_id];
     }
+  } else {
+    out->clear();
   }
 }
 
@@ -159,12 +156,6 @@ void DoubleBufferReader::ReInit() {
   StartPrefetcher();
 }
 
-bool DoubleBufferReader::HasNext() const {
-  while (!channel_->IsClosed() && !channel_->CanReceive()) {
-  }
-  return channel_->CanReceive();
-}
-
 void DoubleBufferReader::PrefetchThreadFunc() {
   VLOG(5) << "A new prefetch thread starts.";
   size_t cached_tensor_id = 0;
@@ -185,10 +176,7 @@ void DoubleBufferReader::PrefetchThreadFunc() {
         gpu_batch[i].set_lod(cpu_batch[i].lod());
       }
     }
-    try {
-      size_t tmp = cached_tensor_id;
-      channel_->Send(&tmp);
-    } catch (paddle::platform::EnforceNotMet e) {
+    if (!channel_->Send(cached_tensor_id)) {
       VLOG(5) << "WARNING: The double buffer channel has been closed. The "
                  "prefetch thread will terminate.";
       break;
diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc
index 779dc8a6a0deb7792e0540071e3a2588102fa708..91ad7d56583446ee4686e74187de166f387125df 100644
--- a/paddle/fluid/operators/reader/open_files_op.cc
+++ b/paddle/fluid/operators/reader/open_files_op.cc
@@ -14,7 +14,7 @@
 
 #include <thread>  // NOLINT
 
-#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/operators/reader/blocking_queue.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 
 namespace paddle {
@@ -37,7 +37,6 @@ class MultiFileReader : public framework::ReaderBase {
   ~MultiFileReader() { EndScheduler(); }
 
  private:
-  bool HasNext();
   void StartNewScheduler();
   void EndScheduler();
   void ScheduleThreadFunc();
@@ -48,15 +47,14 @@ class MultiFileReader : public framework::ReaderBase {
   std::thread scheduler_;
   std::vector<std::thread> prefetchers_;
   size_t buffer_size_;
-  framework::Channel<size_t>* waiting_file_idx_;
-  framework::Channel<size_t>* available_thread_idx_;
-  framework::Channel<std::vector<framework::LoDTensor>>* buffer_;
+  reader::BlockingQueue<size_t>* waiting_file_idx_;
+  reader::BlockingQueue<size_t>* available_thread_idx_;
+  reader::BlockingQueue<std::vector<framework::LoDTensor>>* buffer_;
 };
 
 void MultiFileReader::ReadNext(std::vector<framework::LoDTensor>* out) {
-  out->clear();
-  if (HasNext()) {
-    buffer_->Receive(out);
+  if (!buffer_->Receive(out)) {
+    out->clear();
   }
 }
 
@@ -65,25 +63,19 @@ void MultiFileReader::ReInit() {
   StartNewScheduler();
 }
 
-bool MultiFileReader::HasNext() {
-  while (!buffer_->IsClosed() && !buffer_->CanReceive()) {
-  }
-  return buffer_->CanReceive();
-}
-
 void MultiFileReader::StartNewScheduler() {
   size_t thread_num = prefetchers_.size();
-  waiting_file_idx_ = framework::MakeChannel<size_t>(file_names_.size());
-  available_thread_idx_ = framework::MakeChannel<size_t>(thread_num);
-  buffer_ =
-      framework::MakeChannel<std::vector<framework::LoDTensor>>(buffer_size_);
+  waiting_file_idx_ = new reader::BlockingQueue<size_t>(file_names_.size());
+  available_thread_idx_ = new reader::BlockingQueue<size_t>(thread_num);
+  buffer_ = new reader::BlockingQueue<std::vector<framework::LoDTensor>>(
+      buffer_size_);
 
   for (size_t i = 0; i < file_names_.size(); ++i) {
-    waiting_file_idx_->Send(&i);
+    waiting_file_idx_->Send(i);
   }
   waiting_file_idx_->Close();
   for (size_t i = 0; i < thread_num; ++i) {
-    available_thread_idx_->Send(&i);
+    available_thread_idx_->Send(i);
   }
 
   scheduler_ = std::thread([this] { ScheduleThreadFunc(); });
@@ -149,7 +141,5 @@ void MultiFileReader::PrefetchThreadFunc(std::string file_name,
       break;
     }
-    try {
-      buffer_->Send(&ins);
-    } catch (paddle::platform::EnforceNotMet e) {
+    if (!buffer_->Send(std::move(ins))) {
       VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch "
                  "thread of file '"
@@ -158,9 +148,7 @@ void MultiFileReader::PrefetchThreadFunc(std::string file_name,
     }
   }
 
-  try {
-    available_thread_idx_->Send(&thread_idx);
-  } catch (paddle::platform::EnforceNotMet e) {
+  if (!available_thread_idx_->Send(thread_idx)) {
     VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. "
                "Fail to send thread_idx.";
   }
diff --git a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7d1b381d56c8cdc1e79e594b18c1a1ed59ab5284
--- /dev/null
+++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
@@ -0,0 +1,219 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <chrono>  // NOLINT
+#include <set>
+#include <thread>  // NOLINT
+#include <vector>
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/operators/reader/blocking_queue.h"
+
+using paddle::operators::reader::BlockingQueue;
+
+TEST(BlockingQueue, CapacityTest) {
+  size_t cap = 10;
+  BlockingQueue<int> q(cap);
+  EXPECT_EQ(q.Cap(), cap);
+}
+
+void FirstInFirstOut(size_t queue_cap, size_t elem_num, size_t send_time_gap,
+                     size_t receive_time_gap) {
+  BlockingQueue<size_t> q(queue_cap);
+  std::thread sender([&]() {
+    for (size_t i = 0; i < elem_num; ++i) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(send_time_gap));
+      EXPECT_TRUE(q.Send(i));
+    }
+    q.Close();
+  });
+  size_t count = 0;
+  while (true) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(receive_time_gap));
+    size_t elem;
+    if (!q.Receive(&elem)) {
+      break;
+    }
+    EXPECT_EQ(elem, count++);
+  }
+  sender.join();
+  EXPECT_EQ(count, elem_num);
+  EXPECT_TRUE(q.IsClosed());
+}
+
+TEST(BlockingQueue, FirstInFirstOutTest) {
+  FirstInFirstOut(2, 5, 2, 50);
+  FirstInFirstOut(2, 5, 50, 2);
+  FirstInFirstOut(10, 3, 50, 2);
+  FirstInFirstOut(10, 3, 2, 50);
+}
+
+TEST(BlockingQueue, SenderBlockingTest) {
+  const size_t queue_cap = 2;
+  BlockingQueue<size_t> q(queue_cap);
+  size_t send_count = 0;
+  std::thread sender([&]() {
+    for (size_t i = 0; i < 5; ++i) {
+      if (!q.Send(i)) {
+        break;
+      }
+      ++send_count;
+    }
+  });
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));
+  q.Close();
+  sender.join();
+  EXPECT_EQ(send_count, queue_cap);
+  std::vector<size_t> res;
+  while (true) {
+    size_t elem;
+    if (!q.Receive(&elem)) {
+      break;
+    }
+    res.push_back(elem);
+  }
+  EXPECT_EQ(res.size(), queue_cap);
+  for (size_t i = 0; i < res.size(); ++i) {
+    EXPECT_EQ(res[i], i);
+  }
+}
+
+TEST(BlockingQueue, ReceiverBlockingTest) {
+  const size_t queue_cap = 5;
+  BlockingQueue<size_t> q(queue_cap);
+  std::vector<size_t> receive_res;
+  std::thread receiver([&]() {
+    size_t elem;
+    while (true) {
+      if (!q.Receive(&elem)) {
+        break;
+      }
+      receive_res.push_back(elem);
+    }
+  });
+  std::vector<size_t> to_send{2, 1, 7};
+  for (auto e : to_send) {
+    q.Send(e);
+  }
+  q.Close();
+  receiver.join();
+  EXPECT_EQ(receive_res.size(), to_send.size());
+  for (size_t i = 0; i < to_send.size(); ++i) {
+    EXPECT_EQ(receive_res[i], to_send[i]);
+  }
+}
+
+// Checks that v1 and v2 hold the same elements with the same multiplicities,
+// ignoring how the elements are grouped into inner vectors and their order.
+void CheckIsUnorderedSame(const std::vector<std::vector<size_t>>& v1,
+                          const std::vector<std::vector<size_t>>& v2) {
+  // A multiset (rather than a set) keeps duplicates, so an element that is
+  // delivered twice or dropped is detected instead of being collapsed away.
+  std::multiset<size_t> s1;
+  std::multiset<size_t> s2;
+  for (const auto& vec : v1) {
+    for (size_t elem : vec) {
+      s1.insert(elem);
+    }
+  }
+  for (const auto& vec : v2) {
+    for (size_t elem : vec) {
+      s2.insert(elem);
+    }
+  }
+  // Compare the containers as a whole. Walking both with separate iterators
+  // after a non-fatal size check could run past end() on a size mismatch.
+  EXPECT_EQ(s1.size(), s2.size());
+  EXPECT_EQ(s1, s2);
+}
+
+void MultiSenderMultiReceiver(const size_t queue_cap,
+                              const std::vector<std::vector<size_t>>& to_send,
+                              size_t receiver_num, size_t send_time_gap,
+                              size_t receive_time_gap) {
+  BlockingQueue<size_t> q(queue_cap);
+  size_t sender_num = to_send.size();
+  std::vector<std::thread> senders;
+  for (size_t s_idx = 0; s_idx < sender_num; ++s_idx) {
+    senders.emplace_back(std::thread([&, s_idx] {
+      for (size_t elem : to_send[s_idx]) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(send_time_gap));
+        EXPECT_TRUE(q.Send(elem));
+      }
+    }));
+  }
+  std::vector<std::thread> receivers;
+  std::mutex mu;
+  std::vector<std::vector<size_t>> res;
+  for (size_t r_idx = 0; r_idx < receiver_num; ++r_idx) {
+    receivers.emplace_back(std::thread([&] {
+      std::vector<size_t> receiver_res;
+      while (true) {
+        std::this_thread::sleep_for(
+            std::chrono::milliseconds(receive_time_gap));
+        size_t elem;
+        if (!q.Receive(&elem)) {
+          break;
+        }
+        receiver_res.push_back(elem);
+      }
+      std::lock_guard<std::mutex> lock(mu);
+      res.push_back(receiver_res);
+    }));
+  }
+  for (auto& t : senders) {
+    t.join();
+  }
+  q.Close();
+  for (auto& t : receivers) {
+    t.join();
+  }
+  CheckIsUnorderedSame(to_send, res);
+}
+
+TEST(BlockingQueue, MultiSenderMultiReaderTest) {
+  std::vector<std::vector<size_t>> to_send_1{{2, 3, 4}, {9}, {0, 7, 15, 6}};
+  MultiSenderMultiReceiver(2, to_send_1, 2, 0, 0);
+  MultiSenderMultiReceiver(10, to_send_1, 2, 0, 0);
+  MultiSenderMultiReceiver(2, to_send_1, 20, 0, 0);
+  MultiSenderMultiReceiver(2, to_send_1, 2, 50, 0);
+  MultiSenderMultiReceiver(2, to_send_1, 2, 0, 50);
+
+  std::vector<std::vector<size_t>> to_send_2{
+      {2, 3, 4}, {}, {0, 7, 15, 6, 9, 32}};
+  MultiSenderMultiReceiver(2, to_send_2, 3, 0, 0);
+  MultiSenderMultiReceiver(20, to_send_2, 3, 0, 0);
+  MultiSenderMultiReceiver(2, to_send_2, 30, 0, 0);
+  MultiSenderMultiReceiver(2, to_send_2, 3, 50, 0);
+  MultiSenderMultiReceiver(2, to_send_2, 3, 0, 50);
+}
+
+struct MyClass {
+  MyClass() : val_(0) {}
+  explicit MyClass(int val) : val_(val) {}
+  MyClass(const MyClass& b) { val_ = b.val_; }
+  MyClass(MyClass&& b) { val_ = b.val_; }
+  void operator=(const MyClass& b) { val_ = b.val_; }
+
+  int val_;
+};
+
+TEST(BlockingQueue, MyClassTest) {
+  // Sends an rvalue; the moved-from object must not be read afterwards.
+  BlockingQueue<MyClass> q(2);
+  EXPECT_TRUE(q.Send(MyClass(200)));
+  MyClass b;
+  EXPECT_TRUE(q.Receive(&b));
+  EXPECT_EQ(b.val_, 200);
+}
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index 1ab55d6b9bf8fdbd14c9c2bd978e3e99dba3e73e..81acaff87d3c2025cf0d6185a1590b018bfbd83c 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -14,10 +14,12 @@
 
 #pragma once
 
+#include <cublasXt.h>
 #include <cublas_v2.h>
 #include <cuda.h>
 #include <dlfcn.h>
 #include <mutex>  // NOLINT
+#include <type_traits>
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
@@ -37,14 +39,14 @@ extern void *cublas_dso_handle;
 #ifdef PADDLE_USE_DSO
 #define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)                             \
   struct DynLoad__##__name {                                                 \
+    using FUNC_TYPE = decltype(&::__name);                                   \
     template <typename... Args>                                              \
     inline cublasStatus_t operator()(Args... args) {                         \
-      typedef cublasStatus_t (*cublasFunc)(Args...);                         \
       std::call_once(cublas_dso_flag, []() {                                 \
         cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
       });                                                                    \
       void *p_##__name = dlsym(cublas_dso_handle, #__name);                  \
-      return reinterpret_cast<cublasFunc>(p_##__name)(args...);              \
+      return reinterpret_cast<FUNC_TYPE>(p_##__name)(args...);               \
     }                                                                        \
   };                                                                         \
   extern DynLoad__##__name __name
@@ -71,8 +73,8 @@ extern void *cublas_dso_handle;
   __macro(cublasDgemm_v2);                \
   __macro(cublasHgemm);                   \
   __macro(cublasSgemmEx);                 \
-  __macro(cublasSgeam_v2);                \
-  __macro(cublasDgeam_v2);                \
+  __macro(cublasSgeam);                   \
+  __macro(cublasDgeam);                   \
   __macro(cublasCreate_v2);               \
   __macro(cublasDestroy_v2);              \
   __macro(cublasSetStream_v2);            \
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 24475b62ca2825c45ff7edb39328dece3b822b25..34d83e395694f55eafca74d63ebf363169ab30e8 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -34,7 +34,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
   struct DynLoad__##__name {                                               \
     template <typename... Args>                                            \
     auto operator()(Args... args) -> decltype(__name(args...)) {           \
-      using cudnn_func = decltype(__name(args...)) (*)(Args...);           \
+      using cudnn_func = decltype(&::__name);                              \
       std::call_once(cudnn_dso_flag, []() {                                \
         cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
       });                                                                  \
diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h
index d0d676b9d8ac462900b48246bec43166d04ef97b..e64de7c20fc9d145e51cfc4528e321b3c4ec86c8 100644
--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -41,7 +41,7 @@ extern void *cupti_dso_handle;
   struct DynLoad__##__name {                                               \
     template <typename... Args>                                            \
     inline CUptiResult CUPTIAPI operator()(Args... args) {                 \
-      typedef CUptiResult CUPTIAPI (*cuptiFunc)(Args...);                  \
+      using cuptiFunc = decltype(&::__name);                               \
       std::call_once(cupti_dso_flag, []() {                                \
         cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \
       });                                                                  \
diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h
index 4697fb6cd96770127206bdabeea77e43eb09d1f5..46ad4379d5f9572d415ef1d747077217ae29391e 100644
--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
@@ -30,7 +30,7 @@ extern void *curand_dso_handle;
   struct DynLoad__##__name {                                                 \
     template <typename... Args>                                              \
     curandStatus_t operator()(Args... args) {                                \
-      typedef curandStatus_t (*curandFunc)(Args...);                         \
+      using curandFunc = decltype(&::__name);                                \
       std::call_once(curand_dso_flag, []() {                                 \
         curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
       });                                                                    \
diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h
index c5a10a78a4f432b431680c089f255fea777277cb..37902ae20c5d9d64486232bbd468375c4a50a615 100644
--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
@@ -33,7 +33,7 @@ extern void* nccl_dso_handle;
   struct DynLoad__##__name {                                             \
     template <typename... Args>                                          \
     auto operator()(Args... args) -> decltype(__name(args...)) {         \
-      using nccl_func = decltype(__name(args...)) (*)(Args...);          \
+      using nccl_func = decltype(&::__name);                             \
       std::call_once(nccl_dso_flag, []() {                               \
         nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
       });                                                                \
diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h
index 7fa468370463a51c486b80317f401612930bc72e..7c70649d21c547beb824576d4a8ecf6219a9bddf 100644
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
@@ -36,7 +36,7 @@ extern void* warpctc_dso_handle;
   struct DynLoad__##__name {                                                   \
     template <typename... Args>                                                \
     auto operator()(Args... args) -> decltype(__name(args...)) {               \
-      using warpctcFunc = decltype(__name(args...)) (*)(Args...);              \
+      using warpctcFunc = decltype(&::__name);                                 \
       std::call_once(warpctc_dso_flag, []() {                                  \
         warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
       });                                                                      \
diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh
index 53df94980fdd3c9fdaa4cf077880a8f7737bbd8a..252227ef88abbe238686dd5d7672e57ad68dab7e 100755
--- a/paddle/scripts/paddle_docker_build.sh
+++ b/paddle/scripts/paddle_docker_build.sh
@@ -75,6 +75,7 @@ function main() {
       build_android)
         start_build_docker
         docker exec ${CONTAINER_ID} bash -c "./paddle/scripts/paddle_build.sh $@"
+        ;;
       *)
         if container_running "${CONTAINER_ID}"; then
             docker exec ${CONTAINER_ID} bash -c "./paddle/scripts/paddle_build.sh $@"