Merge branch 'release/1.0.0' of https://github.com/PaddlePaddle/Paddle into release/1.0.0

Fix import paddle.v2.plot to import paddle.utils

Merge branch 'release/1.0.0' of https://github.com/PaddlePaddle/Paddle into release/1.0.0
Fix import paddle.v2.plot to import paddle.utils
903e3f3f · shippingwang · 77c8ddb5 · 3b7e20b0 · 903e3f3f · 903e3f3f
106 changed file
--- a/Dockerfile
+++ b/Dockerfile
@@ -24,6 +24,7 @@ COPY ./paddle/scripts/docker/root/ /root/

 RUN apt-get update && \
    apt-get install -y --allow-downgrades patchelf \
+    python3 python3-dev python3-pip \
    git python-pip python-dev python-opencv openssh-server bison \
    libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
@@ -70,24 +71,33 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
 # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
 # version(1.7.1 for now), which causes building documentation failed.
-RUN easy_install -U pip && \
+RUN pip3 install -U wheel && \
+    pip3 install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
+    easy_install -U pip && \
    pip install -U wheel && \
    pip install -U docopt PyYAML sphinx==1.5.6 && \
    pip install sphinx-rtd-theme==0.1.9 recommonmark

-RUN pip install pre-commit 'ipython==5.3.0' && \
+RUN pip3 install pre-commit 'ipython==5.3.0' && \
+    pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3 install opencv-python && \
+    pip install pre-commit 'ipython==5.3.0' && \
    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
    pip install opencv-python

 #For docstring checker
+RUN pip3 install pylint pytest astroid isort
 RUN pip install pylint pytest astroid isort LinkChecker

 COPY ./python/requirements.txt /root/
+RUN pip3 install -r /root/requirements.txt
 RUN pip install -r /root/requirements.txt

 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
 RUN apt-get install -y libssl-dev libffi-dev
+RUN pip3 install certifi urllib3[secure]
 RUN pip install certifi urllib3[secure]



--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -40,7 +40,7 @@ set(OPENBLAS_LIB_SEARCH_PATHS
        /usr/local/opt/openblas/lib)

 find_path(OPENBLAS_INC_DIR NAMES cblas.h
-  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
+  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH)
 find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h
  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
 find_library(OPENBLAS_LIB NAMES openblas

--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -27,7 +27,7 @@ IF(NOT ${CBLAS_FOUND})

    SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
    SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
-    SET(CBLAS_INCLUDE_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
+    SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)

    SET(CBLAS_LIBRARIES
        "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
@@ -96,7 +96,7 @@ IF(NOT ${CBLAS_FOUND})
    ENDIF(NOT WIN32)
    SET(CBLAS_PROVIDER openblas)
    IF(WITH_C_API)
-        INSTALL(DIRECTORY ${CBLAS_INCLUDE_DIR} DESTINATION third_party/openblas)
+        INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
        # Because libopenblas.a is a symbolic link of another library, thus need to
        # install the whole directory.
        IF(ANDROID)
@@ -117,8 +117,8 @@ IF(NOT ${CBLAS_FOUND})
 ENDIF(NOT ${CBLAS_FOUND})

 MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
-MESSAGE(STATUS "BLAS Include: ${CBLAS_INCLUDE_DIR}")
-INCLUDE_DIRECTORIES(${CBLAS_INCLUDE_DIR})
+MESSAGE(STATUS "BLAS Include: ${CBLAS_INC_DIR}")
+INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})

 # FIXME(gangliao): generate cblas target to track all high performance
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)

--- a/paddle/contrib/float16/float16_transpiler.py
+++ b/paddle/contrib/float16/float16_transpiler.py
@@ -102,8 +102,8 @@ class Float16Transpiler:
                continue
            for input_arg in current_op.input_arg_names:
                if input_arg in self.input_map:
-                    current_op.rename_input(input_arg,
-                                            self.input_map[input_arg])
+                    current_op._rename_input(input_arg,
+                                             self.input_map[input_arg])

    def _remove_unused_var(self):
        '''
@@ -187,7 +187,7 @@ class Float16Transpiler:
                    shape=var.shape,
                    persistable=var.persistable)
                find_op(var)
-                var.op.rename_output(var_name, tmp_var_name)
+                var.op._rename_output(var_name, tmp_var_name)
                self.block._insert_op(
                    i,
                    type="cast",

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -167,15 +167,8 @@ cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)

-# cc_test(channel_test SRCS channel_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )

 if (NOT WIN32)
 cc_test(rw_lock_test SRCS rw_lock_test.cc)
 endif (NOT WIN32)
-
-# disable test temporarily.
-# TODO https://github.com/PaddlePaddle/Paddle/issues/11971
-# cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
-#         channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
-#         conditional_block_op while_op assign_op print_op executor proto_desc)
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stddef.h>            // for size_t
-#include <condition_variable>  // NOLINT
-#include <typeindex>
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-
-enum class ChannelAction {
-  SEND = 0,
-  RECEIVE = 1,
-  CLOSE = 2,
-};
-
-// Channel is the abstract class of buffered and un-buffered channels.
-template <typename T>
-class Channel {
- public:
-  virtual bool CanSend() = 0;
-  virtual bool CanReceive() = 0;
-  virtual void Send(T*) = 0;
-  virtual bool Receive(T*) = 0;
-  virtual size_t Cap() = 0;
-  virtual void Lock() = 0;
-
-  virtual void Unlock() = 0;
-  virtual bool IsClosed() = 0;
-  virtual void Close() = 0;
-  virtual ~Channel() {}
-
-  virtual void AddToSendQ(const void* referrer, T* data,
-                          std::shared_ptr<std::condition_variable_any> cond,
-                          std::function<bool(ChannelAction)> cb) = 0;
-  virtual void AddToReceiveQ(const void* referrer, T* data,
-                             std::shared_ptr<std::condition_variable_any> cond,
-                             std::function<bool(ChannelAction)> cb) = 0;
-  virtual void RemoveFromSendQ(const void* referrer) = 0;
-  virtual void RemoveFromReceiveQ(const void* referrer) = 0;
-};
-
-// Forward declaration of channel implementations.
-template <typename T>
-class ChannelImpl;
-
-template <typename T>
-Channel<T>* MakeChannel(size_t buffer_size) {
-  return new ChannelImpl<T>(buffer_size);
-}
-
-template <typename T>
-void CloseChannel(Channel<T>* ch) {
-  ch->Close();
-}
-
-/*
- * The ChannelHolder class serves two main purposes:
- * 1. It acts as a unified wrapper for the different kinds of
- *    channels, i.e. Buffered and Unbuffered channels. This is
- *    similar to the ReaderHolder class.
- * 2. It also helps us in TypeHiding. This is similar to the
- *    PlaceHolder implementations in variable.h and tensor.h.
- */
-class ChannelHolder {
- public:
-  template <typename T>
-  void Reset(size_t buffer_size) {
-    holder_.reset(new PlaceholderImpl<T>(buffer_size));
-  }
-
-  template <typename T>
-  void Send(T* data) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    PADDLE_ENFORCE_EQ(
-        holder_->Type(), std::type_index(typeid(T)),
-        "Channel type is not same as the type of the data being sent");
-    // Static cast should be safe because we have ensured that types are same
-    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
-    channel->Send(data);
-  }
-
-  template <typename T>
-  bool Receive(T* data) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    PADDLE_ENFORCE_EQ(
-        holder_->Type(), std::type_index(typeid(T)),
-        "Channel type is not same as the type of the data being sent");
-    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
-    return channel->Receive(data);
-  }
-
-  bool IsClosed() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->IsClosed();
-  }
-
-  bool CanSend() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->CanSend();
-  }
-
-  bool CanReceive() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->CanReceive();
-  }
-
-  void close() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->Close();
-  }
-
-  size_t Cap() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->Cap();
-  }
-
-  void Lock() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->Lock();
-  }
-
-  void Unlock() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->Unlock();
-  }
-
-  template <typename T>
-  void AddToSendQ(const void* referrer, T* data,
-                  std::shared_ptr<std::condition_variable_any> cond,
-                  std::function<bool(ChannelAction)> cb) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    if (channel != nullptr) {
-      channel->AddToSendQ(referrer, data, cond, cb);
-    }
-  }
-
-  template <typename T>
-  void AddToReceiveQ(const void* referrer, T* data,
-                     std::shared_ptr<std::condition_variable_any> cond,
-                     std::function<bool(ChannelAction)> cb) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    if (channel != nullptr) {
-      channel->AddToReceiveQ(referrer, data, cond, cb);
-    }
-  }
-
-  void RemoveFromSendQ(const void* referrer) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->RemoveFromSendQ(referrer);
-  }
-
-  void RemoveFromReceiveQ(const void* referrer) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->RemoveFromReceiveQ(referrer);
-  }
-
-  inline bool IsInitialized() const { return holder_ != nullptr; }
-
-  inline const std::type_index Type() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->Type();
-  }
-
- private:
-  /**
-   * @note    Placeholder hides type T, so it doesn't appear as a template
-   *          parameter of ChannelHolder.
-   */
-  struct Placeholder {
-    virtual ~Placeholder() {}
-    virtual const std::type_index Type() const = 0;
-    virtual void* Ptr() const = 0;
-    virtual bool IsClosed() = 0;
-    virtual bool CanSend() = 0;
-    virtual bool CanReceive() = 0;
-    virtual void RemoveFromSendQ(const void* referrer) = 0;
-    virtual void RemoveFromReceiveQ(const void* referrer) = 0;
-    virtual void Close() = 0;
-    virtual void Lock() = 0;
-    virtual void Unlock() = 0;
-    virtual size_t Cap() = 0;
-  };
-
-  template <typename T>
-  struct PlaceholderImpl : public Placeholder {
-    explicit PlaceholderImpl(size_t buffer_size)
-        : type_(std::type_index(typeid(T))) {
-      channel_.reset(MakeChannel<T>(buffer_size));
-    }
-
-    virtual const std::type_index Type() const { return type_; }
-
-    virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }
-
-    virtual bool IsClosed() {
-      if (channel_) {
-        return channel_->IsClosed();
-      }
-      return false;
-    }
-
-    virtual bool CanSend() {
-      if (channel_) {
-        return channel_->CanSend();
-      }
-      return false;
-    }
-
-    virtual bool CanReceive() {
-      if (channel_) {
-        return channel_->CanReceive();
-      }
-      return false;
-    }
-
-    virtual void RemoveFromSendQ(const void* referrer) {
-      if (channel_) {
-        channel_->RemoveFromSendQ(referrer);
-      }
-    }
-
-    virtual void RemoveFromReceiveQ(const void* referrer) {
-      if (channel_) {
-        channel_->RemoveFromReceiveQ(referrer);
-      }
-    }
-
-    virtual void Close() {
-      if (channel_) channel_->Close();
-    }
-
-    virtual size_t Cap() {
-      if (channel_)
-        return channel_->Cap();
-      else
-        return -1;
-    }
-
-    virtual void Lock() {
-      if (channel_) channel_->Lock();
-    }
-
-    virtual void Unlock() {
-      if (channel_) channel_->Unlock();
-    }
-
-    std::unique_ptr<Channel<T>> channel_;
-    const std::type_index type_;
-  };
-
-  // Pointer to a PlaceholderImpl object
-  std::unique_ptr<Placeholder> holder_;
-};
-
-}  // namespace framework
-}  // namespace paddle
-
-#include "paddle/fluid/framework/channel_impl.h"
--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <stddef.h>  // for size_t
-#include <atomic>
-#include <condition_variable>  // NOLINT
-#include <deque>
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-
-template <typename T>
-class ChannelImpl : public paddle::framework::Channel<T> {
-  friend Channel<T> *paddle::framework::MakeChannel<T>(size_t);
-  friend void paddle::framework::CloseChannel<T>(Channel<T> *);
-
- public:
-  virtual bool CanSend();
-  virtual bool CanReceive();
-  virtual void Send(T *);
-  virtual bool Receive(T *);
-  virtual size_t Cap() { return cap_; }
-  virtual void Lock();
-  virtual void Unlock();
-  virtual bool IsClosed();
-  virtual void Close();
-  explicit ChannelImpl(size_t);
-  virtual ~ChannelImpl();
-
-  virtual void AddToSendQ(const void *referrer, T *data,
-                          std::shared_ptr<std::condition_variable_any> cond,
-                          std::function<bool(ChannelAction)> cb);
-  virtual void AddToReceiveQ(const void *referrer, T *data,
-                             std::shared_ptr<std::condition_variable_any> cond,
-                             std::function<bool(ChannelAction)> cb);
-
-  virtual void RemoveFromSendQ(const void *referrer);
-  virtual void RemoveFromReceiveQ(const void *referrer);
-
- private:
-  struct QueueMessage {
-    T *data;
-    std::shared_ptr<std::condition_variable_any> cond;
-    bool chan_closed = false;
-    bool completed = false;
-    const void *referrer;  // TODO(thuan): figure out better way to do this
-    std::function<bool(ChannelAction)> callback;
-
-    explicit QueueMessage(T *item)
-        : data(item), cond(std::make_shared<std::condition_variable_any>()) {}
-
-    QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond)
-        : data(item), cond(cond) {}
-
-    void Wait(std::unique_lock<std::recursive_mutex> &lock) {
-      cond->wait(lock, [this]() { return completed; });
-    }
-
-    void Notify() {
-      completed = true;
-      cond->notify_all();
-    }
-  };
-
-  void send_return() {
-    send_ctr--;
-    destructor_cond_.notify_all();
-  }
-
-  bool recv_return(bool value) {
-    recv_ctr--;
-    destructor_cond_.notify_all();
-    return value;
-  }
-
-  std::shared_ptr<QueueMessage> get_first_message(
-      std::deque<std::shared_ptr<QueueMessage>> *queue, ChannelAction action) {
-    while (!queue->empty()) {
-      // Check whether this message was added by Select
-      // If this was added by Select then execute the callback
-      // to check if you can execute this message. The callback
-      // can return false if some other case was executed in Select.
-      // In that case just discard this QueueMessage and process next.
-      std::shared_ptr<QueueMessage> m = queue->front();
-      queue->pop_front();
-      if (m->callback == nullptr || m->callback(action)) return m;
-    }
-    return nullptr;
-  }
-
-  size_t cap_;
-  std::recursive_mutex mu_;
-  bool closed_;
-  std::deque<T> buf_;
-  std::deque<std::shared_ptr<QueueMessage>> recvq;
-  std::deque<std::shared_ptr<QueueMessage>> sendq;
-  std::atomic<unsigned> send_ctr{0};
-  std::atomic<unsigned> recv_ctr{0};
-  std::condition_variable_any destructor_cond_;
-};
-
-template <typename T>
-ChannelImpl<T>::ChannelImpl(size_t capacity)
-    : cap_(capacity), closed_(false), send_ctr(0), recv_ctr(0) {
-  PADDLE_ENFORCE_GE(capacity, 0);
-}
-
-template <typename T>
-bool ChannelImpl<T>::CanSend() {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  return !closed_ && (!recvq.empty() || buf_.size() < cap_);
-}
-
-template <typename T>
-bool ChannelImpl<T>::CanReceive() {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  return !(closed_ && buf_.empty()) && (!sendq.empty() || buf_.size() > 0);
-}
-
-template <typename T>
-void ChannelImpl<T>::Send(T *item) {
-  send_ctr++;
-  std::unique_lock<std::recursive_mutex> lock{mu_};
-
-  // If channel is closed, throw exception
-  if (closed_) {
-    send_return();
-    lock.unlock();
-    PADDLE_THROW("Cannot send on closed channel");
-  }
-
-  // If there is a receiver, directly pass the value we want
-  // to send to the receiver, bypassing the channel buffer if any
-  if (!recvq.empty()) {
-    std::shared_ptr<QueueMessage> m =
-        get_first_message(&recvq, ChannelAction::SEND);
-
-    if (m != nullptr) {
-      *(m->data) = std::move(*item);
-      m->Notify();
-      send_return();
-      return;
-    } else {
-      Send(item);
-      send_return();
-      return;
-    }
-  }
-
-  // Unbuffered channel will always bypass this
-  // If buffered channel has space in buffer,
-  // write the element to the buffer.
-  if (buf_.size() < cap_) {
-    // Copy to buffer
-    buf_.push_back(std::move(*item));
-    send_return();
-    return;
-  }
-
-  // Block on channel, because some receiver will complete
-  // the operation for us
-  auto m = std::make_shared<QueueMessage>(item);
-  sendq.push_back(m);
-  m->Wait(lock);
-  if (m->chan_closed) {
-    send_return();
-    lock.unlock();
-    PADDLE_THROW("Cannot send on closed channel");
-  }
-  send_return();
-}
-
-template <typename T>
-bool ChannelImpl<T>::Receive(T *item) {
-  recv_ctr++;
-  std::unique_lock<std::recursive_mutex> lock{mu_};
-
-  // If channel is closed and buffer is empty or
-  // channel is unbuffered
-  if (closed_ && buf_.empty()) return recv_return(false);
-
-  // If there is a sender, directly receive the value we want
-  // from the sender. In case of a buffered channel, read from
-  // buffer and move front of send queue to the buffer
-  if (!sendq.empty()) {
-    std::shared_ptr<QueueMessage> m =
-        get_first_message(&sendq, ChannelAction::RECEIVE);
-    if (buf_.size() > 0) {
-      // Case 1 : Channel is Buffered
-      // Do Data transfer from front of buffer
-      // and add a QueueMessage to the buffer
-      *item = std::move(buf_.front());
-      buf_.pop_front();
-      // If first message from sendq is not null
-      // add it to the buffer and notify it
-      if (m != nullptr) {
-        // Copy to buffer
-        buf_.push_back(std::move(*(m->data)));
-        m->Notify();
-      }  // Ignore if there is no first message
-    } else {
-      // Case 2: Channel is Unbuffered
-      // Do data transfer from front of SendQ
-      // If front is nullptr, then recursively call itself
-      if (m != nullptr) {
-        *item = std::move(*(m->data));
-        m->Notify();
-      } else {
-        return recv_return(Receive(item));
-      }
-    }
-    return recv_return(true);
-  }
-
-  // If this is a buffered channel and there are items in buffer
-  if (buf_.size() > 0) {
-    // Directly read from buffer
-    *item = std::move(buf_.front());
-    buf_.pop_front();
-    // return true
-    return recv_return(true);
-  }
-
-  // No sender available, block on this channel
-  // Some receiver will complete the option for us
-  auto m = std::make_shared<QueueMessage>(item);
-  recvq.push_back(m);
-  m->Wait(lock);
-
-  return recv_return(!m->chan_closed);
-}
-
-template <typename T>
-void ChannelImpl<T>::Lock() {
-  mu_.lock();
-}
-
-template <typename T>
-void ChannelImpl<T>::Unlock() {
-  mu_.unlock();
-}
-
-template <typename T>
-bool ChannelImpl<T>::IsClosed() {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  return closed_;
-}
-
-template <typename T>
-void ChannelImpl<T>::Close() {
-  std::unique_lock<std::recursive_mutex> lock{mu_};
-
-  if (closed_) {
-    // TODO(abhinavarora): closing an already closed channel should panic
-    lock.unlock();
-    return;
-  }
-
-  closed_ = true;
-
-  // Empty the readers
-  while (!recvq.empty()) {
-    std::shared_ptr<QueueMessage> m = recvq.front();
-    recvq.pop_front();
-    m->chan_closed = true;
-
-    // Execute callback function (if any)
-    if (m->callback != nullptr) {
-      m->callback(ChannelAction::CLOSE);
-    }
-
-    m->Notify();
-  }
-
-  // Empty the senders
-  while (!sendq.empty()) {
-    std::shared_ptr<QueueMessage> m = sendq.front();
-    sendq.pop_front();
-    m->chan_closed = true;
-
-    // Execute callback function (if any)
-    if (m->callback != nullptr) {
-      m->callback(ChannelAction::CLOSE);
-    }
-
-    m->Notify();
-  }
-}
-
-template <typename T>
-void ChannelImpl<T>::AddToSendQ(
-    const void *referrer, T *data,
-    std::shared_ptr<std::condition_variable_any> cond,
-    std::function<bool(ChannelAction)> cb) {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  auto m = std::make_shared<QueueMessage>(data, cond);
-  m->referrer = referrer;
-  m->callback = cb;
-  sendq.push_back(m);
-}
-
-template <typename T>
-void ChannelImpl<T>::AddToReceiveQ(
-    const void *referrer, T *data,
-    std::shared_ptr<std::condition_variable_any> cond,
-    std::function<bool(ChannelAction)> cb) {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  auto m = std::make_shared<QueueMessage>(data, cond);
-  m->referrer = referrer;
-  m->callback = cb;
-  recvq.push_back(m);
-}
-
-template <typename T>
-void ChannelImpl<T>::RemoveFromSendQ(const void *referrer) {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-
-  for (auto it = sendq.begin(); it != sendq.end();) {
-    std::shared_ptr<QueueMessage> sendMsg = (std::shared_ptr<QueueMessage>)*it;
-
-    if (sendMsg->referrer == referrer) {
-      it = sendq.erase(it);
-    } else {
-      ++it;
-    }
-  }
-}
-
-template <typename T>
-void ChannelImpl<T>::RemoveFromReceiveQ(const void *referrer) {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-
-  for (auto it = recvq.begin(); it != recvq.end();) {
-    std::shared_ptr<QueueMessage> recvMsg = (std::shared_ptr<QueueMessage>)*it;
-
-    if (recvMsg->referrer == referrer) {
-      it = recvq.erase(it);
-    } else {
-      ++it;
-    }
-  }
-}
-
-template <typename T>
-ChannelImpl<T>::~ChannelImpl() {
-  Close();
-  // The destructor must wait for all readers and writers to complete their task
-  // The channel has been closed, so we will not accept new readers and writers
-  std::unique_lock<std::recursive_mutex> lock{mu_};
-  destructor_cond_.wait(lock,
-                        [this]() { return send_ctr == 0 && recv_ctr == 0; });
-}
-
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
--- a/paddle/fluid/framework/concurrency_test.cc
+++ b/paddle/fluid/framework/concurrency_test.cc
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <thread>  // NOLINT
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-USE_NO_KERNEL_OP(go);
-USE_NO_KERNEL_OP(channel_close);
-USE_NO_KERNEL_OP(channel_create);
-USE_NO_KERNEL_OP(channel_recv);
-USE_NO_KERNEL_OP(channel_send);
-USE_NO_KERNEL_OP(elementwise_add);
-USE_NO_KERNEL_OP(select);
-USE_NO_KERNEL_OP(conditional_block);
-USE_NO_KERNEL_OP(equal);
-USE_NO_KERNEL_OP(assign);
-USE_NO_KERNEL_OP(while);
-USE_NO_KERNEL_OP(print);
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-namespace paddle {
-namespace framework {
-
-template <typename T>
-LoDTensor *CreateVariable(Scope *scope, const p::CPUPlace &place,
-                          std::string name, T value) {
-  // Create LoDTensor<int> of dim [1]
-  auto var = scope->Var(name);
-  auto tensor = var->GetMutable<LoDTensor>();
-  tensor->Resize({1});
-  T *expect = tensor->mutable_data<T>(place);
-  expect[0] = value;
-  return tensor;
-}
-
-void AddOp(const std::string &type, const VariableNameMap &inputs,
-           const VariableNameMap &outputs, AttributeMap attrs,
-           BlockDesc *block) {
-  // insert op
-  auto op = block->AppendOp();
-  op->SetType(type);
-  for (auto &kv : inputs) {
-    op->SetInput(kv.first, kv.second);
-  }
-  for (auto &kv : outputs) {
-    op->SetOutput(kv.first, kv.second);
-  }
-  op->SetAttrMap(attrs);
-}
-
-void AddCase(ProgramDesc *program, Scope *scope, p::CPUPlace *place,
-             BlockDesc *casesBlock, int caseId, int caseType,
-             std::string caseChannel, std::string caseVarName,
-             std::function<void(BlockDesc *, Scope *)> func) {
-  std::string caseCondName = std::string("caseCond") + std::to_string(caseId);
-  std::string caseCondXVarName =
-      std::string("caseCondX") + std::to_string(caseId);
-
-  BlockDesc *caseBlock = program->AppendBlock(*casesBlock);
-  func(caseBlock, scope);
-
-  CreateVariable(scope, *place, caseCondName, false);
-  CreateVariable(scope, *place, caseCondXVarName, caseId);
-  CreateVariable(scope, *place, caseVarName, caseId);
-
-  scope->Var("step_scope");
-
-  AddOp("equal", {{"X", {caseCondXVarName}}, {"Y", {"caseToExecute"}}},
-        {{"Out", {caseCondName}}}, {}, casesBlock);
-
-  AddOp("conditional_block", {{"X", {caseCondName}}, {"Params", {}}},
-        {{"Out", {}}, {"Scope", {"step_scope"}}},
-        {{"sub_block", caseBlock}, {"is_scalar_condition", true}}, casesBlock);
-}
-
-void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program,
-                        BlockDesc *parentBlock, std::string dataChanName,
-                        std::string quitChanName) {
-  BlockDesc *whileBlock = program->AppendBlock(*parentBlock);
-
-  CreateVariable(scope, *place, "whileExitCond", true);
-  CreateVariable(scope, *place, "caseToExecute", -1);
-  CreateVariable(scope, *place, "case1var", 0);
-
-  CreateVariable(scope, *place, "xtemp", 0);
-
-  // TODO(thuan): Need to create fibXToSend, since channel send moves the actual
-  // data,
-  // which causes the data to be no longer accessible to do the fib calculation
-  // TODO(abhinav): Change channel send to do a copy instead of a move!
-  CreateVariable(scope, *place, "fibXToSend", 0);
-
-  CreateVariable(scope, *place, "fibX", 0);
-  CreateVariable(scope, *place, "fibY", 1);
-  CreateVariable(scope, *place, "quitVar", 0);
-
-  BlockDesc *casesBlock = program->AppendBlock(*whileBlock);
-  std::function<void(BlockDesc * caseBlock)> f = [](BlockDesc *caseBlock) {};
-
-  // TODO(thuan): Remove this once we change channel send to do a copy instead
-  // of move
-  AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"fibXToSend"}}}, {}, whileBlock);
-
-  // Case 0: Send to dataChanName
-  std::function<void(BlockDesc * caseBlock, Scope * scope)> case0Func = [&](
-      BlockDesc *caseBlock, Scope *scope) {
-    AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"xtemp"}}}, {}, caseBlock);
-    AddOp("assign", {{"X", {"fibY"}}}, {{"Out", {"fibX"}}}, {}, caseBlock);
-    AddOp("elementwise_add", {{"X", {"xtemp"}}, {"Y", {"fibY"}}},
-          {{"Out", {"fibY"}}}, {}, caseBlock);
-  };
-  AddCase(program, scope, place, casesBlock, 0, 1, dataChanName, "fibXToSend",
-          case0Func);
-  std::string case0Config =
-      std::string("0,1,") + dataChanName + std::string(",fibXToSend");
-
-  // Case 1: Receive from quitChanName
-  std::function<void(BlockDesc * caseBlock, Scope * scope)> case2Func = [&](
-      BlockDesc *caseBlock, Scope *scope) {
-    // Exit the while loop after we receive from quit channel.
-    // We assign a false to "whileExitCond" variable, which will
-    // break out of while_op loop
-    CreateVariable(scope, *place, "whileFalse", false);
-    AddOp("assign", {{"X", {"whileFalse"}}}, {{"Out", {"whileExitCond"}}}, {},
-          caseBlock);
-  };
-  AddCase(program, scope, place, casesBlock, 1, 2, quitChanName, "quitVar",
-          case2Func);
-  std::string case1Config =
-      std::string("1,2,") + quitChanName + std::string(",quitVar");
-
-  // Select block
-  AddOp("select", {{"X", {dataChanName, quitChanName}},
-                   {"case_to_execute", {"caseToExecute"}}},
-        {{"Out", {}}},
-        {{"sub_block", casesBlock},
-         {"cases", std::vector<std::string>{case0Config, case1Config}}},
-        whileBlock);
-
-  scope->Var("stepScopes");
-  AddOp("while",
-        {{"X", {dataChanName, quitChanName}}, {"Condition", {"whileExitCond"}}},
-        {{"Out", {}}, {"StepScopes", {"stepScopes"}}},
-        {{"sub_block", whileBlock}}, parentBlock);
-}
-
-TEST(Concurrency, Go_Op) {
-  Scope scope;
-  p::CPUPlace place;
-
-  // Initialize scope variables
-  p::CPUDeviceContext ctx(place);
-
-  // Create channel variable
-  scope.Var("Channel");
-
-  // Create Variables, x0 will be put into channel,
-  // result will be pulled from channel
-  CreateVariable(&scope, place, "Status", false);
-  CreateVariable(&scope, place, "x0", 99);
-  CreateVariable(&scope, place, "result", 0);
-
-  framework::Executor executor(place);
-  ProgramDesc program;
-  BlockDesc *block = program.MutableBlock(0);
-
-  // Create channel OP
-  AddOp("channel_create", {}, {{"Out", {"Channel"}}},
-        {{"capacity", 10}, {"data_type", f::proto::VarType::LOD_TENSOR}},
-        block);
-
-  // Create Go Op routine
-  BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
-  AddOp("channel_send", {{"Channel", {"Channel"}}, {"X", {"x0"}}},
-        {{"Status", {"Status"}}}, {}, goOpBlock);
-
-  // Create Go Op
-  AddOp("go", {{"X", {"Channel", "x0"}}}, {}, {{"sub_block", goOpBlock}},
-        block);
-
-  // Create Channel Receive Op
-  AddOp("channel_recv", {{"Channel", {"Channel"}}},
-        {{"Status", {"Status"}}, {"Out", {"result"}}}, {}, block);
-
-  // Create Channel Close Op
-  AddOp("channel_close", {{"Channel", {"Channel"}}}, {}, {}, block);
-
-  // Check the result tensor to make sure it is set to 0
-  const LoDTensor &tensor = (scope.FindVar("result"))->Get<LoDTensor>();
-  auto *initialData = tensor.data<int>();
-  EXPECT_EQ(initialData[0], 0);
-
-  executor.Run(program, &scope, 0, true, true);
-
-  // After we call executor.run, the Go operator should do a channel_send to
-  // set the "result" variable to 99.
-  auto *finalData = tensor.data<int>();
-  EXPECT_EQ(finalData[0], 99);
-}
-
-/**
- * This test implements the fibonacci function using go_op and select_op
- */
-TEST(Concurrency, Select) {
-  Scope scope;
-  p::CPUPlace place;
-
-  // Initialize scope variables
-  p::CPUDeviceContext ctx(place);
-
-  CreateVariable(&scope, place, "Status", false);
-  CreateVariable(&scope, place, "result", 0);
-  CreateVariable(&scope, place, "currentXFib", 0);
-
-  framework::Executor executor(place);
-  ProgramDesc program;
-  BlockDesc *block = program.MutableBlock(0);
-
-  // Create channel OP
-  std::string dataChanName = "Channel";
-  scope.Var(dataChanName);
-  AddOp("channel_create", {}, {{"Out", {dataChanName}}},
-        {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block);
-
-  std::string quitChanName = "Quit";
-  scope.Var(quitChanName);
-  AddOp("channel_create", {}, {{"Out", {quitChanName}}},
-        {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block);
-
-  // Create Go Op routine, which loops 10 times over fibonacci sequence
-  CreateVariable(&scope, place, "xReceiveVar", 0);
-
-  BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
-  for (int i = 0; i < 10; ++i) {
-    AddOp("channel_recv", {{"Channel", {dataChanName}}},
-          {{"Status", {"Status"}}, {"Out", {"currentXFib"}}}, {}, goOpBlock);
-    AddOp("print", {{"In", {"currentXFib"}}}, {{"Out", {"currentXFib"}}},
-          {{"first_n", 100},
-           {"summarize", -1},
-           {"print_tensor_name", false},
-           {"print_tensor_type", true},
-           {"print_tensor_shape", false},
-           {"print_tensor_lod", false},
-           {"print_phase", std::string("FORWARD")},
-           {"message", std::string("X: ")}},
-          goOpBlock);
-  }
-
-  CreateVariable(&scope, place, "quitSignal", 0);
-  AddOp("channel_send", {{"Channel", {quitChanName}}, {"X", {"quitSignal"}}},
-        {{"Status", {"Status"}}}, {}, goOpBlock);
-
-  // Create Go Op
-  AddOp("go", {{"X", {dataChanName, quitChanName}}}, {},
-        {{"sub_block", goOpBlock}}, block);
-
-  AddFibonacciSelect(&scope, &place, &program, block, dataChanName,
-                     quitChanName);
-
-  // Create Channel Close Op
-  AddOp("channel_close", {{"Channel", {dataChanName}}}, {}, {}, block);
-  AddOp("channel_close", {{"Channel", {quitChanName}}}, {}, {}, block);
-
-  executor.Run(program, &scope, 0, true, true);
-
-  // After we call executor.run, "result" variable should be equal to 34
-  // (which is 10 loops through fibonacci sequence)
-  const LoDTensor &tensor = (scope.FindVar("currentXFib"))->Get<LoDTensor>();
-  auto *finalData = tensor.data<int>();
-  EXPECT_EQ(finalData[0], 34);
-}
-
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -14,7 +14,6 @@ limitations under the License. */

 #include "paddle/fluid/framework/executor.h"

-#include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
@@ -76,15 +75,13 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
    var->GetMutable<platform::PlaceList>();
  } else if (var_type == proto::VarType::READER) {
    var->GetMutable<ReaderHolder>();
-  } else if (var_type == proto::VarType::CHANNEL) {
-    var->GetMutable<ChannelHolder>();
  } else if (var_type == proto::VarType::RAW) {
    // GetMutable will be called in operator
  } else {
    PADDLE_THROW(
        "Variable type %d is not in "
        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
-        "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
+        "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
        var_type);
  }
 }

--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -126,7 +126,6 @@ message VarType {
    LOD_TENSOR_ARRAY = 13;
    PLACE_LIST = 14;
    READER = 15;
-    CHANNEL = 16;
    // Any runtime decided variable type is raw
    // raw variables should manage their own allocations
    // in operators like nccl_op
@@ -158,12 +157,6 @@ message VarType {
  message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; }
  optional ReaderDesc reader = 5;

-  message ChannelDesc {
-    required Type data_type = 1;
-    required int64 capacity = 2;
-  }
-  optional ChannelDesc channel = 6;
-
  message Tuple { repeated Type element_type = 1; }
  optional Tuple tuple = 7;
 }

--- a/paddle/fluid/framework/ir/graph_traits.cc
+++ b/paddle/fluid/framework/ir/graph_traits.cc
@@ -14,6 +14,8 @@

 #include "paddle/fluid/framework/ir/graph_traits.h"

+#include <vector>
+
 namespace paddle {
 namespace framework {
 namespace ir {

--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
@@ -27,8 +27,11 @@ class SelectedRowsTester : public ::testing::Test {
    selected_rows_.reset(new SelectedRows(rows, height));

    Tensor* value = selected_rows_->mutable_value();
-    value->mutable_data<float>(
+    auto* data = value->mutable_data<float>(
        make_ddim({static_cast<int64_t>(rows.size()), row_numel}), place_);
+    for (int64_t i = 0; i < value->numel(); ++i) {
+      data[i] = static_cast<float>(i);
+    }
  }

 protected:
@@ -60,6 +63,10 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
  ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
  ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims());
  ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
+  auto* dst_data = dst_tensor.value().data<float>();
+  for (int64_t i = 0; i < dst_tensor.value().numel(); ++i) {
+    ASSERT_EQ(dst_data[i], static_cast<float>(i));
+  }
 }

 TEST(SelectedRows, SparseTable) {

--- a/paddle/fluid/framework/tuple.h
+++ b/paddle/fluid/framework/tuple.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/var_desc.h"

--- a/paddle/fluid/framework/var_desc.cc
+++ b/paddle/fluid/framework/var_desc.cc
@@ -88,13 +88,7 @@ std::vector<std::vector<int64_t>> VarDesc::GetShapes() const {
 }

 void VarDesc::SetDataType(proto::VarType::Type data_type) {
-  switch (desc_.type().type()) {
-    case proto::VarType::CHANNEL:
-      mutable_channel_desc()->set_data_type(data_type);
-      break;
-    default:
-      mutable_tensor_desc()->set_data_type(data_type);
-  }
+  mutable_tensor_desc()->set_data_type(data_type);
 }

 void VarDesc::SetDataTypes(
@@ -115,13 +109,7 @@ void VarDesc::SetDataTypes(
 }

 proto::VarType::Type VarDesc::GetDataType() const {
-  switch (desc_.type().type()) {
-    case proto::VarType::CHANNEL:
-      return channel_desc().data_type();
-      break;
-    default:
-      return tensor_desc().data_type();
-  }
+  return tensor_desc().data_type();
 }

 std::vector<proto::VarType::Type> VarDesc::GetDataTypes() const {
@@ -134,17 +122,6 @@ std::vector<proto::VarType::Type> VarDesc::GetDataTypes() const {
  return res;
 }

-void VarDesc::SetCapacity(int64_t capacity) {
-  switch (desc_.type().type()) {
-    case proto::VarType::CHANNEL:
-      desc_.mutable_type()->mutable_channel()->set_capacity(capacity);
-      break;
-    default:
-      PADDLE_THROW("Setting 'capacity' is not supported by the type of var %s.",
-                   this->Name());
-  }
-}
-
 void VarDesc::SetLoDLevel(int32_t lod_level) {
  switch (desc_.type().type()) {
    case proto::VarType::LOD_TENSOR:
@@ -214,19 +191,6 @@ std::vector<int32_t> VarDesc::GetLoDLevels() const {
  }
 }

-const proto::VarType::ChannelDesc &VarDesc::channel_desc() const {
-  PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
-  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
-  switch (desc_.type().type()) {
-    case proto::VarType::CHANNEL:
-      return desc_.type().channel();
-    default:
-      PADDLE_THROW(
-          "Getting 'channel_desc' is not supported by the type of var %s.",
-          this->Name());
-  }
-}
-
 const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
  PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
@@ -262,20 +226,6 @@ std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const {
  }
 }

-proto::VarType::ChannelDesc *VarDesc::mutable_channel_desc() {
-  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
-  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
-  switch (desc_.type().type()) {
-    case proto::VarType::CHANNEL:
-      return desc_.mutable_type()->mutable_channel();
-    default:
-      PADDLE_THROW(
-          "Getting 'mutable_channel_desc' is not supported by the type of var "
-          "%s.",
-          this->Name());
-  }
-}
-
 proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");

--- a/paddle/fluid/framework/var_desc.h
+++ b/paddle/fluid/framework/var_desc.h
@@ -87,8 +87,6 @@ class VarDesc {
  void SetDataTypes(
      const std::vector<proto::VarType::Type> &multiple_data_type);

-  void SetCapacity(int64_t capacity);
-
  proto::VarType::Type GetDataType() const;

  std::vector<proto::VarType::Type> GetDataTypes() const;
@@ -110,10 +108,8 @@ class VarDesc {
  void SetPersistable(bool persistable) { desc_.set_persistable(persistable); }

 private:
-  const proto::VarType::ChannelDesc &channel_desc() const;
  const proto::VarType::TensorDesc &tensor_desc() const;
  std::vector<proto::VarType::TensorDesc> tensor_descs() const;
-  proto::VarType::ChannelDesc *mutable_channel_desc();
  proto::VarType::TensorDesc *mutable_tensor_desc();
  std::vector<proto::VarType::TensorDesc *> mutable_tensor_descs();


--- a/paddle/fluid/framework/var_type.h
+++ b/paddle/fluid/framework/var_type.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-#include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -41,8 +40,6 @@ inline proto::VarType::Type ToVarType(std::type_index type) {
    return proto::VarType_Type_SELECTED_ROWS;
  } else if (IsType<ReaderHolder>(type)) {
    return proto::VarType_Type_READER;
-  } else if (IsType<ChannelHolder>(type)) {
-    return proto::VarType_Type_CHANNEL;
  } else {
    PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
  }
@@ -66,9 +63,6 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
    case proto::VarType_Type_READER:
      visitor(var.Get<ReaderHolder>());
      return;
-    case proto::VarType_Type_CHANNEL:
-      visitor(var.Get<ChannelHolder>());
-      return;
    default:
      PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type()));
  }

--- a/paddle/fluid/inference/analysis/analysis_pass.h
+++ b/paddle/fluid/inference/analysis/analysis_pass.h
@@ -41,12 +41,6 @@ class AnalysisPass {
  // all passes have run.
  virtual bool Finalize() { return false; }

-  // Get a Pass appropriate to print the Node this pass operates on.
-  virtual AnalysisPass *CreatePrinterPass(std::ostream &os,
-                                          const std::string &banner) const {
-    return nullptr;
-  }
-
  // Create a debugger Pass that draw the DFG by graphviz toolkit.
  virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; }


--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -37,12 +37,16 @@ TEST(Analyzer, analysis_without_tensorrt) {
 TEST(Analyzer, analysis_with_tensorrt) {
  FLAGS_IA_enable_tensorrt_subgraph_engine = true;
  Argument argument;
+  argument.Set<int>("minimum_subgraph_size", new int(0));
+  argument.Set<int>("max_batch_size", new int(3));
+  argument.Set<int>("workspace_size", new int(1 << 20));
+  argument.Set<std::string>("precision_mode", new std::string("FP32"));
  argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir));
  Analyzer analyser;
  analyser.Run(&argument);
 }

-void TestWord2vecPrediction(const std::string &model_path) {
+void TestWord2vecPrediction(const std::string& model_path) {
  NativeConfig config;
  config.model_dir = model_path;
  config.use_gpu = false;
@@ -73,8 +77,8 @@ void TestWord2vecPrediction(const std::string &model_path) {
  // The outputs' buffers are in CPU memory.
  for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
    LOG(INFO) << "data: "
-              << static_cast<float *>(outputs.front().data.data())[i];
-    PADDLE_ENFORCE(static_cast<float *>(outputs.front().data.data())[i],
+              << static_cast<float*>(outputs.front().data.data())[i];
+    PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
                   result[i]);
  }
 }

--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -97,8 +97,10 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
  }
 }

-void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
+void CreateTrtEngineOp(Node *node, Argument *argument,
                       framework::proto::BlockDesc *block) {
+  PADDLE_ENFORCE(argument->main_dfg.get());
+  const DataFlowGraph &graph = *(argument->main_dfg);
  static int counter{0};
  PADDLE_ENFORCE(node->IsFunctionBlock());
  framework::OpDesc desc;
@@ -204,7 +206,10 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,

  PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc");
  // Set attrs
+
  SetAttr(desc.Proto(), "subgraph", block->SerializeAsString());
+  SetAttr(desc.Proto(), "max_batch_size", argument->Get<int>("max_batch_size"));
+  SetAttr(desc.Proto(), "workspace_size", argument->Get<int>("workspace_size"));
  SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
  SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
  SetAttr(desc.Proto(), "output_name_mapping", output_mapping);
@@ -248,7 +253,7 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
  *block_desc.Proto()->mutable_vars() =
      argument_->origin_program_desc->blocks(0).vars();
  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
-  CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto());
+  CreateTrtEngineOp(node, argument_, block_desc.Proto());
  auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
  auto *op = main_block->add_ops();
  PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");

--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -309,6 +309,8 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); }
 void SubGraphFuse::ReplaceNodesWithSubGraphs() {
  auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)();
  for (auto &subgraph : subgraphs) {
+    if (subgraph.size() <= argument_->Get<int>("minimum_subgraph_size"))
+      continue;
    std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
    // replace this sub-graph with the first node. Two steps: 1. Create a Block
    // Node that contains this subgraph 2. Mark the nodes inside the sub-graph

--- a/paddle/fluid/inference/analysis/subgraph_splitter.h
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.h
@@ -20,6 +20,7 @@ limitations under the License. */

 #include <vector>

+#include "paddle/fluid/inference/analysis/argument.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/inference/analysis/node.h"

@@ -63,8 +64,11 @@ class SubGraphFuse {
 public:
  using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller;

-  SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller)
-      : graph_(graph), node_inside_subgraph_teller_(teller) {}
+  SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller,
+               Argument *argument)
+      : graph_(graph),
+        node_inside_subgraph_teller_(teller),
+        argument_(argument) {}

  // The main method which run all the logic.
  void operator()();
@@ -76,6 +80,7 @@ class SubGraphFuse {
 private:
  DataFlowGraph *graph_;
  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+  Argument *argument_;
 };

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
@@ -66,10 +66,12 @@ TEST(SubGraphSplitter, Split) {
 TEST(SubGraphSplitter, Fuse) {
  auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__");
  auto dfg = ProgramDescToDFG(desc);
+  Argument argument;
+  argument.Set<int>("minimum_subgraph_size", new int(3));

  size_t count0 = dfg.nodes.size();

-  SubGraphFuse fuse(&dfg, teller);
+  SubGraphFuse fuse(&dfg, teller, &argument);
  fuse();

  int count1 = 0;

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
@@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
    : node_inside_subgraph_teller_(teller) {}

 void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
-  SubGraphFuse(graph, node_inside_subgraph_teller_)();
+  SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)();
  VLOG(4) << "debug info "
          << graph->HumanReadableInfo(false /*show_values*/,
                                      true /*show_functions*/);

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
@@ -33,7 +33,10 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {

  explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller);

-  bool Initialize(Argument* argument) override { return true; }
+  bool Initialize(Argument* argument) override {
+    argument_ = argument;
+    return true;
+  }

  // This class get a sub-graph as input and determine whether to transform this
  // sub-graph into TensorRT.
@@ -46,6 +49,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass {

 private:
  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+  Argument* argument_;
 };

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
@@ -36,6 +36,10 @@ TEST(TensorRTSubGraphPass, main) {
  };

  Argument argument(FLAGS_inference_model_dir);
+  argument.Set<int>("minimum_subgraph_size", new int(0));
+  argument.Set<int>("max_batch_size", new int(3));
+  argument.Set<int>("workspace_size", new int(1 << 20));
+  argument.Set<std::string>("precision_mode", new std::string("FP32"));

  DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
  DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};

--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -21,6 +21,12 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/tests/test_helper.h"

+#ifdef __clang__
+#define ACC_DIFF 4e-3
+#else
+#define ACC_DIFF 1e-3
+#endif
+
 DEFINE_string(dirname, "", "Directory of the inference model.");

 namespace paddle {
@@ -99,8 +105,8 @@ void MainWord2Vec(bool use_gpu) {

  float* lod_data = output1.data<float>();
  for (int i = 0; i < output1.numel(); ++i) {
-    EXPECT_LT(lod_data[i] - data[i], 1e-3);
-    EXPECT_GT(lod_data[i] - data[i], -1e-3);
+    EXPECT_LT(lod_data[i] - data[i], ACC_DIFF);
+    EXPECT_GT(lod_data[i] - data[i], -ACC_DIFF);
  }
 }

@@ -144,7 +150,7 @@ void MainImageClassification(bool use_gpu) {
  float* data = static_cast<float*>(outputs[0].data.data());
  float* lod_data = output1.data<float>();
  for (size_t j = 0; j < len / sizeof(float); ++j) {
-    EXPECT_NEAR(lod_data[j], data[j], 1e-3);
+    EXPECT_NEAR(lod_data[j], data[j], ACC_DIFF);
  }
 }

@@ -199,7 +205,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
      float* ref_data = refs[tid].data<float>();
      EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
      for (int i = 0; i < refs[tid].numel(); ++i) {
-        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
+        EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF);
      }
    });
  }
@@ -251,7 +257,7 @@ void MainThreadsImageClassification(bool use_gpu) {
      float* ref_data = refs[tid].data<float>();
      EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float));
      for (int i = 0; i < refs[tid].numel(); ++i) {
-        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
+        EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF);
      }
    });
  }

--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -35,8 +35,6 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
  bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
    FLAGS_IA_enable_tensorrt_subgraph_engine = true;
    VLOG(3) << "Predictor::init()";
-    FLAGS_tensorrt_max_batch_size = config_.max_batch_size;
-    FLAGS_tensorrt_workspace_size = config_.workspace_size;
    if (config_.use_gpu) {
      place_ = paddle::platform::CUDAPlace(config_.device);
    } else {
@@ -92,6 +90,14 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
  void OptimizeInferenceProgram() {
    // Analyze inference_program
    Argument argument;
+
+    argument.Set<int>("minimum_subgraph_size",
+                      new int(config_.minimum_subgraph_size));
+    argument.Set<int>("max_batch_size", new int(config_.max_batch_size));
+    argument.Set<int>("workspace_size", new int(config_.workspace_size));
+    argument.Set<std::string>("precision_mode",
+                              new std::string(config_.precision_mode));
+
    if (!config_.model_dir.empty()) {
      argument.fluid_model_dir.reset(new std::string(config_.model_dir));
    } else {

--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -194,6 +194,14 @@ struct MixedRTConfig : public NativeConfig {
  // For workspace_size, refer it from here:
  // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
  int workspace_size{1 << 30};
+  //  We transform the Ops that can be converted into TRT layer in the model,
+  //  and aggregate these Ops into subgraphs for TRT execution.
+  //  We set this variable to control the minimum number of nodes in the
+  //  subgraph, 3 as default value.
+  int minimum_subgraph_size = 3;
+  // Reserved configuration
+  // We just support "FP32" now, "FP16" and "INT8" will be supported.
+  std::string precision_mode = "FP32";
 };

 // NOTE WIP, not stable yet.

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -85,3 +85,13 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
               DEPS inference_anakin_api_shared dynload_cuda SERIAL)
   endif()
 endif()
+
+if(WITH_GPU AND TENSORRT_FOUND)
+   set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt")
+   if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR})
+       inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
+   endif()
+   cc_test(test_trt_models SRCS trt_models_tester.cc  
+     ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models
+     DEPS paddle_inference_tensorrt_subgraph_engine)
+endif()
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+
+namespace paddle {
+using paddle::contrib::MixedRTConfig;
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+NativeConfig GetConfigNative() {
+  NativeConfig config;
+  config.model_dir = FLAGS_dirname;
+  // LOG(INFO) << "dirname  " << config.model_dir;
+  config.fraction_of_gpu_memory = 0.45;
+  config.use_gpu = true;
+  config.device = 0;
+  return config;
+}
+
+MixedRTConfig GetConfigTRT() {
+  MixedRTConfig config;
+  config.model_dir = FLAGS_dirname;
+  config.use_gpu = true;
+  config.fraction_of_gpu_memory = 0.2;
+  config.device = 0;
+  config.max_batch_size = 3;
+  return config;
+}
+
+void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
+  NativeConfig config0 = GetConfigNative();
+  config0.model_dir = model_dirname;
+
+  MixedRTConfig config1 = GetConfigTRT();
+  config1.model_dir = model_dirname;
+  config1.max_batch_size = batch_size;
+
+  auto predictor0 =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
+  auto predictor1 =
+      CreatePaddlePredictor<MixedRTConfig,
+                            PaddleEngineKind::kAutoMixedTensorRT>(config1);
+  // Prepare inputs
+  int height = 224;
+  int width = 224;
+  float *data = new float[batch_size * 3 * height * width];
+  memset(data, 0, sizeof(float) * (batch_size * 3 * height * width));
+  data[0] = 1.0f;
+
+  // Prepare inputs
+  PaddleTensor tensor;
+  tensor.name = "input_0";
+  tensor.shape = std::vector<int>({batch_size, 3, height, width});
+  tensor.data = PaddleBuf(static_cast<void *>(data),
+                          sizeof(float) * (batch_size * 3 * height * width));
+  tensor.dtype = PaddleDType::FLOAT32;
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
+
+  // Prepare outputs
+  std::vector<PaddleTensor> outputs0;
+  std::vector<PaddleTensor> outputs1;
+  CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0));
+
+  CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size));
+
+  // Get output.
+  ASSERT_EQ(outputs0.size(), 1UL);
+  ASSERT_EQ(outputs1.size(), 1UL);
+
+  const size_t num_elements = outputs0.front().data.length() / sizeof(float);
+  const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
+  EXPECT_EQ(num_elements, num_elements1);
+
+  auto *data0 = static_cast<float *>(outputs0.front().data.data());
+  auto *data1 = static_cast<float *>(outputs1.front().data.data());
+
+  ASSERT_GT(num_elements, 0UL);
+  for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
+    EXPECT_NEAR(data0[i], data1[i], 1e-3);
+  }
+}
+
+TEST(trt_models_test, main) {
+  std::vector<std::string> infer_models = {"mobilenet", "resnet50",
+                                           "resnext50"};
+  for (auto &model_dir : infer_models) {
+    CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + model_dir);
+  }
+}
+}  // namespace paddle
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -301,6 +301,7 @@ op_library(fusion_lstm_op DEPS cpu_lstm_compute)
 if (WITH_GPU)
    op_library(conv_op DEPS vol2col depthwise_conv im2col)
    op_library(layer_norm_op DEPS cub)
+    op_library(reduce_mean_op DEPS cub)
 else()
    op_library(conv_op DEPS vol2col im2col)
 endif()
@@ -313,11 +314,6 @@ op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
 op_library(concat_op DEPS concat)

-# FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency
-add_subdirectory(concurrency)
-op_library(channel_send_op DEPS concurrency)
-op_library(channel_recv_op DEPS concurrency)
-
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})

 foreach(src ${GENERAL_OPS})

--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
@@ -36,11 +36,16 @@ class AucOp : public framework::OperatorWithKernel {
                      "Out and Label should have same height.");

    int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;
+    int slide_steps = ctx->Attrs().Get<int>("slide_steps");
+
+    PADDLE_ENFORCE_GE(num_pred_buckets, 1, "num_thresholds must larger than 1");
+    PADDLE_ENFORCE_GE(slide_steps, 0, "slide_steps must be natural number");

    ctx->SetOutputDim("AUC", {1});
-    ctx->SetOutputDim("BatchAUC", {1});
-    ctx->SetOutputDim("StatPosOut", {num_pred_buckets});
-    ctx->SetOutputDim("StatNegOut", {num_pred_buckets});
+
+    slide_steps = slide_steps == 0 ? 1 : slide_steps;
+    ctx->SetOutputDim("StatPosOut", {slide_steps, num_pred_buckets});
+    ctx->SetOutputDim("StatNegOut", {slide_steps, num_pred_buckets});
  }

 protected:
@@ -62,6 +67,7 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Label",
             "A 2D int tensor indicating the label of the training data. "
             "shape: [batch_size, 1]");
+
    // TODO(typhoonzero): support weight input
    AddInput("StatPos", "Statistic value when label = 1");
    AddInput("StatNeg", "Statistic value when label = 0");
@@ -69,18 +75,19 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("AUC",
              "A scalar representing the "
              "current area-under-the-curve.");
-    AddOutput("BatchAUC", "The AUC for current batch");
+
    AddOutput("StatPosOut", "Statistic value when label = 1");
    AddOutput("StatNegOut", "Statistic value when label = 0");

    AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
        .SetDefault("ROC");

-    AddAttr<int>("num_thresholds",
-                 "The number of thresholds to use when discretizing the"
-                 " roc curve.")
+    AddAttr<int>(
+        "num_thresholds",
+        "The number of thresholds to use when discretizing the roc curve.")
        .SetDefault((2 << 12) - 1);
-
+    AddAttr<int>("slide_steps", "Use slide steps to calc batch auc.")
+        .SetDefault(1);
    AddComment(R"DOC(
 Area Under The Curve (AUC) Operator.


--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
@@ -32,7 +32,9 @@ class AucKernel : public framework::OpKernel<T> {

    std::string curve = ctx.Attr<std::string>("curve");
    int num_thresholds = ctx.Attr<int>("num_thresholds");
+    // buckets contain numbers from 0 to num_thresholds
    int num_pred_buckets = num_thresholds + 1;
+    int slide_steps = ctx.Attr<int>("slide_steps");

    // Only use output var for now, make sure it's persistable and
    // not cleaned up for each batch.
@@ -40,16 +42,19 @@ class AucKernel : public framework::OpKernel<T> {
    auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
    auto *stat_neg = ctx.Output<Tensor>("StatNegOut");

-    auto *stat_pos_data = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
-    auto *stat_neg_data = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
-    calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds,
-            auc);
+    auto *origin_stat_pos = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
+    auto *origin_stat_neg = stat_neg->mutable_data<int64_t>(ctx.GetPlace());

-    auto *batch_auc = ctx.Output<Tensor>("BatchAUC");
-    std::vector<int64_t> stat_pos_batch(num_pred_buckets, 0);
-    std::vector<int64_t> stat_neg_batch(num_pred_buckets, 0);
-    calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(),
-            num_thresholds, batch_auc);
+    std::vector<int64_t> stat_pos_data(num_pred_buckets, 0);
+    std::vector<int64_t> stat_neg_data(num_pred_buckets, 0);
+
+    auto stat_pos_calc = stat_pos_data.data();
+    auto stat_neg_calc = stat_neg_data.data();
+
+    statAuc(label, predict, num_pred_buckets, num_thresholds, slide_steps,
+            origin_stat_pos, origin_stat_neg, &stat_pos_calc, &stat_neg_calc);
+
+    calcAuc(ctx, stat_pos_calc, stat_neg_calc, num_thresholds, auc);
  }

 private:
@@ -58,29 +63,76 @@ class AucKernel : public framework::OpKernel<T> {
    return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
  }

-  inline static void calcAuc(const framework::ExecutionContext &ctx,
-                             const framework::Tensor *label,
+  inline static void statAuc(const framework::Tensor *label,
                             const framework::Tensor *predict,
-                             int64_t *stat_pos, int64_t *stat_neg,
-                             int num_thresholds,
-                             framework::Tensor *auc_tensor) {
+                             const int num_pred_buckets,
+                             const int num_thresholds, const int slide_steps,
+                             int64_t *origin_stat_pos, int64_t *origin_stat_neg,
+                             int64_t **stat_pos, int64_t **stat_neg) {
    size_t batch_size = predict->dims()[0];
    size_t inference_width = predict->dims()[1];
    const T *inference_data = predict->data<T>();
    const auto *label_data = label->data<int64_t>();

-    auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
-
    for (size_t i = 0; i < batch_size; i++) {
      uint32_t binIdx = static_cast<uint32_t>(
          inference_data[i * inference_width + 1] * num_thresholds);
      if (label_data[i]) {
-        stat_pos[binIdx] += 1.0;
+        (*stat_pos)[binIdx] += 1.0;
      } else {
-        stat_neg[binIdx] += 1.0;
+        (*stat_neg)[binIdx] += 1.0;
      }
    }

+    int bucket_length = num_pred_buckets * sizeof(int64_t);
+
+    // will stat auc unlimited.
+    if (slide_steps == 0) {
+      for (int slide = 0; slide < num_pred_buckets; ++slide) {
+        origin_stat_pos[slide] += (*stat_pos)[slide];
+        origin_stat_neg[slide] += (*stat_neg)[slide];
+      }
+
+      *stat_pos = origin_stat_pos;
+      *stat_neg = origin_stat_neg;
+
+    } else {
+      for (int slide = 1; slide < slide_steps; ++slide) {
+        int dst_idx = (slide - 1) * num_pred_buckets;
+        int src_inx = slide * num_pred_buckets;
+        std::memcpy(origin_stat_pos + dst_idx, origin_stat_pos + src_inx,
+                    bucket_length);
+        std::memcpy(origin_stat_neg + dst_idx, origin_stat_neg + src_inx,
+                    bucket_length);
+      }
+
+      std::memcpy(origin_stat_pos + (slide_steps - 1) * num_pred_buckets,
+                  *stat_pos, bucket_length);
+      std::memcpy(origin_stat_neg + (slide_steps - 1) * num_pred_buckets,
+                  *stat_neg, bucket_length);
+
+      std::memset(*stat_pos, 0, bucket_length);
+      std::memset(*stat_neg, 0, bucket_length);
+
+      for (int slide = 0; slide < num_pred_buckets; ++slide) {
+        int stat_pos_steps = 0;
+        int stat_neg_steps = 0;
+        for (int step = 0; step < slide_steps; ++step) {
+          stat_pos_steps += origin_stat_pos[slide + step * num_pred_buckets];
+          stat_neg_steps += origin_stat_neg[slide + step * num_pred_buckets];
+        }
+        (*stat_pos)[slide] += stat_pos_steps;
+        (*stat_neg)[slide] += stat_neg_steps;
+      }
+    }
+  }
+
+  inline static void calcAuc(const framework::ExecutionContext &ctx,
+                             int64_t *stat_pos, int64_t *stat_neg,
+                             int num_thresholds,
+                             framework::Tensor *auc_tensor) {
+    auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
+
    *auc = 0.0f;

    double totPos = 0.0;
@@ -96,7 +148,6 @@ class AucKernel : public framework::OpKernel<T> {
      totPos += stat_pos[idx];
      totNeg += stat_neg[idx];
      *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
-
      --idx;
    }


--- a/paddle/fluid/operators/channel_close_op.cc
+++ b/paddle/fluid/operators/channel_close_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace pf = paddle::framework;
-static constexpr char kChannel[] = "Channel";
-
-namespace paddle {
-namespace operators {
-
-class ChannelCloseOp : public framework::OperatorBase {
- public:
-  ChannelCloseOp(const std::string &type,
-                 const framework::VariableNameMap &inputs,
-                 const framework::VariableNameMap &outputs,
-                 const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    auto &inp = *scope.FindVar(Input(kChannel));
-
-    // Get the mutable version of the channel variable and closes it.
-    pf::ChannelHolder *ch = inp.GetMutable<framework::ChannelHolder>();
-    ch->close();
-  }
-};
-
-class ChannelCloseOpOpInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInput("Channel"),
-                   "The input of ChannelClose op must be set");
-  }
-};
-
-class ChannelCloseOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(kChannel,
-             "The Channel Variable that should be closed by"
-             " the ChannelClose Op.");
-    AddComment(R"DOC(
-Channel Close Operator.
-
-This operator closes an open channel.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(channel_close, paddle::operators::ChannelCloseOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::ChannelCloseOpMaker);
--- a/paddle/fluid/operators/channel_create_op.cc
+++ b/paddle/fluid/operators/channel_create_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/reader.h"
-
-namespace pf = paddle::framework;
-
-static constexpr char kOutput[] = "Out";
-
-namespace paddle {
-namespace operators {
-
-class ChannelCreateOp : public framework::OperatorBase {
- public:
-  ChannelCreateOp(const std::string &type,
-                  const framework::VariableNameMap &inputs,
-                  const framework::VariableNameMap &outputs,
-                  const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    auto &out = *scope.FindVar(Output(kOutput));
-
-    // Determine the datatype and capacity of the channel to be created
-    // from the attributes provided.
-    auto dtype =
-        static_cast<framework::proto::VarType::Type>(Attr<int>("data_type"));
-    auto capacity = Attr<int>("capacity");
-
-    // Based on the datatype, create a new channel holder initialized with
-    // the given capacity. When capacity is 0, an unbuffered channel is
-    // created.
-    pf::ChannelHolder *ch = out.GetMutable<framework::ChannelHolder>();
-    if (dtype == framework::proto::VarType::LOD_TENSOR) {
-      ch->Reset<pf::LoDTensor>(capacity);
-    } else if (dtype == framework::proto::VarType::SELECTED_ROWS) {
-      ch->Reset<pf::SelectedRows>(capacity);
-    } else if (dtype == framework::proto::VarType::LOD_RANK_TABLE) {
-      ch->Reset<pf::LoDRankTable>(capacity);
-    } else if (dtype == framework::proto::VarType::LOD_TENSOR_ARRAY) {
-      ch->Reset<pf::LoDTensorArray>(capacity);
-    } else if (dtype == framework::proto::VarType::READER) {
-      ch->Reset<pf::ReaderHolder>(capacity);
-    } else if (dtype == framework::proto::VarType::CHANNEL) {
-      ch->Reset<pf::ChannelHolder>(capacity);
-    } else if (dtype == framework::proto::VarType::BOOL) {
-      ch->Reset<bool>(capacity);
-    } else if (dtype == framework::proto::VarType::INT32) {
-      ch->Reset<int>(capacity);
-    } else if (dtype == framework::proto::VarType::INT64) {
-      ch->Reset<int64_t>(capacity);
-    } else if (dtype == framework::proto::VarType::FP32) {
-      ch->Reset<float>(capacity);
-    } else if (dtype == framework::proto::VarType::FP64) {
-      ch->Reset<double>(capacity);
-    } else {
-      PADDLE_THROW(
-          "Data type %d is not in "
-          "[LOD_TENSOR, SELECTED_ROWS, LOD_RANK_TABLE, LOD_TENSOR_ARRAY, "
-          "READER, CHANNEL, BOOL, INT32, INT64, FP32, FP64]",
-          dtype);
-    }
-  }
-};
-
-class ChannelCreateOpOpInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasOutput(kOutput),
-                   "The output of ChannelCreate op must be set");
-    context->SetOutputDim(kOutput, {1});
-  }
-};
-
-class ChannelCreateOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddOutput(kOutput,
-              "The object of a Channel type created by ChannelCreate Op.");
-    AddAttr<int>("capacity", "The size of the buffer of Channel.")
-        .SetDefault(0);
-    AddAttr<int>("data_type", "The data type of elements inside the Channel.");
-    AddComment(R"DOC(
-Channel Create Operator.
-
-This operator creates an object of the VarType Channel and returns it.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(channel_create, paddle::operators::ChannelCreateOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::ChannelCreateOpMaker);
--- a/paddle/fluid/operators/channel_recv_op.cc
+++ b/paddle/fluid/operators/channel_recv_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/channel.h"
-#include <paddle/fluid/framework/lod_rank_table.h>
-#include <paddle/fluid/framework/lod_tensor_array.h>
-#include <paddle/fluid/framework/reader.h>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/concurrency/channel_util.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-static constexpr char Channel[] = "Channel";
-static constexpr char Status[] = "Status";
-static constexpr char Out[] = "Out";
-
-namespace paddle {
-namespace operators {
-
-void SetReceiveStatus(const platform::Place &dev_place,
-                      framework::Variable *status_var, bool status) {
-  auto cpu = platform::CPUPlace();
-  auto status_tensor =
-      status_var->GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
-                                                                         cpu);
-  status_tensor[0] = status;
-}
-
-class ChannelRecvOp : public framework::OperatorBase {
- public:
-  ChannelRecvOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const {
-    PADDLE_ENFORCE(ctx->HasInput(Channel),
-                   "Input(Channel) of ChannelRecvOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(Out),
-                   "Input(Channel) of ChannelRecvOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(Status),
-                   "Output(Status) of ChannelRecvOp should not be null.");
-    ctx->SetOutputDim("Status", {1});
-  }
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    // Get the channel holder created by channel_create op, passed as input.
-    framework::ChannelHolder *ch =
-        scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>();
-    auto output_var = scope.FindVar(Output(Out));
-    // Receive the data from the channel.
-    bool ok = concurrency::ChannelReceive(ch, output_var);
-
-    // Set the status output of the `ChannelReceive` call.
-    SetReceiveStatus(dev_place, scope.FindVar(Output(Status)), ok);
-  }
-};
-
-class ChannelRecvOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(Channel,
-             "(Channel) A variable which \"receives\" the a value sent"
-             "to it by a channel_send op.")
-        .AsDuplicable();
-    AddOutput(Out,
-              "(Variable) Output Variable that will hold the data received"
-              " from the Channel")
-        .AsDuplicable();
-    AddOutput(Status,
-              "(Tensor) An LoD Tensor that returns a boolean status of the"
-              "result of the receive operation.")
-        .AsDuplicable();
-    AddComment(R"DOC(
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(channel_recv, paddle::operators::ChannelRecvOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::ChannelRecvOpMaker);
--- a/paddle/fluid/operators/channel_send_op.cc
+++ b/paddle/fluid/operators/channel_send_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/channel.h"
-#include <paddle/fluid/framework/lod_rank_table.h>
-#include <paddle/fluid/framework/lod_tensor_array.h>
-#include <paddle/fluid/framework/reader.h>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/concurrency/channel_util.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-static constexpr char Channel[] = "Channel";
-static constexpr char X[] = "X";
-
-namespace paddle {
-namespace operators {
-
-class ChannelSendOp : public framework::OperatorBase {
- public:
-  ChannelSendOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const {
-    PADDLE_ENFORCE(ctx->HasInput(Channel),
-                   "Input(Channel) of ChannelSendOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(X),
-                   "Input(X) of ChannelSendOp should not be null.");
-  }
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    // Get the channel holder created by channel_create op, passed as input.
-    framework::ChannelHolder *ch =
-        scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>();
-    auto input_var = scope.FindVar(Input(X));
-
-    // Send the input data through the channel.
-    concurrency::ChannelSend(ch, input_var);
-  }
-};
-
-class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(Channel,
-             "(Channel) A variable which \"sends\" the passed in value to "
-             "a listening receiver.")
-        .AsDuplicable();
-    AddInput(X, "(Variable) The value which gets sent by the channel.")
-        .AsDuplicable();
-    AddComment(R"DOC(
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(channel_send, paddle::operators::ChannelSendOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::ChannelSendOpMaker);
--- a/paddle/fluid/operators/concurrency/CMakeLists.txt
+++ b/paddle/fluid/operators/concurrency/CMakeLists.txt
-cc_library(concurrency SRCS channel_util.cc DEPS device_context framework_proto boost eigen3)
--- a/paddle/fluid/operators/concurrency/channel_util.cc
+++ b/paddle/fluid/operators/concurrency/channel_util.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/concurrency/channel_util.h"
-#include "paddle/fluid/framework/var_type.h"
-
-namespace poc = paddle::operators::concurrency;
-
-void poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) {
-  auto type = framework::ToVarType(var->Type());
-  if (type == framework::proto::VarType_Type_LOD_TENSOR)
-    ch->Send(var->GetMutable<framework::LoDTensor>());
-  else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
-    ch->Send(var->GetMutable<framework::LoDRankTable>());
-  else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
-    ch->Send(var->GetMutable<framework::LoDTensorArray>());
-  else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
-    ch->Send(var->GetMutable<framework::SelectedRows>());
-  else if (type == framework::proto::VarType_Type_READER)
-    ch->Send(var->GetMutable<framework::ReaderHolder>());
-  else if (type == framework::proto::VarType_Type_CHANNEL)
-    ch->Send(var->GetMutable<framework::ChannelHolder>());
-  else
-    PADDLE_THROW("ChannelSend:Unsupported type");
-}
-
-bool poc::ChannelReceive(framework::ChannelHolder *ch,
-                         framework::Variable *var) {
-  // Get type of channel and use that to call mutable data for Variable
-  auto type = framework::ToVarType(ch->Type());
-  if (type == framework::proto::VarType_Type_LOD_TENSOR)
-    return ch->Receive(var->GetMutable<framework::LoDTensor>());
-  else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
-    return ch->Receive(var->GetMutable<framework::LoDRankTable>());
-  else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
-    return ch->Receive(var->GetMutable<framework::LoDTensorArray>());
-  else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
-    return ch->Receive(var->GetMutable<framework::SelectedRows>());
-  else if (type == framework::proto::VarType_Type_READER)
-    return ch->Receive(var->GetMutable<framework::ReaderHolder>());
-  else if (type == framework::proto::VarType_Type_CHANNEL)
-    return ch->Receive(var->GetMutable<framework::ChannelHolder>());
-  else
-    PADDLE_THROW("ChannelReceive:Unsupported type");
-}
-
-void poc::ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer,
-                            framework::Variable *var,
-                            std::shared_ptr<std::condition_variable_any> cond,
-                            std::function<bool(framework::ChannelAction)> cb) {
-  auto type = framework::ToVarType(var->Type());
-  if (type == framework::proto::VarType_Type_LOD_TENSOR) {
-    ch->AddToSendQ(referrer, var->GetMutable<framework::LoDTensor>(), cond, cb);
-  } else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) {
-    ch->AddToSendQ(referrer, var->GetMutable<framework::LoDRankTable>(), cond,
-                   cb);
-  } else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) {
-    ch->AddToSendQ(referrer, var->GetMutable<framework::LoDTensorArray>(), cond,
-                   cb);
-  } else if (type == framework::proto::VarType_Type_SELECTED_ROWS) {
-    ch->AddToSendQ(referrer, var->GetMutable<framework::SelectedRows>(), cond,
-                   cb);
-  } else if (type == framework::proto::VarType_Type_READER) {
-    ch->AddToSendQ(referrer, var->GetMutable<framework::ReaderHolder>(), cond,
-                   cb);
-  } else if (type == framework::proto::VarType_Type_CHANNEL) {
-    ch->AddToSendQ(referrer, var->GetMutable<framework::ChannelHolder>(), cond,
-                   cb);
-  } else {
-    PADDLE_THROW("ChannelAddToSendQ:Unsupported type");
-  }
-}
-
-void poc::ChannelAddToReceiveQ(
-    framework::ChannelHolder *ch, const void *referrer,
-    framework::Variable *var, std::shared_ptr<std::condition_variable_any> cond,
-    std::function<bool(framework::ChannelAction)> cb) {
-  auto type = framework::ToVarType(var->Type());
-  if (type == framework::proto::VarType_Type_LOD_TENSOR) {
-    ch->AddToReceiveQ(referrer, var->GetMutable<framework::LoDTensor>(), cond,
-                      cb);
-  } else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) {
-    ch->AddToReceiveQ(referrer, var->GetMutable<framework::LoDRankTable>(),
-                      cond, cb);
-  } else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) {
-    ch->AddToReceiveQ(referrer, var->GetMutable<framework::LoDTensorArray>(),
-                      cond, cb);
-  } else if (type == framework::proto::VarType_Type_SELECTED_ROWS) {
-    ch->AddToReceiveQ(referrer, var->GetMutable<framework::SelectedRows>(),
-                      cond, cb);
-  } else if (type == framework::proto::VarType_Type_READER) {
-    ch->AddToReceiveQ(referrer, var->GetMutable<framework::ReaderHolder>(),
-                      cond, cb);
-  } else if (type == framework::proto::VarType_Type_CHANNEL) {
-    ch->AddToReceiveQ(referrer, var->GetMutable<framework::ChannelHolder>(),
-                      cond, cb);
-  } else {
-    PADDLE_THROW("ChannelAddToReceiveQ:Unsupported type");
-  }
-}
--- a/paddle/fluid/operators/concurrency/channel_util.h
+++ b/paddle/fluid/operators/concurrency/channel_util.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/variable.h"
-
-namespace paddle {
-namespace operators {
-namespace concurrency {
-
-void ChannelSend(framework::ChannelHolder *ch, framework::Variable *var);
-bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var);
-
-void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer,
-                       framework::Variable *var,
-                       std::shared_ptr<std::condition_variable_any> cond,
-                       std::function<bool(framework::ChannelAction)> cb);
-void ChannelAddToReceiveQ(framework::ChannelHolder *ch, const void *referrer,
-                          framework::Variable *var,
-                          std::shared_ptr<std::condition_variable_any> cond,
-                          std::function<bool(framework::ChannelAction)> cb);
-
-}  // namespace concurrency
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -380,7 +380,8 @@ class DepthwiseConvKernel : public framework::OpKernel<T> {
    math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;

    auto& dev_ctx = context.template device_context<DeviceContext>();
-    depthwiseConv(dev_ctx, *input, filter, strides, paddings, output);
+    depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations,
+                  output);
  }
 };

@@ -415,14 +416,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel<T> {
      input_grad->mutable_data<T>(context.GetPlace());
      set_zero(dev_ctx, input_grad, static_cast<T>(0));
      depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides,
-                             paddings, input_grad);
+                             paddings, dilations, input_grad);
    }

    if (filter_grad) {
      filter_grad->mutable_data<T>(context.GetPlace());
      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
      depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings,
-                              filter_grad);
+                              dilations, filter_grad);
    }
  }
 };

--- a/paddle/fluid/operators/conv_transpose_op.h
+++ b/paddle/fluid/operators/conv_transpose_op.h
@@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
    math::DepthwiseConvInputGradFunctor<DeviceContext, T>
        depthwiseConvInputGrad;
    depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings,
-                           output);
+                           dilations, output);
  }
 };

@@ -367,10 +367,11 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
    auto& dev_ctx = context.template device_context<DeviceContext>();
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");

    if (input_grad) {
      math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
-      depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings,
+      depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations,
                    input_grad);
    }

@@ -382,7 +383,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
      math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
          depthwiseConvFilterGrad;
      depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings,
-                              filter_grad);
+                              dilations, filter_grad);
    }
  }
 };

--- a/paddle/fluid/operators/cub_reduce.h
+++ b/paddle/fluid/operators/cub_reduce.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cmath>
+#include <numeric>
+#include <set>
+#include <vector>
+
+#include <cub/cub.cuh>  // NOLINT
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+
+namespace detail {
+template <typename T, size_t ElementCount>
+struct Array {
+ public:
+  HOSTDEVICE inline Array() {}
+
+  HOSTDEVICE inline T& operator[](size_t index) { return data_[index]; }
+
+  HOSTDEVICE inline const T& operator[](size_t index) const {
+    return data_[index];
+  }
+
+  HOSTDEVICE constexpr inline size_t size() const { return ElementCount; }
+
+  template <typename VectorLikeType>
+  static inline Array<T, ElementCount> From(const VectorLikeType& vec) {
+    PADDLE_ENFORCE_EQ(vec.size(), ElementCount, "size not match");
+    size_t n = static_cast<size_t>(vec.size());
+    Array<T, ElementCount> ret;
+    for (size_t i = 0; i < n; ++i) ret[i] = vec[i];
+    return ret;
+  }
+
+ private:
+  T data_[ElementCount];
+};
+
+// reduce the last axis of 2d array
+template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
+          int BlockDim>
+__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer,
+                               TransformOp transformer, Ty init,
+                               int reduce_num) {
+  __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
+  int idx_x = blockIdx.x * reduce_num;
+  int idx_y = threadIdx.x;
+  Ty reduce_var = init;
+  for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim)
+    reduce_var = reducer(reduce_var, transformer(x[idx_x + idx_y]));
+
+  reduce_var =
+      cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
+
+  if (threadIdx.x == 0) {
+    y[blockIdx.x] = reduce_var;
+  }
+}
+
+template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
+          int BlockDim, int Rank, int ReduceRank>
+__global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer,
+                             TransformOp transformer, Ty init, int reduce_num,
+                             Array<int, Rank> x_strides,
+                             Array<int, ReduceRank> reduce_dim,
+                             Array<int, ReduceRank> reduce_strides,
+                             Array<int, Rank - ReduceRank> left_dim,
+                             Array<int, Rank - ReduceRank> left_strides) {
+  __shared__ typename cub::BlockReduce<Ty, BlockDim>::TempStorage temp_storage;
+  Array<int, Rank> sub_index;
+  int left_idx = blockIdx.x;
+  for (int i = 0; i < Rank - ReduceRank; ++i) {
+    sub_index[left_dim[i]] = left_idx / left_strides[i];
+    left_idx %= left_strides[i];
+  }
+
+  int reduce_idx = threadIdx.x;
+  for (int j = 0; j < ReduceRank; ++j) {
+    sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
+    reduce_idx %= reduce_strides[j];
+  }
+
+  int idx_x = 0;
+  for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
+  Ty reduce_var = static_cast<Ty>(transformer(x[idx_x]));
+
+  for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) {
+    int reduce_idx = i;
+    for (int j = 0; j < ReduceRank; ++j) {
+      sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j];
+      reduce_idx %= reduce_strides[j];
+    }
+
+    int idx_x = 0;
+    for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]);
+    reduce_var = static_cast<Ty>(reducer(reduce_var, transformer(x[idx_x])));
+  }
+
+  reduce_var =
+      cub::BlockReduce<Ty, BlockDim>(temp_storage).Reduce(reduce_var, reducer);
+
+  if (threadIdx.x == 0) {
+    y[blockIdx.x] = reduce_var;
+  }
+}
+
+static inline std::vector<int> GetStrides(const std::vector<int>& dims) {
+  int n = static_cast<int>(dims.size());
+  if (n == 0) return std::vector<int>();
+  std::vector<int> strides(n);
+  strides.back() = 1;
+  for (int i = n - 2; i >= 0; --i) {
+    strides[i] = strides[i + 1] * dims[i + 1];
+  }
+  return strides;
+}
+
+static inline std::vector<int> GetStrides(const std::vector<int>& dims,
+                                          const std::vector<int>& idx) {
+  int n = static_cast<int>(idx.size());
+  if (n == 0) return std::vector<int>();
+  std::vector<int> strides(n);
+  strides.back() = 1;
+  for (int i = n - 2; i >= 0; --i) {
+    strides[i] = strides[i + 1] * dims[idx[i + 1]];
+  }
+  return strides;
+}
+
+constexpr int kMaxBlockDim = 512;
+
+static inline int GetDesiredBlockDim(int block_dim) {
+  return block_dim >= kMaxBlockDim
+             ? kMaxBlockDim
+             : (1 << static_cast<int>(std::log2(block_dim)));
+}
+
+template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
+          typename TransformOp>
+static void TensorReduceImpl(
+    const Tx* x_data, Ty* y_data, const platform::Place& place,
+    const ReduceOp& reducer, const TransformOp& transformer, const Ty& init,
+    int left_num, int reduce_num, const std::vector<int>& x_strides,
+    const std::vector<int>& reduce_dim, const std::vector<int>& reduce_strides,
+    const std::vector<int>& left_dim, const std::vector<int>& left_strides,
+    cudaStream_t stream) {
+#define CUB_RANK_CASE(i, ...)             \
+  case i: {                               \
+    constexpr auto kRank = i;             \
+    switch (reduce_rank) { __VA_ARGS__; } \
+  } break
+
+#define CUB_REDUCE_RANK_CASE(i, ...)                              \
+  case i: {                                                       \
+    constexpr auto kReduceRank = i;                               \
+    ReduceKernel<Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank,  \
+                 kReduceRank><<<left_num, BlockDim, 0, stream>>>( \
+        x_data, y_data, reducer, transformer, init, reduce_num,   \
+        Array<int, kRank>::From(x_strides),                       \
+        Array<int, kReduceRank>::From(reduce_dim),                \
+        Array<int, kReduceRank>::From(reduce_strides),            \
+        Array<int, kRank - kReduceRank>::From(left_dim),          \
+        Array<int, kRank - kReduceRank>::From(left_strides));     \
+  } break
+
+  int rank = x_strides.size();
+  int reduce_rank = reduce_strides.size();
+  if (rank == reduce_rank) {
+    cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
+        x_data, transformer);
+    size_t temp_storage_bytes = 0;
+    cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
+                              reduce_num, reducer, init, stream);
+    framework::Tensor tmp;
+    auto* temp_storage = tmp.mutable_data<uint8_t>(
+        framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
+        place);
+    cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
+                              reduce_num, reducer, init, stream);
+    return;
+  }
+  if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) {
+    ReduceKernel2D<Tx, Ty, ReduceOp, TransformOp,
+                   BlockDim><<<left_num, BlockDim, 0, stream>>>(
+        x_data, y_data, reducer, transformer, init, reduce_num);
+    return;
+  }
+  /*
+  if (rank == 3 && reduce_rank == 1 && reduce_dim[0] == 1) {
+    // TODO(liangdun): we can optimize 3d case which the 2nd axis is reduced.
+    // Currently, it is handled by code below, but inefficient
+    return;
+  }
+  */
+
+  switch (rank) {
+    CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1););
+
+    CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2););
+
+    CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
+                  CUB_REDUCE_RANK_CASE(3););
+
+    CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
+                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4););
+
+    CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
+                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
+                  CUB_REDUCE_RANK_CASE(5););
+
+    CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
+                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
+                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
+
+    CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
+                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
+                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6););
+
+    CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);
+                  CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);
+                  CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);
+                  CUB_REDUCE_RANK_CASE(7); CUB_REDUCE_RANK_CASE(8););
+  }
+
+#undef CUB_REDUCE_RANK_CASE
+#undef CUB_RANK_CASE
+}
+
+}  // namespace detail
+
+template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
+void TensorReduce(const framework::Tensor& x, framework::Tensor* y,
+                  std::vector<int> origin_reduce_dims, const Ty& init,
+                  const ReduceOp& reducer, const TransformOp& transformer,
+                  cudaStream_t stream) {
+  auto x_dim = framework::vectorize2int(x.dims());
+  std::vector<int> new_x_dim, new_reduce_dims;
+  int is_reduced = 0;
+  for (auto e : origin_reduce_dims) {
+    auto pos = e >= 0 ? e : e + x_dim.size();
+    is_reduced |= 1 << e;
+  }
+  for (int i = 0; i < x_dim.size(); i++) {
+    if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) {
+      new_x_dim.push_back(x_dim[i]);
+      if ((is_reduced >> i) & 1)
+        new_reduce_dims.push_back(new_x_dim.size() - 1);
+    } else {
+      new_x_dim[new_x_dim.size() - 1] *= x_dim[i];
+    }
+  }
+  x_dim = new_x_dim;
+  origin_reduce_dims = new_reduce_dims;
+  int x_rank = static_cast<int>(x_dim.size());
+  std::set<int> left_set, reduce_set;
+  for (int i = 0; i < x_rank; ++i) left_set.insert(i);
+
+  for (auto e : origin_reduce_dims) {
+    left_set.erase(e);
+    reduce_set.insert(e);
+  }
+
+  std::vector<int> reduce_dim(reduce_set.begin(), reduce_set.end());
+  std::vector<int> left_dim(left_set.begin(), left_set.end());
+
+  std::vector<int> x_strides = detail::GetStrides(x_dim);
+  std::vector<int> reduce_strides = detail::GetStrides(x_dim, reduce_dim);
+  std::vector<int> left_strides = detail::GetStrides(x_dim, left_dim);
+  int reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]];
+  int left_num = 1;
+  if (left_dim.size()) left_num = left_strides[0] * x_dim[left_dim[0]];
+
+  std::vector<int> y_dim(left_dim.size());
+  for (int i = 0; i < left_dim.size(); ++i) {
+    y_dim[i] = x_dim[left_dim[i]];
+  }
+  auto x_data = x.data<Tx>();
+  auto y_data = y->mutable_data<Ty>(x.place());
+  if (reduce_num == 1) return;
+
+#define CUB_BLOCK_DIM_CASE(block_dim)                                    \
+  case block_dim: {                                                      \
+    constexpr auto kBlockDim = block_dim;                                \
+    detail::TensorReduceImpl<Tx, Ty, block_dim, ReduceOp, TransformOp>(  \
+        x_data, y_data, x.place(), reducer, transformer, init, left_num, \
+        reduce_num, x_strides, reduce_dim, reduce_strides, left_dim,     \
+        left_strides, stream);                                           \
+  } break
+
+  switch (detail::GetDesiredBlockDim(reduce_num)) {
+    CUB_BLOCK_DIM_CASE(512);
+    CUB_BLOCK_DIM_CASE(256);
+    CUB_BLOCK_DIM_CASE(128);
+    CUB_BLOCK_DIM_CASE(64);
+    CUB_BLOCK_DIM_CASE(32);
+    CUB_BLOCK_DIM_CASE(16);
+    CUB_BLOCK_DIM_CASE(8);
+    CUB_BLOCK_DIM_CASE(4);
+    CUB_BLOCK_DIM_CASE(2);
+  }
+#undef CUB_BLOCK_DIM_CASE
+}
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once

 #include <time.h>
+#include <atomic>

 #include <chrono>              // NOLINT
 #include <condition_variable>  // NOLINT

--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -15,6 +15,7 @@
 #pragma once

 #include <time.h>
+#include <condition_variable>  // NOLINT

 #include <functional>
 #include <string>

--- a/paddle/fluid/operators/distributed/rpc_server.h
+++ b/paddle/fluid/operators/distributed/rpc_server.h
@@ -14,6 +14,7 @@

 #pragma once

+#include <atomic>
 #include <set>
 #include <string>
 #include <thread>  // NOLINT

--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -89,7 +89,7 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<bool>("use_mkldnn", "(bool, default false). Used by MKLDNN.")
        .SetDefault(false);
    AddComment(string::Sprintf(R"DOC(
-Limited Elementwise %s Operator
+Elementwise %s Operator

 The equation is:


--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
@@ -290,12 +290,13 @@ class FusionGRUKernel : public framework::OpKernel<T> {
  void BatchCompute(const framework::ExecutionContext& ctx) const {
    using DeviceContext = paddle::platform::CPUDeviceContext;
    auto* x = ctx.Input<LoDTensor>("X");
+    INIT_BASE_INPUT_OUTPUT
+    INIT_BASE_SIZES
    if (x->lod()[0].size() == 2) {
+      xx->Resize({total_T, D3});
      SeqCompute(ctx);
      return;
    }
-    INIT_BASE_INPUT_OUTPUT
-    INIT_BASE_SIZES
    INIT_VEC_FUNC

    auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");

--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -424,11 +424,12 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
  void BatchCompute(const framework::ExecutionContext& ctx) const {
    using DeviceContext = platform::CPUDeviceContext;
    INIT_BASE_INPUT_OUTPUT
+    INIT_BASE_SIZES
    if (x->lod()[0].size() == 2) {
+      xx->Resize({x_dims[0], D4});
      SeqCompute(ctx);
      return;
    }
-    INIT_BASE_SIZES
    INIT_VEC_FUNC
    INIT_BASE_INPUT_DATAS


--- a/paddle/fluid/operators/math/depthwise_conv.cu
+++ b/paddle/fluid/operators/math/depthwise_conv.cu
--- a/paddle/fluid/operators/math/depthwise_conv.h
+++ b/paddle/fluid/operators/math/depthwise_conv.h
@@ -32,7 +32,8 @@ class DepthwiseConvFunctor {
  void operator()(const DeviceContext& context, const framework::Tensor& input,
                  const framework::Tensor& filter,
                  const std::vector<int>& strides,
-                  const std::vector<int>& paddings, framework::Tensor* output);
+                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations, framework::Tensor* output);
 };

 template <typename DeviceContext, typename T>
@@ -43,6 +44,7 @@ class DepthwiseConvInputGradFunctor {
                  const framework::Tensor& output_grad,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
                  framework::Tensor* input_grad);
 };

@@ -53,6 +55,7 @@ class DepthwiseConvFilterGradFunctor {
                  const framework::Tensor& output_grad,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
+                  const std::vector<int>& dilations,
                  framework::Tensor* filter_grad);
 };


--- a/paddle/fluid/operators/reduce_mean_op.cu
+++ b/paddle/fluid/operators/reduce_mean_op.cu
@@ -12,17 +12,64 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <vector>
+#include "paddle/fluid/operators/cub_reduce.h"
 #include "paddle/fluid/operators/reduce_mean_op.h"

-REGISTER_OP_CUDA_KERNEL(reduce_mean,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          float, ops::MeanFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          double, ops::MeanFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int, ops::MeanFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int64_t, ops::MeanFunctor>);
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct DivideFunctor {
+  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
+
+  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
+
+ private:
+  T n_inv;
+};
+
+template <typename T>
+class ReduceMeanKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+
+    auto dims = context.Attr<std::vector<int>>("dim");
+    bool keep_dim = context.Attr<bool>("keep_dim");
+
+    std::vector<int> reduce_dims;
+    if (reduce_all) {
+      reduce_dims.resize(input->dims().size());
+      for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
+    } else {
+      for (auto e : dims) {
+        reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
+      }
+    }
+
+    int reduce_num = 1;
+    for (int i = 0; i < reduce_dims.size(); ++i) {
+      reduce_num *= input->dims()[reduce_dims[i]];
+    }
+
+    auto stream = context.cuda_device_context().stream();
+    TensorReduce<T, T, cub::Sum, DivideFunctor<T>>(
+        *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
+        DivideFunctor<T>(reduce_num), stream);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>,
+                        ops::ReduceMeanKernel<double>,
+                        ops::ReduceMeanKernel<int>,
+                        ops::ReduceMeanKernel<int64_t>);
+
 REGISTER_OP_CUDA_KERNEL(
    reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                            float, ops::MeanGradFunctor>,

--- a/paddle/fluid/operators/reduce_sum_op.cu
+++ b/paddle/fluid/operators/reduce_sum_op.cu
@@ -12,17 +12,59 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "paddle/fluid/operators/cub_reduce.h"
 #include "paddle/fluid/operators/reduce_sum_op.h"

-REGISTER_OP_CUDA_KERNEL(reduce_sum,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          float, ops::SumFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          double, ops::SumFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int, ops::SumFunctor>,
-                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
-                                          int64_t, ops::SumFunctor>);
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct IdentityFunctor {
+  HOSTDEVICE explicit inline IdentityFunctor() {}
+
+  HOSTDEVICE inline T operator()(const T& x) const { return x; }
+};
+
+template <typename T>
+class ReduceSumKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+
+    auto dims = context.Attr<std::vector<int>>("dim");
+    bool keep_dim = context.Attr<bool>("keep_dim");
+
+    std::vector<int> reduce_dims;
+    if (reduce_all) {
+      reduce_dims.resize(input->dims().size());
+      for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i;
+    } else {
+      for (auto e : dims) {
+        reduce_dims.push_back(e >= 0 ? e : e + input->dims().size());
+      }
+    }
+
+    int reduce_num = 1;
+    for (int i = 0; i < reduce_dims.size(); ++i) {
+      reduce_num *= input->dims()[reduce_dims[i]];
+    }
+
+    auto stream = context.cuda_device_context().stream();
+    TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
+        *input, output, reduce_dims, static_cast<T>(0), cub::Sum(),
+        IdentityFunctor<T>(), stream);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel<float>,
+                        ops::ReduceSumKernel<double>, ops::ReduceSumKernel<int>,
+                        ops::ReduceSumKernel<int64_t>);
+
 REGISTER_OP_CUDA_KERNEL(
    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
                                           float, ops::SumGradFunctor>,

--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -77,8 +77,10 @@ class ScaleOpVarTypeInference : public framework::VarTypeInference {
    auto out_var_name = op_desc.Output("Out").front();
    auto *out_var = block->FindVarRecursive(out_var_name);

-    out_var->SetType(in_var.GetType());
-    out_var->SetDataType(in_var.GetDataType());
+    if (in_var_name != out_var_name) {
+      out_var->SetType(in_var.GetType());
+      out_var->SetDataType(in_var.GetDataType());
+    }
  }
 };


--- a/paddle/fluid/operators/select_op.cc
+++ b/paddle/fluid/operators/select_op.cc
--- a/paddle/fluid/operators/sequence_slice_op.h
+++ b/paddle/fluid/operators/sequence_slice_op.h
@@ -75,11 +75,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
    }

    for (size_t i = 0; i < n; ++i) {
-      PADDLE_ENFORCE_LT(0, offset_data[i],
+      PADDLE_ENFORCE_LE(0, offset_data[i],
                        "The offset[%d] must greater than zero.", i);
      PADDLE_ENFORCE_LT(0, length_data[i],
                        "The length[%d] must greater than zero.", i);
-      PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i],
+      PADDLE_ENFORCE_LE(lod[0][i] + offset_data[i] + length_data[i],
                        lod[0][i + 1], "The target tensor's length overflow.");
    }


--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -32,7 +32,7 @@ class SumKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    auto in_vars = context.MultiInputVar("X");
-    int N = in_vars.size();
+    size_t in_num = in_vars.size();
    auto out_var = context.OutputVar("Out");

    bool in_place = out_var == in_vars[0];
@@ -53,7 +53,7 @@ class SumKernel : public framework::OpKernel<T> {
      auto &place =
          *context.template device_context<DeviceContext>().eigen_device();
      // If in_place, just skip the first tensor
-      for (int i = in_place ? 1 : 0; i < N; i++) {
+      for (size_t i = in_place ? 1 : 0; i < in_num; i++) {
        if (in_vars[i]->IsType<framework::LoDTensor>()) {
          auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
          if (in_t.numel() == 0) {
@@ -101,13 +101,13 @@ class SumKernel : public framework::OpKernel<T> {

      // Runtime InferShape
      size_t first_dim = 0;
-      for (int i = 0; i < N; i++) {
+      for (size_t i = 0; i < in_num; i++) {
        auto &sel_row = get_selected_row(i);
        first_dim += sel_row.rows().size();
      }

      std::vector<int64_t> in_dim;
-      for (int i = 0; i < N; i++) {
+      for (size_t i = 0; i < in_num; i++) {
        auto &sel_row = get_selected_row(i);
        if (sel_row.rows().size() > 0) {
          in_dim = framework::vectorize(sel_row.value().dims());
@@ -116,7 +116,8 @@ class SumKernel : public framework::OpKernel<T> {
      }
      if (in_dim.empty()) {
        VLOG(3) << "WARNING: all the inputs are empty";
-        in_dim = framework::vectorize(get_selected_row(N - 1).value().dims());
+        in_dim =
+            framework::vectorize(get_selected_row(in_num - 1).value().dims());
      } else {
        in_dim[0] = static_cast<int64_t>(first_dim);
      }
@@ -133,7 +134,7 @@ class SumKernel : public framework::OpKernel<T> {
      math::SelectedRowsAddTo<DeviceContext, T> functor;

      int64_t offset = 0;
-      for (int i = 0; i < N; i++) {
+      for (size_t i = 0; i < in_num; i++) {
        auto &sel_row = get_selected_row(i);
        if (sel_row.rows().size() == 0) {
          continue;

--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -22,8 +22,6 @@
 namespace paddle {

 DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
-DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size");
-DEFINE_int32(tensorrt_workspace_size, 16 << 20, "TensorRT workspace size");

 namespace operators {

@@ -34,6 +32,8 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Ys", "A list of outputs").AsDuplicable();
    AddAttr<std::string>("subgraph", "the subgraph.");
    AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
+    AddAttr<int>("max_batch_size", "the maximum batch size.");
+    AddAttr<int>("workspace_size", "the workspace size.");
    AddComment("TensorRT engine operator.");
  }
 };

--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -28,8 +28,6 @@
 namespace paddle {

 DECLARE_int32(tensorrt_engine_batch_size);
-DECLARE_int32(tensorrt_max_batch_size);
-DECLARE_int32(tensorrt_workspace_size);

 namespace operators {

@@ -92,14 +90,14 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto engine_name = context.Attr<std::string>("engine_uniq_key");
+    int max_batch_size = context.Attr<int>("max_batch_size");
    if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
      Prepare(context);
    }
    auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
    auto input_names = context.op().Inputs("Xs");
    PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
-    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
-                      FLAGS_tensorrt_max_batch_size);
+    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size);

    std::vector<std::string> output_maps =
        context.Attr<std::vector<std::string>>("output_name_mapping");
@@ -173,8 +171,9 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
    // Get the ProgramDesc and pass to convert.
    framework::proto::BlockDesc block_desc;
    block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
-    int max_batch = FLAGS_tensorrt_max_batch_size;
-    auto max_workspace = FLAGS_tensorrt_workspace_size;
+    int max_batch_size = context.Attr<int>("max_batch_size");
+    int workspace_size = context.Attr<int>("workspace_size");
+
    auto params = context.Attr<std::vector<std::string>>("parameters");
    std::unordered_set<std::string> parameters;
    for (const auto& param : params) {
@@ -186,7 +185,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {

    // TODO(Superjomn) replace this with a different stream
    auto* engine = Singleton<TRT_EngineManager>::Global().Create(
-        max_batch, max_workspace, nullptr /*engine hold its own stream*/,
+        max_batch_size, workspace_size, nullptr /*engine hold its own stream*/,
        context.Attr<std::string>("engine_uniq_key"),
        boost::get<platform::CUDAPlace>(context.GetPlace()).device);


--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -58,8 +58,6 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
 using inference::analysis::SetAttr;

 TEST(TensorRTEngineOp, manual) {
-  FLAGS_tensorrt_engine_batch_size = 2;
-  FLAGS_tensorrt_max_batch_size = 2;
  framework::ProgramDesc program;
  auto* block_ = program.Proto()->add_blocks();
  block_->set_idx(0);
@@ -101,6 +99,8 @@ TEST(TensorRTEngineOp, manual) {
  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                       block_->SerializeAsString());
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", 2);
+  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
  SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
                                    std::vector<std::string>({}));
@@ -129,8 +129,6 @@ TEST(TensorRTEngineOp, manual) {
 }

 void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
-  FLAGS_tensorrt_engine_batch_size = batch_size;
-  FLAGS_tensorrt_max_batch_size = batch_size;
  framework::ProgramDesc program;
  framework::Scope scope;
  platform::CUDAPlace place;
@@ -195,8 +193,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {

  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                       block_->SerializeAsString());
-  SetAttr<int>(engine_op_desc.Proto(), "max_batch", batch_size);
-  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 2 << 10);
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", batch_size);
+  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
  SetAttr<std::vector<std::string>>(
      engine_op_desc.Proto(), "parameters",
      std::vector<std::string>({"y0", "y1", "y2", "y3"}));

--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -214,7 +214,6 @@ void BindVarDsec(pybind11::module *m) {
      .def("set_shapes", &pd::VarDesc::SetShapes)
      .def("set_dtype", &pd::VarDesc::SetDataType)
      .def("set_dtypes", &pd::VarDesc::SetDataTypes)
-      .def("set_capacity", &pd::VarDesc::SetCapacity)
      .def("shape", &pd::VarDesc::GetShape,
           pybind11::return_value_policy::reference)
      .def("shapes", &pd::VarDesc::GetShapes,
@@ -251,7 +250,6 @@ void BindVarDsec(pybind11::module *m) {
      .value("STEP_SCOPES", pd::proto::VarType::STEP_SCOPES)
      .value("LOD_RANK_TABLE", pd::proto::VarType::LOD_RANK_TABLE)
      .value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY)
-      .value("CHANNEL", pd::proto::VarType::CHANNEL)
      .value("PLACE_LIST", pd::proto::VarType::PLACE_LIST)
      .value("READER", pd::proto::VarType::READER)
      .value("RAW", pd::proto::VarType::RAW);
@@ -285,12 +283,12 @@ void BindOpDesc(pybind11::module *m) {
      .def("set_output", &pd::OpDesc::SetOutput)
      .def("input_arg_names", &pd::OpDesc::InputArgumentNames)
      .def("output_arg_names", &pd::OpDesc::OutputArgumentNames)
-      .def("rename_input", &pd::OpDesc::RenameInput)
-      .def("rename_output", &pd::OpDesc::RenameOutput)
+      .def("_rename_input", &pd::OpDesc::RenameInput)
+      .def("_rename_output", &pd::OpDesc::RenameOutput)
      .def("has_attr", &pd::OpDesc::HasAttr)
      .def("attr_type", &pd::OpDesc::GetAttrType)
      .def("attr_names", &pd::OpDesc::AttrNames)
-      .def("set_attr", &pd::OpDesc::SetAttr)
+      .def("_set_attr", &pd::OpDesc::SetAttr)
      .def("attr", &pd::OpDesc::GetAttr)
      .def("set_block_attr", &pd::OpDesc::SetBlockAttr)
      .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr)
@@ -300,8 +298,8 @@ void BindOpDesc(pybind11::module *m) {
             std::string ser(seriralized);
             self.SetAttr(name, ser);
           })
-      .def("block_attr_id", &pd::OpDesc::GetBlockAttrId)
-      .def("blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds)
+      .def("_block_attr_id", &pd::OpDesc::GetBlockAttrId)
+      .def("_blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds)
      .def("check_attrs", &pd::OpDesc::CheckAttrs)
      .def("infer_shape", &pd::OpDesc::InferShape)
      .def("infer_var_type", &pd::OpDesc::InferVarType)

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -21,7 +21,6 @@ limitations under the License. */
 #include <utility>
 #include <vector>

-#include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/framework.pb.h"

--- a/paddle/legacy/trainer/tests/CMakeLists.txt
+++ b/paddle/legacy/trainer/tests/CMakeLists.txt
@@ -16,7 +16,11 @@ endfunction()
 trainer_test(test_Compare)
 trainer_test(test_PyDataProviderWrapper)
 trainer_test(test_recurrent_machine_generation)
-trainer_test(test_Trainer)
+if(NOT APPLE)
+  trainer_test(test_Trainer)
+else()
+  message(WARNING "These tests has been disabled in OSX for random fail: \n test_Trainer") 
+endif()

 ############### test_TrainerOnePass ##########################
 if(WITH_PYTHON)

--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -70,8 +70,8 @@ function cmake_gen() {
    PYTHON_FLAGS=""
    SYSTEM=`uname -s`
    if [ "$SYSTEM" == "Darwin" ]; then
+        echo "using python abi: $1"
        if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then
-            echo "using python abi: $1"
            if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then
                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
@@ -82,7 +82,18 @@ function cmake_gen() {
            else
                exit 1
            fi
-        # TODO: qiyang add python3 part here 
+        elif [ "$1" == "cp35-cp35m" ]; then
+            if [ -d "/Library/Frameworks/Python.framework/Versions/3.5" ]; then
+                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/
+                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/
+                export PATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/:${PATH}
+                PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3
+            -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/
+            -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib"
+                WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
+            else
+                exit 1
+            fi
        fi
    else 
        if [ "$1" != "" ]; then
@@ -384,10 +395,11 @@ EOF
        ctest --output-on-failure -j $1     
        # make install should also be test when unittest 
        make install -j 8
-        pip install /usr/local/opt/paddle/share/wheels/*.whl
+        pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
        if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
            paddle version
        fi
+        pip uninstall -y paddlepaddle
    fi
 }

@@ -586,7 +598,7 @@ EOF
 EOF

    if [[ ${WITH_GPU} == "ON"  ]]; then
-        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} &&"
+        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} || true"
    else
        NCCL_DEPS=""
    fi
@@ -602,9 +614,8 @@ EOF
    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
    ADD python/dist/*.whl /
    # run paddle version to install python packages first
-    RUN apt-get update &&\
-        ${NCCL_DEPS}\
-        apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \
+    RUN apt-get update && ${NCCL_DEPS}
+    RUN apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \
        pip install /*.whl; apt-get install -f -y && \
        apt-get clean -y && \
        rm -f /*.whl && \
@@ -735,7 +746,7 @@ function main() {
        cmake_gen ${PYTHON_ABI:-""}
        build
        run_test
-        assert_api_not_changed
+        assert_api_not_changed ${PYTHON_ABI:-""}
        ;;
      *)
        print_usage

--- a/python/paddle/dataset/common.py
+++ b/python/paddle/dataset/common.py
@@ -77,13 +77,14 @@ def download(url, module_name, md5sum, save_name=None):
    retry_limit = 3
    while not (os.path.exists(filename) and md5file(filename) == md5sum):
        if os.path.exists(filename):
-            print("file md5", md5file(filename), md5sum)
+            sys.stderr.write("file %s  md5 %s" % (md5file(filename), md5sum))
        if retry < retry_limit:
            retry += 1
        else:
            raise RuntimeError("Cannot download {0} within retry limit {1}".
                               format(url, retry_limit))
-        print("Cache file %s not found, downloading %s" % (filename, url))
+        sys.stderr.write("Cache file %s not found, downloading %s" %
+                         (filename, url))
        r = requests.get(url, stream=True)
        total_length = r.headers.get('content-length')

@@ -100,10 +101,11 @@ def download(url, module_name, md5sum, save_name=None):
                    dl += len(data)
                    f.write(data)
                    done = int(50 * dl / total_length)
-                    sys.stdout.write("\r[%s%s]" % ('=' * done,
+                    sys.stderr.write("\r[%s%s]" % ('=' * done,
                                                   ' ' * (50 - done)))
                    sys.stdout.flush()
-
+    sys.stderr.write("\n")
+    sys.stdout.flush()
    return filename



--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -75,8 +75,8 @@ class ErrorClipByValue(BaseErrorClipAttr):
        clip_op_desc.set_type("clip")
        clip_op_desc.set_input("X", [grad_name])
        clip_op_desc.set_output("Out", [grad_name])
-        clip_op_desc.set_attr("min", self.min)
-        clip_op_desc.set_attr("max", self.max)
+        clip_op_desc._set_attr("min", self.min)
+        clip_op_desc._set_attr("max", self.max)


 def error_clip_callback(block, context):
@@ -271,7 +271,8 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
                    "All parameters' 'clip_norm' of a same group should be the same"
                )

-        local_norm_var = layers.reduce_sum(input=layers.pow(x=grad, factor=2.0))
+        square = grad * grad
+        local_norm_var = layers.cast(layers.reduce_sum(input=square), 'float64')
        context[self.group_name].append(local_norm_var)

        self.context = context
@@ -281,6 +282,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
        if group_scale_name not in self.context:
            group_norm_var = layers.sums(input=self.context[self.group_name])
            group_norm_var = layers.sqrt(x=group_norm_var)
+            group_norm_var = layers.cast(group_norm_var, 'float32')
            clip_var = self.context[self.group_name + "_clip"]
            group_scale_var = layers.elementwise_div(
                x=clip_var,

--- a/python/paddle/fluid/concurrency.py
+++ b/python/paddle/fluid/concurrency.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -21,7 +21,7 @@ from .. import core
 from ..framework import Program, Variable, Operator
 from ..layer_helper import LayerHelper, unique_name
 from ..initializer import force_init_on_cpu
-from .ops import logical_and, logical_not, logical_or
+from .nn import logical_and, logical_not, logical_or
 import numpy
 import warnings
 import six

--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
--- a/python/paddle/fluid/tests/no_test_concurrency.py
+++ b/python/paddle/fluid/tests/no_test_concurrency.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr.py
--- a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
--- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
--- a/python/paddle/fluid/tests/unittests/dist_text_classification.py
+++ b/python/paddle/fluid/tests/unittests/dist_text_classification.py
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
--- a/python/paddle/fluid/tests/unittests/dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py
--- a/python/paddle/fluid/tests/unittests/test_auc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_auc_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
--- a/python/paddle/fluid/tests/notest_concurrency.py
+++ b/python/paddle/fluid/tests/notest_concurrency.py
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
--- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
--- a/python/paddle/fluid/tests/unittests/test_dist_text_classification.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
--- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
--- a/python/paddle/fluid/tests/unittests/test_infer_shape.py
+++ b/python/paddle/fluid/tests/unittests/test_infer_shape.py
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
--- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py