Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into hot-fix-utest

cf4dbaa9 · lujun · 218e02ac · ad45a083 · cf4dbaa9 · cf4dbaa9
39 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -75,7 +75,6 @@ option(WITH_INFERENCE_API_TEST   "Test fluid inference high-level api interface"
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
 option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
 option(WITH_FAST_MATH   "Make use of fast math library, might affect the precision to some extent" ON)
-option(WITH_WBAES       "Compile PaddlePaddle with WBAES support"       ON)

 # PY_VERSION
 if(NOT PY_VERSION)
@@ -149,7 +148,6 @@ include(external/dlpack)
 include(external/snappy)    # download snappy
 include(external/snappystream) # download snappystream
 include(external/warpctc)   # download, build, install warpctc
-include(external/wbaes)     # download wbaes

 if (NOT WIN32)
 # there is no official support of nccl, cupti in windows

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -157,7 +157,3 @@ endif(WITH_BRPC_RDMA)
 if(ON_INFER)
    add_definitions(-DPADDLE_ON_INFERENCE)
 endif(ON_INFER)
-
-if(WITH_WBAES)
-    add_definitions(-DPADDLE_WITH_WBAES)
-endif(WITH_WBAES)
--- a/cmake/external/wbaes.cmake
+++ b/cmake/external/wbaes.cmake
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-IF(NOT ${WITH_WBAES})
-    return()
-ENDIF(NOT ${WITH_WBAES})
-
-INCLUDE(ExternalProject)
-SET(WBAES_DST_DIR       "wbaes")
-SET(WBAES_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
-SET(WBAES_INSTALL_DIR   ${WBAES_INSTALL_ROOT}/${WBAES_DST_DIR})
-SET(WBAES_ROOT          ${WBAES_INSTALL_DIR})
-SET(WBAES_INC_DIR       ${WBAES_ROOT}/include)
-SET(WBAES_LIB_DIR       ${WBAES_ROOT}/lib)
-
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${WBAES_ROOT}/lib")
-SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
-
-IF(APPLE)
-    SET(WBAES_TAG   "v1.0.0" CACHE STRING "" FORCE)
-    SET(WBAES_URL   "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.mac.${WBAES_TAG}.tgz" CACHE STRING "" FORCE)
-    SET(WBAES_LIB   ${WBAES_LIB_DIR}/libwbaes.dylib)
-    SET(WBAES_SHARED_LIB   ${WBAES_LIB_DIR}/libwbaes.dylib)
-ELSEIF(WIN32)
-    SET(WBAES_TAG   "v1.0.0" CACHE STRING "" FORCE)
-    SET(WBAES_URL   "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.windows-x64.${WBAES_TAG}.tgz" CACHE STRING "" FORCE)
-    SET(WBAES_LIB   ${WBAES_LIB_DIR}/libwbaes.lib)
-    SET(WBAES_SHARED_LIB   ${WBAES_LIB_DIR}/libwbaes.dll)
-ELSE()
-    SET(WBAES_TAG   "v1.0.2" CACHE STRING "" FORCE)
-    SET(WBAES_URL   "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.linux-x86_64.${WBAES_TAG}.tgz" CACHE STRING "" FORCE)
-    SET(WBAES_LIB   ${WBAES_LIB_DIR}/libwbaes.so)
-    SET(WBAES_SHARED_LIB   ${WBAES_LIB_DIR}/libwbaes.so)
-ENDIF()
-
-SET(WBAES_PROJECT       "extern_wbaes")
-MESSAGE(STATUS "WBAES_URL: ${WBAES_URL}, WBAES_LIB: ${WBAES_LIB}")
-SET(WBAES_SOURCE_DIR    "${THIRD_PARTY_PATH}/wbaes") 
-SET(WBAES_DOWNLOAD_DIR  "${WBAES_SOURCE_DIR}/src/${WBAES_PROJECT}")
-
-ExternalProject_Add(
-    ${WBAES_PROJECT}
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                  ${WBAES_SOURCE_DIR}
-    URL                     ${WBAES_URL}
-    DOWNLOAD_DIR            ${WBAES_DOWNLOAD_DIR}
-    DOWNLOAD_NO_PROGRESS    1
-    CONFIGURE_COMMAND       ""
-    BUILD_COMMAND           ""
-    INSTALL_COMMAND         ""
-        ${CMAKE_COMMAND} -E copy_directory ${WBAES_DOWNLOAD_DIR}/include ${WBAES_INC_DIR} &&
-        ${CMAKE_COMMAND} -E copy_directory ${WBAES_DOWNLOAD_DIR}/lib ${WBAES_LIB_DIR}
-)
-
-INCLUDE_DIRECTORIES(${WBAES_INC_DIR})
-
-ADD_LIBRARY(wbaes SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET wbaes PROPERTY IMPORTED_LOCATION ${WBAES_LIB})
-SET_PROPERTY(TARGET wbaes PROPERTY IMPORTED_NO_SONAME 1)
-ADD_DEPENDENCIES(wbaes ${WBAES_PROJECT})
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -264,14 +264,6 @@ function(cc_library TARGET_NAME)
        list(REMOVE_ITEM cc_library_DEPS warpctc)
        add_dependencies(${TARGET_NAME} warpctc)
      endif()
-      # Only deps libwbaes.so, not link
-      if("${cc_library_DEPS};" MATCHES "wbaes;")
-        list(REMOVE_ITEM cc_library_DEPS wbaes)
-        if(NOT "${TARGET_NAME}" MATCHES "dynload_wbaes")
-          list(APPEND cc_library_DEPS dynload_wbaes)
-        endif()
-        add_dependencies(${TARGET_NAME} wbaes)
-      endif()
      # Only deps libmklml.so, not link
      if("${cc_library_DEPS};" MATCHES "mklml;")
        list(REMOVE_ITEM cc_library_DEPS mklml)

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -170,14 +170,6 @@ copy(snappystream_lib
        DSTS ${dst_dir} ${dst_dir}/lib
        DEPS snappystream)

-if (WITH_WBAES)
-    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/wbaes")
-    copy(wbaes_lib
-            SRCS ${WBAES_INC_DIR} ${WBAES_LIB}
-            DSTS ${dst_dir} ${dst_dir}/lib
-            DEPS wbaes)
-endif ()
-
 set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
 copy(zlib_lib
        SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -235,6 +235,7 @@ paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], vararg
 paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '776d536cac47c89073abc7ee524d5aec'))
 paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607'))
 paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329'))
+paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', 'ad669cdf83e72a69ebc5ed79e36486de'))
 paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393'))
 paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139'))
 paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc'))

--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -3,4 +3,7 @@ cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybi
 cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
 cc_library(engine SRCS engine.cc)
 cc_library(imperative_profiler SRCS profiler.cc)
+cc_library(nccl_context SRCS nccl_context.cc DEPS device_context)
+
+cc_test(nccl_context_test SRCS nccl_context_test.cc  DEPS nccl_context)
 endif()
--- a/paddle/fluid/imperative/nccl_context.cc
+++ b/paddle/fluid/imperative/nccl_context.cc
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/nccl_context.h"
+
+namespace paddle {
+namespace imperative {
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+void NCCLParallelContext::RecvNCCLID(const std::string &ep,
+                                     ncclUniqueId *nccl_id) {
+  auto addr = paddle::string::Split(ep, ':');
+  PADDLE_ENFORCE_EQ(addr.size(), 2UL,
+                    "The endpoint should contain host and port: %s", ep);
+  std::string host = addr[0];
+  int port = std::stoi(addr[1]);
+
+  int server_fd, new_socket;
+  struct sockaddr_in address;
+  int addrlen = sizeof(address);
+  char buffer[1024] = {0};
+  int opt = 0;
+  // creating socket fd
+  if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0)
+    PADDLE_THROW("create server fd failed");
+  if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt,
+                 sizeof(opt)))
+    PADDLE_THROW("set socket opt failed");
+
+  address.sin_family = AF_INET;
+  address.sin_addr.s_addr = INADDR_ANY;
+  address.sin_port = htons(port);
+
+  if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0)
+    PADDLE_THROW("binding failed on ep: %s", ep);
+  VLOG(3) << "listening on: " << ep;
+  if (listen(server_fd, 3) < 0) PADDLE_THROW("listen on server fd failed");
+
+  if ((new_socket =
+           accept(server_fd, reinterpret_cast<struct sockaddr *>(&address),
+                  reinterpret_cast<socklen_t *>(&addrlen))) < 0)
+    PADDLE_THROW("accept the new socket fd failed");
+
+  if (read(new_socket, buffer, 1024) < 0)
+    PADDLE_THROW("reading the ncclUniqueId from socket failed");
+  VLOG(3) << "recevived the ncclUniqueId";
+  memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES);
+
+  VLOG(3) << "closing the socket server: " << ep;
+  close(server_fd);
+}
+
+void NCCLParallelContext::SendNCCLID(const std::string &ep,
+                                     ncclUniqueId *nccl_id) {
+  auto addr = paddle::string::Split(ep, ':');
+  PADDLE_ENFORCE_EQ(addr.size(), 2UL,
+                    "The endpoint should contain host and port: %s", ep);
+  std::string host = addr[0];
+  int port = std::stoi(addr[1]);
+  // struct sockaddr_in address;
+  int sock = 0;
+  struct sockaddr_in serv_addr;
+  char buffer[1024] = {0};
+
+  memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES);
+  if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0)
+    PADDLE_THROW("create socket failed");
+
+  memset(&serv_addr, '0', sizeof(serv_addr));
+  serv_addr.sin_family = AF_INET;
+  serv_addr.sin_port = htons(port);
+
+  if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0)
+    PADDLE_THROW("invalied address: %s", ep);
+
+  while (true) {
+    if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
+      VLOG(0) << "worker: " << ep
+              << " is not ready, will retry after 3 seconds...";
+      std::this_thread::sleep_for(std::chrono::seconds(3));
+      continue;
+    }
+    VLOG(3) << "sending the ncclUniqueId to " << ep;
+    send(sock, buffer, NCCL_UNIQUE_ID_BYTES, 0);
+    break;
+  }
+}
+
+void NCCLParallelContext::BcastNCCLId(ncclUniqueId *nccl_id, int root) {
+  if (strategy_.local_rank_ == root) {
+    for (auto ep : strategy_.trainer_endpoints_) {
+      if (ep != strategy_.current_endpoint_) SendNCCLID(ep, nccl_id);
+    }
+  } else {
+    RecvNCCLID(strategy_.current_endpoint_, nccl_id);
+  }
+}
+
+void NCCLParallelContext::Init() {
+  ncclUniqueId nccl_id;
+  ncclComm_t comm;
+  if (strategy_.local_rank_ == 0) {
+    // generate the unique ncclid on the root worker
+    platform::dynload::ncclGetUniqueId(&nccl_id);
+    BcastNCCLId(&nccl_id, 0);
+  } else {
+    BcastNCCLId(&nccl_id, 0);
+  }
+  int gpu_id = boost::get<platform::CUDAPlace>(place_).device;
+  VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
+          << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id;
+
+  PADDLE_ENFORCE(cudaSetDevice(gpu_id));
+  PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
+      &comm, strategy_.nranks_, nccl_id, strategy_.local_rank_));
+
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(pool.Get(place_));
+  dev_ctx->set_nccl_comm(comm);
+}
+#endif
+
+}  //  namespace imperative
+}  //  namespace paddle
--- a/paddle/fluid/imperative/nccl_context.h
+++ b/paddle/fluid/imperative/nccl_context.h
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+// network header files
+#ifndef _WIN32
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#endif
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/device_context.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/dynload/nccl.h"
+#endif
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/string/split.h"
+
+namespace paddle {
+namespace imperative {
+
+struct ParallelStrategy {
+  int nranks_{1};
+  int local_rank_{0};
+  std::vector<std::string> trainer_endpoints_{};
+  std::string current_endpoint_{""};
+};
+
+class ParallelContext {
+ public:
+  explicit ParallelContext(const ParallelStrategy& strategy,
+                           const platform::Place& place)
+      : strategy_(strategy), place_(place) {}
+
+  virtual ~ParallelContext() {}
+
+  virtual void Init() = 0;
+
+ protected:
+  ParallelStrategy strategy_;
+  platform::Place place_;
+};
+
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+class NCCLParallelContext : ParallelContext {
+ public:
+  explicit NCCLParallelContext(const ParallelStrategy& strategy,
+                               const platform::Place& place)
+      : ParallelContext(strategy, place) {}
+
+  ~NCCLParallelContext() {}
+
+  void BcastNCCLId(ncclUniqueId* nccl_id, int root);
+
+  void Init() override;
+
+ protected:
+  void RecvNCCLID(const std::string& endpoint, ncclUniqueId* nccl_id);
+
+  void SendNCCLID(const std::string& endpoint, ncclUniqueId* nccl_id);
+};
+#endif
+
+}  //  namespace imperative
+}  //  namespace paddle
--- a/paddle/fluid/imperative/nccl_context_test.cc
+++ b/paddle/fluid/imperative/nccl_context_test.cc
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/imperative/nccl_context.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace imperative = paddle::imperative;
+namespace platform = paddle::platform;
+
+imperative::ParallelStrategy GetStrategy(int local_rank) {
+  std::vector<std::string> eps = {"127.0.0.1:9866", "127.0.0.1:9867"};
+  imperative::ParallelStrategy strategy;
+  strategy.trainer_endpoints_ = eps;
+  strategy.current_endpoint_ = eps[local_rank];
+  strategy.nranks_ = 2;
+  strategy.local_rank_ = local_rank;
+  return strategy;
+}
+
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+void BcastNCCLId(int local_rank, ncclUniqueId *nccl_id) {
+  auto strategy = GetStrategy(local_rank);
+  platform::CUDAPlace gpu(local_rank);
+  imperative::NCCLParallelContext ctx(strategy, gpu);
+  ctx.BcastNCCLId(nccl_id, 0);
+}
+
+TEST(BcastNCCLId, Run) {
+  ncclUniqueId nccl_id;
+  platform::dynload::ncclGetUniqueId(&nccl_id);
+  std::thread t(BcastNCCLId, 0, &nccl_id);
+
+  ncclUniqueId recv_nccl_id;
+  BcastNCCLId(1, &recv_nccl_id);
+
+  t.join();
+  EXPECT_EQ(0, std::memcmp(nccl_id.internal, recv_nccl_id.internal,
+                           NCCL_UNIQUE_ID_BYTES));
+}
+#endif
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -177,7 +177,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
        current_vars_map[out->Name()] = out;
      }

-      VLOG(3) << "input var name: " << out->Name()
+      VLOG(3) << "output var name: " << out->Name()
              << " inited: " << out->var_->IsInitialized()
              << " stop_grad: " << out->IsStopGradient();
    }
@@ -215,6 +215,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,

  framework::Scope scope;
  op->place_ = GetExpectedPlace(expected_place, inputs);
+
  PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_);
  prepared_op.op.RuntimeInferShape(scope, op->place_, ctx);
  prepared_op.func(

--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -87,6 +87,7 @@ const std::vector<std::string> kAnakinSubgraphPasses({
 GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
  passes_.assign({
    "infer_clean_graph_pass",          //
+        "runtime_context_cache_pass",  //
        //   "identity_scale_op_clean_pass",              //
        "conv_affine_channel_fuse_pass",             //
        "conv_eltwiseadd_affine_channel_fuse_pass",  //
@@ -96,7 +97,6 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
        "conv_elementwise_add_act_fuse_pass",   //
        "conv_elementwise_add2_act_fuse_pass",  //
        "conv_elementwise_add_fuse_pass",       //
-        "runtime_context_cache_pass",           //
 #endif                                          //
        "transpose_flatten_concat_fuse_pass",
  });
@@ -117,6 +117,10 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
  // not be damaged by smaller ones.
  passes_.assign({
      "infer_clean_graph_pass",  //
+      // TODO(luotao): runtime_context_cache_pass should be located in the
+      // front, see https://github.com/PaddlePaddle/Paddle/issues/16609,
+      // will enhance this pass later.
+      "runtime_context_cache_pass",     //
      "attention_lstm_fuse_pass",       //
      "seqpool_concat_fuse_pass",       //
      "seqconv_eltadd_relu_fuse_pass",  //
@@ -132,7 +136,6 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
      "conv_bn_fuse_pass",             //
      "conv_eltwiseadd_bn_fuse_pass",  //
      "is_test_pass",                  //
-      "runtime_context_cache_pass",    //
  });

  use_gpu_ = false;

--- a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc
@@ -23,18 +23,11 @@ namespace analysis {

 void SetConfig(AnalysisConfig *cfg) {
  cfg->SetModel(FLAGS_infer_model);
-  cfg->SetProgFile("__model__");
  cfg->DisableGpu();
  cfg->SwitchIrOptim();
-  cfg->SwitchSpecifyInputNames(false);
+  cfg->SwitchSpecifyInputNames();
  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
  cfg->EnableMKLDNN();
-  cfg->pass_builder()->SetPasses(
-      {"infer_clean_graph_pass", "mkldnn_placement_pass",
-       "depthwise_conv_mkldnn_pass", "conv_bn_fuse_pass",
-       "conv_eltwiseadd_bn_fuse_pass", "conv_bias_mkldnn_fuse_pass",
-       "conv_elementwise_add_mkldnn_fuse_pass", "conv_relu_mkldnn_fuse_pass",
-       "fc_fuse_pass", "is_test_pass"});
 }

 template <typename T>
@@ -84,13 +77,13 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
          std::to_string(num_images) + " is bigger than all test data size.");

  PaddleTensor images;
-  images.name = "input";
+  images.name = "image";
  images.shape = {num_images, 3, 224, 224};
  images.dtype = PaddleDType::FLOAT32;
  images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224);

  PaddleTensor labels;
-  labels.name = "labels";
+  labels.name = "label";
  labels.shape = {num_images, 1};
  labels.dtype = PaddleDType::INT64;
  labels.data.Resize(sizeof(int64_t) * num_images);
@@ -132,7 +125,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
      images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224;

  TensorReader<float> image_reader(file, images_offset_in_file,
-                                   image_batch_shape, "input");
+                                   image_batch_shape, "image");
  TensorReader<int64_t> label_reader(file, labels_offset_in_file,
                                     label_batch_shape, "label");


--- a/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md
+++ b/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md
+# INT8 MKL-DNN quantization 
+
+This document describes how to use Paddle inference Engine to convert the FP32 model to INT8 model on ResNet-50 and MobileNet-V1. We provide the instructions on enabling INT8 MKL-DNN quantization in Paddle inference and show the ResNet-50 and MobileNet-V1 results in accuracy and performance.
+
+## 0. Install PaddlePaddle 
+Follow PaddlePaddle [installation instruction](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#installation) to install PaddlePaddle. If you build PaddlePaddle yourself, please use the following cmake arguments. 
+```
+cmake ..  -DWITH_TESTING=ON -WITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_MKL=ON  -WITH_SWIG_PY=OFF -DWITH_INFERENCE_API_TEST=ON -DON_INFER=ON
+
+```  
+Note: MKL-DNN and MKL are required.
+
+## 1. Enable INT8 MKL-DNN quantization 
+For reference, please examine the code of unit test enclosed in [analyzer_int8_image_classification_tester.cc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc).
+
+* ### Create Analysis config
+INT8 quantization is one of the optimizations in analysis config. More information about analysis config can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/advanced_usage/deploy/inference/native_infer_en.md#upgrade-performance-based-on-contribanalysisconfig-prerelease) 
+
+* ### Create quantize config by analysis config
+We enable the MKL-DNN quantization procedure by calling an appropriate method from analysis config. Afterwards, all the required quantization parameters (quantization op names, quantization strategies etc.) can be set through quantizer config which is present in the analysis config. It is also necessary to specify a pre-processed warmup dataset and desired batch size.
+
+```cpp
+//Enable MKL-DNN quantization
+cfg.EnableMkldnnQuantizer();
+
+//use analysis config to call the MKL-DNN quantization config
+cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data); 
+cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);
+```
+
+## 2. Accuracy and Performance benchmark
+
+We provide the results of accuracy and performance measured on Intel(R) Xeon(R) Gold 6271 on single core.
+
+   >**I. Top-1 Accuracy on Intel(R) Xeon(R) Gold 6271**
+
+| Model  | Dataset  | FP32 Accuracy  | INT8 Accuracy  | Accuracy Diff  |
+| :------------: | :------------: | :------------: | :------------: | :------------: |
+| ResNet-50  | Full ImageNet Val  | 76.63%  | 76.48%  | 0.15% |
+| MobileNet-V1 | Full ImageNet Val  | 70.78%  | 70.36%  | 0.42%  |
+
+   >**II. Throughput on Intel(R) Xeon(R) Gold 6271 (batch size 1 on single core)**
+
+| Model  | Dataset  | FP32 Throughput  | INT8 Throughput  |  Ratio(INT8/FP32)  |
+| :------------: | :------------: | :------------: | :------------: | :------------: |
+| ResNet-50  | Full ImageNet Val  |  13.17 images/s | 49.84 images/s | 3.78 |
+| MobileNet-V1 | Full ImageNet Val  | 75.49 images/s | 232.38 images/s | 3.07  |
+
+Notes:
+* Measurement of accuracy requires a model which accepts two inputs: data and labels.
+* Different sampling batch size data may cause slight difference on INT8 top accuracy.
+* CAPI performance data is better than python API performance data because of the python overhead. Especially for the small computational model, python overhead will be more obvious. 
+
+
+## 3. Commands to reproduce the above accuracy and performance benchmark
+* #### Full dataset (Single core)
+   * ##### Download full ImageNet Validation Dataset
+```bash
+cd /PATH/TO/PADDLE/build
+python ../paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
+```
+The converted data binary file is saved by default in ~/.cache/paddle/dataset/int8/download/int8_full_val.bin
+   * ##### ResNet50 Full dataset benchmark
+```bash
+./paddle/fluid/inference/tests/api/test_analyzer_int8_resnet50 --infer_model=third_party/inference_demo/int8v2/resnet50/model --infer_data=/path/to/converted/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
+```
+   * ##### Mobilenet-v1 Full dataset benchmark
+```bash
+./paddle/fluid/inference/tests/api/test_analyzer_int8_mobilenet --infer_model=third_party/inference_demo/int8v2/mobilenet/model --infer_data=/path/to/converted/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
+```
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -316,7 +316,8 @@ void PredictionRun(PaddlePredictor *predictor,
                   int num_threads, int tid) {
  int num_times = FLAGS_repeat;
  int iterations = inputs.size();  // process the whole dataset ...
-  if (FLAGS_iterations > 0 && FLAGS_iterations < inputs.size())
+  if (FLAGS_iterations > 0 &&
+      FLAGS_iterations < static_cast<int64_t>(inputs.size()))
    iterations =
        FLAGS_iterations;  // ... unless the number of iterations is set
  outputs->resize(iterations);
@@ -329,14 +330,14 @@ void PredictionRun(PaddlePredictor *predictor,
 #endif
  if (!FLAGS_zero_copy) {
    run_timer.tic();
-    for (size_t i = 0; i < iterations; i++) {
+    for (int i = 0; i < iterations; i++) {
      for (int j = 0; j < num_times; j++) {
        predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size);
      }
    }
    elapsed_time = run_timer.toc();
  } else {
-    for (size_t i = 0; i < iterations; i++) {
+    for (int i = 0; i < iterations; i++) {
      ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]);
      run_timer.tic();
      for (int j = 0; j < num_times; j++) {
@@ -366,9 +367,8 @@ void TestOneThreadPrediction(
    const std::vector<std::vector<PaddleTensor>> &inputs,
    std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true) {
  auto predictor = CreateTestPredictor(config, use_analysis);
-  PredictionWarmUp(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads,
-                   0);
-  PredictionRun(predictor.get(), inputs, outputs, FLAGS_paddle_num_threads, 0);
+  PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0);
+  PredictionRun(predictor.get(), inputs, outputs, 1, 0);
 }

 void TestMultiThreadPrediction(

--- a/paddle/fluid/operators/controlflow/conditional_block_op.cc
+++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc
@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/var_type.h"
@@ -174,24 +177,41 @@ class ConditionalBlockGradOp : public ConditionalOp {

      framework::Executor exec(dev_place);
      auto *block = Attr<framework::BlockDesc *>("sub_block");
-      exec.Run(*block->Program(), &cur_scope, block->ID(), false);

-      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Input"),
-                                  Outputs(framework::GradVarName("Input")));
+      const auto &ins = Inputs("Input");
+      const auto &d_ins = Outputs(framework::GradVarName("Input"));
+      const auto &conds = Inputs("Cond");
+      const auto &d_conds = Outputs(framework::GradVarName("Cond"));
+
+      std::vector<std::string> ins_conds_grads;
+      ins_conds_grads.reserve(ins.size() + conds.size());
+      for (auto &in : ins) {
+        ins_conds_grads.emplace_back(framework::GradVarName(in));
+      }
+      for (auto &cond : conds) {
+        ins_conds_grads.emplace_back(framework::GradVarName(cond));
+      }
+
+      exec.Run(*block->Program(), &cur_scope, block->ID(), false, true,
+               ins_conds_grads);
+
+      AssignLocalGradientToGlobal(dev_place, cur_scope, ins_conds_grads.data(),
+                                  ins.size(), d_ins);

-      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Cond"),
-                                  Outputs(framework::GradVarName("Cond")));
+      AssignLocalGradientToGlobal(dev_place, cur_scope,
+                                  ins_conds_grads.data() + ins.size(),
+                                  conds.size(), d_conds);
    }
  }

 private:
  void AssignLocalGradientToGlobal(
      const platform::Place &place, const framework::Scope &cur_scope,
-      const std::vector<std::string> &p_names,
+      const std::string *p_grad_names, size_t p_grad_names_num,
      const std::vector<std::string> &pg_names) const {
-    for (size_t i = 0; i < p_names.size(); ++i) {
+    for (size_t i = 0; i < p_grad_names_num; ++i) {
      auto out_grad_name = pg_names[i];
-      auto in_grad_name = framework::GradVarName(p_names[i]);
+      const auto &in_grad_name = p_grad_names[i];
      auto *in_var = cur_scope.FindVar(in_grad_name);
      if (in_var == nullptr) {
        continue;

--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
@@ -991,15 +991,17 @@ TEST(JITKernel_pool, jitpool) {

 TEST(JITKernel_pool, more) {
  const auto& kers = jit::KernelPool::Instance().AllKernels();
-#if defined(__APPLE__) || defined(__OSX__)
-  EXPECT_EQ(kers.size(), 10UL);
-#else
-#ifdef PADDLE_WITH_MKLML
-  EXPECT_EQ(kers.size(), 22UL);
-#else
-  EXPECT_EQ(kers.size(), 8UL);
+  size_t target_num = 8;
+
+#ifdef __AVX__
+  target_num += 2;
 #endif
+
+#ifdef PADDLE_WITH_MKLML
+  target_num += 12;
 #endif
+
+  EXPECT_EQ(kers.size(), target_num);
 }

 TEST(JITKernel_pool, refer) {

--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -29,7 +29,7 @@ class LoadOp : public framework::OperatorWithKernel {
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    framework::OpKernelType kt = framework::OpKernelType(
-        framework::proto::VarType::FP32, platform::CPUPlace());
+        framework::proto::VarType::FP32, ctx.GetPlace());
    return kt;
  }
 };

--- a/paddle/fluid/operators/pixel_shuffle_op.cc
+++ b/paddle/fluid/operators/pixel_shuffle_op.cc
+/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/pixel_shuffle_op.h"
+#include <memory>
+
+namespace paddle {
+namespace operators {
+
+class PixelShuffleOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of PixelShuffleOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of PixelShuffleOp should not be null.");
+
+    auto input_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
+    auto upscale_factor = ctx->Attrs().Get<int>("upscale_factor");
+
+    PADDLE_ENFORCE(input_dims[1] % (upscale_factor * upscale_factor) == 0,
+                   "Upscale_factor should devide the number of channel");
+
+    auto output_dims = input_dims;
+    output_dims[0] = input_dims[0];
+    output_dims[1] = input_dims[1] / (upscale_factor * upscale_factor);
+    output_dims[2] = input_dims[2] * upscale_factor;
+    output_dims[3] = input_dims[3] * upscale_factor;
+    ctx->SetOutputDim("Out", output_dims);
+  }
+};
+
+class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "X",
+        "(Tensor, default Tensor<float>), "
+        "the input feature data of PixelShuffleOp, the layout is [N C H W].");
+    AddOutput(
+        "Out",
+        "(Tensor, default Tensor<float>), the output of "
+        "PixelShuffleOp. The layout is [N,C/factor^2,H*factor,W*factor].");
+    AddAttr<int>("upscale_factor",
+                 "the factor to increase spatial resolution by.")
+        .SetDefault(1)
+        .AddCustomChecker([](const int& upscale_factor) {
+          PADDLE_ENFORCE_GE(upscale_factor, 1,
+                            "upscale_factor should be larger than 0.");
+        });
+
+    AddComment(R"DOC(
+		Pixel Shuffle operator
+		This operator rearranges elements in a tensor of shape :math:`(*, C \times r^2, H, W)`
+    		to a tensor of shape :math:`(C, H \times r, W \times r)`.
+
+		This is useful for implementing efficient sub-pixel convolution
+    		with a stride of :math:`1/r`.
+
+		Please refer to the paper:
+		 `Real-Time Single Image and Video Super-Resolution Using an Efficient 
+		 Sub-Pixel Convolutional Neural Network <https://arxiv.org/abs/1609.05158v2>`_
+    		by Shi et. al (2016) for more details. 
+
+        )DOC");
+  }
+};
+
+class PixelShuffleGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType("pixel_shuffle_grad");
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetAttrMap(Attrs());
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
+class PixelShuffleGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@Grad) should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@Grad) should not be null");
+
+    auto do_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    PADDLE_ENFORCE(do_dims.size() == 4, "The layout of input is NCHW.");
+
+    auto upscale_factor = ctx->Attrs().Get<int>("upscale_factor");
+
+    auto dx_dims = do_dims;
+    dx_dims[0] = do_dims[0];
+    dx_dims[1] = do_dims[1] * (upscale_factor * upscale_factor);
+    dx_dims[2] = do_dims[2] / upscale_factor;
+    dx_dims[3] = do_dims[3] / upscale_factor;
+    ctx->SetOutputDim(framework::GradVarName("X"), dx_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker,
+                  ops::PixelShuffleGradMaker);
+
+REGISTER_OPERATOR(pixel_shuffle_grad, ops::PixelShuffleGradOp);
+
+REGISTER_OP_CPU_KERNEL(
+    pixel_shuffle,
+    ops::PixelShuffleOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::PixelShuffleOpKernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_CPU_KERNEL(
+    pixel_shuffle_grad,
+    ops::PixelShuffleGradOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::PixelShuffleGradOpKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/fluid/platform/dynload/wbaes.cc
+++ b/paddle/fluid/platform/dynload/wbaes.cc
@@ -12,23 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifdef PADDLE_WITH_WBAES
-
-#include "paddle/fluid/platform/dynload/wbaes.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-std::once_flag wbaes_dso_flag;
-void *wbaes_dso_handle = nullptr;
-
-#define DEFINE_WRAP(__name) DynLoad__##__name __name
-
-WBAES_ROUTINE_EACH(DEFINE_WRAP);
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
-
-#endif
+#include "paddle/fluid/operators/pixel_shuffle_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    pixel_shuffle, ops::PixelShuffleOpKernel<plat::CUDADeviceContext, float>,
+    ops::PixelShuffleOpKernel<plat::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    pixel_shuffle_grad,
+    ops::PixelShuffleGradOpKernel<plat::CUDADeviceContext, float>,
+    ops::PixelShuffleGradOpKernel<plat::CUDADeviceContext, double>);
--- a/paddle/fluid/operators/pixel_shuffle_op.h
+++ b/paddle/fluid/operators/pixel_shuffle_op.h
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class PixelShuffleOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+
+    int factor = ctx.Attr<int>("upscale_factor");
+
+    auto in_dims = in->dims();
+    auto o_dims = out->dims();
+
+    framework::Tensor t;
+    t.ShareDataWith(*in);
+    t.Resize({in_dims[0], o_dims[1], factor, factor, in_dims[2], in_dims[3]});
+
+    std::vector<int> axis = {0, 1, 4, 2, 5, 3};
+
+    framework::Tensor o;
+    o.ShareDataWith(*out);
+    o.Resize({in_dims[0], o_dims[1], in_dims[2], factor, in_dims[3], factor});
+
+    math::Transpose<DeviceContext, T, 6> trans;
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    trans(dev_ctx, t, &o, axis);
+    out->Resize(o_dims);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class PixelShuffleGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* dout = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    dx->mutable_data<T>(ctx.GetPlace());
+
+    int factor = ctx.Attr<int>("upscale_factor");
+
+    auto do_dims = dout->dims();
+    auto dx_dims = dx->dims();
+
+    framework::Tensor t;
+    t.ShareDataWith(*dout);
+    t.Resize({do_dims[0], do_dims[1], dx_dims[2], factor, dx_dims[3], factor});
+
+    std::vector<int> axis = {0, 1, 3, 5, 2, 4};
+
+    framework::Tensor o;
+    o.ShareDataWith(*dx);
+    o.Resize({do_dims[0], do_dims[1], factor, factor, dx_dims[2], dx_dims[3]});
+
+    math::Transpose<DeviceContext, T, 6> trans;
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    trans(dev_ctx, t, &o, axis);
+    dx->Resize(dx_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -17,9 +17,6 @@ if (CUPTI_FOUND)
 endif(CUPTI_FOUND)
 nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
 cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
-if (WITH_WBAES)
-    cc_library(dynload_wbaes SRCS wbaes.cc DEPS dynamic_loader wbaes)
-endif()
 if (WITH_MKLML)
    cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
 endif()

--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -48,8 +48,6 @@ DEFINE_string(

 DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");

-DEFINE_string(wbaes_dir, "", "Specify path for loading libwbaes.so.");
-
 namespace paddle {
 namespace platform {
 namespace dynload {
@@ -248,16 +246,6 @@ void* GetMKLMLDsoHandle() {
 #endif
 }

-void* GetWBAESDsoHandle() {
-#if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_wbaes_dir, "libwbaes.dylib");
-#elif defined(_WIN32)
-  return GetDsoHandleFromSearchPath(FLAGS_wbaes_dir, "libwbaes.dll");
-#else
-  return GetDsoHandleFromSearchPath(FLAGS_wbaes_dir, "libwbaes.so");
-#endif
-}
-
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -32,7 +32,6 @@ void* GetWarpCTCDsoHandle();
 void* GetNCCLDsoHandle();
 void* GetTensorRtDsoHandle();
 void* GetMKLMLDsoHandle();
-void* GetWBAESDsoHandle();

 }  // namespace dynload
 }  // namespace platform

--- a/paddle/fluid/platform/dynload/wbaes.h
+++ b/paddle/fluid/platform/dynload/wbaes.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#ifdef PADDLE_WITH_WBAES
-
-#include <WBAESLib.h>
-#include <mutex>  // NOLINT
-
-#include "paddle/fluid/platform/dynload/dynamic_loader.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace paddle {
-namespace platform {
-namespace dynload {
-
-extern std::once_flag wbaes_dso_flag;
-extern void *wbaes_dso_handle;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load wbaes routine
- * via operator overloading.
- */
-
-#define DYNAMIC_LOAD_WBAES_WRAP(__name)                                    \
-  struct DynLoad__##__name {                                               \
-    template <typename... Args>                                            \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {       \
-      using wbaesFunc = decltype(&::__name);                               \
-      std::call_once(wbaes_dso_flag, []() {                                \
-        wbaes_dso_handle = paddle::platform::dynload::GetWBAESDsoHandle(); \
-      });                                                                  \
-      static void *p_##__name = dlsym(wbaes_dso_handle, #__name);          \
-      return reinterpret_cast<wbaesFunc>(p_##__name)(args...);             \
-    }                                                                      \
-  };                                                                       \
-  extern DynLoad__##__name __name
-
-#define DECLARE_DYNAMIC_LOAD_WBAES_WRAP(__name) DYNAMIC_LOAD_WBAES_WRAP(__name)
-
-#define WBAES_ROUTINE_EACH(__macro) __macro(GSECF);
-
-WBAES_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WBAES_WRAP);
-
-#undef DYNAMIC_LOAD_WBAES_WRAP
-
-}  // namespace dynload
-}  // namespace platform
-}  // namespace paddle
-
-#endif
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
 set(PYBIND_DEPS pybind python proto_desc memory executor async_executor fleet_wrapper prune
  feed_fetch_method pass_builder parallel_executor profiler layer scope_pool
-  tracer analysis_predictor imperative_profiler)
+  tracer analysis_predictor imperative_profiler nccl_context)

 if(WITH_PYTHON)
  list(APPEND PYBIND_DEPS py_func_op)

--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -29,7 +29,7 @@ namespace paddle {
 namespace pybind {

 // Bind Methods
-void BindTracer(pybind11::module* m) {
+void BindImperative(pybind11::module* m) {
  pybind11::class_<imperative::Tracer>(*m, "Tracer", "")
      .def("__init__",
           [](imperative::Tracer& self, framework::BlockDesc* root_block) {
@@ -59,6 +59,47 @@ void BindTracer(pybind11::module* m) {
           })
      .def("py_trace", &imperative::Tracer::PyTrace,
           pybind11::return_value_policy::take_ownership);
+
+  // define parallel context
+  pybind11::class_<imperative::ParallelStrategy> parallel_strategy(
+      *m, "ParallelStrategy", "");
+  parallel_strategy.def(pybind11::init())
+      .def_property(
+          "nranks",
+          [](const imperative::ParallelStrategy& self) { return self.nranks_; },
+          [](imperative::ParallelStrategy& self, int nranks) {
+            self.nranks_ = nranks;
+          })
+      .def_property("local_rank",
+                    [](const imperative::ParallelStrategy& self) {
+                      return self.local_rank_;
+                    },
+                    [](imperative::ParallelStrategy& self, int local_rank) {
+                      self.local_rank_ = local_rank;
+                    })
+      .def_property(
+          "trainer_endpoints",
+          [](const imperative::ParallelStrategy& self) {
+            return self.trainer_endpoints_;
+          },
+          [](imperative::ParallelStrategy& self, std::vector<std::string> eps) {
+            self.trainer_endpoints_ = eps;
+          })
+      .def_property("current_endpoint",
+                    [](const imperative::ParallelStrategy& self) {
+                      return self.current_endpoint_;
+                    },
+                    [](imperative::ParallelStrategy& self,
+                       const std::string& ep) { self.current_endpoint_ = ep; });
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+  pybind11::class_<imperative::NCCLParallelContext> nccl_ctx(
+      *m, "NCCLParallelContext");
+
+  nccl_ctx
+      .def(pybind11::init<const imperative::ParallelStrategy&,
+                          const platform::CUDAPlace&>())
+      .def("init", [](imperative::NCCLParallelContext& self) { self.Init(); });
+#endif
 }

 }  // namespace pybind

--- a/paddle/fluid/pybind/imperative.h
+++ b/paddle/fluid/pybind/imperative.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/imperative/layer.h"
+#include "paddle/fluid/imperative/nccl_context.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"

@@ -46,7 +47,7 @@ class PyVarBase : public imperative::VarBase {
  using imperative::VarBase::VarBase;  // Inherit constructors
 };

-void BindTracer(pybind11::module* m);
+void BindImperative(pybind11::module* m);

 }  // namespace pybind
 }  // namespace paddle
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -288,7 +288,7 @@ PYBIND11_MODULE(core, m) {
                  })
      .def_static("num_funcs", &imperative::PyLayer::NumFuncs);

-  BindTracer(&m);
+  BindImperative(&m);

  py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
      .def_buffer(

--- a/python/paddle/distributed/launch.py
+++ b/python/paddle/distributed/launch.py
@@ -32,6 +32,7 @@ default_envs = {
    "NCCL_SOCKET_IFNAME": "eth0",
    "NCCL_IB_GID_INDEX": "3",
    "NCCL_IB_RETRY_CNT": "0",
+    "PYTHONPATH": os.getenv("PYTHONPATH", ""),
 }

 GPUS = 8

--- a/python/paddle/fluid/dygraph/__init__.py
+++ b/python/paddle/fluid/dygraph/__init__.py
@@ -29,6 +29,9 @@ from .tracer import *
 from . import profiler
 from .profiler import *

+from . import parallel
+from .parallel import *
+
 from . import checkpoint
 from .checkpoint import *

@@ -41,5 +44,6 @@ __all__ += base.__all__
 __all__ += nn.__all__
 __all__ += tracer.__all__
 __all__ += profiler.__all__
+__all__ += parallel.__all__
 __all__ += checkpoint.__all__
 __all__ += learning_rate_scheduler.__all__
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -48,7 +48,7 @@ class Conv2D(layers.Layer):
                 bias_attr=None,
                 dtype=core.VarDesc.VarType.FP32):
        assert param_attr is not False, "param_attr should not be False here."
-        super(Conv2D, self).__init__(name_scope)
+        super(Conv2D, self).__init__(name_scope, dtype)
        self._groups = groups
        self._stride = utils.convert_to_list(stride, 2, 'stride')
        self._padding = utils.convert_to_list(padding, 2, 'padding')
@@ -503,7 +503,7 @@ class FC(layers.Layer):
                 num_flatten_dims=1,
                 dtype=core.VarDesc.VarType.FP32,
                 act=None):
-        super(FC, self).__init__(name_scope)
+        super(FC, self).__init__(name_scope, dtype)

        self._size = size
        self._num_flatten_dims = num_flatten_dims
@@ -608,7 +608,7 @@ class BatchNorm(layers.Layer):
                 do_model_average_for_mean_and_var=False,
                 fuse_with_relu=False,
                 use_global_stats=False):
-        super(BatchNorm, self).__init__(name_scope)
+        super(BatchNorm, self).__init__(name_scope, dtype)
        self._param_attr = param_attr
        self._param_attr = bias_attr
        self._act = act
@@ -760,7 +760,7 @@ class Embedding(layers.Layer):
                 param_attr=None,
                 dtype='float32'):

-        super(Embedding, self).__init__(name_scope)
+        super(Embedding, self).__init__(name_scope, dtype)
        self._size = size
        self._is_sparse = is_sparse
        self._is_distributed = is_distributed
@@ -1008,7 +1008,7 @@ class GRUUnit(layers.Layer):
                 gate_activation='sigmoid',
                 origin_mode=False,
                 dtype='float32'):
-        super(GRUUnit, self).__init__(name_scope)
+        super(GRUUnit, self).__init__(name_scope, dtype)

        activation_dict = dict(
            identity=0,

--- a/python/paddle/fluid/dygraph/parallel.py
+++ b/python/paddle/fluid/dygraph/parallel.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except jin compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from .. import core
+
+__all__ = ["prepare_context"]
+
+ParallelStrategy = core.ParallelStrategy
+
+__parallel_ctx__clz__ = None
+
+
+def prepare_context(parallel_strategy, place):
+    global __parallel_ctx__clz__
+    assert __parallel_ctx__clz__ is None, "ParallelContext can only be initialized once."
+
+    if isinstance(place, core.CUDAPlace):
+        __parallel_ctx__clz__ = core.NCCLParallelContext(parallel_strategy,
+                                                         place)
+    else:
+        # TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation
+        assert ("Only support CUDAPlace for now.")
+    __parallel_ctx__clz__.init()
+
+
+class Env(object):
+    def __init__(self):
+        self._nranks = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
+        self._local_rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+        self._dev_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+        self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
+                                            "").split(",")
+        self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "")
+
+    @property
+    def nranks(self):
+        return self._nranks
+
+    @property
+    def local_rank(self):
+        return self._local_rank
+
+    @property
+    def dev_id(self):
+        return self._dev_id
+
+    @property
+    def current_endpoint(self):
+        return self._current_endpoint
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -191,6 +191,7 @@ __all__ = [
    'kldiv_loss',
    'tree_conv',
    'npair_loss',
+    'pixel_shuffle',
    'fsp_matrix',
 ]

@@ -480,6 +481,8 @@ def dynamic_lstm(input,
            forward, _ = fluid.layers.dynamic_lstm(
                input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
    """
+    assert _in_dygraph_mode(
+    ) is not True, "please use lstm instead of dynamic_lstm in dygraph mode!"
    assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
    helper = LayerHelper('lstm', **locals())
    size = size // 4
@@ -864,6 +867,9 @@ def dynamic_lstmp(input,
                                                     proj_activation="tanh")
    """

+    assert _in_dygraph_mode(
+    ) is not True, "please use lstm instead of dynamic_lstmp in dygraph mode!"
+
    assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
    helper = LayerHelper('lstmp', **locals())
    size = size // 4
@@ -1035,6 +1041,9 @@ def dynamic_gru(input,
            hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim)
    """

+    assert _in_dygraph_mode(
+    ) is not True, "please use gru instead of dynamic_gru in dygraph mode!"
+
    helper = LayerHelper('gru', **locals())
    dtype = helper.input_dtype()

@@ -1751,6 +1760,8 @@ def sequence_conv(input,
        Variable: output of sequence_conv
    """

+    assert not _in_dygraph_mode(), (
+        "sequence layer is not supported in dygraph mode yet.")
    helper = LayerHelper('sequence_conv', **locals())
    dtype = helper.input_dtype()
    filter_shape = [filter_size * input.shape[1], num_filters]
@@ -1810,6 +1821,8 @@ def sequence_softmax(input, use_cudnn=False, name=None):
                              dtype='float32', lod_level=1)
             x_sequence_softmax = fluid.layers.sequence_softmax(input=x)
    """
+    assert not _in_dygraph_mode(), (
+        "sequence layer is not supported in dygraph mode yet.")
    helper = LayerHelper('sequence_softmax', **locals())
    dtype = helper.input_dtype()
    softmax_out = helper.create_variable_for_type_inference(dtype)
@@ -2302,6 +2315,8 @@ def sequence_pool(input, pool_type, is_test=False):
             last_x = fluid.layers.sequence_pool(input=x, pool_type='last')
             first_x = fluid.layers.sequence_pool(input=x, pool_type='first')
    """
+    assert not _in_dygraph_mode(), (
+        "sequence layer is not supported in dygraph mode yet.")
    helper = LayerHelper('sequence_pool', **locals())
    dtype = helper.input_dtype()
    pool_out = helper.create_variable_for_type_inference(dtype)
@@ -2341,6 +2356,8 @@ def sequence_concat(input, name=None):

           out = fluid.layers.sequence_concat(input=[seq1, seq2, seq3])
    """
+    assert not _in_dygraph_mode(), (
+        "sequence layer is not supported in dygraph mode yet.")
    helper = LayerHelper('sequence_concat', **locals())
    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
    helper.append_op(
@@ -2468,6 +2485,8 @@ def sequence_slice(input, offset, length, name=None):
             subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset,
                                                   length=length)
    """
+    assert not _in_dygraph_mode(), (
+        "sequence layer is not supported in dygraph mode yet.")
    helper = LayerHelper("sequence_slice", **locals())
    dtype = helper.input_dtype()
    out = helper.create_variable_for_type_inference(dtype)
@@ -3927,6 +3946,8 @@ def sequence_expand(x, y, ref_level=-1, name=None):
                             dtype='float32', lod_level=1)
            out = layers.sequence_expand(x=x, y=y, ref_level=0)
    """
+    assert not _in_dygraph_mode(), (
+        "sequence layer is not supported in dygraph mode yet.")
    helper = LayerHelper('sequence_expand', input=x, **locals())
    dtype = helper.input_dtype()
    tmp = helper.create_variable_for_type_inference(dtype)
@@ -3993,6 +4014,8 @@ def sequence_expand_as(x, y, name=None):
                             dtype='float32', lod_level=1)
            out = layers.sequence_expand_as(x=x, y=y)
    """
+    assert not _in_dygraph_mode(), (
+        "sequence layer is not supported in dygraph mode yet.")
    helper = LayerHelper('sequence_expand_as', input=x, **locals())
    dtype = helper.input_dtype()
    tmp = helper.create_variable_for_type_inference(dtype)
@@ -4039,6 +4062,8 @@ def sequence_pad(x, pad_value, maxlen=None, name=None):
            out = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
    """

+    assert not _in_dygraph_mode(), (
+        "sequence layer is not supported in dygraph mode yet.")
    helper = LayerHelper('sequence_pad', input=x, **locals())
    dtype = helper.input_dtype()
    out = helper.create_variable_for_type_inference(dtype)
@@ -4105,6 +4130,8 @@ def sequence_unpad(x, length, name=None):
            out = fluid.layers.sequence_unpad(x=x, length=len)
    """

+    assert not _in_dygraph_mode(), (
+        "sequence layer is not supported in dygraph mode yet.")
    helper = LayerHelper('sequence_unpad', input=x, **locals())
    dtype = helper.input_dtype()
    out = helper.create_variable_for_type_inference(dtype)
@@ -5278,6 +5305,8 @@ def sequence_reshape(input, new_dim):
            x = fluid.layers.data(shape=[5, 20], dtype='float32', lod_level=1)
            x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=10)
    """
+    assert not _in_dygraph_mode(), (
+        "sequence layer is not supported in dygraph mode yet.")
    helper = LayerHelper('sequence_reshape', **locals())
    out = helper.create_variable_for_type_inference(helper.input_dtype())
    helper.append_op(
@@ -5812,6 +5841,8 @@ def im2sequence(input,
                input=layer, stride=[1, 1], filter_size=[2, 2])

    """
+    assert not _in_dygraph_mode(), (
+        "sequence layer is not supported in dygraph mode yet.")

    if isinstance(filter_size, int):
        filter_size = [filter_size, filter_size]
@@ -6228,7 +6259,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
        },
        outputs={'Diff': diff,
                 'Out': loss},
-        attrs={'sigma': sigma})
+        attrs={'sigma': sigma if sigma is not None else 1.0})
    return loss


@@ -7589,6 +7620,8 @@ def sequence_scatter(input, index, updates, name=None):
            output = fluid.layers.sequence_scatter(input, index, updates)

    """
+    assert not _in_dygraph_mode(), (
+        "sequence layer is not supported in dygraph mode yet.")
    helper = LayerHelper('sequence_scatter', **locals())
    dtype = helper.input_dtype()
    out = helper.create_variable_for_type_inference(dtype)
@@ -8677,6 +8710,8 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
            x = fluid.layers.data(shape[30, 1], dtype='int32', lod_level=1)
            out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0)
    """
+    assert not _in_dygraph_mode(), (
+        "sequence layer is not supported in dygraph mode yet.")
    helper = LayerHelper('sequence_enumerate', **locals())
    out = helper.create_variable_for_type_inference(
        helper.input_dtype(), stop_gradient=True)
@@ -8716,6 +8751,8 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None):
        Variable: The output sequence mask.

    """
+    assert not _in_dygraph_mode(), (
+        "sequence layer is not supported in dygraph mode yet.")

    helper = LayerHelper('sequence_mask', **locals())
    if name is None:
@@ -9766,6 +9803,8 @@ def sequence_reverse(x, name=None):
    Returns:
        out(${y_type}): ${y_comment}
    """
+    assert not _in_dygraph_mode(), (
+        "sequence layer is not supported in dygraph mode yet.")
    helper = LayerHelper("sequence_reverse", **locals())
    if name is None:
        out = helper.create_variable_for_type_inference(dtype=x.dtype)
@@ -10923,6 +10962,65 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002):
    return l2loss + celoss


+def pixel_shuffle(x, upscale_factor):
+    """
+
+    **Pixel Shuffle Layer**
+
+    This layer rearranges elements in a tensor of shape [N, C, H, W]
+    to a tensor of shape [N, C/r**2, H*r, W*r].
+    This is useful for implementing efficient sub-pixel convolution
+    with a stride of 1/r.
+    Please refer to the paper: `Real-Time Single Image and Video Super-Resolution 
+    Using an Efficient Sub-Pixel Convolutional Neural Network <https://arxiv.org/abs/1609.05158v2>`_ .
+    by Shi et. al (2016) for more details.
+
+        .. code-block:: text
+        
+            Given a 4-D tensor with the shape:
+                x.shape = [1, 9, 4, 4]
+            Given upscale_factor:
+                upscale_factor= 3
+            output shape is:
+                [1, 1, 12, 12]
+    
+    Args:
+
+        x(Variable): The input tensor variable.
+        upscale_factor(int): factor to increase spatial resolution
+
+    Returns:
+
+        Out(Variable): the pixel shuffle result is a tensor variable with the same shape and the same type as the input.
+
+    Raises:
+
+        ValueError: If the square of upscale_factor cannot divide the channels of input.
+
+    Examples:
+
+        .. code-block:: python
+
+            input = fluid.layers.data(shape=[9,4,4])
+            output = fluid.layers.pixel_shuffle(x=input, upscale_factor=3)
+
+    """
+
+    helper = LayerHelper("pixel_shuffle", **locals())
+
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+
+    if not isinstance(upscale_factor, int):
+        raise TypeError("upscale factor must be int type")
+
+    helper.append_op(
+        type="pixel_shuffle",
+        inputs={"X": x},
+        outputs={"Out": out},
+        attrs={"upscale_factor": upscale_factor})
+    return out
+
+
 def fsp_matrix(x, y):
    """


--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_conditional_block.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_conditional_block.py
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import unittest
+
+fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
+
+from test_conditional_block import *
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
@@ -302,8 +302,11 @@ use_py_reader = False
 # if we run sync mode
 sync = False

-# how many batches we use
-batch_num = 50
+if not core.is_compiled_with_cuda():
+    # how many batches we use
+    batch_num = 50
+else:
+    batch_num = 5

 np.random.seed = 1
 src_word_np = np.random.randint(

--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
--- a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py
+++ b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestPixelShuffle(OpTest):
+    def setUp(self):
+        self.op_type = "pixel_shuffle"
+        n, c, h, w = 2, 9, 4, 4
+        up_factor = 3
+        shape = [n, c, h, w]
+        x = np.random.random(shape).astype("float32")
+        new_shape = (n, c // (up_factor * up_factor), up_factor, up_factor, h,
+                     w)
+        # reshape to (num,output_channel,upscale_factor,upscale_factor,h,w)
+        npresult = np.reshape(x, new_shape)
+        # transpose to (num,output_channel,h,upscale_factor,w,upscale_factor)
+        npresult = npresult.transpose(0, 1, 4, 2, 5, 3)
+        oshape = [n, c // (up_factor * up_factor), h * up_factor, w * up_factor]
+        npresult = np.reshape(npresult, oshape)
+
+        self.inputs = {'X': x}
+        self.outputs = {'Out': npresult}
+        self.attrs = {'upscale_factor': up_factor}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -157,10 +157,6 @@ package_data['paddle.libs']= []
 package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name]
 shutil.copy('${WARPCTC_LIBRARIES}', libs_path)

-if '${WITH_WBAES}' == 'ON':
-    package_data['paddle.libs'] += ['libwbaes' + ext_name]
-    shutil.copy('${WBAES_SHARED_LIB}', libs_path)
-
 if '${WITH_MKL}' == 'ON':
    shutil.copy('${MKLML_SHARED_LIB}', libs_path)
    shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path)