diff --git a/CMakeLists.txt b/CMakeLists.txt
index df00e977ebb547980e69ee421779c57717d771a9..6aa2e1715b92d73aa4e5e97d5e52ffac51451d80 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -127,6 +127,9 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
 set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
   "A path setting fluid shared and static libraries")
 
+set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING
+  "A path setting fluid inference shared and static libraries")
+
 if (WITH_C_API AND WITH_PYTHON)
   message(WARNING "It is suggest not embedded a python interpreter in Paddle "
     "when using C-API. It will give an unpredictable behavior when using a "
diff --git a/README.md b/README.md
index 46fdef5e376d3f5bf49ef10c62f5b3a6637913c1..de924fc5fc756f2486ff825b0bccd999402b2997 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
 
 
-### Latest PaddlePaddle Release: [Fluid 0.15.0](https://github.com/PaddlePaddle/Paddle/tree/v0.15.0)
+### Latest PaddlePaddle Release: [Fluid 1.0.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.0.0)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==0.15.0.post85
 
 ## Installation
 
-It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/install/install_doc.html) on our website.
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) on our website.
 
 ## Documentation
 
-We provide [English](http://paddlepaddle.org/documentation/docs/en/0.15.0/getstarted/index_en.html) and
-[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/beginners_guide/index.html) documentation.
+We provide [English](http://paddlepaddle.org/documentation/docs/en/1.0.0/getstarted/index_en.html) and
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) documentation.
 
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
 
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
 
-- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/user_guides/howto/training/cluster_howto.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.0/user_guides/howto/training/cluster_howto.html)
 
   You can run distributed training jobs on MPI clusters.
 
-- [Python API](http://paddlepaddle.org/documentation/api/zh/0.15.0/fluid.html)
+- [Python API](http://paddlepaddle.org/documentation/api/zh/1.0/fluid.html)
 
    Our new API enables much shorter programs.
 
-- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.15.0/new_docs/advanced_usage/development/contribute_to_paddle.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.0/advanced_usage/development/contribute_to_paddle.html)
 
    We appreciate your contributions!
 
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index a3e682e54ac496e37ed4a33a7b30d9fdca381d9d..67cca09b64c1ed7a503a886e78347d786eae0de7 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -150,16 +150,16 @@ if (WITH_ANAKIN AND WITH_MKL)
         SRCS
         ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libinference_anakin_api* # compiled anakin api
         ${ANAKIN_INSTALL_DIR} # anakin release
-        DSTS ${dst_dir}/inference/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
+        DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin ${FLUID_INSTALL_DIR}/third_party/install/anakin)
      list(APPEND inference_deps anakin_inference_lib)
 endif()
 
 set(module "inference")
 copy(inference_lib DEPS ${inference_deps}
   SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
-       ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci
+       ${src_dir}/${module}/api/paddle_inference_api.h
        ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
 )
 
 set(module "platform")
@@ -188,18 +188,38 @@ copy(cmake_cache
 # This command generates a complete fluid library for both train and inference
 add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) 
 
+# Following commands generate a inference-only fluid library
+# third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR}
+copy(third_party DEPS fluid_lib_dist
+  SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt
+  DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR}
+)
+
+# only need libpaddle_fluid.so/a and paddle_inference_api.h for inference-only library
+copy(inference_api_lib DEPS fluid_lib_dist
+  SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.*
+       ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_inference_api.h
+  DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include
+)
+
+add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib)
+
 # paddle fluid version
-execute_process(
-  COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
-  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-  OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
-set(version_file ${FLUID_INSTALL_DIR}/version.txt)
-file(WRITE ${version_file}
-  "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
-  "WITH_MKL: ${WITH_MKL}\n"
-  "WITH_GPU: ${WITH_GPU}\n")
-if(WITH_GPU)
-  file(APPEND ${version_file}
-    "CUDA version: ${CUDA_VERSION}\n"
-    "CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
-endif()
+function(version version_file)
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+    OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
+  file(WRITE ${version_file}
+    "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
+    "WITH_MKL: ${WITH_MKL}\n"
+    "WITH_MKLDNN: ${WITH_MKLDNN}\n"
+    "WITH_GPU: ${WITH_GPU}\n")
+  if(WITH_GPU)
+    file(APPEND ${version_file}
+      "CUDA version: ${CUDA_VERSION}\n"
+      "CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
+  endif()
+endfunction()
+version(${FLUID_INSTALL_DIR}/version.txt)
+version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt)
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 6a37b5ca433a3baa1388dd4f720d782ca53e4e99..1241ae784e373cc9485537a6887cf19ce9667bb9 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -173,6 +173,7 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
 paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index ec8471ef960a2fc44af23c52be09cd678fab3f70..03f0f726eb61c2619c7719a865383090f86b5b7f 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -77,7 +77,7 @@ endif(NOT WIN32)
 link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
 link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
 link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
-link_directories("${PADDLE_LIB}/paddle/fluid/inference")
+link_directories("${PADDLE_LIB}/paddle/lib")
 
 add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
 
@@ -97,10 +97,10 @@ endif()
 # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
 if(WITH_STATIC_LIB)
   set(DEPS
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
+      ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
 else()
   set(DEPS
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
+      ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
 endif()
 
 if (NOT WIN32)
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 65c95f0834a9356fc14faed8342f5d1e474edf8f..67994aad70a40c0e0c8a311914d4ea40b96eaf1e 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -5,12 +5,13 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
 DATA_DIR=$4 # dataset
 TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, defalut to /usr/local/TensorRT/include
 TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib
+inference_install_dir=${PADDLE_ROOT}/build/fluid_inference_install_dir
 
 cd `dirname $0`
 current_dir=`pwd`
 if [ $2 == ON ]; then
   # You can export yourself if move the install path
-  MKL_LIB=${PADDLE_ROOT}/build/fluid_install_dir/third_party/install/mklml/lib
+  MKL_LIB=${inference_install_dir}/third_party/install/mklml/lib
   export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB}
 fi
 if [ $3 == ON ]; then
@@ -55,7 +56,7 @@ cd build
 for WITH_STATIC_LIB in ON OFF; do
   # -----simple_on_word2vec-----
   rm -rf *
-  cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
+  cmake .. -DPADDLE_LIB=${inference_install_dir} \
     -DWITH_MKL=$TURN_ON_MKL \
     -DDEMO_NAME=simple_on_word2vec \
     -DWITH_GPU=$TEST_GPU_CPU \
@@ -75,7 +76,7 @@ for WITH_STATIC_LIB in ON OFF; do
   fi
   # ---------vis_demo---------
   rm -rf *
-  cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
+  cmake .. -DPADDLE_LIB=${inference_install_dir} \
     -DWITH_MKL=$TURN_ON_MKL \
     -DDEMO_NAME=vis_demo \
     -DWITH_GPU=$TEST_GPU_CPU \
@@ -98,7 +99,7 @@ for WITH_STATIC_LIB in ON OFF; do
   # --------tensorrt mobilenet------
   if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then
     rm -rf *
-    cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \
+    cmake .. -DPADDLE_LIB=${inference_install_dir} \
       -DWITH_MKL=$TURN_ON_MKL \
       -DDEMO_NAME=trt_mobilenet_demo \
       -DWITH_GPU=$TEST_GPU_CPU \
diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
index 8058d7e881025b1d806efe187d4523adadff367d..5446fd4d4256c10442a53ea09a447cf308cbd681 100644
--- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
@@ -23,7 +23,7 @@ limitations under the License. */
 #include <memory>
 #include <thread>  //NOLINT
 
-#include "paddle/fluid/inference/paddle_inference_api.h"
+#include "paddle/include/paddle_inference_api.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
 DEFINE_bool(use_gpu, false, "Whether use gpu.");
@@ -42,8 +42,7 @@ void Main(bool use_gpu) {
   config.use_gpu = use_gpu;
   config.fraction_of_gpu_memory = 0.15;
   config.device = 0;
-  auto predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  auto predictor = CreatePaddlePredictor<NativeConfig>(config);
 
   for (int batch_id = 0; batch_id < 3; batch_id++) {
     //# 2. Prepare input.
@@ -85,8 +84,7 @@ void MainThreads(int num_threads, bool use_gpu) {
   config.use_gpu = use_gpu;
   config.fraction_of_gpu_memory = 0.15;
   config.device = 0;
-  auto main_predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
 
   std::vector<std::thread> threads;
   for (int tid = 0; tid < num_threads; ++tid) {
diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
index ffb12b5871f088f15e43a1b0ff7e2a8b2f5fd079..4a8404f21c6ec6a1647e964ac3538b4b49151009 100644
--- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
@@ -18,7 +18,7 @@ limitations under the License. */
 
 #include <gflags/gflags.h>
 #include <glog/logging.h>  // use glog instead of CHECK to avoid importing other paddle header files.
-#include "paddle/fluid/inference/demo_ci/utils.h"
+#include "utils.h"  // NOLINT
 
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DEFINE_string(modeldir, "", "Directory of the inference model.");
diff --git a/paddle/fluid/inference/api/demo_ci/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h
index 4792c97fe7d0a3f9c904774ad4a8e580cefcf237..d70c6aea791219a40c3164b51499f9d5e562be71 100644
--- a/paddle/fluid/inference/api/demo_ci/utils.h
+++ b/paddle/fluid/inference/api/demo_ci/utils.h
@@ -18,7 +18,7 @@
 #include <iostream>
 #include <string>
 #include <vector>
-#include "paddle/fluid/inference/paddle_inference_api.h"
+#include "paddle/include/paddle_inference_api.h"
 
 namespace paddle {
 namespace demo {
diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
index db61786e2fefda29256d84b5357028ec9c39b014..8d546e3e9c740c10bcf2984e073c956e3612625c 100644
--- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
@@ -18,7 +18,7 @@ limitations under the License. */
 
 #include <gflags/gflags.h>
 #include <glog/logging.h>  // use glog instead of CHECK to avoid importing other paddle header files.
-#include "paddle/fluid/inference/demo_ci/utils.h"
+#include "utils.h"  // NOLINT
 
 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
@@ -34,12 +34,13 @@ DEFINE_bool(use_gpu, false, "Whether use gpu.");
 namespace paddle {
 namespace demo {
 
+using contrib::AnalysisConfig;
 /*
- * Use the native fluid engine to inference the demo.
+ * Use the native and analysis fluid engine to inference the demo.
  */
 void Main(bool use_gpu) {
-  std::unique_ptr<PaddlePredictor> predictor;
-  NativeConfig config;
+  std::unique_ptr<PaddlePredictor> predictor, analysis_predictor;
+  AnalysisConfig config;
   config.param_file = FLAGS_modeldir + "/__params__";
   config.prog_file = FLAGS_modeldir + "/__model__";
   config.use_gpu = use_gpu;
@@ -49,8 +50,8 @@ void Main(bool use_gpu) {
   }
 
   VLOG(3) << "init predictor";
-  predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  predictor = CreatePaddlePredictor<NativeConfig>(config);
+  analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
 
   VLOG(3) << "begin to process data";
   // Just a single batch of data.
@@ -68,7 +69,7 @@ void Main(bool use_gpu) {
   input.dtype = PaddleDType::FLOAT32;
 
   VLOG(3) << "run executor";
-  std::vector<PaddleTensor> output;
+  std::vector<PaddleTensor> output, analysis_output;
   predictor->Run({input}, &output, 1);
 
   VLOG(3) << "output.size " << output.size();
@@ -77,6 +78,10 @@ void Main(bool use_gpu) {
 
   // compare with reference result
   CheckOutput(FLAGS_refer, tensor);
+
+  // the analysis_output has some diff with native_output,
+  // TODO(luotao): add CheckOutput for analysis_output later.
+  analysis_predictor->Run({input}, &analysis_output, 1);
 }
 
 }  // namespace demo
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
index 5b6c922f95cf6d2d0683a1e9328463fe21f6bc38..6399476680c0af83a6d26aea952c58543bdce9ae 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -311,8 +311,7 @@ TEST(Analyzer_rnn1, ZeroCopy) {
   auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
 
   config.use_feed_fetch_ops = true;
-  auto native_predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  auto native_predictor = CreatePaddlePredictor<NativeConfig>(config);
 
   config.use_feed_fetch_ops = true;  // the analysis predictor needs feed/fetch.
   auto analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index d228f2dddd60f8707da94fa1f2a754efd783a5e7..bff781af25d3ce5b8e672a5f9b6c4f201a907d38 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -81,8 +81,7 @@ std::unique_ptr<PaddlePredictor> CreateTestPredictor(
   if (use_analysis) {
     return CreatePaddlePredictor<contrib::AnalysisConfig>(config);
   } else {
-    return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
-        config);
+    return CreatePaddlePredictor<NativeConfig>(config);
   }
 }
 
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index df3e3fcd9c75f03f4d9b0a7c12788f06bfdefd7f..c97225669a572cd62250729a9e4e9f7b674816e4 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -305,6 +305,7 @@ if (WITH_GPU)
     op_library(conv_op DEPS vol2col depthwise_conv im2col)
     op_library(layer_norm_op DEPS cub)
     op_library(reduce_mean_op DEPS cub)
+    op_library(affine_channel_op DEPS cub)
 else()
     op_library(conv_op DEPS vol2col im2col)
 endif()
diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8944a749674c3ba6c83526e4d66f449075716f43
--- /dev/null
+++ b/paddle/fluid/operators/affine_channel_op.cc
@@ -0,0 +1,255 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class AffineChannelOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor) Feature map input can be a 4D tensor with order NCHW "
+             "or NHWC. It also can be a 2D tensor and C is the second "
+             "dimension.");
+    AddInput("Scale",
+             "(Tensor) 1D input of shape (C), the c-th element "
+             "is the scale factor of the affine transformation "
+             "for the c-th channel of the input.");
+    AddInput("Bias",
+             "(Tensor) 1D input of shape (C), the c-th element "
+             "is the bias of the affine transformation for the "
+             "c-th channel of the input.");
+    AddAttr<std::string>(
+        "data_layout",
+        "(string, default NCHW) Only used in "
+        "An optional string from: \"NHWC\", \"NCHW\". "
+        "Defaults to \"NHWC\". Specify the data format of the output data, "
+        "the input will be transformed automatically. ")
+        .SetDefault("AnyLayout");
+    AddOutput("Out", "(Tensor) A tensor of the same shape and order with X.");
+    AddComment(R"DOC(
+
+Applies a separate affine transformation to each channel of the input. Useful
+for replacing spatial batch norm with its equivalent fixed transformation.
+The input also can be 2D tensor and applies a affine transformation in second
+dimension.
+
+$$Out = Scale*X + Bias$$
+
+)DOC");
+  }
+};
+
+class AffineChannelOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of AffineChannelOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Scale"),
+                   "Input(Scale) of AffineChannelOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                   "Input(Bias) of AffineChannelOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of AffineChannelOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", "Out");
+  }
+};
+
+class AffineChannelOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      PADDLE_ENFORCE(ctx->HasInput("Scale"),
+                     "Input(Scale) should not be null.");
+      ctx->SetOutputDim(framework::GradVarName("X"),
+                        ctx->GetInputDim(framework::GradVarName("Out")));
+    }
+    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
+      // Scale@GRAD and Bias@GRAD must exist at the same time.
+      PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
+                     "Output(Scale@GRAD) should not be null.");
+      PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+      ctx->SetOutputDim(framework::GradVarName("Scale"),
+                        ctx->GetInputDim("Scale"));
+      ctx->SetOutputDim(framework::GradVarName("Bias"),
+                        ctx->GetInputDim("Scale"));
+    }
+  }
+};
+
+template <typename T>
+using EigenArrayMap =
+    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using ConstEigenArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
+template <typename T>
+using ConstEigenVectorArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
+
+template <typename DeviceContext, typename T>
+class AffineChannelKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::Tensor>("X");
+    auto* scale = ctx.Input<framework::Tensor>("Scale");
+    auto* bias = ctx.Input<framework::Tensor>("Bias");
+
+    auto* y = ctx.Output<framework::Tensor>("Out");
+    y->mutable_data<T>(ctx.GetPlace());
+
+    const framework::DataLayout layout =
+        framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
+
+    auto dims = x->dims();
+    int N = dims[0];
+    int C = layout == framework::DataLayout::kNCHW ? dims[1]
+                                                   : dims[dims.size() - 1];
+    int HxW = x->numel() / N / C;
+
+    auto* scale_d = scale->data<T>();
+    auto* bias_d = bias->data<T>();
+    ConstEigenVectorArrayMap<T> a_e(scale_d, C);
+    ConstEigenVectorArrayMap<T> b_e(bias_d, C);
+
+    auto* x_d = x->data<T>();
+    auto* y_d = y->data<T>();
+    if (layout == framework::DataLayout::kNCHW) {
+      int stride = C * HxW;
+      for (int i = 0; i < N; i++) {
+        ConstEigenArrayMap<T> x_e(x_d, HxW, C);
+        EigenArrayMap<T> y_e(y_d, HxW, C);
+        y_e = (x_e.rowwise() * a_e.transpose()).rowwise() + b_e.transpose();
+        x_d += stride;
+        y_d += stride;
+      }
+    } else {
+      int num = N * HxW;
+      ConstEigenArrayMap<T> x_e(x_d, C, num);
+      EigenArrayMap<T> y_e(y_d, C, num);
+      y_e = (x_e.colwise() * a_e).colwise() + b_e;
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class AffineChannelGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::Tensor>("X");
+    auto* scale = ctx.Input<framework::Tensor>("Scale");
+    auto* dy = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+
+    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* dscale =
+        ctx.Output<framework::Tensor>(framework::GradVarName("Scale"));
+    auto* dbias = ctx.Output<framework::Tensor>(framework::GradVarName("Bias"));
+
+    const framework::DataLayout layout =
+        framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
+
+    auto dims = x->dims();
+    int N = dims[0];
+    int C = layout == framework::DataLayout::kNCHW ? dims[1]
+                                                   : dims[dims.size() - 1];
+    int HxW = x->numel() / N / C;
+
+    auto* x_d = x->data<T>();
+    auto* dy_d = dy->data<T>();
+    auto* scale_d = scale->data<T>();
+    ConstEigenVectorArrayMap<T> scale_e(scale_d, C);
+
+    T* dx_d = dx ? dx->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    T* dscale_d = dscale ? dscale->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    T* dbias_d = dbias ? dbias->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    EigenVectorArrayMap<T> dscale_e(dscale_d, C);
+    EigenVectorArrayMap<T> dbias_e(dbias_d, C);
+
+    if (layout == framework::DataLayout::kNCHW) {
+      // compute dx
+      int stride = C * HxW;
+      if (dx) {
+        for (int i = 0; i < N; i++) {
+          ConstEigenArrayMap<T> dy_e(dy_d, HxW, C);
+          EigenArrayMap<T> dx_e(dx_d, HxW, C);
+          dx_e = dy_e.rowwise() * scale_e.transpose();
+          dy_d += stride;
+          dx_d += stride;
+        }
+      }
+      // compute dscale and dbias
+      if (dscale && dbias) {
+        dy_d = dy->data<T>();
+        for (int i = 0; i < N; i++) {
+          ConstEigenArrayMap<T> x_e(x_d, HxW, C);
+          ConstEigenArrayMap<T> dy_e(dy_d, HxW, C);
+          if (i == 0) {
+            dscale_e = (x_e * dy_e).colwise().sum();
+          } else {
+            dscale_e += (x_e * dy_e).colwise().sum();
+          }
+          if (i == 0) {
+            dbias_e = dy_e.colwise().sum();
+          } else {
+            dbias_e += dy_e.colwise().sum();
+          }
+          x_d += stride;
+          dy_d += stride;
+        }
+      }
+    } else {
+      int num = N * HxW;
+      ConstEigenArrayMap<T> dy_e(dy_d, C, num);
+      // compute dx
+      if (dx) {
+        EigenArrayMap<T> dx_e(dx_d, C, num);
+        dx_e = dy_e.colwise() * scale_e;
+      }
+      // compute dscale and dbias
+      if (dscale && dbias) {
+        ConstEigenArrayMap<T> x_e(x_d, C, num);
+        dscale_e = (x_e * dy_e).rowwise().sum();
+        dbias_e = dy_e.rowwise().sum();
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using CPU = paddle::platform::CPUDeviceContext;
+
+REGISTER_OPERATOR(affine_channel, ops::AffineChannelOp,
+                  ops::AffineChannelOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(affine_channel_grad, ops::AffineChannelOpGrad);
+
+REGISTER_OP_CPU_KERNEL(affine_channel, ops::AffineChannelKernel<CPU, float>,
+                       ops::AffineChannelKernel<CPU, double>);
+REGISTER_OP_CPU_KERNEL(affine_channel_grad,
+                       ops::AffineChannelGradKernel<CPU, float>,
+                       ops::AffineChannelGradKernel<CPU, double>);
diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2bebdb345ab324eb0a2dafd54c74833dd21bdb6d
--- /dev/null
+++ b/paddle/fluid/operators/affine_channel_op.cu
@@ -0,0 +1,187 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "cub/cub.cuh"
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T, framework::DataLayout layout, bool HasBias>
+__global__ void KeAffineChannelCUDA(const T* x, const T* scale, const T* bias,
+                                    const int C, const int HxW, const int num,
+                                    T* y) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (int i = gid; i < num; i += stride) {
+    const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
+    if (HasBias) {
+      y[i] = scale[c] * x[i] + bias[c];
+    } else {
+      y[i] = scale[c] * x[i];
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+class AffineChannelCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::Tensor>("X");
+    auto* scale = ctx.Input<framework::Tensor>("Scale");
+    auto* bias = ctx.Input<framework::Tensor>("Bias");
+
+    auto* y = ctx.Output<framework::Tensor>("Out");
+    y->mutable_data<T>(ctx.GetPlace());
+
+    const framework::DataLayout layout =
+        framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
+    auto dims = x->dims();
+    const int num = x->numel();
+    int N = dims[0];
+    int C = layout == framework::DataLayout::kNCHW ? dims[1]
+                                                   : dims[dims.size() - 1];
+    int HxW = num / N / C;
+
+    const T* x_d = x->data<T>();
+    const T* scale_d = scale->data<T>();
+    const T* bias_d = bias->data<T>();
+    T* y_d = y->data<T>();
+
+    int block = 1024;
+    int grid = (num + block - 1) / block;
+    if (layout == framework::DataLayout::kNCHW) {
+      KeAffineChannelCUDA<T, framework::DataLayout::kNCHW,
+                          true><<<grid, block, 0, dev_ctx.stream()>>>(
+          x_d, scale_d, bias_d, C, HxW, num, y_d);
+    } else {
+      KeAffineChannelCUDA<T, framework::DataLayout::kNHWC,
+                          true><<<grid, block, 0, dev_ctx.stream()>>>(
+          x_d, scale_d, bias_d, C, HxW, num, y_d);
+    }
+  }
+};
+
+template <typename T, int BlockDim, framework::DataLayout layout>
+__global__ void AffineChannelScaleBiasGradientCUDAKernel(
+    const T* dy, const T* x, const int N, const int C, const int HxW, T* dscale,
+    T* dbias) {
+  const int outer_size = C;
+  const int inner_size = N * HxW;
+  typedef cub::BlockReduce<T, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage ds_storage;
+  __shared__ typename BlockReduce::TempStorage db_storage;
+
+  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
+    T ds_sum = 0;
+    T db_sum = 0;
+    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+      const int index = layout == framework::DataLayout::kNCHW
+                            ? (j / HxW * C + i) * HxW + j % HxW
+                            : j * outer_size + i;
+      ds_sum += dy[index] * x[index];
+      db_sum += dy[index];
+    }
+    ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
+    db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
+    if (threadIdx.x == 0) {
+      dscale[i] = ds_sum;
+      dbias[i] = db_sum;
+    }
+    __syncthreads();
+  }
+}
+
+template <typename DeviceContext, typename T>
+class AffineChannelGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::Tensor>("X");
+    auto* scale = ctx.Input<framework::Tensor>("Scale");
+    auto* bias = ctx.Input<framework::Tensor>("Bias");
+    auto* dy = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+
+    auto* dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto* dscale =
+        ctx.Output<framework::Tensor>(framework::GradVarName("Scale"));
+    auto* dbias = ctx.Output<framework::Tensor>(framework::GradVarName("Bias"));
+
+    const framework::DataLayout layout =
+        framework::StringToDataLayout(ctx.Attr<std::string>("data_layout"));
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
+    auto dims = x->dims();
+    const int num = x->numel();
+    int N = dims[0];
+    int C = layout == framework::DataLayout::kNCHW ? dims[1]
+                                                   : dims[dims.size() - 1];
+    int HxW = num / N / C;
+
+    const T* x_d = x->data<T>();
+    const T* dy_d = dy->data<T>();
+    const T* s_d = scale->data<T>();
+
+    T* dx_d = dx ? dx->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    T* ds_d = dscale ? dscale->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    T* db_d = dbias ? dbias->mutable_data<T>(ctx.GetPlace()) : nullptr;
+
+    const int block = 1024;
+    int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+    const int max_blocks = std::max(max_threads / block, 1);
+    int grid1 = (num + block - 1) / block;
+    int grid2 = std::min(C, max_blocks);
+    if (layout == framework::DataLayout::kNCHW) {
+      if (dx) {
+        KeAffineChannelCUDA<T, framework::DataLayout::kNCHW,
+                            false><<<grid1, block, 0, dev_ctx.stream()>>>(
+            dy_d, s_d, nullptr, C, HxW, num, dx_d);
+      }
+      if (dscale && dbias) {
+        AffineChannelScaleBiasGradientCUDAKernel<
+            T, block, framework::DataLayout::kNCHW><<<grid2, block, 0,
+                                                      dev_ctx.stream()>>>(
+            dy_d, x_d, N, C, HxW, ds_d, db_d);
+      }
+    } else {
+      if (dx) {
+        KeAffineChannelCUDA<T, framework::DataLayout::kNCHW,
+                            false><<<grid1, block, 0, dev_ctx.stream()>>>(
+            dy_d, s_d, nullptr, C, HxW, num, dx_d);
+      }
+      if (dscale && dbias) {
+        AffineChannelScaleBiasGradientCUDAKernel<
+            T, block, framework::DataLayout::kNHWC><<<grid2, block, 0,
+                                                      dev_ctx.stream()>>>(
+            dy_d, x_d, N, C, HxW, ds_d, db_d);
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using CUDA = paddle::platform::CUDADeviceContext;
+
+REGISTER_OP_CUDA_KERNEL(affine_channel,
+                        ops::AffineChannelCUDAKernel<CUDA, float>,
+                        ops::AffineChannelCUDAKernel<CUDA, double>);
+REGISTER_OP_CUDA_KERNEL(affine_channel_grad,
+                        ops::AffineChannelGradCUDAKernel<CUDA, float>,
+                        ops::AffineChannelGradCUDAKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index 0e4a90fcf4749171c882b0e05882920bf046cdd8..076ecc1f01d89913081892eb6aa828b095b09656 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/distributed/grpc_client.h"
-
 #include <sys/time.h>
-
 #include <limits>
 
 #include "glog/logging.h"  // For VLOG
 #include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/operators/distributed/grpc_client.h"
 #include "paddle/fluid/operators/distributed/grpc_serde.h"
 #include "paddle/fluid/operators/distributed/request_handler.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -336,8 +334,11 @@ void GRPCClient::Proceed() {
       VLOG(3) << c->GetVarHandlePtr()->String() << " process";
       c->Process();
     } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
+      // FIXME(gongwb): parse error_details?
       LOG(ERROR) << c->GetVarHandlePtr()->String()
-                 << " meets grpc error:" << c->status_.error_message();
+                 << " meets grpc error, error_code:" << c->status_.error_code()
+                 << " error_message:" << c->status_.error_message()
+                 << " error_details:" << c->status_.error_details();
       {
         std::lock_guard<std::mutex> lk(sync_mutex_);
         ok_ = false;
@@ -345,7 +346,10 @@ void GRPCClient::Proceed() {
       c->Finish(false);
     } else {
       LOG(FATAL) << c->GetVarHandlePtr()->String()
-                 << " meets grpc error:" << c->status_.error_message();
+                 << " meets grpc error, error_code:" << c->status_.error_code()
+                 << " error_message:" << c->status_.error_message()
+                 << " error_details:" << c->status_.error_details();
+
       c->Finish(false);
     }
 
diff --git a/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc
index 0cd3d3887cf5167c779a8b20442fdb458cd7eab4..8d2f055d53a0c5bbef624ff3b01b01724d0b3a21 100644
--- a/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc
+++ b/paddle/fluid/operators/fusion_seqexpand_concat_fc_op.cc
@@ -136,9 +136,9 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
     // since infershape can not get lod info
     PADDLE_ENFORCE_EQ(ref_lod.size(), 1UL, "Only support input lod size is 1.");
     PADDLE_ENFORCE_EQ(in1_lod.size(), 1UL, "Only support input lod size is 1.");
-    PADDLE_ENFORCE_EQ(in1_lod[0].size() - 1, N,
+    PADDLE_ENFORCE_EQ(static_cast<int>(in1_lod[0].size() - 1), N,
                       "Batch size of all inputs should be equal.");
-    PADDLE_ENFORCE_EQ(in1_lod[0][N], N,
+    PADDLE_ENFORCE_EQ(static_cast<int>(in1_lod[0][N]), N,
                       "Seq_length of other inputs should be 1.");
     PADDLE_ENFORCE_EQ(in1_dims[0], N, "input height should be batch size.");
     for (size_t i = 2; i < ins.size(); ++i) {
diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc
index 26590171bbeaa385ac09b04e5faf483924176598..7fdd1c6b76aebcea757540e7312a679b8c08402a 100644
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <sys/time.h>
 #include <cmath>    // for exp
 #include <cstring>  // for memcpy
+#include <random>
 #include <string>
 #include <vector>
 #include "gflags/gflags.h"
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 87b9e7d5a21d18d63f1300c5edf272eed5b174c6..85493c10549c290330ed09b9f28accb7a980de6a 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -661,6 +661,7 @@ function gen_fluid_lib() {
 EOF
         cmake .. -DWITH_DISTRIBUTE=OFF
         make -j `nproc` fluid_lib_dist
+        make -j `nproc` inference_lib_dist
       fi
 }
 
@@ -674,6 +675,8 @@ EOF
         cd ${PADDLE_ROOT}/build
         cp -r fluid_install_dir fluid
         tar -czf fluid.tgz fluid
+        cp -r fluid_inference_install_dir fluid_inference
+        tar -czf fluid_inference.tgz fluid_inference
       fi
 }
 
@@ -685,7 +688,9 @@ function test_fluid_lib() {
     ========================================
 EOF
         cd ${PADDLE_ROOT}/paddle/fluid/inference/api/demo_ci
-        ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} ${INFERENCE_DEMO_INSTALL_DIR} ${TENSORRT_INCLUDE_DIR:-/usr/local/TensorRT/include} ${TENSORRT_LIB_DIR:-/usr/local/TensorRT/lib}
+        ./run.sh ${PADDLE_ROOT} ${WITH_MKL:-ON} ${WITH_GPU:-OFF} ${INFERENCE_DEMO_INSTALL_DIR} \
+                 ${TENSORRT_INCLUDE_DIR:-/usr/local/TensorRT/include} \
+                 ${TENSORRT_LIB_DIR:-/usr/local/TensorRT/lib}
         ./clean.sh
       fi
 }
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 224781e6596e2b2bc6d3084f4780a33ed2903118..249e0c5c3990cd7b9cc33b24651c64942a1f7521 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -153,6 +153,7 @@ __all__ = [
     'mul',
     'sigmoid_cross_entropy_with_logits',
     'maxout',
+    'affine_channel',
 ]
 
 
@@ -7268,3 +7269,44 @@ def maxout(x, groups, name=None):
         attrs={"groups": groups},
         outputs={"Out": out})
     return out
+
+
+def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
+    """
+    Applies a separate affine transformation to each channel of the input.
+    Useful for replacing spatial batch norm with its equivalent fixed
+    transformation. The input also can be 2D tensor and applies a affine
+    transformation in second dimension.
+    
+    Args:
+        x (Variable): Feature map input can be a 4D tensor with order NCHW
+            or NHWC. It also can be a 2D tensor and the affine transformation
+            is applied in the second dimension.
+        scale (Variable): 1D input of shape (C), the c-th element is the scale
+            factor of the affine transformation for the c-th channel of
+            the input.
+        bias (Variable): 1D input of shape (C), the c-th element is the bias
+            of the affine transformation for the c-th channel of the input.
+        data_layout (string, default NCHW): NCHW or NHWC. If input is 2D
+            tensor, you can ignore data_layout.
+        name (str, default None): The name of this layer.
+
+    Returns:
+        out (Variable): A tensor of the same shape and data layout with x.
+    """
+    helper = LayerHelper("affine_channel", **locals())
+
+    if name is None:
+        out = helper.create_tmp_variable(dtype=x.dtype)
+    else:
+        out = helper.create_variable(
+            name=name, dtype=x.dtype, persistable=False)
+
+    helper.append_op(
+        type="affine_channel",
+        inputs={"X": x,
+                'Scale': scale,
+                'Bias': bias},
+        attrs={"data_layout": data_layout},
+        outputs={"Out": out})
+    return out
diff --git a/python/paddle/fluid/tests/unittests/test_affine_channel_op.py b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c9a063e6ee75371e0d05e1ff6964753017881a1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_affine_channel_op.py
@@ -0,0 +1,106 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+
+
+def affine_channel(x, scale, bias, layout):
+    C = x.shape[1] if layout == 'NCHW' else x.shape[-1]
+    if len(x.shape) == 4:
+        new_shape = (1, C, 1, 1) if layout == 'NCHW' else (1, 1, 1, C)
+    else:
+        new_shape = (1, C)
+    scale = scale.reshape(new_shape)
+    bias = bias.reshape(new_shape)
+    return x * scale + bias
+
+
+class TestAffineChannelOp(OpTest):
+    def setUp(self):
+        self.op_type = "affine_channel"
+        self.init_test_case()
+
+        x = np.random.random(self.shape).astype("float32")
+        scale = np.random.random(self.C).astype("float32")
+        bias = np.random.random(self.C).astype("float32")
+
+        y = affine_channel(x, scale, bias, self.layout)
+
+        self.inputs = {'X': x, 'Scale': scale, 'Bias': bias}
+        self.attrs = {'data_layout': self.layout}
+        self.outputs = {'Out': y}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X', 'Scale', 'Bias'], 'Out')
+
+    def test_check_grad_stopgrad_dx(self):
+        self.check_grad(['Scale', 'Bias'], 'Out', no_grad_set=set('X'))
+
+    def test_check_grad_stopgrad_dscale_dbias(self):
+        self.check_grad(['X'], 'Out', no_grad_set=set(['Scale', 'Bias']))
+
+    def init_test_case(self):
+        self.shape = [2, 32, 14, 14]
+        self.C = 32
+        self.layout = 'NCHW'
+
+
+class TestAffineChannelNHWC(TestAffineChannelOp):
+    def init_test_case(self):
+        self.shape = [2, 14, 14, 32]
+        self.C = 32
+        self.layout = 'NHWC'
+
+
+class TestAffineChannel2D(TestAffineChannelOp):
+    def init_test_case(self):
+        self.shape = [16, 64]
+        self.C = 64
+        self.layout = 'NCHW'
+
+
+class TestAffineChannelNCHWLargeShape(TestAffineChannelOp):
+    def init_test_case(self):
+        self.shape = [64, 128, 112, 112]
+        self.C = 128
+        self.layout = 'NCHW'
+
+    # since the gradient check is very slow in large shape, so skip check_grad
+    def test_check_grad(self):
+        pass
+
+    def test_check_grad_stopgrad_dx(self):
+        pass
+
+    def test_check_grad_stopgrad_dscale_dbias(self):
+        pass
+
+
+class TestAffineChannelNCHWLargeShape(TestAffineChannelNCHWLargeShape):
+    def init_test_case(self):
+        self.shape = [64, 112, 112, 512]
+        self.C = 512
+        self.layout = 'NHWC'
+
+
+if __name__ == '__main__':
+    unittest.main()