Merge branch 'windows/build' into windows/online

test=develop

Merge branch 'windows/build' into windows/online
test=develop
a7c8e42d · peizhilin · 5d6b370a · 67562a6f · a7c8e42d · a7c8e42d
43 changed file
--- a/.gitignore
+++ b/.gitignore
+python/paddle/fluid/tests/unittests/reader_reset_test.recordio
 paddle/operators/check_t.save
 paddle/operators/check_tensor.ls
 paddle/operators/tensor.save

--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -42,6 +42,7 @@
 | QiJune | Jun Qi |
 | qingqing01 | Qing-Qing Dang |
 | reyoung | Yang Yu |
+| Sand3r- | Michal Gallus |
 | Superjom | Chun-Wei Yan |
 | tensor-tang | Jian Tang |
 | tianbingsz | Tian-Bing Xu |

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -166,8 +166,8 @@ copy(framework_lib DEPS ${framework_lib_deps}

 set(module "memory")
 copy(memory_lib
-        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h
-        DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
+        SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h ${src_dir}/${module}/allocation/*.h
+        DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ${dst_dir}/${module}/allocation
        )

 set(inference_deps paddle_fluid_shared paddle_fluid)

--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -31,9 +31,7 @@ function(windows_symbolic TARGET)
 endfunction()

 add_subdirectory(ir)
-if (NOT WIN32)
 add_subdirectory(details)
-endif (NOT WIN32)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)

@@ -118,13 +116,8 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)

-if (NOT WIN32)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
    shape_inference data_transform lod_tensor profiler)
-else()
-cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
-    shape_inference data_transform lod_tensor)
-endif(NOT WIN32)

 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)

@@ -179,12 +172,10 @@ else()
  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()

-if (NOT WIN32)
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
        graph build_strategy
        fast_threaded_ssa_graph_executor)
-endif() # NOT WIN32

 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)

--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -13,9 +13,9 @@
 // limitations under the License.

 #pragma once
+#include <ThreadPool.h>
 #include <string>
 #include <vector>
-#include "ThreadPool.h"
 #include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"

--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -100,6 +100,7 @@ class OperatorBase {

  const std::string& Type() const { return type_; }

+  // Returns true when an attribute called `name` is present in attrs_.
+  // The count is compared explicitly, matching the check in Attr() below and
+  // avoiding an implicit size_t -> bool conversion.
+  bool HasAttr(const std::string& name) const {
+    return attrs_.count(name) != 0;
+  }
  template <typename T>
  inline const T& Attr(const std::string& name) const {
    PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",

--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -7,16 +7,17 @@ set(analysis_deps # analysis_deps can be extended accross the project
 add_subdirectory(ir_passes)
 add_subdirectory(passes)

-cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass ${INFER_IR_PASSES})
+cc_library(analysis_helper SRCS helper.cc DEPS framework_proto proto_desc graph paddle_fluid_api)
+
+cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass ${INFER_IR_PASSES} analysis_helper)

 cc_library(argument SRCS argument.cc DEPS scope proto_desc)
 cc_library(analysis_pass SRCS analysis_pass.cc DEPS proto_desc)

 cc_library(analysis SRCS
  analyzer.cc
-  helper.cc
  analysis_pass
-  DEPS ${analysis_deps}
+  DEPS ${analysis_deps} analysis_helper
  )

 cc_test(test_dot SRCS dot_tester.cc DEPS analysis)

--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -30,6 +30,7 @@ TEST(Analyzer, analysis_without_tensorrt) {
  Argument argument;
  argument.SetModelDir(FLAGS_inference_model_dir);
  argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
+  argument.SetUseGPU(false);

  Analyzer analyser;
  analyser.Run(&argument);
@@ -41,6 +42,7 @@ TEST(Analyzer, analysis_with_tensorrt) {
  argument.SetTensorRtWorkspaceSize(1 << 20);
  argument.SetModelDir(FLAGS_inference_model_dir);
  argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
+  argument.SetUseGPU(false);

  Analyzer analyser;
  analyser.Run(&argument);

--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -116,6 +116,7 @@ struct Argument {
                      std::vector<std::string>);

  DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
+  DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
  DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
  DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller,
                      std::function<bool(const framework::ir::Node*)>);

--- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
@@ -4,4 +4,6 @@ set(analysis_deps ${analysis_deps}
        subgraph_detector tensorrt_subgraph_pass
        CACHE INTERNAL "")

+set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
+file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n")
 set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "")
--- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc
@@ -46,7 +46,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
          {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
           "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
           "elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
-           "conv2d_transpose"});
+           "conv2d_transpose", "leaky_relu"});
      if (!node->IsOp()) return false;

      if (teller_set.count(node->Op()->Type())) {

--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
@@ -30,15 +30,28 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
  if (!argument->scope_valid()) {
    argument->SetScope(new framework::Scope);
  }
+  PADDLE_ENFORCE(argument->use_gpu_valid());
+
+  // The load program should run on the same device as the inference program
+  // so that the parameters end up on that device; otherwise they would keep
+  // being copied between different devices.
+  platform::Place place;
+  if (argument->use_gpu()) {
+    PADDLE_ENFORCE(argument->gpu_device_id_valid());
+    place = platform::CUDAPlace(argument->gpu_device_id());
+  } else {
+    place = platform::CPUPlace();
+  }

  if (argument->model_dir_valid()) {
-    auto program = LoadModel(argument->model_dir(), argument->scope_ptr());
+    auto program =
+        LoadModel(argument->model_dir(), argument->scope_ptr(), place);
    argument->SetMainProgram(program.release());
  } else if (argument->model_program_path_valid() &&
             argument->model_params_path_valid()) {
    auto program =
        LoadModel(argument->model_program_path(), argument->model_params_path(),
-                  argument->scope_ptr());
+                  argument->scope_ptr(), place);
    argument->SetMainProgram(program.release());
  } else {
    PADDLE_THROW(
@@ -52,16 +65,15 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
 }

 std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
-    const std::string &path, framework::Scope *scope) {
-  platform::CPUPlace place;
+    const std::string &path, framework::Scope *scope,
+    const platform::Place &place) {
  framework::Executor exe(place);
  return Load(&exe, scope, path);
 }

 std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
    const std::string &program_path, const std::string &params_path,
-    framework::Scope *scope) {
-  platform::CPUPlace place;
+    framework::Scope *scope, const platform::Place &place) {
  framework::Executor exe(place);
  return Load(&exe, scope, program_path, params_path);
 }

--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h
@@ -17,6 +17,7 @@
 #include <string>
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
+#include "paddle/fluid/platform/place.h"

 namespace paddle {
 namespace inference {
@@ -32,11 +33,12 @@ class IrGraphBuildPass : public AnalysisPass {
  std::string repr() const override;

 private:
-  std::unique_ptr<framework::ProgramDesc> LoadModel(const std::string &path,
-                                                    framework::Scope *scope);
+  std::unique_ptr<framework::ProgramDesc> LoadModel(
+      const std::string &path, framework::Scope *scope,
+      const platform::Place &place);
  std::unique_ptr<framework::ProgramDesc> LoadModel(
      const std::string &program_path, const std::string &params_path,
-      framework::Scope *scope);
+      framework::Scope *scope, const platform::Place &place);

  std::string model_binary_str_;
 };

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -27,11 +27,10 @@ endif()
 cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
 cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
-cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder)
-cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
-cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api)
-
+cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager)
+cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS scope lod_tensor enforce)
+cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc)
+# Public inference API library; links zero_copy_tensor in addition to its core deps.
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config zero_copy_tensor)

 cc_test(test_paddle_inference_api
        SRCS api_tester.cc

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -285,6 +285,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  status_program_optimized_ = true;

  argument_.SetUseGPU(config_.use_gpu);
+  argument_.SetGPUDeviceId(config_.device);
  // Analyze inference_program
  if (!config_.model_dir.empty()) {
    argument_.SetModelDir(config_.model_dir);
@@ -491,8 +492,7 @@ bool AnalysisPredictor::LoadParameters() {
  }

  // Use NaiveExecutor to Load parameters.
-  platform::CPUPlace place;
-  framework::NaiveExecutor e(place);
+  framework::NaiveExecutor e(place_);
  e.Prepare(scope_.get(), *load_program, 0, false);
  e.Run();
  VLOG(3) << "get " << scope_->LocalVarNames().size() << " vars after load";
@@ -551,4 +551,5 @@ USE_TRT_CONVERTER(pad);
 USE_TRT_CONVERTER(split);
 USE_TRT_CONVERTER(prelu);
 USE_TRT_CONVERTER(conv2d_transpose);
+USE_TRT_CONVERTER(leaky_relu);
 #endif
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -116,8 +116,12 @@ class CpuPassStrategy : public PassStrategy {
 class GpuPassStrategy : public PassStrategy {
 public:
  GpuPassStrategy() : PassStrategy({}) {
+    // TODO(NHZlX) Problem with data synchronization between GPU and CPU.
+    // When running in GPU mode, the parameters are all on the GPU, but the
+    // operations of "conv_bn_fuse_pass" run on the CPU.
    passes_.assign({
-        "infer_clean_graph_pass", "conv_bn_fuse_pass",
+        "infer_clean_graph_pass",
+        // "conv_bn_fuse_pass" is left out until the TODO above is resolved.
    });
  }


--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -2,7 +2,7 @@
 nv_library(tensorrt_converter
           SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
                batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
-                pad_op.cc split_op.cc prelu_op.cc
+                pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc
           DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)

 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
@@ -38,3 +38,5 @@ nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc
 nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc
        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin
        prelu_op SERIAL)
+nv_test(test_trt_leaky_relu_op SRCS test_leaky_relu_op.cc leaky_relu_op.cc
+        DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op SERIAL)
--- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Converts a fluid leaky_relu op into TensorRT layers. No single leaky-relu
+// layer is used; the op is assembled from two Scale layers, a ReLU activation
+// and an element-wise sum (see the formula in the body).
+class LeakyReluOpConverter : public OpConverter {
+ public:
+  // op:        proto description of the leaky_relu op to convert.
+  // scope:     not read by this converter.
+  // test_mode: when true, the converted output is also declared as an output
+  //            of the engine.
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert fluid leaky_relu op to tensorrt layer";
+
+    framework::OpDesc op_desc(op, nullptr);
+    // Exactly one "X" input is expected; look up its TensorRT tensor.
+    int input_num = op_desc.Input("X").size();
+    PADDLE_ENFORCE(input_num == 1);
+    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
+    // Exactly one "Out" output is expected.
+    size_t output_num = op_desc.Output("Out").size();
+    PADDLE_ENFORCE(output_num == 1);
+    // Slope applied to negative inputs.
+    float alpha = boost::get<float>(op_desc.GetAttr("alpha"));
+
+    // Two-element CPU tensor holding the scale factors used below:
+    // [alpha, 1 - alpha].
+    platform::CPUPlace place;
+    std::unique_ptr<framework::LoDTensor> alpha_tensor(
+        new framework::LoDTensor());
+    alpha_tensor->Resize(framework::make_ddim({2}));
+    float* alpha_data = alpha_tensor->mutable_data<float>(place);
+    alpha_data[0] = alpha;
+    alpha_data[1] = 1.f - alpha;
+    // Leaky relu, y = (x > 0) ? x : alpha * x, is rewritten as
+    // y = alpha * x + (1 - alpha) * relu(x), where relu(x) = (x > 0) ? x : 0.
+    // No shift or power weights are supplied to the Scale layers.
+    TensorRTEngine::Weight scale{nvinfer1::DataType::kFLOAT, &alpha_data[0], 1};
+    TensorRTEngine::Weight shift{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    TensorRTEngine::Weight power{nvinfer1::DataType::kFLOAT, nullptr, 0};
+    // y_scale = alpha * x
+    auto* scale_layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Scale, *input, nvinfer1::ScaleMode::kUNIFORM, shift.get(),
+        scale.get(), power.get());
+    PADDLE_ENFORCE(nullptr != scale_layer);
+    // y_relu = (x > 0) ? x : 0
+    auto* relu_layer = TRT_ENGINE_ADD_LAYER(engine_, Activation, *input,
+                                            nvinfer1::ActivationType::kRELU);
+    PADDLE_ENFORCE(nullptr != relu_layer);
+    // y_scale_relu = (1 - alpha) * y_relu
+    TensorRTEngine::Weight sub_scale{nvinfer1::DataType::kFLOAT, &alpha_data[1],
+                                     1};
+    auto* scale_relu_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Scale, *(relu_layer->getOutput(0)),
+                             nvinfer1::ScaleMode::kUNIFORM, shift.get(),
+                             sub_scale.get(), power.get());
+    PADDLE_ENFORCE(nullptr != scale_relu_layer);
+    // y = y_scale + y_scale_relu
+    auto* output_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *(scale_layer->getOutput(0)),
+                             *(scale_relu_layer->getOutput(0)),
+                             nvinfer1::ElementWiseOperation::kSUM);
+    PADDLE_ENFORCE(nullptr != output_layer);
+    // Hand the alpha tensor over to the engine's weight_map so the buffer the
+    // Scale weights point at is not released when this function returns. The
+    // key is derived from the output name and must not already be in use.
+    std::string alpha_name = op_desc.Output("Out")[0] + "_alpha";
+    PADDLE_ENFORCE(engine_->weight_map.find(alpha_name) ==
+                   engine_->weight_map.end());
+    engine_->weight_map[alpha_name] = std::move(alpha_tensor);
+
+    // Name the resulting tensor and layer after the op's output and register
+    // the tensor with the engine under that name.
+    std::string layer_name = "leaky_relu (Output: ";
+    auto output_name = op_desc.Output("Out")[0];
+    output_layer->getOutput(0)->setName(output_name.c_str());
+    engine_->SetITensor(output_name, output_layer->getOutput(0));
+    layer_name += output_name;
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
+    output_layer->setName((layer_name + ")").c_str());
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(leaky_relu, LeakyReluOpConverter);
--- a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Builds a leaky_relu op description (alpha = 0.1) over a 3x2x2 CHW input and
+// hands it to TRTConvertValidation for execution.
+TEST(leaky_relu_op, test_leaky_relu) {
+  framework::Scope scope;
+  std::unordered_set<std::string> parameters;
+  TRTConvertValidation checker(10, parameters, scope, 1000);
+
+  // Input and output share the same CHW shape.
+  checker.DeclInputVar("leaky_relu_input", nvinfer1::DimsCHW(3, 2, 2));
+  checker.DeclOutputVar("leaky_relu_out", nvinfer1::DimsCHW(3, 2, 2));
+
+  // Describe the op under test.
+  framework::OpDesc op_desc;
+  op_desc.SetType("leaky_relu");
+  op_desc.SetInput("X", {"leaky_relu_input"});
+  op_desc.SetOutput("Out", {"leaky_relu_out"});
+  op_desc.SetAttr("alpha", 0.1f);
+
+  checker.SetOp(*op_desc.Proto());
+  checker.Execute(1);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+// Reference the leaky_relu operator named by the test's op description.
+USE_OP(leaky_relu);
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
 nv_library(tensorrt_plugin
           SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
-           DEPS enforce device_context)
+           DEPS enforce tensorrt_engine)
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
 set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)

+if(WITH_GPU AND TENSORRT_FOUND)
+    set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor)
+endif()
+
 function(download_model install_dir model_name)
    if (NOT EXISTS ${install_dir})
        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name})
@@ -27,14 +31,14 @@ function(inference_analysis_api_test_with_fake_data target install_dir filename
 endfunction()

 # RNN1
-if(NOT APPLE)
+if(NOT APPLE AND WITH_MKLML)
    set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
    download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
    inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc)
 else()
-    # TODO: fix this test on MACOS, the reason is that
-    # fusion_seqexpand_concat_fc_op is not supported on MACOS
-    message(WARNING "These tests has been disabled in OSX before being fixed: \n test_analyzer_rnn1")
+    # TODO: fix this test on MACOS and OPENBLAS, the reason is that
+    # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS
+    message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_rnn1")
 endif()

 # RNN2
@@ -75,11 +79,11 @@ endif()
 inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)

 # resnet50
-inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 
+inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
  "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")

 # mobilenet with depthwise_conv op
-inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet 
+inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet
  "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz")

 # anakin
@@ -89,15 +93,15 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
   set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
   inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
   inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
-   cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc 
-           ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin 
+   cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc
+           ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin
                --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
           DEPS inference_anakin_api_shared SERIAL)
   # anakin mobilenet
   if(WITH_GPU)
       set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
       inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
-       cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc 
+       cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc
               ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
               DEPS inference_anakin_api_shared dynload_cuda SERIAL)
   endif()
@@ -109,6 +113,6 @@ if(WITH_GPU AND TENSORRT_FOUND)
       inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz")
   endif()
   inference_analysis_test(test_trt_models SRCS trt_models_tester.cc
-      EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor
+      EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
        ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL)
 endif()
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -222,19 +222,36 @@ void TestMultiThreadPrediction(
      // The inputs of each thread are all the same.
      std::vector<PaddleTensor> outputs_tid;
      auto &predictor = predictors[tid];
-      LOG(INFO) << "running thread " << tid;
-      Timer timer;
-      timer.tic();
-      for (int i = 0; i < num_times; i++) {
-        for (const auto &input : inputs) {
-          ASSERT_TRUE(predictor->Run(input, &outputs_tid));
+
+      // warmup run
+      LOG(INFO) << "Running thread " << tid << ", warm up run...";
+      {
+        Timer warmup_timer;
+        warmup_timer.tic();
+        // Use this thread's own output vector, as the timed loop below does.
+        predictor->Run(inputs[0], &outputs_tid, batch_size);
+        PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1);
+#if !defined(_WIN32)
+        if (FLAGS_profile) {
+          paddle::platform::ResetProfiler();
        }
+#endif
      }

-      auto time = timer.toc();
-      total_time += time;
-      PrintTime(batch_size, num_times, num_threads, tid, time / num_times,
-                inputs.size());
+      LOG(INFO) << "Thread " << tid << " run " << num_times << " times...";
+      {
+        Timer timer;
+        timer.tic();
+        for (int i = 0; i < num_times; i++) {
+          for (const auto &input : inputs) {
+            ASSERT_TRUE(predictor->Run(input, &outputs_tid));
+          }
+        }
+
+        auto time = timer.toc();
+        total_time += time;
+        PrintTime(batch_size, num_times, num_threads, tid, time / num_times,
+                  inputs.size());
+      }
    });
  }
  for (int i = 0; i < num_threads; ++i) {

--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -145,5 +145,3 @@ TEST(TensorRT_mobilenet, analysis) {

 }  // namespace inference
 }  // namespace paddle
-
-USE_PASS(tensorrt_subgraph_pass);
--- a/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_mkldnn_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <mkldnn/include/mkldnn.hpp>
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
+
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+#include "paddle/fluid/operators/math/jit_kernel.h"
+#include "xbyak.h"
+#include "xbyak_util.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::DataLayout;
+using mkldnn::memory;
+
+// Maps a case-insensitive layout name ("nchw", "nchw16c", "nchw8c", "nhwc")
+// to the matching mkldnn memory format. Any other string, including the empty
+// string, yields memory::format::any. The argument is not modified.
+static mkldnn::memory::format StringToMKLDNNFormat(const std::string& format) {
+  // Lower-case a private copy so the caller's string is left untouched. Each
+  // character goes through unsigned char because handing a negative char to
+  // tolower() is undefined behavior.
+  std::string lowered(format);
+  std::transform(
+      lowered.begin(), lowered.end(), lowered.begin(),
+      [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+
+  if (lowered == "nchw") return memory::format::nchw;
+  if (lowered == "nchw16c") return memory::format::nChw16c;
+  if (lowered == "nchw8c") return memory::format::nChw8c;
+  if (lowered == "nhwc") return memory::format::nhwc;
+  return memory::format::any;
+}
+
+// Applies the layout named by the string attribute `attribute` to `tensor`.
+// Nothing happens when the operator lacks that attribute or when the name
+// resolves to memory::format::any.
+static void UpdateDataFormat(const framework::ExecutionContext& ctx,
+                             framework::Tensor* tensor, const char* attribute) {
+  if (!ctx.op().HasAttr(attribute)) {
+    return;
+  }
+
+  std::string layout_name = ctx.Attr<std::string>(attribute);
+  const auto layout = StringToMKLDNNFormat(layout_name);
+  if (layout == memory::format::any) {
+    return;
+  }
+
+  tensor->set_format(layout);
+}
+
+// Converts `tensor` to a plain layout: nchw when isFourDim is true, nc
+// otherwise. A temporary tensor with the same dims and layout is filled
+// through platform::Reorder, and `tensor` is then made to share its data.
+//
+// tensor:    tensor to convert; its current format() describes the source.
+// place:     place used to allocate the temporary tensor's buffer.
+// engine:    mkldnn engine on which both memory objects are created.
+// isFourDim: selects nchw (true) or nc (false) as the destination format.
+template <typename T>
+static void ReorderInput(framework::Tensor* tensor,
+                         const platform::Place& place,
+                         const mkldnn::engine& engine, bool isFourDim) {
+  using platform::to_void_cast;
+  auto dims = paddle::framework::vectorize2int(tensor->dims());
+  // Destination tensor: same dims and layout, plain destination format.
+  framework::Tensor out_tensor;
+  out_tensor.Resize(tensor->dims());
+  out_tensor.set_format(isFourDim ? memory::format::nchw : memory::format::nc);
+  out_tensor.set_layout(tensor->layout());
+  // Source memory wraps the existing buffer in the tensor's current format.
+  mkldnn::memory input_memory = {
+      {{dims, platform::MKLDNNGetDataType<T>(), tensor->format()}, engine},
+      to_void_cast<T>(tensor->data<T>())};
+  // Destination memory wraps a freshly allocated buffer on `place`.
+  mkldnn::memory output_memory = {
+      {{dims, platform::MKLDNNGetDataType<T>(), out_tensor.format()}, engine},
+      to_void_cast<T>(out_tensor.mutable_data<T>(place))};
+  platform::Reorder(input_memory, output_memory);
+  // NOTE(review): whether ShareDataWith also carries over out_tensor's format
+  // is not visible here - confirm.
+  tensor->ShareDataWith(out_tensor);
+}
+
+// MKLDNN kernel for elementwise_mul: Out = X * Y.
+//
+// Two paths exist:
+//  * a JIT path, taken when X is in nChw16c format, Y is in nc format, the
+//    second dimension of X is a multiple of 16 and the CPU reports AVX512F;
+//  * a naive fallback built on TransformFunctor. When the formats of X and Y
+//    differ, an input that is neither nchw nor nc is reordered first.
+// In both cases Out receives the kMKLDNN layout and the format of X.
+template <typename T>
+class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    using Tensor = framework::Tensor;
+
+    int axis = ctx.Attr<int>("axis");
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    const T* x_data = x->data<T>();
+    const T* y_data = y->data<T>();
+    T* z_data = z->mutable_data<T>(ctx.GetPlace());
+
+    auto x_dims = x->dims();
+    auto y_dims_untrimmed = y->dims();
+    auto x_int_dims = paddle::framework::vectorize2int(x_dims);
+
+    // The format attributes (and a possible reorder below) are applied to the
+    // input tensors in place, so writable views of them are needed.
+    auto* mutable_x = const_cast<Tensor*>(x);
+    auto* mutable_y = const_cast<Tensor*>(y);
+    UpdateDataFormat(ctx, mutable_x, "x_data_format");
+    UpdateDataFormat(ctx, mutable_y, "y_data_format");
+
+    Xbyak::util::Cpu cpu;
+    const bool is_avx512_enabled = cpu.has(Xbyak::util::Cpu::tAVX512F);
+    // Only look at the second dimension when X actually has one; inputs of
+    // lower rank go to the naive fallback.
+    const bool are_dims_divisable =
+        x_int_dims.size() > 1 && !(x_int_dims[1] % 16);
+    const bool is_x_format_correct = x->format() == memory::format::nChw16c;
+    const bool is_y_format_correct = y->format() == memory::format::nc;
+    if (is_x_format_correct && is_y_format_correct && are_dims_divisable &&
+        is_avx512_enabled) {
+      int pre, n, post;
+      get_mid_dims(x_dims, y_dims_untrimmed, axis, &pre, &n, &post);
+
+      if (post == 1) {
+        PADDLE_THROW("Not implemented when post is 1");
+      } else {
+        // Just check whether it works for RE-Resnext.
+        PADDLE_ENFORCE_EQ(x_dims.size(), 4, "X should have 4 dimensions");
+
+        int batch = x_dims[0];
+        int c = x_dims[1];
+        int h = x_dims[2];
+        int w = x_dims[3];
+
+        PADDLE_ENFORCE(
+            y_dims_untrimmed[0] == batch && y_dims_untrimmed[1] == c,
+            "Y should be in nc format");
+
+        // Channels are processed in blocks of 16.
+        constexpr int simd_width = 16;
+        int C = c / simd_width;
+
+        const auto& multiply =
+            math::jitkernel::KernelPool::Instance()
+                .template Get<math::jitkernel::EltwiseMulnChw16cNCKernel<T>>(
+                    batch);
+
+#pragma omp parallel for collapse(2)
+        for (int ni = 0; ni < batch; ni++) {
+          for (int ci = 0; ci < C; ci++) {
+            // Offsets of the (ni, ci) block in X/Out and of its 16 Y values.
+            auto ptr_x =
+                x_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
+
+            auto ptr_y = y_data + ni * C * simd_width + ci * simd_width;
+            auto ptr_z =
+                z_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
+
+            multiply->Compute(ptr_x, ptr_y, ptr_z, h, w);
+          }
+        }
+      }
+    } else {
+      // Fallback to naive version:
+      const bool are_inputs_in_same_format = x->format() == y->format();
+      const bool is_x_nchw = x->format() == memory::format::nchw;
+      const bool is_x_nc = x->format() == memory::format::nc;
+      const bool is_y_nchw = y->format() == memory::format::nchw;
+      const bool is_y_nc = y->format() == memory::format::nc;
+      if (!are_inputs_in_same_format) {
+        using platform::MKLDNNDeviceContext;
+        auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+        const auto& mkldnn_engine = dev_ctx.GetEngine();
+        if (!(is_x_nchw || is_x_nc)) {
+          ReorderInput<T>(mutable_x, ctx.GetPlace(), mkldnn_engine,
+                          x->dims().size() == 4);
+        }
+        if (!(is_y_nchw || is_y_nc)) {
+          ReorderInput<T>(mutable_y, ctx.GetPlace(), mkldnn_engine,
+                          y->dims().size() == 4);
+        }
+      }
+
+      auto mul_func = [](T a, T b) -> T { return a * b; };
+
+      TransformFunctor<decltype(mul_func), T,
+                       paddle::platform::CPUDeviceContext, T>
+          functor(
+              x, y, z,
+              ctx.template device_context<paddle::platform::CPUDeviceContext>(),
+              mul_func);
+
+      // axis == -1 means "align Y with the trailing dimensions of X".
+      axis = (axis == -1 ? x_dims.size() - y_dims_untrimmed.size() : axis);
+      PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
+                     "Axis should be in range [0, x_dims)");
+
+      auto y_dims = trim_trailing_singular_dims(y_dims_untrimmed);
+      axis = (y_dims.size() == 0) ? x_dims.size() : axis;
+
+      int pre, n, post;
+      get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
+
+      if (post == 1) {
+        functor.RunRowWise(n, pre);
+      } else {
+        functor.RunMidWise(n, pre, post);
+      }
+    }
+
+    // Both paths publish the result with the MKLDNN layout and X's format.
+    z->set_layout(DataLayout::kMKLDNN);
+    z->set_format(x->format());
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+// Registers the float instantiation of the kernel for elementwise_mul with
+// the MKLDNN library type on CPUPlace.
+REGISTER_OP_KERNEL(elementwise_mul, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::ElementwiseMulMKLDNNKernel<float>)
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -97,6 +97,20 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
        .EqualGreaterThan(-1);
    AddAttr<bool>("use_mkldnn", "(bool, default false). Used by MKLDNN.")
        .SetDefault(false);
+    // Per-input format attributes. Both default to the empty string, and
+    // their help text states they are only used by MKLDNN.
+    AddAttr<std::string>(
+        "x_data_format",
+        "(string, default \"\") Only used in mkldnn. "
+        "An optional string from: \"NHWC\", \"NCHW\", \"NCHW16C\", \"NCHW8C\". "
+        "Defaults to \"\". Specify the data format of the output data, "
+        "the input will be transformed automatically. ")
+        .SetDefault("");
+    AddAttr<std::string>(
+        "y_data_format",
+        "(string, default \"\") Only used in mkldnn. "
+        "An optional string from: \"NHWC\", \"NCHW\", \"NCHW16C\", \"NCHW8C\". "
+        "Defaults to \"\". Specify the data format of the output data, "
+        "the input will be transformed automatically. ")
+        .SetDefault("");
    AddComment(string::Sprintf(R"DOC(
 Elementwise %s Operator


--- a/paddle/fluid/operators/math/jit_code.h
+++ b/paddle/fluid/operators/math/jit_code.h
@@ -322,6 +322,42 @@ class VActJitCode : public JitCode {
  ymm_t ymm_dst = ymm_t(1);
 };

+#ifdef PADDLE_WITH_MKLDNN
+// JIT generator for the nChw16c-by-NC elementwise multiply. The emitted
+// routine loads one 16-float vector from y once and multiplies every
+// consecutive 16-float block of x by it, writing height * width blocks to
+// the output.
+// Both loops test their counter after the body, so height and width must be
+// at least 1.
+// NOTE(review): the argument registers below follow the System V AMD64
+// order; the Microsoft x64 convention passes arguments differently, so
+// confirm before using this generator in Windows builds.
+struct EltwiseMulnChw16cNC : public Xbyak::CodeGenerator {
+  explicit EltwiseMulnChw16cNC(size_t code_size = 256 * 1024)
+      : Xbyak::CodeGenerator(code_size) {
+    // RDI is ptr x_input
+    // RSI is ptr y_input
+    // RDX is ptr output
+    // ECX is height (32-bit int)
+    // R8D is width (32-bit int)
+
+    push(rbx);  // rbx is used as the inner counter and restored on exit
+
+    xor_(rax, rax);           // rax: running byte offset into x and output
+    xor_(r10, r10);           // r10: outer (height) counter
+    vmovups(zmm3, ptr[rsi]);  // y block, loaded once and reused
+
+    L("h_loop");
+    xor_(rbx, rbx);  // restart the inner (width) counter
+    L("w_loop");
+    vmovups(zmm2, ptr[rdi + rax]);
+    vmulps(zmm1, zmm2, zmm3);
+    vmovups(ptr[rdx + rax], zmm1);
+    add(rax, 64);  // advance by one zmm register: 16 floats * 4 bytes
+    inc(rbx);
+    // height/width are passed as 32-bit ints, so the upper halves of rcx/r8
+    // are not guaranteed to be zero; compare the low 32 bits only.
+    cmp(r8d, ebx);
+    jnz("w_loop");
+    inc(r10);
+    cmp(r10d, ecx);
+    jnz("h_loop");
+
+    pop(rbx);
+    ret();
+  }
+};
+#endif
+
 }  // namespace gen
 }  // namespace jitkernel
 }  // namespace math

--- a/paddle/fluid/operators/math/jit_kernel.h
+++ b/paddle/fluid/operators/math/jit_kernel.h
@@ -95,6 +95,15 @@ class VAddBiasKernel : public Kernel {
  void (*Compute)(const T *, const T *, T *, int);
 };

+#ifdef PADDLE_WITH_MKLDNN
+// Kernel interface for multiplying an nChw16c tensor by an NC tensor.
+// NOTE(review): T does not appear in Compute, whose pointers are fixed to
+// float.
+template <typename T>
+class EltwiseMulnChw16cNCKernel : public Kernel {
+ public:
+  // nChw16c = nChw16c .* NC
+  // Arguments: two const float inputs, one float output and two ints
+  // (presumably height and width, per the JIT generator's register
+  // comments -- confirm against the caller).
+  void (*Compute)(const float *, const float *, float *, int, int);
+};
+#endif
+
 template <typename T>
 class VActKernel : public Kernel {
 public:

--- a/paddle/fluid/operators/math/jit_kernel_blas.cc
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
@@ -226,6 +226,44 @@ bool VAddKernelImpl<double>::useMKL(int d) {
 }
 #endif

+#ifdef PADDLE_WITH_MKLDNN
+/* EltwiseMul for nChw16c & NC inputs JitKernel */
+// Implementation backed solely by Xbyak-generated code: when the JIT path is
+// compiled in and useJIT(d) accepts d, Compute points at the generated
+// routine; otherwise construction throws.
+template <typename T>
+class EltwiseMulnChw16cNCKernelImpl
+    : public math::jitkernel::EltwiseMulnChw16cNCKernel<T> {
+ public:
+  JITKERNEL_DECLARE_STATIC_FUNC;
+  // d is only used here to pick the JIT path and to estimate the code
+  // buffer size.
+  explicit EltwiseMulnChw16cNCKernelImpl(int d)
+      : EltwiseMulnChw16cNCKernel<T>() {
+#ifdef PADDLE_WITH_XBYAK
+    using mul_func_t = void (*)(const float*, const float*, float*, int, int);
+    if (useJIT(d)) {
+      // roughly estimate the size of code
+      size_t sz = 96 + d / YMM_FLOAT_BLOCK * 4 * 8;
+      sz = sz > 4096 ? sz : 4096;
+      jitcode_.reset(new gen::EltwiseMulnChw16cNC(sz));
+      this->Compute = (mul_func_t)jitcode_->getCode();
+      return;
+    }
+#endif
+    PADDLE_THROW(
+        "This kernel shouldn't be used in Non-Xbyak, Non-MKL-DNN "
+        "environemnt");
+  }
+
+#ifdef PADDLE_WITH_XBYAK
+
+ private:
+  // Owns the generated code that Compute points into.
+  std::unique_ptr<gen::EltwiseMulnChw16cNC> jitcode_{nullptr};
+#endif
+  // The closing brace stays outside the XBYAK guard so the class is still
+  // well-formed when PADDLE_WITH_XBYAK is not defined.
+};
+
+#ifdef PADDLE_WITH_XBYAK
+template <>
+bool EltwiseMulnChw16cNCKernelImpl<float>::useJIT(int d) {
+  return true;
+}
+#endif
+#endif
+
 /* VAddRelu JitKernel */
 template <typename T>
 class VAddReluKernelImpl : public VAddReluKernel<T> {
@@ -394,6 +432,9 @@ REGISTER_JITKERNEL(vscal, VScalKernel);
 REGISTER_JITKERNEL(vaddbias, VAddBiasKernel);
 REGISTER_JITKERNEL(vrelu, VReluKernel);
 REGISTER_JITKERNEL(videntity, VIdentityKernel);
+#ifdef PADDLE_WITH_MKLDNN
+REGISTER_JITKERNEL(eltwise_mul_nchw16c, EltwiseMulnChw16cNCKernel);
+#endif

 }  // namespace jitkernel
 }  // namespace math

--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
-if (NOT WIN32)
 proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto)
 py_proto_compile(profiler_py_proto SRCS profiler.proto)

@@ -6,11 +5,19 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _

 add_dependencies(profiler_py_proto profiler_py_proto_init)

+# After profiler_py_proto builds, copy the generated *.py files into
+# ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler. Each platform uses
+# its own copy command (`cp` vs `copy /Y`).
+if (NOT WIN32)
 add_custom_command(TARGET profiler_py_proto POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
        COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
        COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+else(NOT WIN32)
+# Rewrite the destination with backslash separators before passing it to
+# `copy`.
+string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/")
+add_custom_command(TARGET profiler_py_proto POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
+        COMMAND copy /Y *.py ${proto_dstpath}
+        COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 endif(NOT WIN32)

 if(WITH_GPU)
@@ -60,12 +67,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)

-
-if (NOT WIN32)
 cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
-endif(NOT WIN32)

 nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
 cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)

--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -13,17 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once

-#if !defined(_WIN32)
-#include <sys/time.h>
-#else
-#include <windows.h>
-#endif  // !_WIN32
-
-#include <time.h>
 #include <chrono>  // NOLINT
 #include <string>

 #include "paddle/fluid/platform/dynload/cupti.h"
+#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.pb.h"

 namespace paddle {
@@ -32,15 +26,11 @@ namespace platform {
 ///////////////////////
 // WARN: Under Development. Don't depend on it yet.
 //////////////////////
-#if !defined(_WIN32)
 inline uint64_t PosixInNsec() {
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
 }
-#else
-inline uint64_t PosixInNsec() { return static_cast<uint64_t>(0); }
-#endif  // !_WIN32

 // DeviceTracer performs the following tasks:
 // 1. Register cuda callbacks for various events: kernel, memcpy, etc.

--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -38,6 +38,7 @@ std::once_flag p2p_init_flag;

 void InitGflags(std::vector<std::string> argv) {
  std::call_once(gflags_init_flag, [&]() {
+    FLAGS_logtostderr = true;
    argv.insert(argv.begin(), "dummy");
    int argc = argv.size();
    char **arr = new char *[argv.size()];

--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -17,6 +17,7 @@
 #include <cstdio>
 #include <stdexcept>

+#include <time.h>
 #include <memory>
 #include <string>

@@ -27,6 +28,7 @@
 #include <dlfcn.h>     //  dladdr
 #include <execinfo.h>  // backtrace
 #include <sys/stat.h>
+#include <sys/time.h>
 #include <algorithm>  // std::accumulate
 #else
 #include <io.h>  // _popen, _pclose
@@ -57,6 +59,25 @@ static void *dlopen(const char *filename, int flag) {
  return reinterpret_cast<void *>(hModule);
 }

+// Windows stand-in for POSIX gettimeofday(): stores the current wall-clock
+// time in *tp as seconds and microseconds since 1970-01-01 UTC.
+//   tp  - destination; left untouched when null.
+//   tzp - accepted for signature compatibility and ignored.
+// Always returns 0.
+static int gettimeofday(struct timeval *tp, void *tzp) {
+  (void)tzp;
+  if (tp == nullptr) {
+    return (0);
+  }
+
+  // Read the clock as UTC directly instead of converting local time through
+  // mktime(), which is ambiguous around daylight-saving transitions.
+  FILETIME ft;
+  GetSystemTimeAsFileTime(&ft);
+
+  // FILETIME counts 100-nanosecond ticks since 1601-01-01 UTC.
+  ULARGE_INTEGER ticks;
+  ticks.LowPart = ft.dwLowDateTime;
+  ticks.HighPart = ft.dwHighDateTime;
+
+  // Number of 100-nanosecond ticks between 1601-01-01 and 1970-01-01.
+  const ULONGLONG kEpochOffsetTicks = 116444736000000000ULL;
+  const ULONGLONG usec = (ticks.QuadPart - kEpochOffsetTicks) / 10ULL;
+
+  tp->tv_sec = static_cast<decltype(tp->tv_sec)>(usec / 1000000ULL);
+  tp->tv_usec = static_cast<decltype(tp->tv_usec)>(usec % 1000000ULL);
+
+  return (0);
+}
 #endif  // !_WIN32

 static void ExecShellCommand(const std::string &cmd, std::string *message) {

--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/fluid/platform/port.h"

-#include <sys/time.h>
 #include <algorithm>
 #include <iomanip>
 #include <limits>
@@ -438,10 +438,10 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
            event_items[index].total_time += event_time;
            // min time
            event_items[index].min_time =
-                std::min(event_time, event_items[index].min_time);
+                (std::min)(event_time, event_items[index].min_time);
            // max time
            event_items[index].max_time =
-                std::max(event_time, event_items[index].max_time);
+                (std::max)(event_time, event_items[index].max_time);
          }

          // remove the push marker from the list

--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -69,7 +69,6 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);

 void PopEvent(const std::string& name, const DeviceContext* dev_ctx);

-#if !defined(_WIN32)
 struct RecordEvent {
  // dev_ctx can be set to nullptr if device is cpu.
  RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
@@ -106,15 +105,6 @@ struct RecordBlock {
  std::string name_;
  uint64_t start_ns_;
 };
-#else
-// windows do not support profiler temporarily.
-struct RecordEvent {
-  RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {}
-};
-struct RecordBlock {
-  explicit RecordBlock(int block_id) {}
-};
-#endif

 // Return the event list of all threads. Assumed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.

--- a/paddle/fluid/platform/stream_callback_manager.h
+++ b/paddle/fluid/platform/stream_callback_manager.h
@@ -45,16 +45,15 @@ class StreamCallbackManager {
  inline void AddCallback(Callback &&callback) const {
    auto *stream_callback_context =
        new StreamCallbackContext(this, std::forward<Callback>(callback));
-    PADDLE_ENFORCE(
 #if CUDA_VERSION >= 10000
-        cudaLaunchHostFunc(stream_, StreamCallbackManager::StreamCallbackFunc,
-                           stream_callback_context)
+    PADDLE_ENFORCE(cudaLaunchHostFunc(stream_,
+                                      StreamCallbackManager::StreamCallbackFunc,
+                                      stream_callback_context));  // NOLINT
 #else
-        cudaStreamAddCallback(stream_,
-                              StreamCallbackManager::StreamCallbackFunc,
-                              stream_callback_context, 0)
+    PADDLE_ENFORCE(cudaStreamAddCallback(
+        stream_, StreamCallbackManager::StreamCallbackFunc,
+        stream_callback_context, 0));  // NOLINT
 #endif
-            );  // NOLINT
  }

  void Wait() const { thread_pool_.reset(new ThreadPool(1)); }

--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt

-set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder)
+set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler)
 set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc)
-if(NOT WIN32)
-  list(APPEND PYBIND_DEPS parallel_executor profiler)
-endif(NOT WIN32)
 if(WITH_PYTHON)
  if(WITH_AMD_GPU)
    hip_library(paddle_pybind SHARED

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -36,9 +36,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
-#ifndef _WIN32
 #include "paddle/fluid/framework/parallel_executor.h"
-#endif
 #include "paddle/fluid/framework/prune.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
@@ -637,7 +635,6 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
 #endif

-#ifndef _WIN32
  py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
      .value("kDisabled", platform::ProfilerState::kDisabled)
      .value("kCPU", platform::ProfilerState::kCPU)
@@ -658,7 +655,6 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("disable_profiler", platform::DisableProfiler);
  m.def("is_profiler_enabled", platform::IsProfileEnabled);
  m.def("reset_profiler", platform::ResetProfiler);
-#endif

  py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
  pass.def(py::init())
@@ -687,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle.
      .def("remove_pass",
           [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });

-#ifndef _WIN32
  // -- python binds for parallel executor.
  py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
  py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
@@ -913,7 +908,6 @@ All parameter, weight, gradient are variables in Paddle.
        pybind11::gil_scoped_release release;
        self.Run(fetch_tensors, fetched_var_name);
      });
-#endif

  BindRecordIOWriter(&m);
  return m.ptr();

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -47,8 +47,7 @@ from . import profiler
 from . import unique_name
 from . import recordio_writer
 from . import parallel_executor
-if os.name != 'nt':
-    from .parallel_executor import *
+from .parallel_executor import *
 from paddle.fluid.layers.math_op_patch import monkey_patch_variable

 Tensor = LoDTensor

--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -25,264 +25,263 @@ import os

 __all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy']

-if os.name != 'nt':
-    ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
-    BuildStrategy = core.ParallelExecutor.BuildStrategy
-
-    class ParallelExecutor(object):
+ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
+BuildStrategy = core.ParallelExecutor.BuildStrategy
+
+
+class ParallelExecutor(object):
+    """
+    ParallelExecutor is designed for data parallelism, which focuses on distributing
+    the data across different nodes and every node operates on the data in parallel.
+    If you use ParallelExecutor to run the current program on GPU, the node means GPU
+    device, and ParallelExecutor will get the available GPU device automatically on
+    the current machine. If you use ParallelExecutor to run the current program on CPU,
+    the node means the CPU device, and you can specify the CPU device number by adding
+    'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable
+    is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number
+    of CPUs in the system.
+
+    Args:
+        use_cuda (bool): Whether to use CUDA or not.
+        loss_name (str): The loss name must set in training. Default None.
+        main_program (Program): The program that need to run, if not provided,
+            then default_main_program will be used. Default None.
+        share_vars_from(ParallelExecutor): If provide, it will share variables
+            from the specified ParallelExecutor. Default None.
+        exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run
+            the program in ParallelExecutor, for example how many threads are used to
+            execute the program, how many iterations to clean up the temp variables
+            which is generated during execution. For more information, please refer
+            to fluid.ExecutionStrategy. Default None.
+        build_strategy(BuildStrategy): build_strategy is used to control how to
+            build the SSA Graph in ParallelExecutor by setting the property,
+            for example reduce_strategy, gradient_scale_strategy. For more information,
+            please refer to fluid.BuildStrategy. Default None.
+        num_trainers(int): If greater than 1, NCCL will be initialized with
+            multiple rank of nodes, each node should have same number of GPUs.
+            Distributed training will be enabled then. Default 1.
+        trainer_id(int): Must use together with num_trainers. trainer_id is the
+            "rank" of current node starts from 0. Default 0.
+        scope(Scope): scope to run with, default use fluid.global_scope().
+
+    Returns:
+        ParallelExecutor: The initialized ParallelExecutor object.
+
+    Raises:
+        TypeError: If share_vars_from is provided, but not ParallelExecutor object.
+
+    Examples:
+        .. code-block:: python
+
+          train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
+          test_exe = fluid.ParallelExecutor(use_cuda=True,
+                                            main_program=test_program,
+                                            share_vars_from=train_exe)
+
+          train_loss, = train_exe.run([loss.name], feed=feed_dict)
+          test_loss, = test_exe.run([loss.name], feed=feed_dict)
+    """
+
+    def __init__(self,
+                 use_cuda,
+                 loss_name=None,
+                 main_program=None,
+                 share_vars_from=None,
+                 exec_strategy=None,
+                 build_strategy=None,
+                 num_trainers=1,
+                 trainer_id=0,
+                 scope=None):
+        """
+        Collect the execution places, fill in default strategies and create
+        the underlying core.ParallelExecutor. Argument meanings are given in
+        the class docstring.
+        """
+        self._places = []
+        self._act_places = []
+        if use_cuda:
+            # One place per visible CUDA device.
+            for i in six.moves.range(core.get_cuda_device_count()):
+                p = core.Place()
+                self._act_places.append(core.CUDAPlace(i))
+                p.set_place(self._act_places[-1])
+                self._places.append(p)
+        else:
+            # CPU_NUM overrides the detected CPU count.
+            cpu_num = int(
+                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+            for _ in six.moves.range(cpu_num):
+                p = core.Place()
+                self._act_places.append(core.CPUPlace())
+                p.set_place(self._act_places[-1])
+                self._places.append(p)
+        assert self._places, "no place for execution"
+
+        if exec_strategy is None:
+            exec_strategy = ExecutionStrategy()
+        exec_strategy.use_cuda = use_cuda
+
+        # num_threads == 0 means the caller did not choose a thread count.
+        if exec_strategy.num_threads == 0:
+            if use_cuda:
+                # Experiments on se-resnext shows that too many threads hurt
+                # performance. Worth tunning for other models in the future.
+                exec_strategy.num_threads = len(self._places) * 4
+            else:
+                cpu_num = int(
+                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+                exec_strategy.num_threads = cpu_num * 2
+
+        # Set 1 thread num under nccl2 distribute
+        #   env to make sure all gpus run ops in same order.
+        if num_trainers > 1:
+            assert (use_cuda)
+            # FIXME(gongwb): avoid this set.
+            exec_strategy.num_threads = 1
+
+        if build_strategy is None:
+            build_strategy = BuildStrategy()
+
+        main = main_program if main_program else \
+            framework.default_main_program()
+        if scope is None:
+            scope = executor.global_scope()
+
+        if share_vars_from and not isinstance(share_vars_from,
+                                              ParallelExecutor):
+            raise TypeError("share_vars_from must be ParallelExecutor.")
+
+        local_scopes = share_vars_from.executor.local_scopes(
+        ) if share_vars_from else []
+
+        # Names of persistable variables, excluding RAW-typed ones.
+        self.persistable_vars = [
+            var.name for var in main.list_vars()
+            if var.persistable and var.type != core.VarDesc.VarType.RAW
+        ]
+
+        self.executor = core.ParallelExecutor(
+            self._places,
+            {
+                cpt.to_text(p.name)
+                for p in main.global_block().iter_parameters()
+                if not p.stop_gradient
+            },
+            set(cpt.to_text(var) for var in self.persistable_vars), main.desc,
+            cpt.to_text(loss_name)
+            if loss_name else six.u(''), scope, local_scopes, exec_strategy,
+            build_strategy, num_trainers, trainer_id)
+        self.scope = scope
+
+    def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
        """
-        ParallelExecutor is designed for data parallelism, which focuses on distributing
-        the data across different nodes and every node operates on the data in parallel.
-        If you use ParallelExecutor to run the current program on GPU, the node means GPU
-        device, and ParallelExecutor will get the available GPU device automatically on
-        the current machine. If you use ParallelExecutor to run the current program on CPU,
-        the node means the CPU device, and you can specify the CPU device number by adding
-        'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable
-        is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number
-        of CPUs in the system.
+        Run a parallel executor with fetch_list.
+
+        The feed parameter can be a dict or a list. If feed is a dict, the
+        feed data will be split into multiple devices. If feed is a list, we
+        assume the data has been splitted into multiple devices, the each
+        element in the list will be copied to each device directly.
+
+        For example, if the feed is a dict:
+
+        >>> exe = ParallelExecutor()
+        >>> # the image will be splitted into devices. If there is two devices
+        >>> # each device will process an image with shape (24, 1, 28, 28)
+        >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
+
+        For example, if the feed is a list:
+
+        >>> exe = ParallelExecutor()
+        >>> # each device will process each element in the list.
+        >>> # the 1st device will process an image with shape (48, 1, 28, 28)
+        >>> # the 2nd device will process an image with shape (32, 1, 28, 28)
+        >>> #
+        >>> # you can use exe.device_count to get the device number.
+        >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
+        >>>               {"image": numpy.random.random(size=(32, 1, 28, 28))},
+        >>>              ])

        Args:
-            use_cuda (bool): Whether to use CUDA or not.
-            loss_name (str): The loss name must set in training. Default None.
-            main_program (Program): The program that need to run, if not provided,
-                then default_main_program will be used. Default None.
-            share_vars_from(ParallelExecutor): If provide, it will share variables
-                from the specified ParallelExecutor. Default None.
-            exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run
-                the program in ParallelExecutor, for example how many threads are used to
-                execute the program, how many iterations to clean up the temp variables
-                which is generated during execution. For more information, please refer
-                to fluid.ExecutionStrategy. Default None.
-            build_strategy(BuildStrategy): build_strategy is used to control how to
-                build the SSA Graph in ParallelExecutor by setting the property,
-                for example reduce_strategy, gradient_scale_strategy. For more information,
-                please refer to fluid.BuildStrategy. Default None.
-            num_trainers(int): If greater than 1, NCCL will be initialized with
-                multiple rank of nodes, each node should have same number of GPUs.
-                Distributed training will be enabled then. Default 1.
-            trainer_id(int): Must use together with num_trainers. trainer_id is the
-                "rank" of current node starts from 0. Default 0.
-            scope(Scope): scope to run with, default use fluid.global_scope().
+            fetch_list(list): The fetched variable names
+            feed(list|dict|None): The feed variables. If the feed is a dict,
+                tensors in that dict will be splitted into each devices. If
+                the feed is a list, each element of the list will be copied
+                to each device. Default None.
+            feed_dict: Alias for feed parameter, for backward compatibility.
+                This parameter has been deprecated. Default None.
+            return_numpy(bool): Whether converts the fetched tensor to numpy.
+                Default: True.

        Returns:
-            ParallelExecutor: The initialized ParallelExecutor object.
+            List: The fetched result list.

        Raises:
-            TypeError: If share_vars_from is provided, but not ParallelExecutor object.
+            ValueError: If the feed is a list, but its length is not equal the
+                length of active places, or its element's is not dict.
+
+        NOTES:
+            1. If the feed's type is dict, the number of data that feeds to
+               ParallelExecutor must be bigger than active places. Otherwise,
+               it will throw exception from C++ side. Special attention should be
+               paid to check whether the last batch of the dataset is bigger
+               than active places.
+            2. If active places are more than one, the fetch results for each
+               variable is a list, and each element of this list is the variable of
+               respective active place.

        Examples:
            .. code-block:: python

-              train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
-              test_exe = fluid.ParallelExecutor(use_cuda=True,
-                                                main_program=test_program,
-                                                share_vars_from=train_exe)
-
-              train_loss, = train_exe.run([loss.name], feed=feed_dict)
-              test_loss, = test_exe.run([loss.name], feed=feed_dict)
+                pe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                            loss_name=avg_cost.name,
+                                            main_program=fluid.default_main_program())
+                loss = pe.run(feed=feeder.feed(cur_batch),
+                              fetch_list=[avg_cost.name]))
        """
-
-        def __init__(self,
-                     use_cuda,
-                     loss_name=None,
-                     main_program=None,
-                     share_vars_from=None,
-                     exec_strategy=None,
-                     build_strategy=None,
-                     num_trainers=1,
-                     trainer_id=0,
-                     scope=None):
-            self._places = []
-            self._act_places = []
-            if use_cuda:
-                for i in six.moves.range(core.get_cuda_device_count()):
-                    p = core.Place()
-                    self._act_places.append(core.CUDAPlace(i))
-                    p.set_place(self._act_places[-1])
-                    self._places.append(p)
-            else:
-                cpu_num = int(
-                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                for i in six.moves.range(cpu_num):
-                    p = core.Place()
-                    self._act_places.append(core.CPUPlace())
-                    p.set_place(self._act_places[-1])
-                    self._places.append(p)
-            assert self._places, "no place for execution"
-
-            if exec_strategy is None:
-                exec_strategy = ExecutionStrategy()
-            exec_strategy.use_cuda = use_cuda
-
-            if exec_strategy.num_threads == 0:
-                if use_cuda:
-                    # Experiments on se-resnext shows that too many threads hurt
-                    # performance. Worth tunning for other models in the future.
-                    exec_strategy.num_threads = len(self._places) * 4
-                else:
-                    cpu_num = int(
-                        os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                    exec_strategy.num_threads = cpu_num * 2
-
-            # Set 1 thread num under nccl2 distribute
-            #   env to make sure all gpus run ops in same order.
-            if num_trainers > 1:
-                assert (use_cuda)
-                # FIXME(gongwb): avoid this set.
-                exec_strategy.num_threads = 1
-
-            if build_strategy is None:
-                build_strategy = BuildStrategy()
-
-            main = main_program
-            main = main if main else framework.default_main_program()
-            if scope == None:
-                scope = executor.global_scope()
-
-            if share_vars_from and not isinstance(share_vars_from,
-                                                  ParallelExecutor):
-                raise TypeError("share_vars_from must be ParallelExecutor.")
-
-            local_scopes = share_vars_from.executor.local_scopes(
-            ) if share_vars_from else []
-
-            self.persistable_vars = [
-                v.name for v in [
-                    var for var in main.list_vars()
-                    if var.persistable and var.type != core.VarDesc.VarType.RAW
-                ]
-            ]
-
-            self.executor = core.ParallelExecutor(
-                self._places,
-                set([
-                    cpt.to_text(p.name)
-                    for p in main.global_block().iter_parameters()
-                    if not p.stop_gradient
-                ]),
-                set(cpt.to_text(var)
-                    for var in self.persistable_vars), main.desc,
-                cpt.to_text(loss_name)
-                if loss_name else six.u(''), scope, local_scopes, exec_strategy,
-                build_strategy, num_trainers, trainer_id)
-            self.scope = scope
-
-        def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
-            """
-            Run a parallel executor with fetch_list.
-
-            The feed parameter can be a dict or a list. If feed is a dict, the
-            feed data will be split into multiple devices. If feed is a list, we
-            assume the data has been splitted into multiple devices, the each
-            element in the list will be copied to each device directly.
-
-            For example, if the feed is a dict:
-
-            >>> exe = ParallelExecutor()
-            >>> # the image will be splitted into devices. If there is two devices
-            >>> # each device will process an image with shape (24, 1, 28, 28)
-            >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
-
-            For example, if the feed is a list:
-
-            >>> exe = ParallelExecutor()
-            >>> # each device will process each element in the list.
-            >>> # the 1st device will process an image with shape (48, 1, 28, 28)
-            >>> # the 2nd device will process an image with shape (32, 1, 28, 28)
-            >>> #
-            >>> # you can use exe.device_count to get the device number.
-            >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
-            >>>               {"image": numpy.random.random(size=(32, 1, 28, 28))},
-            >>>              ])
-
-            Args:
-                fetch_list(list): The fetched variable names
-                feed(list|dict|None): The feed variables. If the feed is a dict,
-                    tensors in that dict will be splitted into each devices. If
-                    the feed is a list, each element of the list will be copied
-                    to each device. Default None.
-                feed_dict: Alias for feed parameter, for backward compatibility.
-                    This parameter has been deprecated. Default None.
-                return_numpy(bool): Whether converts the fetched tensor to numpy.
-                    Default: True.
-
-            Returns:
-                List: The fetched result list.
-
-            Raises:
-                ValueError: If the feed is a list, but its length is not equal the
-                    length of active places, or its element's is not dict.
-
-            NOTES:
-                1. If the feed's type is dict, the number of data that feeds to
-                   ParallelExecutor must be bigger than active places. Otherwise,
-                   it will throw exception from C++ side. Special attention should be
-                   paid to check whether the last batch of the dataset is bigger
-                   than active places.
-                2. If active places are more than one, the fetch results for each
-                   variable is a list, and each element of this list is the variable of
-                   respective active place.
-
-            Examples:
-                .. code-block:: python
-
-                    pe = fluid.ParallelExecutor(use_cuda=use_cuda,
-                                                loss_name=avg_cost.name,
-                                                main_program=fluid.default_main_program())
-                    loss = pe.run(feed=feeder.feed(cur_batch),
-                                  fetch_list=[avg_cost.name]))
-            """
-            if feed is None and feed_dict is not None:
-                feed = feed_dict
-                print(
-                    "`feed_dict` is deprecated. Please use `feed=`",
-                    file=sys.stderr)
-
-            if isinstance(feed, dict):
-                feed_tensor_dict = dict()
-                for feed_name in feed:
-                    feed_tensor = feed[feed_name]
-                    if not isinstance(feed_tensor, core.LoDTensor):
-                        feed_tensor = core.LoDTensor()
-                        # always set to CPU place, since the tensor need to be splitted
-                        # it is fast in CPU
-                        feed_tensor.set(feed[feed_name], core.CPUPlace())
-                    feed_tensor_dict[feed_name] = feed_tensor
-
-                self.executor.feed_and_split_tensor_into_local_scopes(
-                    feed_tensor_dict)
-            elif isinstance(feed, list) or isinstance(feed, tuple):
-                if len(feed) != len(self._act_places):
-                    raise ValueError(
-                        "Feed a list of tensor, the list should be the same size as places"
-                    )
-
-                res = list()
-
-                for i, each in enumerate(feed):
-                    if not isinstance(each, dict):
-                        raise TypeError(
-                            "Each element of feed list should be a dict")
-                    res_dict = dict()
-                    for feed_name in each:
-                        tensor = each[feed_name]
-                        if not isinstance(tensor, core.LoDTensor):
-                            tmp = core.LoDTensor()
-                            tmp.set(tensor, self._act_places[i])
-                            tensor = tmp
-                        res_dict[feed_name] = tensor
-                    res.append(res_dict)
-                self.executor.feed_tensors_into_local_scopes(res)
-
-            fetch_var_name = '@FETCHED_VAR_NAME@'
-            self.executor.run(fetch_list, fetch_var_name)
-            arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
-
-            if return_numpy:
-                return executor.as_numpy(arr)
-
-            return [arr[i] for i in range(len(arr))]
-
-        @property
-        def device_count(self):
-            return len(self._act_places)
+        if feed is None and feed_dict is not None:
+            feed = feed_dict
+            print(
+                "`feed_dict` is deprecated. Please use `feed=`",
+                file=sys.stderr)
+
+        if isinstance(feed, dict):
+            feed_tensor_dict = dict()
+            for feed_name in feed:
+                feed_tensor = feed[feed_name]
+                if not isinstance(feed_tensor, core.LoDTensor):
+                    feed_tensor = core.LoDTensor()
+                    # Always set to CPU place, since the tensor needs to be split;
+                    # splitting is fast on CPU.
+                    feed_tensor.set(feed[feed_name], core.CPUPlace())
+                feed_tensor_dict[feed_name] = feed_tensor
+
+            self.executor.feed_and_split_tensor_into_local_scopes(
+                feed_tensor_dict)
+        elif isinstance(feed, list) or isinstance(feed, tuple):
+            if len(feed) != len(self._act_places):
+                raise ValueError(
+                    "Feed a list of tensor, the list should be the same size as places"
+                )
+
+            res = list()
+
+            for i, each in enumerate(feed):
+                if not isinstance(each, dict):
+                    raise TypeError(
+                        "Each element of feed list should be a dict")
+                res_dict = dict()
+                for feed_name in each:
+                    tensor = each[feed_name]
+                    if not isinstance(tensor, core.LoDTensor):
+                        tmp = core.LoDTensor()
+                        tmp.set(tensor, self._act_places[i])
+                        tensor = tmp
+                    res_dict[feed_name] = tensor
+                res.append(res_dict)
+            self.executor.feed_tensors_into_local_scopes(res)
+
+        fetch_var_name = '@FETCHED_VAR_NAME@'
+        self.executor.run(fetch_list, fetch_var_name)
+        arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
+
+        if return_numpy:
+            return executor.as_numpy(arr)
+
+        return [arr[i] for i in range(len(arr))]
+
+    @property
+    def device_count(self):
+        return len(self._act_places)
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -45,6 +45,10 @@ if(APPLE)
    list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
    list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass)
 endif()
+if(NOT WITH_MKLML)
+    # This op is not supported with OpenBLAS.
+    list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
+endif()

 function(py_test_modules TARGET_NAME)
  if(WITH_TESTING)

--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -362,7 +362,11 @@ class OpTest(unittest.TestCase):
            else:
                return []
        places = [fluid.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
+        # A test can set `self._cpu_only = True` to keep CUDAPlace out of the
+        # returned places; when the attribute is absent it defaults to False.
+        cpu_only = getattr(self, '_cpu_only', False)
+        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type)\
+           and not cpu_only:
            places.append(core.CUDAPlace(0))
        return places


--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_mkldnn_op.py
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+from test_elementwise_mul_op import *
+
+
+class TestElementwiseMulMKLDNNOp_BroadcastNCHW16c(ElementwiseMulOp):
+    """Forward-only MKLDNN case with X tagged "nchw16c" and Y tagged "nc".
+
+    X and the expected output are computed in plain (1, 16, 2, 2) order and
+    then permuted with axes (0, 2, 3, 1) and reshaped back to the same shape,
+    so the buffers given to the op hold the permuted element order.
+    NOTE(review): presumably this emulates the blocked layout named by
+    x_data_format -- confirm against the MKLDNN kernel.
+    """
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_axis(self):
+        self.axis = 0
+
+    def init_input_output(self):
+        shape = (1, 16, 2, 2)
+        permutation = (0, 2, 3, 1)
+
+        plain_x = np.random.rand(*shape).astype(self.dtype)
+        self.x = np.transpose(plain_x, permutation).reshape(shape)
+        self.y = np.random.rand(1, 16).astype(self.dtype)
+
+        plain_out = np.multiply(plain_x, self.y.reshape(1, 16, 1, 1))
+        self.out = np.transpose(plain_out, permutation).reshape(shape)
+
+    def setUp(self):
+        super(TestElementwiseMulMKLDNNOp_BroadcastNCHW16c, self).setUp()
+        self.attrs.update({"x_data_format": "nchw16c", "y_data_format": "nc"})
+        # OpTest consults this flag and leaves CUDAPlace out when it is True.
+        self._cpu_only = True
+
+    # The three gradient checks inherited from ElementwiseMulOp are replaced
+    # by no-ops, so only the forward output is verified for this case.
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ingore_x(self):
+        pass
+
+    def test_check_grad_ingore_y(self):
+        pass
+
+
+# TODO(mgallus): enable when implemented.
+@unittest.skip("Not implemented yet.")
+class TestElementwiseMulMKLDNNOp_BroadcastNCHW8c(ElementwiseMulOp):
+    """Skipped forward-only MKLDNN case: X tagged "nchw8c", Y tagged "nc".
+
+    Same construction as the 16-channel broadcast case, but with 8 channels:
+    X and the expected output are permuted with axes (0, 2, 3, 1) and
+    reshaped back to (1, 8, 2, 2).
+    """
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_axis(self):
+        self.axis = 0
+
+    def init_input_output(self):
+        shape = (1, 8, 2, 2)
+        permutation = (0, 2, 3, 1)
+
+        plain_x = np.random.rand(*shape).astype(self.dtype)
+        self.x = np.transpose(plain_x, permutation).reshape(shape)
+        self.y = np.random.rand(1, 8).astype(self.dtype)
+
+        plain_out = np.multiply(plain_x, self.y.reshape(1, 8, 1, 1))
+        self.out = np.transpose(plain_out, permutation).reshape(shape)
+
+    def setUp(self):
+        super(TestElementwiseMulMKLDNNOp_BroadcastNCHW8c, self).setUp()
+        self.attrs.update({"x_data_format": "nchw8c", "y_data_format": "nc"})
+        # OpTest consults this flag and leaves CUDAPlace out when it is True.
+        self._cpu_only = True
+
+    # Gradient checks from the base class are overridden with no-ops.
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ingore_x(self):
+        pass
+
+    def test_check_grad_ingore_y(self):
+        pass
+
+
+class TestElementwiseMulMKLDNNOp_FallbackNCHW(ElementwiseMulOp):
+    """Forward-only MKLDNN case with no data-format attributes set.
+
+    X is (1, 16, 2, 2) and Y is (1, 16); the expected output multiplies X by
+    Y reshaped to (1, 16, 1, 1). Unlike the sibling cases, this class does not
+    override setUp, so it adds no format attrs and no `_cpu_only` flag.
+    """
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_axis(self):
+        self.axis = 0
+
+    def init_input_output(self):
+        self.x = np.random.rand(1, 16, 2, 2).astype(self.dtype)
+        self.y = np.random.rand(1, 16).astype(self.dtype)
+
+        per_channel = self.y.reshape(1, 16, 1, 1)
+        self.out = np.multiply(self.x, per_channel)
+
+    # Gradient checks from the base class are overridden with no-ops.
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ingore_x(self):
+        pass
+
+    def test_check_grad_ingore_y(self):
+        pass
+
+
+class TestElementwiseMulMKLDNNOp_FallbackNCHW16C(ElementwiseMulOp):
+    """Forward-only MKLDNN case with both X and Y tagged "nchw16c".
+
+    Both inputs are (1, 16, 2, 2) arrays permuted with axes (0, 2, 3, 1) and
+    reshaped back; the expected output is their element-wise product.
+    NOTE(review): the body is identical to
+    TestElementwiseMulMKLDNNOp_FallbackNoReorders -- confirm both are needed.
+    """
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_axis(self):
+        self.axis = 0
+
+    def init_input_output(self):
+        shape = (1, 16, 2, 2)
+        permutation = (0, 2, 3, 1)
+
+        plain_x = np.random.rand(*shape).astype(self.dtype)
+        self.x = np.transpose(plain_x, permutation).reshape(shape)
+        plain_y = np.random.rand(*shape).astype(self.dtype)
+        self.y = np.transpose(plain_y, permutation).reshape(shape)
+
+        self.out = np.multiply(self.x, self.y)
+
+    def setUp(self):
+        super(TestElementwiseMulMKLDNNOp_FallbackNCHW16C, self).setUp()
+        self.attrs.update({
+            "x_data_format": "nchw16c",
+            "y_data_format": "nchw16c"
+        })
+        # OpTest consults this flag and leaves CUDAPlace out when it is True.
+        self._cpu_only = True
+
+    # Gradient checks from the base class are overridden with no-ops.
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ingore_x(self):
+        pass
+
+    def test_check_grad_ingore_y(self):
+        pass
+
+
+class TestElementwiseMulMKLDNNOp_FallbackNoReorders(ElementwiseMulOp):
+    """Forward-only MKLDNN case: X and Y share the "nchw16c" format tag.
+
+    Each input is drawn as (1, 16, 2, 2), permuted with axes (0, 2, 3, 1) and
+    reshaped back; the expected output is the product of the permuted arrays.
+    NOTE(review): the body is identical to
+    TestElementwiseMulMKLDNNOp_FallbackNCHW16C -- confirm both are needed.
+    """
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_axis(self):
+        self.axis = 0
+
+    def init_input_output(self):
+        shape = (1, 16, 2, 2)
+        permutation = (0, 2, 3, 1)
+
+        plain_x = np.random.rand(*shape).astype(self.dtype)
+        self.x = np.transpose(plain_x, permutation).reshape(shape)
+        plain_y = np.random.rand(*shape).astype(self.dtype)
+        self.y = np.transpose(plain_y, permutation).reshape(shape)
+
+        self.out = np.multiply(self.x, self.y)
+
+    def setUp(self):
+        super(TestElementwiseMulMKLDNNOp_FallbackNoReorders, self).setUp()
+        self.attrs.update({
+            "x_data_format": "nchw16c",
+            "y_data_format": "nchw16c"
+        })
+        # OpTest consults this flag and leaves CUDAPlace out when it is True.
+        self._cpu_only = True
+
+    # Gradient checks from the base class are overridden with no-ops.
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ingore_x(self):
+        pass
+
+    def test_check_grad_ingore_y(self):
+        pass
+
+
+class TestElementwiseMulMKLDNNOp_FallbackWithReorder1(ElementwiseMulOp):
+    """Forward-only MKLDNN case: X tagged "nchw", Y tagged "nchw16c".
+
+    X is used as drawn. Y is fed to the op after an axes (0, 2, 3, 1)
+    permutation and reshape, while the expected output multiplies X by the
+    un-permuted Y.
+    """
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_axis(self):
+        self.axis = 0
+
+    def init_input_output(self):
+        shape = (1, 16, 2, 2)
+
+        # X is drawn before Y, as in the original case definition.
+        self.x = np.random.rand(*shape).astype(self.dtype)
+        plain_y = np.random.rand(*shape).astype(self.dtype)
+        self.y = np.transpose(plain_y, (0, 2, 3, 1)).reshape(shape)
+
+        self.out = np.multiply(self.x, plain_y)
+
+    def setUp(self):
+        super(TestElementwiseMulMKLDNNOp_FallbackWithReorder1, self).setUp()
+        self.attrs.update({
+            "x_data_format": "nchw",
+            "y_data_format": "nchw16c"
+        })
+        # OpTest consults this flag and leaves CUDAPlace out when it is True.
+        self._cpu_only = True
+
+    # Gradient checks from the base class are overridden with no-ops.
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ingore_x(self):
+        pass
+
+    def test_check_grad_ingore_y(self):
+        pass
+
+
+class TestElementwiseMulMKLDNNOp_FallbackWithReorder2(ElementwiseMulOp):
+    """Forward-only MKLDNN case: X tagged "nchw16c", Y tagged "nchw".
+
+    Y is used as drawn. X is fed to the op after an axes (0, 2, 3, 1)
+    permutation and reshape, while the expected output multiplies the
+    un-permuted X by Y.
+    """
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_axis(self):
+        self.axis = 0
+
+    def init_input_output(self):
+        shape = (1, 16, 2, 2)
+
+        # Y is drawn before X, as in the original case definition.
+        self.y = np.random.rand(*shape).astype(self.dtype)
+        plain_x = np.random.rand(*shape).astype(self.dtype)
+        self.x = np.transpose(plain_x, (0, 2, 3, 1)).reshape(shape)
+
+        self.out = np.multiply(plain_x, self.y)
+
+    def setUp(self):
+        super(TestElementwiseMulMKLDNNOp_FallbackWithReorder2, self).setUp()
+        self.attrs.update({
+            "x_data_format": "nchw16c",
+            "y_data_format": "nchw"
+        })
+        # OpTest consults this flag and leaves CUDAPlace out when it is True.
+        self._cpu_only = True
+
+    # Gradient checks from the base class are overridden with no-ops.
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ingore_x(self):
+        pass
+
+    def test_check_grad_ingore_y(self):
+        pass
+
+
+class TestElementwiseMulMKLDNNOp_FallbackNoReorders2(ElementwiseMulOp):
+    """Forward-only MKLDNN case with two (1, 16) inputs, both tagged "nc".
+
+    No permutation is applied; the expected output is the element-wise
+    product of X and Y.
+    """
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_axis(self):
+        self.axis = 0
+
+    def init_input_output(self):
+        shape = (1, 16)
+
+        self.x = np.random.rand(*shape).astype(self.dtype)
+        self.y = np.random.rand(*shape).astype(self.dtype)
+
+        self.out = np.multiply(self.x, self.y)
+
+    def setUp(self):
+        super(TestElementwiseMulMKLDNNOp_FallbackNoReorders2, self).setUp()
+        self.attrs.update({"x_data_format": "nc", "y_data_format": "nc"})
+        # OpTest consults this flag and leaves CUDAPlace out when it is True.
+        self._cpu_only = True
+
+    # Gradient checks from the base class are overridden with no-ops.
+    def test_check_grad_normal(self):
+        pass
+
+    def test_check_grad_ingore_x(self):
+        pass
+
+    def test_check_grad_ingore_y(self):
+        pass
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
@@ -21,13 +21,24 @@ from paddle.fluid.op import Operator


 class ElementwiseMulOp(OpTest):
+    def init_kernel_type(self):
+        self.use_mkldnn = False  # default; MKLDNN subclasses override
+
    def setUp(self):
        self.op_type = "elementwise_mul"
+        self.dtype = np.float32  # default dtype; init_dtype() may replace it
+        self.axis = -1  # default axis; init_axis() may replace it
+        self.init_dtype()
+        self.init_input_output()  # must populate self.x, self.y, self.out
+        self.init_kernel_type()
+        self.init_axis()
+
        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float64"),
-            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float64")
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
        }
-        self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])}
+        self.outputs = {'Out': self.out}
+        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}

    def test_check_output(self):
        self.check_output()
@@ -41,6 +52,17 @@ class ElementwiseMulOp(OpTest):
    def test_check_grad_ingore_y(self):
        self.check_grad(['X'], 'Out', no_grad_set=set('Y'))

+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.out = np.multiply(self.x, self.y)  # expected 'Out' value
+
+    def init_dtype(self):
+        pass  # hook: keeps the self.dtype default assigned in setUp
+
+    def init_axis(self):
+        pass  # hook: keeps the self.axis default assigned in setUp
+

 class TestElementwiseMulOp_scalar(ElementwiseMulOp):
    def setUp(self):
@@ -63,17 +85,13 @@ class TestElementwiseMulOp_Vector(ElementwiseMulOp):


 class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp):
-    def setUp(self):
-        self.op_type = "elementwise_mul"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float64),
-            'Y': np.random.rand(2).astype(np.float64)
-        }
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(2).astype(self.dtype)
+        self.out = self.x * self.y.reshape(2, 1, 1)  # Y broadcast on dim 0

-        self.attrs = {'axis': 0}
-        self.outputs = {
-            'Out': self.inputs['X'] * self.inputs['Y'].reshape(2, 1, 1)
-        }
+    def init_axis(self):
+        self.axis = 0  # passed to the op as attrs['axis'] by setUp


 class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp):