diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 99bf8ec8dc34d24478ca5dc8abfffb01ddb194e4..564878131c87afdba249a14f82f19adc67e7876c 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -173,6 +173,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 if (NOT WIN32) # windows msvc2015 support c++11 natively. 
 # -std=c++11 -fPIC not recoginize by msvc
 list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
+# in cuda9, suppress cuda warning on eigen with "-w"
 list(APPEND CUDA_NVCC_FLAGS "-w" "-Xcompiler -fPIC")
 else(NOT WIN32)
 list(APPEND CUDA_NVCC_FLAGS "-w" "-Xcompiler -fPIC" "-Xcompiler /w")
@@ -181,7 +182,7 @@ endif(NOT WIN32)
 if(WITH_FAST_MATH)
   # Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
   list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
-# in cuda9, suppress cuda warning on eigen 
+endif(WITH_FAST_MATH)
 
 # Set :expt-relaxed-constexpr to suppress Eigen warnings
 list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake
index 0159815fed81bdff6de3e561af569e9edc75f947..21527fe538b15abcf1dd13d11b1f5a3a08729a55 100644
--- a/cmake/external/threadpool.cmake
+++ b/cmake/external/threadpool.cmake
@@ -3,6 +3,7 @@ INCLUDE(ExternalProject)
 SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool)
 SET(THREADPOOL_INCLUDE_DIR ${THREADPOOL_SOURCE_DIR}/src/extern_threadpool)
 INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR})
+message(STATUS "threadpool include dir: ${THREADPOOL_INCLUDE_DIR}")
 
 ExternalProject_Add(
     extern_threadpool
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index d2f64ef07cce2e362f950fd495e3e0cebef0585d..0476d2f598348adf51bf6c0e988eb4e28a0fcf36 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -143,26 +143,14 @@ set(GPU_COMMON_FLAGS
     -Wno-error=unused-function  # Warnings in Numpy Header.
     -Wno-error=array-bounds # Warnings in Eigen::array
 )
-set(COMMON_FLAGS 
-    -fPIC
-    -fno-omit-frame-pointer)
-set(GPU_COMMON_FLAGS
-    -fPIC
-    -fno-omit-frame-pointer)
-
-else(NOT WIN32)
-set(COMMON_FLAGS
-    "/w") #disable all warnings.
-
-set(GPU_COMMON_FLAGS
-    "/w") #disable all warnings
-
-endif(NOT WIN32)
-
 else(NOT WIN32)
 set(COMMON_FLAGS
+    -fPIC
+    -fno-omit-frame-pointer
     "/w") #disable all warnings.
 set(GPU_COMMON_FLAGS
+    -fPIC
+    -fno-omit-frame-pointer
     "/w") #disable all warnings
 endif(NOT WIN32)
 
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 814dec4aa473ee5b731d724ce0619117e8488dad..9ab1d1fa28dbca4a5f2eaa587a153293c00860ac 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -48,6 +48,7 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
   VLOG(5) << "destroy ExecutorPrepareContext";
 }
 
+#ifndef _WIN32
 template <typename RefCntMap>
 static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
                                 GarbageCollector<Tensor>* gc,
@@ -82,6 +83,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
     gc->Add(erase_tensors);
   }
 }
+#endif
 
 Executor::Executor(const platform::Place& place) : place_(place) {}
 
@@ -331,97 +333,35 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
 
 std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
     const ProgramDesc& program, int block_id) {
-  VLOG(3) << "before create prepare" << block_id << " " << program.Size();
   std::unique_ptr<ExecutorPrepareContext> ctx(
       new ExecutorPrepareContext(program, block_id));
-  VLOG(3) << "after create prepare";
-  // PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
-  VLOG(3) << "before create op_desc";
+  PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
   auto& block = program.Block(block_id);
-  VLOG(3) << "create before" << ctx->ops_.size() << " "
-          << block.AllOps().size();
-  int counter = 0;
   for (auto& op_desc : block.AllOps()) {
     ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
-    VLOG(3) << "create op "
-            << "index " << ++counter << " type " << op_desc->Type();
   }
-  VLOG(3) << "create finished" << ctx->ops_.size() << " "
-          << block.AllOps().size();
   return ctx;
 }
 
 std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
     const ProgramDesc& program, const std::vector<int>& block_ids) {
-  VLOG(3) << "inside prepare";
   std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
-  VLOG(3) << "before go through block_ids";
   for (auto& bid : block_ids) {
-    VLOG(3) << "block id" << bid;
+    // Check the id before the raw `new` so a failed enforce cannot leak ctx.
+    PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
     auto* ctx = new ExecutorPrepareContext(program, bid);
-    // PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
     auto& block = program.Block(bid);
-    int counter = 0;
-    VLOG(3) << "create before" << ctx->ops_.size() << " "
-            << block.AllOps().size();
     for (auto& op_desc : block.AllOps()) {
       ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
-      VLOG(3) << "create op "
-              << "index " << ++counter << " type " << op_desc->Type();
     }
-    VLOG(3) << "create finished" << ctx->ops_.size() << " "
-            << block.AllOps().size();
     result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
   }
   return result;
 }
 
-// void CheckResult(const std::string op_type, ExecutorPrepareContext* ctx,
-// Scope* local_scope) {
-//     VLOG(3) << "before checking result";
-//   auto& dev_ctx = *platform::DeviceContextPool::Instance().Get(place_);
-//   std::vector<std::string> outputs;
-//   auto& block = ctx->prog_.Block(0);
-//   bool found = false;
-//   framework::OpDesc* myop = nullptr;
-//   for(auto& op : block.AllOps()) {
-//     if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() ==
-//     "feed") return;
-//     if (op->Type() == op_type) {
-//         found = true;
-//         myop = op;
-//         break;
-//       }
-//     }
-//   }
-//   if(!found) {
-//     VLOG(3) << "not found op!";
-//     return;
-//   }
-//     auto* op = myop;
-//      VLOG(3) << "start op output" << op->Type();
-//     for(auto var_name: op->OutputArgumentNames()) {
-//       auto* var = local_scope->Var(var_name);
-//       auto* var_desc = block.FindVar(var_name);
-//       if (var_desc->Persistable()) continue;
-//       auto* tensor = var->GetMutable<framework::LoDTensor>();
-//       framework::Tensor check;
-//       VLOG(3) << "before tensor copy";
-//       framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
-//       VLOG(3) << "after tensor copy";
-//       float sum = .0;
-//       for(size_t i=0; i < check.numel(); ++i) {
-//           sum += check.data<float>()[i];
-//       }
-//       VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum "
-//       << sum;
-//   VLOG(3) << "after checking result";
-// }
-
 void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                                   bool create_local_scope, bool create_vars,
                                   bool keep_kids) {
-  VLOG(3) << "RunPreparedContext inside";
   Scope* local_scope = scope;
   if (create_vars) {
     if (create_local_scope) {
@@ -430,6 +370,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
     CreateVariables(ctx->prog_, local_scope, ctx->block_id_);
   }
 
+#ifndef _WIN32
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr<GarbageCollector<Tensor>> gc;
   // WhileOp would set keep_kids to false
@@ -471,6 +412,16 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   } else {
     platform::DeviceContextPool::Instance().Get(place_)->Wait();
   }
+#else   // WIN32
+  for (auto& op : ctx->ops_) {
+    op->Run(*local_scope, place_);
+    if (FLAGS_benchmark) {
+      VLOG(2) << "Memory used after operator " + op->Type() + " running: "
+              << memory::memory_usage(place_);
+    }
+  }
+  platform::DeviceContextPool::Instance().Get(place_)->Wait();
+#endif  // NOT WIN32
 
   if (local_scope != scope) {
     scope->DeleteScope(local_scope);
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index 36b36d49c2728dbef93042158dffa26d8f56d529..a2a6c6bfb13be8a18e2a65343743f8b3e4518eb1 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -17,12 +17,14 @@ limitations under the License. */
 #include <map>
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/garbage_collector.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
+#ifndef _WIN32
+#include "paddle/fluid/framework/garbage_collector.h"
+#endif
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index b8311623b0ef071aeebc45a5d37d82fd196733dc..7b2f6e5bc622f7e26a5c130be99f653d73f1c142 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -35,9 +35,10 @@ endif()
 
 # Create static library
 if (WIN32)
-cc_library(paddle_fluid DEPS ${fluid_modules} ${fluid_thirdpa} paddle_fluid_api paddle_inference_api)
+cc_library(paddle_fluid DEPS ${fluid_modules} ${fluid_third_partys} paddle_fluid_api paddle_inference_api)
-else(WIND32)
+else(WIN32)
 cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)
+endif(WIN32)
 
 if(NOT APPLE)
   # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index e2027b7cb4d584ffcc48624d2c01e65a61829975..aea75074af2dce776ab343d863652091fd0f7468 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -51,6 +51,7 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)
 
 cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
+cc_library(helper SRCS helper.cc DEPS reset_tensor_array lod_tensor scope)
 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope)
 cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
 cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
index 01ea942d3c8d20180cfc9664b8601ba87a898e86..20fab8078fedf837564496aa296648bf5970a348 100644
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -16,7 +16,6 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle_inference_api.h"
 
 namespace paddle {
 
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index ba9b32de35e10a04216cd5b6aba7ce8e1012e963..eea5689da642ba71c14204e39441295e81fb4e5d 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -260,9 +260,8 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
   if (config.use_gpu) {
     // 1. GPU memeroy
     PADDLE_ENFORCE_GT(
-       config.fraction_of_gpu_memory, 0.f,
-       "fraction_of_gpu_memory in the config should be set to range (0.,
-       1.]");
+        config.fraction_of_gpu_memory, 0.f,
+        "fraction_of_gpu_memory in the config should be set to range (0.,1.]");
     PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
     std::vector<std::string> flags;
     if (config.fraction_of_gpu_memory >= 0.0f ||
diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h
index 4e4ab47ca9c5e37f2714ebd48d250c23c7e9b117..ed3bdd8de7f59b4c086eef48bf5b51da635ab572 100644
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -31,10 +31,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle_inference_api.h"  // NOLINT
 
 namespace paddle {
 
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index e7a5109648b6f03e7f3130efa42e491c0fcb8341..a3f3d67deca5d4d72297c5311b9ca201c45c150a 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -14,8 +14,9 @@
 
 #pragma once
 
+#define GLOG_NO_ABBREVIATED_SEVERITIES
+#define GOOGLE_GLOG_DLL_DECL
 #include <glog/logging.h>
-
 #include <algorithm>
 #include <chrono>  // NOLINT
 #include <iterator>
@@ -23,9 +24,7 @@
 #include <sstream>
 #include <string>
 #include <vector>
-#include "paddle/fluid/string/printf.h"
-#include "paddle_inference_api.h"
-#include "timer.h"
+#include "paddle_inference_api.h"  //NOLINT
 
 namespace paddle {
 namespace inference {
@@ -97,7 +96,7 @@ static void TensorAssignData(PaddleTensor *tensor,
 }
 
 template <typename T>
-static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
+static int ZeroCopyTensorAssignData(paddle::ZeroCopyTensor *tensor,
                                     const std::vector<std::vector<T>> &data) {
   int size{0};
   auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 5c18c46aa6c94be1064fb27a539908f245a4438e..19a8e5f4b3de465670d78f8b6d16753e79f14bad 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -284,12 +284,10 @@ op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
 op_library(max_sequence_len_op DEPS lod_rank_table)
 op_library(sequence_conv_op DEPS context_project)
 op_library(sequence_pool_op DEPS sequence_pooling)
-if (NOT WIN32)
-    op_library(lstm_op DEPS sequence2batch lstm_compute)
-    op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
-    op_library(lstmp_op DEPS sequence2batch lstm_compute)
-    op_library(gru_op DEPS sequence2batch gru_compute)
-endif(NOT WIN32)
+op_library(lstm_op DEPS sequence2batch lstm_compute)
+op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
+op_library(lstmp_op DEPS sequence2batch lstm_compute)
+op_library(gru_op DEPS sequence2batch gru_compute)
 op_library(recurrent_op DEPS executor)
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 op_library(cos_sim_op DEPS cos_sim_functor)
diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
index c82930cc4994c3854e60f40ae9909a90d82cbff6..e70945a2bd1025ba542dbf4da556dc04a3f5b91f 100644
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
@@ -31,12 +31,12 @@ namespace operators {
 
 template <typename T>
 __device__ bool GT_E(T a, T b) {
-  return (a > b) || fabs(a - b) < 1e-4;
+  return (a > b) || fabsf(static_cast<float>(a - b)) < 1e-4;
 }
 
 template <typename T>
 __device__ bool LT_E(T a, T b) {
-  return (a < b) || fabs(a - b) < 1e-4;
+  return (a < b) || fabsf(static_cast<float>(a - b)) < 1e-4;
 }
 
 template <typename T>
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 17b675fba8067851f6149edafcc9096690a3fd34..dcc3520abe90b28f93c7133b4c047fade2eeaff0 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -57,9 +57,6 @@ math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function)
-if (NOT WIN32)
-    math_library(matrix_bit_code)
-endif (NOT WIN32)
 math_library(unpooling)
 math_library(vol2col)
 
@@ -75,7 +72,10 @@ if(WITH_GPU)
 endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
-cc_library(jit_kernel 
-    SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc
-    DEPS cpu_info cblas)
-cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
+if (NOT WIN32)
+    math_library(matrix_bit_code)
+    cc_library(jit_kernel
+        SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc
+        DEPS cpu_info cblas)
+    cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
+endif (NOT WIN32)
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 7c511e20bae8304ee7720f8217a500ff797bfc28..fc365d0948a11c40e431f7304ffcc6db4688dbe6 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -235,7 +235,9 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
             << ", Runtime Version: " << runtime_version_ / 1000 << "."
             << (runtime_version_ % 100) / 10;
 
+#ifndef _WIN32
   callback_manager_.reset(new StreamCallbackManager(stream_));
+#endif  // NOT WIN32
 }
 
 CUDADeviceContext::~CUDADeviceContext() {
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 942e13a724339dc85ed1fc72c11e208ddce36dbb..fcd7529b3112ab78264a10ccc6c08a446c212efc 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -31,7 +31,7 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include "paddle/fluid/platform/stream_callback_manager.h"
 #endif
 #include "unsupported/Eigen/CXX11/Tensor"
@@ -115,6 +115,7 @@ class CUDADeviceContext : public DeviceContext {
     PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
   }
 
+#ifndef _WIN32
   template <typename Callback>
   void AddStreamCallback(Callback&& callback) const {
     std::lock_guard<std::mutex> guard(callback_mtx_);
@@ -125,6 +126,16 @@ class CUDADeviceContext : public DeviceContext {
     std::lock_guard<std::mutex> guard(callback_mtx_);
     callback_manager_->Wait();
   }
+#else
+  template <typename Callback>
+  void AddStreamCallback(Callback&& callback) const {
+    // No-op on Windows (callback is dropped): callback_manager_ is compiled out.
+  }
+
+  void WaitStreamCallback() const {
+    // No-op on Windows: there is no callback manager to wait on.
+  }
+#endif
 
  private:
   CUDAPlace place_;
@@ -143,10 +154,12 @@ class CUDADeviceContext : public DeviceContext {
 
   mutable std::mutex mtx_;
 
+#ifndef _WIN32
   // This lock is only used by callback
   // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes
   mutable std::mutex callback_mtx_;
   std::unique_ptr<StreamCallbackManager> callback_manager_;
+#endif
 };
 
 template <>