diff --git a/CMakeLists.txt b/CMakeLists.txt
index ed704585d8a6bf3befd9a549aa5a62a33fea3da9..4625516458769c26e066863df0c620b58fc0c0f4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,6 +26,7 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
         "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
 if(WIN32)
     set(CMAKE_STATIC_LIBRARY_PREFIX lib)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MT") # link against the static multithreaded CRT; keep the flags one space-separated string
 endif(WIN32)
 
 if(NOT CMAKE_CROSSCOMPILING)
@@ -33,7 +34,6 @@ if(NOT CMAKE_CROSSCOMPILING)
 endif(NOT CMAKE_CROSSCOMPILING)
 find_package(Git REQUIRED)
 find_package(Threads REQUIRED)
-
 include(simd)
 
 ################################ Configurations #######################################
@@ -178,10 +178,10 @@ include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)
-include(external/xxhash)    # download xxhash
 
 if (NOT WIN32)
 # there is no official support of snappystream, warpctc, nccl, cupti in windows
+include(external/xxhash)    # download xxhash
 include(external/snappy)    # download snappy
 include(external/snappystream) # download snappystream
 include(external/warpctc)   # download, build, install warpctc
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index f507bb41a1103c093e9569176ee868cfaac6bf7b..564878131c87afdba249a14f82f19adc67e7876c 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -169,18 +169,21 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 
 # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
 # So, don't set these flags here.
+
 if (NOT WIN32) # windows msvc2015 support c++11 natively. 
-# -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake.
+# -std=c++11 and -fPIC are not recognized by msvc
 list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
-list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+# in cuda9, suppress cuda warning on eigen with "-w"
+list(APPEND CUDA_NVCC_FLAGS "-w" "-Xcompiler -fPIC")
+else(NOT WIN32)
+list(APPEND CUDA_NVCC_FLAGS "-w" "-Xcompiler /w")
 endif(NOT WIN32)
 
 if(WITH_FAST_MATH)
   # Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
   list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
-endif()
-# in cuda9, suppress cuda warning on eigen 
-list(APPEND CUDA_NVCC_FLAGS "-w")
+endif(WITH_FAST_MATH)
+
 # Set :expt-relaxed-constexpr to suppress Eigen warnings
 list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
 
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index cd51533926de7bb132ab7bfab1686d664a331410..813611b032f7dd4f68200ebdb3ac1dc9b8fa5fca 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -48,7 +48,6 @@ find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a
           NO_DEFAULT_PATH
     DOC "Path to cuDNN library.")
 
-
 if(CUDNN_INCLUDE_DIR AND CUDNN_LIBRARY)
     set(CUDNN_FOUND ON)
 else()
@@ -83,7 +82,7 @@ if(CUDNN_FOUND)
 
         if(NOT CUDNN_MAJOR_VERSION)
             set(CUDNN_VERSION "???")
-        else()
+        else()
             math(EXPR CUDNN_VERSION
                 "${CUDNN_MAJOR_VERSION} * 1000 +
                  ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}")
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index ada61de8eb15ae10288ac54f588e9adf84acee37..65f55b64cad8d6f08fa6f53d2bc3c9ebdc31d1e6 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -48,7 +48,7 @@ ExternalProject_Add(
     DOWNLOAD_DIR          ${BOOST_DOWNLOAD_DIR}
     DOWNLOAD_COMMAND      wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz
     && tar zxf ${BOOST_TAR}.tar.gz
-    DOWNLOAD_NO_PROGRESS  1
+    DOWNLOAD_NO_PROGRESS  1
     PREFIX                ${BOOST_SOURCES_DIR}
     CONFIGURE_COMMAND     ""
     BUILD_COMMAND         ""
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index cf58cc39762351f8b37d073bcd218d249285bf52..0d4cecd4de75aaafeae66c1dda1d96d527b02171 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -35,7 +35,9 @@ ExternalProject_Add(
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                     -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
                     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DBUILD_STATIC_LIBS=ON
                     -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
                     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                     -DBUILD_TESTING=OFF
@@ -45,6 +47,10 @@ ExternalProject_Add(
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
+
+ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
+ADD_DEPENDENCIES(gflags extern_gflags)
 IF(WIN32)
   IF(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib")
     add_custom_command(TARGET extern_gflags POST_BUILD
@@ -52,9 +58,6 @@ IF(WIN32)
   )
   ENDIF()
 ENDIF(WIN32)
-ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
-SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
-ADD_DEPENDENCIES(gflags extern_gflags)
 
 LIST(APPEND external_project_dependencies gflags)
 
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index 25ef2970ac52f12f961c9c6d3a589fec4c80983f..a205d4ec7780711927ddf2920e29ee67df60f4d0 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -34,7 +34,6 @@ ELSE()
   SET(GLOG_REPOSITORY "https://github.com/google/glog.git")
   SET(GLOG_TAG "v0.3.5")
 ENDIF()
-
 ExternalProject_Add(
     extern_glog
     ${EXTERNAL_PROJECT_LOG_ARGS}
@@ -46,6 +45,7 @@ ExternalProject_Add(
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                     -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
                     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
                     -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
                     -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index d335298742c73bf1fe44e6a778ab3c142711063d..bfb04916dc9adeb62e7310b9e4d168a74b67a989 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -51,6 +51,7 @@ IF(WITH_TESTING)
                         -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                         -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
                         -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                        -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
                         -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                         -DBUILD_GMOCK=ON
@@ -70,6 +71,5 @@ IF(WITH_TESTING)
     ADD_LIBRARY(gtest_main STATIC IMPORTED GLOBAL)
     SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES})
     ADD_DEPENDENCIES(gtest_main extern_gtest)
-
     LIST(APPEND external_project_dependencies gtest gtest_main)
 ENDIF(WITH_TESTING)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 755dbd610c40c2d9b85d3017b6f000a869b0f39a..abc906d31fa68d86b35c93ce667d4acd8194c6bc 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -124,6 +124,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
 SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
 FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";")
+
 ADD_LIBRARY(cblas STATIC ${dummyfile})
 
 IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 343e44ab4bc21c1a656048b675062f1b897bbc77..a652b844c656ddac3b20557d3b0cc67980a8597e 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -144,11 +144,10 @@ set(GPU_COMMON_FLAGS
     -Wno-error=unused-function  # Warnings in Numpy Header.
     -Wno-error=array-bounds # Warnings in Eigen::array
 )
-
 else(NOT WIN32)
 set(COMMON_FLAGS
     "/w") #disable all warnings.
 set(GPU_COMMON_FLAGS
     "/w") #disable all warnings
 endif(NOT WIN32)
 
@@ -164,8 +163,8 @@ endif(APPLE)
 if(LINUX)
     set(GPU_COMMON_FLAGS
         -Wall
-        -Wextra
         -Werror
+        -Wextra
         ${GPU_COMMON_FLAGS})
 endif(LINUX)
 
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 62227c67849dbb476339a176e0c98e295cbf529c..7421a012a123ab131924c48e57d2a1ee715193af 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -238,6 +238,7 @@ function(cc_library TARGET_NAME)
       # add libxxx.lib prefix in windows
       set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
   endif(WIN32)
+
   if(cc_library_SRCS)
     if(cc_library_SHARED OR cc_library_shared) # build *.so
       add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
@@ -307,7 +308,11 @@ function(cc_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${cc_test_SRCS})
+    if(WIN32) # in windows deps. shlwapi library.
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog shlwapi)
+    else(WIN32)
     target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    endif(WIN32)
     add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
     add_test(NAME ${TARGET_NAME}
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
@@ -378,7 +383,11 @@ function(nv_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
+    if(WIN32)
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog shlwapi)
+    else(WIN32)
     target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    endif(WIN32)
     add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
     add_test(${TARGET_NAME} ${TARGET_NAME})
     if (nv_test_SERIAL)
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index efdb093a7b28e19f3b2a774dd54f2e7f042e9ca7..72ce7070c84089433ca8ac3940baa5e745c9c38c 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -31,10 +31,31 @@ function(copy TARGET)
     foreach(index RANGE ${len})
         list(GET copy_lib_SRCS ${index} src)
         list(GET copy_lib_DSTS ${index} dst)
-        add_custom_command(TARGET ${TARGET} PRE_BUILD
+        if (WIN32) 
+        # windows cmd shell will not expand wildcard automatically.
+        # below expand the files,libs and copy them by rules.
+        # ${src} may itself contain a wildcard, so expand exactly that pattern.
+        # Do not add bare "*.h"/"*.lib"/"*.dll" patterns here: a relative glob
+        # is resolved against the current source dir and matches unrelated files.
+        file(GLOB src_files ${src})
+
+        if (NOT "${src_files}" STREQUAL "")
+        list(REMOVE_DUPLICATES src_files)
+        endif()
+        add_custom_command(TARGET ${TARGET} PRE_BUILD 
+          COMMAND ${CMAKE_COMMAND} -E make_directory  "${dst}"
+          )
+        foreach(src_file ${src_files}) 
+          add_custom_command(TARGET ${TARGET} PRE_BUILD 
+          COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}"
+          COMMENT "copying ${src_file} -> ${dst}")
+        endforeach()
+        else(WIN32) # not windows
+          add_custom_command(TARGET ${TARGET} PRE_BUILD 
           COMMAND mkdir -p "${dst}"
           COMMAND cp -r "${src}" "${dst}"
           COMMENT "copying ${src} -> ${dst}")
+        endif(WIN32)
     endforeach()
 endfunction()
 
@@ -66,13 +87,14 @@ copy(boost_lib
   DSTS ${dst_dir}
   DEPS boost
 )
-
+if(NOT WIN32)
 set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash")
 copy(xxhash_lib
   SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
   DSTS ${dst_dir} ${dst_dir}/lib
   DEPS xxhash
 )
+endif(NOT WIN32)
 
 if(NOT PROTOBUF_FOUND)
     set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
diff --git a/cmake/version.cmake b/cmake/version.cmake
index ac10bdf067be549fe90112aef73fd6e1fbe0ac48..fbf559f76bb73db4f90896817cecab91e5915c78 100644
--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -44,5 +44,5 @@ while ("${PADDLE_VERSION}" STREQUAL "")
   endif()
 endwhile()
 
-add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION})
+add_definitions(-DPADDLE_VERSION="${PADDLE_VERSION}")
 message(STATUS "Paddle version is ${PADDLE_VERSION}")
diff --git a/doc/fluid/dev/contribute_to_paddle_cn.md b/doc/fluid/dev/contribute_to_paddle_cn.md
new file mode 120000
index 0000000000000000000000000000000000000000..bcb71b3da1f1fad3def2d820b6f9b7330a6e4d54
--- /dev/null
+++ b/doc/fluid/dev/contribute_to_paddle_cn.md
@@ -0,0 +1 @@
+../../v2/dev/contribute_to_paddle_cn.md
\ No newline at end of file
diff --git a/doc/fluid/dev/contribute_to_paddle_en.md b/doc/fluid/dev/contribute_to_paddle_en.md
new file mode 120000
index 0000000000000000000000000000000000000000..16679a406334cadce45f7a838773ea8b781c67de
--- /dev/null
+++ b/doc/fluid/dev/contribute_to_paddle_en.md
@@ -0,0 +1 @@
+../../v2/dev/contribute_to_paddle_en.md
\ No newline at end of file
diff --git a/doc/fluid/new_docs/advanced_usage/development/contribute_to_paddle.md b/doc/fluid/new_docs/advanced_usage/development/contribute_to_paddle.md
new file mode 120000
index 0000000000000000000000000000000000000000..9f1af6133fdaa6d82b4e0bda0efaca10c750f80e
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/development/contribute_to_paddle.md
@@ -0,0 +1 @@
+../../../dev/contribute_to_paddle_cn.md
\ No newline at end of file
diff --git a/doc/fluid/new_docs/advanced_usage/development/cpu_profiling_cn.md b/doc/fluid/new_docs/advanced_usage/development/cpu_profiling_cn.md
new file mode 120000
index 0000000000000000000000000000000000000000..8c13564629a4e0851bb7540f23df468a78b7bae2
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/development/cpu_profiling_cn.md
@@ -0,0 +1 @@
+../../../howto/optimization/cpu_profiling_cn.md
\ No newline at end of file
diff --git a/doc/fluid/new_docs/advanced_usage/development/host_memory_profiling_cn.md b/doc/fluid/new_docs/advanced_usage/development/host_memory_profiling_cn.md
new file mode 120000
index 0000000000000000000000000000000000000000..5501686e9823c1b270b46f38c946ddd18d8f5c05
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/development/host_memory_profiling_cn.md
@@ -0,0 +1 @@
+../../../howto/optimization/host_memory_profiling_cn.md
\ No newline at end of file
diff --git a/doc/fluid/new_docs/advanced_usage/development/new_op.md b/doc/fluid/new_docs/advanced_usage/development/new_op.md
new file mode 120000
index 0000000000000000000000000000000000000000..a0d1af57ba6f836cec151501a6fdb68bc3756d8d
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/development/new_op.md
@@ -0,0 +1 @@
+../../../dev/new_op_cn.md
\ No newline at end of file
diff --git a/doc/fluid/new_docs/advanced_usage/development/timeline_cn.md b/doc/fluid/new_docs/advanced_usage/development/timeline_cn.md
new file mode 120000
index 0000000000000000000000000000000000000000..1a782fd363a70b28cff18ef1dc5a2c084b3aa3ae
--- /dev/null
+++ b/doc/fluid/new_docs/advanced_usage/development/timeline_cn.md
@@ -0,0 +1 @@
+../../../howto/optimization/timeline_cn.md
\ No newline at end of file
diff --git a/doc/v2/dev/contribute_to_paddle_en.md b/doc/v2/dev/contribute_to_paddle_en.md
index c97564d93a7f0a753a23cd97d2467d595bd154ff..72723396444c0a6cc0516f6f2379b2d868ba59f7 120000
--- a/doc/v2/dev/contribute_to_paddle_en.md
+++ b/doc/v2/dev/contribute_to_paddle_en.md
@@ -1 +1 @@
-../../../CONTRIBUTING.md
\ No newline at end of file
+../../../CONTRIBUTING.md
\ No newline at end of file
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 8ed0ba1dfa68b3e22f370c3f2dd0f83c3e5506b0..93624b76ec6a0a197e0613fc5017779450bd1dd8 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <algorithm>
+
 #include "paddle/fluid/framework/executor.h"
 
 #include "paddle/fluid/framework/feed_fetch_method.h"
@@ -46,6 +48,7 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
   VLOG(5) << "destroy ExecutorPrepareContext";
 }
 
+#ifndef _WIN32
 template <typename RefCntMap>
 static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
                                 GarbageCollector<Tensor>* gc,
@@ -80,6 +83,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
     gc->Add(erase_tensors);
   }
 }
+#endif
 
 Executor::Executor(const platform::Place& place) : place_(place) {}
 
@@ -367,6 +371,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
     CreateVariables(ctx->prog_, local_scope, ctx->block_id_);
   }
 
+#ifndef _WIN32
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr<GarbageCollector<Tensor>> gc;
   // WhileOp would set keep_kids to false
@@ -408,6 +413,16 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   } else {
     platform::DeviceContextPool::Instance().Get(place_)->Wait();
   }
+#else   // WIN32
+  for (auto& op : ctx->ops_) {
+    op->Run(*local_scope, place_);
+    if (FLAGS_benchmark) {
+      VLOG(2) << "Memory used after operator " + op->Type() + " running: "
+              << memory::memory_usage(place_);
+    }
+  }
+  platform::DeviceContextPool::Instance().Get(place_)->Wait();
+#endif  // NOT WIN32
 
   if (local_scope != scope) {
     scope->DeleteScope(local_scope);
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index 36b36d49c2728dbef93042158dffa26d8f56d529..a2a6c6bfb13be8a18e2a65343743f8b3e4518eb1 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -17,12 +17,14 @@ limitations under the License. */
 #include <map>
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/garbage_collector.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
+#ifndef _WIN32
+#include "paddle/fluid/framework/garbage_collector.h"
+#endif
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc
index 9277abe8c1b79c5f76f4610d0554bf337f329518..30879b1f36e30d429cbbfd6a6a8d8c588ae90333 100644
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
@@ -17,7 +17,12 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
+// msvc15 doesn't support constexpr correctly.
+#if !defined(_WIN32)
 constexpr char Node::kControlDepVarName[];
+#else
+const char Node::kControlDepVarName[] = "__control_var";
+#endif
 int Node::count_ = 0;
 
 std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h
index d6d42f5e92080aa57445e2d6ce59aa3faa89d22d..a3be133344ac9b844fcfd7dd923a75cde3c5ebdd 100644
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -28,7 +28,11 @@ namespace ir {
 class Node {
  public:
   enum class Type { kOperation, kVariable };
+#if !defined(_WIN32)  // msvc not support constexpr correctly.
   static constexpr char kControlDepVarName[] = "__control_var";
+#else
+  static const char kControlDepVarName[];
+#endif
 
   Type NodeType() const { return type_; }
 
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
index 9570c59cff2a6afeb1c607f7219b7b455974d6ce..ddbe0ddc121ca93db6c9d116524b7ae5226bf25b 100644
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/variant.h"
 
 namespace paddle {
@@ -195,6 +196,7 @@ struct PassRegistrar : public Registrar {
                              __test_global_namespace_##uniq_name##__>::value, \
                 msg)
 
+#if !defined(_WIN32)
 // Register a new pass that can be applied on the IR.
 #define REGISTER_PASS(pass_type, pass_class)                          \
   STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(                                \
@@ -217,7 +219,32 @@ struct PassRegistrar : public Registrar {
   extern int TouchPassRegistrar_##pass_type();                        \
   static int use_pass_itself_##pass_type##_ __attribute__((unused)) = \
       TouchPassRegistrar_##pass_type()
+#else
+// windows version of __attribute__((unused))
+#define UNUSED(x) __pragma(warning(suppress : 4100)) x
+#define REGISTER_PASS(pass_type, pass_class)                        \
+  STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(                              \
+      __reg_pass__##pass_type,                                      \
+      "REGISTER_PASS must be called in global namespace");          \
+  static ::paddle::framework::ir::PassRegistrar<pass_class>         \
+      __pass_registrar_##pass_type##__(#pass_type);                 \
+  int TouchPassRegistrar_##pass_type() {                            \
+    __pass_registrar_##pass_type##__.Touch();                       \
+    return 0;                                                       \
+  }                                                                 \
+  static ::paddle::framework::ir::PassRegistrar<pass_class> UNUSED( \
+      &__pass_tmp_registrar_##pass_type##__) =                      \
+      __pass_registrar_##pass_type##__
+
+#define USE_PASS(pass_type)                           \
+  STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(                \
+      __use_pass_itself_##pass_type,                  \
+      "USE_PASS must be called in global namespace"); \
+  extern int TouchPassRegistrar_##pass_type();        \
+  static int UNUSED(use_pass_itself_##pass_type##_) = \
+      TouchPassRegistrar_##pass_type()
 
+#endif  // !_WIN32
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index f1d268548578fea12082e2edb213a3749eccbfaf..dd984445dba147eaa568fe0daa0cc609208b0e00 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -20,6 +20,11 @@ limitations under the License. */
 #include <typeindex>
 #include <vector>
 
+#if defined(_WIN32)
+#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
+#define GOOGLE_GLOG_DLL_DECL
+#endif
+
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/memory/memory.h"
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index e5678cf607a8ff3763e79c1f321a81c021846fb1..ad023ec46c69b7beb3ebfb148ce0645a1c89d6bf 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -16,6 +16,10 @@ cc_library(paddle_fluid_api
     DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) 
 
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
+get_property(fluid_third_partys GLOBAL PROPERTY FLUID_THRID_PARTYS)
+if (WIN32)
+list(APPEND fluid_third_partys gflags glog protobuf cblas)
+endif(WIN32)
 
 # paddle_fluid_origin exclude inference api interface
 cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
@@ -33,7 +37,11 @@ if (WITH_GPU AND TENSORRT_FOUND)
 endif()
 
 # Create static library
+if (WIN32)
+cc_library(paddle_fluid DEPS ${fluid_modules} ${fluid_third_partys} paddle_fluid_api paddle_inference_api)
+else(WIN32)
 cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array)
+endif(WIN32)
 
 if(NOT APPLE)
   # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index e8fb0775b45761f64fd6fd28306c35b76d1e40c4..3242aced39e82099f838a2adb612868ebc47c888 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -26,6 +26,7 @@
 #include <string>
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/variant.h"
 
 namespace paddle {
@@ -102,7 +103,6 @@ struct Argument {
   std::unordered_map<std::string, std::function<void()>> attr_deleters_;
 };
 
-#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
 #define ANALYSIS_ARGUMENT_CHECK_FIELD(field__)               \
   if (UNLIKELY(!(field__))) {                                \
     LOG(ERROR) << "field " << #field__ << " should be set."; \
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index 5151e2b69ac199dea136535ba445e890596f6227..e20ddfa24fcc7c3c7214e6cd62466c13d95ad949 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include <sys/stat.h>
 #include <cstdio>
 #include <fstream>
 #include <string>
@@ -26,6 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/port.h"
 
 namespace paddle {
 namespace inference {
@@ -124,20 +124,6 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) {
   return *var->GetMutable<T>();
 }
 
-static void ExecShellCommand(const std::string &cmd, std::string *message) {
-  char buffer[128];
-  std::shared_ptr<FILE> pipe(popen(cmd.c_str(), "r"), pclose);
-  if (!pipe) {
-    LOG(ERROR) << "error running command: " << cmd;
-    return;
-  }
-  while (!feof(pipe.get())) {
-    if (fgets(buffer, 128, pipe.get()) != nullptr) {
-      *message += buffer;
-    }
-  }
-}
-
 static framework::proto::ProgramDesc LoadProgramDesc(
     const std::string &model_path) {
   std::ifstream fin(model_path, std::ios::in | std::ios::binary);
@@ -159,16 +145,6 @@ static bool FileExists(const std::string &filepath) {
   return exists;
 }
 
-static bool PathExists(const std::string &path) {
-  struct stat statbuf;
-  if (stat(path.c_str(), &statbuf) != -1) {
-    if (S_ISDIR(statbuf.st_mode)) {
-      return true;
-    }
-  }
-  return false;
-}
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 49a9ebe3ddec1e4fd59ae1155a706859e249d25c..5e55acf892718223263e9c689d64316dc6682780 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -24,6 +24,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
 endif()
 
 cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope)
+cc_library(helper SRCS helper.cc DEPS reset_tensor_array lod_tensor scope)
 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope)
 cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor)
 cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api)
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
index 01ea942d3c8d20180cfc9664b8601ba87a898e86..20fab8078fedf837564496aa296648bf5970a348 100644
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -16,7 +16,6 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle_inference_api.h"
 
 namespace paddle {
 
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index d06ab8f8c8e3c0adf4a4074eb1450012126e83ea..27f272f2d82fc7fc854f039b71db6c8343f153ef 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <algorithm>
+#include <fstream>
 #include <map>
 #include <set>
 #include <sstream>
@@ -24,6 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/timer.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -31,16 +33,6 @@ DEFINE_bool(profile, false, "Turn on profiler for fluid");
 DECLARE_int32(paddle_num_threads);
 
 namespace paddle {
-namespace {
-using paddle::inference::Timer;
-
-template <class T>
-std::string num2str(T a) {
-  std::stringstream istr;
-  istr << a;
-  return istr.str();
-}
-}  // namespace
 
 void NativePaddlePredictor::PrepareFeedFetch() {
   for (auto *op : inference_program_->Block(0).AllOps()) {
@@ -63,7 +55,6 @@ void NativePaddlePredictor::PrepareFeedFetch() {
 
 bool NativePaddlePredictor::Init(
     std::shared_ptr<framework::Scope> parent_scope) {
-  VLOG(3) << "Predictor::init()";
 #if !defined(_WIN32)
   if (FLAGS_profile) {
     LOG(WARNING) << "Profiler is actived, might affect the performance";
@@ -91,21 +82,21 @@ bool NativePaddlePredictor::Init(
     paddle::framework::InitDevices(false);
     scope_.reset(new paddle::framework::Scope());
   }
-
   executor_.reset(new paddle::framework::Executor(place_));
-
   // Initialize the inference program
   if (!config_.model_dir.empty()) {
     // Parameters are saved in separate files sited in
     // the specified `dirname`.
     inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(),
                                                  config_.model_dir);
+
   } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
     // All parameters are saved in a single file.
     // The file names should be consistent with that used
     // in Python API `fluid.io.save_inference_model`.
     inference_program_ = paddle::inference::Load(
         executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
+
   } else {
     LOG(ERROR) << "fail to load inference model from " << config_.model_dir;
     return false;
@@ -135,7 +126,7 @@ NativePaddlePredictor::~NativePaddlePredictor() {
 bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
                                 std::vector<PaddleTensor> *output_data,
                                 int batch_size) {
-  VLOG(3) << "Predictor::predict";
+  using Timer = paddle::inference::Timer;
   Timer timer;
   timer.tic();
   // set feed variable
@@ -147,11 +138,9 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
   }
   // Run the inference program
   // if share variables, we need not create variables
-  VLOG(4) << "Run prepared context";
   executor_->RunPreparedContext(ctx_.get(), scope,
                                 false, /* don't create local scope each time*/
                                 false /* don't create variable each time */);
-  VLOG(4) << "Finish prepared context";
   // get fetch variable
   if (!GetFetch(output_data, scope)) {
     LOG(ERROR) << "fail to get fetches";
@@ -166,7 +155,6 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
 }
 
 std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
-  VLOG(3) << "Predictor::clone";
   std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
 
   if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(scope_)) {
@@ -184,7 +172,6 @@ std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
 
 bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                                     framework::Scope *scope) {
-  VLOG(3) << "Predictor::set_feed";
   if (inputs.size() != feeds_.size()) {
     LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
                << inputs.size();
@@ -244,7 +231,6 @@ void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch,
 
 bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
                                      framework::Scope *scope) {
-  VLOG(3) << "Predictor::get_fetch";
   outputs->resize(fetchs_.size());
   for (size_t i = 0; i < fetchs_.size(); ++i) {
     int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
@@ -269,25 +255,22 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
-  VLOG(3) << "create NativePaddlePredictor";
   if (config.use_gpu) {
     // 1. GPU memeroy
     PADDLE_ENFORCE_GT(
         config.fraction_of_gpu_memory, 0.f,
-        "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
+        "fraction_of_gpu_memory in the config should be set to range (0.,1.]");
     PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
     std::vector<std::string> flags;
     if (config.fraction_of_gpu_memory >= 0.0f ||
         config.fraction_of_gpu_memory <= 0.95f) {
       flags.push_back("dummpy");
       std::string flag = "--fraction_of_gpu_memory_to_use=" +
-                         num2str<float>(config.fraction_of_gpu_memory);
+                         std::to_string(config.fraction_of_gpu_memory);
       flags.push_back(flag);
-      VLOG(3) << "set flag: " << flag;
       framework::InitGflags(flags);
     }
   }
-
   std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
   if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
     return nullptr;
diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h
index 4e4ab47ca9c5e37f2714ebd48d250c23c7e9b117..ed3bdd8de7f59b4c086eef48bf5b51da635ab572 100644
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -31,10 +31,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle_inference_api.h"  // NOLINT
 
 namespace paddle {
 
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 49683eab07a2f5bc008272038a27bdb277396284..a742ba71eea5bf3439c1156d97a143ed7febd098 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -6,13 +6,13 @@ option(WITH_STATIC_LIB "Compile demo with static/shared library, default use sta
 option(USE_TENSORRT "Compile demo with TensorRT."   OFF)
 
 macro(safe_set_static_flag)
-    foreach(flag_var
-        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
-        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
-      if(${flag_var} MATCHES "/MD")
-        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
-      endif(${flag_var} MATCHES "/MD")
-    endforeach(flag_var)
+  foreach(flag_var
+      CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+      CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+    if(${flag_var} MATCHES "/MD")
+      string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+    endif(${flag_var} MATCHES "/MD")
+  endforeach(flag_var)
 endmacro()
 
 if (WIN32)
@@ -37,26 +37,25 @@ if(NOT DEFINED DEMO_NAME)
 endif()
 
 
-if(WITH_GPU)
+if(WITH_GPU) # default gpu path
   if(NOT WIN32)
     set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
   else()
     if(CUDA_LIB STREQUAL "")
-    set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64")
+      set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64")
     endif()
   endif(NOT WIN32)
 endif()
 
-include_directories("D:/Paddle/")
 include_directories("${PADDLE_LIB}")
 include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
 include_directories("${PADDLE_LIB}/third_party/install/glog/include")
 include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
 include_directories("${PADDLE_LIB}/third_party/install/xxhash/include")
 if (NOT WIN32)
-include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
-include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
-include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
+  include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
+  include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
+  include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
 endif(NOT WIN32)
 
 include_directories("${PADDLE_LIB}/third_party/boost")
@@ -64,15 +63,15 @@ include_directories("${PADDLE_LIB}/third_party/eigen3")
 
 if (NOT WIN32)
   if (USE_TENSORRT AND WITH_GPU)
-      include_directories("${TENSORRT_INCLUDE_DIR}")
-      link_directories("${TENSORRT_LIB_DIR}")
+    include_directories("${TENSORRT_INCLUDE_DIR}")
+    link_directories("${TENSORRT_LIB_DIR}")
   endif()
 endif(NOT WIN32)
 
 if (NOT WIN32)
-link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
-link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
-link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
+  link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
+  link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
+  link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
 endif(NOT WIN32)
 
 link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
@@ -86,7 +85,7 @@ add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
 if(WITH_MKL)
   include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
   set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
-               ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
+    ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
   set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
   if(EXISTS ${MKLDNN_PATH})
     include_directories("${MKLDNN_PATH}/include")
@@ -99,25 +98,25 @@ endif()
 # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
 if(WITH_STATIC_LIB)
   set(DEPS
-      ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
+    ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
 else()
   set(DEPS
-      ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
+    ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
 endif()
 
 if (NOT WIN32)
-set(EXTERNAL_LIB "-lrt -ldl -lpthread")
-set(DEPS ${DEPS}
+  set(EXTERNAL_LIB "-lrt -ldl -lpthread")
+  set(DEPS ${DEPS}
     ${MATH_LIB} ${MKLDNN_LIB}
     glog gflags protobuf snappystream snappy z xxhash
     ${EXTERNAL_LIB})
 else()
-set(DEPS ${DEPS}
+  set(DEPS ${DEPS}
     ${MATH_LIB} ${MKLDNN_LIB}
     ${CMAKE_STATIC_LIBRARY_PREFIX}glog  ${CMAKE_STATIC_LIBRARY_PREFIX}gflags  ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
     ${EXTERNAL_LIB})
-# NOTE(dzhwinter) shlwapi is deprecated.
-set(DEPS ${DEPS} libcmt shlwapi)
+  # NOTE(dzhwinter) shlwapi will be deprecated.
+  set(DEPS ${DEPS} libcmt shlwapi)
 endif(NOT WIN32)
 
 if(WITH_GPU)
@@ -129,8 +128,8 @@ if(WITH_GPU)
     set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
   else()
     set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} )
-  set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} )
-  set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} )
+    set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} )
+    set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} )
   endif()
 endif()
 
diff --git a/paddle/fluid/inference/api/demo_ci/inference_icnet.cc b/paddle/fluid/inference/api/demo_ci/inference_icnet.cc
new file mode 100644
index 0000000000000000000000000000000000000000..88e220c0b62a7737c614da0e2c46bb5af59f10f9
--- /dev/null
+++ b/paddle/fluid/inference/api/demo_ci/inference_icnet.cc
@@ -0,0 +1,114 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#define GLOG_NO_ABBREVIATED_SEVERITIES
+#define GOOGLE_GLOG_DLL_DECL
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <algorithm>
+#include <chrono>  // NOLINT
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <thread>  // NOLINT
+#include <utility>
+#include <vector>
+#include "paddle/fluid/inference/paddle_inference_api.h"
+
+namespace paddle {
+
+// Returns the GPU predictor configuration used by this demo.
+NativeConfig GetConfig() {
+  NativeConfig config;
+  config.prog_file = "hs_lb_without_bn_cudnn/__model__";
+  config.param_file = "hs_lb_without_bn_cudnn/__params__";
+  // Must be positive: CreatePaddlePredictor enforces
+  // fraction_of_gpu_memory > 0 when use_gpu is set.
+  config.fraction_of_gpu_memory = 0.15;
+  config.use_gpu = true;
+  config.device = 0;
+  return config;
+}
+
+using Time = decltype(std::chrono::high_resolution_clock::now());
+Time TimeNow() { return std::chrono::high_resolution_clock::now(); }
+// Returns the time elapsed from t1 to t2, in milliseconds.
+double TimeDiff(Time t1, Time t2) {
+  typedef std::chrono::microseconds us;
+  auto diff = t2 - t1;
+  us counter = std::chrono::duration_cast<us>(diff);
+  return counter.count() / 1000.0;
+}
+
+// Creates a single all-zero FLOAT32 input tensor of shape
+// {batch_size, 3, height, width}.
+std::vector<PaddleTensor> PrepareData(int batch_size) {
+  const int height = 449;
+  const int width = 581;
+  const size_t num_elements =
+      static_cast<size_t>(batch_size) * 3 * height * width;
+  PaddleTensor tensor;
+  tensor.shape = std::vector<int>({batch_size, 3, height, width});
+  tensor.data.Resize(sizeof(float) * num_elements);
+  std::fill_n(static_cast<float*>(tensor.data.data()), num_elements, 0.0f);
+  tensor.dtype = PaddleDType::FLOAT32;
+  return std::vector<PaddleTensor>(1, tensor);
+}
+
+// Creates thread_num predictors, runs each in its own thread for a fixed
+// number of epochs on the same input, and logs the total elapsed time.
+void TestNaive(int batch_size, int thread_num) {
+  NativeConfig config = GetConfig();
+
+  int num_jobs = thread_num;   // parallel jobs.
+  constexpr int epoches = 10;  // each job run epoches.
+  std::vector<std::thread> threads;
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  for (int tid = 0; tid < num_jobs; ++tid) {
+    // Take the result by value: a non-const lvalue reference cannot bind to
+    // the temporary returned by CreatePaddlePredictor.
+    auto pred = CreatePaddlePredictor<NativeConfig>(config);
+    CHECK(pred != nullptr) << "failed to create predictor for job " << tid;
+    predictors.emplace_back(std::move(pred));
+  }
+
+  // Run() takes the inputs by const reference, so all jobs share one copy.
+  const std::vector<PaddleTensor> feeds = PrepareData(batch_size);
+
+  auto time1 = TimeNow();
+  for (int tid = 0; tid < num_jobs; ++tid) {
+    threads.emplace_back([&, tid]() {
+      auto& predictor = predictors[tid];
+      PaddleTensor tensor_out;
+      std::vector<PaddleTensor> outputs(1, tensor_out);
+      for (int i = 0; i < epoches; i++) {
+        CHECK(predictor->Run(feeds, &outputs));
+        VLOG(3) << "tid : " << tid << " run: " << i << " finished";
+        CHECK_EQ(outputs.size(), 1UL);
+      }
+    });
+  }
+  for (int i = 0; i < num_jobs; ++i) {
+    threads[i].join();
+  }
+  auto time2 = TimeNow();
+  VLOG(3) << "Thread num " << thread_num << " total time cost "
+          << TimeDiff(time1, time2) << " ms";
+}
+}  // namespace paddle
+
+int main(int argc, char** argv) {
+  paddle::TestNaive(1, 1);  // single thread.
+  paddle::TestNaive(1, 5);  // 5 threads.
+  return 0;
+}
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index e46dc1326951f68fd030f2208b9bea1647d0026d..f5c83bcd546d096e5b0df0a2c5ca4e1f00633b5e 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -14,36 +14,22 @@
 
 #pragma once
 
+#define GLOG_NO_ABBREVIATED_SEVERITIES
+#define GOOGLE_GLOG_DLL_DECL
 #include <glog/logging.h>
-#include <sys/time.h>
+#include <algorithm>
 #include <chrono>  // NOLINT
+#include <iterator>
 #include <numeric>
 #include <sstream>
 #include <string>
 #include <vector>
-#include "paddle/fluid/string/printf.h"
-#include "paddle_inference_api.h"
+#include "paddle/fluid/inference/api/timer.h"
+#include "paddle_inference_api.h"  //NOLINT
 
 namespace paddle {
 namespace inference {
 
-// Timer for timer
-class Timer {
- public:
-  std::chrono::high_resolution_clock::time_point start;
-  std::chrono::high_resolution_clock::time_point startu;
-
-  void tic() { start = std::chrono::high_resolution_clock::now(); }
-  double toc() {
-    startu = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double> time_span =
-        std::chrono::duration_cast<std::chrono::duration<double>>(startu -
-                                                                  start);
-    double used_time_ms = static_cast<double>(time_span.count()) * 1000.0;
-    return used_time_ms;
-  }
-};
-
 static void split(const std::string &str, char sep,
                   std::vector<std::string> *pieces) {
   pieces->clear();
diff --git a/paddle/fluid/inference/api/timer.h b/paddle/fluid/inference/api/timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..2df5274dc1f2e7ad8e434f1da9d5ae6aee94c784
--- /dev/null
+++ b/paddle/fluid/inference/api/timer.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <chrono>  // NOLINT
+
+namespace paddle {
+namespace inference {
+
+// Stopwatch: tic() records the start; toc() returns ms elapsed since tic().
+class Timer {
+ public:
+  std::chrono::high_resolution_clock::time_point start;   // set by tic()
+  std::chrono::high_resolution_clock::time_point startu;  // set by toc()
+
+  void tic() { start = std::chrono::high_resolution_clock::now(); }
+  double toc() {
+    startu = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> time_span =
+        std::chrono::duration_cast<std::chrono::duration<double>>(startu -
+                                                                  start);
+    double used_time_ms = static_cast<double>(time_span.count()) * 1000.0;
+    return used_time_ms;
+  }
+};
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index 26ef27c3caafadb4801b0ae52133f6175655ce0a..ce283f0621b164d8b66d281065779fcdb0e48077 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -11,7 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
+#define GLOG_NO_ABBREVIATED_SEVERITIES
+#define GOOGLE_GLOG_DLL_DECL
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
 #include "glog/logging.h"
 
diff --git a/paddle/fluid/memory/detail/meta_cache.cc b/paddle/fluid/memory/detail/meta_cache.cc
index b86e4f38c42a26e155f276f9b73cbed1d0d83f7d..2a283733f5c25b0f666b1067c3e5151ce9f75873 100644
--- a/paddle/fluid/memory/detail/meta_cache.cc
+++ b/paddle/fluid/memory/detail/meta_cache.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#define GLOG_NO_ABBREVIATED_SEVERITIES
+#define GOOGLE_GLOG_DLL_DECL
 #include "glog/logging.h"
 #include "paddle/fluid/memory/detail/memory_block.h"
 #include "paddle/fluid/platform/assert.h"
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 1b96798d23cec34a1863f56c1e4027ce32b2eec5..92849bc2c081a4d3454e7d0a725387a2ee4a5db8 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #define GLOG_NO_ABBREVIATED_SEVERITIES
+#define GOOGLE_GLOG_DLL_DECL
 
 #include "paddle/fluid/memory/detail/system_allocator.h"
 
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 919ad96f7adfc5025d9a8367c467f639c6fe3101..c43f0a21594dbed32bf15a0d7cbe4ad1f7ab2a58 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -86,7 +86,7 @@ function(op_library TARGET)
     # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
     foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
      "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
-      "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
+      "fusion_seqconv_eltadd_relu_op" "hash_op")
         if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
           return()
         endif()
@@ -284,12 +284,10 @@ op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
 op_library(max_sequence_len_op DEPS lod_rank_table)
 op_library(sequence_conv_op DEPS context_project)
 op_library(sequence_pool_op DEPS sequence_pooling)
-if (NOT WIN32)
-    op_library(lstm_op DEPS sequence2batch lstm_compute)
-    op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
-    op_library(lstmp_op DEPS sequence2batch lstm_compute)
-    op_library(gru_op DEPS sequence2batch gru_compute)
-endif(NOT WIN32)
+op_library(lstm_op DEPS sequence2batch lstm_compute)
+op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
+op_library(lstmp_op DEPS sequence2batch lstm_compute)
+op_library(gru_op DEPS sequence2batch gru_compute)
 op_library(recurrent_op DEPS executor)
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 op_library(cos_sim_op DEPS cos_sim_functor)
diff --git a/paddle/fluid/operators/accuracy_op.h b/paddle/fluid/operators/accuracy_op.h
index 803244dd48efc634bf5e654a35cb3dd572842882..8d3313db9687937db9b6f326348fcb273a615d65 100644
--- a/paddle/fluid/operators/accuracy_op.h
+++ b/paddle/fluid/operators/accuracy_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <algorithm>
+
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h
index 8fa0416049f8fa128d7ab61f8350b41960f07263..ea710aaad5cf94aeb0db0d905c13decf742f60f9 100644
--- a/paddle/fluid/operators/cast_op.h
+++ b/paddle/fluid/operators/cast_op.h
@@ -54,6 +54,7 @@ class CastOpKernel : public framework::OpKernel<InT> {
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<framework::Tensor>("X");
     auto* out = context.Output<framework::Tensor>("Out");
+
     framework::VisitDataType(
         static_cast<framework::proto::VarType::Type>(
             context.Attr<int>("out_dtype")),
diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
index c82930cc4994c3854e60f40ae9909a90d82cbff6..e70945a2bd1025ba542dbf4da556dc04a3f5b91f 100644
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
@@ -31,12 +31,12 @@ namespace operators {
 
 template <typename T>
 __device__ bool GT_E(T a, T b) {
-  return (a > b) || fabs(a - b) < 1e-4;
+  return (a > b) || fabsf(static_cast<float>(a - b)) < 1e-4;
 }
 
 template <typename T>
 __device__ bool LT_E(T a, T b) {
-  return (a < b) || fabs(a - b) < 1e-4;
+  return (a < b) || fabsf(static_cast<float>(a - b)) < 1e-4;
 }
 
 template <typename T>
diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h
index 93204216f947e5203863a3493005faa0c03ae4af..29276955fee925c6f4969e6ab78a9cf9080cc914 100644
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include <glog/logging.h>
 #include <algorithm>
 #include <iterator>
 #include <vector>
diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
index 0522a94195786c767194ec727d982a60451e7c62..59f44b112cddddff5ff423f462650615710856a7 100644
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <fstream>
+#include <memory>
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -32,9 +33,15 @@ class LoadCombineOp : public framework::OperatorBase {
                const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     auto load_as_fp16 = Attr<bool>("load_as_fp16");
-
-    std::ifstream fin(filename);
-    PADDLE_ENFORCE(static_cast<bool>(fin),
+    auto format = Attr<std::string>("format");
+    std::unique_ptr<std::ifstream> fin;
+    if (format == "windows") {
+      fin.reset(new std::ifstream(filename,
+                                  std::ios_base::in | std::ios_base::binary));
+    } else {
+      fin.reset(new std::ifstream(filename));
+    }
+    PADDLE_ENFORCE(static_cast<bool>(*fin),
                    "Cannot open file %s for load_combine op", filename);
 
     auto out_var_names = Outputs("Out");
@@ -54,11 +61,11 @@ class LoadCombineOp : public framework::OperatorBase {
       auto *tensor = out_var->GetMutable<framework::LoDTensor>();
 
       // Error checking
-      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
+      PADDLE_ENFORCE(static_cast<bool>(*fin), "Cannot read more from file %s",
                      filename);
 
       // Get data from fin to tensor
-      DeserializeFromStream(fin, tensor, dev_ctx);
+      DeserializeFromStream(*fin, tensor, dev_ctx);
 
       auto in_dtype = framework::ToDataType(tensor->type());
       auto out_dtype =
@@ -103,6 +110,18 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
                          "LoDTensors will be loaded from \"file_path\".")
         .AddCustomChecker(
             [](const std::string &path) { return !path.empty(); });
+    AddAttr<std::string>("format",
+                         R"DOC((windows|linux) Format of the saved model file.
+Windows and Linux use different newline sequences (Windows: \r\n, Linux: \n),
+so a file written in text mode differs between the two platforms.
+If format is "windows", the file is opened in binary mode; such a file can be
+used on both Linux and Windows. If format is "linux", the file is opened in
+the default (text) mode. Note that these two formats are not
+inter-compatible.)DOC")
+        .SetDefault("linux")
+        .AddCustomChecker([](const std::string &s) {
+          return s == "windows" || s == "linux";
+        });
     AddComment(R"DOC(
 LoadCombine Operator.
 
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index 51219504ffa2a778b56351f759e8a8dfb951ad91..e0e2c3dc4fa0af6bd6a58106364e21099d7bc517 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <fstream>
+#include <memory>
 
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -34,8 +35,15 @@ class LoadOp : public framework::OperatorBase {
     // FIXME(yuyang18): We save variable to local file now, but we should change
     // it to save an output stream.
     auto filename = Attr<std::string>("file_path");
-    std::ifstream fin(filename);
-    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
+    auto format = Attr<std::string>("format");
+    std::unique_ptr<std::ifstream> fin;
+    if (format == "windows") {
+      fin.reset(new std::ifstream(filename,
+                                  std::ios_base::in | std::ios_base::binary));
+    } else {
+      fin.reset(new std::ifstream(filename));
+    }
+    PADDLE_ENFORCE(static_cast<bool>(*fin), "Cannot open file %s for load op",
                    filename);
 
     auto out_var_name = Output("Out");
@@ -44,9 +52,9 @@ class LoadOp : public framework::OperatorBase {
                    out_var_name);
 
     if (out_var->IsType<framework::LoDTensor>()) {
-      LoadLodTensor(fin, place, out_var);
+      LoadLodTensor(*fin, place, out_var);
     } else if (out_var->IsType<framework::SelectedRows>()) {
-      LoadSelectedRows(fin, place, out_var);
+      LoadSelectedRows(*fin, place, out_var);
     } else {
       PADDLE_ENFORCE(
           false,
@@ -110,6 +118,18 @@ class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
                          R"(Variable will be loaded from "file_path")")
         .AddCustomChecker(
             [](const std::string &path) { return !path.empty(); });
+    AddAttr<std::string>("format",
+                         R"DOC((windows|linux) Format of the saved model file.
+Windows and Linux use different newline sequences (Windows: \r\n, Linux: \n),
+so a file written in text mode differs between the two platforms.
+If format is "windows", the file is opened in binary mode; such a file can be
+used on both Linux and Windows. If format is "linux", the file is opened in
+the default (text) mode. Note that these two formats are not
+inter-compatible.)DOC")
+        .SetDefault("linux")
+        .AddCustomChecker([](const std::string &s) {
+          return s == "windows" || s == "linux";
+        });
     AddComment(
         "Load operator will load a LoDTensor / SelectedRows variable from disk "
         "file.");
diff --git a/paddle/fluid/operators/lstm_unit_op.h b/paddle/fluid/operators/lstm_unit_op.h
index 4ead9c22934dde6e42f9ede47cc1ddf502948fc4..5d1d667fe1ec9a3d8978ae1dcbb4d92ad24fe96e 100644
--- a/paddle/fluid/operators/lstm_unit_op.h
+++ b/paddle/fluid/operators/lstm_unit_op.h
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+   http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 868a7a706471717ce0c8f268d5eaa6dc4789588c..f2f398b8a1a6e39eae97929f5d757d3a72461c90 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -57,9 +57,6 @@ math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function)
-if (NOT WIN32)
-    math_library(matrix_bit_code)
-endif (NOT WIN32)
 math_library(unpooling)
 math_library(vol2col)
 
@@ -75,7 +72,9 @@ if(WITH_GPU)
 endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
-
+if (NOT WIN32)
+    math_library(matrix_bit_code)
+endif (NOT WIN32)
 set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc)
 set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
 if(WITH_XBYAK)
diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h
index 0aed253c80fc28560716cbcfa70f74ef9c84f9b6..38df5776bfaa54fc4018cd592cc9cf0478132cf2 100644
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -18,10 +18,6 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
 #ifdef PADDLE_WITH_MKLML
 #include "paddle/fluid/platform/dynload/mklml.h"
 #endif
diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h
index b127fbe8c8515e7fe57b07ea1d4291675ec4efca..24df1f93edd85145d703ed3277b0d1ca06e67009 100644
--- a/paddle/fluid/operators/math/detail/activation_functions.h
+++ b/paddle/fluid/operators/math/detail/activation_functions.h
@@ -15,13 +15,10 @@ limitations under the License. */
 #pragma once
 #include <math.h>
 #include <string>
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc
index f976953a245e424e6cb26bbf1cff2f120f84c133..73089a4f0c846122987298b824ccee3f0fe05a5f 100644
--- a/paddle/fluid/operators/math/jit_kernel_blas.cc
+++ b/paddle/fluid/operators/math/jit_kernel_blas.cc
@@ -25,10 +25,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/mklml.h"
 #endif
 
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
index a4861c347e44ad86a066861d3375b556302a84bc..4626ff5cb3ab6d1756119c1cd9ecc9102e8a0cae 100644
--- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
+++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc
@@ -16,9 +16,6 @@ limitations under the License. */
 #include <limits>
 #include <string>
 #include "paddle/fluid/operators/math/jit_kernel_macro.h"
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
 
 namespace paddle {
 namespace operators {
@@ -263,6 +260,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
     }                                                                          \
   }
 
+#ifndef _WIN32  // commented out crf decoding
 #ifdef __AVX__
 INTRIAVX_FLOAT(kEQ8);
 INTRIAVX_FLOAT(kGT8LT16);
@@ -275,6 +273,7 @@ INTRIAVX2_FLOAT(jit::avx2, kGT8LT16);
 INTRIAVX2_FLOAT(jit::avx2, kEQ16);
 INTRIAVX2_FLOAT(jit::avx2, kGT16);
 #endif
+#endif  // WIN32
 #ifdef __AVX512F__
 INTRIAVX2_FLOAT(jit::avx512f, kEQ8);
 INTRIAVX2_FLOAT(jit::avx512f, kGT8LT16);
diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc
index d7c177e6782e19e199542e10e1d62587ee0df4cf..131c226589a0a233f3f78e89dfe56e6f017798d4 100644
--- a/paddle/fluid/operators/math/jit_kernel_exp.cc
+++ b/paddle/fluid/operators/math/jit_kernel_exp.cc
@@ -20,10 +20,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/mklml.h"
 #endif
 
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
 namespace paddle {
 namespace operators {
 namespace math {
@@ -66,14 +62,18 @@ namespace detail {
 
 #ifdef __AVX__
 
+#if defined(_WIN32)
+#define ALIGN32 __declspec(align(32))
+#else
 #define ALIGN32 __attribute__((aligned(32)))
+#endif  // _WIN32
 
 #define _PS256_CONST(Name, Val)                                      \
-  static const float _ps256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \
+  static const float ALIGN32 _ps256_##Name[8] = {Val, Val, Val, Val, \
                                                  Val, Val, Val, Val}
 
 #define _PI256_CONST(Name, Val)                                    \
-  static const int _pi256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \
+  static const int ALIGN32 _pi256_##Name[8] = {Val, Val, Val, Val, \
                                                Val, Val, Val, Val}
 
 _PI256_CONST(0x7f, 0x7f);
@@ -98,7 +98,7 @@ typedef union imm_xmm_union {
 
 #define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \
   {                                         \
-    imm_xmm_union u ALIGN32;                \
+    imm_xmm_union ALIGN32 u;                \
     u.imm = imm_;                           \
     xmm0_ = u.xmm[0];                       \
     xmm1_ = u.xmm[1];                       \
@@ -106,7 +106,7 @@ typedef union imm_xmm_union {
 
 #define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \
   {                                         \
-    imm_xmm_union u ALIGN32;                \
+    imm_xmm_union ALIGN32 u;                \
     u.xmm[0] = xmm0_;                       \
     u.xmm[1] = xmm1_;                       \
     imm_ = u.imm;                           \
@@ -508,12 +508,14 @@ class VTanhKernelImpl : public VTanhKernel<T> {
     vaddbias_->Compute(-1.f, y, y);                                           \
   }
 
+#ifndef _WIN32  // MSVC predefines _WIN32 (not __WIN32); skip AVX kernels there
 #ifdef __AVX__
 INTRI8_FLOAT(jit::avx, detail::ExpAVX);
 INTRI16_FLOAT(jit::avx, detail::ExpAVX);
 INTRI_GT8LT16_FLOAT(jit::avx, detail::ExpAVX);
 INTRI_GT16_FLOAT(jit::avx, detail::ExpAVX);
-#endif
+#endif  // AVX
+#endif  // WIN32
 #ifdef __AVX2__
 INTRI8_FLOAT(jit::avx2, detail::ExpAVX2);
 INTRI16_FLOAT(jit::avx2, detail::ExpAVX2);
diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc
index ba3e917377cf12192a068a9d71238442e12d5e5e..fc6a3caef00765957ce0627d0ae4e6446c1d6355 100644
--- a/paddle/fluid/operators/math/jit_kernel_rnn.cc
+++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc
@@ -18,10 +18,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
 
-#ifdef __AVX__
-#include <immintrin.h>
-#endif
-
 namespace paddle {
 namespace operators {
 namespace math {
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
index c4fccdbf862fda8a599869c30ae598573ca367aa..ddd6b2a531c1f1c05255f0fe89995990b4184763 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/math_function_impl.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/float16.h"
diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu
index 0015fafbc892912424dfa6dbd1778438d384ca19..51da6de26e2a47da2c22a1c2e2e1a9412badc58f 100644
--- a/paddle/fluid/operators/math/sequence_pooling.cu
+++ b/paddle/fluid/operators/math/sequence_pooling.cu
@@ -16,13 +16,12 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence_pooling.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
 namespace operators {
 namespace math {
 
-#define FLT_MAX __FLT_MAX__
-
 template <typename T>
 struct MaxPoolFunctor {
   HOSTDEVICE void operator()(const T* input, const size_t start,
diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc
index e7f1caf4d3a81dc7633139933c6a4c3d51a4e2a0..e18bc17fd642478a03e0a28fa448b19738bc27e0 100644
--- a/paddle/fluid/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
@@ -13,6 +13,7 @@
    limitations under the License. */
 
 #include <algorithm>
+#include <iostream>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/var_type.h"
 
diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc
index 5b05f757c0355ed15617dea925b5d4929fcbfee0..f1cd7c6ff64e43c7c2ddc25e8965e577c357894d 100644
--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <stdint.h>
 #include <fstream>
+#include <memory>
 #include <numeric>
 #include <sstream>
 #include "paddle/fluid/framework/data_type.h"
@@ -41,6 +42,7 @@ class SaveCombineOp : public framework::OperatorBase {
     auto filename = Attr<std::string>("file_path");
     auto overwrite = Attr<bool>("overwrite");
     auto save_as_fp16 = Attr<bool>("save_as_fp16");
+    auto format = Attr<std::string>("format");
 
     bool is_present = FileExists(filename);
     if (is_present && !overwrite) {
@@ -49,8 +51,14 @@ class SaveCombineOp : public framework::OperatorBase {
     }
 
     MkDirRecursively(DirName(filename).c_str());
-    std::ofstream fout(filename);
-    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+    std::unique_ptr<std::ofstream> fout;
+    if (format == "windows") {
+      fout.reset(new std::ofstream(filename,
+                                   std::ios_base::out | std::ios_base::binary));
+    } else {
+      fout.reset(new std::ofstream(filename));
+    }
+    PADDLE_ENFORCE(static_cast<bool>(*fout), "Cannot open %s to write",
                    filename);
 
     auto inp_var_names = Inputs("X");
@@ -86,12 +94,11 @@ class SaveCombineOp : public framework::OperatorBase {
         // copy LoD info to the new tensor
         out.set_lod(tensor.lod());
         framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
-        framework::SerializeToStream(fout, out, dev_ctx);
+        framework::SerializeToStream(*fout, out, dev_ctx);
       } else {
-        framework::SerializeToStream(fout, tensor, dev_ctx);
+        framework::SerializeToStream(*fout, tensor, dev_ctx);
       }
     }
-    fout.close();
   }
 };
 
@@ -124,6 +131,18 @@ to a file on disk.
         "The \"file_path\" where the LoDTensor variables will be saved.")
         .AddCustomChecker(
             [](const std::string &path) { return !path.empty(); });
+    AddAttr<std::string>("format",
+                         R"DOC((windows|linux) Saved model file format.
+Windows and Linux use different newline sequences in text mode:
+Windows writes \r\n while Linux writes \n.
+If format is "windows", the model file is opened in binary mode, so the
+saved file can be used on both Linux and Windows. If format is "linux",
+the file is opened in the platform's default (text) mode. Note that the
+two formats are not inter-compatible.)DOC")
+        .SetDefault("linux")
+        .AddCustomChecker([](const std::string &s) {
+          return s == "windows" || s == "linux";
+        });
   }
 };
 
diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc
index e79cffcf498c52ed14db235f6221cfdf08399c9d..9eea9e1a9517e84edcb11695ca33c5b7bfdc66f1 100644
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <stdint.h>
 #include <fstream>
+#include <memory>
 #include <numeric>
 
 #include "paddle/fluid/framework/data_type.h"
@@ -64,6 +65,7 @@ class SaveOp : public framework::OperatorBase {
                      framework::Variable *var) const {
     auto filename = Attr<std::string>("file_path");
     auto overwrite = Attr<bool>("overwrite");
+    auto format = Attr<std::string>("format");
 
     if (FileExists(filename) && !overwrite) {
       PADDLE_THROW("%s is existed, cannot save to it when overwrite=false",
@@ -80,8 +82,14 @@ class SaveOp : public framework::OperatorBase {
 
     // FIXME(yuyang18): We save variable to local file now, but we should change
     // it to save an output stream.
-    std::ofstream fout(filename);
-    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+    std::unique_ptr<std::ofstream> fout;
+    if (format == "windows") {
+      fout.reset(new std::ofstream(filename,
+                                   std::ios_base::out | std::ios_base::binary));
+    } else {
+      fout.reset(new std::ofstream(filename));
+    }
+    PADDLE_ENFORCE(static_cast<bool>(*fout), "Cannot open %s to write",
                    filename);
 
     auto save_as_fp16 = Attr<bool>("save_as_fp16");
@@ -95,11 +103,10 @@ class SaveOp : public framework::OperatorBase {
       framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
       // copy LoD info to the new tensor
       out.set_lod(tensor.lod());
-      framework::SerializeToStream(fout, out, dev_ctx);
+      framework::SerializeToStream(*fout, out, dev_ctx);
     } else {
-      framework::SerializeToStream(fout, tensor, dev_ctx);
+      framework::SerializeToStream(*fout, tensor, dev_ctx);
     }
-    fout.close();
   }
 
   void SaveSelectedRows(const framework::Scope &scope,
@@ -110,6 +117,7 @@ class SaveOp : public framework::OperatorBase {
         lt_var != nullptr,
         "Can not find variable kLookupTablePath for SaveSelectedRows");
     std::string filename = lt_var->data();
+    auto format = Attr<std::string>("format");
     VLOG(4) << "SaveSelectedRows get File name: " << filename;
 
     MkDirRecursively(DirName(filename).c_str());
@@ -122,11 +130,16 @@ class SaveOp : public framework::OperatorBase {
 
     // FIXME(yuyang18): We save variable to local file now, but we should change
     // it to save an output stream.
-    std::ofstream fout(filename);
-    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+    std::unique_ptr<std::ofstream> fout;
+    if (format == "windows") {
+      fout.reset(new std::ofstream(filename,
+                                   std::ios_base::out | std::ios_base::binary));
+    } else {
+      fout.reset(new std::ofstream(filename));
+    }
+    PADDLE_ENFORCE(static_cast<bool>(*fout), "Cannot open %s to write",
                    filename);
-    framework::SerializeToStream(fout, selectedRows, dev_ctx);
-    fout.close();
+    framework::SerializeToStream(*fout, selectedRows, dev_ctx);
   }
 };
 
@@ -154,6 +167,18 @@ This operator will serialize and write LoDTensor / SelectedRows variable to file
                          "The \"file_path\" where the variable will be saved.")
         .AddCustomChecker(
             [](const std::string &path) { return !path.empty(); });
+    AddAttr<std::string>("format",
+                         R"DOC((windows|linux) Saved model file format.
+Windows and Linux use different newline sequences in text mode:
+Windows writes \r\n while Linux writes \n.
+If format is "windows", the variable file is opened in binary mode, so the
+saved file can be used on both Linux and Windows. If format is "linux",
+the file is opened in the platform's default (text) mode. Note that the
+two formats are not inter-compatible.)DOC")
+        .SetDefault("linux")
+        .AddCustomChecker([](const std::string &s) {
+          return s == "windows" || s == "linux";
+        });
   }
 };
 
diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc
index 767449cde981e5925b7144ff1038560c67651f3e..cfe491f4c59b71fef5381d8caabefdebfd2fd719 100644
--- a/paddle/fluid/operators/split_lod_tensor_op.cc
+++ b/paddle/fluid/operators/split_lod_tensor_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/port.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
index 673f86da76ee0712b4d941f5b33594f89926b973..f30668fd21e5cf1b000e273cf853f0e07527f6f5 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -34,7 +34,7 @@ namespace operators {
 using FluidDT = framework::proto::VarType_Type;
 using TRT_DT = nvinfer1::DataType;
 
-namespace {
+namespace {  // NOLINT
 
 TRT_DT FluidDataType2TRT(FluidDT type) {
   switch (type) {
@@ -60,7 +60,7 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
   return nvinfer1::DimsCHW(shape[1], 1, 1);
 }
 
-}  // namespace
+}  // NOLINT  // namespace
 
 using inference::Singleton;
 using inference::tensorrt::TRT_EngineManager;
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index 6810a1651a14cdb2080af846b21cad242b70bf35..bc0204e579d717062eb9754d1d0531649d119c04 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -16,6 +16,18 @@ limitations under the License. */
 
 #include <stddef.h>
 
+#ifdef _WIN32
+#if defined(__AVX2__)
+#include <immintrin.h>  //avx2
+#elif defined(__AVX__)
+#include <intrin.h>  //avx
+#endif               // AVX
+#else                // WIN32
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+#endif  // WIN32
+
 namespace paddle {
 namespace platform {
 
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index 07bb02be1962f758e50cab1f27de43e89f3953c3..0ec3a2a8595f743d77511b8cb81896608e2b4718 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -59,6 +59,7 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
 #define CUDNN_VERSION_MIN(major, minor, patch) \
   (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch)))
 
+#if !defined(_WIN32)
 #define CUDNN_ENFORCE(condition)                                     \
   do {                                                               \
     cudnnStatus_t status = condition;                                \
@@ -66,6 +67,16 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
       PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \
     }                                                                \
   } while (false)
+#else
+// windows
+#define CUDNN_ENFORCE(condition)                                    \
+  do {                                                              \
+    cudnnStatus_t status = condition;                               \
+    if (status != CUDNN_STATUS_SUCCESS) {                           \
+      std::cerr << ::paddle::platform::cudnnGetErrorString(status); \
+    }                                                               \
+  } while (false)
+#endif
 
 enum class DataLayout {  // Not use
   kNHWC,
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index ff49a1d57fd977a6d6b4502b44e48aad34cde872..b95e25e2c146daf2ed78c85f6a4315b1acf962bd 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -55,7 +55,6 @@ DeviceContextPool::DeviceContextPool(
   for (auto& p : places) {
     set.insert(p);
   }
-
   for (auto& p : set) {
     if (platform::is_cpu_place(p)) {
 #ifdef PADDLE_WITH_MKLDNN
@@ -205,7 +204,9 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
                           << ", Runtime Version: " << runtime_version_ / 1000
                           << "." << (runtime_version_ % 100) / 10;
 
+#ifndef _WIN32
   callback_manager_.reset(new StreamCallbackManager(stream_));
+#endif  // NOT WIN32
 }
 
 CUDADeviceContext::~CUDADeviceContext() {
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index df248f9bb15591d5015ad01278797ec7e31ef9d1..51cac83961d7571727e6b6d41f964ec60500910f 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -32,7 +32,7 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include "paddle/fluid/platform/stream_callback_manager.h"
 #endif
 #include "unsupported/Eigen/CXX11/Tensor"
@@ -173,6 +173,7 @@ class CUDADeviceContext : public DeviceContext {
     PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
   }
 
+#ifndef _WIN32
   template <typename Callback>
   void AddStreamCallback(Callback&& callback) const {
     std::lock_guard<std::mutex> guard(callback_mtx_);
@@ -183,6 +184,16 @@ class CUDADeviceContext : public DeviceContext {
     std::lock_guard<std::mutex> guard(callback_mtx_);
     callback_manager_->Wait();
   }
+#else
+  template <typename Callback>
+  void AddStreamCallback(Callback&& callback) const {
+    // ugly empty functor.
+  }
+
+  void WaitStreamCallback() const {
+    // ugly empty functor.
+  }
+#endif
 
  private:
   CUDAPlace place_;
@@ -201,10 +212,12 @@ class CUDADeviceContext : public DeviceContext {
 
   mutable std::mutex mtx_;
 
+#ifndef _WIN32
   // This lock is only used by callback
   // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes
   mutable std::mutex callback_mtx_;
   std::unique_ptr<StreamCallbackManager> callback_manager_;
+#endif
 };
 
 template <>
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index a251bfcd9914422cb6300adbbcdef3dfa79f441c..23f64170eb2e936ca3af1c467c5d4a621045528d 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -127,7 +127,7 @@ struct EOFException : public std::exception {
 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
 #else
 // there is no equivalent intrinsics in msvc.
-#define UNLIKELY(condition) (condition == 0)
+#define UNLIKELY(condition) (static_cast<bool>(condition))  // no hint, same truth value
 #endif
 
 #if !defined(_WIN32)
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 2211e5504373b4a30e5fda0db22a41bdcd9f2421..e373a34d1e84a9eb3d54079fce497b5d0cc27502 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -167,7 +167,9 @@ void InitGLOG(const std::string &prog_name) {
   // glog will not hold the ARGV[0] inside.
   // Use strdup to alloc a new string.
   google::InitGoogleLogging(strdup(prog_name.c_str()));
+#if !defined(_WIN32)
   google::InstallFailureSignalHandler();
+#endif
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/platform/macros.h b/paddle/fluid/platform/macros.h
index 32b7efc04c1f2ecc22f93c08387aec69ded4930a..906ed6e8258d4157762d1cc2f26de2d6da1707e1 100644
--- a/paddle/fluid/platform/macros.h
+++ b/paddle/fluid/platform/macros.h
@@ -28,3 +28,16 @@ limitations under the License. */
 #if defined(__FLT_MAX__)
 #define FLT_MAX __FLT_MAX__
 #endif  // __FLT_MAX__
+
+#ifdef _WIN32
+#if defined(PADDLE_COMPILE)
+// by default, msvc has predefined macro _LIB for static library
+// only shared library need to export and import symbols
+// static library export all symbols by default.
+#define PADDLE_DLL __declspec(dllexport)
+#else
+#define PADDLE_DLL __declspec(dllimport)
+#endif
+#else
+#define PADDLE_DLL
+#endif
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index cf9f4aa95bc1cb79d95b79331fbc09e11af64194..8f1e3bdd317507c246b3851e200e4acf7c58f675 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -15,12 +15,13 @@
 #pragma once
 
 #include <cstdio>
-#include <stdexcept>
-
 #include <memory>
+#include <memory>  // NOLINT
+#include <stdexcept>
 #include <string>
 
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
+#define GOOGLE_GLOG_DLL_DECL
 #include "glog/logging.h"
 
 #if !defined(_WIN32)
@@ -61,7 +62,6 @@ static void *dlopen(const char *filename, int flag) {
   }
   return reinterpret_cast<void *>(hModule);
 }
-
 #endif  // !_WIN32
 
 static void ExecShellCommand(const std::string &cmd, std::string *message) {