Commit 5993155d authored by wanghaoshuang

Merge remote-tracking branch 'dzhwinter/windows/support' into windows/support

...
@@ -43,13 +43,13 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
 cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
 if(WITH_GPU)
-  if (WIN32)
-    windows_symbolic(tensor_util SRCS tensor_util.cu)
-    nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
-    add_dependencies(tensor tensor_util)
-  else()
+  # // if (WIN32)
+  # // windows_symbolic(tensor_util SRCS tensor_util.cu)
+  # // nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
+  # // add_dependencies(tensor tensor_util)
+  # // else()
   nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
-  endif(WIN32)
+  # endif(WIN32)
 else()
   cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
 endif()
...
@@ -93,15 +93,15 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
         DEPS operator op_registry device_context math_function)
 if(WITH_GPU)
-  if (WIN32)
-    # windows treat symbolic file as a real file, which is different with unix
-    # We create a hidden file and compile it instead of origin source file.
-    windows_symbolic(hidden_file SRCS data_type_transform.cu)
-    nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
-    add_dependencies(data_type_transform hidden_file)
-  else()
+  # if (WIN32)
+  # # windows treat symbolic file as a real file, which is different with unix
+  # # We create a hidden file and compile it instead of origin source file.
+  # windows_symbolic(hidden_file SRCS data_type_transform.cu)
+  # nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
+  # add_dependencies(data_type_transform hidden_file)
+  # else()
   nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
-  endif(WIN32)
+  # endif(WIN32)
   nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
 else()
   cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
...
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-data_type_transform.cc
\ No newline at end of file
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace framework {
template <typename InType, typename OutType>
struct CastDataTypeFunctor {
HOSTDEVICE inline OutType operator()(InType in) const {
return static_cast<OutType>(in);
}
};
template <typename InType>
struct CastDataType {
CastDataType(const framework::Tensor& in, framework::Tensor* out,
const platform::DeviceContext* ctx)
: in_(in), out_(out), ctx_(ctx) {}
const framework::Tensor in_;
framework::Tensor* out_;
const platform::DeviceContext* ctx_;
template <typename OutType>
void apply() {
auto* in_begin = in_.data<InType>();
auto* in_end = in_begin + in_.numel();
auto* out_begin = out_->mutable_data<OutType>(in_.place());
if (platform::is_cpu_place(in_.place())) {
platform::Transform<platform::CPUDeviceContext> trans;
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>());
#ifdef __NVCC__
} else if (platform::is_gpu_place(in_.place())) {
platform::Transform<platform::CUDADeviceContext> trans;
auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>());
context->Wait();
#endif
} else {
PADDLE_THROW("Unsupported place!");
}
}
};
void TransDataType(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_type, const Tensor& in,
Tensor* out) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
out->Resize(in.dims());
auto src_type = kernel_type_for_var.data_type_;
auto dst_type = expected_kernel_type.data_type_;
auto ctx = pool.Get(in.place());
switch (src_type) {
case proto::VarType::FP16:
framework::VisitDataType(dst_type,
CastDataType<platform::float16>(in, out, ctx));
break;
case proto::VarType::FP32:
framework::VisitDataType(dst_type, CastDataType<float>(in, out, ctx));
break;
case proto::VarType::FP64:
framework::VisitDataType(dst_type, CastDataType<double>(in, out, ctx));
break;
case proto::VarType::INT32:
framework::VisitDataType(dst_type, CastDataType<int>(in, out, ctx));
break;
case proto::VarType::INT64:
framework::VisitDataType(dst_type, CastDataType<int64_t>(in, out, ctx));
break;
case proto::VarType::BOOL:
framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
break;
case proto::VarType::INT16:
framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
break;
case proto::VarType::UINT8:
framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
break;
default:
PADDLE_THROW("Not support type %d", src_type);
}
}
} // namespace framework
} // namespace paddle
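
For reference, a minimal usage sketch of the TransDataType path above, casting an FP32 CPU tensor to FP64. This is illustrative only and not part of the commit; the OpKernelType constructor arguments are assumed from the fields the function reads (data_type_ and the tensor's place).

// Illustrative sketch only; assumes framework::OpKernelType(data_type, place)
// and Tensor::mutable_data<T>(ddim, place) as available in this tree.
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/tensor.h"

void CastFp32ToFp64Example() {
  namespace fw = paddle::framework;
  fw::Tensor in, out;
  paddle::platform::CPUPlace cpu;

  // Fill a small FP32 tensor on the CPU.
  float* src = in.mutable_data<float>(fw::make_ddim({2, 3}), cpu);
  for (int64_t i = 0; i < in.numel(); ++i) src[i] = static_cast<float>(i);

  // Describe the source and destination kernel types, then convert.
  fw::OpKernelType fp32_type(fw::proto::VarType::FP32, cpu);
  fw::OpKernelType fp64_type(fw::proto::VarType::FP64, cpu);
  fw::TransDataType(fp32_type, fp64_type, in, &out);
  // out now has the same shape as in and holds the values as double.
}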
...
@@ -17,8 +17,11 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 namespace ir {
+#if !defined(_WIN32)
-constexpr char Node::kControlDepVarName[];
+constexpr char Node::kControlDepVarName[] = "__control_var";
+#else
+const char Node::kControlDepVarName[] = "__control_var";
+#endif
 int Node::count_ = 0;
 }  // namespace ir
 }  // namespace framework
...
...
@@ -27,7 +27,11 @@ namespace ir {
 class Node {
  public:
   enum class Type { kOperation, kVariable };
+#if !defined(_WIN32)  // msvc not support constexpr correctly.
   static constexpr char kControlDepVarName[] = "__control_var";
+#else
+  static const char kControlDepVarName[];
+#endif
   explicit Node(const std::string& name, Type type)
       : name_(name),
...
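
As a side note, the pattern in the two node.* hunks above can be reproduced in a standalone snippet. This is a generic C++ illustration, not Paddle code: a constexpr static array declared in a class still needs an out-of-class definition before C++17, and the MSVC toolchain targeted here handled that definition poorly (per the "msvc not support constexpr correctly" comment), so the Windows branch falls back to a plain const array defined in the .cc file.

// Generic illustration, not part of the commit.
#include <cstdio>

struct NodeLike {
#if !defined(_WIN32)
  static constexpr char kName[] = "__control_var";  // in-class initializer
#else
  static const char kName[];  // declared here, defined below
#endif
};

// Out-of-class definition (required pre-C++17 when the member is odr-used).
#if !defined(_WIN32)
constexpr char NodeLike::kName[];
#else
const char NodeLike::kName[] = "__control_var";
#endif

int main() {
  std::printf("%s\n", NodeLike::kName);
  return 0;
}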
...@@ -689,7 +689,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -689,7 +689,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
auto expected_kernel_key = auto expected_kernel_key =
this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key; VLOG(3) << "expected_kernel_key: " << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key); auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
......
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-tensor_util.cc
\ No newline at end of file
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/tensor_util.h"
#include <algorithm>
#include <limits>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
namespace paddle {
namespace framework {
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx, Tensor* dst) {
VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
<< dst_place;
src.check_memory_size();
dst->Resize(src.dims());
dst->set_layout(src.layout());
auto src_place = src.place();
auto src_ptr = src.data<void>();
auto dst_ptr = dst->mutable_data(dst_place, src.type());
auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
if (platform::is_same_place(src_place, dst_place)) {
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
} else {
if (platform::is_same_place(ctx_place, src_place)) {
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
platform::DeviceContextPool::Instance().Get(src.place())->Wait();
} else if (platform::is_same_place(ctx_place, dst_place)) {
platform::DeviceContextPool::Instance().Get(src.place())->Wait();
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
stream);
} else {
PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place.");
}
}
}
#endif
}
void TensorCopy(const Tensor& src, const platform::Place& dst_place,
Tensor* dst) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx;
if (platform::is_gpu_place(dst_place)) {
dev_ctx = pool.Get(dst_place);
} else {
dev_ctx = pool.Get(src.place());
}
TensorCopy(src, dst_place, *dev_ctx, dst);
}
void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
Tensor* dst) {
VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place()
<< " to " << dst_place;
src.check_memory_size();
dst->Resize(src.dims());
dst->set_layout(src.layout());
auto src_place = src.place();
auto src_ptr = src.data<void>();
auto dst_ptr = dst->mutable_data(dst_place, src.type());
auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
} else if (platform::is_gpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
}
#endif
}
template <typename Predicate, typename DevCtx>
struct AnyDTypeVisitor {
Predicate predicate_;
const Tensor& tensor_;
const DevCtx& ctx_;
Tensor* out_;
AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx,
Tensor* out)
: predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
template <typename T>
void apply() const {
auto t = EigenVector<T>::Flatten(tensor_);
auto o = EigenScalar<bool>::From(*out_);
// return any of predicate_(t) is true.
o.device(*ctx_.eigen_device()) = predicate_(t).any();
}
};
template <typename Predicate, typename DevCtx>
inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
const DevCtx& ctx, framework::Tensor* out) {
VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor<Predicate, DevCtx>(
predicate, tensor, ctx, out));
}
template <typename Predicate>
struct AnyVisitor : public boost::static_visitor<bool> {
const framework::Tensor& tensor_;
Predicate predicate_;
AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
: tensor_(tensor), predicate_(std::move(predicate)) {}
template <typename Place>
bool operator()(const Place& place) const {
framework::Tensor out;
out.Resize({1});
out.mutable_data<bool>(place);
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
AnyImpl(predicate_, tensor_, *ctx, &out);
return this->GetResult(out, place);
}
bool GetResult(const framework::Tensor& out,
const platform::CUDAPlace& gpu) const {
platform::CPUPlace cpu;
framework::Tensor tmp;
tmp.Resize({1});
tmp.mutable_data<bool>(cpu);
auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
gpuctx->Wait();
TensorCopy(out, cpu, *gpuctx, &tmp);
gpuctx->Wait();
return GetResult(tmp, cpu);
}
bool GetResult(const framework::Tensor& out,
const platform::CPUPlace& cpu) const {
return *out.data<bool>();
}
bool GetResult(const framework::Tensor& out,
const platform::CUDAPinnedPlace& cpu) const {
return *out.data<bool>();
}
};
template <typename Predicate>
inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
AnyVisitor<Predicate> visitor(tensor, predicate);
auto place = tensor.place();
return platform::VisitPlace(place, visitor);
}
struct ContainsNANPredicate {
template <typename T>
auto operator()(const T& eigen_vec) const
-> decltype(std::declval<T>().isnan()) {
// Cast eigen_vector to vector of bool. true if it is NaN.
return eigen_vec.isnan();
}
};
bool TensorContainsNAN(const framework::Tensor& tensor) {
ContainsNANPredicate predicate;
return Any(tensor, predicate);
}
struct ContainsInfPredicate {
template <typename T>
auto operator()(const T& eigen_vec) const
-> decltype(std::declval<T>().isinf()) {
// Cast eigen_vector to vector of bool. true if is inf.
return eigen_vec.isinf();
}
};
bool TensorContainsInf(const framework::Tensor& tensor) {
ContainsInfPredicate predicate;
return Any(tensor, predicate);
}
void TensorToStream(std::ostream& os, const Tensor& tensor,
const platform::DeviceContext& dev_ctx) {
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char*>(&version), sizeof(version));
}
{ // the 2nd field, tensor description
// int32_t size
// void* protobuf message
proto::VarType::TensorDesc desc;
desc.set_data_type(framework::ToDataType(tensor.type()));
auto dims = framework::vectorize(tensor.dims());
auto* pb_dims = desc.mutable_dims();
pb_dims->Resize(static_cast<int>(dims.size()), 0);
std::copy(dims.begin(), dims.end(), pb_dims->begin());
int32_t size = desc.ByteSize();
os.write(reinterpret_cast<const char*>(&size), sizeof(size));
auto out = desc.SerializeAsString();
os.write(out.data(), size);
}
{ // the 3rd field, tensor data
uint64_t size = tensor.numel() * framework::SizeOfType(tensor.type());
auto* data_ptr = tensor.data<void>();
PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
"Index overflow when writing tensor");
if (platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void*>(data), size_to_write,
gpu_dev_ctx.stream());
gpu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
os.write(static_cast<const char*>(data_ptr),
static_cast<std::streamsize>(size));
}
}
}
struct DeserializedDataFunctor {
DeserializedDataFunctor(void** buf, Tensor* tensor,
const platform::Place& place)
: buf_(buf), tensor_(tensor), place_(place) {}
template <typename T>
void apply() {
*buf_ = tensor_->mutable_data<T>(place_);
}
void** buf_;
Tensor* tensor_;
platform::Place place_;
};
void TensorFromStream(std::istream& is, Tensor* tensor,
const platform::DeviceContext& dev_ctx) {
uint32_t version;
is.read(reinterpret_cast<char*>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
proto::VarType::TensorDesc desc;
{ // int32_t size
// proto buffer
int32_t size;
is.read(reinterpret_cast<char*>(&size), sizeof(size));
std::unique_ptr<char[]> buf(new char[size]);
is.read(reinterpret_cast<char*>(buf.get()), size);
PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
"Cannot parse tensor desc");
}
{ // read tensor
std::vector<int64_t> dims;
dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void* buf;
auto ctx = platform::CPUDeviceContext();
size_t size =
tensor->numel() *
framework::SizeOfType(framework::ToTypeIndex(desc.data_type()));
if (platform::is_gpu_place(dev_ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(dims));
framework::VisitDataType(
desc.data_type(),
DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
is.read(static_cast<char*>(buf), size);
auto dst_place = dev_ctx.GetPlace();
framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
framework::VisitDataType(
desc.data_type(),
DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
is.read(static_cast<char*>(buf), size);
}
}
}
} // namespace framework
} // namespace paddle
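
A short usage sketch of the helpers defined in this file (illustrative only, not part of the commit): copy a CPU tensor synchronously and run the NaN/Inf checks.

// Illustrative sketch only.
#include "paddle/fluid/framework/tensor_util.h"

void TensorUtilExample() {
  namespace fw = paddle::framework;
  fw::Tensor src, dst;
  paddle::platform::CPUPlace cpu;

  float* p = src.mutable_data<float>(fw::make_ddim({4}), cpu);
  for (int i = 0; i < 4; ++i) p[i] = 1.0f;

  // Synchronous copy; the caller does not need to supply a DeviceContext.
  fw::TensorCopySync(src, cpu, &dst);

  // Both checks visit the data with the Any() machinery above.
  bool has_nan = fw::TensorContainsNAN(dst);  // false for this data
  bool has_inf = fw::TensorContainsInf(dst);  // false for this data
  (void)has_nan;
  (void)has_inf;
}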
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * This file contains a simple demo for how to take a model for inference.
- */
-#include <cassert>
-#include <cctype>
-#include <algorithm>
-#include <fstream>
-#include <iostream>
-#include <iterator>
-#include <memory>
-#include <sstream>
-#include <string>
-#include <thread>  //NOLINT
-#include "paddle/fluid/inference/paddle_inference_api.h"
-
-std::string MODELDIR = "";  /* "Directory of the inference model." */  // NOLINT
-std::string REFER = "";
-/*"path to reference result for comparison."*/  //NOTLINT
-/*path of data; each line is a record, format:
-<space splitted floats as data>\t<space splitted ints as shape>
-Please check the demo data of data.txt for details.
-*/
-std::string DATA = "";
-bool USE_GPU = true;  /*"Whether use gpu."*/
-
-auto message_err = []()
-{
-    std::cout << "Copyright (c) 2018 PaddlePaddle Authors." << std::endl;
-    std::cout << "Demo Case for windows inference. "
-              << "\n"
-              << "Usage: Input your model path and use_gpu as the guide requires,"
-              << "then run the demo inference, and will get a result."
-              << std::endl;
-    std::cout << std::endl;
-};
-
-namespace paddle
-{
-namespace demo
-{
-void split(const std::string& str, char sep,
-           std::vector<std::string>* pieces)
-{
-    pieces->clear();
-    if (str.empty())
-    {
-        return;
-    }
-    size_t pos = 0;
-    size_t next = str.find(sep, pos);
-    while (next != std::string::npos)
-    {
-        pieces->push_back(str.substr(pos, next - pos));
-        pos = next + 1;
-        next = str.find(sep, pos);
-    }
-    if (!str.substr(pos).empty())
-    {
-        pieces->push_back(str.substr(pos));
-    }
-}
-
-/*
- * Get a summary of a PaddleTensor content.
- */
-std::string SummaryTensor(const PaddleTensor& tensor)
-{
-    std::stringstream ss;
-    int num_elems = tensor.data.length() / PaddleDtypeSize(tensor.dtype);
-
-    ss << "data[:10]\t";
-    switch (tensor.dtype)
-    {
-    case PaddleDType::INT64:
-        for (int i = 0; i < std::min(num_elems, 10); i++)
-        {
-            ss << static_cast<int64_t*>(tensor.data.data())[i] << " ";
-        }
-        break;
-    case PaddleDType::FLOAT32:
-        for (int i = 0; i < std::min(num_elems, 10); i++)
-        {
-            ss << static_cast<float*>(tensor.data.data())[i] << " ";
-        }
-        break;
-    }
-    return ss.str();
-}
-
-std::string ToString(const NativeConfig& config)
-{
-    std::stringstream ss;
-    ss << "Use GPU : " << (config.use_gpu ? "True" : "False") << "\n"
-       << "Device : " << config.device << "\n"
-       << "fraction_of_gpu_memory : " << config.fraction_of_gpu_memory << "\n"
-       << "specify_input_name : "
-       << (config.specify_input_name ? "True" : "False") << "\n"
-       << "Program File : " << config.prog_file << "\n"
-       << "Param File : " << config.param_file;
-    return ss.str();
-}
-
-struct Record
-{
-    std::vector<float> data;
-    std::vector<int32_t> shape;
-};
-
-Record ProcessALine(const std::string& line)
-{
-    std::cout << "process a line" << std::endl;
-    std::vector<std::string> columns;
-    split(line, '\t', &columns);
-    assert(columns.size() == 2UL, "data format error, should be <data>\t<shape>");
-    Record record;
-    std::vector<std::string> data_strs;
-    split(columns[0], ' ', &data_strs);
-    // Convert the data strings to numeric values and store them in record.data.
-    for (auto& d : data_strs)
-    {
-        record.data.push_back(std::stof(d));
-    }
-
-    std::vector<std::string> shape_strs;
-    split(columns[1], ' ', &shape_strs);
-    for (auto& s : shape_strs)
-    {
-        record.shape.push_back(std::stoi(s));
-    }
-    std::cout << "data size " << record.data.size() << std::endl;
-    std::cout << "data shape size " << record.shape.size() << std::endl;
-    return record;
-}
-
-void CheckOutput(const std::string& referfile, const PaddleTensor& output)
-{
-    std::string line;
-    std::ifstream file(referfile);
-    std::getline(file, line);
-    auto refer = ProcessALine(line);
-    file.close();
-    size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
-    std::cout << "predictor output numel " << numel << std::endl;
-    std::cout << "reference output numel " << refer.data.size() << std::endl;
-    assert(numel == refer.data.size());
-    switch (output.dtype)
-    {
-    case PaddleDType::INT64:
-        for (size_t i = 0; i < numel; ++i)
-        {
-            assert(static_cast<int64_t*>(output.data.data())[i] == refer.data[i]);
-        }
-        break;
-    case PaddleDType::FLOAT32:
-        for (size_t i = 0; i < numel; ++i)
-        {
-            assert(fabs(static_cast<float*>(output.data.data())[i] - refer.data[i]) <= 1e-5);
-        }
-        break;
-    }
-}
-
-/*
- * Use the native fluid engine to inference the demo.
- */
-void Main(bool use_gpu)
-{
-    NativeConfig config;
-    config.model_dir = MODELDIR;
-    //config.param_file = MODELDIR + "/__params__";
-    //config.prog_file = MODELDIR + "/__model__";
-    config.use_gpu = USE_GPU;
-    config.device = 0;
-    if (USE_GPU)
-    {
-        config.fraction_of_gpu_memory = 0.1f;  // set by yourself
-    }
-    std::cout << ToString(config) << std::endl;
-    std::cout << "init predictor" << std::endl;
-    auto predictor = CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-
-    std::cout << "begin to process data" << std::endl;
-    // Just a single batch of data.
-    std::string line;
-    std::cout << "data : " << std::endl;
-    std::ifstream file(DATA);
-    if (!file.is_open())
-    {
-        std::cout << "failed open data" << DATA << std::endl;
-        exit(0);
-    }
-    std::getline(file, line);
-    auto record = ProcessALine(line);
-    file.close();
-
-    // Inference.
-    PaddleTensor input;
-    input.shape = record.shape;
-    input.data =
-        PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
-    input.dtype = PaddleDType::FLOAT32;
-
-    std::cout << "run executor" << std::endl;
-    std::vector<PaddleTensor> output;
-    predictor->Run({ input }, &output);
-
-    std::cout << "output.size " << output.size() << std::endl;
-    auto& tensor = output.front();
-    std::cout << "output: " << SummaryTensor(tensor) << std::endl;
-
-    // compare with reference result
-    std::cout << "refer result : " << REFER << std::endl;
-    CheckOutput(REFER, tensor);
-}
-}
-}
-
-int main(int argc, char** argv)
-{
-    MODELDIR = "./LB_icnet_model";
-    //DATA = "./icnet_image.txt";
-    DATA = "./1.png.txt";
-    REFER = "./icnet_label.txt";
-    paddle::demo::Main(USE_GPU);
-    system("pause");
-    return 0;
-}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cassert>
#include <chrono>
#include <iostream>
#include <fstream>
#include <algorithm>
#include <vector>
#include <string>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

namespace paddle {

std::string DIRNAME = "./Release/infer_model";
std::string DATA = "./test-image.txt";
const int C = 3;    // image channel
const int H = 449;  // image height
const int W = 581;  // image width
// Data format:
// "<space splitted floats as data>\t<space splitted ints as shape>"
// 1. Stored as float32.
// 2. The mean must be subtracted; the per-channel (CHW) means are 112.15, 109.41, 185.42.
struct Record
{
  std::vector<float> data;
  std::vector<int32_t> shape;
};

NativeConfig GetConfig() {
  NativeConfig config;
  config.prog_file=DIRNAME + "/__model__";
  config.param_file=DIRNAME + "/__params__";
  config.fraction_of_gpu_memory = 0.0;
  config.use_gpu = true;
  config.device = 0;
  return config;
}

using Time = decltype(std::chrono::high_resolution_clock::now());

Time time() { return std::chrono::high_resolution_clock::now(); };

double time_diff(Time t1, Time t2) {
  typedef std::chrono::microseconds ms;
  auto diff = t2 - t1;
  ms counter = std::chrono::duration_cast<ms>(diff);
  return counter.count() / 1000.0;
}

static void split(const std::string& str, char sep,
                  std::vector<std::string>* pieces) {
  pieces->clear();
  if (str.empty()) {
    return;
  }
  size_t pos = 0;
  size_t next = str.find(sep, pos);
  while (next != std::string::npos) {
    pieces->push_back(str.substr(pos, next - pos));
    pos = next + 1;
    next = str.find(sep, pos);
  }
  if (!str.substr(pos).empty()) {
    pieces->push_back(str.substr(pos));
  }
}

Record ProcessALine(const std::string& line) {
  std::vector<std::string> columns;
  split(line, '\t', &columns);

  Record record;
  std::vector<std::string> data_strs;
  split(columns[0], ' ', &data_strs);
  for (auto& d : data_strs) {
    record.data.push_back(std::stof(d));
  }

  std::vector<std::string> shape_strs;
  split(columns[1], ' ', &shape_strs);
  for (auto& s : shape_strs) {
    record.shape.push_back(std::stoi(s));
  }
  return record;
}

void test_naive(int batch_size){
  NativeConfig config = GetConfig();
  auto predictor = CreatePaddlePredictor<NativeConfig>(config);
  int height = H;
  int width = W;
  int channel = C;
  int num_sum = height * width * channel * batch_size;

  // 1. use fake data
  std::vector<float> data;
  for(int i = 0; i < num_sum; i++) {
    data.push_back(0.0);
  }

  PaddleTensor tensor;
  tensor.shape = std::vector<int>({batch_size, channel, height, width});
  tensor.data.Resize(sizeof(float) * batch_size * channel * height * width);
  std::copy(data.begin(), data.end(), static_cast<float*>(tensor.data.data()));
  tensor.dtype = PaddleDType::FLOAT32;

  // 2. read data from file
  // std::string line;
  // std::ifstream file(DATA);
  // std::getline(file, line);
  // auto record = ProcessALine(line);
  // file.close();
  // PaddleTensor tensor;
  // tensor.shape = record.shape;
  // tensor.data =
  //     PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
  PaddleTensor tensor_out;

  std::vector<PaddleTensor> outputs(1, tensor_out);

  predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
  auto time1 = time();

  for(size_t i = 0; i < 2; i++) {
    std::cout << "Pass " << i << "predict";
    predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
  }

  auto time2 = time();
  std::ofstream ofresult("naive_test_result.txt", std::ios::app);

  std::cout <<"batch: " << batch_size << " predict cost: " << time_diff(time1, time2) / 100.0 << "ms" << std::endl;
  std::cout << outputs.size() << std::endl;
}
}  // namespace paddle

int main(int argc, char** argv) {
  paddle::test_naive(1 << 0);
  return 0;
}
\ No newline at end of file
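
The data-format comment in the rewritten demo above says the input is stored as float32 and must have the per-channel means (112.15, 109.41, 185.42 for CHW) subtracted. A minimal sketch of that preprocessing step, not part of the commit, assuming three channels laid out in CHW order:

// Illustrative preprocessing sketch; the mean values come from the comment above.
#include <vector>

std::vector<float> SubtractChannelMean(const std::vector<float>& chw,
                                       int channels, int height, int width) {
  const float kMeans[3] = {112.15f, 109.41f, 185.42f};  // assumes channels == 3
  std::vector<float> out(chw.size());
  for (int c = 0; c < channels; ++c) {
    for (int i = 0; i < height * width; ++i) {
      out[c * height * width + i] = chw[c * height * width + i] - kMeans[c];
    }
  }
  return out;
}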
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <chrono>
#include <iostream>
#include <fstream>
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
std::string DIRNAME = "./LB_icnet_model";
//std::string DIRNAME = "./infer_models";
NativeConfig GetConfig() {
NativeConfig config;
config.prog_file=DIRNAME + "/__model__";
config.param_file=DIRNAME + "/__params__";
config.fraction_of_gpu_memory = 0.8;
config.use_gpu = true;
config.device = 0;
return config;
}
using Time = decltype(std::chrono::high_resolution_clock::now());
Time time() { return std::chrono::high_resolution_clock::now(); };
double time_diff(Time t1, Time t2) {
typedef std::chrono::microseconds ms;
auto diff = t2 - t1;
ms counter = std::chrono::duration_cast<ms>(diff);
return counter.count() / 1000.0;
}
void test_naive(int batch_size){
NativeConfig config = GetConfig();
// config.model_dir = model_path;
auto predictor = CreatePaddlePredictor<NativeConfig>(config);
int height = 449;
int width = 581;
//int height = 3;
//int width = 3;
int num_sum = height * width * 3 * batch_size;
std::vector<float> data;
for(int i = 0; i < num_sum; i++) {
data.push_back(0.0);
}
PaddleTensor tensor;
tensor.shape = std::vector<int>({batch_size, 3, height, width});
tensor.data.Resize(sizeof(float) * batch_size * 3 * height * width);
std::copy(data.begin(), data.end(), static_cast<float*>(tensor.data.data()));
tensor.dtype = PaddleDType::FLOAT32;
std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
PaddleTensor tensor_out;
std::vector<PaddleTensor> outputs(1, tensor_out);
predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
std::cout << "start predict123:" << std::endl;
auto time1 = time();
for(size_t i = 0; i < 2; i++) {
predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
std::cout << "pass " << i;
}
auto time2 = time();
std::ofstream ofresult("naive_test_result.txt", std::ios::app);
std::cout <<"batch: " << batch_size << " predict cost: " << time_diff(time1, time2) / 100.0 << "ms" << std::endl;
std::cout << outputs.size() << std::endl;
/*
int64_t * data_o = static_cast<int64_t*>(outputs[0].data.data());
for (size_t j = 0; j < outputs[0].data.length() / sizeof(int64_t); ++j) {
ofresult << std::to_string(data_o[j]) << " ";
}
ofresult << std::endl;
ofresult.close();
*/
}
} // namespace paddle
int main(int argc, char** argv) {
paddle::test_naive(1 << 0);
return 0;
}
\ No newline at end of file
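
The commented-out block at the end of test_naive shows how the result used to be dumped. A minimal sketch (not part of the commit) of reading the first output tensor back after predictor->Run, assuming, as that block does, that the output holds int64 data:

// Illustrative sketch only.
#include <algorithm>
#include <iostream>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void DumpFirstOutput(const std::vector<paddle::PaddleTensor>& outputs) {
  const paddle::PaddleTensor& out = outputs[0];
  // PaddleBuf exposes the raw buffer and its length in bytes.
  const int64_t* data_o = static_cast<const int64_t*>(out.data.data());
  size_t n = out.data.length() / sizeof(int64_t);
  for (size_t j = 0; j < std::min<size_t>(n, 10); ++j) {
    std::cout << data_o[j] << " ";
  }
  std::cout << std::endl;
}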
...
@@ -43,6 +43,7 @@ template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    VLOG(3) << "inside cudnn";
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "It must use CUDAPlace.");
     auto* input = ctx.Input<Tensor>("Input");
@@ -59,7 +60,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    VLOG(3) << "get all inputs";
     // ------------------- cudnn descriptors ---------------------
     ScopedTensorDescriptor input_desc;
     ScopedTensorDescriptor output_desc;
@@ -72,7 +73,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
+    VLOG(3) << "create tensor descriptor";
 #if CUDNN_VERSION_MIN(7, 0, 1)
     // cudnn 7 can support groups, no need to do it mannually
     // FIXME(typhoonzero): find a better way to disable groups
@@ -81,7 +82,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
         cudnn_conv_desc, groups));
     groups = 1;
 #endif
+    VLOG(3) << "before create tensor descriptor";
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()), groups);
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
@@ -111,7 +112,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
       output_height = output->dims()[2];
       output_width = output->dims()[3];
     }
+    VLOG(3) << "after create tensor descriptor";
     int group_offset_in =
         input_channels / groups * input_height * input_width * input_depth;
     int group_offset_out =
@@ -129,6 +130,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
+    VLOG(3) << "set cudnn algorithm";
     CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
@@ -149,7 +151,7 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           cudnn_conv_desc, CUDNN_DEFAULT_MATH));
     }
 #endif
+    VLOG(3) << "before get workspace";
     // get workspace size able to allocate
     CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
@@ -158,10 +160,12 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     // the limit because the algo is overrided to use tensor core.
     PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
                       "workspace_size to be allocated exceeds the limit");
+    VLOG(3) << "after get workspace";
     // Allocate on GPU memory
     platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    workspace_size_in_bytes = 1024;
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
+    VLOG(3) << "allocate memory";
     // ------------------- cudnn conv forward ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     for (int i = 0; i < groups; i++) {
@@ -171,8 +175,10 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
           cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
           &beta, cudnn_output_desc, output_data + i * group_offset_out));
     }
+    VLOG(3) << "cudnn forward";
     // Release the cudnn workspace
     paddle::memory::Free(gpu, cudnn_workspace);
+    VLOG(3) << "cudnn pass";
   }
 };
@@ -318,6 +324,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     // Already on GPU
     void* cudnn_workspace = nullptr;
     platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    workspace_size_in_bytes = 1024;
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
     ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
...
...
@@ -33,8 +33,8 @@ class LoadCombineOp : public framework::OperatorBase {
     auto filename = Attr<std::string>("file_path");
     auto load_as_fp16 = Attr<bool>("load_as_fp16");
-    std::ifstream fin(filename);
-    PADDLE_ENFORCE(static_cast<bool>(fin),
+    std::ifstream fin(filename, std::ios_base::in | std::ios_base::binary);
+    PADDLE_ENFORCE(!fin.bad(),
                    "Cannot open file %s for load_combine op", filename);
     auto out_var_names = Outputs("Out");
@@ -46,20 +46,21 @@ class LoadCombineOp : public framework::OperatorBase {
     auto &dev_ctx = *pool.Get(place);
     for (size_t i = 0; i < out_var_names.size(); i++) {
+      VLOG(3) << "load " << out_var_names[i];
       auto *out_var = scope.FindVar(out_var_names[i]);
       PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
                      out_var_names[i]);
       auto *tensor = out_var->GetMutable<framework::LoDTensor>();
+      VLOG(3) << "Get Tensor";
       // Error checking
-      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
+      PADDLE_ENFORCE(!fin.bad(), "Cannot read more from file %s",
                      filename);
+      VLOG(3) << "before deserialization";
       // Get data from fin to tensor
       DeserializeFromStream(fin, tensor, dev_ctx);
+      VLOG(3) << "after deserialization";
       auto in_dtype = framework::ToDataType(tensor->type());
       auto out_dtype =
           load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
@@ -80,6 +81,7 @@ class LoadCombineOp : public framework::OperatorBase {
         tensor->set_lod(fp16_tensor.lod());
         tensor->ShareDataWith(fp16_tensor);
       }
+      VLOG(3) << "load " << out_var_names[i] << " finished";
     }
   }
 };
...
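
For context, a standalone illustration (not Paddle code) of why the hunk above opens the file with std::ios_base::binary: on Windows, a text-mode stream translates "\r\n" to "\n" and treats 0x1A as end of file, which corrupts the serialized tensor bytes consumed by DeserializeFromStream.

// Generic illustration, not part of the commit.
#include <fstream>
#include <iterator>
#include <string>

std::string ReadAllBinary(const std::string& path) {
  // Binary mode disables newline translation, so the bytes come back unchanged.
  std::ifstream fin(path, std::ios_base::in | std::ios_base::binary);
  return std::string(std::istreambuf_iterator<char>(fin),
                     std::istreambuf_iterator<char>());
}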
...
@@ -59,6 +59,7 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
 #define CUDNN_VERSION_MIN(major, minor, patch) \
   (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch)))
+#if !defined(_WIN32)
 #define CUDNN_ENFORCE(condition) \
   do { \
     cudnnStatus_t status = condition; \
@@ -66,6 +67,9 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
       PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \
     } \
   } while (false)
+#else
+#define CUDNN_ENFORCE(condition)
+#endif
 enum class DataLayout {  // Not use
   kNHWC,
...
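
A standalone illustration (not Paddle code) of the expansion difference introduced above: with the do/while definition the wrapped expression is evaluated and its status checked, while an empty function-like macro discards its argument entirely, so the wrapped call is never executed in that configuration.

// Generic illustration, not part of the commit.
#include <iostream>

#define CHECKED(expr)                                 \
  do {                                                \
    int status = (expr);                              \
    if (status != 0) {                                \
      std::cerr << "failed: " << status << std::endl; \
    }                                                 \
  } while (false)

#define UNCHECKED(expr)

int Touch() {
  std::cout << "called" << std::endl;
  return 0;
}

int main() {
  CHECKED(Touch());    // prints "called" and checks the returned status
  UNCHECKED(Touch());  // expands to nothing: Touch() is never evaluated
  return 0;
}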