From c6dcffc61a12a505da7043f2c1de2a56deef105a Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Wed, 24 Oct 2018 05:13:34 +0800 Subject: [PATCH] lb. add debug output --- paddle/fluid/framework/executor.cc | 102 ++++++- .../inference/api/demo_ci/CMakeLists.txt | 21 +- .../inference/api/demo_ci/inference_icnet.cc | 249 +++++++++--------- .../inference/api/demo_ci/inference_icnet.h | 21 ++ .../api/demo_ci/real_data_icnet_tester.cc | 123 +++++++++ paddle/fluid/inference/api/demo_ci/test.cc | 99 +++++++ .../api/demo_ci/thread_icnet_test.cc | 105 ++++++++ paddle/fluid/operators/batch_norm_op.cu.cc | 21 ++ paddle/fluid/operators/load_combine_op.cc | 12 +- 9 files changed, 626 insertions(+), 127 deletions(-) create mode 100644 paddle/fluid/inference/api/demo_ci/inference_icnet.h create mode 100644 paddle/fluid/inference/api/demo_ci/real_data_icnet_tester.cc create mode 100644 paddle/fluid/inference/api/demo_ci/test.cc create mode 100644 paddle/fluid/inference/api/demo_ci/thread_icnet_test.cc diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 1101707f8..c318c5fc1 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -333,9 +333,49 @@ std::vector> Executor::Prepare( return result; } +// void CheckResult(const std::string op_type, ExecutorPrepareContext* ctx, Scope* local_scope) { +// VLOG(3) << "before checking result"; +// auto& dev_ctx = *platform::DeviceContextPool::Instance().Get(place_); +// std::vector outputs; +// auto& block = ctx->prog_.Block(0); +// bool found = false; +// framework::OpDesc* myop = nullptr; +// for(auto& op : block.AllOps()) { +// if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() == "feed") return; +// if (op->Type() == op_type) { +// found = true; +// myop = op; +// break; +// } +// } +// } +// if(!found) { +// VLOG(3) << "not found op!"; +// return; +// } +// auto* op = myop; +// VLOG(3) << "start op output" << op->Type(); +// for(auto var_name: op->OutputArgumentNames()) { +// auto* var = local_scope->Var(var_name); +// auto* var_desc = block.FindVar(var_name); +// if (var_desc->Persistable()) continue; +// auto* tensor = var->GetMutable(); +// framework::Tensor check; +// VLOG(3) << "before tensor copy"; +// framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check); +// VLOG(3) << "after tensor copy"; +// float sum = .0; +// for(size_t i=0; i < check.numel(); ++i) { +// sum += check.data()[i]; +// } +// VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum " << sum; +// VLOG(3) << "after checking result"; +// } + void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, bool create_local_scope, bool create_vars, bool keep_kids) { + VLOG(3) << "RunPreparedContext inside"; Scope* local_scope = scope; if (create_vars) { if (create_local_scope) { @@ -346,13 +386,73 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, for (auto& op : ctx->ops_) { op->Run(*local_scope, place_); - + // CheckResult(op->Type(), ctx, local_scope); if (FLAGS_benchmark) { VLOG(2) << "Memory used after operator " + op->Type() + " running: " << memory::memory_usage(place_); } } platform::DeviceContextPool::Instance().Get(place_)->Wait(); + + VLOG(3) << "start checking"; + auto& dev_ctx = *platform::DeviceContextPool::Instance().Get(place_); + std::vector outputs; + auto& block = ctx->prog_.Block(0); + + for(auto& op : block.AllOps()) { + if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() == "feed") continue; + // 
for(auto& real_op : ctx->ops_) { + // if(real_op->Type() == op->Type()) { + // VLOG(3) << real_op->Type() << " " <DebugStringEx(local_scope); + // } + // } + + //VLOG(3) << "start op output" << op->Type(); + for(auto var_name: op->InputArgumentNames()) { + auto* var = local_scope->Var(var_name); + auto* var_desc = block.FindVar(var_name); + if (var_desc->Persistable()) continue; + auto* tensor = var->GetMutable(); + framework::Tensor check; + VLOG(3) << "before tensor copy"; + + framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check); + + VLOG(3) << "after tensor copy"; + float sum = .0; + for(size_t i=0; i < check.numel(); ++i) { + sum += check.data()[i]; + } + VLOG(3) << "op " << op->Type() << " input var " << var_name << " sum " << sum; + } + + VLOG(3) << "op " << op->Type() << "input finished"; + for(auto var_name: op->OutputArgumentNames()) { + auto* var = local_scope->Var(var_name); + auto* var_desc = block.FindVar(var_name); + if (var_desc->Persistable()) continue; + auto* tensor = var->GetMutable(); + framework::Tensor check; + VLOG(3) << "before tensor copy"; + if(op->Type() == "batch_norm" && platform::is_gpu_place(place_)) { + VLOG(3) << "op " << op->Type() << " output var " << var_name << " " << tensor->numel(); + tensor->mutable_data(place_); + framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check); + } else { + framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check); + } + + VLOG(3) << "after tensor copy"; + float sum = .0; + for(size_t i=0; i < check.numel(); ++i) { + sum += check.data()[i]; + } + VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum " << sum; + } + } + + VLOG(3) << "after checking result"; + if (local_scope != scope) { scope->DeleteScope(local_scope); } else { diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 4c30e1b32..93b554c83 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -46,7 +46,7 @@ if(WITH_GPU) endif(NOT WIN32) endif() -include_directories("D:/Paddle/") +include_directories("E:/Paddle/") include_directories("${PADDLE_LIB}") include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") include_directories("${PADDLE_LIB}/third_party/install/glog/include") @@ -72,7 +72,12 @@ link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") link_directories("${PADDLE_LIB}/paddle/fluid/inference") # add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) -add_library(${DEMO_NAME} ${DEMO_NAME}.cc) + # add_library(${DEMO_NAME} ${DEMO_NAME}.cc) + add_library(${DEMO_NAME} SHARED ${DEMO_NAME}.cc) +add_executable(real_data_icnet_tester real_data_icnet_tester.cc) +add_executable(test test.cc) +add_executable(thread_icnet_test thread_icnet_test.cc) + if(WITH_MKL) include_directories("${PADDLE_LIB}/third_party/install/mklml/include") set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} @@ -89,7 +94,11 @@ endif() # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a if(WITH_STATIC_LIB) set(DEPS - ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) +# ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX} + D:/Paddle/bazel-dll/fluid_install_dir/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX} + # E:/Paddle/build/paddle/fluid/inference/api/Release/libpaddle_inference_api${CMAKE_STATIC_LIBRARY_SUFFIX} + 
D:/Paddle/bazel-dll/paddle/fluid/inference/api/Release/libpaddle_inference_api${CMAKE_STATIC_LIBRARY_SUFFIX} + ) else() set(DEPS ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) @@ -121,3 +130,9 @@ if(WITH_GPU) endif() target_link_libraries(${DEMO_NAME} ${DEPS}) +target_link_libraries(test ${DEMO_NAME} ) +target_link_libraries(thread_icnet_test ${DEPS}) +target_link_libraries(real_data_icnet_tester ${DEPS}) + +target_compile_definitions(${DEMO_NAME} PRIVATE "API_DEFINITION") + diff --git a/paddle/fluid/inference/api/demo_ci/inference_icnet.cc b/paddle/fluid/inference/api/demo_ci/inference_icnet.cc index 869002b94..8b1635160 100644 --- a/paddle/fluid/inference/api/demo_ci/inference_icnet.cc +++ b/paddle/fluid/inference/api/demo_ci/inference_icnet.cc @@ -19,139 +19,144 @@ #include #include #include +#include #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "inference_icnet.h" -namespace paddle { - -std::string DIRNAME = "./infer_model"; -std::string DATA = "./test-image.txt"; -const int C = 3; // image channel -const int H = 449; // image height -const int W = 581; // image width // 数据格式 // "\t data; - std::vector shape; +using namespace paddle; + +class Predictor { +private: + std::unique_ptr predictor; + struct Record + { + std::vector data; + std::vector shape; + }; + + const int C = 3; // image channel + const int H = 449; // image height + const int W = 581; // image width + + using Time = decltype(std::chrono::high_resolution_clock::now()); + + Time time() { return std::chrono::high_resolution_clock::now(); }; + + double time_diff(Time t1, Time t2) { + typedef std::chrono::microseconds ms; + auto diff = t2 - t1; + ms counter = std::chrono::duration_cast(diff); + return counter.count() / 1000.0; + } + + static void split(const std::string& str, char sep, + std::vector* pieces) { + pieces->clear(); + if (str.empty()) { + return; + } + size_t pos = 0; + size_t next = str.find(sep, pos); + while (next != std::string::npos) { + pieces->push_back(str.substr(pos, next - pos)); + pos = next + 1; + next = str.find(sep, pos); + } + if (!str.substr(pos).empty()) { + pieces->push_back(str.substr(pos)); + } + } + + Record ProcessALine(const std::string& line) { + std::vector columns; + split(line, '\t', &columns); + + Record record; + std::vector data_strs; + split(columns[0], ' ', &data_strs); + for (auto& d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(columns[1], ' ', &shape_strs); + for (auto& s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + return record; + } + +public: + Predictor (const char* prog_file, + const char* param_file, const float fraction_of_gpu_memory, + const bool use_gpu, const int device) { + + NativeConfig config; + config.prog_file = prog_file; + config.param_file = param_file; + config.fraction_of_gpu_memory = fraction_of_gpu_memory; + config.use_gpu = use_gpu; + config.device = device; + + predictor = CreatePaddlePredictor(config); + } + + void predict(float* input, const int channel, const int height, const int width, + int64_t** output, int* output_length, int batch_size) { + std::vector data; + int intput_length = channel * height * width * batch_size; + for (int i = 0; i < intput_length; i++) { + data.push_back(*((float*)input + i)); + } + + // initialize the input data + PaddleTensor tensor; + tensor.shape = std::vector({ batch_size, channel, height, width }); + tensor.data.Resize(sizeof(float) * batch_size * channel * height * width); + 
std::copy(data.begin(), data.end(), static_cast(tensor.data.data())); + + tensor.dtype = PaddleDType::FLOAT32; + std::vector paddle_tensor_feeds(1, tensor); + + // initialize the output data + PaddleTensor tensor_out; + std::vector outputs(1, tensor_out); + predictor->Run(paddle_tensor_feeds, &outputs, batch_size); + *output_length = (int)outputs[0].data.length(); + std::memcpy(static_cast(*output), outputs[0].data.data(), outputs[0].data.length()); + int64_t sum_out = 0; + for(int i=0; i < outputs[0].data.length()/sizeof(int64_t); ++i) { + int64_t item = static_cast(outputs[0].data.data())[i]; + sum_out += item; + if (item != 0) { + std::cout << item << std::endl; + } + } + + std::cout << "sum_out" << sum_out << std::endl; + } }; -NativeConfig GetConfig() { - NativeConfig config; - config.prog_file=DIRNAME + "/__model__"; - config.param_file=DIRNAME + "/__params__"; - config.fraction_of_gpu_memory = 0.0; - config.use_gpu = true; - config.device = 0; - return config; -} - -using Time = decltype(std::chrono::high_resolution_clock::now()); - -Time time() { return std::chrono::high_resolution_clock::now(); }; - -double time_diff(Time t1, Time t2) { - typedef std::chrono::microseconds ms; - auto diff = t2 - t1; - ms counter = std::chrono::duration_cast(diff); - return counter.count() / 1000.0; +API_REFERENCE void * init_predictor(const char* prog_file, + const char* param_file, const float fraction_of_gpu_memory, + const bool use_gpu, const int device) { + return new Predictor(prog_file, param_file, fraction_of_gpu_memory, use_gpu, device); } -static void split(const std::string& str, char sep, - std::vector* pieces) { - pieces->clear(); - if (str.empty()) { - return; - } - size_t pos = 0; - size_t next = str.find(sep, pos); - while (next != std::string::npos) { - pieces->push_back(str.substr(pos, next - pos)); - pos = next + 1; - next = str.find(sep, pos); - } - if (!str.substr(pos).empty()) { - pieces->push_back(str.substr(pos)); - } +API_REFERENCE void predict(void* handle, float* input, const int channel, const int height, const int width, + int64_t** output, int* output_length, int batch_size) { + assert(handle != nullptr); + ((Predictor*)handle)->predict(input, channel, height, width, output, output_length, batch_size); } -Record ProcessALine(const std::string& line) { - std::vector columns; - split(line, '\t', &columns); - - Record record; - std::vector data_strs; - split(columns[0], ' ', &data_strs); - for (auto& d : data_strs) { - record.data.push_back(std::stof(d)); - } - - std::vector shape_strs; - split(columns[1], ' ', &shape_strs); - for (auto& s : shape_strs) { - record.shape.push_back(std::stoi(s)); - } - return record; +API_REFERENCE void destory_predictor(void *handle) { + if (handle) { + delete handle; + handle = nullptr; + } } - -void test_naive(int batch_size){ - NativeConfig config = GetConfig(); - auto predictor = CreatePaddlePredictor(config); - int height = H; - int width = W; - int channel = C; - int num_sum = height * width * channel * batch_size; - - // 1. use fake data - std::vector data; - for(int i = 0; i < num_sum; i++) { - data.push_back(0.0); - } - - PaddleTensor tensor; - tensor.shape = std::vector({batch_size, channel, height, width}); - tensor.data.Resize(sizeof(float) * batch_size * channel * height * width); - std::copy(data.begin(), data.end(), static_cast(tensor.data.data())); - tensor.dtype = PaddleDType::FLOAT32; - - // 2. 
read data from file - // std::string line; - // std::ifstream file(DATA); - // std::getline(file, line); - // auto record = ProcessALine(line); - // file.close(); - // PaddleTensor tensor; - // tensor.shape = record.shape; - // tensor.data = - // PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); - - std::vector paddle_tensor_feeds(1, tensor); - PaddleTensor tensor_out; - - std::vector outputs(1, tensor_out); - - predictor->Run(paddle_tensor_feeds, &outputs, batch_size); - auto time1 = time(); - - for(size_t i = 0; i < 2; i++) { - std::cout << "Pass " << i << "predict"; - predictor->Run(paddle_tensor_feeds, &outputs, batch_size); - } - - auto time2 = time(); - std::ofstream ofresult("naive_test_result.txt", std::ios::app); - - std::cout <<"batch: " << batch_size << " predict cost: " << time_diff(time1, time2) / 100.0 << "ms" << std::endl; - std::cout << outputs.size() << std::endl; - -} -} // namespace paddle - -int main(int argc, char** argv) { - paddle::test_naive(1 << 0); - return 0; -} \ No newline at end of file diff --git a/paddle/fluid/inference/api/demo_ci/inference_icnet.h b/paddle/fluid/inference/api/demo_ci/inference_icnet.h new file mode 100644 index 000000000..b2657e798 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/inference_icnet.h @@ -0,0 +1,21 @@ + +#ifdef _WIN32 +#ifdef inference_icnet_EXPORTS +#define API_REFERENCE extern "C" __declspec(dllexport) +#else +#define API_REFERENCE extern "C" __declspec(dllimport) +#endif +#else +#define API_REFERENCE +#endif + +//API_REFERENCE void * init_predictor(); +//API_REFERENCE void destory_predictor(void *handle); +//API_REFERENCE void predict(void *handle, int n); + +API_REFERENCE void * init_predictor(const char* prog_file, + const char* param_file, const float fraction_of_gpu_memory, + const bool use_gpu, const int device); +API_REFERENCE void predict(void* handle, float* input, const int channel, const int height, + const int width, int64_t** output, int* output_length, int batch_size); +API_REFERENCE void destory_predictor(void *handle); diff --git a/paddle/fluid/inference/api/demo_ci/real_data_icnet_tester.cc b/paddle/fluid/inference/api/demo_ci/real_data_icnet_tester.cc new file mode 100644 index 000000000..677a6b976 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/real_data_icnet_tester.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
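+// real_data_icnet_tester.cc: standalone tester that feeds real image data
+// through the ICNet inference model. It reads the list of input files from
+// "new_file.list", loads the first listed file as a whitespace-separated
+// float tensor of shape [batch, 3, 449, 581], runs the native predictor
+// built from lb/__model__ and lb/__params__, times one extra Run() call,
+// prints input/output checksums, and appends the int64 output values to
+// naive_test_result.txt.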
+#define GOOGLE_GLOG_DLL_DECL +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { + +// DEFINE_string(dirname, "./lb", +// "Directory of the inference model."); + +NativeConfig GetConfig() { + NativeConfig config; + // config.model_dir = FLAGS_dirname; + config.prog_file= "lb/__model__"; + config.param_file= "lb/__params__"; + config.fraction_of_gpu_memory = 0.8; + config.use_gpu = true; + config.device = 0; + return config; +} + +using Time = decltype(std::chrono::high_resolution_clock::now()); +Time time() { return std::chrono::high_resolution_clock::now(); }; +double time_diff(Time t1, Time t2) { + typedef std::chrono::microseconds ms; + auto diff = t2 - t1; + ms counter = std::chrono::duration_cast(diff); + return counter.count() / 1000.0; +} + +void test_naive(int batch_size){ + NativeConfig config = GetConfig(); + auto predictor = CreatePaddlePredictor(config); + int height = 449; + int width = 581; + + // =============read file list ============= + std::ifstream infile("new_file.list"); + std::string temp_s; + std::vector all_files; + while (!infile.eof()) { + infile >> temp_s; + all_files.push_back(temp_s); + } + + // size_t file_num = all_files.size(); + infile.close(); + // =============read file list ============= + for (size_t f_k = 0; f_k < 1; f_k ++) { + std::ifstream in_img(all_files[f_k]); + std::cout << all_files[f_k] << std::endl; + float temp_v; + + float sum_n = 0.0; + std::vector data; + while (!in_img.eof()) { + in_img >> temp_v; + data.push_back(float(temp_v)); + // std::cout << temp_v << " "; + sum_n += temp_v; + } + + in_img.close(); + std::cout << "sum: " << sum_n << std::endl; + + PaddleTensor tensor; + tensor.shape = std::vector({batch_size, 3, height, width}); + tensor.data.Resize(sizeof(float) * batch_size * 3 * height * width); + std::copy(data.begin(), data.end(), static_cast(tensor.data.data())); + tensor.dtype = PaddleDType::FLOAT32; + std::vector paddle_tensor_feeds(1, tensor); + PaddleTensor tensor_out; + + std::vector outputs(1, tensor_out); + predictor->Run(paddle_tensor_feeds, &outputs, batch_size); + std::cout << "start predict123:" << std::endl; + auto time1 = time(); + + + for(size_t i = 0; i < 1; i++) { + predictor->Run(paddle_tensor_feeds, &outputs, batch_size); + } + + auto time2 = time(); + std::ofstream ofresult("naive_test_result.txt", std::ios::app); + + std::cout <<"batch: " << batch_size << " predict cost: " << time_diff(time1, time2) / 1000.0 << "ms" << std::endl; + std::cout << outputs.size() << std::endl; + int64_t * data_o = static_cast(outputs[0].data.data()); + int64_t sum_out = 0; + for (size_t j = 0; j < outputs[0].data.length() / sizeof(int64_t); ++j) { + ofresult << std::to_string(data_o[j]) << " "; + sum_out += data_o[j]; + } + std::cout << "sum_out " << sum_out << std::endl; + ofresult << std::endl; + ofresult.close(); + } +} + +} // namespace paddle + +int main(int argc, char** argv) { +// google::ParseCommandLineFlags(&argc, &argv, true); + paddle::test_naive(1<<0); + return 0; +} diff --git a/paddle/fluid/inference/api/demo_ci/test.cc b/paddle/fluid/inference/api/demo_ci/test.cc new file mode 100644 index 000000000..41f05a9b5 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/test.cc @@ -0,0 +1,99 @@ + +#include +#include +#include "inference_icnet.h" +#include +#include +#include +#include + +#include +using namespace std; + + +template +Type stringToNum(const string& str) +{ + istringstream iss(str); + Type num; + iss >> num; + return 
num; +} + +void test_imgs() { + void *h = init_predictor("./lb/__model__", "./lb/__params__", 0.3f, true, 0); + + std::ifstream infile("new_file.list"); + std::ofstream ofs("./1.png.output.txt"); + + std::string temp_s; + std::vector all_files; + while (!infile.eof()) { + infile >> temp_s; + all_files.push_back(temp_s); + } + // size_t file_num = all_files.size(); + infile.close(); + // =============read file list ============= + for (size_t f_k = 0; f_k < 1; f_k++) { + // std::string path = "D:\\Paddle\\paddle\\fluid\\inference\\api\\demo_ci\\build\\Release\\"; + // std::ifstream in_img(path + all_files[f_k]); + std::string mypath = "D:\\Paddle\\paddle\\fluid\\inference\\api\\demo_ci\\build\\Release\\1.png.txt"; + std::cout << "file" << mypath << std::endl; + std::ifstream in_img(mypath); + //std::cout << path + all_files[f_k] << std::endl; + double temp_v; + const int size = 3 * 449 * 581 * 1; + float * data = new float[size]; + std::string value; + + if (!in_img.is_open()) { + cout << "open failed" << endl; + } + double sum_input = .0; + for (auto i = 0; i < size; i++) { + getline(in_img, value, '\n'); + double v = stringToNum(value); + data[i] = static_cast(v); + sum_input += v; + } + std::cout << "sum_input" << sum_input << std::endl; + + in_img.close(); + const int SIZE = 449 * 581 * 1; + int64_t * p = new int64_t[SIZE](); + int out_size = 0; + //memset(p, 0, size); + predict(h, data, 3, 449, 581, &p, &out_size, 1); + std::cout << "out_size = " << out_size << std::endl; + + double out_sum = .0; + for (auto i = 0; i < out_size / sizeof(int64_t); i++) { + out_sum += p[i]; + ofs << p[i] << " "; + } + ofs.close(); + + std::cout << "inferece out sum" << out_sum << std::endl; + delete p; + } + + destory_predictor(h); +} + +int main(int argc, char** argv) { + //if (true) { + // std::thread t1(func, init_predictor("./infer_model/__model__", "./infer_model/__params__", 0.1f, true, 0)); + // std::thread t2(func, init_predictor("./infer_model/__model__", "./infer_model/__params__", 0.1f, true, 0)); + // //std::thread t3(func, init_predictor("./infer_model/__model__", "./infer_model/__params__", 0.1f, true, 0)); + // //std::thread t4(func, init_predictor("./infer_model/__model__", "./infer_model/__params__", 0.1f, true, 0)); + // t1.join(); + // t2.join(); + // //t3.join(); + // //t4.join(); + // //Sleep(1); + //} + test_imgs(); + + return 0; +} diff --git a/paddle/fluid/inference/api/demo_ci/thread_icnet_test.cc b/paddle/fluid/inference/api/demo_ci/thread_icnet_test.cc new file mode 100644 index 000000000..d669b04dc --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/thread_icnet_test.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
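+// thread_icnet_test.cc: concurrency smoke test for the native predictor.
+// It creates two predictors from ./dzh_lb/__model__ and ./dzh_lb/__params__,
+// fills a zero-valued [batch, 3, 449, 581] float tensor, and spawns num_jobs
+// threads that each call Run() 1000 times and check that a single output
+// tensor comes back.
+// TODO: num_jobs is 5 but pres[] only holds 2 predictors, so threads with
+// tid >= 2 index pres[] out of bounds.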
+ +#define GOOGLE_GLOG_DLL_DECL + +#include +#include +//#include +#include +#include +#include +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include // NOLINT + +#define ASSERT_TRUE(x) x +#define ASSERT_EQ(x, y) assert(x == y) + +namespace paddle { + +// DEFINE_string(dirname, "./LB_icnet_model", +// "Directory of the inference model."); + +NativeConfig GetConfig() { + NativeConfig config; + config.prog_file= "./dzh_lb/__model__"; + config.param_file= "./dzh_lb/__params__"; + config.fraction_of_gpu_memory = 0.08; + config.use_gpu = true; + config.device = 0; + return config; +} + +using Time = decltype(std::chrono::high_resolution_clock::now()); +Time time() { return std::chrono::high_resolution_clock::now(); }; +double time_diff(Time t1, Time t2) { + typedef std::chrono::microseconds ms; + auto diff = t2 - t1; + ms counter = std::chrono::duration_cast(diff); + return counter.count() / 1000.0; +} + +void test_naive(int batch_size, std::string model_path){ + PaddlePredictor* pres[2]; + + NativeConfig config = GetConfig(); + // config.model_dir = model_path; + auto predictor0 = CreatePaddlePredictor(config); + auto predictor1 = CreatePaddlePredictor(config); + pres[0] = predictor0.get(); + pres[1] = predictor1.get(); + + int height = 449; + int width = 581; + + std::vector data; + for (int i = 0; i < 3 * height * width; i++) { + data.push_back(0); + } + + PaddleTensor tensor; + tensor.shape = std::vector({batch_size, 3, height, width}); + tensor.data.Resize(sizeof(float) * batch_size * 3 * height * width); + std::copy(data.begin(), data.end(), static_cast(tensor.data.data())); + tensor.dtype = PaddleDType::FLOAT32; + std::vector paddle_tensor_feeds(1, tensor); + + constexpr int num_jobs = 5; // each job run 1 batch + std::vector threads; + for (int tid = 0; tid < num_jobs; ++tid) { + threads.emplace_back([&, tid]() { + auto predictor = pres[tid]; + std::vector local_outputs; + for(size_t i = 0; i < 1000; i++) { + ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &local_outputs)); + std::cout << "run: " << tid << std::endl; + } + ASSERT_EQ(local_outputs.size(), 1UL); + }); + } + for (int i = 0; i < num_jobs; ++i) { + threads[i].join(); + } +} + +//TEST(alexnet, naive) { +// test_naive(1 << 0, "./trt_models/vgg19"); +//} + +} // namespace paddle + +int main(int argc, char** argv) { + paddle::test_naive(1 << 0, ""); +} + diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc index ca6cd8669..08a10757e 100644 --- a/paddle/fluid/operators/batch_norm_op.cu.cc +++ b/paddle/fluid/operators/batch_norm_op.cu.cc @@ -141,6 +141,27 @@ class BatchNormKernel bias->template data>(), est_mean->template data>(), est_var->template data>(), epsilon)); + + VLOG(3) << "before tensor copy"; + Tensor mean_, var_, x_, y_; + framework::TensorCopy(*est_mean, platform::CPUPlace(), dev_ctx, &mean_); + framework::TensorCopy(*est_var, platform::CPUPlace(), dev_ctx, &var_); + framework::TensorCopy(*x, platform::CPUPlace(), dev_ctx, &x_); + framework::TensorCopy(*y, platform::CPUPlace(), dev_ctx, &y_); + VLOG(3) << "after tensor copy"; + auto check_tensor = [&](const Tensor& check) { + float sum = .0; + for(size_t i=0; i < check.numel(); ++i) { + sum += check.data()[i]; + } + return sum; + }; + VLOG(3) << "BatchNormKernel"; + VLOG(3) << "mean" << check_tensor(mean_); + VLOG(3) << "var" << check_tensor(var_); + VLOG(3) << "x" << check_tensor(x_); + VLOG(3) << "y" << check_tensor(y_); + } else { // Run training mode. 
// obtain running mean and running inv var, and see if we need to diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index e2f98164b..ccc497aff 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include +#include #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" @@ -34,6 +35,7 @@ class LoadCombineOp : public framework::OperatorBase { auto load_as_fp16 = Attr("load_as_fp16"); std::ifstream fin(filename, std::ios_base::in | std::ios_base::binary); + //std::ifstream fin(filename, std::ios_base::in); PADDLE_ENFORCE(!fin.bad(), "Cannot open file %s for load_combine op", filename); @@ -46,7 +48,7 @@ class LoadCombineOp : public framework::OperatorBase { auto &dev_ctx = *pool.Get(place); for (size_t i = 0; i < out_var_names.size(); i++) { - VLOG(3) << "load " << out_var_names[i]; + VLOG(3) << "load variable " << out_var_names[i]; auto *out_var = scope.FindVar(out_var_names[i]); PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found", @@ -61,6 +63,13 @@ class LoadCombineOp : public framework::OperatorBase { // Get data from fin to tensor DeserializeFromStream(fin, tensor, dev_ctx); VLOG(3) << "after deserialization"; + framework::Tensor check; + framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check); + float sum = .0; + for(size_t i=0; i < check.numel(); ++i) { + sum += check.data()[i]; + } + VLOG(3) << "sum result" << sum; auto in_dtype = framework::ToDataType(tensor->type()); auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; @@ -80,6 +89,7 @@ class LoadCombineOp : public framework::OperatorBase { tensor = out_var->GetMutable(); tensor->set_lod(fp16_tensor.lod()); tensor->ShareDataWith(fp16_tensor); + } VLOG(3) << "load " << out_var_names[i] << " finished"; } -- GitLab
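For reference, a minimal sketch of how a client might drive the C interface exported by inference_icnet.h. This is not part of the patch: the model directory ./lb, the 3 x 449 x 581 input geometry, and the per-pixel int64 output buffer simply mirror the values used in test.cc above, and a real client would substitute its own paths and shapes.

#include <cstdint>
#include <vector>
#include "inference_icnet.h"

int main() {
  // Assumed model location and GPU settings, mirroring test.cc.
  void* h = init_predictor("./lb/__model__", "./lb/__params__",
                           0.3f, /*use_gpu=*/true, /*device=*/0);

  const int channel = 3, height = 449, width = 581, batch = 1;
  std::vector<float> input(batch * channel * height * width, 0.0f);

  // The predictor writes one int64 label per input pixel into a buffer the
  // caller owns; output_length is reported in bytes.
  std::vector<int64_t> output(height * width, 0);
  int64_t* out_ptr = output.data();
  int out_bytes = 0;

  predict(h, input.data(), channel, height, width, &out_ptr, &out_bytes, batch);

  destory_predictor(h);
  return 0;
}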