lb. add debug output

c6dcffc6 · dzhwinter · 607080e8 · c6dcffc6 · c6dcffc6 · c6dcffc6
9 changed files
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -333,9 +333,49 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
  return result;
 }
+// void CheckResult(const std::string op_type, ExecutorPrepareContext* ctx, Scope* local_scope) {
+//     VLOG(3) << "before checking result";
+//   auto& dev_ctx = *platform::DeviceContextPool::Instance().Get(place_);
+//   std::vector<std::string> outputs;
+//   auto& block = ctx->prog_.Block(0);
+//   bool found = false;
+//   framework::OpDesc* myop = nullptr;
+//   for(auto& op : block.AllOps()) {
+//     if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() == "feed") return;
+//     if (op->Type() == op_type) {
+//         found = true;
+//         myop = op;
+//         break;
+//       }
+//     }
+//   }
+//   if(!found) {
+//     VLOG(3) << "not found op!";
+//     return;
+//   }
+//     auto* op = myop;
+//      VLOG(3) << "start op output" << op->Type();
+//     for(auto var_name: op->OutputArgumentNames()) {
+//       auto* var = local_scope->Var(var_name);
+//       auto* var_desc = block.FindVar(var_name);
+//       if (var_desc->Persistable()) continue;
+//       auto* tensor = var->GetMutable<framework::LoDTensor>();
+//       framework::Tensor check;
+//       VLOG(3) << "before tensor copy";
+//       framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
+//       VLOG(3) << "after tensor copy";
+//       float sum = .0;
+//       for(size_t i=0; i < check.numel(); ++i) {
+//           sum += check.data<float>()[i];
+//       }
+//       VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum " << sum;
+//   VLOG(3) << "after checking result";
+// }
 void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                                  bool create_local_scope, bool create_vars,
                                  bool keep_kids) {
+  VLOG(3) << "RunPreparedContext inside";
  Scope* local_scope = scope;
  if (create_vars) {
    if (create_local_scope) {
@@ -346,13 +386,73 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  for (auto& op : ctx->ops_) {
    op->Run(*local_scope, place_);
+   // CheckResult(op->Type(), ctx, local_scope);
    if (FLAGS_benchmark) {
      VLOG(2) << "Memory used after operator " + op->Type() + " running: "
              << memory::memory_usage(place_);
    }
  }
  platform::DeviceContextPool::Instance().Get(place_)->Wait();
+  VLOG(3) << "start checking";
+    auto& dev_ctx = *platform::DeviceContextPool::Instance().Get(place_);
+  std::vector<std::string> outputs;
+  auto& block = ctx->prog_.Block(0);
+  for(auto& op : block.AllOps()) {
+    if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() == "feed") continue;
+    // for(auto& real_op : ctx->ops_) {
+    //   if(real_op->Type() == op->Type()) {
+    //     VLOG(3) << real_op->Type() << " " <<place_ << " " << real_op->DebugStringEx(local_scope);
+    //   }
+    // }
+     //VLOG(3) << "start op output" << op->Type();
+        for(auto var_name: op->InputArgumentNames()) {
+      auto* var = local_scope->Var(var_name);
+      auto* var_desc = block.FindVar(var_name);
+      if (var_desc->Persistable()) continue;
+      auto* tensor = var->GetMutable<framework::LoDTensor>();
+      framework::Tensor check;
+      VLOG(3) << "before tensor copy";
+      framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
+      VLOG(3) << "after tensor copy";
+      float sum = .0;
+      for(size_t i=0; i < check.numel(); ++i) {
+          sum += check.data<float>()[i];
+      }
+      VLOG(3) << "op " << op->Type() << " input var " << var_name << " sum " << sum;
+    }
+    VLOG(3) << "op " << op->Type() << "input finished";
+    for(auto var_name: op->OutputArgumentNames()) {
+      auto* var = local_scope->Var(var_name);
+      auto* var_desc = block.FindVar(var_name);
+      if (var_desc->Persistable()) continue;
+      auto* tensor = var->GetMutable<framework::LoDTensor>();
+      framework::Tensor check;
+      VLOG(3) << "before tensor copy";
+      if(op->Type() == "batch_norm" && platform::is_gpu_place(place_)) {
+        VLOG(3) << "op " << op->Type() << " output var " << var_name << " " << tensor->numel();
+        tensor->mutable_data<float>(place_);
+         framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
+      } else {
+         framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
+      }
+      VLOG(3) << "after tensor copy";
+      float sum = .0;
+      for(size_t i=0; i < check.numel(); ++i) {
+          sum += check.data<float>()[i];
+      }
+      VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum " << sum;
+    }
+  }
+  VLOG(3) << "after checking result";
  if (local_scope != scope) {
    scope->DeleteScope(local_scope);
  } else {

--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -46,7 +46,7 @@ if(WITH_GPU)
  endif(NOT WIN32)
 endif()
-include_directories("D:/Paddle/")
+include_directories("E:/Paddle/")
 include_directories("${PADDLE_LIB}")
 include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
 include_directories("${PADDLE_LIB}/third_party/install/glog/include")
@@ -72,7 +72,12 @@ link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
 link_directories("${PADDLE_LIB}/paddle/fluid/inference")
 # add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
-add_library(${DEMO_NAME} ${DEMO_NAME}.cc)
+ # add_library(${DEMO_NAME} ${DEMO_NAME}.cc)
+ add_library(${DEMO_NAME} SHARED  ${DEMO_NAME}.cc)
+add_executable(real_data_icnet_tester real_data_icnet_tester.cc)
+add_executable(test test.cc)
+add_executable(thread_icnet_test thread_icnet_test.cc)
 if(WITH_MKL)
  include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} 
@@ -89,7 +94,11 @@ endif()
 # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
 if(WITH_STATIC_LIB)
  set(DEPS
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
+#  	   ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}
+	  D:/Paddle/bazel-dll/fluid_install_dir/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}
+	#       E:/Paddle/build/paddle/fluid/inference/api/Release/libpaddle_inference_api${CMAKE_STATIC_LIBRARY_SUFFIX}
+	  	  D:/Paddle/bazel-dll/paddle/fluid/inference/api/Release/libpaddle_inference_api${CMAKE_STATIC_LIBRARY_SUFFIX}
+   )
 else()
  set(DEPS
      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
@@ -121,3 +130,9 @@ if(WITH_GPU)
 endif()
 target_link_libraries(${DEMO_NAME} ${DEPS})
+target_link_libraries(test ${DEMO_NAME} )
+target_link_libraries(thread_icnet_test ${DEPS})
+target_link_libraries(real_data_icnet_tester ${DEPS})
+target_compile_definitions(${DEMO_NAME} PRIVATE "API_DEFINITION")
--- a/paddle/fluid/inference/api/demo_ci/inference_icnet.cc
+++ b/paddle/fluid/inference/api/demo_ci/inference_icnet.cc
@@ -19,49 +19,42 @@
 #include <algorithm>
 #include <vector>
 #include <string>
+#include <memory>
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "inference_icnet.h"
-namespace paddle {
-std::string DIRNAME = "./infer_model";
-std::string DATA = "./test-image.txt"; 
-const int C = 3; // image channel
-const int H = 449; // image height
-const int W = 581; // image width
 // 数据格式
 // "<space splitted floats as data>\t<space splitted ints as shape"
 // 1. 存储为float32格式。
 // 2. 必须减去均值。 CHW三个通道为 mean = 112.15, 109.41, 185.42
+using namespace paddle;
-struct Record
+class Predictor {
-{
+private:
+	std::unique_ptr<PaddlePredictor> predictor;
+	struct Record
+	{
 		std::vector<float> data;
 		std::vector<int32_t> shape;
-};
+	};
-NativeConfig GetConfig() {
+	const int C = 3; // image channel
-  NativeConfig config;
+	const int H = 449; // image height
-  config.prog_file=DIRNAME + "/__model__";
+	const int W = 581; // image width
-  config.param_file=DIRNAME + "/__params__";
-  config.fraction_of_gpu_memory = 0.0;
-  config.use_gpu = true;
-  config.device = 0;
-  return config;
-}
-using Time = decltype(std::chrono::high_resolution_clock::now());
+	using Time = decltype(std::chrono::high_resolution_clock::now());
-Time time() { return std::chrono::high_resolution_clock::now(); };
+	Time time() { return std::chrono::high_resolution_clock::now(); };
-double time_diff(Time t1, Time t2) {
+	double time_diff(Time t1, Time t2) {
 		typedef std::chrono::microseconds ms;
 		auto diff = t2 - t1;
 		ms counter = std::chrono::duration_cast<ms>(diff);
 		return counter.count() / 1000.0;
-}
+	}
-static void split(const std::string& str, char sep,
+	static void split(const std::string& str, char sep,
 		std::vector<std::string>* pieces) {
 		pieces->clear();
 		if (str.empty()) {
@@ -77,9 +70,9 @@ static void split(const std::string& str, char sep,
 		if (!str.substr(pos).empty()) {
 			pieces->push_back(str.substr(pos));
 		}
-}
+	}
-Record ProcessALine(const std::string& line) {
+	Record ProcessALine(const std::string& line) {
 		std::vector<std::string> columns;
 		split(line, '\t', &columns);
@@ -96,62 +89,74 @@ Record ProcessALine(const std::string& line) {
 			record.shape.push_back(std::stoi(s));
 		}
 		return record;
-}
+	}
+public:
+	Predictor (const char* prog_file,
+		const char* param_file, const float fraction_of_gpu_memory,
+		const bool use_gpu, const int device) {
-void test_naive(int batch_size){
+		NativeConfig config;
-  NativeConfig config = GetConfig();
+		config.prog_file = prog_file;
-  auto predictor = CreatePaddlePredictor<NativeConfig>(config);
+		config.param_file = param_file;
-  int height = H;
+		config.fraction_of_gpu_memory = fraction_of_gpu_memory;
-  int width = W;
+		config.use_gpu = use_gpu;
-  int channel = C;
+		config.device = device;
-  int num_sum = height * width * channel * batch_size;
+		predictor = CreatePaddlePredictor<NativeConfig>(config);
+	}
-  // 1. use fake data
+	void predict(float* input, const int channel, const int height, const int width, 
+		int64_t** output, int* output_length, int batch_size) {
 		std::vector<float> data;
-  for(int i = 0; i < num_sum; i++) {
+		int intput_length = channel * height * width * batch_size;
-    data.push_back(0.0);
+		for (int i = 0; i < intput_length; i++) {
+			data.push_back(*((float*)input + i));
 		}
+		// initialize the input data 
 		PaddleTensor tensor;
-  tensor.shape = std::vector<int>({batch_size, channel, height, width});
+		tensor.shape = std::vector<int>({ batch_size, channel, height, width });
 		tensor.data.Resize(sizeof(float) * batch_size * channel * height * width);
 		std::copy(data.begin(), data.end(), static_cast<float*>(tensor.data.data()));
-  tensor.dtype = PaddleDType::FLOAT32;
-  // 2. read data from file
-  // std::string line;
-  // std::ifstream file(DATA);
-  // std::getline(file, line);
-  // auto record = ProcessALine(line);
-  // file.close();
-  // PaddleTensor tensor;
-  // tensor.shape = record.shape;
-  // tensor.data =
-  //     PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
+		tensor.dtype = PaddleDType::FLOAT32;
 		std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
-  PaddleTensor tensor_out;
+		// initialize the output data
+		PaddleTensor tensor_out;
 		std::vector<PaddleTensor> outputs(1, tensor_out);
-  predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
-  auto time1 = time(); 
-  for(size_t i = 0; i < 2; i++) {
-    std::cout << "Pass " << i << "predict";
 		predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
+		*output_length = (int)outputs[0].data.length();
+		std::memcpy(static_cast<void *>(*output), outputs[0].data.data(), outputs[0].data.length());
+		int64_t sum_out = 0;
+		for(int i=0; i < outputs[0].data.length()/sizeof(int64_t); ++i) {
+			int64_t item = static_cast<int64_t*>(outputs[0].data.data())[i];
+			sum_out += item;
+			if (item != 0) {
+				std::cout << item << std::endl;
+			}
 		}
-  auto time2 = time(); 
+		std::cout << "sum_out" << sum_out << std::endl;
-  std::ofstream ofresult("naive_test_result.txt", std::ios::app);
+	}
+};
-  std::cout <<"batch: " << batch_size << " predict cost: " << time_diff(time1, time2) / 100.0 << "ms" << std::endl;
+// Builds a Predictor from the given model/parameter files and device
+// settings and returns it as an opaque handle. The object is heap allocated
+// here; destory_predictor() is the matching release call.
+API_REFERENCE void * init_predictor(const char* prog_file,
-  std::cout << outputs.size() << std::endl;
+	const char* param_file, const float fraction_of_gpu_memory,
+	const bool use_gpu, const int device) {
+	Predictor* created = new Predictor(prog_file, param_file,
+		fraction_of_gpu_memory, use_gpu, device);
+	return created;
+}
+// C entry point that forwards one inference request to the Predictor behind
+// `handle`.
+//
+//   handle         opaque pointer obtained from init_predictor()
+//   input          batch_size * channel * height * width floats
+//   output         *output must point to a caller-allocated buffer; the
+//                  result bytes are copied into it
+//   output_length  receives the size of the result in bytes
+//
+// A null handle trips the assert in debug builds; in release builds the call
+// reports an empty result instead of dereferencing the null pointer.
+API_REFERENCE void predict(void* handle, float* input, const int channel, const int height, const int width, 
+	int64_t** output, int* output_length, int batch_size) {
+	assert(handle != nullptr);
+	// assert() compiles away under NDEBUG, so keep an explicit guard as well.
+	if (handle == nullptr) {
+		if (output_length != nullptr) {
+			*output_length = 0;
+		}
+		return;
+	}
+	static_cast<Predictor*>(handle)->predict(input, channel, height, width, output, output_length, batch_size);
 }
-}  // namespace paddle
-int main(int argc, char** argv) {
+// Releases a predictor created by init_predictor(). Passing a null handle is
+// a no-op. The caller's own copy of the pointer is not modified and must not
+// be used afterwards.
+API_REFERENCE void destory_predictor(void *handle) {
-  paddle::test_naive(1 << 0);
+	// Cast back to the concrete type before deleting: `delete` on a void* is
+	// undefined behaviour and never runs ~Predictor(), so the owned
+	// PaddlePredictor would not be released.
-  return 0;
+	delete static_cast<Predictor*>(handle);
 }
--- a/paddle/fluid/inference/api/demo_ci/inference_icnet.h
+++ b/paddle/fluid/inference/api/demo_ci/inference_icnet.h
+// C-callable interface of the ICNet inference demo library.
+#pragma once
+#include <cstdint>  // int64_t used by predict()
+#ifdef _WIN32
+#ifdef inference_icnet_EXPORTS
+#define API_REFERENCE extern "C" __declspec(dllexport) 
+#else
+#define API_REFERENCE extern "C" __declspec(dllimport) 
+#endif
+#else
+// Keep C linkage on every platform so the symbol names are unmangled
+// everywhere, not only on Windows.
+#define API_REFERENCE extern "C"
+#endif
+// Creates a predictor for the given model/parameter files and returns it as
+// an opaque handle.
+API_REFERENCE void * init_predictor(const char* prog_file,
+	const char* param_file, const float fraction_of_gpu_memory,
+	const bool use_gpu, const int device);
+// Runs inference on `input` (batch_size * channel * height * width floats).
+// *output must point to a caller-allocated buffer that receives the result;
+// *output_length receives the result size in bytes.
+API_REFERENCE void predict(void* handle, float* input, const int channel, const int height,
+	const int width, int64_t** output, int* output_length, int batch_size);
+// Releases a handle obtained from init_predictor().
+API_REFERENCE void destory_predictor(void *handle);
--- a/paddle/fluid/inference/api/demo_ci/real_data_icnet_tester.cc
+++ b/paddle/fluid/inference/api/demo_ci/real_data_icnet_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#define GOOGLE_GLOG_DLL_DECL
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+namespace paddle {
+// DEFINE_string(dirname, "./lb",
+//               "Directory of the inference model.");
+// Builds the NativeConfig used by this tester: program and parameter files
+// from the "lb" directory, GPU device 0, and 0.8 as the GPU memory fraction.
+NativeConfig GetConfig() {
+  NativeConfig native_cfg;
+  native_cfg.use_gpu = true;
+  native_cfg.device = 0;
+  native_cfg.fraction_of_gpu_memory = 0.8;
+  native_cfg.prog_file = "lb/__model__";
+  native_cfg.param_file = "lb/__params__";
+  return native_cfg;
+}
+// Time point type produced by the high-resolution clock.
+using Time = std::chrono::high_resolution_clock::time_point;
+// Returns the current reading of the high-resolution clock.
+Time time() {
+  const Time now = std::chrono::high_resolution_clock::now();
+  return now;
+}
+// Returns the interval from t1 to t2 in milliseconds. The interval is first
+// truncated to whole microseconds.
+double time_diff(Time t1, Time t2) {
+  using Micros = std::chrono::microseconds;
+  const auto whole_us = std::chrono::duration_cast<Micros>(t2 - t1).count();
+  return whole_us / 1000.0;
+}
+// Runs the model on the first image listed in "new_file.list" and appends
+// the int64 output values to "naive_test_result.txt".
+//
+// The image file must hold exactly batch_size * 3 * 449 * 581
+// whitespace-separated floats; any other count is reported and the image is
+// skipped, because the values are copied into a tensor buffer of that size.
+void test_naive(int batch_size){
+  NativeConfig config = GetConfig();
+  auto predictor = CreatePaddlePredictor<NativeConfig>(config);
+  const int height = 449;
+  const int width = 581;
+  const size_t expected_count =
+      static_cast<size_t>(batch_size) * 3 * height * width;
+  // =============read file list =============
+  std::ifstream infile("new_file.list");
+  std::string temp_s;
+  std::vector<std::string> all_files;
+  // Loop on the extraction itself. Testing eof() first never terminates when
+  // the file could not be opened, and appends the last entry twice otherwise.
+  while (infile >> temp_s) {
+    all_files.push_back(temp_s);
+  }
+  infile.close();
+  if (all_files.empty()) {
+    std::cerr << "new_file.list is missing or empty" << std::endl;
+    return;
+  }
+  // =============read file list =============
+  // Only the first listed image is processed.
+  for (size_t f_k = 0; f_k < 1; f_k ++) {
+    std::ifstream in_img(all_files[f_k]);
+    std::cout << all_files[f_k] << std::endl;
+    if (!in_img.is_open()) {
+      std::cerr << "cannot open " << all_files[f_k] << std::endl;
+      continue;
+    }
+    float temp_v;
+    float sum_n = 0.0;
+    std::vector<float> data;
+    while (in_img >> temp_v) {
+      data.push_back(temp_v);
+      sum_n += temp_v;
+    }
+    in_img.close();
+    std::cout << "sum: " << sum_n << std::endl;
+    // The copy below writes data.size() floats into a buffer sized for
+    // expected_count floats, so a mismatch must not reach it.
+    if (data.size() != expected_count) {
+      std::cerr << "expected " << expected_count << " values but read "
+                << data.size() << std::endl;
+      continue;
+    }
+    PaddleTensor tensor;
+    tensor.shape = std::vector<int>({batch_size, 3, height, width});
+    tensor.data.Resize(sizeof(float) * expected_count);
+    std::copy(data.begin(), data.end(), static_cast<float*>(tensor.data.data()));
+    tensor.dtype = PaddleDType::FLOAT32;
+    std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
+    PaddleTensor tensor_out;
+    std::vector<PaddleTensor> outputs(1, tensor_out);
+    // Warm-up run, excluded from the timing below.
+    predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
+    std::cout << "start predict123:" << std::endl;
+    const int kRuns = 1;
+    auto time1 = time();
+    for (int i = 0; i < kRuns; i++) {
+      predictor->Run(paddle_tensor_feeds, &outputs, batch_size);
+    }
+    auto time2 = time();
+    std::ofstream ofresult("naive_test_result.txt", std::ios::app);
+    // time_diff() already returns milliseconds; report the mean per run.
+    std::cout <<"batch: " << batch_size << " predict cost: " << time_diff(time1, time2) / kRuns << "ms" << std::endl;
+    std::cout << outputs.size() << std::endl;
+    if (outputs.empty()) {
+      std::cerr << "predictor produced no output tensor" << std::endl;
+      continue;
+    }
+    int64_t * data_o = static_cast<int64_t*>(outputs[0].data.data());
+    int64_t sum_out = 0;
+    for (size_t j = 0; j < outputs[0].data.length() / sizeof(int64_t); ++j) {
+      ofresult << std::to_string(data_o[j]) << " ";
+      sum_out += data_o[j];
+    }
+    std::cout << "sum_out " << sum_out << std::endl;
+    ofresult << std::endl;
+    ofresult.close();
+  }
+}
+}  // namespace paddle
+// Entry point: runs the tester once with a batch size of 1. Command-line
+// flags are not parsed (the gflags call stays disabled).
+int main(int argc, char** argv) {
+  const int batch_size = 1;
+  paddle::test_naive(batch_size);
+  return 0;
+}
--- a/paddle/fluid/inference/api/demo_ci/test.cc
+++ b/paddle/fluid/inference/api/demo_ci/test.cc
+#include<windows.h>
+#include <fstream>
+#include "inference_icnet.h"
+#include <thread>
+#include <vector>
+#include <string>
+#include <iostream>
+#include <sstream>
+using namespace std;
+// Parses the leading value of type Type from `str` using stream extraction.
+// Returns a value-initialised Type (0 for arithmetic types) when nothing can
+// be parsed, including for an empty string.
+template <class Type>
+Type stringToNum(const std::string& str)
+{
+	std::istringstream iss(str);
+	// Value-initialise: for empty input the stream sentry fails before
+	// operator>> stores anything, which would otherwise leave `num`
+	// indeterminate.
+	Type num{};
+	iss >> num;
+	return num;
+}
+// Feeds one pre-processed image through the exported C API
+// (init_predictor / predict / destory_predictor), prints the input and output
+// sums, and writes the int64 output values to "./1.png.output.txt".
+//
+// The input file is read one value per line and must contain at least
+// 3 * 449 * 581 lines; otherwise the image is skipped.
+void test_imgs() {
+	void *h = init_predictor("./lb/__model__", "./lb/__params__", 0.3f, true, 0);
+	std::ifstream infile("new_file.list");
+	std::ofstream ofs("./1.png.output.txt");
+	std::string temp_s;
+	std::vector<std::string> all_files;
+	// Loop on the extraction itself. Testing eof() first never terminates when
+	// the list cannot be opened, and appends the last entry twice otherwise.
+	while (infile >> temp_s) {
+		all_files.push_back(temp_s);
+	}
+	infile.close();
+	// =============read file list =============
+	for (size_t f_k = 0; f_k < 1; f_k++) {
+		// NOTE(review): the input path is hard-coded; all_files is read above
+		// but not used yet.
+		std::string mypath = "D:\\Paddle\\paddle\\fluid\\inference\\api\\demo_ci\\build\\Release\\1.png.txt";
+		std::cout << "file" << mypath << std::endl;
+		std::ifstream in_img(mypath);
+		if (!in_img.is_open()) {
+			std::cout << "open failed" << std::endl;
+			continue;  // nothing to feed to the predictor
+		}
+		const int size = 3 * 449 * 581 * 1;
+		// Vectors own the buffers: the raw new[] versions leaked `data` and
+		// released the output array with scalar delete.
+		std::vector<float> data(size);
+		std::string value;
+		double sum_input = .0;
+		bool complete = true;
+		for (int i = 0; i < size; i++) {
+			if (!getline(in_img, value, '\n')) {
+				complete = false;
+				break;
+			}
+			double v = stringToNum<double>(value);
+			data[i] = static_cast<float>(v);
+			sum_input += v;
+		}
+		in_img.close();
+		if (!complete) {
+			std::cout << "input file holds fewer than " << size << " values" << std::endl;
+			continue;
+		}
+		std::cout << "sum_input" << sum_input << std::endl;
+		const int SIZE = 449 * 581 * 1;
+		std::vector<int64_t> out(SIZE, 0);
+		int64_t * p = out.data();
+		int out_size = 0;
+		// predict() copies the result into the buffer *p points at and reports
+		// its size in bytes through out_size.
+		predict(h, data.data(), 3, 449, 581, &p, &out_size, 1);
+		std::cout << "out_size = " << out_size << std::endl;
+		double out_sum = .0;
+		const size_t out_count =
+			out_size > 0 ? static_cast<size_t>(out_size) / sizeof(int64_t) : 0;
+		// Never read past the buffer we own, whatever size was reported.
+		for (size_t i = 0; i < out_count && i < out.size(); i++) {
+			out_sum += out[i];
+			ofs << out[i] << " ";
+		}
+		ofs.close();
+		std::cout << "inferece out sum" << out_sum << std::endl;
+	}
+	destory_predictor(h);
+}
+// Entry point: runs the single-image check once. An earlier variant that
+// drove several predictors from std::thread workers was left disabled by the
+// author and is not part of this flow.
+int main(int argc, char** argv) {
+	test_imgs();
+	return 0;
+}
--- a/paddle/fluid/inference/api/demo_ci/thread_icnet_test.cc
+++ b/paddle/fluid/inference/api/demo_ci/thread_icnet_test.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#define GOOGLE_GLOG_DLL_DECL
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+//#include <gtest/gtest.h>
+#include <chrono>
+#include <fstream>
+#include <iostream>
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include <thread>  // NOLINT
+// Local stand-ins for the gtest-style macros (the gtest include above is
+// commented out). ASSERT_TRUE only evaluates its argument; ASSERT_EQ maps to
+// assert(). Arguments are parenthesised so that operands such as `a & b`
+// compare as written.
+#define ASSERT_TRUE(x) (x)
+#define ASSERT_EQ(x, y) assert((x) == (y))
+namespace paddle {
+// DEFINE_string(dirname, "./LB_icnet_model",
+//               "Directory of the inference model.");
+// Builds the NativeConfig shared by every predictor in this test: program
+// and parameter files from "./dzh_lb", GPU device 0, and 0.08 as the GPU
+// memory fraction.
+NativeConfig GetConfig() {
+  NativeConfig native_cfg;
+  native_cfg.use_gpu = true;
+  native_cfg.device = 0;
+  native_cfg.fraction_of_gpu_memory = 0.08;
+  native_cfg.prog_file = "./dzh_lb/__model__";
+  native_cfg.param_file = "./dzh_lb/__params__";
+  return native_cfg;
+}
+// Time point type produced by the high-resolution clock.
+using Time = std::chrono::high_resolution_clock::time_point;
+// Returns the current reading of the high-resolution clock.
+Time time() {
+  const Time now = std::chrono::high_resolution_clock::now();
+  return now;
+}
+// Returns the interval from t1 to t2 in milliseconds. The interval is first
+// truncated to whole microseconds.
+double time_diff(Time t1, Time t2) {
+  using Micros = std::chrono::microseconds;
+  const auto whole_us = std::chrono::duration_cast<Micros>(t2 - t1).count();
+  return whole_us / 1000.0;
+}
+// Starts num_jobs threads, each driving its own predictor for 1000 runs on
+// an all-zero input of shape {batch_size, 3, 449, 581}.
+//
+// model_path is currently unused: the model location comes from GetConfig().
+void test_naive(int batch_size, std::string model_path){
+  constexpr int num_jobs = 5;  // one predictor and one thread per job
+  NativeConfig config = GetConfig();
+  // config.model_dir = model_path;
+  // Create exactly one predictor per job. The previous fixed array held two
+  // predictors while five threads indexed it by thread id, reading past the
+  // end of the array.
+  using PredictorPtr = decltype(CreatePaddlePredictor<NativeConfig>(config));
+  std::vector<PredictorPtr> predictors;
+  for (int i = 0; i < num_jobs; ++i) {
+    predictors.push_back(CreatePaddlePredictor<NativeConfig>(config));
+  }
+  int height = 449;
+  int width = 581;
+  // Size the zero input for the whole batch so the tensor buffer is fully
+  // initialised for any batch_size.
+  std::vector<float> data(static_cast<size_t>(batch_size) * 3 * height * width, 0);
+  PaddleTensor tensor;
+  tensor.shape = std::vector<int>({batch_size, 3, height, width});
+  tensor.data.Resize(sizeof(float) * data.size());
+  std::copy(data.begin(), data.end(), static_cast<float*>(tensor.data.data()));
+  tensor.dtype = PaddleDType::FLOAT32;
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_jobs; ++tid) {
+    // Reference captures are safe: every thread is joined before return.
+    threads.emplace_back([&, tid]() {
+      auto predictor = predictors[tid].get();
+      std::vector<PaddleTensor> local_outputs;
+      for(size_t i = 0; i < 1000; i++) {
+        ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &local_outputs));
+        std::cout << "run: " << tid << std::endl; 
+      }
+      ASSERT_EQ(local_outputs.size(), 1UL);
+    });
+  }
+  for (auto& worker : threads) {
+    worker.join();
+  }
+}
+//TEST(alexnet, naive) {
+//  test_naive(1 << 0, "./trt_models/vgg19");
+//}
+}  // namespace paddle
+// Entry point: launches the multi-threaded test with a batch size of 1 and
+// an empty model path.
+int main(int argc, char** argv) {
+	const int batch_size = 1;
+	paddle::test_naive(batch_size, "");
+	return 0;
+}
--- a/paddle/fluid/operators/batch_norm_op.cu.cc
+++ b/paddle/fluid/operators/batch_norm_op.cu.cc
@@ -141,6 +141,27 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
          bias->template data<BatchNormParamType<T>>(),
          est_mean->template data<BatchNormParamType<T>>(),
          est_var->template data<BatchNormParamType<T>>(), epsilon));
+      VLOG(3) << "before tensor copy";
+      Tensor mean_, var_, x_, y_;
+      framework::TensorCopy(*est_mean, platform::CPUPlace(), dev_ctx, &mean_);
+      framework::TensorCopy(*est_var, platform::CPUPlace(), dev_ctx, &var_);
+      framework::TensorCopy(*x, platform::CPUPlace(), dev_ctx, &x_);
+      framework::TensorCopy(*y, platform::CPUPlace(), dev_ctx, &y_);
+      VLOG(3) << "after tensor copy";
+      auto check_tensor = [&](const Tensor& check) {
+      float sum = .0;
+      for(size_t i=0; i < check.numel(); ++i) {
+          sum += check.data<float>()[i];
+      }
+      return sum;
+      };
+      VLOG(3) << "BatchNormKernel";
+      VLOG(3) << "mean" << check_tensor(mean_);
+      VLOG(3) << "var" << check_tensor(var_);
+      VLOG(3) << "x" << check_tensor(x_);
+      VLOG(3) << "y" << check_tensor(y_);
    } else {
      // Run training mode.
      // obtain running mean and running inv var, and see if we need to

--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <fstream>
+#include <vector>
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -34,6 +35,7 @@ class LoadCombineOp : public framework::OperatorBase {
    auto load_as_fp16 = Attr<bool>("load_as_fp16");
    std::ifstream fin(filename, std::ios_base::in | std::ios_base::binary);
+    //std::ifstream fin(filename, std::ios_base::in);
    PADDLE_ENFORCE(!fin.bad(),
                   "Cannot open file %s for load_combine op", filename);
@@ -46,7 +48,7 @@ class LoadCombineOp : public framework::OperatorBase {
    auto &dev_ctx = *pool.Get(place);
    for (size_t i = 0; i < out_var_names.size(); i++) {
-      VLOG(3) << "load " << out_var_names[i];
+      VLOG(3) << "load variable " << out_var_names[i];
      auto *out_var = scope.FindVar(out_var_names[i]);
      PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
@@ -61,6 +63,13 @@ class LoadCombineOp : public framework::OperatorBase {
      // Get data from fin to tensor
      DeserializeFromStream(fin, tensor, dev_ctx); 
      VLOG(3) << "after deserialization";
+      framework::Tensor check;
+      framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
+      float sum = .0;
+      for(size_t i=0; i < check.numel(); ++i) {
+          sum += check.data<float>()[i];
+      }
+      VLOG(3) << "sum result" << sum;
      auto in_dtype = framework::ToDataType(tensor->type());
      auto out_dtype =
          load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
@@ -80,6 +89,7 @@ class LoadCombineOp : public framework::OperatorBase {
        tensor = out_var->GetMutable<framework::LoDTensor>();
        tensor->set_lod(fp16_tensor.lod());
        tensor->ShareDataWith(fp16_tensor);
      }
      VLOG(3) << "load " << out_var_names[i] << " finished";
    }