diff --git a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt
index 49dbb0106b634b35898c36dde4a5b27698f3cbb2..fe2bd27524fbfc7f3d2b68f5e3da8feefd46557a 100644
--- a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt
@@ -2,7 +2,7 @@ cc_library(code_generator
   SRCS operation.cc code_generator.cc code_generator_helper.cc
   DEPS graph subgraph_detector)
 if(WITH_GPU)
-# cc_test(test_code_generator SRCS code_generator_tester.cc DEPS code_generator device_code lod_tensor graph_viz_pass)
+  cc_test(test_code_generator SRCS code_generator_tester.cc DEPS code_generator device_code lod_tensor graph_viz_pass)
 endif()
 
 cc_library(fusion_group_pass
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.cc b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
index 6555a65c3bd4c21740b3d44ac18a756d50d22f00..431d3c05f6dd4b44074729716555744773f950e7 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
@@ -209,7 +209,10 @@ std::set<int> CodeGenerator::DistilIntermediateIds(
   for (size_t i = 0; i < expressions.size(); i++) {
     for (auto id : expressions[i].GetOutputIds()) {
       auto intermediate_state = expressions[i].GetIntermediateState();
-      if (intermediate_state[id]) intermediate_ids.insert(id);
+      if (intermediate_state.find(id) != intermediate_state.end() &&
+          intermediate_state[id]) {
+        intermediate_ids.insert(id);
+      }
     }
   }
   return intermediate_ids;
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
index 63197c00ff1ced99bd1e91a51a530e7999e77268..03d28277afbbb7467f638d521414d702bc8e8179 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
@@ -44,10 +44,11 @@ static inline std::string VarName(int index) {
 
 class OperationExpression {
  public:
-  explicit OperationExpression(std::string op_type, std::vector<int> input_ids,
-                               std::vector<int> output_ids,
-                               std::string rhs_type, std::string lhs_type,
-                               std::unordered_map<int, bool> intermediate_state)
+  explicit OperationExpression(
+      std::string op_type, const std::vector<int>& input_ids,
+      const std::vector<int>& output_ids, std::string rhs_type,
+      std::string lhs_type,
+      const std::unordered_map<int, bool>& intermediate_state = {})
       : op_type_(op_type),
         input_ids_(input_ids),
         output_ids_(output_ids),
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
index 92dec555cab996e2dc7549311ae813953e6ce9dd..89b05fc577bb46606ff5c43d0dd697bd7b8aed38 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
@@ -176,7 +176,6 @@ void TestMainImpl(std::string func_name, std::string code_str,
   bool is_float16 = std::type_index(typeid(T)) ==
                     std::type_index(typeid(paddle::platform::float16));
 
-  paddle::framework::InitDevices(false, {0});
   paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
   paddle::platform::CUDADeviceCode device_code(place, func_name, code_str);
   device_code.Compile(is_float16);
@@ -266,7 +265,7 @@ void TestElementwiseMain(
   }
 
   int n = cpu_tensors[0].numel();
-  if (dtype == "float16") {
+  if (dtype == "__half") {
     TestMainImpl<paddle::platform::float16>(func_name, code_str, cpu_tensors,
                                             n, input_ids, output_ids);
   } else {
@@ -275,7 +274,7 @@ void TestElementwiseMain(
   }
 
   // Check the results
-  float eps = (dtype == "float16") ? 1E-2 : 1E-5;
+  float eps = (dtype == "__half") ? 1E-2 : 1E-5;
   for (int i = 0; i < n; i++) {
     fusion_group::CheckOutput(expressions, cpu_tensors, input_ids, output_ids,
                               i, eps);
@@ -312,7 +311,7 @@ void TestMain(fusion_group::SubGraph* subgraph, std::vector<int> input_ids,
 }
 
 TEST(code_generator, elementwise) {
-  for (std::string dtype : {"float", "float16"}) {
+  for (std::string dtype : {"float", "__half"}) {
     // t2 = t0 * t1
     // t4 = t2 + t3
     // t6 = t4 - t5
@@ -342,7 +341,7 @@ TEST(code_generator, elementwise) {
 }
 
 TEST(code_generator, elementwise_grad) {
-  for (std::string dtype : {"float", "float16"}) {
+  for (std::string dtype : {"float", "__half"}) {
     // The var order: t0, t1, t2, t3, t0', t1', t2', t3'
     // t2 = t0 * t1
     // t3 = relu(t2)
@@ -407,7 +406,7 @@ std::unique_ptr<paddle::framework::ir::Graph> BuildGraph(bool backward,
 
   std::unique_ptr<paddle::framework::ir::Graph> graph(
       new paddle::framework::ir::Graph(layers.main_program()));
-  auto proto_dtype = (dtype == "float16")
+  auto proto_dtype = (dtype == "__half")
                          ? paddle::framework::proto::VarType::FP16
                          : paddle::framework::proto::VarType::FP32;
   for (auto* n : graph->Nodes()) {
@@ -463,10 +462,10 @@ std::unordered_set<paddle::framework::ir::Node*> DistilGradNodes(
 }
 
 TEST(code_generator, subgraph) {
-  for (std::string dtype : {"float", "float16"}) {
+  for (std::string dtype : {"float", "__half"}) {
     std::unique_ptr<paddle::framework::ir::Graph> graph =
         BuildGraph(false, dtype);
-    fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", true,
+    fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", false,
                                     graph->Nodes());
 
     // Expressions generated by code_generator (they may be different):
@@ -482,10 +481,10 @@ TEST(code_generator, subgraph) {
 }
 
 TEST(code_generator, subgraph_grad) {
-  for (std::string dtype : {"float", "float16"}) {
+  for (std::string dtype : {"float", "__half"}) {
     std::unique_ptr<paddle::framework::ir::Graph> graph =
         BuildGraph(true, dtype);
-    fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", true,
+    fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", false,
                                     DistilGradNodes(graph));
 
     // Expressions generated by code_generator (they may be different):
diff --git a/paddle/fluid/framework/ir/fusion_group/cuda_resources.h b/paddle/fluid/framework/ir/fusion_group/cuda_resources.h
index 6514b87b06ec1602ef5deb55e53094aeb1803f65..67838b4e3c75387e617efc3712b82dfe15382395 100644
--- a/paddle/fluid/framework/ir/fusion_group/cuda_resources.h
+++ b/paddle/fluid/framework/ir/fusion_group/cuda_resources.h
@@ -35,6 +35,7 @@ __device__ inline double Sqrt(double x) { return sqrt(x); }
 
 )";
 
+// List some built-in functions of __half implemented in cuda_fp16.hpp
 static constexpr char predefined_cuda_functions_fp16[] = R"(
 #define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
 #define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
@@ -193,6 +194,12 @@ __CUDA_FP16_DECL__ __half __hdiv(__half a, __half b) {
   return v;
 }
 
+__CUDA_FP16_DECL__ __half __hneg(const __half a)
+{
+  __half zero;
+  zero = __float2half(0.0);
+  return __hsub(zero, a);
+}
 /* Some basic arithmetic operations expected of a builtin */
 __device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); }
@@ -200,6 +207,10 @@ __device__ __forceinline__ __half operator-(const __half &lh, const __half &rh)
 __device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); }
 __device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); }
 
+/* Unary plus and inverse operators */
+__device__ __forceinline__ __half operator+(const __half &h) { return h; }
+__device__ __forceinline__ __half operator-(const __half &h) { return __hneg(h); }
+
 /* Some basic comparison operations to make it look like a builtin */
 __device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); }
 __device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hne(lh, rh); }
diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
index d70c81736dcab584312f83990a32d994f004cdcf..4dbb8ae4c6ec703c80da2f3a7a3e4bc59f7b2439 100644
--- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
+++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
@@ -30,6 +30,14 @@ namespace ir {
 void FusionGroupPass::ApplyImpl(ir::Graph* graph) const {
   FusePassBase::Init("fusion_group_pass", graph);
   if (Get<bool>("use_gpu")) {
+    // TODO(liuyiqun): enable this check.
+    // if (!platform::CUDADeviceCode::IsAvailable()) {
+    //   LOG(WARNING)
+    //       << "Disable fusion_group because CUDA Driver or NVRTC is not
+    //       available.";
+    //   return;
+    // }
+
     fusion_group::OperationMap::Init();
     int num_elementwise_groups = DetectFusionGroup(graph, 0);
     AddStatis(num_elementwise_groups);
@@ -61,7 +69,7 @@ int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const {
       subgraph.DetectIntermediateOutWithGraph(graph);
     }
     if (subgraph.IsValid(min_subgraph_size)) {
-      subgraph.SetFuncName("FusedElementwise" + std::to_string(index++));
+      subgraph.SetFuncName("fused_elementwise_" + std::to_string(index++));
       if (GenerateCode(&subgraph)) {
         InsertFusionGroupOp(graph, &subgraph);
         num_subgraphs++;
diff --git a/paddle/fluid/platform/device_code.cc b/paddle/fluid/platform/device_code.cc
index 4f13f8e38893bd55c12771e9c8b180ac37a5259a..e8b2d5d4ed12d516776eace322d0aff6cb381d71 100644
--- a/paddle/fluid/platform/device_code.cc
+++ b/paddle/fluid/platform/device_code.cc
@@ -79,9 +79,61 @@ DeviceCodePool::DeviceCodePool(const std::vector<platform::Place>& places) {
 #endif
     }
   }
+
+#ifdef PADDLE_WITH_CUDA
+  CUDADeviceCode::CheckAvailableStatus();
+#endif
 }
 
 #ifdef PADDLE_WITH_CUDA
+static bool CheckCUDADriverResult(CUresult result, std::string caller,
+                                  std::string kernel_name = "") {
+  if (result != CUDA_SUCCESS) {
+    const char* error = nullptr;
+    dynload::cuGetErrorString(result, &error);
+    LOG_FIRST_N(WARNING, 1) << "Call " << caller << " for < " << kernel_name
+                            << " > failed: " << error << " (" << result << ")";
+    return false;
+  }
+  return true;
+}
+
+bool CUDADeviceCode::available_ = false;
+void CUDADeviceCode::CheckAvailableStatus() {
+  available_ = false;
+  if (!dynload::HasNVRTC() || !dynload::HasCUDADriver()) {
+    LOG_FIRST_N(WARNING, 1)
+        << "NVRTC and CUDA driver are needed for JIT compiling of CUDA code.";
+    return;
+  }
+
+  int nvrtc_major = 0;
+  int nvrtc_minor = 0;
+  nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor);
+
+  int driver_version = 0;
+  int driver_major = 0;
+  int driver_minor = 0;
+  CUresult driver_result = dynload::cuDriverGetVersion(&driver_version);
+  if (driver_result == CUDA_SUCCESS) {
+    driver_major = driver_version / 1000;
+    driver_minor = (driver_version % 1000) / 10;
+  }
+
+  LOG_FIRST_N(INFO, 1) << "CUDA Driver Version: " << driver_major << "."
+                       << driver_minor << "; NVRTC Version: " << nvrtc_major
+                       << "." << nvrtc_minor;
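+  // Mark JIT compilation as available only when the version queries above
+  // succeeded and the device count can be queried through the CUDA driver.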
+  if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) {
+    return;
+  }
+
+  int count = 0;
+  if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count),
+                            "cuDeviceGetCount")) {
+    available_ = true;
+  }
+}
+
 static std::string FindCUDAIncludePath() {
   auto EndWith = [](std::string str, std::string substr) -> bool {
     size_t pos = str.rfind(substr);
@@ -137,7 +189,7 @@ CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name,
 bool CUDADeviceCode::Compile(bool include_path) {
   is_compiled_ = false;
   if (!dynload::HasNVRTC() || !dynload::HasCUDADriver()) {
-    LOG(WARNING)
+    LOG_FIRST_N(WARNING, 1)
         << "NVRTC and CUDA driver are need for JIT compiling of CUDA code.";
     return false;
   }
@@ -160,10 +212,11 @@ bool CUDADeviceCode::Compile(bool include_path) {
   std::string compute_flag =
       "--gpu-architecture=compute_" + std::to_string(compute_capability);
   std::vector<const char*> options = {"--std=c++11", compute_flag.c_str()};
+  std::string include_option;
   if (include_path) {
     std::string cuda_include_path = FindCUDAIncludePath();
     if (!cuda_include_path.empty()) {
-      std::string include_option = "--include-path=" + cuda_include_path;
+      include_option = "--include-path=" + cuda_include_path;
       options.push_back(include_option.c_str());
     }
   }
@@ -209,13 +262,13 @@ bool CUDADeviceCode::Compile(bool include_path) {
   }
 
   if (!CheckCUDADriverResult(dynload::cuModuleLoadData(&module_, ptx_.data()),
-                             "cuModuleLoadData")) {
+                             "cuModuleLoadData", name_)) {
     return false;
   }
 
   if (!CheckCUDADriverResult(
           dynload::cuModuleGetFunction(&function_, module_, name_.c_str()),
-          "cuModuleGetFunction")) {
+          "cuModuleGetFunction", name_)) {
     return false;
   }
 
@@ -253,19 +306,9 @@ void CUDADeviceCode::Launch(const size_t n, std::vector<void*>* args) const {
 bool CUDADeviceCode::CheckNVRTCResult(nvrtcResult result,
                                       std::string function) {
   if (result != NVRTC_SUCCESS) {
-    LOG(WARNING) << "Call " << function
-                 << " failed: " << dynload::nvrtcGetErrorString(result);
-    return false;
-  }
-  return true;
-}
-
-bool CUDADeviceCode::CheckCUDADriverResult(CUresult result,
-                                           std::string function) {
-  if (result != CUDA_SUCCESS) {
-    const char* error = nullptr;
-    LOG(WARNING) << "Call " << function
-                 << " failed: " << dynload::cuGetErrorString(result, &error);
+    LOG_FIRST_N(WARNING, 1)
+        << "Call " << function << " for < " << name_
+        << " > failed: " << dynload::nvrtcGetErrorString(result);
     return false;
   }
   return true;
diff --git a/paddle/fluid/platform/device_code.h b/paddle/fluid/platform/device_code.h
index 385207544063bfc3c1d53a51cc55e2ff18e60b50..6128d8b78db05e6ba92ffae46ee4c9cc8895c105 100644
--- a/paddle/fluid/platform/device_code.h
+++ b/paddle/fluid/platform/device_code.h
@@ -56,9 +56,13 @@ class CUDADeviceCode : public DeviceCode {
     workload_per_thread_ = workload_per_thread;
   }
 
+  static void CheckAvailableStatus();
+  static bool IsAvailable() { return available_; }
+
 private:
   bool CheckNVRTCResult(nvrtcResult result, std::string function);
-  bool CheckCUDADriverResult(CUresult result, std::string function);
+
+  static bool available_;
 
   bool is_compiled_{false};
   int max_threads_{0};
diff --git a/paddle/fluid/platform/dynload/cuda_driver.h b/paddle/fluid/platform/dynload/cuda_driver.h
index a37a47b7900d7ac50b277197b42d4f431bde7179..d39aceaa11bca0cce4f160c0a66dbd92d1813acb 100644
--- a/paddle/fluid/platform/dynload/cuda_driver.h
+++ b/paddle/fluid/platform/dynload/cuda_driver.h
@@ -60,6 +60,8 @@ extern bool HasCUDADriver();
  * include all needed cuda driver functions
  **/
 #define CUDA_ROUTINE_EACH(__macro)        \
+  __macro(cuInit);                        \
+  __macro(cuDriverGetVersion);            \
   __macro(cuGetErrorString);              \
   __macro(cuModuleLoadData);              \
   __macro(cuModuleGetFunction);           \
@@ -68,7 +70,7 @@ extern bool HasCUDADriver();
   __macro(cuLaunchKernel);                \
   __macro(cuCtxCreate);                   \
   __macro(cuCtxGetCurrent);               \
-  __macro(cuDeviceGet);                   \
+  __macro(cuDeviceGetCount);              \
   __macro(cuDevicePrimaryCtxGetState)
 
 CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index dd62265e333c9832eb6b1f7bed7cffe726a17c70..8ed998b0a464eb2e732b3dec7b1bd82c84163e3f 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -164,10 +164,10 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
   }
   auto error_msg =
       "Failed to find dynamic library: %s ( %s ) \n Please specify "
-      "its path correctly using following ways: \n Method. set "
+      "its path correctly using following ways: \n set "
      "environment variable LD_LIBRARY_PATH on Linux or "
-      "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: "
-      "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
+      "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: "
+      "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
      "using the DYLD_LIBRARY_PATH is impossible unless System "
      "Integrity Protection (SIP) is disabled.";
 #if !defined(_WIN32)
@@ -238,17 +238,17 @@ void* GetCusolverDsoHandle() {
 
 void* GetNVRTCDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib");
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false);
 #else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so");
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false);
 #endif
 }
 
 void* GetCUDADsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib");
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false);
 #else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so");
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false);
 #endif
 }
 
diff --git a/paddle/fluid/platform/dynload/nvrtc.h b/paddle/fluid/platform/dynload/nvrtc.h
index b4437099baa63b46bb646242c45b34de524da916..08f81d4ea85610238aa0053c0008c0a03e2361d6 100644
--- a/paddle/fluid/platform/dynload/nvrtc.h
+++ b/paddle/fluid/platform/dynload/nvrtc.h
@@ -60,6 +60,7 @@ extern bool HasNVRTC();
  * include all needed nvrtc functions
  **/
 #define NVRTC_ROUTINE_EACH(__macro)       \
+  __macro(nvrtcVersion);                  \
   __macro(nvrtcGetErrorString);           \
   __macro(nvrtcCompileProgram);           \
   __macro(nvrtcCreateProgram);            \