diff --git a/src/framework/cl/cl_engine.h b/src/framework/cl/cl_engine.h
index f9f373b2a74087960b03c55ec922f95f187cfbc4..76d08513aa4301b9aa22b159a70a17b7b0619b92 100644
--- a/src/framework/cl/cl_engine.h
+++ b/src/framework/cl/cl_engine.h
@@ -114,6 +114,9 @@ class CLEngine {
 
   cl_device_id DeviceID(int index = 0) { return devices_[index]; }
 
+  std::string GetCLPath() { return cl_path_; }
+  void setClPath(std::string cl_path) { cl_path_ = cl_path; }
+
  private:
   CLEngine() { initialized_ = false; }
 
@@ -129,6 +132,7 @@ class CLEngine {
 
   cl_int status_;
 
+  std::string cl_path_;
   std::unique_ptr<_cl_program, CLProgramDeleter> program_;
 
   // bool SetClContext();
diff --git a/src/framework/cl/cl_scope.h b/src/framework/cl/cl_scope.h
index 0965b133e6d8270b7cd6e28c8ed9a33739b2e2a8..c7c06ca75f47cd65d2350dfa6930068aca73ced0 100644
--- a/src/framework/cl/cl_scope.h
+++ b/src/framework/cl/cl_scope.h
@@ -58,7 +58,8 @@ class CLScope {
     }
 
     auto program = CLEngine::Instance()->CreateProgramWith(
-        context_.get(), "./cl_kernel/" + file_name);
+        context_.get(),
+        CLEngine::Instance()->GetCLPath() + "/cl_kernel/" + file_name);
 
     DLOG << " --- begin build program -> " << file_name << " --- ";
     CLEngine::Instance()->BuildProgram(program.get());
diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index 1b441bc5bdfed264f525dda571278067d2ae99b7..81bfaf3a4d07f5a3ef82c19de57f1681dfc1f8c7 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -649,12 +649,14 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
 
 template <>
 void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
-  char *origin_data;
+  char *origin_data = nullptr;
+  bool self_alloc = false;
   if (program_.combined_params_buf && program_.combined_params_len) {
     LOG(kLOG_INFO) << "use outter memory";
     origin_data = reinterpret_cast<char *>(program_.combined_params_buf);
   } else {
     LOG(kLOG_INFO) << " begin init combine memory";
+    self_alloc = true;
     origin_data = ReadFileToBuff(program_.para_path);
   }
   PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
@@ -701,7 +703,9 @@
       }
     }
   }
-  delete origin_data;
+  if (self_alloc) {
+    delete origin_data;
+  }
   LOG(kLOG_INFO) << " end init combine memory ";
 }
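Note on the executor.cpp hunk above: origin_data may now alias the caller-supplied combined_params_buf, so the buffer is released only when the executor allocated it itself, which is what the self_alloc flag records. A minimal standalone sketch of this ownership-flag pattern follows; the names ReadAllFake and Consume are hypothetical stand-ins, not paddle-mobile APIs.

#include <cstdio>
#include <iostream>

// Stand-in for ReadFileToBuff(): returns a heap buffer the caller owns.
static char *ReadAllFake(const char *path) {
  char *buf = new char[64];
  std::snprintf(buf, 64, "contents of %s", path);
  return buf;
}

static void Consume(char *external_buf, const char *path) {
  char *data = nullptr;
  bool self_alloc = false;  // did we allocate `data` ourselves?
  if (external_buf != nullptr) {
    data = external_buf;  // borrowed: the caller keeps ownership
  } else {
    self_alloc = true;  // owned: must be released below
    data = ReadAllFake(path);
  }
  std::cout << data << "\n";
  if (self_alloc) {
    delete[] data;  // free only what we allocated ourselves
  }
}

int main() {
  char external[] = "caller-owned buffer";
  Consume(external, nullptr);      // external memory: not freed here
  Consume(nullptr, "params.bin");  // self-allocated: freed here
  return 0;
}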
diff --git a/src/framework/loader.cpp b/src/framework/loader.cpp
index a434314730eb40b7e4017050a84a7d9742934396..eb07e9f6155370880f6fb8e302a8e396df17954d 100644
--- a/src/framework/loader.cpp
+++ b/src/framework/loader.cpp
@@ -82,6 +82,54 @@ void Loader<GPU_CL, Precision::FP32>::InitMemoryFromProgram(
     }
   }
 }
+template <>
+const Program<GPU_CL, Precision::FP32>
+Loader<GPU_CL, Precision::FP32>::LoadCombinedMemory(
+    size_t read_size, const uint8_t *buf, size_t combined_params_len,
+    uint8_t *combined_params_buf, bool optimize, bool quantification) {
+  bool can_add_split = false;
+
+  PaddleMobile__Framework__Proto__ProgramDesc *c_program;
+  PADDLE_MOBILE_ENFORCE(buf != nullptr, "read from __model__ is null");
+
+  c_program = paddle_mobile__framework__proto__program_desc__unpack(
+      nullptr, read_size, buf);
+  //
+  PADDLE_MOBILE_ENFORCE(c_program != nullptr, "program is null");
+  //
+  DLOG << "n_ops: " << (*c_program->blocks)->n_ops;
+  //
+
+  auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);
+
+  Program<GPU_CL, Precision::FP32> program;
+  program.combined = true;
+  program.originProgram = originProgramDesc;
+  program.quantification = quantification;
+  program.combined_params_len = combined_params_len;
+  program.combined_params_buf = combined_params_buf;
+
+  auto scope = std::make_shared<Scope>();
+  program.scope = scope;
+  InitMemoryFromProgram(originProgramDesc, scope);
+  if (optimize) {
+    ProgramOptimize program_optimize;
+    program.optimizeProgram =
+        program_optimize.FusionOptimize(originProgramDesc, can_add_split);
+    if (!program.optimizeProgram) {
+      program.optimizeProgram = originProgramDesc;
+    }
+  }
+  if (optimize) {
+    program.optimizeProgram->Description("optimize: ");
+  } else {
+    originProgramDesc->Description("program: ");
+  }
+  paddle_mobile__framework__proto__program_desc__free_unpacked(c_program,
+                                                               nullptr);
+  return program;
+}
+
 #endif
 
 /**
diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc
index 67f255315fa71acbf24f5071735020c0a435ce64..144cf127a44c78279ca1d95815646a4f01fed6bd 100644
--- a/src/io/api_paddle_mobile.cc
+++ b/src/io/api_paddle_mobile.cc
@@ -29,7 +29,9 @@ PaddleMobilePredictor<Dtype, P>::PaddleMobilePredictor(
 template <typename Dtype, Precision P>
 bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) {
   paddle_mobile_.reset(new PaddleMobile<Dtype, P>());
-
+#ifdef PADDLE_MOBILE_CL
+  paddle_mobile_->SetCLPath(config.cl_path);
+#endif
   if (config.memory_pack.from_memory) {
     DLOG << "load from memory!";
     paddle_mobile_->LoadCombinedMemory(config.memory_pack.model_size,
diff --git a/src/io/paddle_inference_api.h b/src/io/paddle_inference_api.h
index d37895d3aaa108edb1a8956ccbcb91cbe4b97725..3c9ffa00c7e749d1c9d77562b2db0b42ee605164 100644
--- a/src/io/paddle_inference_api.h
+++ b/src/io/paddle_inference_api.h
@@ -132,6 +132,7 @@ struct PaddleMobileConfig : public PaddlePredictor::Config {
   int thread_num = 1;
   std::string prog_file;
   std::string param_file;
+  std::string cl_path;
   struct PaddleModelMemoryPack memory_pack;
 };
 
diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp
index 3cd7c38b2b102659739aefc66b4b25f61cc48bcf..921b72520f1905fcdc7b2a0d15ee4ec5d844cda7 100644
--- a/src/io/paddle_mobile.cpp
+++ b/src/io/paddle_mobile.cpp
@@ -158,6 +158,13 @@ void PaddleMobile<Dtype, P>::Predict_To(int end) {
 }
 #endif
 
+#ifdef PADDLE_MOBILE_CL
+template <typename Dtype, Precision P>
+void PaddleMobile<Dtype, P>::SetCLPath(std::string path) {
+  framework::CLEngine::Instance()->setClPath(path);
+}
+#endif
+
 template class PaddleMobile<CPU, Precision::FP32>;
 template class PaddleMobile<FPGA, Precision::FP32>;
 template class PaddleMobile<GPU_CL, Precision::FP32>;
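Note on the new API above: SetCLPath() forwards to framework::CLEngine::setClPath(), and CLScope then resolves kernel sources against "<cl_path>/cl_kernel/" instead of the previously hard-coded "./cl_kernel/". A hedged usage sketch follows; it assumes a PADDLE_MOBILE_CL build, the PaddleMobile<GPU_CL> template defaults, and hypothetical file paths.

// Usage sketch only; include path assumes the paddle-mobile src/ directory
// is on the compiler's include path.
#include "io/paddle_mobile.h"

int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> engine;
  // Point the engine at the directory that contains cl_kernel/;
  // CLScope will open "<cl_path>/cl_kernel/<file>" from here on.
  engine.SetCLPath("/data/local/tmp");  // hypothetical path
  if (engine.Load("/data/local/tmp/mobilenet", true)) {
    // ... feed an input tensor and call Predict() ...
  }
  return 0;
}

The same call appears in test/net/test_mobilenet_GPU.cpp below, which passes "." to keep the old working-directory behavior.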
diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h
index 0e86fa988fe8a07131d3ea19fe7c606c27d70c2c..1e8f81c51e02ea6bdbdea8694aa62c9c30e6e6a8 100644
--- a/src/io/paddle_mobile.h
+++ b/src/io/paddle_mobile.h
@@ -26,6 +26,9 @@ limitations under the License. */
 #include "framework/load_ops.h"
 #include "framework/loader.h"
 #include "framework/tensor.h"
+#ifdef PADDLE_MOBILE_CL
+#include "framework/cl/cl_engine.h"
+#endif
 
 namespace paddle_mobile {
 
@@ -68,6 +71,11 @@ class PaddleMobile {
   void Predict_To(int end);
 #endif
 
+#ifdef PADDLE_MOBILE_CL
+ public:
+  void SetCLPath(std::string cl_path);
+#endif
+
  private:
   std::shared_ptr<framework::Loader<Dtype, P>> loader_;
   std::shared_ptr<framework::Executor<Dtype, P>> executor_;
diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp
index 44621ba99a92a3ed456b8d7d0959e3580662d910..605fa17c3c70ec3151cc1a2fb249edab336548a1 100644
--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -3307,8 +3307,13 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
       float *local_A = packedA + MC * KC * local_threads;
       float *local_C = packedC + MC * NC * local_threads;
       (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A);
-      InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C,
-                          &C(i, 0), ldc, relu, bias + i);
+      if (bias == nullptr) {
+        InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C,
+                            &C(i, 0), ldc, relu, nullptr);
+      } else {
+        InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C,
+                            &C(i, 0), ldc, relu, bias + i);
+      }
     }
   } else {
 #pragma omp parallel for
diff --git a/test/framework/test_load_memory_inference_api.cpp b/test/framework/test_load_memory_inference_api.cpp
index 05d51910172547c6dab7adc8231663be55c916bf..5b2773f8f1a21c3b9253b34fc5c18cd64ece27e7 100644
--- a/test/framework/test_load_memory_inference_api.cpp
+++ b/test/framework/test_load_memory_inference_api.cpp
@@ -55,11 +55,11 @@ static char *Get_binary_data(std::string filename) {
 paddle_mobile::PaddleMobileConfig GetConfig() {
   paddle_mobile::PaddleMobileConfig config;
   config.precision = paddle_mobile::PaddleMobileConfig::FP32;
-  config.device = paddle_mobile::PaddleMobileConfig::kCPU;
+  config.device = paddle_mobile::PaddleMobileConfig::kGPU_CL;
   const std::shared_ptr<paddle_mobile::PaddleModelMemoryPack> &memory_pack =
       std::make_shared<paddle_mobile::PaddleModelMemoryPack>();
-  auto model_path = std::string(g_genet_combine) + "/model";
-  auto params_path = std::string(g_genet_combine) + "/params";
+  auto model_path = std::string(g_mobilenet_combined) + "/model";
+  auto params_path = std::string(g_mobilenet_combined) + "/params";
   memory_pack->model_size =
       ReadBuffer(model_path.c_str(), &memory_pack->model_buf);
   std::cout << "sizeBuf: " << memory_pack->model_size << std::endl;
diff --git a/test/net/test_mobilenet_GPU.cpp b/test/net/test_mobilenet_GPU.cpp
index a5276d6e521855ad81e6b9e2edb58c271ae713d9..07582e10dd5db8985f87bae215b8cf1808431565 100644
--- a/test/net/test_mobilenet_GPU.cpp
+++ b/test/net/test_mobilenet_GPU.cpp
@@ -22,7 +22,7 @@ int main() {
   auto time1 = paddle_mobile::time();
   //  auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
   //  std::string(g_mobilenet_detect) + "/params", true);
-
+  paddle_mobile.SetCLPath(".");
   auto isok = paddle_mobile.Load(std::string(g_mobilenet), true);
   if (isok) {
     auto time2 = paddle_mobile::time();
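For the high-level inference API, the new cl_path field travels from PaddleMobileConfig through PaddleMobilePredictor::Init() to CLEngine. A sketch mirroring GetConfig() from the updated test above; the file paths are hypothetical.

#include "io/paddle_inference_api.h"

paddle_mobile::PaddleMobileConfig MakeConfig() {
  paddle_mobile::PaddleMobileConfig config;
  config.precision = paddle_mobile::PaddleMobileConfig::FP32;
  config.device = paddle_mobile::PaddleMobileConfig::kGPU_CL;
  // New field: the directory containing cl_kernel/. Init() forwards it
  // to CLEngine::setClPath() under PADDLE_MOBILE_CL.
  config.cl_path = ".";
  config.prog_file = "/data/local/tmp/mobilenet/model";    // hypothetical
  config.param_file = "/data/local/tmp/mobilenet/params";  // hypothetical
  return config;
}

// The config is then handed to CreatePaddlePredictor(), as in the test,
// so GPU_CL users of the inference API no longer depend on the process
// working directory to locate their OpenCL kernel files.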