diff --git a/paddle/capi/Main.cpp b/paddle/capi/Main.cpp
index bb8249a5511c089ec2f2263ff4cc290f0a5a8fce..c038789340033fcf6dcc07a41b033a50e980c965 100644
--- a/paddle/capi/Main.cpp
+++ b/paddle/capi/Main.cpp
@@ -43,4 +43,11 @@ paddle_error paddle_init(int argc, char** argv) {
   isInit = true;
   return kPD_NO_ERROR;
 }
+
+paddle_error paddle_init_thread() {
+  if (FLAGS_use_gpu) {  // without the GPU flag there is nothing to set up
+    hl_init(FLAGS_gpu_id);  // presumably per-thread device setup -- confirm
+  }
+  return kPD_NO_ERROR;  // unconditional: nothing from hl_init is propagated
+}
 }
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
index 30f3a766f0c65187c8f2dd4603e3d26c9b9a6a3d..cbacd1fb71c14f490ff548db714e728772292b4b 100644
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -40,7 +40,7 @@ paddle_error paddle_matrix_destroy(paddle_matrix mat) {
 paddle_error paddle_matrix_set_row(paddle_matrix mat,
                                    uint64_t rowID,
                                    paddle_real* rowArray) {
-  if (mat == nullptr) return kPD_NULLPTR;
+  if (mat == nullptr || rowArray == nullptr) return kPD_NULLPTR;
   auto ptr = cast(mat);
   if (ptr->mat == nullptr) return kPD_NULLPTR;
   if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
diff --git a/paddle/capi/error.cpp b/paddle/capi/error.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..169b65f92104336d9ec12e2a5a6778db25080270
--- /dev/null
+++ b/paddle/capi/error.cpp
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "error.h"
+
+const char* paddle_error_string(paddle_error err) {  // code -> literal text
+  switch (err) {
+    case kPD_NULLPTR:
+      return "nullptr error";
+    case kPD_OUT_OF_RANGE:
+      return "out of range error";
+    case kPD_PROTOBUF_ERROR:
+      return "protobuf error";
+    case kPD_NOT_SUPPORTED:
+      return "not supported error";
+    case kPD_UNDEFINED_ERROR:
+      return "undefined error";
+    default:  // any code without a case above (kPD_NO_ERROR has none)
+      return "";  // empty string rather than NULL, so it is always printable
+  }
+}
diff --git a/paddle/capi/error.h b/paddle/capi/error.h
index 44d8c2040d1aad698398089baeee6f13c3deeb55..9d9d0ed63a5276c6b9a8747e1ee1fce6872bdc9e 100644
--- a/paddle/capi/error.h
+++ b/paddle/capi/error.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #ifndef __PADDLE_CAPI_ERROR_H__
 #define __PADDLE_CAPI_ERROR_H__
 
+#include "config.h"
+
 /**
  * Error Type for Paddle API.
  */
@@ -27,4 +29,23 @@ typedef enum {
   kPD_UNDEFINED_ERROR = -1,
 } paddle_error;
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Return a human-readable description of a Paddle C-API error code.
+ *
+ * Declared with C linkage so that C clients can link against the
+ * definition, which is compiled as C++ in error.cpp.
+ *
+ * @param err error code returned by a Paddle C-API call.
+ * @return a constant string; callers must not modify or free it.
+ */
+PD_API const char* paddle_error_string(paddle_error err);
+
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
index 98e411ddc02a46034e8f6ceb00657622d998c9f3..2fc8debddedeab6ae982b0df49ec2b73bc0f85f5 100644
--- a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
+++ b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
@@ -1,8 +1,29 @@
 project(multi_thread)
 cmake_minimum_required(VERSION 2.8)
-aux_source_directory(. SRC_LIST)
-add_executable(${PROJECT_NAME} ${SRC_LIST})
+
 find_package (Threads)
+
+if(NOT PADDLE_ROOT)  # fall back to the PADDLE_ROOT environment variable
+  set(PADDLE_ROOT $ENV{PADDLE_ROOT} CACHE PATH "Paddle Path")
+endif()
+if(PADDLE_ROOT)  # search paths are added only when a Paddle location is set
+  include_directories(${PADDLE_ROOT}/include)
+  link_directories(${PADDLE_ROOT}/lib)
+endif()
+
+set(CPU_SRCS main.c)  # sources of the CPU example, which is always built
+add_executable(${PROJECT_NAME} ${CPU_SRCS})
 set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
-target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared
-  ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${PROJECT_NAME}
+                      -lpaddle_capi_shared
+                      ${CMAKE_THREAD_LIBS_INIT})
+
+find_package(CUDA QUIET)  # QUIET: no message when CUDA is not found
+if(CUDA_FOUND)  # the GPU example target exists only when CUDA was found
+  set(GPU_SRCS main_gpu.c)
+  cuda_add_executable(${PROJECT_NAME}_gpu ${GPU_SRCS})
+  set_property(TARGET ${PROJECT_NAME}_gpu PROPERTY C_STANDARD 99)
+  target_link_libraries(${PROJECT_NAME}_gpu
+                        -lpaddle_capi_shared
+                        ${CMAKE_THREAD_LIBS_INIT})
+endif(CUDA_FOUND)
diff --git a/paddle/capi/examples/model_inference/multi_thread/main_gpu.c b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c
new file mode 100644
index 0000000000000000000000000000000000000000..6fd376e0d1a2fee4f9a0f676b53c6f2891795cab
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c
@@ -0,0 +1,147 @@
+#include <paddle/capi.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include "../common/common.h"
+
+#define CONFIG_BIN "./trainer_config.bin"
+#define NUM_THREAD 4
+#define NUM_ITER 1000
+// Sizes of the CPU-side input and output buffers, in paddle_real elements.
+// NOTE(review): assumes the network emits OUTPUT_SIZE values per sample;
+// confirm against the model configuration.
+#define INPUT_SIZE 784
+#define OUTPUT_SIZE 10
+
+// Serializes printing so that rows from different threads do not interleave.
+pthread_mutex_t mutex;
+
+/*
+ * @brief It is a simple inference example that runs multi-threads on a GPU.
+ *        Each thread holds its own local gradient_machine but shares the same
+ *        parameters.
+ *        If you want to run on different GPUs, you need to launch
+ *        multi-processes or set trainer_count > 1.
+ *
+ * @param gm_ptr the thread-local paddle_gradient_machine; it is destroyed
+ *               here before the thread returns.
+ * @return always NULL.
+ */
+void* thread_main(void* gm_ptr) {
+  // Initialize the thread environment of Paddle.
+  CHECK(paddle_init_thread());
+
+  paddle_gradient_machine machine = (paddle_gradient_machine)(gm_ptr);
+  // Create input arguments.
+  paddle_arguments in_args = paddle_arguments_create_none();
+  // Create input matrix.
+  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
+                                           /* size */ INPUT_SIZE,
+                                           /* useGPU */ true);
+  // Create output arguments.
+  paddle_arguments out_args = paddle_arguments_create_none();
+  // Create output matrix.
+  paddle_matrix prob = paddle_matrix_create_none();
+
+  // CPU buffer to cache the input and output.
+  paddle_real* cpu_input = malloc(INPUT_SIZE * sizeof *cpu_input);
+  paddle_real* cpu_output = malloc(OUTPUT_SIZE * sizeof *cpu_output);
+  if (cpu_input == NULL || cpu_output == NULL) {
+    fprintf(stderr, "Failed to allocate the CPU buffers\n");
+    exit(EXIT_FAILURE);
+  }
+
+  for (int iter = 0; iter < NUM_ITER; ++iter) {
+    // There is only one input layer of this network.
+    CHECK(paddle_arguments_resize(in_args, 1));
+    CHECK(paddle_arguments_set_value(in_args, 0, mat));
+
+    for (int i = 0; i < INPUT_SIZE; ++i) {
+      cpu_input[i] = rand() / ((float)RAND_MAX);
+    }
+    CHECK(paddle_matrix_set_value(mat, cpu_input));
+
+    CHECK(paddle_gradient_machine_forward(machine,
+                                          in_args,
+                                          out_args,
+                                          /* isTrain */ false));
+
+    CHECK(paddle_arguments_get_value(out_args, 0, prob));
+    CHECK(paddle_matrix_get_value(prob, cpu_output));
+
+    pthread_mutex_lock(&mutex);
+    printf("Prob: ");
+    for (int i = 0; i < OUTPUT_SIZE; ++i) {
+      printf("%.2f ", cpu_output[i]);
+    }
+    printf("\n");
+    pthread_mutex_unlock(&mutex);
+  }
+
+  CHECK(paddle_matrix_destroy(prob));
+  CHECK(paddle_arguments_destroy(out_args));
+  CHECK(paddle_matrix_destroy(mat));
+  CHECK(paddle_arguments_destroy(in_args));
+  CHECK(paddle_gradient_machine_destroy(machine));
+
+  free(cpu_input);
+  free(cpu_output);
+
+  return NULL;
+}
+
+/*
+ * @brief Creates one gradient machine, starts NUM_THREAD workers that each
+ *        run thread_main on a machine sharing its parameters, and waits for
+ *        all of them to finish.
+ */
+int main() {
+  // Initialize Paddle
+  char* argv[] = {"--use_gpu=True"};
+  CHECK(paddle_init(1, (char**)argv));
+
+  // Reading config binary file. It is generated by `convert_protobin.sh`
+  long size = 0;
+  void* buf = read_config(CONFIG_BIN, &size);
+  if (buf == NULL) {
+    fprintf(stderr, "Failed to read %s\n", CONFIG_BIN);
+    return EXIT_FAILURE;
+  }
+
+  // Create a gradient machine for inference.
+  paddle_gradient_machine machine;
+  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
+  CHECK(paddle_gradient_machine_randomize_param(machine));
+
+  // Loading parameter. Uncomment the following line and change the directory.
+  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
+  //                                                "./some_where_to_params"));
+  srand((unsigned)time(NULL));
+  pthread_mutex_init(&mutex, NULL);
+
+  pthread_t threads[NUM_THREAD];
+
+  for (int i = 0; i < NUM_THREAD; ++i) {
+    paddle_gradient_machine thread_local_machine;
+    CHECK(paddle_gradient_machine_create_shared_param(
+        machine, buf, size, &thread_local_machine));
+    // Joining a thread that was never created is undefined, so stop here.
+    if (pthread_create(
+            &threads[i], NULL, thread_main, thread_local_machine) != 0) {
+      fprintf(stderr, "Failed to create thread %d\n", i);
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  for (int i = 0; i < NUM_THREAD; ++i) {
+    pthread_join(threads[i], NULL);
+  }
+
+  pthread_mutex_destroy(&mutex);
+  // Every worker destroyed its own machine; release the original one too.
+  CHECK(paddle_gradient_machine_destroy(machine));
+
+  return 0;
+}
diff --git a/paddle/capi/main.h b/paddle/capi/main.h
index 893ebcbd58dd24cf835fb2005865c94c9ba2a810..99c4e8428dbaa14d36dc2d36b2a4f16c9ec3e0d1 100644
--- a/paddle/capi/main.h
+++ b/paddle/capi/main.h
@@ -26,6 +26,13 @@ extern "C" {
  */
 PD_API paddle_error paddle_init(int argc, char** argv);
 
+/**
+ * Initialize the thread environment of Paddle.
+ * @note It is required for GPU runs but optional for CPU runs.
+ *       For GPU runs, all threads will run on the same GPU device.
+ */
+PD_API paddle_error paddle_init_thread(void);
+
 #ifdef __cplusplus
 }
 #endif